-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrap.py
executable file
·120 lines (96 loc) · 3.76 KB
/
scrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python3
"""scrap.py module
This module gets CESAs from LWN.net and the official CentOS ML and generates YAML files.
Todo:
- make it more parallel
- unit test
"""
import re
from concurrent.futures import ThreadPoolExecutor as PoolExecutor
import requests
from jinja2 import Template
from bs4 import BeautifulSoup
def get_it(url, timeout=30):
    """Get content of url

    Args:
        url (str): The url to reach
        timeout (float): Seconds to wait for the server before giving up.
            Defaults to 30.

    Returns:
        str: The response body as text (``response.text``).

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    # requests has no default timeout: without one, a server that stops
    # answering would block the calling thread indefinitely.
    response = requests.get(url, timeout=timeout)
    return response.text
def clean_word(word):
    """Strip square brackets and single quotes from a string.

    Args:
        word (str): The string to be cleaned

    Returns:
        str: ``word`` with every ``[``, ``]`` and ``'`` removed.
    """
    # A translation table whose third argument lists characters to delete
    # removes all three unwanted characters in a single pass.
    unwanted = str.maketrans('', '', "[]'")
    return word.translate(unwanted)
def get_cesa_links(max_link=50, timeout=30):
    """Return the CESA links to be processed from LWN.

    Args:
        max_link (int): max number of links to consider (sent to LWN as the
            ``n`` query parameter)
        timeout (float): Seconds to wait for LWN before giving up.
            Defaults to 30.

    Returns:
        dict: Maps the third "/"-separated component of each matching href
        to the absolute URL (``https://lwn.net`` + href). Empty when the
        request fails or no matching link is found.

    Raises:
        IndexError: If a matching href has fewer than three "/"-separated
            components.
    """
    cesa_url = "https://lwn.net/Alerts/CentOS/?n={}".format(max_link)
    lwn_url = 'https://lwn.net'
    # NOTE: read as a regex, "-*" means "zero or more hyphens", so this
    # matches any link text that starts with "CESA".
    cesa_pattern = "CESA-*"

    response = requests.get(cesa_url, timeout=timeout)
    # A requests Response is falsy for 4xx/5xx status codes; bail out early
    # instead of touching a result that was never parsed.
    if not response:
        print("No links found from LWN.net")
        return {}

    soup = BeautifulSoup(response.text, "html.parser")
    cesas = [link['href'] for link in soup.find_all('a', href=True)
             if re.match(cesa_pattern, link.text)]
    # A list comprehension never yields None, so test for emptiness.
    if not cesas:
        print("No links found from LWN.net")

    return {cesa.split("/")[2]: lwn_url + cesa for cesa in cesas}
def _load_cesa_template(path):
    """Read the Jinja2 template file at *path* and compile it."""
    with open(path) as file_:
        return Template(file_.read())


def get_cesa_details(cesa_found):
    """Parse the CESAs and generate the yaml files for ATLAS

    Every advisory page is downloaded with ``get_it`` (up to 16 at a time),
    then the CESA number, package name and RPM list are extracted and
    rendered through ``templates/template6.yml`` or
    ``templates/template7.yml`` into ``C6/<CESA number>.yml`` or
    ``C7/<CESA number>.yml``.

    Args:
        cesa_found (dict or iterable): The advisory URLs to process. For a
            dict (as returned by ``get_cesa_links``) the values are used;
            any other iterable is consumed as a sequence of URLs.

    Returns:
        None

    Raises:
        IndexError: If a page contains no ``CESA-YYYY:NNNN`` number.
        AttributeError: If a page contains no ``<p>`` element.
    """
    # Compile once; the patterns are identical for every page.
    # The dot before "rpm" is escaped so that only real "*.rpm" file names
    # match (unescaped, " rpm" in running text would match too).
    rpm_pattern = re.compile(r"([0-9a-zA-Z\.\-_]*\.rpm)")
    package_pattern = re.compile(r"\([a-zA-Z0-9-.^\(^\)]*\)")
    centos_version_pattern = re.compile(r"(CentOS\s[6-7])")
    cesa_number_pattern = re.compile(r"(CESA\-[0-9]{4}\:[0-9]{4})")

    # CentOS major version -> (template file, output path prefix).
    targets = {
        '6': ('templates/template6.yml', 'C6/'),
        '7': ('templates/template7.yml', 'C7/'),
    }

    if isinstance(cesa_found, dict):
        cesas_links = list(cesa_found.values())
    else:
        cesas_links = list(cesa_found)

    # Downloads are I/O-bound, so fetch them concurrently; map() keeps the
    # results in the same order as the links.
    with PoolExecutor(max_workers=16) as executor:
        pages = list(executor.map(get_it, cesas_links))

    # Templates are loaded lazily, at most once per CentOS version.
    templates = {}
    for page in pages:
        soup = BeautifulSoup(page, 'html.parser')
        cesa_number = cesa_number_pattern.findall(page)[0]

        # The package name is whatever sits between parentheses in the page
        # title; the list repr is flattened to text and stripped of
        # brackets, quotes and parentheses.
        title_matches = str(package_pattern.findall(str(soup.title)))
        cesa_title = clean_word(
            title_matches.replace('(', '').replace(')', ''))

        cesa_os = str(centos_version_pattern.findall(page))
        rpms = sorted(rpm_pattern.findall(soup.find('p').text))

        # Any page that does not mention CentOS 6 is handled as CentOS 7.
        version = '6' if '6' in cesa_os else '7'
        template_path, out_prefix = targets[version]
        if version not in templates:
            templates[version] = _load_cesa_template(template_path)

        templates[version].stream(
            cesa_number=cesa_number, package_name=cesa_title,
            rpms=rpms).dump(out_prefix + cesa_number + '.yml')
if __name__ == "__main__":
    # Script entry point: collect the advisory links, then render them.
    # An IndexError from either step means the expected content was not
    # found in what LWN returned.
    try:
        found_links = get_cesa_links()
        get_cesa_details(found_links)
    except IndexError:
        print("Error in finding content from LWN.net!")