
Commit 5b3a30f

dtemkin1 and psvenk authored
Add pylint ci check, make scrapers a package (#129)
Resolves #126
Co-authored-by: Pratyush Venkatakrishnan <[email protected]>
1 parent ff81747 commit 5b3a30f

15 files changed: +208 -152 lines changed


.editorconfig

Lines changed: 1 addition & 1 deletion
@@ -13,4 +13,4 @@ max_line_length = 80

 [*.py]
 indent_size = 4
-max_line_length = 80
+max_line_length = 88

.github/workflows/ci.yml

Lines changed: 19 additions & 0 deletions
@@ -8,6 +8,7 @@ on:

 env:
   node-version: 20.x
+  python-version: 3.7

 jobs:
   black:
@@ -16,6 +17,24 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: psf/black@stable
+  pylint:
+    name: Pylint
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.python-version }}
+          cache: "pip"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install pylint
+      - name: Analysing the code with pylint
+        run: |
+          pylint $(git ls-files '*.py')
   prettier:
     name: Prettier
     runs-on: ubuntu-latest
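The new job lints every Python file tracked by git. To reproduce the check locally before pushing, a rough Python equivalent of the workflow step might look like the sketch below (illustration only; it assumes `pylint` and the packages in `requirements.txt` are already installed in the current environment, and the helper name is not part of this commit):

```python
"""Rough local equivalent of the new pylint CI step (illustrative sketch)."""
import subprocess
import sys


def lint_tracked_python_files() -> int:
    # Mirror `git ls-files '*.py'` from the workflow: lint only tracked Python files.
    files = subprocess.run(
        ["git", "ls-files", "*.py"],
        capture_output=True, text=True, check=True,
    ).stdout.split()
    # Invoke pylint the same way CI does; a nonzero exit code means lint findings.
    return subprocess.run([sys.executable, "-m", "pylint", *files]).returncode


if __name__ == "__main__":
    sys.exit(lint_tracked_python_files())
```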

README.md

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ There's the frontend, which is the website and the interface. Then there's the b

 To spin up the site, we need two steps:

-(1) We need to update the backend to get the data. `cd scrapers` then run `python update.py`.
+(1) We need to update the backend to get the data. Run `python3 -m scrapers`.

 (2) We then can update the frontend, via running `npm run dev`. This will start a developer server. Open a browser tab to [`http://localhost:5173/`](http://localhost:5173/), which will update live as you edit code.


deploy/cron_scripts/update_latest.sh

Lines changed: 2 additions & 2 deletions
@@ -13,7 +13,7 @@ REPO_DIR="/afs/sipb.mit.edu/project/hydrant/hydrant"
 # internet. In the locker, this is ~/web_scripts/hydrant.
 OUT_DIR="/afs/sipb.mit.edu/project/hydrant/web_scripts/hydrant"

-cd "$REPO_DIR/scrapers"
+cd "$REPO_DIR"

 # -q means quietly; don't report anything in stdout or stderr.
 # make sure we're in the right branch:
@@ -22,7 +22,7 @@ git pull -q

 # The scripts machine we use has Python 3.7, so use that.
 # This updates $OUT_FILE.
-python3.7 update.py
+python3.7 -m scrapers
 OUT_FILE="$REPO_DIR/public/*.json"

 # Copy $OUT_FILE to the output directory, so it can be served to the internet.

deploy/web_scripts/notify_build.py

Lines changed: 31 additions & 26 deletions
@@ -1,16 +1,16 @@
 #!/usr/bin/env python3

-# Accept a web-hook from GitHub telling us about
-# a new built version of Hydrant.
+""" Accept a web-hook from GitHub telling us about a new built version of Hydrant. """

-import json, requests, traceback
-
-from sys import stdin, stdout
-from os import environ, path
+import json
+import traceback
 from hmac import digest
-from tempfile import TemporaryDirectory
+from os import environ, path
+from sys import stdin, stdout
 from zipfile import ZipFile

+import requests
+
 LOCKER_DIR = "/afs/sipb.mit.edu/project/hydrant"

 OUTPUT_DIR = path.join(LOCKER_DIR, "web_scripts/hydrant")
@@ -21,13 +21,17 @@
 GITHUB_TOKEN = path.join(CI_SECRETS_DIR, "github_token")


+# pylint: disable=too-many-locals
 def main():
+    """
+    Fetch the artifact from the GitHub API and extract it into the output directory.
+    """
     # Secret, used for HMAC input validation (so we know GitHub is being real)
-    with open(HASH_SECRET) as fh:
-        secret = fh.read().strip().encode("utf-8")
+    with open(HASH_SECRET, encoding="utf-8") as file_hash:
+        secret = file_hash.read().strip().encode("utf-8")
     # API token for GitHub API requests (to get a path to the file).
-    with open(GITHUB_TOKEN) as fh:
-        token = fh.read().strip()
+    with open(GITHUB_TOKEN, encoding="utf-8") as file_token:
+        token = file_token.read().strip()

     # Slurp content and validate with HMAC
     body = stdin.read()
@@ -45,9 +49,8 @@ def main():

     # Fetch a list of artifacts from the GitHub API
     response = requests.get(
-        "https://api.github.com/repos/sipb/hydrant/actions/runs/{}/artifacts".format(
-            job_id
-        )
+        f"https://api.github.com/repos/sipb/hydrant/actions/runs/{job_id}/artifacts",
+        timeout=3,
     )
     if not response.ok:
         raise ValueError("bad artifact fetch response: " + str(response.status_code))
@@ -64,31 +67,33 @@ def main():
         if not url:
             continue
         # then fetch it.
-        response = requests.get(url, headers={"Authorization": ("Bearer " + token)})
+        response = requests.get(
+            url, headers={"Authorization": ("Bearer " + token)}, timeout=3
+        )
         fname = path.join(LOCKER_DIR, "build_artifact.zip")
-        with open(fname, "wb") as fh:
+        with open(fname, "wb") as file_buffer:
             for chunk in response.iter_content(chunk_size=4096):
-                fh.write(chunk)
+                file_buffer.write(chunk)
         # Extract into the output directory.
         with ZipFile(fname, "r") as zfh:
             zfh.extractall(OUTPUT_DIR)
         success = True
         break
-    return (
-        "Fetched artifact successfully"
-        if success
-        else "Could not find artifact among {}: {}".format(
-            len(artifacts), ", ".join(a.get("name") for a in artifacts)
-        )
-    )
+
+    if success:
+        return "Fetched artifact successfully"
+
+    artifact_names = ", ".join(a.get("name") for a in artifacts)
+    return f"Could not find artifact among {len(artifacts)}: {artifact_names}"


 if __name__ == "__main__":
     # Respond to the request, it's only polite.
     print("Content-Type: text/plain\r\n\r")
     try:
         print(main())
+    # pylint: disable=broad-except
     except Exception as e:
         print(traceback.format_exc(), file=stdout)
-        with open(ERROR_LOG, "w") as fh:
-            print(traceback.format_exc(), file=fh)
+        with open(ERROR_LOG, "w", encoding="utf-8") as fe:
+            print(traceback.format_exc(), file=fe)
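These hunks only touch style and robustness (explicit file encodings, request timeouts, flatter return logic); the HMAC validation itself is unchanged and not shown here. For orientation, validating a GitHub webhook signature with `hmac.digest` (which the script imports) generally looks like the sketch below; the function name, header name, and `sha256=` prefix follow GitHub's webhook conventions and are not code from this file:

```python
"""Illustrative sketch of GitHub webhook signature validation with hmac.digest."""
from hmac import compare_digest, digest


def signature_is_valid(secret: bytes, body: bytes, signature_header: str) -> bool:
    # GitHub sends the HMAC-SHA256 hex digest of the raw request body in the
    # X-Hub-Signature-256 header, prefixed with "sha256=".
    expected = "sha256=" + digest(secret, body, "sha256").hex()
    # Constant-time comparison avoids leaking information through timing.
    return compare_digest(expected, signature_header)
```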

pylintrc

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+[format]
+max-line-length = 88

scrapers/.pylintrc

Lines changed: 0 additions & 13 deletions
This file was deleted.

scrapers/README.md

Lines changed: 10 additions & 5 deletions
@@ -3,7 +3,9 @@
 This folder contains several files. The files tracked by git are:

 * `__init__.py`
+* `__main__.py`
 * `catalog.py`
+* `cim.py`
 * `fireroad.py`
 * `math_dept.py`
 * `package.py`
@@ -16,13 +18,15 @@ This folder contains several files. The files tracked by git are:
 The files intentionally left out of git are:

 * `catalog.json`
+* `cim.json`
 * `fireroad.json`
+* `fireroad-presem.json`
 * `__pycache__`
 * `.DS_Store`

 ## Usage ##

-Run `python3 update.py` to execute the code. In production, there is a cron job that runs this every hour.
+Run `python3 -m scrapers` from the root directory to execute the code. In production, there is a cron job that runs this every hour.

 This program gets its data from MIT classes from two sources:

@@ -33,16 +37,17 @@ It is mainly intended to serve as a data source for the frontend, which is the r

 ## How it works ##

-`update.py` calls three other programs, in this order: `fireroad.py`, `catalog.py`, `package.py`. Each of these four files has a `run()` function, which is its main entry point to the codebase. Broadly speaking:
+`__main__.py` calls four other programs, in this order: `fireroad.py`, `catalog.py`, `cim.py`, `package.py`. Each of these four files has a `run()` function, which is its main entry point to the codebase. Broadly speaking:

-* `fireroad.py` creates `fireroad.json`
+* `fireroad.py` creates `fireroad.json` and `fireroad-presem.json`
 * `catalog.py` creates `catalog.json`
-* `package.py` combines these to create `../public/latest.json`. (This is the final product that our frontend ingests.)
+* `cim.py` creates `cim.json`
+* `package.py` combines these to create `../public/latest.json` and another JSON file under `../public/` that corresponds to IAP or summer. (This is the final product that our frontend ingests.)

 `math_dept.py` is an irregularly run file that helps create override data for courses in the MIT math department (since those are formatted slightly differently). `utils.py` contains a few utility functions and variables, which in turn are used by `fireroad.py` and `package.py`. The file `__init__.py` is empty but we include it anyways for completeness.

 ## Contributing ##

 This folder is actually a subfolder of a larger git repository. If you want to contribute to this repository, submit a pull request to https://github.com/sipb/hydrant and we'll merge it if it looks good.

-Depending on how you work, you might find `pylint` and/or running individual programs one at a time and then playing around with the Python shell to be helpful.
+Depending on how you work, you might find `pylint` and/or running individual programs one at a time and then playing around with the Python shell to be helpful.

scrapers/update.py renamed to scrapers/__main__.py

Lines changed: 10 additions & 10 deletions
@@ -1,32 +1,32 @@
 """
-This is the entry point. Run `python3 update.py` to test this code.
+This is the entry point. Run `python3 -m scrapers` to test this code.

 In production, there's a cron job that runs this script every hour.

 Functions:
 * run()
 """

-import fireroad
-import catalog
-import cim
-import package
+from .fireroad import run as fireroad_run
+from .catalog import run as catalog_run
+from .cim import run as cim_run
+from .package import run as package_run


 def run():
     """
     This function is the entry point. There are no arguments.
     """
     print("=== Update fireroad data (pre-semester) ===")
-    fireroad.run(False)
+    fireroad_run(False)
     print("=== Update fireroad data (semester) ===")
-    fireroad.run(True)
+    fireroad_run(True)
     print("=== Update catalog data ===")
-    catalog.run()
+    catalog_run()
     print("=== Update CI-M data ===")
-    cim.run()
+    cim_run()
     print("=== Packaging ===")
-    package.run()
+    package_run()


 if __name__ == "__main__":
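Because the imports in `__main__.py` are now relative, the scrapers have to be run as a package: invoking the file directly (`python3 scrapers/__main__.py`) would fail with `ImportError: attempted relative import with no known parent package`. As a standard-library sketch of roughly what `python3 -m scrapers` does instead (illustration only, not part of this commit):

```python
"""Roughly what `python3 -m scrapers` does under the hood (illustrative)."""
import runpy

# Locate the `scrapers` package on sys.path, import it, then execute
# scrapers/__main__.py with __name__ set to "__main__", which triggers the
# `if __name__ == "__main__":` guard at the bottom of the module and calls run().
runpy.run_module("scrapers", run_name="__main__", alter_sys=True)
```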

scrapers/catalog.py

Lines changed: 18 additions & 14 deletions
@@ -16,7 +16,9 @@
 """

 import json
+import os.path
 import re
+
 import requests
 from bs4 import BeautifulSoup, Tag

@@ -105,7 +107,7 @@ def get_half(html):
     """
     if html.find(text=re.compile("first half of term")):
         return 1
-    elif html.find(text=re.compile("second half of term")):
+    if html.find(text=re.compile("second half of term")):
         return 2
     return False

@@ -118,7 +120,6 @@ def is_limited(html):
     Returns:
     * bool: True if enrollment in the class is limited
     """
-    # TODO: can we do better?
     if html.find(text=re.compile("[Ll]imited")):
         return True
     return False
@@ -149,8 +150,8 @@ def get_home_catalog_links():
     Returns:
     * list[str]: relative links to major-specific subpages to scrape
     """
-    r = requests.get(BASE_URL + "/index.cgi", timeout=3)
-    html = BeautifulSoup(r.content, "html.parser")
+    catalog_req = requests.get(BASE_URL + "/index.cgi", timeout=3)
+    html = BeautifulSoup(catalog_req.content, "html.parser")
     home_list = html.select_one("td[valign=top][align=left] > ul")
     return [a["href"] for a in home_list.find_all("a", href=True)]

@@ -166,12 +167,12 @@ def get_all_catalog_links(initial_hrefs):
     * list[str]: A more complete list of relative links to subpages to scrape
     """
     hrefs = []
-    for il in initial_hrefs:
-        r = requests.get(f"{BASE_URL}/{il}", timeout=3)
-        html = BeautifulSoup(r.content, "html.parser")
+    for initial_href in initial_hrefs:
+        href_req = requests.get(f"{BASE_URL}/{initial_href}", timeout=3)
+        html = BeautifulSoup(href_req.content, "html.parser")
         # Links should be in the only table in the #contentmini div
         tables = html.find("div", id="contentmini").find_all("table")
-        hrefs.append(il)
+        hrefs.append(initial_href)
         for table in tables:
             hrefs.extend([ele["href"] for ele in table.findAll("a", href=True)])
     return hrefs
@@ -206,17 +207,18 @@ def get_anchors_with_classname(element):
 def scrape_courses_from_page(courses, href):
     """
     Fills courses with course data from the href
-    (This function does NOT return a value. Instead, it modifies the `courses` variable.)
+
+    This function does NOT return a value. Instead, it modifies the `courses` variable.

     Args:
     * courses
     * href

     Returns: none
     """
-    r = requests.get(f"{BASE_URL}/{href}", timeout=3)
+    href_req = requests.get(f"{BASE_URL}/{href}", timeout=3)
     # The "html.parser" parses pretty badly
-    html = BeautifulSoup(r.content, "lxml")
+    html = BeautifulSoup(href_req.content, "lxml")
     classes_content = html.find("table", width="100%", border="0").find("td")

     # For index idx, contents[idx] corresponds to the html content for the courses in
@@ -229,7 +231,7 @@ def scrape_courses_from_page(courses, href):
         if anchors:
             new_course_nums = [anchor["name"] for anchor in anchors]
             # This means the course listed is a class range (e.g. 11.S196-11.S199)
-            # Therefore, we continue looking for content but also add an extra course_num
+            # Thus, we continue looking for content but also add an extra course_num
             if contents and not contents[-1]:
                 course_nums_list[-1].extend(new_course_nums)
                 continue
@@ -265,8 +267,10 @@ def run():
         print(f"Scraping page: {href}")
         scrape_courses_from_page(courses, href)
     print(f"Got {len(courses)} courses")
-    with open("catalog.json", "w", encoding="utf-8") as f:
-        json.dump(courses, f)
+
+    fname = os.path.join(os.path.dirname(__file__), "catalog.json")
+    with open(fname, "w", encoding="utf-8") as catalog_file:
+        json.dump(courses, catalog_file)


 if __name__ == "__main__":
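The last hunk in `run()` anchors the output path to the module's own directory instead of the current working directory, so `catalog.json` still lands inside `scrapers/` even when the package is launched from the repository root with `python3 -m scrapers`. A minimal sketch of that pattern (the helper name is illustrative, not from this commit):

```python
"""Minimal sketch: write JSON output next to this module, independent of CWD."""
import json
import os.path


def save_json_beside_module(data, name):
    # os.path.dirname(__file__) is the directory containing this module, so the
    # output file is created there no matter where the process was started from.
    fname = os.path.join(os.path.dirname(__file__), name)
    with open(fname, "w", encoding="utf-8") as out_file:
        json.dump(data, out_file)
```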
