
Commit 5b3a30f

dtemkin1 and psvenk authored
Add pylint ci check, make scrapers a package (#129)
Resolves #126
Co-authored-by: Pratyush Venkatakrishnan <[email protected]>
1 parent ff81747 commit 5b3a30f

15 files changed: +208 -152 lines changed


.editorconfig

Lines changed: 1 addition & 1 deletion
@@ -13,4 +13,4 @@ max_line_length = 80

 [*.py]
 indent_size = 4
-max_line_length = 80
+max_line_length = 88

.github/workflows/ci.yml

Lines changed: 19 additions & 0 deletions
@@ -8,6 +8,7 @@ on:

 env:
   node-version: 20.x
+  python-version: 3.7

 jobs:
   black:
@@ -16,6 +17,24 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: psf/black@stable
+  pylint:
+    name: Pylint
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.python-version }}
+          cache: "pip"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install pylint
+      - name: Analysing the code with pylint
+        run: |
+          pylint $(git ls-files '*.py')
   prettier:
     name: Prettier
     runs-on: ubuntu-latest
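The new job lints every Python file tracked by git. To reproduce the check locally before pushing, a rough Python equivalent of the workflow step might look like the sketch below (illustration only; it assumes `pylint` and the packages in `requirements.txt` are already installed in the current environment, and the helper name is not part of this commit):

```python
"""Rough local equivalent of the new pylint CI step (illustrative sketch)."""
import subprocess
import sys


def lint_tracked_python_files() -> int:
    # Mirror `git ls-files '*.py'` from the workflow: lint only tracked Python files.
    files = subprocess.run(
        ["git", "ls-files", "*.py"],
        capture_output=True, text=True, check=True,
    ).stdout.split()
    # Invoke pylint the same way CI does; a nonzero exit code means lint findings.
    return subprocess.run([sys.executable, "-m", "pylint", *files]).returncode


if __name__ == "__main__":
    sys.exit(lint_tracked_python_files())
```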

README.md

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ There's the frontend, which is the website and the interface. Then there's the b

 To spin up the site, we need two steps:

-(1) We need to update the backend to get the data. `cd scrapers` then run `python update.py`.
+(1) We need to update the backend to get the data. Run `python3 -m scrapers`.

 (2) We then can update the frontend, via running `npm run dev`. This will start a developer server. Open a browser tab to [`http://localhost:5173/`](http://localhost:5173/), which will update live as you edit code.


deploy/cron_scripts/update_latest.sh

Lines changed: 2 additions & 2 deletions
@@ -13,7 +13,7 @@ REPO_DIR="/afs/sipb.mit.edu/project/hydrant/hydrant"
 # internet. In the locker, this is ~/web_scripts/hydrant.
 OUT_DIR="/afs/sipb.mit.edu/project/hydrant/web_scripts/hydrant"

-cd "$REPO_DIR/scrapers"
+cd "$REPO_DIR"

 # -q means quietly; don't report anything in stdout or stderr.
 # make sure we're in the right branch:
@@ -22,7 +22,7 @@ git pull -q

 # The scripts machine we use has Python 3.7, so use that.
 # This updates $OUT_FILE.
-python3.7 update.py
+python3.7 -m scrapers
 OUT_FILE="$REPO_DIR/public/*.json"

 # Copy $OUT_FILE to the output directory, so it can be served to the internet.

deploy/web_scripts/notify_build.py

Lines changed: 31 additions & 26 deletions
@@ -1,16 +1,16 @@
 #!/usr/bin/env python3

-# Accept a web-hook from GitHub telling us about
-# a new built version of Hydrant.
+""" Accept a web-hook from GitHub telling us about a new built version of Hydrant. """

-import json, requests, traceback
-
-from sys import stdin, stdout
-from os import environ, path
+import json
+import traceback
 from hmac import digest
-from tempfile import TemporaryDirectory
+from os import environ, path
+from sys import stdin, stdout
 from zipfile import ZipFile

+import requests
+
 LOCKER_DIR = "/afs/sipb.mit.edu/project/hydrant"

 OUTPUT_DIR = path.join(LOCKER_DIR, "web_scripts/hydrant")
@@ -21,13 +21,17 @@
 GITHUB_TOKEN = path.join(CI_SECRETS_DIR, "github_token")


+# pylint: disable=too-many-locals
 def main():
+    """
+    Fetch the artifact from the GitHub API and extract it into the output directory.
+    """
     # Secret, used for HMAC input validation (so we know GitHub is being real)
-    with open(HASH_SECRET) as fh:
-        secret = fh.read().strip().encode("utf-8")
+    with open(HASH_SECRET, encoding="utf-8") as file_hash:
+        secret = file_hash.read().strip().encode("utf-8")
     # API token for GitHub API requests (to get a path to the file).
-    with open(GITHUB_TOKEN) as fh:
-        token = fh.read().strip()
+    with open(GITHUB_TOKEN, encoding="utf-8") as file_token:
+        token = file_token.read().strip()

     # Slurp content and validate with HMAC
     body = stdin.read()
@@ -45,9 +49,8 @@ def main():

     # Fetch a list of artifacts from the GitHub API
     response = requests.get(
-        "https://api.github.com/repos/sipb/hydrant/actions/runs/{}/artifacts".format(
-            job_id
-        )
+        f"https://api.github.com/repos/sipb/hydrant/actions/runs/{job_id}/artifacts",
+        timeout=3,
     )
     if not response.ok:
         raise ValueError("bad artifact fetch response: " + str(response.status_code))
@@ -64,31 +67,33 @@ def main():
         if not url:
             continue
         # then fetch it.
-        response = requests.get(url, headers={"Authorization": ("Bearer " + token)})
+        response = requests.get(
+            url, headers={"Authorization": ("Bearer " + token)}, timeout=3
+        )
         fname = path.join(LOCKER_DIR, "build_artifact.zip")
-        with open(fname, "wb") as fh:
+        with open(fname, "wb") as file_buffer:
             for chunk in response.iter_content(chunk_size=4096):
-                fh.write(chunk)
+                file_buffer.write(chunk)
         # Extract into the output directory.
         with ZipFile(fname, "r") as zfh:
             zfh.extractall(OUTPUT_DIR)
         success = True
         break
-    return (
-        "Fetched artifact successfully"
-        if success
-        else "Could not find artifact among {}: {}".format(
-            len(artifacts), ", ".join(a.get("name") for a in artifacts)
-        )
-    )
+
+    if success:
+        return "Fetched artifact successfully"
+
+    artifact_names = ", ".join(a.get("name") for a in artifacts)
+    return f"Could not find artifact among {len(artifacts)}: {artifact_names}"


 if __name__ == "__main__":
     # Respond to the request, it's only polite.
     print("Content-Type: text/plain\r\n\r")
     try:
         print(main())
+    # pylint: disable=broad-except
     except Exception as e:
         print(traceback.format_exc(), file=stdout)
-        with open(ERROR_LOG, "w") as fh:
-            print(traceback.format_exc(), file=fh)
+        with open(ERROR_LOG, "w", encoding="utf-8") as fe:
+            print(traceback.format_exc(), file=fe)
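These hunks only touch style and robustness (explicit file encodings, request timeouts, flatter return logic); the HMAC validation itself is unchanged and not shown here. For orientation, validating a GitHub webhook signature with `hmac.digest` (which the script imports) generally looks like the sketch below; the function name, header name, and `sha256=` prefix follow GitHub's webhook conventions and are not code from this file:

```python
"""Illustrative sketch of GitHub webhook signature validation with hmac.digest."""
from hmac import compare_digest, digest


def signature_is_valid(secret: bytes, body: bytes, signature_header: str) -> bool:
    # GitHub sends the HMAC-SHA256 hex digest of the raw request body in the
    # X-Hub-Signature-256 header, prefixed with "sha256=".
    expected = "sha256=" + digest(secret, body, "sha256").hex()
    # Constant-time comparison avoids leaking information through timing.
    return compare_digest(expected, signature_header)
```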

pylintrc

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+[format]
+max-line-length = 88

scrapers/.pylintrc

Lines changed: 0 additions & 13 deletions
This file was deleted.

scrapers/README.md

Lines changed: 10 additions & 5 deletions
@@ -3,7 +3,9 @@
 This folder contains several files. The files tracked by git are:

 * `__init__.py`
+* `__main__.py`
 * `catalog.py`
+* `cim.py`
 * `fireroad.py`
 * `math_dept.py`
 * `package.py`
@@ -16,13 +18,15 @@ This folder contains several files. The files tracked by git are:
 The files intentionally left out of git are:

 * `catalog.json`
+* `cim.json`
 * `fireroad.json`
+* `fireroad-presem.json`
 * `__pycache__`
 * `.DS_Store`

 ## Usage ##

-Run `python3 update.py` to execute the code. In production, there is a cron job that runs this every hour.
+Run `python3 -m scrapers` from the root directory to execute the code. In production, there is a cron job that runs this every hour.

 This program gets its data from MIT classes from two sources:

@@ -33,16 +37,17 @@ It is mainly intended to serve as a data source for the frontend, which is the r

 ## How it works ##

-`update.py` calls three other programs, in this order: `fireroad.py`, `catalog.py`, `package.py`. Each of these four files has a `run()` function, which is its main entry point to the codebase. Broadly speaking:
+`__main__.py` calls four other programs, in this order: `fireroad.py`, `catalog.py`, `cim.py`, `package.py`. Each of these four files has a `run()` function, which is its main entry point to the codebase. Broadly speaking:

-* `fireroad.py` creates `fireroad.json`
+* `fireroad.py` creates `fireroad.json` and `fireroad-presem.json`
 * `catalog.py` creates `catalog.json`
-* `package.py` combines these to create `../public/latest.json`. (This is the final product that our frontend ingests.)
+* `cim.py` creates `cim.json`
+* `package.py` combines these to create `../public/latest.json` and another JSON file under `../public/` that corresponds to IAP or summer. (This is the final product that our frontend ingests.)

 `math_dept.py` is an irregularly run file that helps create override data for courses in the MIT math department (since those are formatted slightly differently). `utils.py` contains a few utility functions and variables, which in turn are used by `fireroad.py` and `package.py`. The file `__init__.py` is empty but we include it anyways for completeness.

 ## Contributing ##

 This folder is actually a subfolder of a larger git repository. If you want to contribute to this repository, submit a pull request to https://github.com/sipb/hydrant and we'll merge it if it looks good.

-Depending on how you work, you might find `pylint` and/or running individual programs one at a time and then playing around with the Python shell to be helpful.
+Depending on how you work, you might find `pylint` and/or running individual programs one at a time and then playing around with the Python shell to be helpful.

scrapers/update.py renamed to scrapers/__main__.py

Lines changed: 10 additions & 10 deletions
@@ -1,32 +1,32 @@
 """
-This is the entry point. Run `python3 update.py` to test this code.
+This is the entry point. Run `python3 -m scrapers` to test this code.

 In production, there's a cron job that runs this script every hour.

 Functions:
 * run()
 """

-import fireroad
-import catalog
-import cim
-import package
+from .fireroad import run as fireroad_run
+from .catalog import run as catalog_run
+from .cim import run as cim_run
+from .package import run as package_run


 def run():
     """
     This function is the entry point. There are no arguments.
     """
     print("=== Update fireroad data (pre-semester) ===")
-    fireroad.run(False)
+    fireroad_run(False)
     print("=== Update fireroad data (semester) ===")
-    fireroad.run(True)
+    fireroad_run(True)
     print("=== Update catalog data ===")
-    catalog.run()
+    catalog_run()
     print("=== Update CI-M data ===")
-    cim.run()
+    cim_run()
     print("=== Packaging ===")
-    package.run()
+    package_run()


 if __name__ == "__main__":
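Because the imports in `__main__.py` are now relative, the scrapers have to be run as a package: invoking the file directly (`python3 scrapers/__main__.py`) would fail with `ImportError: attempted relative import with no known parent package`. As a standard-library sketch of roughly what `python3 -m scrapers` does instead (illustration only, not part of this commit):

```python
"""Roughly what `python3 -m scrapers` does under the hood (illustrative)."""
import runpy

# Locate the `scrapers` package on sys.path, import it, then execute
# scrapers/__main__.py with __name__ set to "__main__", which triggers the
# `if __name__ == "__main__":` guard at the bottom of the module and calls run().
runpy.run_module("scrapers", run_name="__main__", alter_sys=True)
```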

scrapers/catalog.py

Lines changed: 18 additions & 14 deletions
@@ -16,7 +16,9 @@
 """

 import json
+import os.path
 import re
+
 import requests
 from bs4 import BeautifulSoup, Tag

@@ -105,7 +107,7 @@ def get_half(html):
     """
     if html.find(text=re.compile("first half of term")):
         return 1
-    elif html.find(text=re.compile("second half of term")):
+    if html.find(text=re.compile("second half of term")):
         return 2
     return False

@@ -118,7 +120,6 @@ def is_limited(html):
     Returns:
     * bool: True if enrollment in the class is limited
     """
-    # TODO: can we do better?
     if html.find(text=re.compile("[Ll]imited")):
         return True
     return False
@@ -149,8 +150,8 @@ def get_home_catalog_links():
     Returns:
     * list[str]: relative links to major-specific subpages to scrape
     """
-    r = requests.get(BASE_URL + "/index.cgi", timeout=3)
-    html = BeautifulSoup(r.content, "html.parser")
+    catalog_req = requests.get(BASE_URL + "/index.cgi", timeout=3)
+    html = BeautifulSoup(catalog_req.content, "html.parser")
     home_list = html.select_one("td[valign=top][align=left] > ul")
     return [a["href"] for a in home_list.find_all("a", href=True)]

@@ -166,12 +167,12 @@ def get_all_catalog_links(initial_hrefs):
     * list[str]: A more complete list of relative links to subpages to scrape
     """
     hrefs = []
-    for il in initial_hrefs:
-        r = requests.get(f"{BASE_URL}/{il}", timeout=3)
-        html = BeautifulSoup(r.content, "html.parser")
+    for initial_href in initial_hrefs:
+        href_req = requests.get(f"{BASE_URL}/{initial_href}", timeout=3)
+        html = BeautifulSoup(href_req.content, "html.parser")
         # Links should be in the only table in the #contentmini div
         tables = html.find("div", id="contentmini").find_all("table")
-        hrefs.append(il)
+        hrefs.append(initial_href)
         for table in tables:
             hrefs.extend([ele["href"] for ele in table.findAll("a", href=True)])
     return hrefs
@@ -206,17 +207,18 @@ def get_anchors_with_classname(element):
 def scrape_courses_from_page(courses, href):
     """
     Fills courses with course data from the href
-    (This function does NOT return a value. Instead, it modifies the `courses` variable.)
+
+    This function does NOT return a value. Instead, it modifies the `courses` variable.

     Args:
     * courses
     * href

     Returns: none
     """
-    r = requests.get(f"{BASE_URL}/{href}", timeout=3)
+    href_req = requests.get(f"{BASE_URL}/{href}", timeout=3)
     # The "html.parser" parses pretty badly
-    html = BeautifulSoup(r.content, "lxml")
+    html = BeautifulSoup(href_req.content, "lxml")
     classes_content = html.find("table", width="100%", border="0").find("td")

     # For index idx, contents[idx] corresponds to the html content for the courses in
@@ -229,7 +231,7 @@ def scrape_courses_from_page(courses, href):
         if anchors:
             new_course_nums = [anchor["name"] for anchor in anchors]
             # This means the course listed is a class range (e.g. 11.S196-11.S199)
-            # Therefore, we continue looking for content but also add an extra course_num
+            # Thus, we continue looking for content but also add an extra course_num
             if contents and not contents[-1]:
                 course_nums_list[-1].extend(new_course_nums)
                 continue
@@ -265,8 +267,10 @@ def run():
         print(f"Scraping page: {href}")
         scrape_courses_from_page(courses, href)
     print(f"Got {len(courses)} courses")
-    with open("catalog.json", "w", encoding="utf-8") as f:
-        json.dump(courses, f)
+
+    fname = os.path.join(os.path.dirname(__file__), "catalog.json")
+    with open(fname, "w", encoding="utf-8") as catalog_file:
+        json.dump(courses, catalog_file)


 if __name__ == "__main__":
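The last hunk in `run()` anchors the output path to the module's own directory instead of the current working directory, so `catalog.json` still lands inside `scrapers/` even when the package is launched from the repository root with `python3 -m scrapers`. A minimal sketch of that pattern (the helper name is illustrative, not from this commit):

```python
"""Minimal sketch: write JSON output next to this module, independent of CWD."""
import json
import os.path


def save_json_beside_module(data, name):
    # os.path.dirname(__file__) is the directory containing this module, so the
    # output file is created there no matter where the process was started from.
    fname = os.path.join(os.path.dirname(__file__), name)
    with open(fname, "w", encoding="utf-8") as out_file:
        json.dump(data, out_file)
```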
