forked from karpathy/jobs
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.py
More file actions
99 lines (76 loc) · 3.34 KB
/
scrape.py
File metadata and controls
99 lines (76 loc) · 3.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""
Scrape BLS Occupational Outlook Handbook detail pages (raw HTML).
Saves raw HTML to html/<slug>.html as the source of truth.
Run process.py afterwards to derive data/<slug>.json and pages/<slug>.md.
Usage:
uv run python scrape.py # scrape all (0 to 342)
uv run python scrape.py --start 0 --end 5 # scrape first 5
uv run python scrape.py --start 10 --end 20 # scrape indices 10-19
uv run python scrape.py --force # re-scrape ignoring cache
Caching: skips any occupation where html/<slug>.html already exists.
"""
import argparse
import json
import os
import time
from playwright.sync_api import sync_playwright
# ---------------------------------------------------------------------------
# Main scraper
# ---------------------------------------------------------------------------
def main():
    """Scrape BLS OOH detail pages and cache raw HTML under html/<slug>.html.

    Reads the master list from occupations.json, skips any occupation whose
    HTML file already exists (unless --force), then fetches the rest with a
    visible Chromium instance, sleeping --delay seconds between requests.
    The saved HTML is the source of truth; process.py derives data/ and
    pages/ from it afterwards.
    """
    parser = argparse.ArgumentParser(description="Scrape BLS OOH pages")
    parser.add_argument("--start", type=int, default=0, help="Start index (inclusive)")
    parser.add_argument("--end", type=int, default=None, help="End index (exclusive)")
    parser.add_argument("--force", action="store_true", help="Re-scrape even if cached")
    parser.add_argument("--delay", type=float, default=1.0, help="Seconds between requests")
    args = parser.parse_args()

    # Load master list of {slug, title, url} records; explicit UTF-8 so the
    # read is platform-independent.
    with open("occupations.json", encoding="utf-8") as f:
        occupations = json.load(f)
    end = args.end if args.end is not None else len(occupations)
    subset = occupations[args.start:end]

    # Create output dirs (data/ and pages/ are filled in later by process.py).
    os.makedirs("html", exist_ok=True)
    os.makedirs("data", exist_ok=True)
    os.makedirs("pages", exist_ok=True)

    # Figure out what needs scraping (cache is keyed on html/<slug>.html existence)
    to_scrape = []
    for i, occ in enumerate(subset, start=args.start):
        html_path = f"html/{occ['slug']}.html"
        if not args.force and os.path.exists(html_path):
            print(f" [{i}] CACHED {occ['title']}")
            continue
        to_scrape.append((i, occ))
    if not to_scrape:
        print("Nothing to scrape — all cached.")
        return
    print(f"\nScraping {len(to_scrape)} occupations (non-headless Chromium)...\n")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        try:
            page = browser.new_page()
            for idx, (i, occ) in enumerate(to_scrape):
                slug = occ["slug"]
                url = occ["url"]
                html_path = f"html/{slug}.html"
                print(f" [{i}] {occ['title']}...", end=" ", flush=True)
                try:
                    resp = page.goto(url, wait_until="domcontentloaded", timeout=15000)
                    # page.goto() may return None (e.g. same-document
                    # navigation); treat that like a failed fetch instead of
                    # letting resp.status raise AttributeError.
                    if resp is None or resp.status != 200:
                        status = "NO RESPONSE" if resp is None else f"HTTP {resp.status}"
                        print(f"{status} — SKIPPED")
                        continue
                    # Save raw HTML — this is the source of truth.
                    html = page.content()
                    # Explicit UTF-8: the platform-default encoding (e.g.
                    # cp1252 on Windows) cannot represent typical page text.
                    with open(html_path, "w", encoding="utf-8") as f:
                        f.write(html)
                    print(f"OK ({len(html):,} bytes)")
                except Exception as e:
                    # Best-effort scrape: log the failure and move on.
                    print(f"ERROR: {e}")
                # Be polite: pause between requests (not after the last one).
                if idx < len(to_scrape) - 1:
                    time.sleep(args.delay)
        finally:
            # Close the browser even if an unexpected error (or Ctrl-C)
            # escapes the scrape loop.
            browser.close()

    # Summary counts every cached HTML file, not just this run's downloads.
    cached = len([f for f in os.listdir("html") if f.endswith(".html")])
    print(f"\nDone. {cached}/{len(occupations)} HTML files cached in html/")
# Script entry point: run the scraper only when executed directly.
if __name__ == "__main__":
    main()