forked from karpathy/jobs
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.py
More file actions
99 lines (76 loc) · 3.34 KB
/
scrape.py
File metadata and controls
99 lines (76 loc) · 3.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""
Scrape BLS Occupational Outlook Handbook detail pages (raw HTML).
Saves raw HTML to html/<slug>.html as the source of truth.
Run process.py afterwards to derive data/<slug>.json and pages/<slug>.md.
Usage:
uv run python scrape.py # scrape all (0 to 342)
uv run python scrape.py --start 0 --end 5 # scrape first 5
uv run python scrape.py --start 10 --end 20 # scrape indices 10-19
uv run python scrape.py --force # re-scrape ignoring cache
Caching: skips any occupation where html/<slug>.html already exists.
"""
import argparse
import json
import os
import time
from playwright.sync_api import sync_playwright
# ---------------------------------------------------------------------------
# Main scraper
# ---------------------------------------------------------------------------
def main():
    """Scrape BLS OOH detail pages and cache raw HTML under html/<slug>.html.

    Reads the master list from occupations.json, skips any occupation whose
    HTML file already exists (unless --force), then fetches the rest with a
    visible Chromium instance, sleeping --delay seconds between requests.
    The saved HTML is the source of truth; process.py derives data/ and
    pages/ from it afterwards.
    """
    parser = argparse.ArgumentParser(description="Scrape BLS OOH pages")
    parser.add_argument("--start", type=int, default=0, help="Start index (inclusive)")
    parser.add_argument("--end", type=int, default=None, help="End index (exclusive)")
    parser.add_argument("--force", action="store_true", help="Re-scrape even if cached")
    parser.add_argument("--delay", type=float, default=1.0, help="Seconds between requests")
    args = parser.parse_args()

    # Load master list of {slug, title, url} records; explicit UTF-8 so the
    # read is platform-independent.
    with open("occupations.json", encoding="utf-8") as f:
        occupations = json.load(f)
    end = args.end if args.end is not None else len(occupations)
    subset = occupations[args.start:end]

    # Create output dirs (data/ and pages/ are filled in later by process.py).
    os.makedirs("html", exist_ok=True)
    os.makedirs("data", exist_ok=True)
    os.makedirs("pages", exist_ok=True)

    # Figure out what needs scraping (cache is keyed on html/<slug>.html existence)
    to_scrape = []
    for i, occ in enumerate(subset, start=args.start):
        html_path = f"html/{occ['slug']}.html"
        if not args.force and os.path.exists(html_path):
            print(f" [{i}] CACHED {occ['title']}")
            continue
        to_scrape.append((i, occ))
    if not to_scrape:
        print("Nothing to scrape — all cached.")
        return
    print(f"\nScraping {len(to_scrape)} occupations (non-headless Chromium)...\n")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        try:
            page = browser.new_page()
            for idx, (i, occ) in enumerate(to_scrape):
                slug = occ["slug"]
                url = occ["url"]
                html_path = f"html/{slug}.html"
                print(f" [{i}] {occ['title']}...", end=" ", flush=True)
                try:
                    resp = page.goto(url, wait_until="domcontentloaded", timeout=15000)
                    # page.goto() may return None (e.g. same-document
                    # navigation); treat that like a failed fetch instead of
                    # letting resp.status raise AttributeError.
                    if resp is None or resp.status != 200:
                        status = "NO RESPONSE" if resp is None else f"HTTP {resp.status}"
                        print(f"{status} — SKIPPED")
                        continue
                    # Save raw HTML — this is the source of truth.
                    html = page.content()
                    # Explicit UTF-8: the platform-default encoding (e.g.
                    # cp1252 on Windows) cannot represent typical page text.
                    with open(html_path, "w", encoding="utf-8") as f:
                        f.write(html)
                    print(f"OK ({len(html):,} bytes)")
                except Exception as e:
                    # Best-effort scrape: log the failure and move on.
                    print(f"ERROR: {e}")
                # Be polite: pause between requests (not after the last one).
                if idx < len(to_scrape) - 1:
                    time.sleep(args.delay)
        finally:
            # Close the browser even if an unexpected error (or Ctrl-C)
            # escapes the scrape loop.
            browser.close()

    # Summary counts every cached HTML file, not just this run's downloads.
    cached = len([f for f in os.listdir("html") if f.endswith(".html")])
    print(f"\nDone. {cached}/{len(occupations)} HTML files cached in html/")
# Script entry point: run the scraper only when executed directly.
if __name__ == "__main__":
    main()