forked from karpathy/jobs
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse_occupations.py
More file actions
68 lines (56 loc) · 2.19 KB
/
parse_occupations.py
File metadata and controls
68 lines (56 loc) · 2.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""Parse BLS Occupational Outlook Handbook A-Z index to extract all occupations."""
from bs4 import BeautifulSoup
import json
# Read the saved A-Z index page as raw bytes so BeautifulSoup can detect the
# document's own encoding, rather than decoding with the platform-default
# encoding that a text-mode open() would use.
with open("occupational_outlook_handbook.html", "rb") as f:
    soup = BeautifulSoup(f.read(), "html.parser")
# The occupation listings are inside <div class="a-z-list">.
az_list = soup.find("div", class_="a-z-list")
if az_list is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise SystemExit('Could not find <div class="a-z-list"> in the parsed HTML')

# Each <li> contains either:
#   1. A direct link: <a href="url">Occupation Name</a>
#   2. An alias: <a href="url">Alias</a>, see: <a href="url">Canonical Name</a>
#
# We want the unique canonical occupations (deduplicated by URL).
occupations = {}  # url -> canonical name
aliases = []  # list of (alias_name, canonical_name, url)

for li in az_list.find_all("li"):
    # Only anchors that actually carry an href are usable: an anchor without
    # one would raise KeyError on ["href"] and would shift the alias/canonical
    # positions relied on below.
    links = li.find_all("a", href=True)
    text = li.get_text()

    if ", see:" in text or ", see " in text:
        # This is an alias entry — the second link is the canonical one.
        # Alias-looking entries with fewer than two links are skipped.
        if len(links) >= 2:
            alias_name = links[0].get_text(strip=True)
            canonical_name = links[1].get_text(strip=True)
            url = links[1]["href"]
            aliases.append((alias_name, canonical_name, url))
            # Still register the canonical occupation; the first name seen
            # for a given URL wins.
            occupations.setdefault(url, canonical_name)
    elif links:
        # Direct entry: the first link is the occupation itself.
        name = links[0].get_text(strip=True)
        url = links[0]["href"]
        occupations.setdefault(url, name)
# Order the (url, name) pairs alphabetically by name, ignoring case.
sorted_occupations = list(occupations.items())
sorted_occupations.sort(key=lambda entry: entry[1].lower())

# Report the totals, then preview both ends of the sorted list.
print(f"Total unique occupations: {len(sorted_occupations)}")
print(f"Total aliases (redirects): {len(aliases)}")

print()
print("--- First 20 occupations ---")
for link, title in sorted_occupations[:20]:
    # One call, two output lines: the title followed by its URL.
    print(f" {title}", f" {link}", sep="\n")
print("...")

print()
print("--- Last 10 occupations ---")
for link, title in sorted_occupations[-10:]:
    print(f" {title}", f" {link}", sep="\n")
# Save to JSON for further analysis: one {"title", "url"} record per
# occupation, in the same order as sorted_occupations.
output = [{"title": name, "url": url} for url, name in sorted_occupations]

with open("occupations.json", "w") as f:
    json.dump(output, f, indent=2)

print(f"\nSaved {len(output)} occupations to occupations.json")