jobs/parse_occupations.py at master · NathanWCarlson/jobs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""Parse BLS Occupational Outlook Handbook A-Z index to extract all occupations."""

from bs4 import BeautifulSoup
import json

# Read the saved A-Z index page as raw bytes so that Beautiful Soup detects
# the document's encoding itself, instead of relying on the platform's default
# text encoding (which varies between systems).
with open("occupational_outlook_handbook.html", "rb") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

# The occupation listings are inside <div class="a-z-list">
az_list = soup.find("div", class_="a-z-list")
if az_list is None:
    # find() returns None when nothing matches; stop here with a clear message
    # rather than failing later with an AttributeError on None.
    raise SystemExit(
        'No <div class="a-z-list"> found in occupational_outlook_handbook.html'
    )

# Each <li> contains either:
# 1. A direct link: <a href="url">Occupation Name</a>
# 2. An alias: <a href="url">Alias</a>, see: <a href="url">Canonical Name</a>
#
# We want the unique canonical occupations (deduplicated by URL).

occupations = {}  # url -> canonical name (first name seen for a URL wins)
aliases = []  # list of (alias_name, canonical_name, url)

for li in az_list.find_all("li"):
    # Only consider anchors that actually carry an href. Bare anchors such as
    # <a name="..."> would otherwise raise KeyError on ["href"] or be mistaken
    # for the occupation link.
    links = li.find_all("a", href=True)
    text = li.get_text()

    if ", see:" in text or ", see " in text:
        # Alias entry: the first link is the alias, the second the canonical
        # occupation. Entries with fewer than two links are skipped.
        if len(links) < 2:
            continue
        alias_name = links[0].get_text(strip=True)
        canonical_name = links[1].get_text(strip=True)
        url = links[1]["href"]
        aliases.append((alias_name, canonical_name, url))
        # Still register the canonical occupation, keeping any earlier name.
        occupations.setdefault(url, canonical_name)
    elif links:
        # Direct entry: the first link is the occupation itself.
        name = links[0].get_text(strip=True)
        url = links[0]["href"]
        occupations.setdefault(url, name)

# Order the (url, name) pairs alphabetically by name, ignoring case.
sorted_occupations = list(occupations.items())
sorted_occupations.sort(key=lambda pair: pair[1].lower())

# Slices used for the preview printed below.
leading_entries = sorted_occupations[:20]
trailing_entries = sorted_occupations[-10:]

# Summary counts, followed by a preview of both ends of the sorted list.
print(f"Total unique occupations: {len(sorted_occupations)}")
print(f"Total aliases (redirects): {len(aliases)}")
print()

print("--- First 20 occupations ---")
for url, name in leading_entries:
    print(f"  {name}")
    print(f"    {url}")
print("...")
print()

print("--- Last 10 occupations ---")
for url, name in trailing_entries:
    print(f"  {name}")
    print(f"    {url}")

# Write the sorted occupations to occupations.json as a list of
# {"title", "url"} records for further analysis.
output = [{"title": name, "url": url} for url, name in sorted_occupations]

with open("occupations.json", "w") as json_file:
    json.dump(output, json_file, indent=2)

print(f"\nSaved {len(output)} occupations to occupations.json")