-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_artists.py
68 lines (45 loc) · 1.42 KB
/
check_artists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import requests
import re
import json
import time
def has_artist(artist):
    """Report whether kworb.net serves an iTunes artist page for ``artist``.

    Args:
        artist: The artist identifier interpolated into the page URL
            ``https://kworb.net/itunes/artist/{artist}.html``.

    Returns:
        True when the page request succeeds. False when the server answers
        404 (a "Missing" line is printed), or when the request fails or
        returns another error status (an "Error finding" line is printed).
    """
    url = f"https://kworb.net/itunes/artist/{artist}.html"
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        print(f"[!] Error finding {artist}")
        return False

    if response.status_code == 404:
        # 404 is the "no such page" answer, i.e. the artist is missing.
        print(f"[!] Missing {artist}")
        return False
    if not response.ok:
        # Any other 4xx/5xx tells us nothing about presence; report an error.
        print(f"[!] Error finding {artist}")
        return False
    return True
def process_name(name):
    """Normalize ``name`` to a lowercase string of ASCII letters and digits.

    The name is lowercased, spaces are removed, and every remaining
    character outside ``a-z`` / ``0-9`` is dropped.
    """
    lowered = name.lower()
    without_spaces = lowered.replace(" ", "")
    # Strip everything that is not a lowercase ASCII letter or a digit.
    return re.sub(r"[^a-z0-9]", "", without_spaces)
def get_artists(input_file) -> dict:
    """Load the artist names found in a JSON file and normalize each one.

    The JSON document is read as a mapping whose values are iterables of
    two-item ``(song, artist)`` entries; the mapping keys are iterated as
    ``year`` in the original code.

    Args:
        input_file: Path of the JSON file to read.

    Returns:
        A dict mapping each distinct artist name, as written in the file,
        to the result of ``process_name`` for that name.
    """
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(input_file, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    # Keying by artist name collapses repeats across songs and years.
    artists = {
        artist: process_name(artist)
        for entries in data.values()
        for _song, artist in entries
    }
    print(f"[*] Total artists: {len(artists)}")
    return artists
def find_kworb_missing_artists(artists: dict):
    """Collect the artists for which ``has_artist`` reports no page.

    Args:
        artists: Mapping of original artist name to its processed
            (normalized) form, as built by ``get_artists``.

    Returns:
        A set of the original artist names whose processed form was not
        found by ``has_artist``.
    """
    missing_artists = set()
    for name, processed in artists.items():
        # Look up the processed form (the value), not the raw display name;
        # the raw name may contain spaces, capitals and punctuation.
        if not has_artist(processed):
            missing_artists.add(name)
    return missing_artists
if __name__ == "__main__":
    # Script entry point: load the artists from the scraped data file,
    # check each one, then print the missing set and its size.
    source_path = "data/scraped/cleaned_data.json"
    missing_artists = find_kworb_missing_artists(get_artists(source_path))
    print(missing_artists)
    print(f"Missing artists: {len(missing_artists)}")