#!/usr/bin/python3
import requests, bs4, re
import random, datetime
from fractions import Fraction

# One shared Session so cookies/connection state persist across calls
SESSION = requests.Session()


def get_by_index(n):
    # Download an RPI Directory page by index
    # Note: entries span n = 1 .. 10532
    # Note: generated 9703 the first time
    # A shared requests Session is used to preserve session info across calls
    url = "http://prod3.server.rpi.edu/peopledirectory/entry.do"
    params = {
        'datasetName': 'directory',
        'key': str(n)
    }
    return SESSION.get(url, params=params)
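
# e.g. get_by_index(42) requests (query string built by requests from params):
# http://prod3.server.rpi.edu/peopledirectory/entry.do?datasetName=directory&key=42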

def parse(html):
    # Convert RPI Directory HTML into a dict of fields and values
    # It's a little wonky, but the page formatting doesn't really lend itself to parsing
    soup = bs4.BeautifulSoup(html, "lxml")
    data = str(soup.find_all(attrs={'id': 'singleDirectoryEntry'}))
    data = data.replace('&amp;', '&')  # html.unescape would be the more general fix
    w = re.sub(r'<.*?>', ' ', data)  # remove tags
    x = re.split(r"\n | :", w)  # tokenize
    y = [i.strip()
         for i in x
         if len(i.strip()) > 1]  # remove garbage
    # convert to dict
    last = ''
    d = {}
    for i in y:
        if i[-1] == ':':
            last = i[:-1]
            d[last] = ''
        elif last in d:
            d[last] += (i + " ")
    for i in d:
        d[i] = d[i].strip()
    return d
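
# Sketch of the intended mapping (hypothetical page text): tokens like
# ['Name:', 'Jane', 'Doe', 'Class:', 'Senior'] become
# {'Name': 'Jane Doe', 'Class': 'Senior'} -- a trailing-colon token opens a
# field, and following tokens are appended to its value.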

def extract(data):
    # Extract relevant info from the dict of directory data
    # If the page is a student's, should return [name, 1, Major, Year]
    # Otherwise, should return [name, 0, Position, Department]
    result = ['?', -1, '?', '?']
    result[0] = data.get('Name')
    if 'Class' in data:
        result[1] = 1
        result[2] = data.get('Curriculum')
        result[3] = data.get('Class')
    else:
        result[1] = 0
        result[2] = data.get('Title')
        result[3] = data.get('Department')
    return result
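
# e.g. (hypothetical entry) extract({'Name': 'Jane Doe', 'Class': 'Senior',
# 'Curriculum': 'MATH'}) -> ['Jane Doe', 1, 'MATH', 'Senior']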

def fetch(n):
    # Get the list of details based only on the index
    r = get_by_index(n)
    data = parse(r.text)
    details = extract(data)
    return details

def write_data(ratios, fout):
    # Write to a CSV file delimited by tabs/newlines
    rename = {0: 'Non-student', 1: 'Student'}
    skip = ['', None]
    with open(fout, 'w') as f:
        f.write('Field\tRating\tEntries\tP(Woman)\n')
        # iterate through the dict, sorted by number of entries, from most to least
        for category, result in sorted(ratios.items(), key=lambda x: x[1][1], reverse=True):
            if category in skip or result[1] <= 5:
                # limit output to categories with actual data
                continue
            s = str(rename.get(category) or category) + '\t'
            s += str(result[0]) + '\t'
            s += str(result[1]) + '\t'
            s += str(result[0] / result[1]) + '\n'
            f.write(s)
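
# Example row (tab-separated, hypothetical numbers):
# Student\t512.3\t1024\t0.5003
# i.e. P(Woman) is the summed per-name probability divided by the entry count.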

def names_to_probs(years, folder=''):
    # Calculate the gender breakdown of each first name based on SSA data
    # arg 1 is a list of the filenames to search (e.g. "yob1946.txt")
    # arg 2 is an optional folder those files reside in (including a trailing slash)
    # more data from https://www.ssa.gov/oact/babynames/names.zip
    # wget https://www.ssa.gov/oact/babynames/names.zip && unzip names.zip -d ss_names
    # 1980-1999 returns 58k name records
    m_ = {}  # overall averages
    f_ = {}
    t_ = 0
    for y in years:
        # yearly data
        (f, m, t) = names_dict(folder + str(y))
        for i, j in f.items():
            prev = f_.get(i) or 0
            f_[i] = prev + Fraction(j, t)
        for i, j in m.items():
            prev = m_.get(i) or 0
            m_[i] = prev + Fraction(j, t)
        t_ += t
    print('Total records: ', t_)
    p_female = {}
    for name in set(list(m_) + list(f_)):
        m = (m_.get(name) or 0)  # P(name | man), summed over years
        f = (f_.get(name) or 0)
        p_female[name] = float(f / (m + f))
    return p_female
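
# Worked example with made-up counts: if 1995 has MARY 40 of 100 births and
# 1996 has MARY 30 of 200, then f_['MARY'] = 40/100 + 30/200 = 11/20; the
# final p_female['MARY'] is that sum divided by the male + female sums.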

def names_dict(fn):
    # Parse one SSA name-data file
    # Returns dicts of {name: count} for females and males, plus the total people
    m = {}
    f = {}
    total = 0
    with open(fn, 'r') as fin:
        for l in fin:
            line = l.split(',')
            name = line[0].upper()
            count = int(line[2])
            sex = line[1].upper()
            if sex == 'F':
                f[name] = count
                total += count
            elif sex == 'M':
                m[name] = count
                total += count
            else:
                print('ERROR: UNKNOWN LINE: <' + l + '>')
    return (f, m, total)
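
# Each yobXXXX.txt line is "name,sex,count", e.g. "Mary,F,7065", so
# names_dict('ss_names/yob1995.txt') returns something like
# ({'MARY': 7065, ...}, {'JOHN': 6890, ...}, <total people counted>)
# (the counts shown here are illustrative, not real SSA figures).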

def lookup_sex(people, names):
    # Calculate the probability that someone with a given name is female; populate a dict of probabilities
    # Names not found are also tracked, for testing purposes
    # Probabilities are expressed as the ratio Women:All (i.e. P(random person in this category is female))
    # 0 = male-dominated, 1 = female-dominated, .5 = even
    # Return value is a tuple of the probability dict and the list of absent names
    d = {}
    # d = dictionary of gender probabilities
    #   key = field (e.g. 'Math' or 'Administrator' or 'Sophomore')
    #   val = list: [summed probability, number of data points (i.e. names found in the database)]
    not_found = []
    for person in people:
        first_name = person[0].split()[0]
        sex_p = names.get(first_name.upper())
        if sex_p is None:
            not_found.append((first_name, person[0]))
            continue
        for x in person[1:]:
            old = d.get(x) or [0, 0]
            old[0] += sex_p
            old[1] += 1
            d[x] = old
    return (d, not_found)
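
# Sketch with hypothetical data: for people = [['Mary Smith', 1, 'MATH', 'Senior']]
# and names = {'MARY': 0.99}, d gains 0.99 (count 1) under each of the keys
# 1, 'MATH', and 'Senior'; write_data later divides the sum by the count.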

def main():
    start = datetime.datetime.now()
    people = []
    for n in range(1, 10533):  # entries span 1 .. 10532; range's end is exclusive
        # for n in [random.randint(1, 10532) for i in range(100)]:
        # people.append(fetch(n))
        result = fetch(n)
        if result[0]:  # only append results with a name
            people.append(result)
    print("Time Elapsed: ", datetime.datetime.now() - start)
    print("People found: ", len(people))
    files = ['yob' + str(i) + '.txt' for i in range(1993, 1998)]  # name files pertinent to undergrads
    names = names_to_probs(files, 'ss_names/')
    print('Names on record: ', len(names))
    ratios = lookup_sex(people, names)
    print('Names not found: ', len(ratios[1]))
    write_data(ratios[0], 'results.csv')
    # return (people, names, ratios[0], ratios[1])


if __name__ == "__main__":
    main()