Skip to content

Commit 69ca5d4

Browse files
committed
(#87)(skip-ci)(refactor gender inference codes)
1 parent 14fcc6b commit 69ca5d4

3 files changed

Lines changed: 58 additions & 27 deletions

File tree

src/genderize/c2g.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# load the labeled dataset and generate {id_name: (gender, accuracy)}
2+
# this is needed only because in the old code (labelDataset.py -> main.py), we
3+
# 1) extract names from datasets,
4+
# 2) do api call to genderize
5+
# 3) update the dataset with the gender labels
6+
7+
# In the yet-to-be-refactored code, in the future, we don't need to update/touch the dataset. We simply do
8+
# 1) (same as before) extract id_names from datasets,
9+
# 2) (same as before) do api call to genderize
10+
# 3) generate {id_name: (gender, accuracy)}
11+
12+
# datasets:
13+
# dblp: for all experts (authors)
14+
# imdb: for cast'n'crew with missing labels, like directors; based on actor/actress professions, we already know the gender for some experts
15+
# uspt: none. all experts are labeled in original dataset
16+
17+
import csv, pickle
18+
19+
def imdb_extract_gender_dict(tsv_path, output_dir):
    """Read the gender-labeled imdb name.basics tsv and pickle {id_name: (isfemale, accuracy)}.

    tsv_path: tab-separated file with a header row; columns used are
        0: 'nm'-prefixed imdb id, 1: full name, 2: gender label 'M'/'F'/'',
        3: genderize probability (may be empty), 6: primary profession.
    output_dir: directory prefix (with trailing separator) where c2g.pkl is written.
    """
    genders = {}
    with open(tsv_path, newline='', encoding='utf-8') as tsv:
        rows = csv.reader(tsv, delimiter='\t', quotechar='"')
        next(rows, None)  # drop the header row
        for record in rows:
            numeric_id = int(record[0].replace('nm', ''))  # e.g. nm0000003 -> 3
            name_key = f"{numeric_id}_{record[1].lower().replace(' ', '_')}"  # e.g. Brigitte Bardot -> brigitte_bardot
            label = record[2].strip()  # 'M' or 'F' or ''
            confidence = int(float(record[3]) * 100) if record[3] else 0  # probability column may be empty
            job = record[6].lower()

            # profession is the most reliable signal; fall back to the api gender label
            if 'actress' in job:
                entry = (True, 100)
            elif 'actor' in job:
                entry = (False, 100)
            elif label:
                entry = (label == 'F', confidence)
            else:
                entry = (None, 0)

            genders[name_key] = entry
            print(f'{record} --> {name_key} --> {entry}')

    with open(f'{output_dir}c2g.pkl', 'wb') as out:
        pickle.dump(genders, out, protocol=pickle.HIGHEST_PROTOCOL)
41+
42+
# Guard the script entry point so importing this module has no side effects
# (the bare call used to run the full extraction on every import).
if __name__ == '__main__':
    imdb_extract_gender_dict('../../output/imdb/title.basics.tsv/name.basics.tsv.gender.tsv', '../../output/imdb/title.basics.tsv/')

src/genderize/i2g.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,31 +3,32 @@
33
def generate_i2g_and_female_csv(c2g_file, c2i_file, output_dir):
    """
    Generate i2g mapping (col_index: (isfemale, acc)) and save csv with column indexes where isfemale==True.
    - c2g_file: pickle file for expert's idname -> gender (idname: (isfemale, acc)), ideally the superset and includes for all experts
    - c2i_file: pickle file for index.pkl in opentf that has (idname: index), ideally subset including the ones after some filterings
    - output_dir: directory prefix (with trailing separator) that receives i2g.pkl and females.csv
    """
    with open(c2g_file, 'rb') as f: c2g = pickle.load(f)
    with open(c2i_file, 'rb') as f: c2i = pickle.load(f)['c2i']

    # normalize ids like '12.0_name' to '12_name' (xxx.0 bug); correct int ids pass through unchanged
    def _normalize(idname):
        head, *rest = idname.split('_')
        return '_'.join([str(int(float(head)))] + rest)

    i2g = {}; missing_ids = []
    for idname, col_idx in c2i.items():
        key = _normalize(idname)  # normalize once per idname instead of twice
        try: i2g[col_idx] = c2g[key]
        except KeyError: missing_ids.append(key)
    # bug fix: the condition was inverted ('if not missing_ids'), so the report
    # only printed when NOTHING was missing; warn when something IS missing
    if missing_ids: print(f'The following idnames in c2i are missing in c2g: {missing_ids}')
    with open(f'{output_dir}i2g.pkl', 'wb') as f: pickle.dump(i2g, f)

    # column indexes whose expert is labeled female, sorted for a stable csv
    female_columns = sorted(idx for idx, (isfemale, acc) in i2g.items() if isfemale is True)
    pd.DataFrame(female_columns, columns=['teamsvecs-females-col-idx']).to_csv(f'{output_dir}females.csv', index=False)
2223

2324
# generate_i2g_and_female_csv(c2g_file='../../output/dblp/toy.dblp.v12.json/c2g.pkl', c2i_file='../../output/dblp/toy.dblp.v12.json/indexes.pkl', output_dir='../../output/dblp/toy.dblp.v12.json/')
24-
# generate_i2g_and_female_csv(c2g_file='../../output/dblp/dblp.v12.json.mt10.ts2/c2g.pkl', c2i_file='../../output/dblp/dblp.v12.json.mt10.ts2/indexes.pkl', output_dir='../../output/dblp/dblp.v12.json.mt10.ts2/')
25+
# generate_i2g_and_female_csv(c2g_file='../../output/dblp/dblp.v12.json/c2g.pkl', c2i_file='../../output/dblp/dblp.v12.json.mt10.ts2/indexes.pkl', output_dir='../../output/dblp/dblp.v12.json.mt10.ts2/')
2526
# generate_i2g_and_female_csv(c2g_file='../../output/dblp/dblp.v12.json/c2g.pkl', c2i_file='../../output/dblp/dblp.v12.json/indexes.pkl', output_dir='../../output/dblp/dblp.v12.json/')
2627
#
2728
# generate_i2g_and_female_csv(c2g_file='../../output/imdb/toy.title.basics.tsv/c2g.pkl', c2i_file='../../output/imdb/toy.title.basics.tsv/indexes.pkl', output_dir='../../output/imdb/toy.title.basics.tsv/')
28-
# generate_i2g_and_female_csv(c2g_file='../../output/imdb/title.basics.tsv.mt10.ts2/c2g.pkl', c2i_file='../../output/imdb/title.basics.tsv.mt10.ts2/indexes.pkl', output_dir='../../output/imdb/title.basics.tsv.mt10.ts2/')
29+
# generate_i2g_and_female_csv(c2g_file='../../output/imdb/title.basics.tsv/c2g.pkl', c2i_file='../../output/imdb/title.basics.tsv.mt10.ts2/indexes.pkl', output_dir='../../output/imdb/title.basics.tsv.mt10.ts2/')
2930
# generate_i2g_and_female_csv(c2g_file='../../output/imdb/title.basics.tsv/c2g.pkl', c2i_file='../../output/imdb/title.basics.tsv/indexes.pkl', output_dir='../../output/imdb/title.basics.tsv/')
3031
#
3132
# generate_i2g_and_female_csv(c2g_file='../../output/imdb/toy.patent.tsv/c2g.pkl', c2i_file='../../output/imdb/toy.patent.tsv/indexes.pkl', output_dir='../../output/imdb/toy.patent.tsv/')
32-
# generate_i2g_and_female_csv(c2g_file='../../output/imdb/patent.tsv.mt10.ts2/c2g.pkl', c2i_file='../../output/imdb/patent.tsv.mt10.ts2/indexes.pkl', output_dir='../../output/imdb/patent.tsv.mt10.ts2/')
33+
# generate_i2g_and_female_csv(c2g_file='../../output/imdb/patent.tsv/c2g.pkl', c2i_file='../../output/imdb/patent.tsv.mt10.ts2/indexes.pkl', output_dir='../../output/imdb/patent.tsv.mt10.ts2/')
3334
# generate_i2g_and_female_csv(c2g_file='../../output/imdb/patent.tsv/c2g.pkl', c2i_file='../../output/imdb/patent.tsv/indexes.pkl', output_dir='../../output/imdb/patent.tsv/')
Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -239,28 +239,20 @@ def addGenderResultsFromFile(self, folderOfResults, numOfEntries, inc=1000, star
239239
# Adds Probability value:
240240
self.df.loc[results['name'], 'Probability'] = results['probability']
241241
print(f"{i} to {i+inc} has been searched")
242-
243-
244-
245-
246242

247243
# Private method used in makeParallelAPIReqs: callback invoked when a request errors out.
def exception_handler(request, exception):
    print("Request failed")
251245

252246
# Using the grequests library, each name from the dataframe will make a request for the gender information
253247
# NOTE: Please specify a range to make the API requests
254248
# For my machine, I had no problems using a range of 1000 for every call
255249
# The range is [a,b) -> inclusive a, exclusive b
256-
257250
def makeParallelAPIReqs(self, apiKeyDirectory, a, b):
258251
key = ""
259252
rawOutput = open(f'src/util/UniqueNames/IMDBResults/ApiResults/apiOutput_{a}_to_{b}.txt', 'w')
260253
resultCodes = open(f'src/util/UniqueNames/IMDBResults/ApiResults/resultCodes_{a}_to_{b}.txt', 'w')
261254

262-
with open(apiKeyDirectory, 'r') as f:
263-
key = f.readline()
255+
with open(apiKeyDirectory, 'r') as f: key = f.readline()
264256

265257
urls = []
266258
# creates url string for all the names
@@ -279,10 +271,8 @@ def makeParallelAPIReqs(self, apiKeyDirectory, a, b):
279271
rawOutput.write(f"{result.text}\n")
280272

281273
def printResults(self, head=None):
    """Print the first *head* rows of the dataframe; print the whole dataframe when head is falsy."""
    target = self.df.head(head) if head else self.df
    print(target)
286276

287277
# Labels IMDB name.basics.tsv file
288278
def labelIMDB_gender(self, input_tsvFile, output_tsvFile, error_tsvFile):
@@ -448,8 +438,7 @@ def confirmSortedAndUnique(self):
448438
print(f"UNIQUE: {self.df.index.is_unique}")
449439
print(f"SORTED: {self.df.index.is_monotonic_increasing}")
450440

451-
def getCount(self):
    """Report how many unique names the dataframe currently holds (one row per name)."""
    print(f"Number of unique names: {self.df.shape[0]}")
453442

454443
def runIMDB(labelIMDB: LabelDataset):
455444
labelIMDB.searchIMDB('../name.basics.tsv')
@@ -501,7 +490,6 @@ def runDBLP(labelDBLP: LabelDataset):
501490

502491

503492
# STEP 2: Complete API Requests
504-
505493
# API Requests for First Part:
506494
for i in range(0, 273000, 1000):
507495
print(f"Working on {i} to {i+1000}")

0 commit comments

Comments
 (0)