Skip to content

Commit 69ca5d4

Browse files
committed
(#87)(skip-ci)(refactor gender inference codes)
1 parent 14fcc6b commit 69ca5d4

3 files changed

Lines changed: 58 additions & 27 deletions

File tree

src/genderize/c2g.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# load the labeled dataset and generate {id_name: (gender, accuracy)}
2+
# this is needed only because in the old code (labelDataset.py -> main.py), we
3+
# 1) extract names from datasets,
4+
# 2) do api call to genderize
5+
# 3) update the dataset with the gender labels
6+
7+
# In the yet-to-be-refactored code, in the future, we don't need to update/touch the dataset. We simply do
8+
# 1) (same as before) extract id_names from datasets,
9+
# 2) (same as before) do api call to genderize
10+
# 3) generate {id_name: (gender, accuracy)}
11+
12+
# datasets:
13+
# dblp: for all experts (authors)
14+
# imdb: for cast'n'crew with missing labels, like directors; based on actor/actress professions, we already know the gender for some experts
15+
# uspt: none. all experts are labeled in original dataset
16+
17+
import csv, pickle
18+
19+
def imdb_extract_gender_dict(tsv_path, output_dir):
    """Read the gender-labeled imdb name.basics tsv and pickle {id_name: (isfemale, accuracy)}.

    tsv_path: tab-separated file with a header row; columns used are
        0: 'nm'-prefixed imdb id, 1: full name, 2: gender label 'M'/'F'/'',
        3: genderize probability (may be empty), 6: primary profession.
    output_dir: directory prefix (with trailing separator) where c2g.pkl is written.
    """
    genders = {}
    with open(tsv_path, newline='', encoding='utf-8') as tsv:
        rows = csv.reader(tsv, delimiter='\t', quotechar='"')
        next(rows, None)  # drop the header row
        for record in rows:
            numeric_id = int(record[0].replace('nm', ''))  # e.g. nm0000003 -> 3
            name_key = f"{numeric_id}_{record[1].lower().replace(' ', '_')}"  # e.g. Brigitte Bardot -> brigitte_bardot
            label = record[2].strip()  # 'M' or 'F' or ''
            confidence = int(float(record[3]) * 100) if record[3] else 0  # probability column may be empty
            job = record[6].lower()

            # profession is the most reliable signal; fall back to the api gender label
            if 'actress' in job:
                entry = (True, 100)
            elif 'actor' in job:
                entry = (False, 100)
            elif label:
                entry = (label == 'F', confidence)
            else:
                entry = (None, 0)

            genders[name_key] = entry
            print(f'{record} --> {name_key} --> {entry}')

    with open(f'{output_dir}c2g.pkl', 'wb') as out:
        pickle.dump(genders, out, protocol=pickle.HIGHEST_PROTOCOL)
41+
42+
# Guard the script entry point so importing this module has no side effects
# (the bare call used to run the full extraction on every import).
if __name__ == '__main__':
    imdb_extract_gender_dict('../../output/imdb/title.basics.tsv/name.basics.tsv.gender.tsv', '../../output/imdb/title.basics.tsv/')

src/genderize/i2g.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,31 +3,32 @@
33
def generate_i2g_and_female_csv(c2g_file, c2i_file, output_dir):
    """
    Generate i2g mapping (col_index: (isfemale, acc)) and save csv with column indexes where isfemale==True.
    - c2g_file: pickle file for expert's idname -> gender (idname: (isfemale, acc)), ideally the superset and includes for all experts
    - c2i_file: pickle file for index.pkl in opentf that has (idname: index), ideally subset including the ones after some filterings
    - output_dir: directory prefix (with trailing separator) that receives i2g.pkl and females.csv
    """
    with open(c2g_file, 'rb') as f: c2g = pickle.load(f)
    with open(c2i_file, 'rb') as f: c2i = pickle.load(f)['c2i']

    # normalize ids like '12.0_name' to '12_name' (xxx.0 bug); correct int ids pass through unchanged
    def _normalize(idname):
        head, *rest = idname.split('_')
        return '_'.join([str(int(float(head)))] + rest)

    i2g = {}; missing_ids = []
    for idname, col_idx in c2i.items():
        key = _normalize(idname)  # normalize once per idname instead of twice
        try: i2g[col_idx] = c2g[key]
        except KeyError: missing_ids.append(key)
    # bug fix: the condition was inverted ('if not missing_ids'), so the report
    # only printed when NOTHING was missing; warn when something IS missing
    if missing_ids: print(f'The following idnames in c2i are missing in c2g: {missing_ids}')
    with open(f'{output_dir}i2g.pkl', 'wb') as f: pickle.dump(i2g, f)

    # column indexes whose expert is labeled female, sorted for a stable csv
    female_columns = sorted(idx for idx, (isfemale, acc) in i2g.items() if isfemale is True)
    pd.DataFrame(female_columns, columns=['teamsvecs-females-col-idx']).to_csv(f'{output_dir}females.csv', index=False)
2223

2324
# generate_i2g_and_female_csv(c2g_file='../../output/dblp/toy.dblp.v12.json/c2g.pkl', c2i_file='../../output/dblp/toy.dblp.v12.json/indexes.pkl', output_dir='../../output/dblp/toy.dblp.v12.json/')
24-
# generate_i2g_and_female_csv(c2g_file='../../output/dblp/dblp.v12.json.mt10.ts2/c2g.pkl', c2i_file='../../output/dblp/dblp.v12.json.mt10.ts2/indexes.pkl', output_dir='../../output/dblp/dblp.v12.json.mt10.ts2/')
25+
# generate_i2g_and_female_csv(c2g_file='../../output/dblp/dblp.v12.json/c2g.pkl', c2i_file='../../output/dblp/dblp.v12.json.mt10.ts2/indexes.pkl', output_dir='../../output/dblp/dblp.v12.json.mt10.ts2/')
2526
# generate_i2g_and_female_csv(c2g_file='../../output/dblp/dblp.v12.json/c2g.pkl', c2i_file='../../output/dblp/dblp.v12.json/indexes.pkl', output_dir='../../output/dblp/dblp.v12.json/')
2627
#
2728
# generate_i2g_and_female_csv(c2g_file='../../output/imdb/toy.title.basics.tsv/c2g.pkl', c2i_file='../../output/imdb/toy.title.basics.tsv/indexes.pkl', output_dir='../../output/imdb/toy.title.basics.tsv/')
28-
# generate_i2g_and_female_csv(c2g_file='../../output/imdb/title.basics.tsv.mt10.ts2/c2g.pkl', c2i_file='../../output/imdb/title.basics.tsv.mt10.ts2/indexes.pkl', output_dir='../../output/imdb/title.basics.tsv.mt10.ts2/')
29+
# generate_i2g_and_female_csv(c2g_file='../../output/imdb/title.basics.tsv/c2g.pkl', c2i_file='../../output/imdb/title.basics.tsv.mt10.ts2/indexes.pkl', output_dir='../../output/imdb/title.basics.tsv.mt10.ts2/')
2930
# generate_i2g_and_female_csv(c2g_file='../../output/imdb/title.basics.tsv/c2g.pkl', c2i_file='../../output/imdb/title.basics.tsv/indexes.pkl', output_dir='../../output/imdb/title.basics.tsv/')
3031
#
3132
# generate_i2g_and_female_csv(c2g_file='../../output/imdb/toy.patent.tsv/c2g.pkl', c2i_file='../../output/imdb/toy.patent.tsv/indexes.pkl', output_dir='../../output/imdb/toy.patent.tsv/')
32-
# generate_i2g_and_female_csv(c2g_file='../../output/imdb/patent.tsv.mt10.ts2/c2g.pkl', c2i_file='../../output/imdb/patent.tsv.mt10.ts2/indexes.pkl', output_dir='../../output/imdb/patent.tsv.mt10.ts2/')
33+
# generate_i2g_and_female_csv(c2g_file='../../output/imdb/patent.tsv/c2g.pkl', c2i_file='../../output/imdb/patent.tsv.mt10.ts2/indexes.pkl', output_dir='../../output/imdb/patent.tsv.mt10.ts2/')
3334
# generate_i2g_and_female_csv(c2g_file='../../output/imdb/patent.tsv/c2g.pkl', c2i_file='../../output/imdb/patent.tsv/indexes.pkl', output_dir='../../output/imdb/patent.tsv/')
Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -239,28 +239,20 @@ def addGenderResultsFromFile(self, folderOfResults, numOfEntries, inc=1000, star
239239
# Adds Probability value:
240240
self.df.loc[results['name'], 'Probability'] = results['probability']
241241
print(f"{i} to {i+inc} has been searched")
242-
243-
244-
245-
246242

247243
# Private method used in makeParallelAPIReqs: callback invoked when a request errors out.
def exception_handler(request, exception):
    print("Request failed")
251245

252246
# Using the grequests library, each name from the dataframe will make a request for the gender information
253247
# NOTE: Please specify a range to make the API requests
254248
# For my machine, I had no problems using a range of 1000 for every call
255249
# The range is [a,b) -> inclusive a, exclusive b
256-
257250
def makeParallelAPIReqs(self, apiKeyDirectory, a, b):
258251
key = ""
259252
rawOutput = open(f'src/util/UniqueNames/IMDBResults/ApiResults/apiOutput_{a}_to_{b}.txt', 'w')
260253
resultCodes = open(f'src/util/UniqueNames/IMDBResults/ApiResults/resultCodes_{a}_to_{b}.txt', 'w')
261254

262-
with open(apiKeyDirectory, 'r') as f:
263-
key = f.readline()
255+
with open(apiKeyDirectory, 'r') as f: key = f.readline()
264256

265257
urls = []
266258
# creates url string for all the names
@@ -279,10 +271,8 @@ def makeParallelAPIReqs(self, apiKeyDirectory, a, b):
279271
rawOutput.write(f"{result.text}\n")
280272

281273
def printResults(self, head=None):
    """Print the first *head* rows of the dataframe; print the whole dataframe when head is falsy."""
    target = self.df.head(head) if head else self.df
    print(target)
286276

287277
# Labels IMDB name.basics.tsv file
288278
def labelIMDB_gender(self, input_tsvFile, output_tsvFile, error_tsvFile):
@@ -448,8 +438,7 @@ def confirmSortedAndUnique(self):
448438
print(f"UNIQUE: {self.df.index.is_unique}")
449439
print(f"SORTED: {self.df.index.is_monotonic_increasing}")
450440

451-
def getCount(self):
    """Report how many unique names the dataframe currently holds (one row per name)."""
    print(f"Number of unique names: {self.df.shape[0]}")
453442

454443
def runIMDB(labelIMDB: LabelDataset):
455444
labelIMDB.searchIMDB('../name.basics.tsv')
@@ -501,7 +490,6 @@ def runDBLP(labelDBLP: LabelDataset):
501490

502491

503492
# STEP 2: Complete API Requests
504-
505493
# API Requests for First Part:
506494
for i in range(0, 273000, 1000):
507495
print(f"Working on {i} to {i+1000}")

0 commit comments

Comments
 (0)