|
def generate_i2g_and_female_csv(c2g_file, c2i_file, output_dir):
    """
    Generate i2g mapping (col_index: (isfemale, acc)) and save csv with column indexes where isfemale==True.
    - c2g_file: pickle file for expert's idname -> gender (idname: (isfemale, acc)), ideally the superset that includes all experts
    - c2i_file: pickle file for the opentf index pickle whose 'c2i' entry has (idname: index), ideally the subset left after some filterings
    - output_dir: prefix prepended as-is to the output file names, so it is expected to end with a path separator

    Writes '{output_dir}i2g.pkl' (the i2g dict) and '{output_dir}females.csv' (one column,
    'teamsvecs-females-col-idx', sorted ascending). Idnames in c2i that have no entry in c2g
    are left out of i2g and reported on stdout. Returns None.
    """
    # NOTE(review): pickle.load executes arbitrary code from the file; only feed it pickles produced by this pipeline.
    with open(c2g_file, 'rb') as f:
        c2g = pickle.load(f)
    with open(c2i_file, 'rb') as f:
        c2i = pickle.load(f)['c2i']

    def _normalize(idname):
        # to handle xxx.0 bug in ids ('123.0_name' -> '123_name'); no change if correct int ids
        prefix, sep, rest = idname.partition('_')
        try:
            number = int(prefix)  # exact for plain integer prefixes, no float round-trip
        except ValueError:
            try:
                number = int(float(prefix))
            except (ValueError, OverflowError):
                return idname  # prefix is not numeric at all: keep the idname untouched
        return f'{number}{sep}{rest}'

    i2g = {}
    missing_ids = []
    for idname, col_idx in c2i.items():
        key = _normalize(idname)
        try:
            i2g[col_idx] = c2g[key]
        except KeyError:
            missing_ids.append(key)
    # report only when something is actually missing
    if missing_ids:
        print(f'The following idnames in c2i are missing in c2g: {missing_ids}')

    with open(f'{output_dir}i2g.pkl', 'wb') as f:
        pickle.dump(i2g, f)

    female_columns = sorted(idx for idx, (isfemale, acc) in i2g.items() if isfemale is True)
    pd.DataFrame(female_columns, columns=['teamsvecs-females-col-idx']).to_csv(f'{output_dir}females.csv', index=False)
22 | 23 |
|
23 | 24 | # generate_i2g_and_female_csv(c2g_file='../../output/dblp/toy.dblp.v12.json/c2g.pkl', c2i_file='../../output/dblp/toy.dblp.v12.json/indexes.pkl', output_dir='../../output/dblp/toy.dblp.v12.json/') |
24 | | -# generate_i2g_and_female_csv(c2g_file='../../output/dblp/dblp.v12.json.mt10.ts2/c2g.pkl', c2i_file='../../output/dblp/dblp.v12.json.mt10.ts2/indexes.pkl', output_dir='../../output/dblp/dblp.v12.json.mt10.ts2/') |
| 25 | +# generate_i2g_and_female_csv(c2g_file='../../output/dblp/dblp.v12.json/c2g.pkl', c2i_file='../../output/dblp/dblp.v12.json.mt10.ts2/indexes.pkl', output_dir='../../output/dblp/dblp.v12.json.mt10.ts2/') |
25 | 26 | # generate_i2g_and_female_csv(c2g_file='../../output/dblp/dblp.v12.json/c2g.pkl', c2i_file='../../output/dblp/dblp.v12.json/indexes.pkl', output_dir='../../output/dblp/dblp.v12.json/') |
26 | 27 | # |
27 | 28 | # generate_i2g_and_female_csv(c2g_file='../../output/imdb/toy.title.basics.tsv/c2g.pkl', c2i_file='../../output/imdb/toy.title.basics.tsv/indexes.pkl', output_dir='../../output/imdb/toy.title.basics.tsv/') |
28 | | -# generate_i2g_and_female_csv(c2g_file='../../output/imdb/title.basics.tsv.mt10.ts2/c2g.pkl', c2i_file='../../output/imdb/title.basics.tsv.mt10.ts2/indexes.pkl', output_dir='../../output/imdb/title.basics.tsv.mt10.ts2/') |
| 29 | +# generate_i2g_and_female_csv(c2g_file='../../output/imdb/title.basics.tsv/c2g.pkl', c2i_file='../../output/imdb/title.basics.tsv.mt10.ts2/indexes.pkl', output_dir='../../output/imdb/title.basics.tsv.mt10.ts2/') |
29 | 30 | # generate_i2g_and_female_csv(c2g_file='../../output/imdb/title.basics.tsv/c2g.pkl', c2i_file='../../output/imdb/title.basics.tsv/indexes.pkl', output_dir='../../output/imdb/title.basics.tsv/') |
30 | 31 | # |
31 | 32 | # generate_i2g_and_female_csv(c2g_file='../../output/imdb/toy.patent.tsv/c2g.pkl', c2i_file='../../output/imdb/toy.patent.tsv/indexes.pkl', output_dir='../../output/imdb/toy.patent.tsv/') |
32 | | -# generate_i2g_and_female_csv(c2g_file='../../output/imdb/patent.tsv.mt10.ts2/c2g.pkl', c2i_file='../../output/imdb/patent.tsv.mt10.ts2/indexes.pkl', output_dir='../../output/imdb/patent.tsv.mt10.ts2/') |
| 33 | +# generate_i2g_and_female_csv(c2g_file='../../output/imdb/patent.tsv/c2g.pkl', c2i_file='../../output/imdb/patent.tsv.mt10.ts2/indexes.pkl', output_dir='../../output/imdb/patent.tsv.mt10.ts2/') |
33 | 34 | # generate_i2g_and_female_csv(c2g_file='../../output/imdb/patent.tsv/c2g.pkl', c2i_file='../../output/imdb/patent.tsv/indexes.pkl', output_dir='../../output/imdb/patent.tsv/') |
0 commit comments