-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathname_generator.py
49 lines (30 loc) · 1.49 KB
/
name_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import numpy as np
import pandas as pd
from path import DATA_EXTERNAL_DIR
# Generate names probabilistically by race, using the dataset described in:
# https://imai.fas.harvard.edu/research/files/names.pdf
# Note that this makes several assumptions (which still need to be checked):
# - the probabilities of first names and surnames are independent
# - male and female names are equally represented in the data
#   - it turns out this is probably not true
# Load the first-name and surname tables from the external data directory.
fn_df = pd.read_csv(DATA_EXTERNAL_DIR / "firstnames.csv")
ln_df = pd.read_csv(DATA_EXTERNAL_DIR / "censusSurnames.csv")

# Divide each per-race column by its own total; the resulting "*_frac"
# columns are what the generator functions pass to np.random.choice as ``p``.
fn_df["pctblack_frac"] = fn_df["pctblack"].div(fn_df["pctblack"].sum())
fn_df["pctwhite_frac"] = fn_df["pctwhite"].div(fn_df["pctwhite"].sum())

# The surname table uses different source column names ("bla.last" /
# "whi.last") but gets the same "*_frac" output columns.
ln_df["pctblack_frac"] = ln_df["bla.last"].div(ln_df["bla.last"].sum())
ln_df["pctwhite_frac"] = ln_df["whi.last"].div(ln_df["whi.last"].sum())
def gen_black_firstname():
    """Draw one first name at random, weighted by ``pctblack_frac``.

    Returns:
        A single entry of ``fn_df["firstname"]``, sampled with the
        probabilities stored in ``fn_df["pctblack_frac"]``.
    """
    candidates = fn_df["firstname"]
    weights = fn_df["pctblack_frac"]
    # Ask for a one-element sample and unpack it to get the bare value.
    (picked,) = np.random.choice(candidates, size=1, p=weights)
    return picked
def gen_black_surname():
    """Draw one surname at random, weighted by ``pctblack_frac``.

    Returns:
        A single entry of ``ln_df["surname"]``, sampled with the
        probabilities stored in ``ln_df["pctblack_frac"]``.
    """
    candidates = ln_df["surname"]
    weights = ln_df["pctblack_frac"]
    # Ask for a one-element sample and unpack it to get the bare value.
    (picked,) = np.random.choice(candidates, size=1, p=weights)
    return picked
def gen_black_name():
    """Build a full name from a sampled first name and a sampled surname.

    The first name is title-cased; the surname is used exactly as sampled.

    Returns:
        The two parts joined by a single space.
    """
    # Sample the first name before the surname, matching the order in which
    # the random draws are made.
    given = gen_black_firstname().title()
    family = gen_black_surname()
    return f"{given} {family}"
def gen_white_firstname():
    """Draw one first name at random, weighted by ``pctwhite_frac``.

    Returns:
        A single entry of ``fn_df["firstname"]``, sampled with the
        probabilities stored in ``fn_df["pctwhite_frac"]``.
    """
    candidates = fn_df["firstname"]
    weights = fn_df["pctwhite_frac"]
    # Ask for a one-element sample and unpack it to get the bare value.
    (picked,) = np.random.choice(candidates, size=1, p=weights)
    return picked
def gen_white_surname():
    """Draw one surname at random, weighted by ``pctwhite_frac``.

    Returns:
        A single entry of ``ln_df["surname"]``, sampled with the
        probabilities stored in ``ln_df["pctwhite_frac"]``.
    """
    candidates = ln_df["surname"]
    weights = ln_df["pctwhite_frac"]
    # Ask for a one-element sample and unpack it to get the bare value.
    (picked,) = np.random.choice(candidates, size=1, p=weights)
    return picked
def gen_white_name():
    """Build a full name from a sampled first name and a sampled surname.

    The first name is title-cased; the surname is used exactly as sampled.

    Returns:
        The two parts joined by a single space.
    """
    # Sample the first name before the surname, matching the order in which
    # the random draws are made.
    given = gen_white_firstname().title()
    family = gen_white_surname()
    return f"{given} {family}"
if __name__ == "__main__":
    # Smoke check when run as a script: print one generated name per
    # generator, in the same order as before.
    for make_name in (gen_black_name, gen_white_name):
        print(make_name())