Skip to content

Commit 8927da5

Browse files
committed
initializing regerssion functions; gitignore for senthu
1 parent b2ba753 commit 8927da5

File tree

5 files changed

+228
-1
lines changed

5 files changed

+228
-1
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
power_for_regression_metrics.ipynb
22
SAM_sujay.ipynb
3+
*senthu*
34
papers/
45
*.csv
56
bca_python.ipynb

SAM_template.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -573,7 +573,7 @@
573573
"name": "python",
574574
"nbconvert_exporter": "python",
575575
"pygments_lexer": "ipython3",
576-
"version": "3.7.3"
576+
"version": "3.7.10"
577577
}
578578
},
579579
"nbformat": 4,

funs_reg.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import numpy as np
2+
from scipy.stats import norm
3+
from scipy.optimize import minimize_scalar
4+
from scipy.stats import multivariate_normal as MVN
5+
6+
class cond_dist():
7+
def __init__(self, k, n1, n2, null=False):
8+
self.s = +1
9+
if null is True:
10+
self.s = -1
11+
self.k, self.n1, self.n2 = k, n1, n2
12+
self.r = np.sqrt(n2 / n1)
13+
14+
def cdf_w(self, w):
15+
a = np.sqrt(1+self.r**2) * self.k * self.s
16+
b = -self.r * self.s
17+
rho = -b/np.sqrt(1+b**2)
18+
Sigma = np.array([[1,rho],[rho,1]])
19+
dist_MVN = MVN(mean=np.repeat(0,2),cov=Sigma)
20+
x1 = a / np.sqrt(1+b**2)
21+
if isinstance(w, float):
22+
X = [x1, w]
23+
else:
24+
X = np.c_[np.repeat(x1,len(w)), w]
25+
pval = dist_MVN.cdf(X)
26+
return pval
27+
28+
def cdf_x(self, x):
29+
const = 1 / norm.cdf(self.s * self.k)
30+
w = (x + self.r * self.k) / np.sqrt(1+self.r**2)
31+
pval = self.cdf_w(w) * const
32+
return pval
33+
34+
def quantile(self, p):
35+
res = minimize_scalar(fun=lambda x: (self.cdf_x(x)-p)**2, method='brent').x
36+
return res
37+
38+
39+
def power_est(n2, k, n1, alpha):
40+
dist_true = cond_dist(k=k, n1=n1, n2=n2, null=True)
41+
dist_false = cond_dist(k=k, n1=n1, n2=n2, null=False)
42+
crit_value = dist_true.quantile(alpha)
43+
power = dist_false.cdf_x(crit_value)
44+
return power
45+
46+
def power_find(pp, k, n1, alpha):
47+
n2 = minimize_scalar(fun=lambda x: (power_est(x, k, n1, alpha)-pp)**2,method='brent').x
48+
return n2
49+
50+

regression.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import numpy as np
2+
import pandas as pd
3+
4+
from funs_reg import power_est
5+
6+
power_est(n2=100, k=1, n1=100, alpha=0.05)
7+
power_est(n2=100, k=2, n1=100, alpha=0.05)
8+

support_funs_sens.py

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
"""
2+
THIS SCRIPT CONTAINS THE SUPPORT FUNCTIONS NEEDED TO CARRY OUT SAP
3+
"""
4+
5+
# Load necessary modules
6+
import numpy as np
7+
import pandas as pd
8+
from scipy import stats
9+
from scipy.stats import norm, t
10+
from sklearn.utils import resample
11+
from time import time
12+
13+
14+
def beta_fun(n, pt, pm, alpha):
15+
ta = stats.norm.ppf(1-alpha)
16+
sigma_t = np.sqrt(pt*(1-pt)/n)
17+
sigma_m = np.sqrt(pm*(1-pm)/n)
18+
Phi = stats.norm.cdf( (sigma_t*ta-(pm-pt))/sigma_m )
19+
return Phi
20+
21+
def perf_fun(*args, **kwargs):
22+
"""
23+
Function to calculate the performance metric of interest
24+
1) You must use *args and **kwargs
25+
2) 'thresh' must be one of the kwargs
26+
3) This function must return a scalar
27+
"""
28+
assert len(args) == 2
29+
assert 'thresh' in kwargs
30+
thresh = kwargs['thresh']
31+
y, score = args[0], args[1]
32+
assert np.all( (y==0) | (y==1) )
33+
assert thresh <= score.max()
34+
yhat = np.where(score >= thresh, 1, 0)
35+
sens = np.mean(yhat[y == 1])
36+
return sens
37+
38+
# args=(df.y.values, df.score.values);kwargs={'target':0.8}
39+
def thresh_find(*args, **kwargs):
40+
"""
41+
Function to find threshold for performance of interest
42+
1) You must use *args and **kwargs
43+
2) 'target' must be one of the kwargs. This is the value you want to get from perf_fun
44+
3) 'jackknife' must be an optional argument in kwargs that will return the function output by leaving one observation out
45+
See: https://en.wikipedia.org/wiki/Jackknife_resampling
46+
Note that many statistics have fast way to calculate the jackknife beyond brute-force
47+
4) This function must return a scalar, or a np.array is jackknife=True
48+
"""
49+
# --- assign --- #
50+
jackknife = False
51+
ret_df = False
52+
if 'jackknife' in kwargs:
53+
jackknife = kwargs['jackknife']
54+
assert 'target' in kwargs
55+
target = kwargs['target']
56+
assert len(args) == 2
57+
y, score = args[0], args[1]
58+
assert len(y) == len(score)
59+
assert np.all((y==0) | (y==1))
60+
# --- Find quantile --- #
61+
s1 = np.sort(score[y == 1])
62+
n1 = len(s1)
63+
n0 = len(y) - n1
64+
sidx = np.arange(n1,0,-1) / n1
65+
sidx = np.argmax(np.where(sidx >= target)[0])
66+
tstar = np.quantile(s1, 1-target)
67+
if jackknife:
68+
# Effect of dropping an observation on the choice of the sensivity threshold
69+
tstar0 = np.repeat(tstar, n0) # Zero class has no impact
70+
tmed = np.quantile(np.delete(s1,sidx),1-target)
71+
thigh = np.quantile(np.delete(s1,sidx-1),1-target)
72+
tlow = np.quantile(np.delete(s1,sidx+1),1-target)
73+
assert tlow <= tmed <= thigh
74+
tstar1 = np.append(np.repeat(thigh,sidx), np.array([tmed]))
75+
tstar1 = np.append(tstar1, np.repeat(tlow,n1-sidx-1))
76+
tstar = np.append(tstar0, tstar1)
77+
return tstar
78+
79+
80+
def draw_samp(*args, strata=None):
81+
"""
82+
FUNCTION DRAWS DATA WITH REPLACEMENT (WITH STRATIFICATION IF DESIRED)
83+
"""
84+
args = list(args)
85+
if strata is not None:
86+
out = resample(*args, stratify=strata)
87+
else:
88+
out = resample(*args)
89+
if len(args) == 1:
90+
out = [out]
91+
return out
92+
93+
94+
class bootstrap():
95+
def __init__(self, nboot, func):
96+
self.nboot = nboot
97+
self.stat = func
98+
99+
def fit(self, *args, mm=100, **kwargs):
100+
strata=None
101+
if 'strata' in kwargs:
102+
strata = kwargs['strata']
103+
# Get the baseline stat
104+
self.theta = self.stat(*args, **kwargs)
105+
self.store_theta = np.zeros(self.nboot)
106+
self.jn = self.stat(*args, **kwargs, jackknife=True)
107+
self.n = len(self.jn)
108+
stime = time()
109+
for ii in range(self.nboot): # Fit bootstrap
110+
if (ii+1) % mm == 0:
111+
nleft = self.nboot - (ii+1)
112+
rtime = time() - stime
113+
rate = (ii+1)/rtime
114+
eta = nleft / rate
115+
#print('Bootstrap %i of %i (ETA=%0.1f minutes)' % (ii+1, self.nboot, eta/60))
116+
args_til = draw_samp(*args, strata=strata)
117+
self.store_theta[ii] = self.stat(*args_til, **kwargs)
118+
self.se = self.store_theta.std()
119+
120+
def get_ci(self, alpha=0.05, symmetric=True):
121+
assert (symmetric==True) | (symmetric=='upper') | (symmetric=='lower')
122+
self.di_ci = {'quantile':[], 'se':[], 'bca':[]}
123+
self.di_ci['quantile'] = self.ci_quantile(alpha, symmetric)
124+
self.di_ci['se'] = self.ci_se(alpha, symmetric)
125+
self.di_ci['bca'] = self.ci_bca(alpha, symmetric)
126+
127+
def ci_quantile(self, alpha, symmetric):
128+
if symmetric==True:
129+
return np.quantile(self.store_theta, [alpha/2,1-alpha/2])
130+
elif symmetric == 'lower':
131+
return np.quantile(self.store_theta, alpha)
132+
else:
133+
return np.quantile(self.store_theta, 1-alpha)
134+
135+
def ci_se(self, alpha, symmetric):
136+
if symmetric==True:
137+
qq = t(df=self.n).ppf(1-alpha/2)
138+
return np.array([self.theta - self.se*qq, self.theta + self.se*qq])
139+
else:
140+
qq = t(df=self.n).ppf(1-alpha)
141+
if symmetric == 'lower':
142+
return self.theta - qq*self.se
143+
else:
144+
return self.theta + qq*self.se
145+
146+
def ci_bca(self, alpha, symmetric):
147+
if symmetric==True:
148+
ql, qu = norm.ppf(alpha/2), norm.ppf(1-alpha/2)
149+
else:
150+
ql, qu = norm.ppf(alpha), norm.ppf(1-alpha)
151+
# Acceleration factor
152+
num = np.sum((self.jn.mean() - self.jn)**3)
153+
den = 6*np.sum((self.jn.mean() - self.jn)**2)**1.5
154+
self.ahat = num / den
155+
# Bias correction factor
156+
self.zhat = norm.ppf(np.mean(self.store_theta < self.theta))
157+
self.a1 = norm.cdf(self.zhat + (self.zhat + ql)/(1-self.ahat*(self.zhat+ql)))
158+
self.a2 = norm.cdf(self.zhat + (self.zhat + qu)/(1-self.ahat*(self.zhat+qu)))
159+
160+
if symmetric==True:
161+
return np.quantile(self.store_theta, [self.a1, self.a2])
162+
elif symmetric=='lower':
163+
return np.quantile(self.store_theta, self.a1)
164+
else:
165+
return np.quantile(self.store_theta, self.a2)
166+
167+
168+

0 commit comments

Comments
 (0)