1
+ """
2
+ THIS SCRIPT CONTAINS THE SUPPORT FUNCTIONS NEEDED TO CARRY OUT SAP
3
+ """
4
+
5
+ # Load necessary modules
6
+ import numpy as np
7
+ import pandas as pd
8
+ from scipy import stats
9
+ from scipy .stats import norm , t
10
+ from sklearn .utils import resample
11
+ from time import time
12
+
13
+
14
def beta_fun(n, pt, pm, alpha):
    """
    Normal-approximation probability built from two binomial proportions.

    Computes Phi((sigma_t * z - (pm - pt)) / sigma_m), where z is the
    standard-normal (1 - alpha) quantile and sigma_p = sqrt(p * (1 - p) / n).

    n: sample size used in both standard errors
    pt: proportion that scales the critical value
        (NOTE(review): presumably the "target" proportion - confirm)
    pm: proportion that the gap (pm - pt) is measured against
        (NOTE(review): presumably the "model" proportion - confirm)
    alpha: upper-tail probability that defines the critical value

    Returns the value of the standard-normal CDF described above.
    """
    z_crit = stats.norm.ppf(1 - alpha)
    # Binomial standard errors of the two proportions at sample size n
    se_t, se_m = (np.sqrt(p * (1 - p) / n) for p in (pt, pm))
    return stats.norm.cdf((se_t * z_crit - (pm - pt)) / se_m)
20
+
21
def perf_fun(*args, **kwargs):
    """
    Performance metric of interest: sensitivity at a given score threshold.

    Contract for any function used in this role:
    1) It must take *args and **kwargs
    2) 'thresh' must be one of the kwargs
    3) It must return a scalar

    args: exactly two positional arrays, (y, score), where y holds 0/1 labels
    kwargs['thresh']: cut-off; a score >= thresh is predicted positive

    Returns the mean predicted label among the observations with y == 1.
    """
    assert len(args) == 2
    assert 'thresh' in kwargs
    y, score = args
    cutoff = kwargs['thresh']
    # Labels must be binary
    assert not np.any((y != 0) & (y != 1))
    # The threshold may not exceed the largest score
    assert cutoff <= score.max()
    predicted = (score >= cutoff).astype(int)
    return np.mean(predicted[y == 1])
37
+
38
+ # args=(df.y.values, df.score.values);kwargs={'target':0.8}
39
def thresh_find(*args, **kwargs):
    """
    Find the score threshold that attains a target sensitivity.

    Contract for any function used in this role:
    1) It must take *args and **kwargs
    2) 'target' must be one of the kwargs. This is the value you want to get
       from perf_fun
    3) 'jackknife' is an optional kwarg; when true, the function returns its
       output with each observation left out in turn
       See: https://en.wikipedia.org/wiki/Jackknife_resampling
       Note that many statistics have a fast way to calculate the jackknife
       beyond brute force
    4) It must return a scalar, or a np.array if jackknife=True

    args: exactly two positional arrays, (y, score), with y holding 0/1 labels
    kwargs['target']: desired sensitivity; the threshold is the (1 - target)
        quantile of the scores of the positive class
    kwargs['jackknife']: optional, defaults to False

    Returns
    -------
    jackknife false: the threshold (scalar).
    jackknife true: np.array of length len(y) holding the leave-one-out
        thresholds. The first n0 entries belong to the negative class (all
        equal to the full-sample threshold, because dropping a negative does
        not change the positive scores); the remaining n1 entries belong to
        the positives in ascending score order.

    Raises
    ------
    AssertionError: malformed arguments (wrong count, missing 'target',
        length mismatch, non-binary labels).
    ValueError: no positive observations, or fewer than two positives when
        jackknife is requested; np.quantile also raises it for a target
        outside [0, 1].
    """
    # --- assign --- #
    jackknife = kwargs.get('jackknife', False)
    assert 'target' in kwargs
    target = kwargs['target']
    assert len(args) == 2
    y, score = args
    assert len(y) == len(score)
    assert np.all((y == 0) | (y == 1))
    # --- Find quantile --- #
    pos_scores = np.sort(score[y == 1])
    n1 = len(pos_scores)
    n0 = len(y) - n1
    if n1 == 0:
        raise ValueError('thresh_find needs at least one observation with y == 1')
    q = 1 - target
    tstar = np.quantile(pos_scores, q)
    if not jackknife:
        return tstar
    if n1 < 2:
        raise ValueError('jackknife needs at least two observations with y == 1')
    # Dropping a negative leaves the positive scores untouched
    loo_neg = np.repeat(tstar, n0)
    loo_pos = _loo_quantiles(pos_scores, q)
    return np.append(loo_neg, loo_pos)


def _loo_quantiles(sorted_vals, q):
    """Exact leave-one-out q-quantiles of an ascending array, one per dropped index."""
    n = len(sorted_vals)
    # np.quantile (default linear interpolation) on the n-1 remaining points
    # reads the order statistics at floor(h) and floor(h)+1, h = (n-2)*q.
    # Dropping any index at or below floor(h) shifts both of them the same way,
    # and dropping any index above floor(h)+1 shifts neither, so only the
    # indices next to floor(h) can give a value of their own.
    lo = int(np.floor((n - 2) * q))
    # One extra index on each side absorbs any floating-point disagreement
    # with numpy's own floor computation.
    start = max(lo - 1, 0)
    stop = min(lo + 2, n - 1)
    out = np.empty(n, dtype=float)
    out[:start] = np.quantile(np.delete(sorted_vals, 0), q)
    out[stop + 1:] = np.quantile(np.delete(sorted_vals, n - 1), q)
    for drop in range(start, stop + 1):
        out[drop] = np.quantile(np.delete(sorted_vals, drop), q)
    return out
78
+
79
+
80
def draw_samp(*args, strata=None):
    """
    DRAW A SAMPLE WITH REPLACEMENT FROM THE GIVEN ARRAYS
    (STRATIFIED WHEN strata IS SUPPLIED)

    args: the arrays handed to resample
    strata: optional; forwarded to resample as its stratify argument

    Returns whatever resample returns, except that the result for a single
    input array is wrapped in a one-element list so callers can always
    unpack the output with *.
    """
    arrays = list(args)
    # Only pass stratify on when the caller asked for stratification
    extra = {} if strata is None else {'stratify': strata}
    drawn = resample(*arrays, **extra)
    return [drawn] if len(arrays) == 1 else drawn
92
+
93
+
94
class bootstrap():
    """
    Bootstrap a statistic and build confidence intervals from the replicates.

    nboot: number of bootstrap replicates drawn by fit()
    func: the statistic. It is called as func(*args, **kwargs) and must return
        a scalar; called with jackknife=True it must return an array of
        leave-one-out values (one per observation).

    Attributes set by fit(): theta (full-sample statistic), jn (jackknife
    values), n (number of jackknife values), store_theta (bootstrap
    replicates), se (standard deviation of the replicates).
    Attributes set by get_ci(): di_ci, a dict with keys 'quantile', 'se', 'bca'.
    """

    def __init__(self, nboot, func):
        self.nboot = nboot
        self.stat = func

    def fit(self, *args, mm=100, **kwargs):
        """
        Compute the statistic, its jackknife values and nboot bootstrap replicates.

        args: data arrays, passed to the statistic and resampled together
        mm: retained for backward compatibility; it only controlled a progress
            message that is no longer emitted
        kwargs: forwarded unchanged to the statistic. If 'strata' is present it
            is also used to stratify the resampling.
        """
        strata = kwargs.get('strata')
        # Get the baseline stat
        self.theta = self.stat(*args, **kwargs)
        # dict(...) instead of a literal jackknife=True keyword so a caller that
        # already passes 'jackknife' does not trigger a duplicate-keyword error
        self.jn = np.asarray(self.stat(*args, **dict(kwargs, jackknife=True)))
        self.n = len(self.jn)
        self.store_theta = np.zeros(self.nboot)
        for ii in range(self.nboot):  # Fit bootstrap
            args_til = draw_samp(*args, strata=strata)
            self.store_theta[ii] = self.stat(*args_til, **kwargs)
        # Population (ddof=0) standard deviation of the replicates
        self.se = self.store_theta.std()

    def get_ci(self, alpha=0.05, symmetric=True):
        """
        Compute all three interval types and store them in self.di_ci.

        alpha: total error rate (split over both tails when symmetric is True)
        symmetric: True for a two-sided interval, 'lower' or 'upper' for a
            one-sided bound
        """
        assert (symmetric == True) | (symmetric == 'upper') | (symmetric == 'lower')
        self.di_ci = {
            'quantile': self.ci_quantile(alpha, symmetric),
            'se': self.ci_se(alpha, symmetric),
            'bca': self.ci_bca(alpha, symmetric),
        }

    def ci_quantile(self, alpha, symmetric):
        """Percentile interval: quantiles of the bootstrap replicates."""
        if symmetric == True:
            return np.quantile(self.store_theta, [alpha / 2, 1 - alpha / 2])
        elif symmetric == 'lower':
            return np.quantile(self.store_theta, alpha)
        else:
            return np.quantile(self.store_theta, 1 - alpha)

    def ci_se(self, alpha, symmetric):
        """Interval theta +/- q * se, with q a Student-t quantile (df = self.n)."""
        if symmetric == True:
            qq = t(df=self.n).ppf(1 - alpha / 2)
            return np.array([self.theta - self.se * qq, self.theta + self.se * qq])
        qq = t(df=self.n).ppf(1 - alpha)
        if symmetric == 'lower':
            return self.theta - qq * self.se
        return self.theta + qq * self.se

    def ci_bca(self, alpha, symmetric):
        """
        Bias-corrected and accelerated (BCa) interval.

        Stores the acceleration (ahat), the bias correction (zhat) and the
        adjusted quantile levels (a1, a2) on the instance, then returns the
        corresponding quantile(s) of the bootstrap replicates.
        """
        if symmetric == True:
            ql, qu = norm.ppf(alpha / 2), norm.ppf(1 - alpha / 2)
        else:
            ql, qu = norm.ppf(alpha), norm.ppf(1 - alpha)
        # Acceleration factor
        jn = np.asarray(self.jn)
        dev = jn.mean() - jn
        den = 6 * np.sum(dev ** 2) ** 1.5
        # Constant jackknife values give 0/0; treat that as no acceleration
        self.ahat = np.sum(dev ** 3) / den if den > 0 else 0.0
        # Bias correction factor
        nrep = len(self.store_theta)
        below = np.mean(self.store_theta < self.theta)
        # A proportion of exactly 0 or 1 would make norm.ppf return -inf/+inf
        # and the adjusted levels NaN; keep it half a replicate off the edge.
        # Proportions strictly inside (0, 1) are not changed by the clip.
        below = np.clip(below, 0.5 / nrep, 1 - 0.5 / nrep)
        self.zhat = norm.ppf(below)
        self.a1 = norm.cdf(self.zhat + (self.zhat + ql) / (1 - self.ahat * (self.zhat + ql)))
        self.a2 = norm.cdf(self.zhat + (self.zhat + qu) / (1 - self.ahat * (self.zhat + qu)))

        if symmetric == True:
            return np.quantile(self.store_theta, [self.a1, self.a2])
        elif symmetric == 'lower':
            return np.quantile(self.store_theta, self.a1)
        else:
            return np.quantile(self.store_theta, self.a2)
166
+
167
+
168
+
0 commit comments