# function to visualize the distribution of a numerical variable
def plot_histogram(data, var_list):
    import plotly.express as px
    if var_list[1] is None:
        # single variable: plot raw counts
        fig = px.histogram(data, x=var_list[0], marginal='box', height=800, histnorm='',
                           color_discrete_sequence=px.colors.qualitative.Plotly,
                           title=f'Distribution of {var_list[0]}')
        yaxis_title = 'Count'
    else:
        # grouped by a second variable: normalize to percentages so groups of different sizes are comparable
        fig = px.histogram(data, x=var_list[0], color=var_list[1], marginal='box', height=800,
                           histnorm='percent',
                           color_discrete_sequence=px.colors.qualitative.Plotly,
                           title=f'Distribution of {var_list[0]} by {var_list[1]}')
        yaxis_title = 'Percentage (%)'
    fig.update_layout(template='plotly_dark', xaxis=dict(showticklabels=True))
    fig.update_layout(xaxis_title=var_list[0], yaxis_title=yaxis_title, barmode='overlay',
                      title_font=dict(size=25),
                      margin=dict(l=50, r=50, t=50, b=50),
                      legend=dict(orientation='h',   # horizontal orientation
                                  yanchor='top',     # align the top of the legend with y
                                  y=-0.2,            # place the legend below the plot
                                  xanchor='center',  # anchor the legend at its center
                                  x=0.5))            # center the legend horizontally
    return fig
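
# Example usage (a sketch; assumes a pandas DataFrame `df` loaded elsewhere,
# with a hypothetical numeric column 'Age' and the 'Status' column used by
# the test helpers below):
#
#     fig = plot_histogram(df, ['Age', 'Status'])  # grouped, percent-normalized
#     fig.show()
#     fig = plot_histogram(df, ['Age', None])      # single variable, raw counts
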
# function to perform a two-sample (independent) Student's t-test
# comparing the 'Alive' and 'Dead' groups of the 'Status' column
def ttest(var, data):
    from scipy.stats import ttest_ind
    data_copy = data[~data[var].isna()]  # drop rows with missing values for this variable
    group_data = data_copy.groupby('Status')
    alive = group_data.get_group('Alive')
    dead = group_data.get_group('Dead')
    res = ttest_ind(alive[var], dead[var])
    test_stat = round(res.statistic, 4)
    pvalue = round(res.pvalue, 4)
    return test_stat, pvalue
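
# Example usage (a sketch; 'Age' is a hypothetical numeric column, while the
# 'Status' column with levels 'Alive'/'Dead' is assumed by the function itself):
#
#     stat, p = ttest('Age', df)
#     print(f't = {stat}, p = {p}')
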
# function to perform a chi-squared test of independence between a
# categorical variable and the 'Status' column
def chi2test(var, data):
    import pandas as pd
    from scipy.stats import chi2_contingency
    contingency_table = pd.crosstab(data[var], data['Status'])
    test_stat, pvalue, dof, expected_freq = chi2_contingency(contingency_table)
    return round(test_stat, 4), round(pvalue, 4)
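
# Example usage (a sketch; 'Race' is a hypothetical categorical column):
#
#     stat, p = chi2test('Race', df)
#     print(f'chi2 = {stat}, p = {p}')
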
# helper function to perform hyperparameter tuning via grid search and return the tuned model
def hyperparameterTuning(model, param_grid, Xtrain, ytrain):
    from sklearn.model_selection import GridSearchCV, StratifiedKFold
    stratifiedCV = StratifiedKFold(n_splits=3)
    # create the GridSearchCV object
    grid_search = GridSearchCV(model,              # model to be tuned
                               param_grid,         # search grid for the parameters
                               cv=stratifiedCV,    # stratified K-fold CV preserves the class proportions in each fold
                               scoring='roc_auc',  # metric to assess model performance: area under the ROC curve
                               n_jobs=-1)          # use all CPU cores to speed up the search
    # fit the grid search on the training data
    grid_search.fit(Xtrain, ytrain)
    # print the best parameters and the corresponding AUROC score
    print('Best Hyperparameters from Grid Search : ', grid_search.best_params_)
    print('Best AUROC Score: ', grid_search.best_score_)
    print()
    # return the model refit on the full training set with the best hyperparameters
    best_model = grid_search.best_estimator_
    return best_model
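
# Example usage (a sketch; the estimator and parameter grid are illustrative,
# not taken from the original project):
#
#     from sklearn.ensemble import RandomForestClassifier
#     param_grid = {'n_estimators': [100, 200], 'max_depth': [5, 10, None]}
#     best_rf = hyperparameterTuning(RandomForestClassifier(random_state=42),
#                                    param_grid, Xtrain, ytrain)
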
# function to summarize the performance of all trained models on a given dataset
def model_performance(models, X, y):
    from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef
    import pandas as pd
    cont = []
    for model_name, model in models.items():
        ypred = model.predict(X)
        # macro averaging weights both classes equally, which matters for imbalanced data
        precision = precision_score(y, ypred, average='macro')
        recall = recall_score(y, ypred, average='macro')
        f1 = f1_score(y, ypred, average='macro')
        mcc = matthews_corrcoef(y, ypred)
        cont.append([model_name, precision, recall, f1, mcc])
    res_df = pd.DataFrame(cont, columns=['Model', 'Precision', 'Recall', 'F1', 'MCC'])
    return res_df
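
# Example usage (a sketch; the model names and fitted estimators are assumed
# to come from earlier training steps):
#
#     models = {'Logistic Regression': lr_model, 'Random Forest': best_rf}
#     print(model_performance(models, Xtest, ytest))
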
# function to plot the confusion matrices for the training and test sets side by side
def plot_confusion_matrix(model, Xtrain, ytrain, Xtest, ytest):
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt
    import seaborn as sns
    # display order; matches the sorted label order confusion_matrix uses for string targets
    label = ['Alive', 'Dead']
    ypred_train = model.predict(Xtrain)
    ypred_test = model.predict(Xtest)
    cm_train = confusion_matrix(ytrain, ypred_train)
    cm_test = confusion_matrix(ytest, ypred_test)
    figure, axes = plt.subplots(1, 2, figsize=(12, 5))
    sns.heatmap(cm_train, annot=True, cmap='plasma', cbar=False, ax=axes[0],
                xticklabels=label, yticklabels=label, fmt='d')
    axes[0].set_title('Training Data')
    sns.heatmap(cm_test, annot=True, cmap='plasma', cbar=False, ax=axes[1],
                xticklabels=label, yticklabels=label, fmt='d')
    axes[1].set_title('Test Data')
    plt.tight_layout()
    plt.show()
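
# Example usage (a sketch; `best_rf` and the train/test splits are assumed to
# exist from the tuning step above):
#
#     plot_confusion_matrix(best_rf, Xtrain, ytrain, Xtest, ytest)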