
Commit c576775

Merge branch 'gpu_version' into main
2 parents: 52c6a08 + c191ba4

15 files changed: +2231 −1 lines

classification.py (+4 lines)
@@ -117,7 +117,11 @@ def model_train(path, x, y, classifier, debug_mode, iteration, hyper_opt, best_p
     #from dask.distributed import Client
     #client = Client(processes = False)
     #with joblib.parallel_backend("dask"):
+    import time
+    start = time.time()
     model.fit(x, y)
+    end = time.time()
+    print("CLASSIFIER TRAINING TIME: ", end - start)
 
     if hyper_opt == "random_search" or hyper_opt == "grid_search":
         best_parameters = model.best_params_
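The instrumentation added here follows a recurring start/end bracketing around time.time(), repeated in feature_selection.py below and in the new GPU modules. As a minimal sketch (hypothetical, not part of this commit), the pattern could be centralized in a small context manager:

from contextlib import contextmanager
import time

@contextmanager
def timed(label):
    # Prints the wall-clock duration of the enclosed block, mirroring the
    # start/end timing bracketing added throughout this commit
    start = time.time()
    try:
        yield
    finally:
        print(label + ": ", time.time() - start)

# Hypothetical usage:
# with timed("CLASSIFIER TRAINING TIME"):
#     model.fit(x, y)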

feature_selection.py (+7 lines)
@@ -7,6 +7,7 @@
 import xgboost
 from xgboost import plot_importance
 import sys
+import time
 
 # Feature selection wrapper, chooses the correct feature selection technique based on the configuration file parameters
 def feature_selection(path, fs, iteration, input, labels, feature_size, classifiers, feature_counter, debug_mode, project_info, drug_name):

@@ -30,7 +31,10 @@ def feature_selection(path, fs, iteration, input, labels, feature_size, classifi
     # SHAPLEY VALUE FEATURE SELECTION
     if fs == 'shap':
         print("PERFORMING SHAP: ")
+        start = time.time()
         dataset = shapley(path + fs + "/" + classifiers[0] + "/" + iteration, input["x_train"], input["y_train"], feature_size, 1)
+        end = time.time()
+        print("SHAP RUN TIME: ", end - start)
 
     # PRINCIPAL COMPONENT ANALYSIS
     elif fs == 'pca':

@@ -116,7 +120,10 @@ def feature_selection(path, fs, iteration, input, labels, feature_size, classifi
     # SHAPLEY VALUE FEATURE SELECTION
     if fs == 'shap':
         print("PERFORMING SHAP: " + str(iteration) + "/" + str(total_iterations))
+        start = time.time()
         dataset = shapley(path + fs + "/" + classifiers[0] + "/" + str(iteration), input["x_train"][iteration], input["y_train"][iteration], feature_size, 1)
+        end = time.time()
+        print("SHAP RUN TIME: ", end - start)
 
     # PRINCIPAL COMPONENT ANALYSIS
     elif fs == 'pca':
New file (+126 lines)

@@ -0,0 +1,126 @@
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import xgboost
import lightgbm
import numpy as np
import pandas as pd
import data_preprocess as dp
import cudf
from cuml.ensemble import RandomForestClassifier as cuRFC
import time
import sys

# Hyperparameter search space for the Random Forest classifier
rf_parameters = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [100, 150, 200, 250, 500, 750, 1000]}

# Hyperparameter search space for the XGBoost gradient boosting classifier
gdb_parameters = {
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
    'gamma': [0, 0.25, 0.5, 1.0],
    'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
    'n_estimators': [100, 150, 200, 250, 500, 750, 1000]}

# Hyperparameter search space for the LightGBM classifier
lgbm_parameters = {
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
    'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
    'n_estimators': [100, 150, 200, 250, 500, 750, 1000]}


# Classification wrapper used to select the correct classifier based on the configuration file selection
def get_model(classifier, hyper_opt):

    # Classifier: "rf"
    # Random Forest, cuML GPU implementation (the scikit-learn version is kept commented out)
    if classifier == 'rf':
        #model = RandomForestClassifier()
        model = cuRFC()
        # RandomizedSearchCV is used for hyperparameter optimization: it samples from
        # the search space above and keeps the best-performing model
        if hyper_opt == "random_search":
            model = RandomizedSearchCV(model, rf_parameters, n_iter=30,
                                       n_jobs=-1, verbose=0, cv=5,
                                       scoring='roc_auc', refit=True, random_state=42)
    # Classifier: "gdb"
    # Gradient Boosting, xgboost
    elif classifier == 'gdb':
        model = xgboost.XGBClassifier(eval_metric='logloss')
        if hyper_opt == "random_search":
            model = RandomizedSearchCV(model, gdb_parameters, n_iter=30,
                                       n_jobs=-1, verbose=0, cv=5,
                                       scoring='roc_auc', refit=True, random_state=42)
    # Classifier: "lgbm"
    # Gradient Boosting, lightgbm
    elif classifier == 'lgbm':
        model = lightgbm.LGBMClassifier()
        if hyper_opt == "random_search":
            model = RandomizedSearchCV(model, lgbm_parameters, n_iter=30,
                                       n_jobs=-1, verbose=0, cv=5,
                                       scoring='roc_auc', refit=True, random_state=42)
    else:
        sys.exit("ERROR: Unrecognized classification technique in configuration file. Please pick one or more from these options: ['rf', 'gdb', 'lgbm']")
    return model

# Transposes the data matrix to match the scikit-learn format (samples as rows),
# converts the labels to binary, and copies both to GPU memory as cuDF objects
def prepare_dataset(x, y):
    x = x.T
    y = y.apply(lambda x: dp.bool_to_binary(x))
    start = time.time()
    x = cudf.from_pandas(x)
    y = cudf.from_pandas(y)
    end = time.time()
    print("COPY ARRAY: ", end - start)
    return x, y

# Performs the classifier training using the training dataset
def model_train(path, x, y, classifier, debug_mode, iteration, hyper_opt, best_parameters):
    # DEBUG MODE
    if debug_mode:
        # Saves the input training dataset and labels
        debug_path = path + classifier + "/debug/" + str(iteration) + "/"
        dp.make_result_dir(debug_path)
        x.to_csv(debug_path + "/input_dataset.tsv", sep="\t")
        y.to_csv(debug_path + "/labels.tsv", sep="\t")

    # Selects the correct model
    model = get_model(classifier, hyper_opt)
    # Transforms the dataset into the expected format and moves it to the GPU
    x, y = prepare_dataset(x, y)
    # Reuses the best hyperparameters found by an earlier search
    if hyper_opt == "best":
        #print(best_parameters[1])
        #print(best_parameters)
        model.set_params(**best_parameters[1])
    print("CLASSIFIER: " + classifier)
    # Trains the model and reports the wall-clock training time
    start = time.time()
    model.fit(x, y)
    end = time.time()
    print("CLASSIFIER TRAINING TIME: ", end - start)

    if hyper_opt == "random_search":
        print(hyper_opt)
        best_parameters = model.best_params_

    # DEBUG MODE
    if debug_mode:
        # Saves the trained model
        # A load call can be added to restore the model for debugging purposes
        from joblib import dump, load
        dump(model, debug_path + 'model.joblib')

    return model, best_parameters
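A minimal usage sketch for this module, assuming a CUDA-capable GPU with cuDF and cuML installed; the module name classification_gpu, the synthetic data, and the configuration values are hypothetical, not taken from this commit:

import numpy as np
import pandas as pd
import classification_gpu as clf  # hypothetical name; the new file's name is not shown above

# Synthetic genes-by-samples expression matrix and boolean response labels (hypothetical data)
rng = np.random.default_rng(0)
x = pd.DataFrame(rng.random((50, 20)),
                 index=["gene_%d" % i for i in range(50)],
                 columns=["s%d" % j for j in range(20)])
y = pd.Series(rng.random(20) > 0.5, index=x.columns)

# Trains a GPU Random Forest without hyperparameter search; model_train transposes x
# to samples-by-genes and moves both inputs to the GPU via cudf.from_pandas
model, best_parameters = clf.model_train("results/", x, y, classifier="rf",
                                         debug_mode=False, iteration=0,
                                         hyper_opt="none", best_parameters=None)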
New file (+187 lines)

@@ -0,0 +1,187 @@
import pandas as pd
import cudf
import sys
from pathlib import Path
from sklearn.utils import resample

## Handler functions

# Handles whether to load the dataset from the BeatAML project or a different dataset
def load_dataset(url, project, normalization):
    # Loads BeatAML data
    if project.lower() == "beataml":
        dataset, samples = load_dataset_beatAML(url, normalization)
    else:
        dataset, samples = load_dataset_rnaseq(url)
    return dataset, samples

# Handles whether to load the labels from the BeatAML project or from a different dataset
def load_labels(url, project, drug_name):
    # Loads BeatAML data
    if project.lower() == "beataml":
        labels = load_labels_beatAML(url, drug_name)
    else:
        labels = load_labels_rnaseq(url)
    return labels

# Matches the samples from the dataset and labels, discarding any samples that are not present in both data matrices
def sample_match(dataset, labels, dataset_samples):
    labels = labels[labels['SID'].isin(dataset_samples)]
    dataset = dataset[labels['SID']]
    # dataset = dataset[labels['SID'].to_pandas()]
    samples = labels['SID']
    return dataset, labels, samples

## Functions that change label notation

def category_to_binary(group):
    if group == "high":
        return 1
    elif group == "low":
        return 0
    else:
        return -1

def group_to_bool(group):
    if group == "Positive":
        return True
    elif group == "Negative":
        return False
    else:
        return -1

def bool_to_group(bool):
    if bool == True:
        return "Positive"
    elif bool == False:
        return "Negative"
    else:
        return -1

def group_to_binary(group):
    if group == "Group 1" or group == 1:
        return 0
    elif group == "Group 2" or group == 2:
        return 1
    else:
        return -1

def binary_to_group(binary):
    if binary == 0:
        return "Group 1"
    elif binary == 1:
        return "Group 2"
    else:
        return "Unknown"

def bool_to_binary(bool):
    if bool == True:
        return 0
    elif bool == False:
        return 1
    else:
        return -1

def auc_to_binary(value, q1, q3):
    if value >= q3:
        return 1
    elif value <= q1:
        return 0
    else:
        return -1

### PROJECT DATASETS

## Loads the RNA Sequence Data Matrix from the BeatAML Project
def load_dataset_beatAML(url, normalization):
    if normalization == "cpm":
        #dataset = pd.read_excel(url + "variants_BeatAML.xlsx", sheet_name="Table S9-Gene Counts CPM", dtype = 'float64', converters = {'Gene': str, 'Symbol': str})
        dataset = pd.read_csv(url + "read_count_matrix.txt", dtype = 'float64', converters = {'Gene': str, 'Symbol': str}, sep="\t")
        # dataset = cudf.read_csv(url + "read_count_matrix.txt", dtype = 'float64', sep="\t")
        # dataset = dataset.astype({'Gene': str, 'Symbol': str})
    elif normalization == "rpkm":
        #dataset = pd.read_excel(url + "variants_BeatAML.xlsx", sheet_name="Table S8-Gene Counts RPKM", dtype = 'float64', converters = {'Gene': str, 'Symbol': str}, engine="openpyxl")
        dataset = cudf.read_excel(url + "variants_BeatAML.xlsx", sheet_name="Table S8-Gene Counts RPKM", dtype = 'float64', converters = {'Gene': str, 'Symbol': str}, engine="openpyxl")
    else:
        sys.exit("ERROR BeatAML Project: Dataset requested not available. List of available datasets are ['cpm', 'rpkm']")
    # Sets the gene ID as the index for the data matrix rows. THESE GENE ROWS ARE THE FEATURES
    # Makes selection/manipulation by features easier
    dataset = dataset.set_index('Gene')
    # Drops the symbol column since the gene ID is already used to track genes
    dataset = dataset.drop('Symbol', axis = 1)
    # Converts the 'X' separators in the column names back to '-' and gets the list of sample IDs from the dataset
    dataset.columns = [s.replace('X','-') for s in dataset.columns]
    samples = dataset.columns
    # dataset = cudf.from_pandas(dataset)
    return dataset, samples


## Loads the corresponding high responder/low responder labels for "drug_name" from the BeatAML Project
def load_labels_beatAML(url, drug_name):
    labels = pd.read_excel(url + "variants_BeatAML.xlsx", sheet_name="Table S10-Drug Responses", usecols = ['inhibitor', 'lab_id', 'auc', 'counts'], engine="openpyxl")
    # labels = cudf.read_csv(url + "drug_response.csv")
    # Keeps only drugs that were tested on more than 300 samples
    labels = labels[labels['counts'] > 300]
    labels = labels.drop('counts', axis = 1)
    # The commented-out code below shortens each drug name to its first token (dropping the
    # parenthesized aliases), which simplifies operations on drug names and saving results

    ## GPU
    # labels['drugs'] = labels[(labels.columns[0])].str.split(' ')
    # print(labels)
    # from numba import cuda

    # @cuda.jit
    # def sel_first(in_col, out_col):
    #     i = cuda.grid(1)
    #     if i < in_col.size:  # boundary guard
    #         out_col[i] = in_col[i][0]
    # size = labels[(labels.columns[0])].size
    # labels['inhibitor'] = 0.0
    # sel_first.forall(size)(labels[labels.columns[0]], labels['inhibitor'])
    # print(labels)

    #labels['inhibitor'] = labels['inhibitor'].apply(lambda x: x.split(' ')[0])
    # Checks if "drug_name" exists in the dataset
    if labels['inhibitor'].str.contains(drug_name).any():
        # Selects the "drug_name" drug data
        labels = labels[labels['inhibitor'] == drug_name]
        labels = labels[['lab_id', 'auc']]
        # Calculates the 1st and 3rd quartiles of the AUC distribution for "drug_name"
        q1 = labels['auc'].quantile(.25)
        q3 = labels['auc'].quantile(.75)
        # Assigns a classification group to each sample:
        # if the AUC score <= q1, the sample is classified as a "low responder" (0);
        # if the AUC score >= q3, the sample is classified as a "high responder" (1);
        # anything else is classified as -1 (and removed below)
        labels['GROUP'] = labels['auc'].apply(lambda x: auc_to_binary(x, q1, q3))
        labels = labels.drop('auc', axis = 1)
        # Filters out any samples that fell between the 1st and 3rd quartiles (anything classified as -1)
        labels = labels[labels['GROUP'].isin([0, 1])]
        labels = labels.rename(columns = {'lab_id':'SID'})
        # labels = cudf.from_pandas(labels)
        # print(labels)
    else:
        sys.exit("ERROR BeatAML Project: Labels requested not available. List of available labels are ['UNC2025A', 'original']")
    return labels

# Creates a new directory and any missing parent directories for the given path
# Used extensively to save results
def make_result_dir(path):
    Path(path).mkdir(parents=True, exist_ok=True)

#### !!!! CAN BE MODIFIED TO FIT YOUR OWN DATASET !!!! ####

## Function to load a new dataset
def load_dataset_rnaseq(url):
    # dataset = pd.read_csv(url, sep='\t', index_col=0)
    dataset = cudf.read_csv(url, sep='\t', index_col=0)
    samples = dataset.columns
    return dataset, samples

## Function to load new labels
def load_labels_rnaseq(url):
    # labels = pd.read_csv(url, sep='\t', index_col=0)
    labels = cudf.read_csv(url, sep='\t', index_col=0)
    return labels
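A minimal end-to-end sketch of the loading pipeline in this module; the data directory and drug name are hypothetical, and the BeatAML files (read_count_matrix.txt, variants_BeatAML.xlsx) are assumed to exist under url:

import data_preprocess as dp  # module name as imported by the classification code above

url = "data/beataml/"  # hypothetical path
# Loads the CPM-normalized expression matrix and the response labels for one inhibitor
dataset, samples = dp.load_dataset(url, project="BeatAML", normalization="cpm")
labels = dp.load_labels(url, project="BeatAML", drug_name="Venetoclax")  # hypothetical drug choice
# Keeps only the samples present in both the expression matrix and the labels
dataset, labels, samples = dp.sample_match(dataset, labels, samples)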
