
Commit c576775

Merge branch 'gpu_version' into main
2 parents: 52c6a08 + c191ba4

15 files changed: +2231 −1 lines

classification.py (+4 lines)
@@ -117,7 +117,11 @@ def model_train(path, x, y, classifier, debug_mode, iteration, hyper_opt, best_p
     #from dask.distributed import Client
     #client = Client(processes = False)
     #with joblib.parallel_backend("dask"):
+    import time
+    start = time.time()
     model.fit(x, y)
+    end = time.time()
+    print("CLASSIFIER TRAINING TIME: ", end - start)
 
     if hyper_opt == "random_search" or hyper_opt == "grid_search":
         best_parameters = model.best_params_
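The instrumentation added here follows a recurring start/end bracketing around time.time(), repeated in feature_selection.py below and in the new GPU modules. As a minimal sketch (hypothetical, not part of this commit), the pattern could be centralized in a small context manager:

from contextlib import contextmanager
import time

@contextmanager
def timed(label):
    # Prints the wall-clock duration of the enclosed block, mirroring the
    # start/end timing bracketing added throughout this commit
    start = time.time()
    try:
        yield
    finally:
        print(label + ": ", time.time() - start)

# Hypothetical usage:
# with timed("CLASSIFIER TRAINING TIME"):
#     model.fit(x, y)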

feature_selection.py (+7 lines)
@@ -7,6 +7,7 @@
 import xgboost
 from xgboost import plot_importance
 import sys
+import time
 
 # Feature selection wrapper, chooses the correct feature selection technique based on the configuration file parameters
 def feature_selection(path, fs, iteration, input, labels, feature_size, classifiers, feature_counter, debug_mode, project_info, drug_name):

@@ -30,7 +31,10 @@ def feature_selection(path, fs, iteration, input, labels, feature_size, classifi
     # SHAPLEY VALUE FEATURE SELECTION
     if fs == 'shap':
         print("PERFORMING SHAP: ")
+        start = time.time()
         dataset = shapley(path + fs + "/" + classifiers[0] + "/" + iteration, input["x_train"], input["y_train"], feature_size, 1)
+        end = time.time()
+        print("SHAP RUN TIME: ", end - start)
 
     # PRINCIPAL COMPONENT ANALYSIS
     elif fs == 'pca':

@@ -116,7 +120,10 @@ def feature_selection(path, fs, iteration, input, labels, feature_size, classifi
     # SHAPLEY VALUE FEATURE SELECTION
     if fs == 'shap':
         print("PERFORMING SHAP: " + str(iteration) + "/" + str(total_iterations))
+        start = time.time()
         dataset = shapley(path + fs + "/" + classifiers[0] + "/" + str(iteration), input["x_train"][iteration], input["y_train"][iteration], feature_size, 1)
+        end = time.time()
+        print("SHAP RUN TIME: ", end - start)
 
     # PRINCIPAL COMPONENT ANALYSIS
     elif fs == 'pca':
New file (+126 lines)

@@ -0,0 +1,126 @@
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import xgboost
import lightgbm
import numpy as np
import pandas as pd
import data_preprocess as dp
import cudf
from cuml.ensemble import RandomForestClassifier as cuRFC
import time
import sys

# Hyperparameter search space for the Random Forest classifier
rf_parameters = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [100, 150, 200, 250, 500, 750, 1000]}

# Hyperparameter search space for the XGBoost gradient boosting classifier
gdb_parameters = {
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
    'gamma': [0, 0.25, 0.5, 1.0],
    'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
    'n_estimators': [100, 150, 200, 250, 500, 750, 1000]}

# Hyperparameter search space for the LightGBM classifier
lgbm_parameters = {
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
    'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
    'n_estimators': [100, 150, 200, 250, 500, 750, 1000]}


# Classification wrapper used to select the correct classifier based on the configuration file selection
def get_model(classifier, hyper_opt):

    # Classifier: "rf"
    # Random Forest, cuML GPU implementation (the scikit-learn version is kept commented out)
    if classifier == 'rf':
        #model = RandomForestClassifier()
        model = cuRFC()
        # RandomizedSearchCV is used for hyperparameter optimization: it samples from
        # the search space above and keeps the best-performing model
        if hyper_opt == "random_search":
            model = RandomizedSearchCV(model, rf_parameters, n_iter=30,
                                       n_jobs=-1, verbose=0, cv=5,
                                       scoring='roc_auc', refit=True, random_state=42)
    # Classifier: "gdb"
    # Gradient Boosting, xgboost
    elif classifier == 'gdb':
        model = xgboost.XGBClassifier(eval_metric='logloss')
        if hyper_opt == "random_search":
            model = RandomizedSearchCV(model, gdb_parameters, n_iter=30,
                                       n_jobs=-1, verbose=0, cv=5,
                                       scoring='roc_auc', refit=True, random_state=42)
    # Classifier: "lgbm"
    # Gradient Boosting, lightgbm
    elif classifier == 'lgbm':
        model = lightgbm.LGBMClassifier()
        if hyper_opt == "random_search":
            model = RandomizedSearchCV(model, lgbm_parameters, n_iter=30,
                                       n_jobs=-1, verbose=0, cv=5,
                                       scoring='roc_auc', refit=True, random_state=42)
    else:
        sys.exit("ERROR: Unrecognized classification technique in configuration file. Please pick one or more from these options: ['rf', 'gdb', 'lgbm']")
    return model

# Transposes the data matrix to match the scikit-learn format (samples as rows),
# converts the labels to binary, and copies both to GPU memory as cuDF objects
def prepare_dataset(x, y):
    x = x.T
    y = y.apply(lambda x: dp.bool_to_binary(x))
    start = time.time()
    x = cudf.from_pandas(x)
    y = cudf.from_pandas(y)
    end = time.time()
    print("COPY ARRAY: ", end - start)
    return x, y

# Performs the classifier training using the training dataset
def model_train(path, x, y, classifier, debug_mode, iteration, hyper_opt, best_parameters):
    # DEBUG MODE
    if debug_mode:
        # Saves the input training dataset and labels
        debug_path = path + classifier + "/debug/" + str(iteration) + "/"
        dp.make_result_dir(debug_path)
        x.to_csv(debug_path + "/input_dataset.tsv", sep="\t")
        y.to_csv(debug_path + "/labels.tsv", sep="\t")

    # Selects the correct model
    model = get_model(classifier, hyper_opt)
    # Transforms the dataset into the expected format and moves it to the GPU
    x, y = prepare_dataset(x, y)
    # Reuses the best hyperparameters found by an earlier search
    if hyper_opt == "best":
        #print(best_parameters[1])
        #print(best_parameters)
        model.set_params(**best_parameters[1])
    print("CLASSIFIER: " + classifier)
    # Trains the model and reports the wall-clock training time
    start = time.time()
    model.fit(x, y)
    end = time.time()
    print("CLASSIFIER TRAINING TIME: ", end - start)

    if hyper_opt == "random_search":
        print(hyper_opt)
        best_parameters = model.best_params_

    # DEBUG MODE
    if debug_mode:
        # Saves the trained model
        # A load call can be added to restore the model for debugging purposes
        from joblib import dump, load
        dump(model, debug_path + 'model.joblib')

    return model, best_parameters
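A minimal usage sketch for this module, assuming a CUDA-capable GPU with cuDF and cuML installed; the module name classification_gpu, the synthetic data, and the configuration values are hypothetical, not taken from this commit:

import numpy as np
import pandas as pd
import classification_gpu as clf  # hypothetical name; the new file's name is not shown above

# Synthetic genes-by-samples expression matrix and boolean response labels (hypothetical data)
rng = np.random.default_rng(0)
x = pd.DataFrame(rng.random((50, 20)),
                 index=["gene_%d" % i for i in range(50)],
                 columns=["s%d" % j for j in range(20)])
y = pd.Series(rng.random(20) > 0.5, index=x.columns)

# Trains a GPU Random Forest without hyperparameter search; model_train transposes x
# to samples-by-genes and moves both inputs to the GPU via cudf.from_pandas
model, best_parameters = clf.model_train("results/", x, y, classifier="rf",
                                         debug_mode=False, iteration=0,
                                         hyper_opt="none", best_parameters=None)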
New file (+187 lines)

@@ -0,0 +1,187 @@
import pandas as pd
import cudf
import sys
from pathlib import Path
from sklearn.utils import resample

## Handler functions

# Handles whether to load the dataset from the BeatAML project or a different dataset
def load_dataset(url, project, normalization):
    # Loads BeatAML data
    if project.lower() == "beataml":
        dataset, samples = load_dataset_beatAML(url, normalization)
    else:
        dataset, samples = load_dataset_rnaseq(url)
    return dataset, samples

# Handles whether to load the labels from the BeatAML project or from a different dataset
def load_labels(url, project, drug_name):
    # Loads BeatAML data
    if project.lower() == "beataml":
        labels = load_labels_beatAML(url, drug_name)
    else:
        labels = load_labels_rnaseq(url)
    return labels

# Matches the samples from the dataset and labels, discarding any samples that are not present in both data matrices
def sample_match(dataset, labels, dataset_samples):
    labels = labels[labels['SID'].isin(dataset_samples)]
    dataset = dataset[labels['SID']]
    # dataset = dataset[labels['SID'].to_pandas()]
    samples = labels['SID']
    return dataset, labels, samples

## Functions that change label notation

def category_to_binary(group):
    if group == "high":
        return 1
    elif group == "low":
        return 0
    else:
        return -1

def group_to_bool(group):
    if group == "Positive":
        return True
    elif group == "Negative":
        return False
    else:
        return -1

def bool_to_group(bool):
    if bool == True:
        return "Positive"
    elif bool == False:
        return "Negative"
    else:
        return -1

def group_to_binary(group):
    if group == "Group 1" or group == 1:
        return 0
    elif group == "Group 2" or group == 2:
        return 1
    else:
        return -1

def binary_to_group(binary):
    if binary == 0:
        return "Group 1"
    elif binary == 1:
        return "Group 2"
    else:
        return "Unknown"

def bool_to_binary(bool):
    if bool == True:
        return 0
    elif bool == False:
        return 1
    else:
        return -1

def auc_to_binary(value, q1, q3):
    if value >= q3:
        return 1
    elif value <= q1:
        return 0
    else:
        return -1

### PROJECT DATASETS

## Loads the RNA Sequence Data Matrix from the BeatAML Project
def load_dataset_beatAML(url, normalization):
    if normalization == "cpm":
        #dataset = pd.read_excel(url + "variants_BeatAML.xlsx", sheet_name="Table S9-Gene Counts CPM", dtype = 'float64', converters = {'Gene': str, 'Symbol': str})
        dataset = pd.read_csv(url + "read_count_matrix.txt", dtype = 'float64', converters = {'Gene': str, 'Symbol': str}, sep="\t")
        # dataset = cudf.read_csv(url + "read_count_matrix.txt", dtype = 'float64', sep="\t")
        # dataset = dataset.astype({'Gene': str, 'Symbol': str})
    elif normalization == "rpkm":
        #dataset = pd.read_excel(url + "variants_BeatAML.xlsx", sheet_name="Table S8-Gene Counts RPKM", dtype = 'float64', converters = {'Gene': str, 'Symbol': str}, engine="openpyxl")
        dataset = cudf.read_excel(url + "variants_BeatAML.xlsx", sheet_name="Table S8-Gene Counts RPKM", dtype = 'float64', converters = {'Gene': str, 'Symbol': str}, engine="openpyxl")
    else:
        sys.exit("ERROR BeatAML Project: Dataset requested not available. List of available datasets are ['cpm', 'rpkm']")
    # Sets the gene ID as the index for the data matrix rows. THESE GENE ROWS ARE THE FEATURES
    # Makes selection/manipulation by features easier
    dataset = dataset.set_index('Gene')
    # Drops the symbol column since the gene ID is already used to track genes
    dataset = dataset.drop('Symbol', axis = 1)
    # Converts the 'X' separators in the column names back to '-' and gets the list of sample IDs from the dataset
    dataset.columns = [s.replace('X','-') for s in dataset.columns]
    samples = dataset.columns
    # dataset = cudf.from_pandas(dataset)
    return dataset, samples


## Loads the corresponding high responder/low responder labels for "drug_name" from the BeatAML Project
def load_labels_beatAML(url, drug_name):
    labels = pd.read_excel(url + "variants_BeatAML.xlsx", sheet_name="Table S10-Drug Responses", usecols = ['inhibitor', 'lab_id', 'auc', 'counts'], engine="openpyxl")
    # labels = cudf.read_csv(url + "drug_response.csv")
    # Keeps only drugs that were tested on more than 300 samples
    labels = labels[labels['counts'] > 300]
    labels = labels.drop('counts', axis = 1)
    # The commented-out code below shortens each drug name to its first token (dropping the
    # parenthesized aliases), which simplifies operations on drug names and saving results

    ## GPU
    # labels['drugs'] = labels[(labels.columns[0])].str.split(' ')
    # print(labels)
    # from numba import cuda

    # @cuda.jit
    # def sel_first(in_col, out_col):
    #     i = cuda.grid(1)
    #     if i < in_col.size:  # boundary guard
    #         out_col[i] = in_col[i][0]
    # size = labels[(labels.columns[0])].size
    # labels['inhibitor'] = 0.0
    # sel_first.forall(size)(labels[labels.columns[0]], labels['inhibitor'])
    # print(labels)

    #labels['inhibitor'] = labels['inhibitor'].apply(lambda x: x.split(' ')[0])
    # Checks if "drug_name" exists in the dataset
    if labels['inhibitor'].str.contains(drug_name).any():
        # Selects the "drug_name" drug data
        labels = labels[labels['inhibitor'] == drug_name]
        labels = labels[['lab_id', 'auc']]
        # Calculates the 1st and 3rd quartiles of the AUC distribution for "drug_name"
        q1 = labels['auc'].quantile(.25)
        q3 = labels['auc'].quantile(.75)
        # Assigns a classification group to each sample:
        # if the AUC score <= q1, the sample is classified as a "low responder" (0);
        # if the AUC score >= q3, the sample is classified as a "high responder" (1);
        # anything else is classified as -1 (and removed below)
        labels['GROUP'] = labels['auc'].apply(lambda x: auc_to_binary(x, q1, q3))
        labels = labels.drop('auc', axis = 1)
        # Filters out any samples that fell between the 1st and 3rd quartiles (anything classified as -1)
        labels = labels[labels['GROUP'].isin([0, 1])]
        labels = labels.rename(columns = {'lab_id':'SID'})
        # labels = cudf.from_pandas(labels)
        # print(labels)
    else:
        sys.exit("ERROR BeatAML Project: Labels requested not available. List of available labels are ['UNC2025A', 'original']")
    return labels

# Creates a new directory and any missing parent directories for the given path
# Used extensively to save results
def make_result_dir(path):
    Path(path).mkdir(parents=True, exist_ok=True)

#### !!!! CAN BE MODIFIED TO FIT YOUR OWN DATASET !!!! ####

## Function to load a new dataset
def load_dataset_rnaseq(url):
    # dataset = pd.read_csv(url, sep='\t', index_col=0)
    dataset = cudf.read_csv(url, sep='\t', index_col=0)
    samples = dataset.columns
    return dataset, samples

## Function to load new labels
def load_labels_rnaseq(url):
    # labels = pd.read_csv(url, sep='\t', index_col=0)
    labels = cudf.read_csv(url, sep='\t', index_col=0)
    return labels
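A minimal end-to-end sketch of the loading pipeline in this module; the data directory and drug name are hypothetical, and the BeatAML files (read_count_matrix.txt, variants_BeatAML.xlsx) are assumed to exist under url:

import data_preprocess as dp  # module name as imported by the classification code above

url = "data/beataml/"  # hypothetical path
# Loads the CPM-normalized expression matrix and the response labels for one inhibitor
dataset, samples = dp.load_dataset(url, project="BeatAML", normalization="cpm")
labels = dp.load_labels(url, project="BeatAML", drug_name="Venetoclax")  # hypothetical drug choice
# Keeps only the samples present in both the expression matrix and the labels
dataset, labels, samples = dp.sample_match(dataset, labels, samples)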
