UD-CRPL
diff --git a/.Rhistory b/.Rhistory
Lines changed: 100 additions & 0 deletions
diff --git a/collect_results.py b/collect_results.py
Lines changed: 0 additions & 66 deletions
diff --git a/data_preprocess.py b/data_preprocess.py
Lines changed: 3 additions & 1 deletion
diff --git a/feature_selection.py b/feature_selection.py
Lines changed: 45 additions & 3 deletions
diff --git a/get_accuracy_report.py b/get_accuracy_report.py
Lines changed: 0 additions & 114 deletions
@@ -0,0 +1,100 @@
+if(require("docopt")){
+print("docopt loaded correctly")
+} else {
+print("Trying to install docopt")
+install.packages('docopt', repos="http://cran.r-project.org")
+if (require("docopt")){
+print("docopt installed and loaded")
+} else {
+stop("could not install docopt")
+}
+}
+if(require("limma")){
+print("limma loaded correctly")
+} else {
+print("Trying to install limma")
+source("http://bioconductor.org/biocLite.R")
+biocLite("limma")
+if (require("limma")){
+print("limma installed and loaded")
+} else {
+stop("could not install limma")
+}
+}
+if (!require("BiocManager", quietly = TRUE))
+BiocManager::install(version = "3.15")
+BiocManager::install(c("limma"))
+BiocManager::install(c("limma", "edgeR"))
+if(require("limma")){
+print("limma loaded correctly")
+} else {
+print("Trying to install limma")
+source("http://bioconductor.org/biocLite.R")
+biocLite("limma")
+if (require("limma")){
+print("limma installed and loaded")
+} else {
+stop("could not install limma")
+}
+}
+if(require("edgeR")){
+print("edgeR loaded correctly")
+} else {
+print("Trying to install edgeR")
+source("http://bioconductor.org/biocLite.R")
+biocLite("edgeR")
+if (require("edgeR")){
+print("edgeR installed and loaded")
+} else {
+stop("could not install edgeR")
+}
+}
+if(require("tidyr")){
+print("tidyr loaded correctly")
+} else {
+print("Trying to install tidyr")
+install.packages('tidyr', repos="http://cran.r-project.org")
+if (require("tidyr")){
+print("tidyr installed and loaded")
+} else {
+stop("could not install tidyr")
+}
+}
+if(require("matrixStats")){
+print("matrixStats loaded correctly")
+} else {
+print("Trying to install matrixStats")
+install.packages('matrixStats', repos="http://cran.r-project.org")
+if (require("matrixStats")){
+print("matrixStats installed and loaded")
+} else {
+stop("could not install matrixStats")
+}
+}
+## Args to vars
+file <- args$`--file`
+args    <- docopt(doc)
+"
+Usage:
+edger_pipe.r --file=<file> --name=<name> --dir=<dir>
+edger_pipe.r (-h | --help)
+Description:   Runs an edgeR analysis on input RSEM files and conditions.
+Options:
+--file=<file>     File detailing samples and groups
+--dir=<dir>       Working directory
+--name=<name>     A prefix for output files
+" -> doc
+args    <- docopt(doc)
+## Args to vars
+file <- args$`--file`
+"
+Usage:
+edger_pipe.r --file=<file> --name=<name> --dir=<dir>
+edger_pipe.r (-h | --help)
+Description:   Runs an edgeR analysis on input RSEM files and conditions.
+Options:
+--file=<file>     File detailing samples and groups
+--dir=<dir>       Working directory
+--name=<name>     A prefix for output files
+" -> doc
+args    <- docopt(doc)
@@ -93,7 +93,8 @@ def auc_to_binary(value, q1, q3):
 ## Loads the RNA Sequence Data Matrix from the BeatAML Project
 def load_dataset_beatAML(url, normalization):
     if normalization == "cpm":
-        dataset = pd.read_excel(url + "variants_BeatAML.xlsx", sheet_name="Table S9-Gene Counts CPM", dtype = 'float64', converters = {'Gene': str, 'Symbol': str})
+        #dataset = pd.read_excel(url + "variants_BeatAML.xlsx", sheet_name="Table S9-Gene Counts CPM", dtype = 'float64', converters = {'Gene': str, 'Symbol': str})
+        dataset = pd.read_csv(url + "read_count_matrix.txt", dtype = 'float64', converters = {'Gene': str, 'Symbol': str}, sep="\t")
     elif normalization == "rpkm":
         dataset = pd.read_excel(url + "variants_BeatAML.xlsx", sheet_name="Table S8-Gene Counts RPKM", dtype = 'float64', converters = {'Gene': str, 'Symbol': str})
     else:
@@ -104,6 +105,7 @@ def load_dataset_beatAML(url, normalization):
     # Drops symbol column since gene ID is already being used to track back
     dataset = dataset.drop('Symbol', axis = 1)
     # Gets the list of sample IDs from the dataset
+    dataset.columns = [s.replace('X','-') for s in dataset.columns]
     samples = dataset.columns
     return dataset, samples
 
 
@@ -39,7 +39,7 @@ def feature_selection(path, fs, iteration, input, labels, feature_size, classifi
             # DIFFERENTIAL GENE EXPRESSION ANALYSIS
         elif fs == 'dge':
             print("PERFORMING DGE: ")
-            dataset = dge(path, input["x_train"].T, input["y_train"], drug_name, project_info)
+            dataset = dge(path + fs + "/" + classifiers[0] + "/" + iteration, input["x_train"].T, input["y_train"], drug_name, project_info)
 
             # FEATURE SWAPPING EXPERIMENT
         elif fs == 'swap':
@@ -123,7 +123,7 @@ def feature_selection(path, fs, iteration, input, labels, feature_size, classifi
             # DIFFERENTIAL GENE EXPRESSION ANALYSIS
         elif fs == 'dge':
             print("PERFORMING DGE: " + str(iteration) + "/" + str(total_iterations))
-            dataset = dge(path, input["x_train"][iteration].T, input["y_train"][iteration], drug_name, project_info)
+            dataset = dge(path + fs + "/" + classifiers[0] + "/" + str(iteration) + "/", input["x_train"][iteration].T, input["y_train"][iteration], drug_name, project_info)
 
             # FEATURE SWAPPING EXPERIMENT
         elif fs == 'swap':
@@ -198,7 +198,49 @@ def from_feature_list(path, dataset, labels, iteration, project_info):
 # Feature Selection: "dge"
 # Loads the features/genes that were selected by the DGE analysis
 def dge(path, dataset, labels, drug_name, project_info):
-    feature_set = pd.read_csv(project_info['dge_path'] + drug_name + '_genes_selected.tsv', names=[drug_name])
+
+    # Generate the DGE label file used as input for the limma R script
+    dge_labels_file = project_info['dge_path'] + drug_name + '_dge_input.txt'
+    dge_labels = labels.copy()
+    dge_labels = dge_labels.reset_index()
+    dge_labels["SID"] = [s.replace('-','X') for s in dge_labels["SID"]]
+    dge_labels['SID'] = 'X' + dge_labels['SID'].astype(str)
+    dge_labels = dge_labels.rename(columns = {'SID':'Sample'})
+    dge_labels = dge_labels.rename(columns = {'GROUP':'high'})
+    dge_labels['low'] = np.logical_xor(dge_labels['high'],1).astype(int)
+    dge_labels.to_csv(dge_labels_file, index=False, sep="\t")
+    #print(dge_labels)
+
+    import sys
+    import subprocess
+
+    dge_script  = "./beataml_deg_commandline.R"
+    workdir   = "--dir=" + project_info['dge_path']
+    file = "--file=" + dge_labels_file
+    name = "--name=" + path + drug_name
+    sys.stdout.flush()
+    jobargz = []
+    jobargz.append(file)
+    jobargz.append(name)
+    jobargz.append(workdir)
+    runlaunch = subprocess.Popen([project_info['dge_path'] + dge_script] + jobargz)
+    runlaunch.wait()
+
+    limma_script  = "limma.py"
+    dataset_path = "--dataset=" + project_info['dataset_path']
+    dname = "--drug=" + drug_name
+    result_path = "--dir=" + path
+    sys.stdout.flush()
+    jobargz = []
+    jobargz.append(dataset_path)
+    jobargz.append(result_path)
+    jobargz.append(dname)
+    #jobargz.append(workdir)
+    runlaunch = subprocess.Popen(["python", project_info['dge_path'] + limma_script] + jobargz)
+    runlaunch.wait()
+
+    feature_set = pd.read_csv(path + drug_name + '_genes_selected.tsv', names=[drug_name])
+    #feature_set = pd.read_csv(project_info['dge_path'] + drug_name + '_genes_selected.tsv', names=[drug_name])
     filtered = dataset[feature_set[drug_name].values]
     return filtered