Skip to content

Commit a59e512

Browse files
committed
Changed DGE feature selection so that it runs the DGE algorithm itself instead of reading a precomputed result file
1 parent 4aa3ceb commit a59e512

15 files changed

+1007
-327
lines changed

.Rhistory

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
if(require("docopt")){
2+
print("docopt loaded correctly")
3+
} else {
4+
print("Trying to install docopt")
5+
install.packages('docopt', repos="http://cran.r-project.org")
6+
if (require("docopt")){
7+
print("docopt installed and loaded")
8+
} else {
9+
stop("could not install docopt")
10+
}
11+
}
12+
if(require("limma")){
13+
print("limma loaded correctly")
14+
} else {
15+
print("Trying to install limma")
16+
source("http://bioconductor.org/biocLite.R")
17+
biocLite("limma")
18+
if (require("limma")){
19+
print("limma installed and loaded")
20+
} else {
21+
stop("could not install limma")
22+
}
23+
}
24+
if (!require("BiocManager", quietly = TRUE))
25+
BiocManager::install(version = "3.15")
26+
BiocManager::install(c("limma"))
27+
BiocManager::install(c("limma", "edgeR"))
28+
if(require("limma")){
29+
print("limma loaded correctly")
30+
} else {
31+
print("Trying to install limma")
32+
source("http://bioconductor.org/biocLite.R")
33+
biocLite("limma")
34+
if (require("limma")){
35+
print("limma installed and loaded")
36+
} else {
37+
stop("could not install limma")
38+
}
39+
}
40+
if(require("edgeR")){
41+
print("edgeR loaded correctly")
42+
} else {
43+
print("Trying to install edgeR")
44+
source("http://bioconductor.org/biocLite.R")
45+
biocLite("edgeR")
46+
if (require("edgeR")){
47+
print("edgeR installed and loaded")
48+
} else {
49+
stop("could not install edgeR")
50+
}
51+
}
52+
if(require("tidyr")){
53+
print("tidyr loaded correctly")
54+
} else {
55+
print("Trying to install tidyr")
56+
install.packages('tidyr', repos="http://cran.r-project.org")
57+
if (require("tidyr")){
58+
print("tidyr installed and loaded")
59+
} else {
60+
stop("could not install tidyr")
61+
}
62+
}
63+
if(require("matrixStats")){
64+
print("matrixStats loaded correctly")
65+
} else {
66+
print("Trying to install matrixStats")
67+
install.packages('matrixStats', repos="http://cran.r-project.org")
68+
if (require("matrixStats")){
69+
print("matrixStats installed and loaded")
70+
} else {
71+
stop("could not install matrixStats")
72+
}
73+
}
74+
## Args to vars
75+
file <- args$`--file`
76+
args <- docopt(doc)
77+
"
78+
Usage:
79+
edger_pipe.r --file=<file> --name=<name> --dir=<dir>
80+
edger_pipe.r (-h | --help)
81+
Description: Runs an edgeR analysis on input RSEM files and conditions.
82+
Options:
83+
--file=<file> File detailing samples and groups
84+
--dir=<dir> Working directory
85+
--name=<name> A prefix for output files
86+
" -> doc
87+
args <- docopt(doc)
88+
## Args to vars
89+
file <- args$`--file`
90+
"
91+
Usage:
92+
edger_pipe.r --file=<file> --name=<name> --dir=<dir>
93+
edger_pipe.r (-h | --help)
94+
Description: Runs an edgeR analysis on input RSEM files and conditions.
95+
Options:
96+
--file=<file> File detailing samples and groups
97+
--dir=<dir> Working directory
98+
--name=<name> A prefix for output files
99+
" -> doc
100+
args <- docopt(doc)

collect_results.py

Lines changed: 0 additions & 66 deletions
This file was deleted.

data_preprocess.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,8 @@ def auc_to_binary(value, q1, q3):
9393
## Loads the RNA Sequence Data Matrix from the BeatAML Project
9494
def load_dataset_beatAML(url, normalization):
9595
if normalization == "cpm":
96-
dataset = pd.read_excel(url + "variants_BeatAML.xlsx", sheet_name="Table S9-Gene Counts CPM", dtype = 'float64', converters = {'Gene': str, 'Symbol': str})
96+
#dataset = pd.read_excel(url + "variants_BeatAML.xlsx", sheet_name="Table S9-Gene Counts CPM", dtype = 'float64', converters = {'Gene': str, 'Symbol': str})
97+
dataset = pd.read_csv(url + "read_count_matrix.txt", dtype = 'float64', converters = {'Gene': str, 'Symbol': str}, sep="\t")
9798
elif normalization == "rpkm":
9899
dataset = pd.read_excel(url + "variants_BeatAML.xlsx", sheet_name="Table S8-Gene Counts RPKM", dtype = 'float64', converters = {'Gene': str, 'Symbol': str})
99100
else:
@@ -104,6 +105,7 @@ def load_dataset_beatAML(url, normalization):
104105
# Drops symbol column since gene ID is already being used to track back
105106
dataset = dataset.drop('Symbol', axis = 1)
106107
# Gets the list of sample IDs from the dataset
108+
dataset.columns = [s.replace('X','-') for s in dataset.columns]
107109
samples = dataset.columns
108110
return dataset, samples
109111

feature_selection.py

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def feature_selection(path, fs, iteration, input, labels, feature_size, classifi
3939
# DIFFERENTIAL GENE EXPRESSION ANALYSIS
4040
elif fs == 'dge':
4141
print("PERFORMING DGE: ")
42-
dataset = dge(path, input["x_train"].T, input["y_train"], drug_name, project_info)
42+
dataset = dge(path + fs + "/" + classifiers[0] + "/" + iteration, input["x_train"].T, input["y_train"], drug_name, project_info)
4343

4444
# FEATURE SWAPPING EXPERIMENT
4545
elif fs == 'swap':
@@ -123,7 +123,7 @@ def feature_selection(path, fs, iteration, input, labels, feature_size, classifi
123123
# DIFFERENTIAL GENE EXPRESSION ANALYSIS
124124
elif fs == 'dge':
125125
print("PERFORMING DGE: " + str(iteration) + "/" + str(total_iterations))
126-
dataset = dge(path, input["x_train"][iteration].T, input["y_train"][iteration], drug_name, project_info)
126+
dataset = dge(path + fs + "/" + classifiers[0] + "/" + str(iteration) + "/", input["x_train"][iteration].T, input["y_train"][iteration], drug_name, project_info)
127127

128128
# FEATURE SWAPPING EXPERIMENT
129129
elif fs == 'swap':
@@ -198,7 +198,49 @@ def from_feature_list(path, dataset, labels, iteration, project_info):
198198
# Feature Selection: "dge"
199199
# Runs the DGE analysis (R script followed by limma.py) and loads the features/genes it selected
200200
def dge(path, dataset, labels, drug_name, project_info):
    """Run the DGE pipeline for one drug and keep only the genes it selects.

    Writes a tab-separated label file, runs ``beataml_deg_commandline.R``
    and then ``limma.py`` (both located under ``project_info['dge_path']``),
    and finally reads ``<path><drug_name>_genes_selected.tsv`` to filter
    ``dataset``.

    Args:
        path: Output prefix. Used verbatim (plain string concatenation) as
            the ``--name`` prefix for the R script, as ``--dir`` for
            ``limma.py`` and as the location of the selected-genes file.
        dataset: DataFrame whose columns are indexed by the gene identifiers
            listed in the selected-genes file.
        labels: Object with ``copy()``/``reset_index()`` that yields a ``SID``
            column and a ``GROUP`` column (0/1) after ``reset_index()``.
        drug_name: Drug identifier; used in file names and as the column name
            of the selected-genes table.
        project_info: Mapping providing ``'dge_path'`` and ``'dataset_path'``.

    Returns:
        ``dataset`` restricted to the selected gene columns.

    Raises:
        subprocess.CalledProcessError: If either external script exits with a
            non-zero status.
        OSError: If a script cannot be launched at all.
    """
    import subprocess
    import sys

    dge_dir = project_info['dge_path']

    # Generate the DGE label file used by the limma R script.
    dge_labels_file = dge_dir + drug_name + '_dge_input.txt'
    dge_labels = labels.copy().reset_index()
    # Sample IDs get '-' replaced by 'X' and an 'X' prefix.
    # NOTE(review): presumably this mirrors how R mangles the count-matrix
    # column names -- confirm against the R script.
    sample_ids = dge_labels['SID'].astype(str).str.replace('-', 'X', regex=False)
    dge_labels['SID'] = 'X' + sample_ids
    dge_labels = dge_labels.rename(columns={'SID': 'Sample', 'GROUP': 'high'})
    # 'low' is the complement of the binary 'high' indicator.
    dge_labels['low'] = np.logical_xor(dge_labels['high'], 1).astype(int)
    dge_labels.to_csv(dge_labels_file, index=False, sep="\t")

    # Flush our own buffered output so it is not interleaved with the
    # children's output.
    sys.stdout.flush()
    # check=True: a failed run must abort here; otherwise a stale or missing
    # '*_genes_selected.tsv' would be picked up further down.
    subprocess.run(
        [
            dge_dir + "./beataml_deg_commandline.R",
            "--file=" + dge_labels_file,
            "--name=" + path + drug_name,
            "--dir=" + dge_dir,
        ],
        check=True,
    )

    sys.stdout.flush()
    subprocess.run(
        [
            # Use the interpreter running this process so the child sees the
            # same environment; fall back to PATH lookup if it is unknown.
            sys.executable or "python",
            dge_dir + "limma.py",
            "--dataset=" + project_info['dataset_path'],
            "--dir=" + path,
            "--drug=" + drug_name,
        ],
        check=True,
    )

    feature_set = pd.read_csv(path + drug_name + '_genes_selected.tsv', names=[drug_name])
    filtered = dataset[feature_set[drug_name].values]
    return filtered
204246

get_accuracy_report.py

Lines changed: 0 additions & 114 deletions
This file was deleted.

0 commit comments

Comments
 (0)