@@ -39,7 +39,7 @@ def feature_selection(path, fs, iteration, input, labels, feature_size, classifi
39
39
# DIFFERENTIAL GENE EXPRESSION ANALYSIS
40
40
elif fs == 'dge' :
41
41
print ("PERFORMING DGE: " )
42
- dataset = dge (path , input ["x_train" ].T , input ["y_train" ], drug_name , project_info )
42
+ dataset = dge (path + fs + "/" + classifiers [ 0 ] + "/" + iteration , input ["x_train" ].T , input ["y_train" ], drug_name , project_info )
43
43
44
44
# FEATURE SWAPPING EXPERIMENT
45
45
elif fs == 'swap' :
@@ -123,7 +123,7 @@ def feature_selection(path, fs, iteration, input, labels, feature_size, classifi
123
123
# DIFFERENTIAL GENE EXPRESSION ANALYSIS
124
124
elif fs == 'dge' :
125
125
print ("PERFORMING DGE: " + str (iteration ) + "/" + str (total_iterations ))
126
- dataset = dge (path , input ["x_train" ][iteration ].T , input ["y_train" ][iteration ], drug_name , project_info )
126
+ dataset = dge (path + fs + "/" + classifiers [ 0 ] + "/" + str ( iteration ) + "/" , input ["x_train" ][iteration ].T , input ["y_train" ][iteration ], drug_name , project_info )
127
127
128
128
# FEATURE SWAPPING EXPERIMENT
129
129
elif fs == 'swap' :
@@ -198,7 +198,49 @@ def from_feature_list(path, dataset, labels, iteration, project_info):
198
198
# Feature Selection: "dge"
199
199
# Runs the DGE (limma) analysis for the drug and loads the features/genes it selected
200
200
def _write_dge_labels(labels, labels_file):
    """Write the tab-separated sample/group table passed to the DGE R script.

    The table has three columns: ``Sample`` (the ``SID`` index value with
    every '-' replaced by 'X' and an extra 'X' prepended), ``high`` (the
    ``GROUP`` column) and ``low`` (1 where ``high`` is falsy, else 0).

    NOTE(review): the 'X' renaming presumably mirrors the sample names the
    R script sees in the expression matrix -- confirm against the R side.
    """
    # reset_index() already returns a new frame, so the caller's labels
    # are never mutated.
    table = labels.reset_index()
    table["SID"] = ["X" + str(sid).replace("-", "X") for sid in table["SID"]]
    table = table.rename(columns={"SID": "Sample", "GROUP": "high"})
    table["low"] = np.logical_xor(table["high"], 1).astype(int)
    # The separator must be a single tab character.
    table.to_csv(labels_file, index=False, sep="\t")


def dge(path, dataset, labels, drug_name, project_info):
    """Run the DGE analysis for one drug and keep only the selected genes.

    Steps:
      1. write ``<dge_path><drug_name>_dge_input.txt`` from ``labels``;
      2. run ``beataml_deg_commandline.R`` (found under ``dge_path``);
      3. run ``limma.py`` (found under ``dge_path``);
      4. read ``<path><drug_name>_genes_selected.tsv`` and use its values
         to select columns of ``dataset``.

    Args:
        path: Output prefix for this run. It is concatenated directly with
            file names, so it is expected to end with a path separator.
        dataset: DataFrame whose columns are indexed by gene/feature name.
        labels: DataFrame with an index named ``SID`` and a ``GROUP`` column.
        drug_name: Drug identifier used in the file names and passed to
            both scripts.
        project_info: Mapping providing ``dge_path`` and ``dataset_path``.

    Returns:
        ``dataset`` restricted to the columns listed in the selected-genes
        file.

    Raises:
        subprocess.CalledProcessError: If either script exits with a
            non-zero status. Previously the status was ignored, so a stale
            selected-genes file from an earlier run could be read silently.
    """
    import subprocess
    import sys

    dge_dir = project_info['dge_path']
    labels_file = dge_dir + drug_name + '_dge_input.txt'
    _write_dge_labels(labels, labels_file)

    # Flush our own buffered output first so it is not interleaved with
    # (or printed after) the child's output.
    sys.stdout.flush()
    subprocess.check_call([
        dge_dir + "./beataml_deg_commandline.R",
        "--file=" + labels_file,
        "--name=" + path + drug_name,
        "--dir=" + dge_dir,
    ])

    sys.stdout.flush()
    subprocess.check_call([
        # Re-use the running interpreter so limma.py sees the same
        # environment; fall back to PATH lookup if it is unknown.
        sys.executable or "python",
        dge_dir + "limma.py",
        "--dataset=" + project_info['dataset_path'],
        "--dir=" + path,
        "--drug=" + drug_name,
    ])

    feature_set = pd.read_csv(path + drug_name + '_genes_selected.tsv', names=[drug_name])
    return dataset[feature_set[drug_name].values]
204
246
0 commit comments