2
2
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~#~Cell-State-info~#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
3
3
###############################################################################
4
4
5
- #### Note: This script is used when Cyclin B1 is gated on the *biaxial plot*.
6
- # This script takes txt files exported from Cytobank as input
7
- # The gating strategy is decribed in the Organoid Methods Paper, Supplementary Figure 2
8
- # Since G1 cells are identified with negative selection, they are not in an explicit cell gate
9
- # Therefore the identification of G1 cells is based on the 'Cell_Index' column (ungated minus all the others)
10
- # The output dataframes have two extra columns: 'cell-state' and 'cell-state_num'
11
- # 0 - apoptosis, 1 - G0, 2 - G1, 3 - S-phase, 4 - G2, 5 - M-phase
12
- # For each cell-type, 6 input files are needed: Ungated, apoptosis, G0, S-phase, G2, and M-phase
13
- # The files need to be named as 'sample-name_cell-type_cell-state'
14
-
15
5
# setup the environment
16
6
import sys
17
7
import os
20
10
import numpy as np
21
11
from aux_functions import yes_or_NO
22
12
13
+ #### Note: This script is used when Cyclin B1 is gated on the *histogram*.
14
+ # This script takes txt files exported from Cytobank as input
15
+ # The gating strategy is described in the Organoid Methods Paper, Supplementary Figure 2
16
+ # When Cyclin B1 is gated on the histogram, G1 cells can be identified as a population and exported as a standalone txt file
17
+ # This makes assigning the G1 cell-state simpler than inferring it from the 'Cell_Index' column
18
+ # The output dataframes have two extra columns: 'cell-state' and 'cell-state_num'
19
+ # 0 - apoptosis, 1 - G0, 2 - G1, 3 - S-phase, 4 - G2, 5 - M-phase (-1 - unrecognised cell-state label)
20
+
23
21
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Sanity Check~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
24
22
# Confirm the naming convention before doing any work: sample-id, cell-type and
# cell-state are all parsed from the file names later in this script.
file_name_format = yes_or_NO("Are all the files named in the 'sample-name_cell-type_..._cell-state' format?")
if not file_name_format:
    # sys.exit with a string prints the message to stderr and exits with status 1
    sys.exit("Please rename the files to the 'sample_cell-type_cell-state' format\n Accepted cell-states (literal): Ungated, apoptosis, G0, S-phase, G2, and M-phase")
27
25
28
-
29
26
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Preparatory steps~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
30
27
# name of the sub-folder used under both ./input and ./output
folder_name = "6-cell_state_info"

# prepare files: only the .txt files in the input folder are considered
filelist = [f for f in os.listdir(f"./input/{folder_name}") if f.endswith(".txt")]
34
31
35
- # generate lists of sample-id, cell-types, and cell-states for iteration
32
+ # generate sample summary: cell-type per sample
36
33
# the sample files should be named in the following format:
37
34
# 'sample-id_cell-type_..._cell-state.txt'
38
35
39
- sample_id = [f .split ('_' )[0 ] for f in filelist ]
40
- sample_id = list (set (sample_id ))
41
- print ("Samples:" )
42
- print ('\n ' .join ([s for s in sample_id ]))
43
-
44
- cell_type = [f .split ('_' )[1 ] for f in filelist ]
45
- cell_type = list (set (cell_type ))
46
- print ("\n Cell-types:" )
47
- print ([t for t in cell_type ])
36
# unique sample-ids (first '_'-separated field of each file name);
# sorted so that the summary and the processing order are reproducible
sample_id = sorted({f.split('_')[0] for f in filelist})
for s in sample_id:
    # compare the sample-id field exactly: a prefix test would also match
    # files of another sample whose id merely starts with the same characters
    # (e.g. 'S1' vs 'S10'); reuse 'filelist' so only .txt files are considered
    cell_type = sorted({f.split('_')[1] for f in filelist if f.split('_')[0] == s})
    print(f"Sample: {s}\nCell-type(s):{cell_type}\n")
48
41
49
- print ("\n Cell-states:" )
50
- cell_state = [f .split ('.txt' )[0 ].split ('_' )[- 1 ] for f in filelist ]
51
- cell_state = list (set (cell_state ))
52
- print ([s for s in cell_state ])
53
-
54
- # generate a dictionary of dataframes
55
- # keys: sample-id_cell-type_cell-state
56
- # values: a dataframe per txt file
42
+ # for each sample, generate a dictionary 'dfs' with cell-types as keys (dictionary level 1)
43
+ # then for each cell-type, generate a nested dictionary with cell-states as keys
44
+ # the value of the dfs[sample_id][cell_type][cell_state] item is a dataframe for each cell-state
45
+ # with the newly assigned 'cell-state' and 'cell-state_num' columns
57
46
dfs = {}
58
- for file in filelist :
59
- df = pd .read_csv (f'./input/{ folder_name } /{ file } ' , sep = '\t ' , index_col = 0 )
60
- s_id = file .split ('_' )[0 ]
61
- c_type = file .split ('_' )[1 ]
62
- c_state = file .split ('.txt' )[0 ].split ('_' )[- 1 ]
63
- dfs [s_id + '_' + c_type + '_' + c_state ] = df
64
-
65
- # dfs.keys()
66
-
67
- # subset the dfs dictionary based on sample-id and cell-type
68
- # initialise a nested dictionary: dfs_sub[sample-id][cell-type]
69
- dfs_sub = {}
70
- for s_id in sample_id :
71
- dfs_sub [s_id ] = {}
72
- for c_type in cell_type :
73
- dfs_sub [s_id ][c_type ] = {k :v for k ,v in dfs .items () if k .split ('_' )[0 ]
74
- == s_id and k .split ('_' )[1 ] == c_type }
75
-
76
- # for s_id in sample_id:
77
- # print(f'{s_id}: {dfs_sub[s_id].keys()}')
78
-
79
- # make a copy of the dfs_sub dictionary for cell-state annotation later
80
- dfs_sub_copy = copy .deepcopy (dfs_sub )
81
-
82
-
83
- #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Cell state ID~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
84
- # keep only the 'Cell_Index' column to compare each cell-state population against the 'Ungated'
85
- # this overwrites the dfs_sub dictionary and replaces the dataframes with Series...
86
- # may not be the best solution
87
- for k , v in dfs_sub .items ():
88
- for k1 , v1 in v .items ():
89
- for k2 , v2 in v1 .items ():
90
- v1 [k2 ] = v2 .loc [:,'Cell_Index' ].copy ()
91
-
92
- # for each sample, perform the following steps:
93
- # for each cell-type, concatenate all the gated cell-states (apoptosis, G0, S,
94
- # G2, and M) with the 'Ungated' population and then remove duplicates so all
95
- # the gated cell-states will be removed from the ungated population
96
- # --> leaving the indices of cells in G1
97
- # now the values of the dfs_sub dictionary are the indices of cells of all the six cell-states
98
- for k , v in dfs_sub .items ():
99
- for k1 , v1 in v .items ():
100
- s_id = k
101
- c_type = k1
102
- g1 = s_id + '_' + c_type + '_G1'
103
- df_tmp = pd .DataFrame ()
104
-
105
- for k2 , v2 in v1 .items ():
106
- df_tmp = pd .concat ([df_tmp , v2 ])
107
-
108
- df_tmp = df_tmp .drop_duplicates (keep = False ).copy ()
109
- v1 [g1 ] = df_tmp .iloc [:, 0 ] # save the indices of G1 cells as a pandas Series
110
-
111
- # use the indices of G1 cells stored in dfs_sub to subset the dfs_sub_copy and
112
- # get the G1 population of each cell-type
113
47
# numeric code for each recognised cell-state label (matched case-insensitively)
cell_state_codes = {
    'apoptosis': 0,
    'g0': 1,
    'g1': 2,
    's-phase': 3,
    'g2': 4,
    'm-phase': 5,
}

# fill dfs[sample-id][cell-type][cell-state] with one annotated dataframe per file
for s_id in sample_id:
    dfs[s_id] = {}
    for file in filelist:
        # file names follow 'sample-id_cell-type_..._cell-state.txt'
        fields = file.split('.txt')[0].split('_')
        if fields[0] != s_id:
            continue  # exact match on the sample-id field, not a prefix match
        c_type = fields[1]
        c_state = fields[-1]

        # each file is read exactly once
        df = pd.read_csv(f'./input/{folder_name}/{file}', sep='\t', index_col=0)
        df['cell-state'] = c_state
        # str.lower() (there is no str.tolower()); -1 flags mislabelled cell-states
        df['cell-state_num'] = cell_state_codes.get(c_state.lower(), -1)
        dfs[s_id].setdefault(c_type, {})[c_state] = df
73
+
74
+ # iterate through the nested dictionaries, concatenate all cell-state dataframes per sample/cell-type
75
# iterate through the nested dictionaries, concatenate all cell-state dataframes
# per sample/cell-type and write one tab-separated file for each combination
os.makedirs(f"./output/{folder_name}", exist_ok=True)  # to_csv does not create folders
for s_id, cell_types in dfs.items():
    for c_type, cell_states in cell_types.items():
        # NOTE(review): every dataframe stored for this cell-type is written out,
        # including any file whose cell-state label was not recognised
        # (cell-state_num == -1) - confirm that no 'Ungated' export is expected here
        frames = list(cell_states.values())
        # a single concat call instead of growing a dataframe inside the loop
        data = pd.concat(frames) if frames else pd.DataFrame()
        data.to_csv(f"./output/{folder_name}/{s_id}_{c_type}_w-cell-state.txt", sep='\t', index=False)
0 commit comments