Skip to content

Commit 3ba4f2b

Browse files
authored
Merge pull request #40 from TAPE-Lab/Xiao_local
6-cell-state-info.py rewritten
2 parents 8ba8074 + 93c794c commit 3ba4f2b

File tree

1 file changed

+49
-126
lines changed

1 file changed

+49
-126
lines changed

Workflow/6-cell_state_info.py

+49-126
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,6 @@
22
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~#~Cell-State-info~#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
33
###############################################################################
44

5-
#### Note: This script is used when Cyclin B1 is gated on the *biaxial plot*.
6-
# This script takes txt files exported from Cytobank as input
7-
# The gating strategy is decribed in the Organoid Methods Paper, Supplementary Figure 2
8-
# Since G1 cells are identified with negative selection, they are not in an explicit cell gate
9-
# Therefore the identification of G1 cells is based on the 'Cell_Index' column (ungated minus all the others)
10-
# The output dataframes have two extra columns: 'cell-state' and 'cell-state_num'
11-
# 0 - apoptosis, 1 - G0, 2 - G1, 3 - S-phase, 4 - G2, 5 - M-phase
12-
# For each cell-type, 6 input files are needed: Ungated, apoptosis, G0, S-phase, G2, and M-phase
13-
# The files need to be named as 'sample-name_cell-type_cell-state'
14-
155
# setup the environment
166
import sys
177
import os
@@ -20,140 +10,73 @@
2010
import numpy as np
2111
from aux_functions import yes_or_NO
2212

13+
#### Note: This script is used when Cyclin B1 is gated on the *histogram*.
14+
# This script takes txt files exported from Cytobank as input
15+
# The gating strategy is described in the Organoid Methods Paper, Supplementary Figure 2
16+
# When Cyclin B1 is gated on the histogram, G1 cells can be identified as a population and exported as a standalone txt file
17+
# This makes assigning the G1 cell-state simpler than the previous approach, which inferred G1 cells from the 'Cell_Index' column
18+
# The output dataframes have two extra columns: 'cell-state' and 'cell-state_num'
19+
# 0 - apoptosis, 1 - G0, 2 - G1, 3 - S-phase, 4 - G2, 5 - M-phase
20+
2321
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Sanity Check~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
2422
# Ask the user to confirm the naming convention up front: sample-id, cell-type
# and cell-state are all parsed from the file names further down.
# NOTE(review): assumes yes_or_NO returns a bool - confirm in aux_functions.
file_name_format = yes_or_NO("Are all the files named in the 'sample-name_cell-type_..._cell-state' format?")
if not file_name_format:
    # G1 is exported as a standalone file in this (histogram-gated) workflow,
    # so it is listed here; 'Ungated' is no longer an expected input.
    sys.exit(
        "Please rename the files to the 'sample-name_cell-type_..._cell-state' format\n"
        " Accepted cell-states (literal): apoptosis, G0, G1, S-phase, G2, and M-phase"
    )
2725

28-
2926
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Preparatory steps~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
3027
# Name of the sub-folder that holds this step's input files.
folder_name = "6-cell_state_info"

# Keep only the txt files found in the input folder.
filelist = list(
    filter(
        lambda name: name.endswith(".txt"),
        os.listdir(f"./input/{folder_name}"),
    )
)
3431

35-
# generate lists of sample-id, cell-types, and cell-states for iteration
32+
# generate sample summary: cell-type per sample
3633
# the sample files should be named in the following format:
3734
# 'sample-id_cell-type_..._cell-state.txt'
3835

39-
sample_id = [f.split('_')[0] for f in filelist]
40-
sample_id= list(set(sample_id))
41-
print("Samples:")
42-
print('\n'.join([s for s in sample_id]))
43-
44-
cell_type = [f.split('_')[1] for f in filelist]
45-
cell_type = list(set(cell_type))
46-
print("\nCell-types:")
47-
print([t for t in cell_type])
36+
# Unique sample-ids: the text before the first underscore of each file name.
# Sorted so that the summary and any later iteration run in a reproducible order.
sample_id = sorted({f.split('_')[0] for f in filelist})
for s in sample_id:
    # Select this sample's files by exact match on the sample-id field of the
    # already-filtered txt list; a plain startswith(s) on a fresh directory
    # listing would also pick up e.g. 'S10_...' for sample 'S1' and non-txt files.
    sample_files = [f for f in filelist if f.split('_')[0] == s]
    # The cell-type is the second underscore-separated field of the file name.
    cell_type = sorted({f.split('_')[1] for f in sample_files})
    print(f"Sample: {s}\nCell-type(s):{cell_type}\n")
4841

49-
print("\nCell-states:")
50-
cell_state = [f.split('.txt')[0].split('_')[-1] for f in filelist]
51-
cell_state = list(set(cell_state))
52-
print([s for s in cell_state])
53-
54-
# generate a dictionary of dataframes
55-
# keys: sample-id_cell-type_cell-state
56-
# values: a dataframe per txt file
42+
# build a nested dictionary 'dfs': sample-ids are the keys of level 1,
43+
# cell-types the keys of level 2, and cell-states the keys of level 3
44+
# the value of the dfs[sample_id][cell_type][cell_state] item is a dataframe for each cell-state
45+
# with the newly assigned 'cell-state' and 'cell-state_num' columns
5746
dfs = {}
58-
for file in filelist:
59-
df = pd.read_csv(f'./input/{folder_name}/{file}', sep = '\t', index_col = 0)
60-
s_id = file.split('_')[0]
61-
c_type = file.split('_')[1]
62-
c_state = file.split('.txt')[0].split('_')[-1]
63-
dfs[s_id + '_' + c_type + '_' + c_state] = df
64-
65-
# dfs.keys()
66-
67-
# subset the dfs dictionary based on sample-id and cell-type
68-
# initialise a nested dictionary: dfs_sub[sample-id][cell-type]
69-
dfs_sub = {}
70-
for s_id in sample_id:
71-
dfs_sub[s_id] = {}
72-
for c_type in cell_type:
73-
dfs_sub[s_id][c_type] = {k:v for k,v in dfs.items() if k.split('_')[0]
74-
== s_id and k.split('_')[1] == c_type}
75-
76-
# for s_id in sample_id:
77-
# print(f'{s_id}: {dfs_sub[s_id].keys()}')
78-
79-
# make a copy of the dfs_sub dictionary for cell-state annotation later
80-
dfs_sub_copy = copy.deepcopy(dfs_sub)
81-
82-
83-
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Cell state ID~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
84-
# keep only the 'Cell_Index' column to compare each cell-state population against the 'Ungated'
85-
# this overwrites the dfs_sub dictionary and replaces the dataframes with Series...
86-
# may not be the best solution
87-
for k, v in dfs_sub.items():
88-
for k1, v1 in v.items():
89-
for k2, v2 in v1.items():
90-
v1[k2] = v2.loc[:,'Cell_Index'].copy()
91-
92-
# for each sample, perform the following steps:
93-
# for each cell-type, concatenate all the gated cell-states (apoptosis, G0, S,
94-
# G2, and M) with the 'Ungated' population and then remove duplicates so all
95-
# the gated cell-states will be removed from the ungated population
96-
# --> leaving the indices of cells in G1
97-
# now the values of the dfs_sub dictionary are the indices of cells of all the six cell-states
98-
for k, v in dfs_sub.items():
99-
for k1, v1 in v.items():
100-
s_id = k
101-
c_type = k1
102-
g1 = s_id + '_' + c_type + '_G1'
103-
df_tmp = pd.DataFrame()
104-
105-
for k2, v2 in v1.items():
106-
df_tmp = pd.concat([df_tmp, v2])
107-
108-
df_tmp = df_tmp.drop_duplicates(keep = False).copy()
109-
v1[g1] = df_tmp.iloc[:, 0] # save the indices of G1 cells as a pandas Series
110-
111-
# use the indices of G1 cells stored in dfs_sub to subset the dfs_sub_copy and
112-
# get the G1 population of each cell-type
11347
for s_id in sample_id:
114-
for c_type in cell_type:
115-
df_ungated = dfs_sub_copy[s_id][c_type][s_id + '_' + c_type + '_Ungated'].copy()
116-
g1 = s_id + '_' + c_type + '_G1'
117-
g1_idx = list(dfs_sub[s_id][c_type][g1])
118-
dfs_sub_copy[s_id][c_type][g1] = df_ungated.loc[
119-
df_ungated['Cell_Index'].isin(g1_idx)].copy()
120-
121-
# add the cell-state information (text & numerical) to the dataframes
122-
for k, v in dfs_sub_copy.items():
123-
s_id = k
124-
for k1, v1 in v.items():
125-
c_type = k1
126-
for k2, v2 in v1.items():
127-
c_state = k2.split('_')[-1]
128-
v2['cell-state'] = c_state
129-
130-
if c_state == 'apoptosis':
131-
v2['cell-state_num'] = 0
132-
if c_state == 'G0':
133-
v2['cell-state_num'] = 1
134-
if c_state == 'G1':
135-
v2['cell-state_num'] = 2
136-
if c_state == 'S-phase':
137-
v2['cell-state_num'] = 3
138-
if c_state == 'G2':
139-
v2['cell-state_num'] = 4
140-
if c_state == 'M-phase':
141-
v2['cell-state_num'] = 5
142-
143-
144-
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Save to file~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
145-
# concatenate all the cell-state dataframes within each cell-type and save as txt files
146-
for k, v in dfs_sub_copy.items():
48+
# Body of the per-sample loop (`for s_id in sample_id:`): fill
# dfs[s_id][cell-type][cell-state] with one annotated dataframe per input file.

# Numeric code of each accepted cell-state. Keys are lower-case because the
# cell-state parsed from the file name is lower-cased before the lookup.
cell_state_codes = {
    'apoptosis': 0,
    'g0': 1,
    'g1': 2,
    's-phase': 3,
    'g2': 4,
    'm-phase': 5,
}

dfs[s_id] = {}
# Match on the whole sample-id field and keep only txt files; a bare
# startswith(s_id) would also select 'S10_...' for sample 'S1'.
sample_filelist = [
    f for f in os.listdir(f"./input/{folder_name}")
    if f.endswith('.txt') and f.split('_')[0] == s_id
]
# Visit each cell-type once (in first-seen order) rather than once per file,
# so that no input file is read more than once.
for c_type in dict.fromkeys(f.split('.txt')[0].split('_')[1] for f in sample_filelist):
    dfs[s_id][c_type] = {}
    # Exact match on the cell-type field, so 'T' does not also select 'Tcell'.
    c_type_filelist = [
        f for f in sample_filelist
        if f.split('.txt')[0].split('_')[1] == c_type
    ]
    for file in c_type_filelist:
        df = pd.read_csv(f'./input/{folder_name}/{file}', sep='\t', index_col=0)
        # The cell-state is the last underscore-separated field of the file name.
        c_state = file.split('.txt')[0].split('_')[-1]
        df['cell-state'] = c_state
        # str has no .tolower() method; .lower() is the case-insensitive match
        # that was intended. Unrecognised (mislabelled) cell-states get -1.
        df['cell-state_num'] = cell_state_codes.get(c_state.lower(), -1)
        dfs[s_id][c_type][c_state] = df
73+
74+
# iterate through the nested dictionaries, concatenate all cell-state dataframes per sample/cell-type
75+
# Walk the nested dictionary: outer level is the sample-id, inner level the
# cell-type. Each cell-type starts from an empty dataframe that the
# statements following this header fill with the cell-state dataframes.
for s_id, per_cell_type in dfs.items():
    for c_type, v1 in per_cell_type.items():
        data = pd.DataFrame()
151-
152-
for k2, v1 in v1.items():
153-
c_state = k2.split('_')[-1]
154-
if c_state != 'Ungated':
155-
data = pd.concat([data, v1])
156-
157-
data.sort_values(by='Cell_Index').to_csv(f"./output/{folder_name}/{s_id}_{c_type}_w-cell-state.txt", index = False, sep = '\t')
158-
159-
print(f"Output files saved in the folder './output/{folder_name}'")
80+
# Tail of the per-sample / per-cell-type loop: merge every cell-state
# dataframe of the current cell-type and write the result to a txt file.
# A single concat replaces the one-concat-per-cell-state loop; `data` (the
# empty dataframe created just above) stays in the list so that a cell-type
# without any dataframe still yields an empty table instead of an error.
data = pd.concat([data, *v1.values()])
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(f"./output/{folder_name}", exist_ok=True)
data.to_csv(f"./output/{folder_name}/{s_id}_{c_type}_w-cell-state.txt", sep='\t', index=False)

0 commit comments

Comments
 (0)