-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathFig2.HumanLungCellRef.4.prune.R
210 lines (173 loc) · 9.24 KB
/
Fig2.HumanLungCellRef.4.prune.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# With the consistenly predicted cell types, we integrated the data from different donors using RPCA, calculated kNN purity using 20 nearest neighbors for each cell in the integration,
# pruned cells with less than 60% kNN purity to obtain the final set of cells for the LungMAP Human Lung CellRef.
# The final data were integrated using RPCA for visualization and reference mapping applications.
library(Seurat)
library(SeuratDisk)
library(ggplot2)
library(patchwork)
library(dplyr)
library(SingleCellExperiment)
options(future.globals.maxSize = 480000 * 1024^2)
#################################################################################################################################################
#################################################################################################################################################
batch_var = "DonorID"
ident_var = "cellref_type"
# load the merged data from the previous step
obj = readRDS(file="merged.rds")
obj = CreateSeuratObject(counts=obj@assays$RNA@counts, [email protected])
obj = NormalizeData(obj)
# integrate merged data using Seurat v4 RPCA and find nearest neighbors of each cell for kNN purity calculation
# As the number of batches (104 donors) is large for the RPCA integration, we selected a set of donors as "reference" in the RPCA integration
# Sort donors based on the number of predicted cell types. Cell types with at least 9 cells were included in the calculation.
donor_celltype = table([email protected]$DonorID, [email protected][, ident_var])
donor_celltype = sort(rowSums(donor_celltype>9))
donor_celltype = donor_celltype[which(donor_celltype>9)] # donors with at least 10 cell types
# Get the Dataset information of each donor
donor_dataset = [email protected][, c("DonorID","Dataset")]
donor_dataset = unique(donor_dataset)
rownames(donor_dataset) = donor_dataset$DonorID
# For each dataset, we selected the donors with top 2 highest number of cell types predicted
donor_celltype = data.frame(donor_celltype)
colnames(donor_celltype) = "Types"
donor_celltype$Dataset = donor_dataset[rownames(donor_celltype),"Dataset"]
donor_celltype = donor_celltype[order(donor_celltype$Dataset, -donor_celltype$Types), ]
donor_celltype$Donor = rownames(donor_celltype)
refid = donor_celltype %>% group_by(Dataset) %>% slice_max(order_by = Types, n = 2)
reference.donors = as.character(refid$Donor)
# Perform SCT based integration usign RPCA
DefaultAssay(obj) = "RNA"
# split the dataset into a list of seurat objects, one for the data from a donor
objlist <- SplitObject(obj, split.by = batch_var)
# the positions of selected reference donors in the list
ref.ids = which(names(objlist) %in% reference.donors)
objlist <- lapply(X = objlist, FUN = function(x) {
x <- SCTransform(x, vars.to.regress = c("S.Score","G2M.Score","pMT"), verbose = FALSE)
})
features <- SelectIntegrationFeatures(object.list = objlist, nfeatures = 2000)
objlist <- PrepSCTIntegration(object.list = objlist, anchor.features = features)
objlist <- lapply(X = objlist, FUN = function(x) {
x <- RunPCA(x, features = features, verbose = FALSE)
})
# Find integration anchors
obj.anchors <- FindIntegrationAnchors(object.list = objlist, anchor.features = features, reduction = 'rpca',
normalization.method = "SCT",
k.anchor = 10, l2.norm = TRUE,
reference = ref.ids)
# prune anchors connecting two cells with different cell type predictions
anchors.df = obj.anchors@anchors
anchors.df$type1 = NA
anchors.df$type2 = NA
for (i in 1:length([email protected])) {
i.idx = i.type = NULL
i.idx = which(anchors.df$dataset1 == i)
if (length(i.idx)>0){
i.type = as.character(objlist[[i]]@meta.data[as.numeric(anchors.df$cell1[i.idx]), ident_var])
anchors.df$type1[i.idx] = i.type
}
}
for (i in 1:length([email protected])) {
i.idx = i.type = NULL
i.idx = which(anchors.df$dataset2 == i)
if (length(i.idx)>0){
i.type = as.character(objlist[[i]]@meta.data[as.numeric(anchors.df$cell2[i.idx]), ident_var])
anchors.df$type2[i.idx] = i.type
}
}
anchors.df$type_matched = (anchors.df$type1 == anchors.df$type2)
obj.anchors1 = obj.anchors
obj.anchors1@anchors$type1 = anchors.df$type1
obj.anchors1@anchors$type2 = anchors.df$type2
obj.anchors1@anchors$type_matched = anchors.df$type_matched
anchors.df = droplevels(subset(anchors.df, type_matched==TRUE))
obj.anchors1@anchors = anchors.df
# Integrate data from different donors
combined = NULL
combined <- IntegrateData(anchorset = obj.anchors1, normalization.method = "SCT", k.weight=100)
DefaultAssay(combined) <- "integrated"
combined <- RunPCA(combined, npcs = npcs, verbose = FALSE)
# Find 20 nearest neighbors for each cell
combined = FindNeighbors(combined, reduction = "pca", dims=1:200, k.param = 20, nn.method="annoy", annoy.metric="cosine")
# for each cell, calculate the purity of cell type prediction in the neighbors of the cell
nn = combined@graphs$integrated_nn
purity = data.frame(cell=rownames(nn), [email protected][rownames(nn), ident.var])
purity$score = 0
rownames(purity) = as.character(purity$cell)
purity_helper <- function(x) {
x.cell = as.character(x["cell"])
x.ident = as.character(x["ident"])
x.nn = names(which(nn[x.cell, ]==1))
x.nn.types = purity[x.nn, "ident"]
return(length(which(x.nn.types==x.ident)))
}
score = apply(purity, 1, FUN=function(x) purity_helper(x))
purity$score = score
[email protected]$nn.purity = nn.purity[rownames([email protected]), "score"]/20
# removed cells with purity less than 60%. Seed cells were not removed.
cells.selected = [email protected]
cells.selected$selected = FALSE
cells.selected$selected[which(cells.selected$nn.purity>=0.6)] = TRUE
cells.selected$selected[which(cells.selected$id=="Seed")] = TRUE
cells.selected = subset(cells.selected, selected==TRUE)
# use the pruned data to construct the final Seurat object for the LungMAP Human Lung CellRef
counts.selected = obj@assays$RNA@counts[, rownames(cells.selected)]
obj = CreateSeuratObject(counts=counts.selected, meta.data=cells.selected)
obj = NormalizeData(obj)
#################################################################################################################################################
#################################################################################################################################################
# For visualization and automated cell type annotation, we used RPCA to integrate the final set of cells from different donors
DefaultAssay(obj) = "RNA"
# split the dataset into a list of seurat objects, one for the data from a donor
objlist <- SplitObject(obj, split.by = batch_var)
# the positions of selected reference donors in the list
ref.ids = which(names(objlist) %in% reference.donors)
objlist <- lapply(X = objlist, FUN = function(x) {
x <- SCTransform(x, vars.to.regress = c("S.Score","G2M.Score","pMT"), verbose = FALSE)
})
features <- SelectIntegrationFeatures(object.list = objlist, nfeatures = 2000)
objlist <- PrepSCTIntegration(object.list = objlist, anchor.features = features)
objlist <- lapply(X = objlist, FUN = function(x) {
x <- RunPCA(x, features = features, verbose = FALSE)
})
# Find integration anchors
obj.anchors <- FindIntegrationAnchors(object.list = objlist, anchor.features = features, reduction = 'rpca',
normalization.method = "SCT",
k.anchor = 10, l2.norm = TRUE,
reference = ref.ids)
# prune anchors linking cells of different cell type predictions
anchors.df = obj.anchors@anchors
anchors.df$type1 = NA
anchors.df$type2 = NA
for (i in 1:length([email protected])) {
i.idx = i.type = NULL
i.idx = which(anchors.df$dataset1 == i)
if (length(i.idx)>0){
i.type = as.character(objlist[[i]]@meta.data[as.numeric(anchors.df$cell1[i.idx]), ident_var])
anchors.df$type1[i.idx] = i.type
}
}
for (i in 1:length([email protected])) {
i.idx = i.type = NULL
i.idx = which(anchors.df$dataset2 == i)
if (length(i.idx)>0){
i.type = as.character(objlist[[i]]@meta.data[as.numeric(anchors.df$cell2[i.idx]), ident_var])
anchors.df$type2[i.idx] = i.type
}
}
anchors.df$type_matched = (anchors.df$type1 == anchors.df$type2)
obj.anchors1 = obj.anchors
obj.anchors1@anchors$type1 = anchors.df$type1
obj.anchors1@anchors$type2 = anchors.df$type2
obj.anchors1@anchors$type_matched = anchors.df$type_matched
anchors.df = droplevels(subset(anchors.df, type_matched==TRUE))
obj.anchors1@anchors = anchors.df
# Integrate donor from different donors
combined = NULL
combined <- IntegrateData(anchorset = obj.anchors1, normalization.method = "SCT", k.weight=100)
DefaultAssay(combined) <- "integrated"
combined <- RunPCA(combined, npcs = npcs, verbose = FALSE)
combined <- RunUMAP(combined, reduction = "pca", dims = 1:200, min.dist=0.5, return.model = T, n.neighbors = 50, seed.use=123L)
p1 <- DimPlot(combined, reduction = "umap", group.by = batch_var, pt.size=0.001) + NoLegend()
p2 <- DimPlot(combined, reduction = "umap", group.by = ident_var,label = TRUE, label.size=3, repel = TRUE, pt.size=0.001) + NoLegend()
g = p1 + p2
ggsave(file=paste0("LungMAP_HumanLung_CellRef_v1_umap.tiff"), plot=g, width=15, height=7.25, dpi=300, units="in", compression="lzw")
saveRDS(combined, file="LungMAP_HumanLung_CellRef.v1.rds")