-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCustom_Peaks_To_Mutations.Rmd
94 lines (68 loc) · 3 KB
/
Custom_Peaks_To_Mutations.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
---
title: "Peaks to Mutations"
author: "Irzam Sarfraz"
date: "`r Sys.Date()`"
output: html_document
---
```{r}
library(SingleCellExperiment)
library(VariantAnnotation)
library(GenomicRanges)
library(plyranges)
library(AnnotationHub)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
```
read atac peaks
```{r}
# scATAC-seq Hematopoeisis + MPAL cell x peak Summarized Experiment
data <- readRDS("/projectnb/paxlab/isarfraz/Greenleaf_Data_RDS/scATAC-All-Hematopoiesis-MPAL-191120.rds")
# data <- readRDS("/projectnb/paxlab/isarfraz/Greenleaf_Data_RDS/scATAC-Healthy-Hematopoiesis-191120.rds") # this is healthy only
# subset to mpal only
# data <- data[, which(colData(data)[["ProjectClassification"]] != "Reference")]
data <- as(data, "SingleCellExperiment")
# subset to variable only
# data <- data[which(metadata(data)$variablePeaks %in% rownames(data)), ]
data
```
read vcf mutations
```{r}
path_icgc_wgs_laml_kr <- "/projectnb/paxlab/ICGC/WGS/LAML-KR/"
# vcf <- readVcf(paste0(path_icgc_wgs_laml_kr, "1c300960-e51c-4477-8fdd-026c7e545dc4.MUSE_1-0rc-b391201-vcf.20160401.somatic.snv_mnv.vcf.gz"))
vcf <- readVcf(paste0(path_icgc_wgs_laml_kr, "fa718a69-7d09-424b-90a3-4839ba7dc9b2.dkfz-snvCalling_1-0-132-1.20150824.germline.snv_mnv.vcf.gz"))
head(rowRanges(vcf))
```
mapping (only one file at the moment which is one donor and one type of mutations)
```{r}
peaks_gr <- rowRanges(data)
mut_gr <- rowRanges(vcf)
# check for specific chr
# mut_gr %>% filter(seqnames == 'hs37d5')
# mut_gr %>% filter(seqnames == 'GL000233.1')
seqlevels(mut_gr) <- paste0("chr", seqlevels(mut_gr)) # what about rows that are not chr?
#subset to PASS and tier1
mut_gr <- subset(mut_gr, FILTER %in% c("PASS", "Tier1"))
overlaps <- findOverlaps(mut_gr, peaks_gr) #296 hits what does it mean? 296 mutations from vcf were mapped to peaks in atac data. col 1 i.e. queryHits are the indices from mutations, col 2 i.e. subjectHits are indices from peaks
peaks_overlap <- as.data.frame(mcols(peaks_gr[subjectHits(overlaps)])) #subjectHits gives peak indices
muts_overlap <- mut_gr[queryHits(overlaps), ] #queryhits gives mutation indices
muts_overlap_df <- as.data.frame(muts_overlap)
peaks_muts_combined <- cbind(muts_overlap_df, peaks_overlap, data.frame(peaks = rownames(peaks_overlap)))
peaks_muts_combined$ALT <- as.vector(CharacterList(muts_overlap_df$ALT))
peaks_muts_combined$paramRangeID <- NULL
peaks_muts_combined$QUAL <- NULL
peaks_muts_combined$name <- NULL
head(peaks_muts_combined)
###
# what about 200kb upstream that Pawel talked about?
###
```
annotate peaks_gr
```{r}
txdb_hg19 <- TxDb.Hsapiens.UCSC.hg19.knownGene
seqlevels(txdb_hg19)
genome(peaks_gr) <- "hg19"
loc_hg19 <- locateVariants(peaks_gr, txdb_hg19, AllVariants()) #contains now all annotated regions (ranges are now smaller since multiple regions in original ranges, so overall more rows now)
table(loc_hg19$LOCATION) # is intergenic distal region?
# subset to non-coding regions
peaks_non_coding <- subset(loc_hg19, LOCATION %in% c("intron", "intergenic")) #confirm?
peaks_gr <- peaks_non_coding
```