Skip to content

Commit cebb178

Browse files
authored
Add files via upload
0 parents  commit cebb178

File tree

3 files changed

+536
-0
lines changed

3 files changed

+536
-0
lines changed

cfmedip_GeneBiomarker.R

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
library(dplyr)
2+
library(ggplot2)
3+
library(readr)
4+
library(BoutrosLab.plotting.general);
5+
library(annotatr);
6+
library(GenomicRanges);
7+
load("/Users/arnavaz/Library/CloudStorage/OneDrive-UHN/Projects/COMBAT/hg38_gene_annotations.RData")
8+
load("/Users/arnavaz/Library/CloudStorage/OneDrive-UHN/Projects/COMBAT/hg38_cpg_annotations.RData")
9+
COMBAT_meth = read_tsv("/Users/arnavaz/Desktop/HPC/pughlab/projects/COMBAT/COMBAT_rpkm.txt")
10+
COMBAT_meth = COMBAT_meth %>% mutate(interval = paste0(chr, "_", start, "_", end))
11+
combat_meth = COMBAT_meth[,4:22]
12+
13+
14+
intervals = read_tsv()
15+
hg38.annotations <- annotate_regions(
16+
regions = hg38_gene_annotations,
17+
annotations = hg38_cpg_annotations
18+
)
19+
20+
21+
genes = c("TLR5","C1orf61","NR2F1-AS1" ,
22+
"ZDHHC1" ,"OTP" ,"FEZF2" ,"METAP1D" ,"TSHZ3",
23+
"THEG5" ,"FLJ45513","DLX4" ,"POU3F1" ,"NKX2-1-AS1",
24+
"NKX2-8" ,"PNPLA1" ,"TIFAB","NEUROG1" ,"RADIL",
25+
"EFCAB10" , "MAST1" ,"ZFHX4-AS1" ,"POLR1A",
26+
"CAMKMT", "SIX3-AS1" ,"HOXB13","TTLL6" ,
27+
"LHX5-AS1","LECT1" ,"ALX3" ,"BMP7" ,"PRDM13" ,
28+
"ULBP1","RAET1K")
29+
30+
#hg38.annotations = as.data.frame(hg38.annotations)
31+
32+
hg38.annotations = hg38.annotations[!is.na(hg38.annotations$symbol),]
33+
34+
gene_intervals = hg38.annotations[hg38.annotations$symbol %in% genes,]
35+
36+
gene_df = as.data.frame(gene_intervals)
37+
38+
gene_df$intervals = paste(gene_df$seqnames, gene_df$annot.start, gene_df$annot.end, sep = "_")
39+
40+
41+
combat_gr = GRanges(seqnames = COMBAT_meth$chr, ranges = IRanges(start = COMBAT_meth$start, end = COMBAT_meth$end), mcols = combat_meth)
42+
43+
overlaps <- findOverlaps(gene_intervals, combat_gr)
44+
overlap_results <- data.frame(
45+
query_interval = gene_intervals[queryHits(overlaps)], # Intervals from table
46+
subject_interval = combat_gr[subjectHits(overlaps)], # Intervals from matrix
47+
mcols(combat_gr[subjectHits(overlaps)]) # Overlapping values
48+
)
49+
50+
51+
52+
pdf("test.pdf")
53+
ggplot(overlap_results, aes(x = query_interval.annot.id, y = subject_interval.mcols.COMBAT_0001)) +
54+
geom_boxplot() +
55+
theme_minimal() +
56+
scale_fill_brewer(palette = "Set1")
57+
dev.off()

cfmedip_emseq.R

+188
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
2+
3+
4+
5+
### Test ###
6+
7+
library(dplyr)
8+
library(ggplot2)
9+
library(readr)
10+
library(BoutrosLab.plotting.general);
11+
library(annotatr);
12+
library(GenomicRanges);
13+
load("/Users/arnavaz/Library/CloudStorage/OneDrive-UHN/Projects/COMBAT/hg38_gene_annotations.RData")
14+
15+
# load("/Users/arnavaz/Library/CloudStorage/OneDrive-UHN/Projects/COMBAT/cfMeDIP_src_Kui/black_bin_v2.RData")
16+
# Define directories
17+
directory1 = "/Users/arnavaz/Desktop/HPC/pughlab/projects/COMBAT/MethylDackel/cfmedip"
18+
19+
directory2 = "/Users/arnavaz/Desktop/HPC/pughlab/projects/COMBAT/MethylDackel/em-seq"
20+
21+
load("/Users/arnavaz/Library/CloudStorage/OneDrive-UHN/Projects/COMBAT/hg38_cpg_annotations.RData")
22+
23+
# Get list of files in each directory
24+
files1 <- list.files(directory1, pattern = "count.txt", full.names = TRUE)
25+
files2 <- list.files(directory2, pattern = "summed_intervals", full.names = TRUE)
26+
27+
# Ensure both directories have the same number of files
28+
if (length(files1) != length(files2)) {
29+
stop("The number of files in the directories does not match!")
30+
}
31+
32+
# Initialize an empty list for plots
33+
island <- subset(hg38_cpg_annotations, type == "hg38_cpg_islands")
34+
shores = subset(hg38_cpg_annotations, type == "hg38_cpg_shores")
35+
shelf = subset(hg38_cpg_annotations, type == "hg38_cpg_shelves")
36+
37+
# Loop through the files in both directories
38+
for (i in seq_along(files1)) {
39+
# Read the tab-separated files
40+
data1 <- read_tsv(files1[i], col_names = F)
41+
data2 <- read_tsv(files2[i], col_names = FALSE)
42+
43+
data2 = data2 %>% mutate(interval_id = paste(X1, X2, X3, sep = "_"))
44+
interval = data2 %>% select(interval_id)
45+
data1$interval_id = interval$interval_id
46+
# data2 = data2 %>% filter(!interval_id %in% black_bin$black_bin)
47+
# data1 = data1 %>% filter(!interval_id %in% black_bin$black_bin)
48+
combat_gr = GRanges(seqnames = data1$X1, ranges = IRanges(start = data1$X2, data1$X3), mcols = data1$X4)
49+
combat_emseq = GRanges(seqnames = data2$X1, ranges = IRanges(start = data2$X2, data2$X3), mcols = data2$X4)
50+
#Island Shores Shelf for Cfmedip
51+
52+
overlaps <- findOverlaps(island, combat_gr)
53+
overlap_results_island <- data.frame(
54+
query_interval = island[queryHits(overlaps)], # Intervals from table
55+
subject_interval = combat_gr[subjectHits(overlaps)], # Intervals from matrix
56+
mcols(combat_gr[subjectHits(overlaps)]) # Overlapping values
57+
)
58+
59+
60+
overlaps <- findOverlaps(shores, combat_gr)
61+
overlap_results_shores <- data.frame(
62+
query_interval = shores[queryHits(overlaps)], # Intervals from table
63+
subject_interval = combat_gr[subjectHits(overlaps)], # Intervals from matrix
64+
mcols(combat_gr[subjectHits(overlaps)]) # Overlapping values
65+
)
66+
overlaps <- findOverlaps(shelf, combat_gr)
67+
overlap_results_shelf <- data.frame(
68+
query_interval = shelf[queryHits(overlaps)], # Intervals from table
69+
subject_interval = combat_gr[subjectHits(overlaps)], # Intervals from matrix
70+
mcols(combat_gr[subjectHits(overlaps)]) # Overlapping values
71+
)
72+
73+
74+
75+
##### Island Shores Shelf for Emseq
76+
overlaps <- findOverlaps(island, combat_emseq)
77+
overlap_results_island2 <- data.frame(
78+
query_interval = island[queryHits(overlaps)], # Intervals from table
79+
subject_interval = combat_emseq[subjectHits(overlaps)], # Intervals from matrix
80+
mcols(combat_emseq[subjectHits(overlaps)]) # Overlapping values
81+
)
82+
83+
84+
overlaps <- findOverlaps(shores, combat_emseq)
85+
overlap_results_shores2 <- data.frame(
86+
query_interval = shores[queryHits(overlaps)], # Intervals from table
87+
subject_interval = combat_emseq[subjectHits(overlaps)], # Intervals from matrix
88+
mcols(combat_emseq[subjectHits(overlaps)]) # Overlapping values
89+
)
90+
overlaps <- findOverlaps(shelf, combat_emseq)
91+
overlap_results_shelf2 <- data.frame(
92+
query_interval = shelf[queryHits(overlaps)], # Intervals from table
93+
subject_interval = combat_emseq[subjectHits(overlaps)], # Intervals from matrix
94+
mcols(combat_emseq[subjectHits(overlaps)]) # Overlapping values
95+
)
96+
97+
98+
99+
values1 <- overlap_results_island$mcols
100+
values2 <- overlap_results_island2$mcols
101+
102+
103+
values1_shore <- overlap_results_shores$mcols
104+
values2_shore <- overlap_results_shores2$mcols
105+
106+
107+
values1_shelf <- overlap_results_shelf$mcols
108+
values2_shelf <- overlap_results_shelf2$mcols
109+
110+
# Create a data frame for plotting
111+
comparison_df <- data.frame(
112+
Value1 = values1/sum(values1),
113+
Value2 = values2/sum(values2)
114+
)
115+
116+
117+
comparison_df_shore <- data.frame(
118+
Value1 = values1_shore/sum(values1_shore),
119+
Value2 = values2_shore/sum(values2_shore)
120+
)
121+
122+
123+
comparison_df_shelf <- data.frame(
124+
Value1 = values1_shelf/sum(values1_shelf),
125+
Value2 = values2_shelf/sum(values2_shelf)
126+
)
127+
128+
129+
fname = gsub("_count.txt","", basename(files1[i]))
130+
131+
132+
comparison_df <- comparison_df
133+
134+
corval = cor(comparison_df$Value1, comparison_df$Value2, method = "pearson")
135+
pdf(paste0(fname, "_Island.pdf"))
136+
137+
# Generate a scatter plot
138+
ggplot(comparison_df, aes(x = log10(Value1), y = log10(Value2))) +
139+
geom_point(alpha = 0.7, color = "blue") +
140+
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
141+
labs(
142+
title = paste0("Cfmedip and Em-seq methylation correlation on CpG Islands:", fname ),
143+
x = "cfMedip methylation normalized counts (log10)" ,
144+
y = "Em-seq methylation normalized counts (log10)"
145+
) +
146+
# xlim(-9,-5)+
147+
# ylim(-9,-5)+
148+
annotate(
149+
"text",
150+
x = log10(1e-06) , y = max(log10(comparison_df$Value2)), # Position (top-left)
151+
label = paste0("Correlation: ", round(corval, 2)),
152+
hjust = 0, size = 5, color = "red"
153+
)+
154+
theme_minimal()
155+
156+
dev.off()
157+
158+
159+
160+
}
161+
162+
file1 = "/Users/arnavaz/Desktop/HPC/pughlab/projects/COMBAT/MethylDackel/cfmedip/COMBAT_0006_CpG_Island_cfmedip.tsv"
163+
file2 = "/Users/arnavaz/Desktop/HPC/pughlab/projects/COMBAT/MethylDackel/em-seq/COMBAT_0006_CpG_Island_emseq.tsv"
164+
165+
data1 <- read_tsv(file1, col_names = F)
166+
data2 <- read_tsv(file2, col_names = F)
167+
168+
values1 <- data1[,4]
169+
values2 <- data2[,4]
170+
171+
comparison_df <- data.frame(
172+
Value1 = values1/sum(values1),
173+
Value2 = values2/sum(values2)
174+
)
175+
176+
177+
colnames(comparison_df) <- c("Value1", "Value2")
178+
179+
ggplot(comparison_df, aes(x = log10(Value1), y = log10(Value2))) +
180+
geom_point(alpha = 0.7, color = "blue") +
181+
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
182+
labs(
183+
title = paste0("Comparison cfmedip and Em-seq methylation at CpG Islands: ", "COMBAT 0006" ),
184+
x = "cfMedip methylation normalized counts (log10)" ,
185+
y = "Em-seq methylation normalized counts (log10)"
186+
) +
187+
188+
theme_minimal()

0 commit comments

Comments
 (0)