|
| 1 | + |
| 2 | + |
| 3 | + |
| 4 | + |
| 5 | +### Test ### |
| 6 | + |
| 7 | +library(dplyr) |
| 8 | +library(ggplot2) |
| 9 | +library(readr) |
| 10 | +library(BoutrosLab.plotting.general); |
| 11 | +library(annotatr); |
| 12 | +library(GenomicRanges); |
| 13 | +load("/Users/arnavaz/Library/CloudStorage/OneDrive-UHN/Projects/COMBAT/hg38_gene_annotations.RData") |
| 14 | + |
| 15 | +# load("/Users/arnavaz/Library/CloudStorage/OneDrive-UHN/Projects/COMBAT/cfMeDIP_src_Kui/black_bin_v2.RData") |
| 16 | +# Define directories |
| 17 | +directory1 = "/Users/arnavaz/Desktop/HPC/pughlab/projects/COMBAT/MethylDackel/cfmedip" |
| 18 | + |
| 19 | +directory2 = "/Users/arnavaz/Desktop/HPC/pughlab/projects/COMBAT/MethylDackel/em-seq" |
| 20 | + |
| 21 | +load("/Users/arnavaz/Library/CloudStorage/OneDrive-UHN/Projects/COMBAT/hg38_cpg_annotations.RData") |
| 22 | + |
| 23 | +# Get list of files in each directory |
| 24 | +files1 <- list.files(directory1, pattern = "count.txt", full.names = TRUE) |
| 25 | +files2 <- list.files(directory2, pattern = "summed_intervals", full.names = TRUE) |
| 26 | + |
| 27 | +# Ensure both directories have the same number of files |
| 28 | +if (length(files1) != length(files2)) { |
| 29 | + stop("The number of files in the directories does not match!") |
| 30 | +} |
| 31 | + |
| 32 | +# Initialize an empty list for plots |
| 33 | +island <- subset(hg38_cpg_annotations, type == "hg38_cpg_islands") |
| 34 | +shores = subset(hg38_cpg_annotations, type == "hg38_cpg_shores") |
| 35 | +shelf = subset(hg38_cpg_annotations, type == "hg38_cpg_shelves") |
| 36 | + |
| 37 | +# Loop through the files in both directories |
| 38 | +for (i in seq_along(files1)) { |
| 39 | + # Read the tab-separated files |
| 40 | + data1 <- read_tsv(files1[i], col_names = F) |
| 41 | + data2 <- read_tsv(files2[i], col_names = FALSE) |
| 42 | + |
| 43 | + data2 = data2 %>% mutate(interval_id = paste(X1, X2, X3, sep = "_")) |
| 44 | + interval = data2 %>% select(interval_id) |
| 45 | + data1$interval_id = interval$interval_id |
| 46 | + # data2 = data2 %>% filter(!interval_id %in% black_bin$black_bin) |
| 47 | + # data1 = data1 %>% filter(!interval_id %in% black_bin$black_bin) |
| 48 | + combat_gr = GRanges(seqnames = data1$X1, ranges = IRanges(start = data1$X2, data1$X3), mcols = data1$X4) |
| 49 | + combat_emseq = GRanges(seqnames = data2$X1, ranges = IRanges(start = data2$X2, data2$X3), mcols = data2$X4) |
| 50 | + #Island Shores Shelf for Cfmedip |
| 51 | + |
| 52 | + overlaps <- findOverlaps(island, combat_gr) |
| 53 | + overlap_results_island <- data.frame( |
| 54 | + query_interval = island[queryHits(overlaps)], # Intervals from table |
| 55 | + subject_interval = combat_gr[subjectHits(overlaps)], # Intervals from matrix |
| 56 | + mcols(combat_gr[subjectHits(overlaps)]) # Overlapping values |
| 57 | + ) |
| 58 | + |
| 59 | + |
| 60 | + overlaps <- findOverlaps(shores, combat_gr) |
| 61 | + overlap_results_shores <- data.frame( |
| 62 | + query_interval = shores[queryHits(overlaps)], # Intervals from table |
| 63 | + subject_interval = combat_gr[subjectHits(overlaps)], # Intervals from matrix |
| 64 | + mcols(combat_gr[subjectHits(overlaps)]) # Overlapping values |
| 65 | + ) |
| 66 | + overlaps <- findOverlaps(shelf, combat_gr) |
| 67 | + overlap_results_shelf <- data.frame( |
| 68 | + query_interval = shelf[queryHits(overlaps)], # Intervals from table |
| 69 | + subject_interval = combat_gr[subjectHits(overlaps)], # Intervals from matrix |
| 70 | + mcols(combat_gr[subjectHits(overlaps)]) # Overlapping values |
| 71 | + ) |
| 72 | + |
| 73 | + |
| 74 | + |
| 75 | + ##### Island Shores Shelf for Emseq |
| 76 | + overlaps <- findOverlaps(island, combat_emseq) |
| 77 | + overlap_results_island2 <- data.frame( |
| 78 | + query_interval = island[queryHits(overlaps)], # Intervals from table |
| 79 | + subject_interval = combat_emseq[subjectHits(overlaps)], # Intervals from matrix |
| 80 | + mcols(combat_emseq[subjectHits(overlaps)]) # Overlapping values |
| 81 | + ) |
| 82 | + |
| 83 | + |
| 84 | + overlaps <- findOverlaps(shores, combat_emseq) |
| 85 | + overlap_results_shores2 <- data.frame( |
| 86 | + query_interval = shores[queryHits(overlaps)], # Intervals from table |
| 87 | + subject_interval = combat_emseq[subjectHits(overlaps)], # Intervals from matrix |
| 88 | + mcols(combat_emseq[subjectHits(overlaps)]) # Overlapping values |
| 89 | + ) |
| 90 | + overlaps <- findOverlaps(shelf, combat_emseq) |
| 91 | + overlap_results_shelf2 <- data.frame( |
| 92 | + query_interval = shelf[queryHits(overlaps)], # Intervals from table |
| 93 | + subject_interval = combat_emseq[subjectHits(overlaps)], # Intervals from matrix |
| 94 | + mcols(combat_emseq[subjectHits(overlaps)]) # Overlapping values |
| 95 | + ) |
| 96 | + |
| 97 | + |
| 98 | + |
| 99 | + values1 <- overlap_results_island$mcols |
| 100 | + values2 <- overlap_results_island2$mcols |
| 101 | + |
| 102 | + |
| 103 | + values1_shore <- overlap_results_shores$mcols |
| 104 | + values2_shore <- overlap_results_shores2$mcols |
| 105 | + |
| 106 | + |
| 107 | + values1_shelf <- overlap_results_shelf$mcols |
| 108 | + values2_shelf <- overlap_results_shelf2$mcols |
| 109 | + |
| 110 | + # Create a data frame for plotting |
| 111 | + comparison_df <- data.frame( |
| 112 | + Value1 = values1/sum(values1), |
| 113 | + Value2 = values2/sum(values2) |
| 114 | + ) |
| 115 | + |
| 116 | + |
| 117 | + comparison_df_shore <- data.frame( |
| 118 | + Value1 = values1_shore/sum(values1_shore), |
| 119 | + Value2 = values2_shore/sum(values2_shore) |
| 120 | + ) |
| 121 | + |
| 122 | + |
| 123 | + comparison_df_shelf <- data.frame( |
| 124 | + Value1 = values1_shelf/sum(values1_shelf), |
| 125 | + Value2 = values2_shelf/sum(values2_shelf) |
| 126 | + ) |
| 127 | + |
| 128 | + |
| 129 | +fname = gsub("_count.txt","", basename(files1[i])) |
| 130 | + |
| 131 | + |
| 132 | +comparison_df <- comparison_df |
| 133 | + |
| 134 | +corval = cor(comparison_df$Value1, comparison_df$Value2, method = "pearson") |
| 135 | +pdf(paste0(fname, "_Island.pdf")) |
| 136 | + |
| 137 | + # Generate a scatter plot |
| 138 | + ggplot(comparison_df, aes(x = log10(Value1), y = log10(Value2))) + |
| 139 | + geom_point(alpha = 0.7, color = "blue") + |
| 140 | + geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") + |
| 141 | + labs( |
| 142 | + title = paste0("Cfmedip and Em-seq methylation correlation on CpG Islands:", fname ), |
| 143 | + x = "cfMedip methylation normalized counts (log10)" , |
| 144 | + y = "Em-seq methylation normalized counts (log10)" |
| 145 | + ) + |
| 146 | + # xlim(-9,-5)+ |
| 147 | + # ylim(-9,-5)+ |
| 148 | + annotate( |
| 149 | + "text", |
| 150 | + x = log10(1e-06) , y = max(log10(comparison_df$Value2)), # Position (top-left) |
| 151 | + label = paste0("Correlation: ", round(corval, 2)), |
| 152 | + hjust = 0, size = 5, color = "red" |
| 153 | + )+ |
| 154 | + theme_minimal() |
| 155 | + |
| 156 | +dev.off() |
| 157 | + |
| 158 | + |
| 159 | + |
| 160 | +} |
| 161 | + |
| 162 | +file1 = "/Users/arnavaz/Desktop/HPC/pughlab/projects/COMBAT/MethylDackel/cfmedip/COMBAT_0006_CpG_Island_cfmedip.tsv" |
| 163 | +file2 = "/Users/arnavaz/Desktop/HPC/pughlab/projects/COMBAT/MethylDackel/em-seq/COMBAT_0006_CpG_Island_emseq.tsv" |
| 164 | + |
| 165 | +data1 <- read_tsv(file1, col_names = F) |
| 166 | +data2 <- read_tsv(file2, col_names = F) |
| 167 | + |
| 168 | +values1 <- data1[,4] |
| 169 | +values2 <- data2[,4] |
| 170 | + |
| 171 | +comparison_df <- data.frame( |
| 172 | + Value1 = values1/sum(values1), |
| 173 | + Value2 = values2/sum(values2) |
| 174 | +) |
| 175 | + |
| 176 | + |
| 177 | +colnames(comparison_df) <- c("Value1", "Value2") |
| 178 | + |
| 179 | +ggplot(comparison_df, aes(x = log10(Value1), y = log10(Value2))) + |
| 180 | + geom_point(alpha = 0.7, color = "blue") + |
| 181 | + geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") + |
| 182 | + labs( |
| 183 | + title = paste0("Comparison cfmedip and Em-seq methylation at CpG Islands: ", "COMBAT 0006" ), |
| 184 | + x = "cfMedip methylation normalized counts (log10)" , |
| 185 | + y = "Em-seq methylation normalized counts (log10)" |
| 186 | + ) + |
| 187 | + |
| 188 | + theme_minimal() |
0 commit comments