|
| 1 | +# Load necessary libraries |
| 2 | +library(ggplot2) |
| 3 | +library(dplyr) |
| 4 | +library(tidyr) |
| 5 | + |
| 6 | +# File paths (adjust these if necessary) |
| 7 | +setwd("/Users/arnavaz/Desktop") |
| 8 | +markdup_file <- "Sample1_markdup.txt" |
| 9 | +wgs_file <- "Sample1_WGS_metrics.txt" |
| 10 | + |
| 11 | +# Read the files, skipping the comment lines |
| 12 | +markdup_data <- read.delim(markdup_file, comment.char = "#", header = TRUE, sep = "\t") |
| 13 | +wgs_data <- read.delim(wgs_file, comment.char = "#", header = TRUE, sep = "\t") |
| 14 | + |
| 15 | +# Extract metrics sections and histograms |
| 16 | +# MARKDUP metrics |
| 17 | +markdup_metrics <- markdup_data[1, ] |
| 18 | +tmp <- markdup_data[-c(1, 2),] |
| 19 | +colnames(tmp) <- NULL |
| 20 | +# First row contains the metrics |
| 21 | +markdup_histogram <- tmp[, 1:2]# Remaining rows are the histogram |
| 22 | +colnames(markdup_histogram) <- c("BIN", "VALUE") |
| 23 | + |
| 24 | +# WGS metrics |
| 25 | +wgs_metrics <- wgs_data[1, ] # First row contains the metrics |
| 26 | +wgs_histogram <- wgs_data[-1, ] # Remaining rows are the histogram |
| 27 | + |
| 28 | +# Convert histogram data to numeric |
| 29 | +markdup_histogram <- markdup_histogram %>% |
| 30 | + mutate(BIN = as.numeric(BIN), VALUE = as.numeric(VALUE)) |
| 31 | + |
| 32 | + |
| 33 | +tmp <- wgs_data[-c(1, 2),] |
| 34 | +colnames(tmp) <- NULL |
| 35 | +tmp <- tmp[, 1:2] |
| 36 | +colnames(tmp) <- c("coverage","high_quality_coverage_count") |
| 37 | +wgs_histogram <- tmp %>% |
| 38 | + mutate(coverage = as.numeric(coverage), |
| 39 | + high_quality_coverage_count = as.numeric(high_quality_coverage_count)) |
| 40 | + |
| 41 | +# 1. Bar Plot: Percent Duplication from MarkDuplicates |
| 42 | +ggplot(markdup_metrics, aes(x = LIBRARY, y = PERCENT_DUPLICATION)) + |
| 43 | + geom_bar(stat = "identity", fill = "blue") + |
| 44 | + theme_minimal() + |
| 45 | + labs( |
| 46 | + title = "Percent Duplication by Library", |
| 47 | + x = "Library", |
| 48 | + y = "Percent Duplication" |
| 49 | + ) |
| 50 | + |
| 51 | +# 2. Line Plot: Histogram from MarkDuplicates |
| 52 | +ggplot(markdup_histogram, aes(x = BIN, y = VALUE)) + |
| 53 | + geom_line(color = "darkgreen") + |
| 54 | + theme_minimal() + |
| 55 | + labs( |
| 56 | + title = "MarkDuplicates Histogram", |
| 57 | + x = "BIN", |
| 58 | + y = "VALUE" |
| 59 | + ) |
| 60 | + |
| 61 | +# 3. Coverage Histogram: WGS Metrics |
| 62 | +ggplot(wgs_histogram, aes(x = coverage, y = high_quality_coverage_count)) + |
| 63 | + geom_bar(stat = "identity", fill = "darkred") + |
| 64 | + theme_minimal() + |
| 65 | + labs( |
| 66 | + title = "Coverage Distribution (WGS)", |
| 67 | + x = "Coverage", |
| 68 | + y = "High Quality Coverage Count" |
| 69 | + ) |
| 70 | + |
| 71 | +# 4. Metrics Summary Table: WGS Metrics |
| 72 | +wgs_summary <- wgs_metrics %>% |
| 73 | + select(GENOME_TERRITORY, MEAN_COVERAGE, SD_COVERAGE, MEDIAN_COVERAGE, |
| 74 | + PCT_EXC_MAPQ, PCT_EXC_DUPE, PCT_EXC_UNPAIRED, PCT_EXC_BASEQ, PCT_EXC_TOTAL, |
| 75 | + PCT_1X, PCT_5X, PCT_10X, PCT_20X, PCT_30X) %>% |
| 76 | + gather(key = "Metric", value = "Value") |
| 77 | + |
| 78 | +# 5. Bar Plot: WGS Metrics Summary |
| 79 | +ggplot(wgs_summary, aes(x = reorder(Metric, Value), y = Value, fill = Metric)) + |
| 80 | + geom_bar(stat = "identity") + |
| 81 | + coord_flip() + |
| 82 | + theme_minimal() + |
| 83 | + labs( |
| 84 | + title = "WGS Metrics Summary", |
| 85 | + x = "Metrics", |
| 86 | + y = "Value" |
| 87 | + ) |
| 88 | + |
| 89 | +# 6. Combined Histogram and Line Plot for Coverage vs. Duplication |
| 90 | +combined_histogram <- wgs_histogram %>% |
| 91 | + inner_join(markdup_histogram, by = c("coverage" = "BIN")) %>% |
| 92 | + rename(Duplication_Value = VALUE, Coverage_Count = high_quality_coverage_count) |
| 93 | + |
| 94 | + |
| 95 | + |
| 96 | +###### Visualizaing Flagstat output from Samtools ##### |
| 97 | + |
| 98 | + |
| 99 | +# Load necessary libraries |
| 100 | +library(ggplot2) |
| 101 | + |
| 102 | +# Function to read the flagstat output and extract relevant values |
| 103 | +extract_flagstat_values <- function(file_path) { |
| 104 | + # Read the file |
| 105 | + flagstat_output <- readLines(file_path) |
| 106 | + |
| 107 | + # Initialize a named vector for the rates |
| 108 | + rates <- c("primary_mapped_rate" = NA, "secondary_mapped_rate" = NA, "duplicate_rate" = NA) |
| 109 | + |
| 110 | + # Extract the values from the output |
| 111 | + for (line in flagstat_output) { |
| 112 | + if (grepl("primary", line) && grepl("mapped", line)) { |
| 113 | + # Extract primary mapped values |
| 114 | + primary_mapped <- as.numeric(strsplit(strsplit(line, "\\+")[1], " ")[[1]][1]) |
| 115 | + total_reads <- as.numeric(strsplit(flagstat_output[1], "\\+")[1]) |
| 116 | + rates["primary_mapped_rate"] <- primary_mapped / total_reads |
| 117 | + } |
| 118 | + |
| 119 | + if (grepl("secondary", line) && grepl("alignments", line)) { |
| 120 | + # Extract secondary mapped values |
| 121 | + secondary_mapped <- as.numeric(strsplit(strsplit(line, "\\+")[1], " ")[[1]][1]) |
| 122 | + rates["secondary_mapped_rate"] <- secondary_mapped / total_reads |
| 123 | + } |
| 124 | + |
| 125 | + if (grepl("duplicates", line)) { |
| 126 | + # Extract duplicates |
| 127 | + duplicates <- as.numeric(strsplit(strsplit(line, "\\+")[1], " ")[[1]][1]) |
| 128 | + rates["duplicate_rate"] <- duplicates / total_reads |
| 129 | + } |
| 130 | + } |
| 131 | + |
| 132 | + return(rates) |
| 133 | +} |
| 134 | + |
| 135 | +# File path to your samtools flagstat output |
| 136 | +file_path <- "flagstat_output.txt" |
| 137 | + |
| 138 | +# Extract rates from the file |
| 139 | +rates <- extract_flagstat_values(file_path) |
| 140 | + |
| 141 | +# Prepare data for plotting |
| 142 | +plot_data <- data.frame( |
| 143 | + Rate = names(rates), |
| 144 | + Value = rates |
| 145 | +) |
| 146 | + |
| 147 | +# Plot the rates as a bar plot |
| 148 | +ggplot(plot_data, aes(x = Rate, y = Value, fill = Rate)) + |
| 149 | + geom_bar(stat = "identity", color = "black") + |
| 150 | + labs(title = "Mapping Rates", y = "Rate", x = "Metric") + |
| 151 | + scale_y_continuous(labels = scales::percent) + |
| 152 | + theme_minimal() + |
| 153 | + theme(legend.position = "none") + |
| 154 | + geom_text(aes(label = scales::percent(Value)), vjust = -0.5) |
| 155 | + |
| 156 | +ggplot(combined_histogram, aes(x = coverage)) + |
| 157 | + geom_bar(aes(y = Coverage_Count), stat = "identity", fill = "blue", alpha = 0.5) + |
| 158 | + scale_y_continuous( |
| 159 | + name = "Coverage Count", |
| 160 | + sec.axis = sec_axis(~ . / 1000, name = "Duplication Value") |
| 161 | + ) + |
| 162 | + theme_minimal() + |
| 163 | + labs( |
| 164 | + title = "Coverage Count vs. Duplication Value", |
| 165 | + x = "Coverage" |
| 166 | + ) |
| 167 | + |
| 168 | +# Save plots |
| 169 | +ggsave("Percent_Duplication.png") |
| 170 | +ggsave("MarkDuplicates_Histogram.png") |
| 171 | +ggsave("Coverage_Distribution.png") |
| 172 | +ggsave("WGS_Metrics_Summary.png") |
| 173 | +ggsave("Combined_Coverage_Duplication.png") |
| 174 | + |
| 175 | +# 7. Save Merged Metrics Table (Optional) |
| 176 | +merged_metrics <- cbind(markdup_metrics, wgs_metrics) |
| 177 | +write.table(merged_metrics, "Merged_Metrics.txt", sep = "\t", row.names = FALSE, quote = FALSE) |
| 178 | + |
0 commit comments