Skip to content

Commit 5c9e471

Browse files
authored
Add files via upload
1 parent cebb178 commit 5c9e471

File tree

2 files changed

+262
-0
lines changed

2 files changed

+262
-0
lines changed

Em_seq_QC.R

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
# Load necessary libraries
2+
library(ggplot2)
3+
library(dplyr)
4+
library(tidyr)
5+
6+
# File paths (adjust these if necessary)
7+
setwd("/Users/arnavaz/Desktop")
8+
markdup_file <- "Sample1_markdup.txt"
9+
wgs_file <- "Sample1_WGS_metrics.txt"
10+
11+
# Read the files, skipping the comment lines
12+
markdup_data <- read.delim(markdup_file, comment.char = "#", header = TRUE, sep = "\t")
13+
wgs_data <- read.delim(wgs_file, comment.char = "#", header = TRUE, sep = "\t")
14+
15+
# Extract metrics sections and histograms
16+
# MARKDUP metrics
17+
markdup_metrics <- markdup_data[1, ]
18+
tmp <- markdup_data[-c(1, 2),]
19+
colnames(tmp) <- NULL
20+
# First row contains the metrics
21+
markdup_histogram <- tmp[, 1:2]# Remaining rows are the histogram
22+
colnames(markdup_histogram) <- c("BIN", "VALUE")
23+
24+
# WGS metrics
25+
wgs_metrics <- wgs_data[1, ] # First row contains the metrics
26+
wgs_histogram <- wgs_data[-1, ] # Remaining rows are the histogram
27+
28+
# Convert histogram data to numeric
29+
markdup_histogram <- markdup_histogram %>%
30+
mutate(BIN = as.numeric(BIN), VALUE = as.numeric(VALUE))
31+
32+
33+
tmp <- wgs_data[-c(1, 2),]
34+
colnames(tmp) <- NULL
35+
tmp <- tmp[, 1:2]
36+
colnames(tmp) <- c("coverage","high_quality_coverage_count")
37+
wgs_histogram <- tmp %>%
38+
mutate(coverage = as.numeric(coverage),
39+
high_quality_coverage_count = as.numeric(high_quality_coverage_count))
40+
41+
# 1. Bar Plot: Percent Duplication from MarkDuplicates
42+
ggplot(markdup_metrics, aes(x = LIBRARY, y = PERCENT_DUPLICATION)) +
43+
geom_bar(stat = "identity", fill = "blue") +
44+
theme_minimal() +
45+
labs(
46+
title = "Percent Duplication by Library",
47+
x = "Library",
48+
y = "Percent Duplication"
49+
)
50+
51+
# 2. Line Plot: Histogram from MarkDuplicates
52+
ggplot(markdup_histogram, aes(x = BIN, y = VALUE)) +
53+
geom_line(color = "darkgreen") +
54+
theme_minimal() +
55+
labs(
56+
title = "MarkDuplicates Histogram",
57+
x = "BIN",
58+
y = "VALUE"
59+
)
60+
61+
# 3. Coverage Histogram: WGS Metrics
62+
ggplot(wgs_histogram, aes(x = coverage, y = high_quality_coverage_count)) +
63+
geom_bar(stat = "identity", fill = "darkred") +
64+
theme_minimal() +
65+
labs(
66+
title = "Coverage Distribution (WGS)",
67+
x = "Coverage",
68+
y = "High Quality Coverage Count"
69+
)
70+
71+
# 4. Metrics Summary Table: WGS Metrics
72+
wgs_summary <- wgs_metrics %>%
73+
select(GENOME_TERRITORY, MEAN_COVERAGE, SD_COVERAGE, MEDIAN_COVERAGE,
74+
PCT_EXC_MAPQ, PCT_EXC_DUPE, PCT_EXC_UNPAIRED, PCT_EXC_BASEQ, PCT_EXC_TOTAL,
75+
PCT_1X, PCT_5X, PCT_10X, PCT_20X, PCT_30X) %>%
76+
gather(key = "Metric", value = "Value")
77+
78+
# 5. Bar Plot: WGS Metrics Summary
79+
ggplot(wgs_summary, aes(x = reorder(Metric, Value), y = Value, fill = Metric)) +
80+
geom_bar(stat = "identity") +
81+
coord_flip() +
82+
theme_minimal() +
83+
labs(
84+
title = "WGS Metrics Summary",
85+
x = "Metrics",
86+
y = "Value"
87+
)
88+
89+
# 6. Combined Histogram and Line Plot for Coverage vs. Duplication
90+
combined_histogram <- wgs_histogram %>%
91+
inner_join(markdup_histogram, by = c("coverage" = "BIN")) %>%
92+
rename(Duplication_Value = VALUE, Coverage_Count = high_quality_coverage_count)
93+
94+
95+
96+
###### Visualizaing Flagstat output from Samtools #####
97+
98+
99+
# Load necessary libraries
100+
library(ggplot2)
101+
102+
# Function to read the flagstat output and extract relevant values
103+
extract_flagstat_values <- function(file_path) {
104+
# Read the file
105+
flagstat_output <- readLines(file_path)
106+
107+
# Initialize a named vector for the rates
108+
rates <- c("primary_mapped_rate" = NA, "secondary_mapped_rate" = NA, "duplicate_rate" = NA)
109+
110+
# Extract the values from the output
111+
for (line in flagstat_output) {
112+
if (grepl("primary", line) && grepl("mapped", line)) {
113+
# Extract primary mapped values
114+
primary_mapped <- as.numeric(strsplit(strsplit(line, "\\+")[1], " ")[[1]][1])
115+
total_reads <- as.numeric(strsplit(flagstat_output[1], "\\+")[1])
116+
rates["primary_mapped_rate"] <- primary_mapped / total_reads
117+
}
118+
119+
if (grepl("secondary", line) && grepl("alignments", line)) {
120+
# Extract secondary mapped values
121+
secondary_mapped <- as.numeric(strsplit(strsplit(line, "\\+")[1], " ")[[1]][1])
122+
rates["secondary_mapped_rate"] <- secondary_mapped / total_reads
123+
}
124+
125+
if (grepl("duplicates", line)) {
126+
# Extract duplicates
127+
duplicates <- as.numeric(strsplit(strsplit(line, "\\+")[1], " ")[[1]][1])
128+
rates["duplicate_rate"] <- duplicates / total_reads
129+
}
130+
}
131+
132+
return(rates)
133+
}
134+
135+
# File path to your samtools flagstat output
136+
file_path <- "flagstat_output.txt"
137+
138+
# Extract rates from the file
139+
rates <- extract_flagstat_values(file_path)
140+
141+
# Prepare data for plotting
142+
plot_data <- data.frame(
143+
Rate = names(rates),
144+
Value = rates
145+
)
146+
147+
# Plot the rates as a bar plot
148+
ggplot(plot_data, aes(x = Rate, y = Value, fill = Rate)) +
149+
geom_bar(stat = "identity", color = "black") +
150+
labs(title = "Mapping Rates", y = "Rate", x = "Metric") +
151+
scale_y_continuous(labels = scales::percent) +
152+
theme_minimal() +
153+
theme(legend.position = "none") +
154+
geom_text(aes(label = scales::percent(Value)), vjust = -0.5)
155+
156+
ggplot(combined_histogram, aes(x = coverage)) +
157+
geom_bar(aes(y = Coverage_Count), stat = "identity", fill = "blue", alpha = 0.5) +
158+
scale_y_continuous(
159+
name = "Coverage Count",
160+
sec.axis = sec_axis(~ . / 1000, name = "Duplication Value")
161+
) +
162+
theme_minimal() +
163+
labs(
164+
title = "Coverage Count vs. Duplication Value",
165+
x = "Coverage"
166+
)
167+
168+
# Save plots
169+
ggsave("Percent_Duplication.png")
170+
ggsave("MarkDuplicates_Histogram.png")
171+
ggsave("Coverage_Distribution.png")
172+
ggsave("WGS_Metrics_Summary.png")
173+
ggsave("Combined_Coverage_Duplication.png")
174+
175+
# 7. Save Merged Metrics Table (Optional)
176+
merged_metrics <- cbind(markdup_metrics, wgs_metrics)
177+
write.table(merged_metrics, "Merged_Metrics.txt", sep = "\t", row.names = FALSE, quote = FALSE)
178+

WGS_EM-seq_QC.R

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
2+
### QC for EM-seq## Basic plots
3+
4+
5+
bd.g.path = "/Users/arnavaz/Desktop/HPC/pughlab/projects/COMBAT/MethylDackel/em-seq"
6+
wgs.met.path = "/Users/arnavaz/Desktop/HPC/pughlab/projects/COMBAT/WgsMetrics"
7+
bd.g.flist = list.files(bd.g.path, full.names = T, pattern ="CpG.bedGraph")
8+
# wgs.metrics = list.files(wgs.met.path, full.names = T, pattern ="alignment_metrics.txt")
9+
10+
for (file in bd.g.flist){
11+
12+
tab = read.table(file, sep = "\t", stringsAsFactors = F, header = T, skip=1)
13+
14+
}
15+
16+
17+
18+
19+
# Load necessary libraries
20+
library(ggplot2)
21+
22+
# Read the GC Bias Detail Metrics output from Picard
23+
# Replace "GcBiasDetailMetrics.txt" with your actual file name
24+
f = "/Users/arnavaz/Desktop/HPC/pughlab/projects/COMBAT/WgsMetrics/COMBAT_0006_09_T1_bwamem_aligned_sorted_merged_markdup.bam_gc_bias_metrics.txt"
25+
gc_bias_data <- read.table(f, comment.char = "#", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
26+
27+
# Explore the data (Optional)
28+
head(gc_bias_data)
29+
30+
# Key columns typically include:
31+
# - "WINDOWS": Number of windows at each GC content
32+
# - "READS_USED": Number of reads contributing to GC content
33+
# - "NORMALIZED_COVERAGE": Coverage normalized to average
34+
35+
# Create a plot of GC content vs. normalized coverage
36+
gc_bias_plot <- ggplot(gc_bias_data, aes(x = GC, y = NORMALIZED_COVERAGE)) +
37+
geom_line(color = "blue") + # Line plot for GC bias trend
38+
geom_point(size = 1, color = "red", alpha = 0.6) + # Points for individual GC windows
39+
theme_minimal() + # Clean plot theme
40+
labs(
41+
title = "GC Bias Visualization",
42+
x = "GC Content (%)",
43+
y = "Normalized Coverage",
44+
caption = "Source: Picard GcBiasDetailMetrics"
45+
) +
46+
theme(
47+
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
48+
axis.title = element_text(size = 12)
49+
) +
50+
scale_x_continuous(breaks = seq(0, 100, 10)) # Adjust X-axis ticks
51+
52+
# Display the plot
53+
print(gc_bias_plot)
54+
55+
# Save the plot as an image (Optional)
56+
ggsave("GC_Bias_Plot.png", gc_bias_plot, width = 8, height = 6)
57+
58+
59+
60+
61+
62+
63+
64+
# Load required libraries
65+
library(ggplot2)
66+
library(dplyr)
67+
68+
# List of file paths for CollectAlignmentSummaryMetrics output
69+
file_paths <- list.files(path = "/Users/arnavaz/Desktop/HPC/pughlab/projects/COMBAT/WgsMetrics", pattern = ".*alignment_metrics.txt", full.names = TRUE)
70+
71+
# Function to read and extract the relevant columns
72+
read_picard_metrics <- function(file_path) {
73+
data <- read.delim(file_path, skip = 6, header = TRUE) # Skip the header rows
74+
sample_name <- gsub(".*\\/|_alignment_metrics.txt", "", file_path) # Extract the sample name
75+
data$Sample <- sample_name
76+
return(data)
77+
}
78+
79+
# Apply this function to all the files and bind them together
80+
all_samples_data <- do.call(rbind, lapply(file_paths, read_picard_metrics))
81+
82+
83+
84+

0 commit comments

Comments
 (0)