Skip to content

Commit 3408edf

Browse files
authored
Merge pull request #1 from AstrobioMike/master
merge in dev fork
2 parents 53f4094 + e5aba10 commit 3408edf

19 files changed

+25355
-23982
lines changed

Exploratory-Viz-and-DESeq.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Exploratory visualizations and [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)
2+
3+
## Summary
4+
An RStudio binder with example code for generating hierarchical clustering and ordination exploratory visualizations and some example code for running a differential expression analysis in [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html).
5+
6+
## Authors
7+
Michael D. Lee
8+
Shengwei Hou
9+
10+
## Binder
11+
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/AstrobioMike/speeding-up-science-binder/master?urlpath=rstudio)
12+
13+
## Github repo
14+
[https://github.com/AstrobioMike/speeding-up-science-binder](https://github.com/AstrobioMike/speeding-up-science-binder)
15+
16+
## HTML version
17+
[Exploratory visualizations](https://github.com/AstrobioMike/speeding-up-science-binder/blob/master/hclust-ord-plot.html)
18+
[DESeq2](https://github.com/AstrobioMike/speeding-up-science-binder/blob/master/deseq.html)
19+
20+
## Thumbnail image
21+
<img width="400" src="https://github.com/AstrobioMike/AstrobioMike.github.io/blob/master/images/deseq2-MA-thumbnail.png">

README.md

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,25 @@
1-
# speeding up science metatranscriptomics binder
1+
# Speeding up science binder
22

33
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/AstrobioMike/speeding-up-science-binder/master?urlpath=rstudio)
4+
5+
# Exploratory visualizations and [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)
6+
7+
## Summary
8+
An RStudio binder with example code for generating hierarchical clustering and ordination exploratory visualizations and some example code for running a differential expression analysis in [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html).
9+
10+
## Authors
11+
Michael D. Lee
12+
Shengwei Hou
13+
14+
## Binder
15+
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/AstrobioMike/speeding-up-science-binder/master?urlpath=rstudio)
16+
17+
## Github repo
18+
[https://github.com/AstrobioMike/speeding-up-science-binder](https://github.com/AstrobioMike/speeding-up-science-binder)
19+
20+
## HTML version
21+
[Exploratory visualizations](https://github.com/AstrobioMike/speeding-up-science-binder/blob/master/hclust-ord-plot.html)
22+
[DESeq2](https://github.com/AstrobioMike/speeding-up-science-binder/blob/master/deseq.html)
23+
24+
## Thumbnail image
25+
<img width="400" src="https://github.com/AstrobioMike/AstrobioMike.github.io/blob/master/images/deseq2-MA-thumbnail.png">

deseq.R

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
library("DESeq2")
2+
library("ggplot2")
3+
library("apeglm")
4+
library("pheatmap")
5+
6+
## Reading in data
7+
count_tab <- read.table("example_data/sample_raw_counts.tsv", sep="\t", header=T, row.names=1)
8+
sample_info_tab <- read.table("example_data/sample_info_tab.tsv", sep="\t", header=T, row.names=1)
9+
10+
## DESeq
11+
# making the deseq object
12+
deseq <- DESeqDataSetFromMatrix(countData = count_tab, colData = sample_info_tab, design = ~treatment)
13+
# setting the baseline treatment as the "Low" treatment
14+
deseq$treatment <- relevel(deseq$treatment, ref = "Low")
15+
# and running deseq standard analysis:
16+
deseq <- DESeq(deseq)
17+
# pulling out our results table, we specify the object, the p-value we are going to use to filter our results, and what contrast we want to consider by first naming the column, then the two groups we care about (we don't necessarily need this here because in this case we set the base level and there are only 2, but this is how you would state which things you want to contrast)
18+
high_vs_low_contrast <- results(deseq, alpha=0.01, contrast=c("treatment", "High", "Low"))
19+
# we can get a glimpse at what this table currently holds with the summary command
20+
summary(high_vs_low_contrast)
21+
# this tells us out of ~20,000 CDSs, with adj-p < 0.01, there are 667 increased when comparing the high CO2 treatment to the low CO2 treatment, and 140 decreased
22+
# "decreased" in this case means at a lower expression level in the High CO2 treatment than in the Low CO2 treatment, and "increased" means greater expression in the High as compared to the Low
23+
24+
# next let's stitch that together with KEGG annotations
25+
anot_tab <- read.table("example_data/sample_annotation_classifications.tsv", header=T, row.names=1, sep="\t")[,1, drop=F]
26+
# reordering both for easier merging
27+
anot_tab <- anot_tab[order(row.names(anot_tab)), , drop=F]
28+
deseq_tab <- data.frame(high_vs_low_contrast)
29+
30+
# all.equal(row.names(anot_tab), row.names(deseq_tab)) # checking to make sure they are the same
31+
deseq_res_with_KOs <- cbind(deseq_tab, anot_tab)
32+
33+
# let's subset this table to only include these that pass our specified significance level
34+
sigtab_high_vs_low <- deseq_res_with_KOs[which(deseq_res_with_KOs$padj < 0.01), ]
35+
36+
# and now let's sort that table by the baseMean column
37+
sigtab_high_vs_low <- sigtab_high_vs_low[order(sigtab_high_vs_low$baseMean, decreasing=T), ]
38+
39+
out_tab <- data.frame("CDS_ID"=row.names(sigtab_high_vs_low), sigtab_high_vs_low, row.names = NULL)
40+
# writing out table
41+
write.table(out_tab, "DESeq_high_vs_low_contrast.tsv", sep="\t", quote=F, row.names=F)
42+
43+
## Some visualizations
44+
# visualizing top gene (based on adj. p-value)
45+
topGene <- rownames(high_vs_low_contrast)[which.min(high_vs_low_contrast$padj)]
46+
data <- plotCounts(deseq, gene=topGene, intgroup=c("treatment"), returnData = T)
47+
top_gene_KO <- anot_tab[row.names(anot_tab) %in% topGene, ]
48+
49+
pdf("Most-sig-gene.pdf")
50+
ggplot(data, aes(x=treatment, y=count, fill=treatment)) +
51+
scale_y_log10() + theme_bw() +
52+
geom_dotplot(binaxis="y", stackdir="center") +
53+
ggtitle(paste0(topGene, " (", top_gene_KO, ")"))
54+
dev.off()
55+
56+
# applying shrinkage to lessen the impact of those with very low expression and those highly variable for plotting
57+
high_vs_low_shr <- lfcShrink(deseq, coef="treatment_High_vs_Low", type="apeglm")
58+
59+
# plotting MA plots of both
60+
pdf("DESeq_MA_plots.pdf")
61+
par(mfrow=c(1,2))
62+
# original logFC
63+
plotMA(deseq, ylim=c(min(high_vs_low_contrast$log2FoldChange),max(high_vs_low_contrast$log2FoldChange)), alpha=0.01, main="No log-fold-change shrinkage")
64+
# apeglm shrinkage logFC
65+
plotMA(high_vs_low_shr, ylim=c(min(high_vs_low_shr$log2FoldChange),max(high_vs_low_shr$log2FoldChange)), alpha=0.01, main="With log-fold-change shrinkage")
66+
dev.off()
67+
68+
# getting variance stabilized transformed table
69+
deseq_vst <- varianceStabilizingTransformation(deseq)
70+
# NOTE: If you get this error here with your dataset: "Error in
71+
# estimateSizeFactorsForMatrix(counts(object), locfunc =locfunc, : every
72+
# gene contains at least one zero, cannot compute log geometric means", that
73+
# can be because the count table is sparse with many zeroes, which is common
74+
# with marker-gene surveys. In that case you'd need to use a specific
75+
# function first that is equipped to deal with that. You could run:
76+
# deseq_counts <- estimateSizeFactors(deseq_counts, type = "poscounts")
77+
# now followed by the transformation function:
78+
# deseq_counts_vst <- varianceStabilizingTransformation(deseq_counts)
79+
80+
# and here is pulling out our transformed table
81+
vst_tab <- assay(deseq_vst)
82+
83+
# making heatmap of top 20 most highly expressed genes
84+
select <- order(rowMeans(counts(deseq, normalized=TRUE)),
85+
decreasing=TRUE)[1:20]
86+
87+
pdf("DESeq-highly-expressed-heatmap.pdf")
88+
pheatmap(vst_tab[select, ], cluster_rows=FALSE, cluster_cols=FALSE, annotation_col=sample_info_tab[, 1, drop=F])
89+
dev.off()
90+
91+
# making heatmap of sample distances
92+
euc_dists <- dist(t(vst_tab))
93+
euc_dist_mat <- as.matrix(euc_dists)
94+
95+
pdf("DESeq-sample-euc-dist-heatmap.pdf")
96+
pheatmap(euc_dist_mat, clustering_distance_rows=euc_dists, clustering_distance_cols=euc_dists, clustering_method="ward.D2")
97+
dev.off()
98+
99+
100+
101+
102+

deseq.Rmd

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
---
2+
title: "DESeq2"
3+
author: "Michael D. Lee and Shengwei Hou"
4+
date: "5/10/2019"
5+
output: html_document
6+
---
7+
8+
```{r setup, include=FALSE}
9+
knitr::opts_chunk$set(echo = TRUE)
10+
```
11+
12+
Much of this is built off of the [excellent documentation](http://master.bioconductor.org/packages/release/workflows/vignettes/rnaseqGene/inst/doc/rnaseqGene.html) provided by the developers of [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html).
13+
14+
## Libraries used here
15+
``` {r, message=FALSE, warning=FALSE}
16+
library("DESeq2")
17+
library("ggplot2")
18+
library("apeglm")
19+
library("pheatmap")
20+
```
21+
22+
## Reading in data
23+
```{r}
24+
count_tab <- read.table("example_data/sample_raw_counts.tsv", sep="\t", header=T, row.names=1)
25+
sample_info_tab <- read.table("example_data/sample_info_tab.tsv", sep="\t", header=T, row.names=1)
26+
```
27+
28+
## DESeq
29+
```{r}
30+
# making the deseq object
31+
deseq <- DESeqDataSetFromMatrix(countData = count_tab, colData = sample_info_tab, design = ~treatment)
32+
# setting the baseline treatment as the "Low" treatment
33+
deseq$treatment <- relevel(deseq$treatment, ref = "Low")
34+
# and running deseq standard analysis:
35+
deseq <- DESeq(deseq)
36+
# pulling out our results table, we specify the object, the p-value we are going to use to filter our results, and what contrast we want to consider by first naming the column, then the two groups we care about (we don't necessarily need this here because in this case we set the base level and there are only 2, but this is how you would state which things you want to contrast)
37+
high_vs_low_contrast <- results(deseq, alpha=0.01, contrast=c("treatment", "High", "Low"))
38+
# we can get a glimpse at what this table currently holds with the summary command
39+
summary(high_vs_low_contrast)
40+
```
41+
This tells us out of ~20,000 CDSs, with adj-p < 0.01, there are 667 increased when comparing the high CO2 treatment to the low CO2 treatment, and 140 decreased. With the way this contrast was specified, "decreased" means a lower expression level in the High CO2 treatment than in the Low CO2 treatment, and "increased" means greater expression in the High as compared to the Low.
42+
43+
Next let's stitch that together with the KEGG annotations we have:
44+
```{r}
45+
anot_tab <- read.table("example_data/sample_annotation_classifications.tsv", header=T, row.names=1, sep="\t")[,1, drop=F]
46+
# reordering both for easier merging
47+
anot_tab <- anot_tab[order(row.names(anot_tab)), , drop=F]
48+
deseq_tab <- data.frame(high_vs_low_contrast)
49+
50+
all.equal(row.names(anot_tab), row.names(deseq_tab)) # checking to make sure they are the same
51+
deseq_res_with_KOs <- cbind(deseq_tab, anot_tab)
52+
53+
# let's subset this table to only include these that pass our specified significance level
54+
sigtab_high_vs_low <- deseq_res_with_KOs[which(deseq_res_with_KOs$padj < 0.01), ]
55+
56+
# and now let's sort that table by the baseMean column
57+
sigtab_high_vs_low <- sigtab_high_vs_low[order(sigtab_high_vs_low$baseMean, decreasing=T), ]
58+
59+
out_tab <- data.frame("CDS_ID"=row.names(sigtab_high_vs_low), sigtab_high_vs_low, row.names = NULL)
60+
# writing out table
61+
write.table(out_tab, "DESeq_high_vs_low_contrast.tsv", sep="\t", quote=F, row.names=F)
62+
```
63+
64+
## Some possible visualization examples
65+
```{r}
66+
# visualizing top gene (based on adj. p-value)
67+
topGene <- rownames(high_vs_low_contrast)[which.min(high_vs_low_contrast$padj)]
68+
data <- plotCounts(deseq, gene=topGene, intgroup=c("treatment"), returnData = T)
69+
top_gene_KO <- anot_tab[row.names(anot_tab) %in% topGene, ]
70+
71+
ggplot(data, aes(x=treatment, y=count, fill=treatment)) +
72+
scale_y_log10() + theme_bw() +
73+
geom_dotplot(binaxis="y", stackdir="center") +
74+
ggtitle(paste0(topGene, " (", top_gene_KO, ")"))
75+
```
76+
77+
```{r}
78+
# applying shrinkage to lessen the impact of those with very low expression and those highly variable for plotting
79+
high_vs_low_shr <- lfcShrink(deseq, coef="treatment_High_vs_Low", type="apeglm")
80+
81+
# plotting MA plots of both
82+
par(mfrow=c(1,2))
83+
# original logFC
84+
plotMA(deseq, ylim=c(min(high_vs_low_contrast$log2FoldChange),max(high_vs_low_contrast$log2FoldChange)), alpha=0.01, main="No log-fold-change shrinkage")
85+
# apeglm shrinkage logFC
86+
plotMA(high_vs_low_shr, ylim=c(min(high_vs_low_shr$log2FoldChange),max(high_vs_low_shr$log2FoldChange)), alpha=0.01, main="With log-fold-change shrinkage")
87+
```
88+
89+
```{r}
90+
# getting variance stabilized transformed table
91+
deseq_vst <- varianceStabilizingTransformation(deseq)
92+
# NOTE: If you get this error here with your dataset: "Error in
93+
# estimateSizeFactorsForMatrix(counts(object), locfunc =locfunc, : every
94+
# gene contains at least one zero, cannot compute log geometric means", that
95+
# can be because the count table is sparse with many zeroes, which is common
96+
# with marker-gene surveys. In that case you'd need to use a specific
97+
# function first that is equipped to deal with that. You could run:
98+
# deseq_counts <- estimateSizeFactors(deseq_counts, type = "poscounts")
99+
# now followed by the transformation function:
100+
# deseq_counts_vst <- varianceStabilizingTransformation(deseq_counts)
101+
102+
# and here is pulling out our transformed table
103+
vst_tab <- assay(deseq_vst)
104+
105+
# making heatmap of top 20 most highly expressed genes
106+
select <- order(rowMeans(counts(deseq, normalized=TRUE)),
107+
decreasing=TRUE)[1:20]
108+
109+
pheatmap(vst_tab[select, ], cluster_rows=FALSE, cluster_cols=FALSE, annotation_col=sample_info_tab[, 1, drop=F])
110+
```
111+
112+
```{r}
113+
# making heatmap of sample distances
114+
euc_dists <- dist(t(vst_tab))
115+
euc_dist_mat <- as.matrix(euc_dists)
116+
117+
pheatmap(euc_dist_mat, clustering_distance_rows=euc_dists, clustering_distance_cols=euc_dists, clustering_method="ward.D2")
118+
```
119+
120+
121+
122+

deseq.html

Lines changed: 345 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)