Skip to content
This repository has been archived by the owner on Dec 19, 2023. It is now read-only.

Commit

Permalink
Merge pull request #22 from databio/dev_newplots
Browse files Browse the repository at this point in the history
Some changes
  • Loading branch information
stolarczyk authored May 28, 2020
2 parents d3cbe8e + 4c71e52 commit 71d1e8b
Show file tree
Hide file tree
Showing 7 changed files with 167 additions and 69 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ The above command will build the csv file looper needs to run the pipeline on al
The input PEP can be validated against the [JSON schema in this repository](pep_schema.yaml). This ensures the PEP consists of all required attributes to run `bedstat` pipeline.

```
eido -p <path/to/pep> -s pep_schema.yaml
eido validate <path/to/pep> -s https://schema.databio.org/pipelines/bedstat.yaml
```

### 2. Create a persistent volume to house elasticsearch data
Expand All @@ -58,7 +58,7 @@ looper run project/bedstat_config.yaml

The data loaded into elasticsearch should persist between elasticsearch invocations, on the es-data docker volume created above in step 2.

### Optional step = run Kibana
### 5. (optional) Run Kibana

Kibana can be used in order to see ElasticSearch data in a "GUI" kind of a way.

Expand Down
77 changes: 51 additions & 26 deletions pipeline/bedstat.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
bedfile statistics generating pipeline
"""

__author__ = ["Michal Stolarczyk", "Ognen Duzlevski"]
__author__ = ["Michal Stolarczyk", "Ognen Duzlevski", "Jose Verdezoto"]
__email__ = "[email protected]"
__version__ = "0.0.1"

Expand All @@ -12,41 +12,64 @@
import json
import yaml
import os
import warnings

import pypiper
from bbconf.const import *
import pypiper
import bbconf

parser = ArgumentParser(description="A pipeline to read a file in BED format and produce metadata in JSON format.")
parser = ArgumentParser(
description="A pipeline to read a file in BED format and produce metadata "
"in JSON format.")

parser.add_argument('--bedfile', help='a full path to bed file to process', required=True)
parser.add_argument("--bedbase-config", dest="bedbase_config", type=str, required=False, default=None,
help="a path to the bedbase configuratiion file")
parser.add_argument("-y", "--sample-yaml", dest="sample_yaml", type=str, required=False,
help="a yaml config file with sample attributes to pass on more metadata into the database")
parser.add_argument(
'--bedfile', help='a full path to bed file to process', required=True)
parser.add_argument(
'--open-signal-matrix', type=str, required=False, default=None,
help='a full path to the openSignalMatrix required for the tissue '
'specificity plots')
parser.add_argument(
"--bedbase-config", dest="bedbase_config", type=str, default=None,
help="a path to the bedbase configuratiion file")
parser.add_argument(
"-y", "--sample-yaml", dest="sample_yaml", type=str, required=False,
help="a yaml config file with sample attributes to pass on more metadata "
"into the database")
exclusive_group = parser.add_mutually_exclusive_group()
exclusive_group.add_argument('--no-db-commit', action='store_true',
help='whether the JSON commit to the database should be skipped')
exclusive_group.add_argument('--just-db-commit', action='store_true',
help='whether just to commit the JSON to the database')
parser = pypiper.add_pypiper_args(parser, groups=["pypiper", "common", "looper", "ngs"])
exclusive_group.add_argument(
'--no-db-commit', action='store_true',
help='whether the JSON commit to the database should be skipped')
exclusive_group.add_argument(
'--just-db-commit', action='store_true',
help='whether just to commit the JSON to the database')
parser = pypiper.add_pypiper_args(parser,
groups=["pypiper", "common", "looper", "ngs"])

args = parser.parse_args()

bbc = bbconf.BedBaseConf(filepath=bbconf.get_bedbase_cfg(args.bedbase_config))

bed_digest = md5(open(args.bedfile, 'rb').read()).hexdigest()
bedfile_name = os.path.split(args.bedfile)[1]
fileid = os.path.splitext(os.path.splitext(bedfile_name)[0])[0] # twice since there are 2 exts
outfolder = os.path.abspath(os.path.join(bbc[CFG_PATH_KEY][CFG_BEDSTAT_OUTPUT_KEY], bed_digest))
# need to split twice since there are 2 exts
fileid = os.path.splitext(os.path.splitext(bedfile_name)[0])[0]
outfolder = os.path.abspath(os.path.join(
bbc[CFG_PATH_KEY][CFG_BEDSTAT_OUTPUT_KEY], bed_digest))
json_file_path = os.path.abspath(os.path.join(outfolder, fileid + ".json"))

if not args.just_db_commit:
pm = pypiper.PipelineManager(name="bedstat-pipeline", outfolder=outfolder, args=args)
rscript_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "tools", "regionstat.R")
assert os.path.exists(rscript_path), FileNotFoundError("'{}' script not found".format(rscript_path))
cmd_vars = dict(rscript=rscript_path, bed=args.bedfile, id=fileid, out=outfolder, genome=args.genome_assembly, digest=bed_digest)
command = "Rscript {rscript} --bedfile={bed} --fileId={id} --outputfolder={out} --genome={genome} --digest={digest}".format(**cmd_vars)
pm = pypiper.PipelineManager(name="bedstat-pipeline", outfolder=outfolder,
args=args)
rscript_path = os.path.join(os.path.dirname(
os.path.dirname(os.path.abspath(__file__))), "tools", "regionstat.R")
assert os.path.exists(rscript_path), \
FileNotFoundError("'{}' script not found".format(rscript_path))
cmd_vars = dict(rscript=rscript_path, bed=args.bedfile, id=fileid,
matrix=args.open_signal_matrix, out=outfolder,
genome=args.genome_assembly, digest=bed_digest)
command = "Rscript {rscript} --bedfile={bed} --fileId={id} " \
"--openSignalMatrix={matrix} --outputfolder={out} " \
"--genome={genome} --digest={digest}".format(**cmd_vars)
pm.run(cmd=command, target=json_file_path)
pm.stop_pipeline()

Expand All @@ -68,13 +91,15 @@
except KeyError:
print("'{}' metadata not available".format(key))
else:
warnings.warn("Specified sample_yaml path does not exist: {}".format(args.sample_yaml))
warnings.warn("Specified sample_yaml path does not exist: {}".
format(args.sample_yaml))
# enrich the data from R with the data from the sample line itself
# the bedfile_path below needs to be overwritten in Elastic in case the pipeline run was split
# into two computing environments. Currently used for the development.
# This concept leverages the potability introduced by environment variable in the
# bedfile path in the PEP. Locally the environment variable points to a different path than on
# the compute cluster where the heavy calculations happen.
# the bedfile_path below needs to be overwritten in Elastic in case the
# pipeline run was split into two computing environments. Currently used
# for the development. This concept leverages the potability introduced by
# environment variable in the bedfile path in the PEP. Locally the
# environment variable points to a different path than on the compute
# cluster where the heavy calculations happen.
data[BEDFILE_PATH_KEY] = [args.bedfile]
print("Data: {}".format(data))
bbc.insert_bedfiles_data(data=data, doc_id=bed_digest)
Expand Down
3 changes: 3 additions & 0 deletions pipeline_interface.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,6 @@ pipelines:
"--bedfile": output_file_path
"--genome": genome
"--sample-yaml": yaml_file
optional_arguments:
"--open-signal-matrix": open_signal_matrix

24 changes: 11 additions & 13 deletions pipeline_interface_new.yaml
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
protocol_mapping:
bedstat: bedstat

pipelines:
bedstat:
name: BEDSTAT
path: pipeline/bedstat.py
schema: pep_schema.yaml
looper_args: True
command_template: >
{ pipeline.path }
--bedfile { sample.output_file_path }
--genome { sample.genome }
pipeline_name: BEDSTAT
pipeline_type: sample
path: pipeline/bedstat.py
input_schema: http://schema.databio.org/pipelines/bedstat.yaml
command_template: >
{pipeline.path}
--bedfile {sample.output_file_path}
--genome {sample.genome}
--sample-yaml {sample.yaml_file}
{% if sample.bedbase_config is defined %} --bedbase-config {sample.bedbase_config} {% endif %}
{% if sample.open_signal_matrix is defined %} --open-signal-matrix {sample.open_signal_matrix} {% endif %}
7 changes: 7 additions & 0 deletions project/bedstat_config_new.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
pep_version: 2.0.0
sample_table: path
looper:
output_dir: output
sample_modifiers:
append:
pipeline_interfaces: ../pipeline_interface_new.yaml
21 changes: 10 additions & 11 deletions scripts/installRdeps.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,18 @@
}
}

.install_pkg("R.utils")
.install_pkg("BiocManager")
.install_pkg("optparse")
.install_pkg("devtools")
devtools::install_github("databio/GenomicDistributions", ref="dev")
genomes = list(Hsapiens = c("hg18","hg19","hg38"),
Mmusculus = c("mm10","mm9"))
for(name in names(genomes)) {
for(genome in genomes[[name]]) {
# should install non-masked too
.install_pkg(p=paste0("BSgenome.", name,
".UCSC.", genome,".masked"),
bioc=TRUE)
}
}
.install_pkg("GenomicRanges", bioc=TRUE)
.install_pkg("GenomicFeatures", bioc=TRUE)
.install_pkg("ensembldb", bioc=TRUE)
.install_pkg("LOLA", bioc=TRUE)
.install_pkg("BSgenome", bioc=TRUE)
if(!require(package = "GenomicDistributions", character.only=TRUE)) {
devtools::install_github("databio/GenomicDistributions")
}
if(!require(package = "GenomicDistributionsData", character.only=TRUE)) {
install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.1.tar.gz", repos=NULL)
}
100 changes: 83 additions & 17 deletions tools/regionstat.R
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
library(GenomicDistributions)
library(GenomicDistributionsData)
library(optparse)
library(tools)
data(TSS_hg38)

option_list = list(
make_option(c("--bedfile"), type="character", default=NULL,
help="path to a BED file to process", metavar="character"),
make_option(c("--fileId"), type="character", default=NULL,
help="BED file ID to use for output files prefix", metavar="character"),
make_option(c("--openSignalMatrix"), type="character",
help="path to the open signal matrix required for the tissue specificity plot", metavar="character"),
make_option(c("--digest"), type="character", default=NULL,
help="digest of the BED file", metavar="character"),
help="digest of the BED file", metavar="character"),
make_option(c("--outputfolder"), type="character", default="output",
help="base output folder for results", metavar="character"),
make_option(c("--genome"), type="character", default="hg38",
Expand All @@ -32,15 +36,16 @@ if (is.null(opt$digest)) {
stop("digest input missing.")
}


plotBoth <- function(plotPth, g){
print(paste0("Plotting: ", plotPth))
ggplot2::ggsave(paste0(plotPth, ".png"), g, device="png", width=8, height=8, units="cm")
ggplot2::ggsave(paste0(plotPth, ".pdf"), g, device="pdf", width=12, height=12, units="cm")
ggplot2::ggsave(paste0(plotPth, ".png"), g, device="png", width=8, height=8, units="in")
ggplot2::ggsave(paste0(plotPth, ".pdf"), g, device="pdf", width=8, height=8, units="in")
}

doitall <- function(query, fname, fileId, genome) {
doItAall <- function(query, fname, fileId, genome, cellMatrix) {
plots = data.frame(stringsAsFactors=F)

bsGenomeAvail = ifelse((requireNamespace(BSg, quietly=TRUE) | requireNamespace(BSgm, quietly=TRUE)), TRUE, FALSE)
## continue on with calculations
TSSdist = calcFeatureDistRefTSS(query, genome)
plotId = "tssdist"
Expand All @@ -50,54 +55,115 @@ doitall <- function(query, fname, fileId, genome) {
plots = rbind(plots, newPlot)


# Chromosomes region distribution plot
x = calcChromBinsRef(query, genome)
plotId = "chrombins"
plotBoth(paste0(outfolder, "/", fileId, "_", plotId),
plotChromBins(x))
newPlot = data.frame("name"=plotId, "caption"="Regions distribution over chromosomes")
plots = rbind(plots, newPlot)

gcvec = calcGCContentRef(query, genome)
plotId = "gccontent"
plotBoth(paste0(outfolder, "/", fileId, "_", plotId),
plotGCContent(gcvec))
newPlot = data.frame("name"=plotId, "caption"="GC content")
plots = rbind(plots, newPlot)

# OPTIONAL: Plot GC content only if proper BSgenome package is installed.
if (bsGenomeAvail) {
gcvec = calcGCContentRef(query, genome)
plotId = "gccontent"
plotBoth(paste0(outfolder, "/", fileId, "_", plotId),
plotGCContent(gcvec))
newPlot = data.frame("name"=plotId, "caption"="GC content")
plots = rbind(plots, newPlot)
}
# Partition Plots, default to percentages
gp = calcPartitionsRef(query, genome)
plotId = "partitions"
plotBoth(paste0(outfolder, "/", fileId, "_", plotId),
plotPartitions(gp))
newPlot = data.frame("name"=plotId, "caption"="Regions distribution over genomic partitions")
plots = rbind(plots, newPlot)


ep = calcExpectedPartitionsRef(query, genome)
plotId = "expected_partitions"
plotBoth(paste0(outfolder, "/", fileId, "_", plotId),
plotExpectedPartitions(ep))
newPlot = data.frame("name"=plotId, "caption"="Expected distribution over genomic partitions")
plots = rbind(plots, newPlot)

cp = calcCumulativePartitionsRef(query, genome)
plotId = "cumulative_partitions"
plotBoth(paste0(outfolder, "/", fileId, "_", plotId),
plotCumulativePartitions(cp))
newPlot = data.frame("name"=plotId, "caption"="Cumulative distribution over genomic partitions")
plots = rbind(plots, newPlot)


# flatten the result returned by the function above
partiotionNames = as.vector(gp[,"partition"])
partitionsList = list()
for(i in seq_along(partiotionNames)){
partitionsList[[paste0(partiotionNames[i], "_frequency")]] =
as.vector(gp[,"Freq"])[i]
partitionsList[[paste0(partiotionNames[i], "_percentage")]] =
as.vector(gp[,"Freq"])[i]/length(query)
as.vector(gp[,"Freq"])[i]/length(query)
}

# QThist plot
widths = calcWidth(query)
plotId = "widths_histogram"
plotBoth(paste0(outfolder, "/", fileId, "_", plotId),
plotQTHist(widths))
newPlot = data.frame("name"=plotId, "caption"="Quantile-Trimmed Histogram of Widths")
plots = rbind(plots, newPlot)

# Neighbor regions distance plots
dist = calcNeighborDist(query)
plotId = "neighbor_distances"
plotBoth(paste0(outfolder, "/", fileId, "_", plotId),
plotNeighborDist(dist))
newPlot = data.frame("name"=plotId, "caption"="Distance between neighbor regions")
plots = rbind(plots, newPlot)

# OPTIONAL: Add tissue specificity plot if open signal matrix is provided
if (cellMatrix == "None") {
message("open signal matrix not provided. Skipping tissue specificity plot ... ")
} else {
matrix = data.table::fread(cellMatrix)
op = calcOpenSignal(query, matrix)
plotId = "open_chromatin"
plotBoth(paste0(outfolder, "/", fileId, "_", plotId),
plotOpenSignal(op))
newPlot = data.frame("name"=plotId, "caption"="Cell specific enrichment for open chromatin")
plots = rbind(plots, newPlot)
}

# Note: names of the list elements MUST match what's defined in: https://github.com/databio/bbconf/blob/master/bbconf/const.py
bedmeta = list(
id=fileId,
gc_content=mean(gcvec),
gc_content=ifelse(bsGenomeAvail, mean(gcvec), NA),
regions_no=length(query),
mean_absolute_TSS_dist=mean(abs(TSSdist), na.rm=TRUE),
mean_region_width=mean(widths),
md5sum=opt$digest,
plots=plots,
bedfile_path=fname
)
write(jsonlite::toJSON(c(bedmeta, partitionsList), pretty=TRUE), paste0(outfolder, "/", fileId, ".json"))
}

# set query to bed file
# define values and output folder for doitall()
fileId = opt$fileId
fn = opt$bedfile
outfolder = opt$outputfolder
genome = opt$genome
cellMatrix = opt$openSignalMatrix
orgName = "Mmusculus"

# build BSgenome package ID to check whether it's installed
if (startsWith(genome, "hg") | startsWith(genome, "grch")) orgName = "Hsapiens"

BSg = paste0("BSgenome.", orgName , ".UCSC.", genome)
BSgm = paste0(BSg, ".masked")

# read bed file and run doitall()
query = LOLA::readBed(fn)
doitall(query, fn, fileId, genome)
doItAall(query, fn, fileId, genome, cellMatrix)


0 comments on commit 71d1e8b

Please sign in to comment.