Merge pull request #22 from databio/dev_newplots

Some changes
databio · May 28, 2020 · 71d1e8b · 71d1e8b
2 parents d3cbe8e + 4c71e52
commit 71d1e8b
Show file tree

Hide file tree

Showing 7 changed files with 167 additions and 69 deletions.
diff --git a/README.md b/README.md
@@ -32,7 +32,7 @@ The above command will build the csv file looper needs to run the pipeline on al
 The input PEP can be validated against the [JSON schema in this repository](pep_schema.yaml). This ensures the PEP consists of all required attributes to run `bedstat` pipeline.
 
 ```
-eido -p <path/to/pep> -s pep_schema.yaml
+eido validate <path/to/pep> -s https://schema.databio.org/pipelines/bedstat.yaml
 ```
 
 ### 2. Create a persistent volume to house elasticsearch data
@@ -58,7 +58,7 @@ looper run project/bedstat_config.yaml
 
 The data loaded into elasticsearch should persist between elasticsearch invocations, on the es-data docker volume created above in step 2.
 
-### Optional step = run Kibana
+### 5. (optional) Run Kibana
 
 Kibana can be used in order to see ElasticSearch data in a "GUI" kind of a way.
 

diff --git a/pipeline/bedstat.py b/pipeline/bedstat.py
@@ -3,7 +3,7 @@
 bedfile statistics generating pipeline
 """
 
-__author__ = ["Michal Stolarczyk", "Ognen Duzlevski"]
+__author__ = ["Michal Stolarczyk", "Ognen Duzlevski", "Jose Verdezoto"]
 __email__ = "[email protected]"
 __version__ = "0.0.1"
 
@@ -12,41 +12,64 @@
 import json
 import yaml
 import os
+import warnings
 
-import pypiper
 from bbconf.const import *
+import pypiper
 import bbconf
 
-parser = ArgumentParser(description="A pipeline to read a file in BED format and produce metadata in JSON format.")
+parser = ArgumentParser(
+    description="A pipeline to read a file in BED format and produce metadata "
+                "in JSON format.")
 
-parser.add_argument('--bedfile', help='a full path to bed file to process', required=True)
-parser.add_argument("--bedbase-config", dest="bedbase_config", type=str, required=False, default=None,
-                    help="a path to the bedbase configuratiion file")
-parser.add_argument("-y", "--sample-yaml", dest="sample_yaml", type=str, required=False,
-                    help="a yaml config file with sample attributes to pass on more metadata into the database")
+parser.add_argument(
+    '--bedfile', help='a full path to bed file to process', required=True)
+parser.add_argument(
+    '--open-signal-matrix', type=str, required=False, default=None,
+    help='a full path to the openSignalMatrix required for the tissue '
+         'specificity plots')
+parser.add_argument(
+    "--bedbase-config", dest="bedbase_config", type=str, default=None,
+    help="a path to the bedbase configuratiion file")
+parser.add_argument(
+    "-y", "--sample-yaml", dest="sample_yaml", type=str, required=False,
+    help="a yaml config file with sample attributes to pass on more metadata "
+         "into the database")
 exclusive_group = parser.add_mutually_exclusive_group()
-exclusive_group.add_argument('--no-db-commit', action='store_true',
-                             help='whether the JSON commit to the database should be skipped')
-exclusive_group.add_argument('--just-db-commit', action='store_true',
-                             help='whether just to commit the JSON to the database')
-parser = pypiper.add_pypiper_args(parser, groups=["pypiper", "common", "looper", "ngs"])
+exclusive_group.add_argument(
+    '--no-db-commit', action='store_true',
+    help='whether the JSON commit to the database should be skipped')
+exclusive_group.add_argument(
+    '--just-db-commit', action='store_true',
+    help='whether just to commit the JSON to the database')
+parser = pypiper.add_pypiper_args(parser,
+                                  groups=["pypiper", "common", "looper", "ngs"])
 
 args = parser.parse_args()
 
 bbc = bbconf.BedBaseConf(filepath=bbconf.get_bedbase_cfg(args.bedbase_config))
 
 bed_digest = md5(open(args.bedfile, 'rb').read()).hexdigest()
 bedfile_name = os.path.split(args.bedfile)[1]
-fileid = os.path.splitext(os.path.splitext(bedfile_name)[0])[0]  # twice since there are 2 exts
-outfolder = os.path.abspath(os.path.join(bbc[CFG_PATH_KEY][CFG_BEDSTAT_OUTPUT_KEY], bed_digest))
+# need to split twice since there are 2 exts
+fileid = os.path.splitext(os.path.splitext(bedfile_name)[0])[0]
+outfolder = os.path.abspath(os.path.join(
+    bbc[CFG_PATH_KEY][CFG_BEDSTAT_OUTPUT_KEY], bed_digest))
 json_file_path = os.path.abspath(os.path.join(outfolder, fileid + ".json"))
 
 if not args.just_db_commit:
-    pm = pypiper.PipelineManager(name="bedstat-pipeline", outfolder=outfolder, args=args)
-    rscript_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "tools", "regionstat.R")
-    assert os.path.exists(rscript_path), FileNotFoundError("'{}' script not found".format(rscript_path))
-    cmd_vars = dict(rscript=rscript_path, bed=args.bedfile, id=fileid, out=outfolder, genome=args.genome_assembly, digest=bed_digest)
-    command = "Rscript {rscript} --bedfile={bed} --fileId={id} --outputfolder={out} --genome={genome} --digest={digest}".format(**cmd_vars)
+    pm = pypiper.PipelineManager(name="bedstat-pipeline", outfolder=outfolder,
+                                 args=args)
+    rscript_path = os.path.join(os.path.dirname(
+        os.path.dirname(os.path.abspath(__file__))), "tools", "regionstat.R")
+    assert os.path.exists(rscript_path), \
+        FileNotFoundError("'{}' script not found".format(rscript_path))
+    cmd_vars = dict(rscript=rscript_path, bed=args.bedfile, id=fileid,
+                    matrix=args.open_signal_matrix, out=outfolder,
+                    genome=args.genome_assembly, digest=bed_digest)
+    command = "Rscript {rscript} --bedfile={bed} --fileId={id} " \
+              "--openSignalMatrix={matrix} --outputfolder={out} " \
+              "--genome={genome} --digest={digest}".format(**cmd_vars)
     pm.run(cmd=command, target=json_file_path)
     pm.stop_pipeline()
 
@@ -68,13 +91,15 @@
                 except KeyError:
                     print("'{}' metadata not available".format(key))
         else:
-            warnings.warn("Specified sample_yaml path does not exist: {}".format(args.sample_yaml))
+            warnings.warn("Specified sample_yaml path does not exist: {}".
+                          format(args.sample_yaml))
     # enrich the data from R with the data from the sample line itself
-    # the bedfile_path below needs to be overwritten in Elastic in case the pipeline run was split
-    # into two computing environments. Currently used for the development.
-    # This concept leverages the potability introduced by environment variable in the
-    # bedfile path in the PEP. Locally the environment variable points to a different path than on
-    # the compute cluster where the heavy calculations happen.
+    # the bedfile_path below needs to be overwritten in Elastic in case the
+    # pipeline run was split into two computing environments. Currently used
+    # for the development. This concept leverages the potability introduced by
+    # environment variable in the bedfile path in the PEP. Locally the
+    # environment variable points to a different path than on the compute
+    # cluster where the heavy calculations happen.
     data[BEDFILE_PATH_KEY] = [args.bedfile]
     print("Data: {}".format(data))
     bbc.insert_bedfiles_data(data=data, doc_id=bed_digest)

diff --git a/pipeline_interface.yaml b/pipeline_interface.yaml
@@ -10,3 +10,6 @@ pipelines:
       "--bedfile": output_file_path
       "--genome": genome
       "--sample-yaml": yaml_file
+    optional_arguments:
+       "--open-signal-matrix": open_signal_matrix
+
diff --git a/pipeline_interface_new.yaml b/pipeline_interface_new.yaml
@@ -1,13 +1,11 @@
-protocol_mapping:
-    bedstat: bedstat
-
-pipelines:
-  bedstat:
-    name: BEDSTAT
-    path: pipeline/bedstat.py
-    schema: pep_schema.yaml
-    looper_args: True
-    command_template: >
-      { pipeline.path }
-      --bedfile { sample.output_file_path }
-      --genome { sample.genome }
+pipeline_name: BEDSTAT
+pipeline_type: sample
+path: pipeline/bedstat.py
+input_schema: http://schema.databio.org/pipelines/bedstat.yaml
+command_template: >
+  {pipeline.path}
+  --bedfile {sample.output_file_path}
+  --genome {sample.genome}
+  --sample-yaml {sample.yaml_file}
+  {% if sample.bedbase_config is defined %} --bedbase-config {sample.bedbase_config} {% endif %}
+  {% if sample.open_signal_matrix is defined %} --open-signal-matrix {sample.open_signal_matrix} {% endif %}
diff --git a/project/bedstat_config_new.yaml b/project/bedstat_config_new.yaml
@@ -0,0 +1,7 @@
+pep_version: 2.0.0
+sample_table: path
+looper:
+  output_dir: output
+sample_modifiers:
+  append:
+    pipeline_interfaces: ../pipeline_interface_new.yaml
diff --git a/scripts/installRdeps.R b/scripts/installRdeps.R
@@ -8,19 +8,18 @@
     }
 }
 
+.install_pkg("R.utils")
 .install_pkg("BiocManager")
 .install_pkg("optparse")
 .install_pkg("devtools")
-devtools::install_github("databio/GenomicDistributions", ref="dev")
-genomes = list(Hsapiens = c("hg18","hg19","hg38"), 
-               Mmusculus = c("mm10","mm9"))
-for(name in names(genomes)) {
-    for(genome in genomes[[name]]) {
-        # should install non-masked too
-        .install_pkg(p=paste0("BSgenome.", name, 
-                                ".UCSC.", genome,".masked"), 
-                     bioc=TRUE)
-    }
-}
 .install_pkg("GenomicRanges", bioc=TRUE)
+.install_pkg("GenomicFeatures", bioc=TRUE)
+.install_pkg("ensembldb", bioc=TRUE)
 .install_pkg("LOLA", bioc=TRUE)
+.install_pkg("BSgenome", bioc=TRUE)
+if(!require(package = "GenomicDistributions", character.only=TRUE)) {
+    devtools::install_github("databio/GenomicDistributions")
+}
+if(!require(package = "GenomicDistributionsData", character.only=TRUE)) {
+    install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.1.tar.gz", repos=NULL)
+}
diff --git a/tools/regionstat.R b/tools/regionstat.R
@@ -1,14 +1,18 @@
 library(GenomicDistributions)
+library(GenomicDistributionsData)
 library(optparse)
 library(tools)
+data(TSS_hg38)
 
 option_list = list(
     make_option(c("--bedfile"), type="character", default=NULL, 
               help="path to a BED file to process", metavar="character"),
 	make_option(c("--fileId"), type="character", default=NULL,
               help="BED file ID to use for output files prefix", metavar="character"),
+	make_option(c("--openSignalMatrix"), type="character",
+			  help="path to the open signal matrix required for the tissue specificity plot", metavar="character"),
     make_option(c("--digest"), type="character", default=NULL,
-                help="digest of the BED file", metavar="character"),
+              help="digest of the BED file", metavar="character"),
     make_option(c("--outputfolder"), type="character", default="output",
               help="base output folder for results", metavar="character"),
     make_option(c("--genome"), type="character", default="hg38",
@@ -32,15 +36,16 @@ if (is.null(opt$digest)) {
     stop("digest input missing.")
 }
 
+
 plotBoth <- function(plotPth, g){
     print(paste0("Plotting: ", plotPth))
-    ggplot2::ggsave(paste0(plotPth, ".png"), g, device="png", width=8, height=8, units="cm")
-    ggplot2::ggsave(paste0(plotPth, ".pdf"), g, device="pdf", width=12, height=12, units="cm")
+    ggplot2::ggsave(paste0(plotPth, ".png"), g, device="png", width=8, height=8, units="in")
+    ggplot2::ggsave(paste0(plotPth, ".pdf"), g, device="pdf", width=8, height=8, units="in")
 }
 
-doitall <- function(query, fname, fileId, genome) {
+doItAall <- function(query, fname, fileId, genome, cellMatrix) {
     plots = data.frame(stringsAsFactors=F)
-
+    bsGenomeAvail = ifelse((requireNamespace(BSg, quietly=TRUE) | requireNamespace(BSgm, quietly=TRUE)), TRUE, FALSE)
     ## continue on with calculations
 	TSSdist = calcFeatureDistRefTSS(query, genome)
 	plotId = "tssdist"
@@ -50,54 +55,115 @@ doitall <- function(query, fname, fileId, genome) {
     plots = rbind(plots, newPlot)
 
 
+    # Chromosomes region distribution plot
 	x = calcChromBinsRef(query, genome)
     plotId = "chrombins"
     plotBoth(paste0(outfolder, "/", fileId, "_", plotId), 
              plotChromBins(x))
     newPlot = data.frame("name"=plotId, "caption"="Regions distribution over chromosomes")
     plots = rbind(plots, newPlot)
 
-	gcvec = calcGCContentRef(query, genome)
-	plotId = "gccontent"
-    plotBoth(paste0(outfolder, "/", fileId, "_", plotId),
-             plotGCContent(gcvec))
-    newPlot = data.frame("name"=plotId, "caption"="GC content")
-    plots = rbind(plots, newPlot)
-
+	# OPTIONAL: Plot GC content only if proper BSgenome package is installed. 
+	if (bsGenomeAvail) {
+		gcvec = calcGCContentRef(query, genome)
+		plotId = "gccontent"
+		plotBoth(paste0(outfolder, "/", fileId, "_", plotId),
+					plotGCContent(gcvec))
+		newPlot = data.frame("name"=plotId, "caption"="GC content")
+		plots = rbind(plots, newPlot)
+	}
+    # Partition Plots, default to percentages
 	gp = calcPartitionsRef(query, genome)
 	plotId = "partitions"
 	plotBoth(paste0(outfolder, "/", fileId, "_", plotId), 
 	         plotPartitions(gp))
 	newPlot = data.frame("name"=plotId, "caption"="Regions distribution over genomic partitions")
 	plots = rbind(plots, newPlot)
-
+
+	ep = calcExpectedPartitionsRef(query, genome)
+	plotId = "expected_partitions"
+	plotBoth(paste0(outfolder, "/", fileId, "_", plotId), 
+	         plotExpectedPartitions(ep))
+	newPlot = data.frame("name"=plotId, "caption"="Expected distribution over genomic partitions")
+	plots = rbind(plots, newPlot)
+
+	cp = calcCumulativePartitionsRef(query, genome)
+	plotId = "cumulative_partitions"
+	plotBoth(paste0(outfolder, "/", fileId, "_", plotId),
+			 plotCumulativePartitions(cp))
+	newPlot = data.frame("name"=plotId, "caption"="Cumulative distribution over genomic partitions")
+	plots = rbind(plots, newPlot)
+
+
 	# flatten the result returned by the function above
 	partiotionNames = as.vector(gp[,"partition"])
 	partitionsList = list()
 	for(i in seq_along(partiotionNames)){
 	    partitionsList[[paste0(partiotionNames[i], "_frequency")]] = 
 	        as.vector(gp[,"Freq"])[i]
 	    partitionsList[[paste0(partiotionNames[i], "_percentage")]] = 
-	        as.vector(gp[,"Freq"])[i]/length(query)
+	        as.vector(gp[,"Freq"])[i]/length(query)	        
 	}
+
+	# QThist plot
+	widths = calcWidth(query)
+	plotId = "widths_histogram"
+	plotBoth(paste0(outfolder, "/", fileId, "_", plotId),
+		plotQTHist(widths))
+	newPlot = data.frame("name"=plotId, "caption"="Quantile-Trimmed Histogram of Widths")
+	plots = rbind(plots, newPlot)
+
+	# Neighbor regions distance plots
+	dist = calcNeighborDist(query)
+	plotId = "neighbor_distances"
+	plotBoth(paste0(outfolder, "/", fileId, "_", plotId), 
+	         plotNeighborDist(dist))
+	newPlot = data.frame("name"=plotId, "caption"="Distance between neighbor regions")
+	plots = rbind(plots, newPlot)
+
+	# OPTIONAL: Add tissue specificity plot if open signal matrix is provided
+	if (cellMatrix == "None") {
+		message("open signal matrix not provided. Skipping tissue specificity plot ... ")
+	} else {
+		matrix = data.table::fread(cellMatrix)
+		op = calcOpenSignal(query, matrix)
+		plotId = "open_chromatin"
+		plotBoth(paste0(outfolder, "/", fileId, "_", plotId), 
+		         plotOpenSignal(op))
+		newPlot = data.frame("name"=plotId, "caption"="Cell specific enrichment for open chromatin")
+		plots = rbind(plots, newPlot)
+	}
+
 	# Note: names of the list elements MUST match what's defined in: https://github.com/databio/bbconf/blob/master/bbconf/const.py
 	bedmeta = list(
 	    id=fileId,
-		gc_content=mean(gcvec),
+		gc_content=ifelse(bsGenomeAvail, mean(gcvec), NA),
 		regions_no=length(query),
 		mean_absolute_TSS_dist=mean(abs(TSSdist), na.rm=TRUE),
+		mean_region_width=mean(widths),
 		md5sum=opt$digest,
 		plots=plots,
 		bedfile_path=fname
 	)
 	write(jsonlite::toJSON(c(bedmeta, partitionsList), pretty=TRUE), paste0(outfolder, "/", fileId, ".json"))
 }
 
-# set query to bed file
+# define values and output folder for doitall()
 fileId = opt$fileId
 fn = opt$bedfile
 outfolder = opt$outputfolder
 genome = opt$genome
+cellMatrix = opt$openSignalMatrix
+orgName = "Mmusculus"
+
+# build BSgenome package ID to check whether it's installed
+if (startsWith(genome, "hg") | startsWith(genome, "grch")) orgName = "Hsapiens"
 
+BSg = paste0("BSgenome.", orgName , ".UCSC.", genome)
+BSgm = paste0(BSg, ".masked")
+
+# read bed file and run doitall()
 query = LOLA::readBed(fn)
-doitall(query, fn, fileId, genome)
+doItAall(query, fn, fileId, genome, cellMatrix)
+
+