Skip to content

Commit

Permalink
fix blacklist bug (lines matching chr[\dXY]_* were included)
Browse files Browse the repository at this point in the history
  • Loading branch information
leepc12 committed Dec 20, 2016
1 parent babb503 commit 92a84d1
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 20 deletions.
32 changes: 20 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -469,15 +469,19 @@ There are two kinds of HTML reports provided by the pipeline.
# Output directory structure and file naming
For more details, refer to the file table section in an HTML report generated by the pipeline.
For more details, refer to the file table section in an HTML report generated by the pipeline. Files marked as (E) are outputs to be uploaded during ENCODE accession.
```
out # root dir. of outputs
*report.html # HTML report
*tracks.json # Tracks datahub (JSON) for WashU browser
├ ENCODE_summary.json # Metadata of all datafiles and QC results
├ align # mapped alignments
│ ├ rep1 # for true replicate 1
│ │ ├ *.trim.fastq.gz # adapter-trimmed fastq
│ │ ├ *.bam # raw bam
│ │ ├ *.nodup.bam # filtered and deduped bam
│ │ ├ *.nodup.bam (E) # filtered and deduped bam
│ │ ├ *.tagAlign.gz # tagAlign (bed6) generated from filtered bam
│ │ └ *.tn5.tagAlign.gz # TN5 shifted tagAlign for ATAC pipeline (not for DNase pipeline)
│ ├ rep2 # for true replicate 2
Expand All @@ -491,18 +495,17 @@ out # root dir. of outputs
│ │ ...
│ └ pooled_pseudo_reps # for pooled pseudo replicates
│ ├ ppr1 # for pooled pseudo replicate 1 (rep1-pr1 + rep2-pr1 + ...)
│ ├ ppr2 # for pooled pseudo replicate 2 (rep1-pr2 + rep2-pr2 + ...)
│ ...
│ └ ppr2 # for pooled pseudo replicate 2 (rep1-pr2 + rep2-pr2 + ...)
├ peak # peaks called
│ ├ macs2 # peaks generated by MACS2
│ │ ├ rep1 # for replicate 1
│ │ │ ├ *.narrowPeak.gz # narrowPeak (p-val threshold = 0.01)
│ │ │ ├ *.gappedPeak.gz # gappedPeak (p-val threshold = 0.01)
│ │ │ ├ *.filt.narrowPeak.gz # blacklist filtered narrowPeak
│ │ │ ├ *.filt.gappedPeak.gz # blacklist filtered gappedPeak
│ │ │ ├ *.narrowPeak.bb # narrowPeak bigBed
│ │ │ ├ *.gappedPeak.bb # gappedPeak bigBed
│ │ │ ├ *.filt.narrowPeak.gz (E) # blacklist filtered narrowPeak
│ │ │ ├ *.filt.gappedPeak.gz (E) # blacklist filtered gappedPeak
│ │ │ ├ *.narrowPeak.bb (E) # narrowPeak bigBed
│ │ │ ├ *.gappedPeak.bb (E) # gappedPeak bigBed
│ │ │ ├ *.narrowPeak.hammock.gz # narrowPeak track for WashU browser
│ │ │ ├ *.gappedPeak.hammock.gz # gappedPeak track for WashU browser
│ │ │ ├ *.pval0.1.narrowPeak.gz # narrowPeak (p-val threshold = 0.1)
Expand All @@ -515,11 +518,16 @@ out # root dir. of outputs
│ └ idr # IDR thresholded peaks
│ ├ true_reps # for true replicates
│ │ ├ *.narrowPeak.gz # IDR thresholded narrowPeak
│ │ ├ *.filt.narrowPeak.gz # IDR thresholded narrowPeak (blacklist filtered)
│ │ ├ *.filt.narrowPeak.gz (E) # IDR thresholded narrowPeak (blacklist filtered)
│ │ └ *.12-col.bed.gz # IDR thresholded narrowPeak track for WashU browser
│ ├ pseudo_reps # for self pseudo replicates
│ │ ├ rep1 # for replicate 1
│ │ ...
│ ├ optimal_set # optimal IDR thresholded peaks
│ │ └ *.filt.narrowPeak.gz (E) # IDR thresholded narrowPeak (blacklist filtered)
│ ├ conservative_set # conservative IDR thresholded peaks
│ │ └ *.filt.narrowPeak.gz (E) # IDR thresholded narrowPeak (blacklist filtered)
│ ├ pseudo_reps # for self pseudo replicates
│ └ pooled_pseudo_reps # for pooled pseudo replicates
├ qc # QC logs
Expand All @@ -537,12 +545,12 @@ out # root dir. of outputs
├ signal # signal tracks
│ ├ macs2 # signal tracks generated by MACS2
│ │ ├ rep1 # for true replicate 1
│ │ │ ├ *.pval.signal.bigwig # signal track for p-val
│ │ │ └ *.fc.signal.bigwig # signal track for fold change
│ │ │ ├ *.pval.signal.bigwig (E) # signal track for p-val
│ │ │ └ *.fc.signal.bigwig (E) # signal track for fold change
│ ...
│ └ pooled_rep # for pooled replicate
└ report # files for HTML report
└ report # files for HTML report
```
# ENCODE accession guideline
Expand Down
10 changes: 5 additions & 5 deletions modules/callpeak_bigbed.bds
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@ string peak_to_bigbed( string filetype, string peak, string o_dir, string group
sys $shcmd_init

sys zcat $peak | sort -k1,1 -k2,2n > $bigbed.tmp
// sys cat $chrsz | grep -P 'chr[0-9XY]+(?!_)' > $bigbed.chrsz.tmp
// sys bedToBigBed $bed_param $bigbed.tmp $bigbed.chrsz.tmp $bigbed
// sys rm -f $bigbed.tmp $bigbed.chrsz.tmp
sys bedToBigBed $bed_param $bigbed.tmp $chrsz $bigbed
sys rm -f $bigbed.tmp
sys cat $chrsz | grep -P 'chr[\dXY]+[ \t]' > $bigbed.chrsz.tmp
sys bedToBigBed $bed_param $bigbed.tmp $bigbed.chrsz.tmp $bigbed
sys rm -f $bigbed.tmp $bigbed.chrsz.tmp
// sys bedToBigBed $bed_param $bigbed.tmp $chrsz $bigbed
// sys rm -f $bigbed.tmp

sys $shcmd_finalize
}
Expand Down
2 changes: 1 addition & 1 deletion modules/callpeak_blacklist_filter.bds
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ string blacklist_filter_peak( string filetype, string peak, string o_dir, string

sys bedtools intersect -v -a $peak -b $blacklist \
| awk 'BEGIN{OFS="\t"} {if ($5>1000) $5=1000; print $0}' \
| grep -P 'chr[0-9XY]+(?!_)' | gzip -nc > $filtered
| grep -P 'chr[\dXY]+[ \t]' | gzip -nc > $filtered

sys $shcmd_finalize
}
Expand Down
2 changes: 1 addition & 1 deletion modules/callpeak_idr.bds
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ string[] idr2( string peak1, string peak2, string pooled, string idr_thresh, str

if ( path_exists( blacklist ) ) {
cmd1 = "bedtools intersect -v -a $peak_idr_trk_tmp -b $blacklist"+\
" | grep -P 'chr[0-9XY]+(?!_)'"+\
" | grep -P 'chr[\dXY]+[ \t]'"+\
" | awk 'BEGIN{OFS=\"\t\"} {if ($5>1000) $5=1000; print $0}'"+\
" | gzip -nc > $filt_peak_idr_trk_tmp"
cmd2 = "zcat $filt_peak_idr_trk_tmp | awk 'BEGIN{OFS=\"\t\"} {print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}' | gzip -nc > $filt_peak_idr"
Expand Down
2 changes: 1 addition & 1 deletion modules/callpeak_naive_overlap.bds
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ string naive_overlap_peak( string filetype, string peak_pooled, string peak_rep1
string cmd1
if ( path_exists( blacklist ) ) {
cmd1 = "bedtools intersect -v -a $peak_final -b $blacklist "+\
" | grep -P 'chr[0-9XY]+(?!_)'"+\
" | grep -P 'chr[\dXY]+[ \t]'"+\
" | awk 'BEGIN{OFS=\"\t\"} {if ($5>1000) $5=1000; print $0}'"+\
" | gzip -nc > $filt_peak_final"
}
Expand Down

0 comments on commit 92a84d1

Please sign in to comment.