Prepare for Release Version 1.2.2 (#43)

* Allow genome size to be given in scientific notation * Update genome size parameter info * Bump up version numbers * Don't require genome_size argument * Calculate dist using only read lengths > min_size * Use 2 * cut length as lower limit for dist * Add time commands to long-to-linked and minimap2 steps for tigmint-long * use 1000 as lower bound for dist * create fastq file for cut reads * simplify get_genome_size * create argument for dist read length percentile * v1.2.2 * update pipeline flow chart * change string formatting * change expected long-to-linked outputs to fastq * new expected output for long-to-linked * Use p50 for dist calculation * Update tests to use lower-bounded read length p50 as auto dist * Don't upgrade pip installations * Use python3.8 and add upgrade command back * Fix python paths * Update docstring Co-authored-by: Lauren Coombe <[email protected]>
bcgsc · Jan 15, 2021 · 0873233 · 0873233
1 parent ab2a39b
commit 0873233
Show file tree

Hide file tree

Showing 14 changed files with 136 additions and 119 deletions.
diff --git a/README.md b/README.md
@@ -118,10 +118,10 @@ tigmint-make metrics draft=myassembly reads=myreads ref=GRCh38 G=3088269832
 ```
 ***
 
-To run Tigmint with long reads in fasta or fastq format (`myreads.fa.gz` or `myreads.fq.gz`) on the draft assembly `myassembly.fa`:
+To run Tigmint with long reads in fasta or fastq format (`myreads.fa.gz` or `myreads.fq.gz`) on the draft assembly `myassembly.fa` for an organism with a genome size of gsize:
 
 ```sh
-tigmint-make tigmint-long draft=myassembly reads=myreads span=auto G=genomesize dist=auto
+tigmint-make tigmint-long draft=myassembly reads=myreads span=auto G=gsize dist=auto
 ```
 
 - `minimap2 map-ont` is used to align long reads from the Oxford Nanopore Technologies (ONT) platform, which is the default input for Tigmint. To use PacBio long reads specify the parameter `longmap=pb`
@@ -145,7 +145,7 @@ tigmint-make tigmint-long draft=myassembly reads=myreads span=auto G=genomesize
 
 + `draft`: Name of the draft assembly, `myassembly.fa`
 + `reads`: Name of the reads, `myreads.fq.gz`
-+ `G`: Haploid genome size of the draft assembly organism. Used to calculate `span` parameter automatically
++ `G`: Haploid genome size of the draft assembly organism. Used to calculate `span` parameter automatically. Can be given as an integer or in scientific notation (e.g. '3e9' for human)
 + `span=20`: Number of spanning molecules threshold. Set `span=auto` to automatically select span parameter (currently only recommended for `tigmint-long`)
 + `cut=500`: Cut length for long reads (`tigmint-long` only)
 + `longmap=ont`: Long read platform; `ont` for Oxford Nanopore Technologies (ONT) long reads, `pb` for PacBio long reads (`tigmint-long` only)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -10,8 +10,8 @@ jobs:
       versionSpec: "3.8"
       architecture: x64
   - script: |
-      sudo HOMEBREW_NO_AUTO_UPDATE=1 /home/linuxbrew/.linuxbrew/bin/brew install python
-      sudo /home/linuxbrew/.linuxbrew/bin/pip3 install --upgrade setuptools \
+      sudo HOMEBREW_NO_AUTO_UPDATE=1 /home/linuxbrew/.linuxbrew/bin/brew install python@3.8
+      sudo /home/linuxbrew/.linuxbrew/opt/[email protected]/bin/pip3 install --upgrade setuptools \
       -U pip --no-cache-dir \
       pylint .
     displayName: Install Python packages
@@ -23,14 +23,14 @@ jobs:
       cd ../
     displayName: Run pylint
   - script: |
-      export PATH=/home/linuxbrew/.linuxbrew/bin:$PATH
-      echo path:
-      echo $PATH
-      sudo /home/linuxbrew/.linuxbrew/bin/pip3 install pytest
+      sudo /home/linuxbrew/.linuxbrew/opt/[email protected]/bin/pip3 install pytest
       sudo HOMEBREW_NO_AUTOUPDATE=1 /home/linuxbrew/.linuxbrew/bin/brew tap brewsci/bio 
       sudo HOMEBREW_NO_AUTOUPDATE=1 /home/linuxbrew/.linuxbrew/bin/brew install bedtools bwa samtools gnu-time minimap2
-      cd tests/
+      export PATH=/home/linuxbrew/.linuxbrew/bin:$PATH
+      export PATH=/home/linuxbrew/.linuxbrew/opt/[email protected]/bin:$PATH
+      echo $PATH
+      cd tests
       pytest -vs tigmint_test.py
     env:
       HOMEBREW_NO_AUTO_UPDATE: 1
-    displayName: Run pytests
+    displayName: Run pytests
diff --git a/bin/long-to-linked b/bin/long-to-linked
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3 
 """
-Cut long reads and assign barcodes to the subreads. Optionally calculate span parameter for tigmint-cut.
+Cuts long reads and assigns barcodes to the subreads, and optionally calculates tigmint-long span and dist parameters.
 Usage: gunzip -c reads.fa.gz (or reads.fq.gz) | python3 long-to-linked -l length -m min_size -s -g genome_size | gzip > reads.cutlength.fa.gz
 @author: Janet Li
 """
@@ -22,44 +22,58 @@ def cut_reads(length, min_size, auto_span, genome_size, param_outfile, auto_dist
     """
     # Span automatically calculated as 1/4 of sequence coverage
     cov_to_span = 0.25
-    # Dist automatically calculated as p5 of read length
-    dist_read_perc = 5
+    # Lower bound for dist
+    dist_lower_bound = 1000
+    # Dist parameter read percentile
+    dist_perc = 50
+
     total_bases = 0
     if auto_dist:
         read_lengths = []
     with open("/dev/stdin", 'rt') as reads:
         barcode = 0
         if not reads.isatty():  # Check that reads are being piped in from stdin
-            for header, seq, _, _ in read_fasta(reads):
+            for header, seq, _, qual in read_fasta(reads):
                 read_length = len(seq)
                 total_bases += read_length
-                if auto_dist:
+                if auto_dist and read_length >= dist_lower_bound:
                     read_lengths.append(read_length)
                 if read_length >= min_size:
                     split_reads = split_long_read(seq, length)
+                    if qual:
+                        split_quals = split_long_read(qual, length)
+                    else:
+                        split_quals = split_long_read(read_length * "#", length)
                     for i, read in enumerate(split_reads):
-                        print(">" + header + "_" + str(i) + " BX:Z:" + str(barcode), file=sys.stdout)
-                        print(read, file=sys.stdout)
+                        outheader = "@{0}_{1} BX:Z:{2}".format(header, i, barcode)
+                        print(outheader, read, "+", split_quals[i], sep="\n", file=sys.stdout)
                     barcode += 1
             if auto_span or auto_dist:
                 with open(param_outfile, "w") as param_file:
                     if auto_span:
                         span = int(total_bases / genome_size * cov_to_span)
-                        print("span\t%s" % str(span), file=param_file)
+                        print("span\t{}".format(span), file=param_file)
                     if auto_dist:
-                        dist = int(np.percentile(read_lengths, dist_read_perc) // 1)
-                        print("read_p%i\t%i" % (dist_read_perc, dist), file=param_file)
-
+                        dist = int(np.percentile(read_lengths, dist_perc))
+                        print("read_p%i\t%i" % (dist_perc, dist), file=param_file)
         else:
             print("long-to-linked: error: long reads must be piped from stdin", file=sys.stderr)
             sys.exit(1)
 
+def get_genome_size(size_string):
+    """Read genome size from a string."""
+    try:
+        return float(size_string)
+    except ValueError:
+        print("long-to-linked: error: improper format for genome size", file=sys.stderr)
+        sys.exit(1)
+
 def main():
     """Parse arguments and cut long reads."""
     parser = argparse.ArgumentParser(description="Segment long reads into short reads and calculate tigmint-long parameters.")
     parser.add_argument("-v", "--version",
                         action="version",
-                        version="long-to-linked 1.2.1")
+                        version="long-to-linked 1.2.2")
     parser.add_argument("-l", "--length",
                         type=int,
                         default=500,
@@ -75,12 +89,12 @@ def main():
     parser.add_argument("-d", "--auto_dist",
                         default=False,
                         action="store_true",
-                        help="Option to calculate read length p5 for dist parameter")
+                        help="Option to calculate read length p5 for dist parameter") 
     parser.add_argument("-g", "--genome_size",
-                        type=int,
-                        default=-1,
-                        help="Genome size (bp) for calculating sequence coverage and \
-                        minimum spanning molecules parameter automatically")
+                        type=str,
+                        default=None,
+                        help="Genome size for calculating sequence coverage and \
+                        minimum spanning molecules parameter (e.g. 3e9)")
     parser.add_argument("-o", "--param_outfile",
                         type=str,
                         default="tigmint-long.params.tsv",
@@ -89,11 +103,14 @@ def main():
     args = parser.parse_args()
 
     if args.auto_span:
-        if args.genome_size == -1:
-            raise ValueError("Genome size (bp) must be provided to calculate span parameter.")
+        if args.genome_size is None:
+            print("long-to-linked: error: genome size must be given to calculate span",
+                file=sys.stderr)
+            sys.exit(1)
+        args.genome_size = get_genome_size(args.genome_size)
 
-    cut_reads(args.length, args.min_size, args.auto_span,
-            args.genome_size, args.param_outfile, args.auto_dist)
+    cut_reads(args.length, args.min_size, args.auto_span, args.genome_size,
+        args.param_outfile, args.auto_dist)
 
 if __name__ == "__main__":
     main()
diff --git a/bin/tigmint-cut b/bin/tigmint-cut
@@ -241,7 +241,7 @@ def get_span(filename):
 
 def main():
     parser = argparse.ArgumentParser(description="Find misassembled regions in assembly using Chromium molecule extents")
-    parser.add_argument("--version", action="version", version="tigmint-cut 1.2.1")
+    parser.add_argument("--version", action="version", version="tigmint-cut 1.2.2")
     parser.add_argument("fasta", type=str, help="Reference genome fasta file (must have FAI index generated)")
     parser.add_argument("bed", type=str, help="Sorted bed file of molecule extents")
     parser.add_argument("-o", "--fastaout", type=str, help="The output FASTA file.", required=True)

diff --git a/bin/tigmint-make b/bin/tigmint-make
@@ -123,7 +123,7 @@ help:
 	@echo 'For more information see https://bcgsc.github.io/tigmint/'
 
 version:
-	@echo "Tigmint 1.2.1"
+	@echo "Tigmint 1.2.2"
 	@echo "Written by Shaun Jackman @sjackman."
 
 all: tigmint arcs
@@ -194,21 +194,21 @@ $(draft).%.sortbx.bam: %.fq.gz $(draft).fa.bwt
 
 # Align cut long reads to the draft genome and output primary alignments sorted by BX tag.
 $(draft).%.cut$(cut).sortbx.bam: %.cut$(cut).fa.gz $(draft).fa
-	minimap2 -y -t$t -ax map-$(longmap) --secondary=no $(draft).fa $< | samtools view -b -u -F4 | samtools sort -@$t -tBX -T$$(mktemp -u -t $@.XXXXXX) -o $@
+	$(gtime) minimap2 -y -t$t -ax map-$(longmap) --secondary=no $(draft).fa $< | samtools view -b -u -F4 | samtools sort -@$t -tBX -T$$(mktemp -u -t $@.XXXXXX) -o $@
 
 # Segment long reads from gzipped fasta file, optionally calculating tigmint-long parameters.
-$(reads).cut$(cut).fa.gz: $(longreads)
+$(reads).cut$(cut).fq.gz: $(longreads)
 ifeq ($(span), auto)
 ifeq ($(dist), auto)
-	$(gzip) -dc $< | $(bin)/long-to-linked -l$(cut) -m$(minsize) -g$(G) -s -d -o $(reads).tigmint-long.params.tsv | $(gzip) > $@
+	$(gtime) $(gzip) -dc $< | $(bin)/long-to-linked -l$(cut) -m$(minsize) -g$(G) -s -d -o $(reads).tigmint-long.params.tsv | $(gzip) > $@
 else
-	$(gzip) -dc $< | $(bin)/long-to-linked -l$(cut) -m$(minsize) -g$(G) -s -o $(reads).tigmint-long.params.tsv | $(gzip) > $@
+	$(gtime) $(gzip) -dc $< | $(bin)/long-to-linked -l$(cut) -m$(minsize) -g$(G) -s -o $(reads).tigmint-long.params.tsv | $(gzip) > $@
 endif
 else
 ifeq ($(dist), auto)
-	$(gzip) -dc $< | $(bin)/long-to-linked -l$(cut) -m$(minsize) -d -o $(reads).tigmint-long.params.tsv | $(gzip) > $@
+	$(gtime) $(gzip) -dc $< | $(bin)/long-to-linked -l$(cut) -m$(minsize) -d -o $(reads).tigmint-long.params.tsv | $(gzip) > $@
 else
-	$(gzip) -dc $< | $(bin)/long-to-linked -l$(cut) -m$(minsize) | $(gzip) > $@
+	$(gtime) $(gzip) -dc $< | $(bin)/long-to-linked -l$(cut) -m$(minsize) | $(gzip) > $@
 endif
 endif
 
@@ -223,7 +223,7 @@ endif
 	$(gtime) $(bin)/tigmint-molecule -a$(as) -n$(nm) -q$(mapq) -d$(dist) -s$(minsize) $< | sort -k1,1 -k2,2n -k3,3n >$@
 
 # Create molecule extents BED using cut long reads
-$(draft).$(reads).cut$(cut).as$(as).nm$(cut).molecule.size$(minsize).bed: $(reads).cut$(cut).fa.gz $(draft).fa
+$(draft).$(reads).cut$(cut).as$(as).nm$(cut).molecule.size$(minsize).bed: $(reads).cut$(cut).fq.gz $(draft).fa
 ifeq ($(dist), auto)
 	$(gtime) minimap2 -y -t$t -as map-$(longmap) --secondary=no $(draft).fa $< | samtools view -b -u -F4 | samtools sort -@$t -tBX -T$$(mktemp -u -t [email protected]) | \
 		$(bin)/tigmint-molecule -a$(as) -n$(cut) -q$(mapq) -s$(minsize) -p $(reads).tigmint-long.params.tsv - | sort -k1,1 -k2,2n -k3,3n > $@

diff --git a/bin/tigmint-molecule b/bin/tigmint-molecule
@@ -243,7 +243,7 @@ class MolecIdentifier:
             "Read a SAM/BAM file and output a TSV file. "
             "The SAM/BAM file must be sorted by BX tag and then by position.")
         parser.add_argument(
-            '--version', action='version', version='tigmint-molecule 1.2.1')
+            '--version', action='version', version='tigmint-molecule 1.2.2')
         parser.add_argument(
             metavar="BAM", dest="in_bam_filename",
             help="Input BAM file sorted by BX tag then position, - for stdin")

diff --git a/pipeline.gv b/pipeline.gv
@@ -16,7 +16,7 @@ digraph {
 	subgraph {
 	    node [width=5]
 
-	    cut [label="long-to-linked\nSegment long reads (FASTA)"]
+	    cut [label="long-to-linked\nSegment long reads (FASTQ)"]
 	    minimap2 [label="Minimap2\nAlign segmented reads to the draft assembly (BAM)"]
 
 	}

diff --git a/pipeline.gv.png b/pipeline.gv.png