Skip to content

Commit 4c41400

Browse files
committed
Bug fix for constructing hints from protein sequences.
NOTE: This update requires that you update your Kent repository to at least commit 5b8e436 and rebuild to get the latest version of `pslCheck`.
1 parent 9c84e48 commit 4c41400

3 files changed

Lines changed: 21 additions & 12 deletions

File tree

cat/__init__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1632,8 +1632,9 @@ def validate(self):
16321632
'package not in global path.'.format(tool))
16331633
if not tools.misc.is_exec('halLiftover'):
16341634
raise ToolMissingException('halLiftover from the halTools package not in global path.')
1635-
if not tools.misc.is_exec('bedtools'):
1636-
raise ToolMissingException('bedtools is required for the homGeneMapping module.')
1635+
for tool in ['bedtools', 'bedSort']:
1636+
if not tools.misc.is_exec(tool):
1637+
raise ToolMissingException('{} is required for the homGeneMapping module.'.format(tool))
16371638

16381639
def requires(self):
16391640
pipeline_args = self.get_pipeline_args()

cat/hgm.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -159,11 +159,17 @@ def extract_exon_hints(hints_db, in_gtf, genome):
159159
cmd = ['bedtools', 'merge', '-i', hints_file, '-c', '4', '-o', 'mean']
160160
tools.procOps.run_proc(cmd, stdout=merged_hints_file, stderr='/dev/null')
161161
# overlap the merged exons with the given GTF, producing a final set.
162-
cmd = [['grep', '\texon\t', in_gtf], # exons only
163-
['cut', '-d', '\t', '-f', '1,4,5'], # slice into BED format
164-
['sort'], # sort to make unique
165-
['uniq'], # make unique to remove duplicate exons
166-
['bedtools', 'intersect', '-a', 'stdin', '-b', merged_hints_file, '-f', '0.8', '-wa', '-wb'],
162+
tmp_bed = tools.fileOps.get_tmp_file()
163+
cmd = [['grep', '-P', '(\texon\t|\tCDS\t)', in_gtf], # exons or CDS only
164+
['cut', '-d', '\t', '-f', '1,4,5']] # slice into BED-like format with GTF intervals
165+
tools.procOps.run_proc(cmd, stdout=tmp_bed)
166+
# sort the BED
167+
tools.procOps.run_proc(['bedSort', tmp_bed, tmp_bed])
168+
# merge the CDS and exon intervals
169+
tmp_merged = tools.fileOps.get_tmp_file()
170+
tools.procOps.run_proc(['bedtools', 'merge', '-i', tmp_bed], stdout=tmp_merged)
171+
# intersect with hints and retain scores
172+
cmd = [['bedtools', 'intersect', '-a', tmp_merged, '-b', merged_hints_file, '-f', '0.8', '-wa', '-wb'],
167173
# bedtools reports both entire A and entire B if at least 80% of A overlaps a B
168174
['cut', '-d', '\t', '-f', '1,2,3,7']] # retain the A positions with the B score
169175
# these BED-like records are actually GFF intervals with 1-based starts and closed intervals

cat/hints_db.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import tools.transcripts
2222
import tools.bio
2323
from exceptions import UserException
24+
from tools.pipeline import ProcException
2425

2526
logger = logging.getLogger(__name__)
2627

@@ -299,8 +300,12 @@ def run_protein_blat(job, protein_subset, genome_fasta_file_id):
299300
tools.bio.write_fasta(outf, name, str(seq))
300301
# perform alignment
301302
tmp_psl = tools.fileOps.get_tmp_toil_file()
302-
cmd = ['blat', '-t=dnax', '-q=prot', '-noHead', genome_fasta, protein_fasta, tmp_psl]
303-
tools.procOps.run_proc(cmd)
303+
cmd = [['blat', '-t=dnax', '-q=prot', '-noHead', genome_fasta, protein_fasta, '/dev/stdout'],
304+
['pslCheck', '-skipInsertCounts', '/dev/stdin', '-pass={}'.format(tmp_psl)]]
305+
try: # we expect pslCheck to fail
306+
tools.procOps.run_proc(cmd, stderr='/dev/null')
307+
except ProcException:
308+
pass
304309
return job.fileStore.writeGlobalFile(tmp_psl)
305310

306311

@@ -320,9 +325,6 @@ def convert_blat_results_to_hints(job, results):
320325
['perl', '-ne', '@f=split; print if ($f[0]>=100)'],
321326
['blat2hints.pl', '--in=/dev/stdin', '--nomult', '--ep_cutoff=5', '--out={}'.format(out_hints)]]
322327
tools.procOps.run_proc(cmd)
323-
# fix the names
324-
cmd = ['sed', '-i', 's/exon/CDS/; s/ep/CDSpart/', out_hints]
325-
tools.procOps.run_proc(cmd)
326328
return job.fileStore.writeGlobalFile(out_hints)
327329

328330

0 commit comments

Comments
 (0)