Skip to content

Commit f315b88

Browse files
authored
Merge pull request #154 from eernst/fix-transcript-ids-munging
Fix for transcript IDs like rice from ensembl plants: Os01t0100500-01
2 parents eefbf10 + 07afad7 commit f315b88

File tree

2 files changed

+17
-9
lines changed

2 files changed

+17
-9
lines changed

cat/filter_transmap.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -387,18 +387,21 @@ def resolve_split_genes(tmp_size_filtered, transcript_gene_map, resolved_df, unf
387387
"""
388388
Use localNearBest algorithm to determine split genes and populate that field
389389
"""
390-
with tools.fileOps.TemporaryFilePath() as local_tmp:
391-
cmd = [['sed', 's/\-[0-9]\+//', tmp_size_filtered], # strip unique identifiers for comparative filters
392-
['pslCDnaFilter', '-localNearBest=0.05',
393-
'-minCover=0.1', '-verbose=0',
394-
'-minSpan=0.2', '/dev/stdin', '/dev/stdout']]
390+
with tools.fileOps.TemporaryFilePath() as local_tmp, tools.fileOps.TemporaryFilePath() as stripped_tmp:
391+
with open(stripped_tmp, 'w') as outf:
392+
for rec in tools.psl.psl_iterator(tmp_size_filtered):
393+
rec.q_name = tools.nameConversions.strip_alignment_numbers(rec.q_name)
394+
tools.fileOps.print_row(outf, rec.psl_string())
395+
cmd = ['pslCDnaFilter', '-localNearBest=0.05',
396+
'-minCover=0.1', '-verbose=0',
397+
'-minSpan=0.2', stripped_tmp, '/dev/stdout']
395398
tools.procOps.run_proc(cmd, stdout=local_tmp)
396399
filtered_alns = list(tools.psl.psl_iterator(local_tmp))
397400

398401
# remove alignments that we didn't resolve
399402
resolved_ids = set(resolved_df.TranscriptId)
400403
filtered_alns = [x for x in filtered_alns if x.q_name in resolved_ids]
401-
grouped = tools.psl.group_alignments_by_qname(filtered_alns)
404+
grouped = tools.psl.group_alignments_by_qname(filtered_alns, strip=False)
402405

403406
# construct the transcript interval for resolved transcripts
404407
tx_intervals = {tx_id: unfiltered_tx_dict[aln_id].interval for
@@ -418,3 +421,4 @@ def resolve_split_genes(tmp_size_filtered, transcript_gene_map, resolved_df, unf
418421
'Number of intra-contig split genes': len(split_gene_data['intra'])}
419422

420423
return merged, split_gene_metrics
424+

tools/psl.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -168,9 +168,13 @@ def get_alignment_dict(psl_file, make_unique=False):
168168
return {psl.q_name: psl for psl in psl_iterator(psl_file, make_unique)}
169169

170170

171-
def group_alignments_by_qname(psl_iterator):
171+
def group_alignments_by_qname(psl_iterator, strip=True):
172172
"""Convenience function for grouping PSLs by their name"""
173173
r = defaultdict(list)
174-
for p in psl_iterator:
175-
r[strip_alignment_numbers(p.q_name)].append(p)
174+
if strip is True:
175+
for p in psl_iterator:
176+
r[strip_alignment_numbers(p.q_name)].append(p)
177+
else:
178+
for p in psl_iterator:
179+
r[p.q_name].append(p)
176180
return r

0 commit comments

Comments
 (0)