4
4
# v1.6 2023-01-20 python3 update and some changes to log output
5
5
6
6
'''
7
- add_taxa_to_align.py v1.6 2023-01-20
7
+ add_taxa_to_align.py v1.6 2023-02-01
8
8
add new taxa to an existing untrimmed alignment
9
9
requires Bio Python library
10
10
get hmmbuild and hmmscan (from hmmer package at http://hmmer.org/)
@@ -206,14 +206,14 @@ def hmmtable_to_seqids(hmmtable, evaluecutoff, bitlencutoff, seqdict, verbose ):
206
206
if verbose :
207
207
print ( "# CANDIDATE {}: EVALUE {}, BITS {}, BpL {}" .format ( targetname , evalue , bitscore , bitsperlen ), file = sys .stderr )
208
208
if targetname in seqids_to_keep : # since multiple domains are allowed, keep highest scoring one
209
- if seqids_to_keep .get (targetname ,0 ) < bitsperlen :
210
- seqids_to_keep [targetname ] = bitsperlen
209
+ if seqids_to_keep .get (targetname ,0 ) < bitscore :
210
+ seqids_to_keep [targetname ] = bitscore
211
211
else :
212
- seqids_to_keep [targetname ] = bitsperlen
213
- # sort IDs by highest bits-per-length
212
+ seqids_to_keep [targetname ] = bitscore
213
+ # sort IDs by value, in this case bitscore
214
214
sorted_ids = [ k for k ,v in sorted (seqids_to_keep .items (), key = lambda x : x [1 ], reverse = True ) ]
215
215
if seqids_to_keep : # meaning if any hits
216
- print ( "# retaining {}/{} seqs from {}, highest bits/len is {:.4f} for {}" .format ( len (seqids_to_keep ) , hitcounter , os .path .basename (hmmtable ), maxbpl , sorted_ids [0 ] ), file = sys .stderr )
216
+ print ( "# retaining {}/{} seqs from {}, highest bits is {:.4f} for {}" .format ( len (seqids_to_keep ) , hitcounter , os .path .basename (hmmtable ), maxbpl , sorted_ids [0 ] ), file = sys .stderr )
217
217
if maxbpl != seqids_to_keep [sorted_ids [0 ]]: # in case best reasonable seq is not the max
218
218
print ( "# WARNING: MAX {} DOES NOT MATCH TOP SEQ {} WITH {}" .format ( maxbpl , sorted_ids [0 ] , seqids_to_keep [sorted_ids [0 ]] ), file = sys .stderr )
219
219
else : # meaning no hits
@@ -308,13 +308,14 @@ def collect_sequences(unalignednewtaxa, alignment, hitlistolists, lengthcutoff,
308
308
speciescounts = defaultdict (int ) # key is species, value is number of written seqs per species
309
309
median = unalign_sequences (unalignednewtaxa , alignment , notrim , calculatemedian = True , removeempty = False )
310
310
with open (unalignednewtaxa ,'a' ) as notaln :
311
+ print ( "###COLLECT:{}" .format (os .path .basename (alignment )), file = sys .stderr )
311
312
# hitlistolists is a list of lists, so that order of species is preserved
312
313
for i ,hitlist in enumerate (hitlistolists ):
313
314
writeout = 0
314
315
if not hitlist :
315
316
print ( "# NO HITS FOR {} IN {}" .format (speciesnames [i ], os .path .basename (alignment ) ), file = sys .stderr )
316
- print >> notaln , ">{}" .format (speciesnames [i ])
317
- continue
317
+ print ( ">{}" .format (speciesnames [i ]), file = notaln )
318
+ continue # no hits so skip to next taxon
318
319
for seqrec in hitlist : # sublist, each item is a SeqRecord object
319
320
old_id = str (seqrec .id )
320
321
if writeout == maxhits : # if already have enough candidates
@@ -323,23 +324,23 @@ def collect_sequences(unalignednewtaxa, alignment, hitlistolists, lengthcutoff,
323
324
if keep_old_ids is False : # should be False by default
324
325
if seqrec .id == speciesnames [i ]: # check if seq was already used, so dict entry was renamed
325
326
print ( "WARNING: REDUNDANT SEQ {} FOR {}" .format (seqrec .name , os .path .basename (alignment ) ), file = sys .stderr )
327
+ seqrec .description = old_id
326
328
seqrec .id = str (speciesnames [i ])
327
- seqrec .description = ""
328
329
print ( "# using seq {} for {}" .format ( old_id , speciesnames [i ] ), file = sys .stderr )
329
330
notaln .write ( seqrec .format ("fasta" ) )
330
331
writeout += 1
331
332
else : # meaning too short
332
333
print ( "# SEQ {} TOO SHORT FOR {} IN {}" .format (seqrec .name , speciesnames [i ], os .path .basename (alignment ) ), file = sys .stderr )
333
334
if writeout == 0 : # all hits missed the cut or had no hits, give a dummy entry
334
- print >> notaln , ">{}" .format (speciesnames [i ])
335
+ print ( ">{}" .format (speciesnames [i ]), file = notaln )
335
336
print ( "# ALL HITS TOO SHORT FOR {} IN {}" .format (speciesnames [i ], os .path .basename (alignment ) ), file = sys .stderr )
336
337
# no return
337
338
338
339
def run_mafft (MAFFT , rawseqsfile , errorlog ):
339
340
'''generate multiple sequence alignment from fasta and return MSA filename'''
340
341
aln_output = "{}.aln" .format (os .path .splitext (rawseqsfile )[0 ] )
341
342
aligner_args = [MAFFT , "--maxiterate" , "1000" , "--localpair" , "--quiet" , rawseqsfile ]
342
- print ( "#TIME {}\n {} > {}" .format (time .asctime (), " " .join (aligner_args ), aln_output ), file = errorlog )
343
+ print ( "### TIME {}\n {} > {}" .format (time .asctime (), " " .join (aligner_args ), aln_output ), file = errorlog )
343
344
with open (aln_output , 'w' ) as msa :
344
345
subprocess .call (aligner_args , stdout = msa )
345
346
print ( "# alignment of {} completed {}" .format (aln_output , time .asctime () ), file = errorlog )
0 commit comments