Skip to content

Commit d0e034b

Browse files
committed
python3 bugfix
1 parent 0eab9c4 commit d0e034b

File tree

1 file changed

+12
-11
lines changed

1 file changed

+12
-11
lines changed

add_taxa_to_align.py

Lines changed: 12 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
# v1.6 2023-01-20 python3 update and some changes to log output
55

66
'''
7-
add_taxa_to_align.py v1.6 2023-01-20
7+
add_taxa_to_align.py v1.6 2023-02-01
88
add new taxa to an existing untrimmed alignment
99
requires Bio Python library
1010
get hmmbuild and hmmscan (from hmmer package at http://hmmer.org/)
@@ -206,14 +206,14 @@ def hmmtable_to_seqids(hmmtable, evaluecutoff, bitlencutoff, seqdict, verbose ):
206206
if verbose:
207207
print( "# CANDIDATE {}: EVALUE {}, BITS {}, BpL {}".format( targetname, evalue, bitscore, bitsperlen ), file=sys.stderr )
208208
if targetname in seqids_to_keep: # since multiple domains are allowed, keep highest scoring one
209-
if seqids_to_keep.get(targetname,0) < bitsperlen:
210-
seqids_to_keep[targetname] = bitsperlen
209+
if seqids_to_keep.get(targetname,0) < bitscore:
210+
seqids_to_keep[targetname] = bitscore
211211
else:
212-
seqids_to_keep[targetname] = bitsperlen
213-
# sort IDs by highest bits-per-length
212+
seqids_to_keep[targetname] = bitscore
213+
# sort IDs by value, in this case bitscore
214214
sorted_ids = [ k for k,v in sorted(seqids_to_keep.items(), key=lambda x: x[1], reverse=True ) ]
215215
if seqids_to_keep: # meaning if any hits
216-
print( "# retaining {}/{} seqs from {}, highest bits/len is {:.4f} for {}".format( len(seqids_to_keep) , hitcounter, os.path.basename(hmmtable), maxbpl, sorted_ids[0] ), file=sys.stderr )
216+
print( "# retaining {}/{} seqs from {}, highest bits is {:.4f} for {}".format( len(seqids_to_keep) , hitcounter, os.path.basename(hmmtable), maxbpl, sorted_ids[0] ), file=sys.stderr )
217217
if maxbpl != seqids_to_keep[sorted_ids[0]]: # in case best reasonable seq is not the max
218218
print( "# WARNING: MAX {} DOES NOT MATCH TOP SEQ {} WITH {}".format( maxbpl, sorted_ids[0] , seqids_to_keep[sorted_ids[0]] ), file=sys.stderr )
219219
else: # meaning no hits
@@ -308,13 +308,14 @@ def collect_sequences(unalignednewtaxa, alignment, hitlistolists, lengthcutoff,
308308
speciescounts = defaultdict(int) # key is species, value is number of written seqs per species
309309
median = unalign_sequences(unalignednewtaxa, alignment, notrim, calculatemedian=True, removeempty=False)
310310
with open(unalignednewtaxa,'a') as notaln:
311+
print( "###COLLECT:{}".format(os.path.basename(alignment)), file=sys.stderr )
311312
# hitlistolists is a list of lists, so that order of species is preserved
312313
for i,hitlist in enumerate(hitlistolists):
313314
writeout = 0
314315
if not hitlist:
315316
print( "# NO HITS FOR {} IN {}".format(speciesnames[i], os.path.basename(alignment) ), file=sys.stderr )
316-
print >> notaln, ">{}".format(speciesnames[i])
317-
continue
317+
print( ">{}".format(speciesnames[i]), file=notaln)
318+
continue # no hits so skip to next taxon
318319
for seqrec in hitlist: # sublist, each item is a SeqRecord object
319320
old_id = str(seqrec.id)
320321
if writeout==maxhits: # if already have enough candidates
@@ -323,23 +324,23 @@ def collect_sequences(unalignednewtaxa, alignment, hitlistolists, lengthcutoff,
323324
if keep_old_ids is False: # should be False by default
324325
if seqrec.id==speciesnames[i]: # check if seq was already used, so dict entry was renamed
325326
print( "WARNING: REDUNDANT SEQ {} FOR {}".format(seqrec.name, os.path.basename(alignment) ), file=sys.stderr )
327+
seqrec.description = old_id
326328
seqrec.id = str(speciesnames[i])
327-
seqrec.description = ""
328329
print( "# using seq {} for {}".format( old_id, speciesnames[i] ), file=sys.stderr )
329330
notaln.write( seqrec.format("fasta") )
330331
writeout += 1
331332
else: # meaning too short
332333
print( "# SEQ {} TOO SHORT FOR {} IN {}".format(seqrec.name, speciesnames[i], os.path.basename(alignment) ), file=sys.stderr )
333334
if writeout==0: # all hits missed the cut or had no hits, give a dummy entry
334-
print >> notaln, ">{}".format(speciesnames[i])
335+
print( ">{}".format(speciesnames[i]), file=notaln)
335336
print( "# ALL HITS TOO SHORT FOR {} IN {}".format(speciesnames[i], os.path.basename(alignment) ), file=sys.stderr )
336337
# no return
337338

338339
def run_mafft(MAFFT, rawseqsfile, errorlog):
339340
'''generate multiple sequence alignment from fasta and return MSA filename'''
340341
aln_output = "{}.aln".format(os.path.splitext(rawseqsfile)[0] )
341342
aligner_args = [MAFFT, "--maxiterate", "1000", "--localpair", "--quiet", rawseqsfile]
342-
print( "#TIME {}\n{} > {}".format(time.asctime(), " ".join(aligner_args), aln_output ), file=errorlog )
343+
print( "###TIME {}\n{} > {}".format(time.asctime(), " ".join(aligner_args), aln_output ), file=errorlog )
343344
with open(aln_output, 'w') as msa:
344345
subprocess.call(aligner_args, stdout=msa)
345346
print( "# alignment of {} completed {}".format(aln_output, time.asctime() ), file=errorlog )

0 commit comments

Comments
 (0)