-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbb.fastareorder
executable file
·1081 lines (949 loc) · 39.1 KB
/
bb.fastareorder
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/perl
#----------------------------------------------------------#
# Author: Douglas Senalik [email protected] #
# http://www.vcru.wisc.edu/simonlab/sdata/software/ #
#----------------------------------------------------------#
# "Black Box" program series
=bb
Rearrange the order of sequences in a FASTA file
=cut bb
use strict;
use warnings;
use Getopt::Long; # for getting command line parameters
# version 1.0 September 3, 2011
# Version 1.1 September 3, 2011 - Add subrange feature to --seq
# Modify to work with scaffolds also
# Version 1.2 August 21, 2014 - Add --spacer flag
# Version 1.3 April 5, 2016 - Add --gff and --keyforgff, now supports .gz compression
# Version 1.4 June 15, 2016 - Update to --lifttable to support --oneseq
# Version 1.5 July 13, 2016 - add --removezeroes, this is no longer default behavior
# Version 1.6 July 13, 2016 - allow sequence name of "0"
# Version 1.7 August 13, 2017 - correction so --removezeroes is truly not default behavior
# Version 1.8 August 14, 2017 - reverse complement bugfix, and suffix changed from ' RC' to 'rc'
my $version = "1.8";
############################################################
# configuration variables
############################################################
my $spacerlen = 20;
my $defaultwrap = 100;
my $defaultglobalprefix = "contig";
my $defaultspacer = 100;
my $fwdflag = '+'; # set to null string if you don't want '+' in the FASTA header
############################################################
# global variables
############################################################
my $ansiup = "\033[1A"; # terminal control
(my $prognopath = $0) =~ s/^.*[\/\\]//;
my %inhdr = (); # FASTA headers without ">", key is header up to first white space
my %inseq = (); # sequence
my %inqhdr = (); # quality headers without ">"
my %inqseq = (); # quality "sequence"
my @originalorder = (); # keys in original order
my $doqual = 0; # 1 if found and read in quality file
my $outqualname = ""; # output quality file name ( if used )
my $currbp = 0; # current number of base pairs written, for $startstop
my %excludehash = (); # contigs to exclude, converted to hash for easy lookup
my %seen; # for --unique
############################################################
# command line parameters
############################################################
my @infilenames = (); # input file name(s)
my $outfilename = ""; # output file name
my $append = 0; # append to existing output file
my $coordinates; # output file for coordinates, where each contig starts and ends
my $lifttable; # make a lift table for later transferring annotation coordinates
my @seq = (); # sequences to keep
my @exclude = (); # sequences to exclude
my $globalprefix = $defaultglobalprefix;
my $help = 0; # print help and exit
my $oneseq; # to concatenate all sequences, and optional header
my $spacer; # N spacer length when using --onesequence
my $gff; # use a gff3 file to obtain sequences and coordinates
my $keyforgff; # use only these features (column#3) from the gff3 file
my $annotate = 0; # extra information in header
my $startstop = 0; # put start and stop coordinates in file
my $noqual = 0; # do not attempt to include qual file
my $blankline = 0; # blank line between sequences in --onesequence mode
my $random = 0; # return this many random contigs
my $trim = 0; # trim header after first white space
my $removezeroes; # remove internal leading zeroes
my $flank = 0; # flanking sequence
my $replace; # for search and replace
my $with; # for search and replace
my $wrap; # wrap all lines to uniform length, negative=one long line, 0=default length, undefined disables
my $makeunique; # all fasta headers must be unique, so if not, append a copy number
my $quiet = 0; # only show errors
my $debug = 0; # print extra debugging information
GetOptions (
"infile=s" => \@infilenames, # string
"outfile=s" => \$outfilename, # string
"coordinates=s" => \$coordinates, # string
"lifttable=s" => \$lifttable, # string
"sequence=s" => \@seq, # string array
"exclude=s" => \@exclude, # string array
"onesequence:s" => \$oneseq, # flag/string
"gff:s" => \$gff, # string(filename)
"keyforgff:s" => \$keyforgff, # string
"spacer:s" => \$spacer, # flag/string
"prefix:s" => \$globalprefix, # flag/string
"annotate" => \$annotate, # flag
"startstop" => \$startstop, # flag
"noqual" => \$noqual, # flag
"blankline" => \$blankline, # flag
"append" => \$append, # flag
"trim" => \$trim, # flag
"flank=i" => \$flank, # integer
"random=i" => \$random, # integer
"removezeroes" => \$removezeroes, # flag
"wrap:i" => \$wrap, # flag/integer
"unique" => \$makeunique, # flag
"replace=s" => \$replace, # string
"with:s" => \$with, # string
"help" => \$help, # flag
"quiet" => \$quiet, # flag
"debug" => \$debug); # flag
unless ( scalar @infilenames ) { $help = 1; }
unless ( $outfilename ) { $help = 1; }
if ( ( ( ( scalar @seq ) + ( scalar @exclude ) + $random ) < 1 ) and ( ! $gff ) )
{ $help = 1; }
if ( ( defined $wrap ) and ( $wrap == 0 ) ) { $wrap = $defaultwrap; }
if ( $outfilename eq '-' ) { $quiet = 1; }
# debug implies not quiet
if ( $debug ) { $quiet = 0; }
############################################################
# print help screen
############################################################
if ( $help )
{
print "$prognopath Version $version
Rearrange the order of sequences in a FASTA file based on
your specified contigs and orientations
Required parameters:
--infile=xxx input FASTA file name(s) with multiple sequences
--outfile=xxx output file name, or \"-\" for stdout
--seq=xxx sequences to keep, multiple allowed, a plus
\"+\" for forward orientation is optional,
or use \"-\" before the : to indicate reverse
complement. Use \"..\" to indicate a range.
Use \",\" to separate entries. Examples:
--seq=contig45 --seq=46,49,-21..23
--seq=+32 --seq=-65 --seq=76-
--seq=00021..45- -seq=45+..47
or use \"s\" for a spacer of $spacerlen Ns
e.g. --seq=00021+,S,45- Follow S by a
different length, if desired, e.g. ...,S100,...
disable or change the $defaultglobalprefix prefix with --prefix
You can also extract a subset of a contig using
colon dividers, specifying start and stop,
e.g. --seq=12:100:200 The subrange is taken
before reverse complementing, if so indicated
--seq=xx:-100 returns the last 100 nt
1:-1 is the whole string, -1:-1 is the last char
The --seq parameter may be omitted if --exclude
--random or --gff are used instead
Optional parameters:
--prefix[=xxx] if --seq uses only a number, this text will be
prefixed to the contig name. Default value = \"$defaultglobalprefix\"
use --prefix without a value to disable
--exclude=xxx use this in place of the --seq parameter to
output all contigs except these. Order will be
unchanged from the original file in this case.
--random=xxx return this many sequences selected at random
and placed in random order
--gff=xxx instead of using --seq, use this gff3 format file to
obtain sequences and coordinates
--keyforgff=xxx use only lines with this gff3 feature (column #3)
e.g. \"gene\" or \"rRNA\"
--coordinates=xxx create this additional output file, which has
the starting and ending position of each contig
--lifttable=xxx create a lift table compatible with --bb.liftgff,
only useful if using --seq with subranges
--onesequence[=xxx] concatenate all sequences into one. If a value
is specified, that is used as the header.
use --trim to inhibit supplemental info
--removezeroes remove internal leading zeroes in sequence name,
e.g. contig00001 is renamed to contig1
--spacer[=xxx] when using --onesequence, add a spacer of $defaultspacer N's
between every sequence, or other length as specified
--annotate if --onesequence used, put extra information in
header: sequence lengths
--startstop add starting and ending base position
to FASTA headers
--blankline for --onesequence mode, put a blank line
between each sequence
--trim trim header after first white space
--flank when using subrange, add this much flanking sequence
on each side
--wrap[=xxx] wrap sequence lines to this length, if value not
specified, use $defaultwrap; any negative value
returns a single line of sequence
--append append to existing --outfile
--noqual if a .qual file is present, a corresponding
output .qual file is created. This flag
turns off this quality file processing
--unique insure that all FASTA headers are unique by
appending a number if seen more than once
--replace=xxx search and replace regex
--with[=xxx] replace string, or undefined for delete
--help print this screen
--quiet only print error messages
--debug print extra debugging information
";
exit 1;
} # if ( $help )
############################################################
# expand ranges in specified sequences
############################################################
# load any --seq from the -gff parameter
if ( $gff )
{
my @gffdata = loadtsv( $gff );
my @gffseq = convertgfftoseq( \@gffdata, $keyforgff );
push( @seq, @gffseq );
}
@seq = expandseq ( "--seq", @seq );
@exclude = expandseq ( "--exclude", @exclude );
# convert excludes to a hash for easy lookup
foreach my $anexclude ( @exclude )
{
$anexclude =~ s/^\-//g; # remove directional information for excludes
$excludehash{$anexclude} = 1;
} # foreach my $anexclude ( @exclude )
############################################################
# read input file into hash
############################################################
foreach my $infilename ( @infilenames )
{
debugmsg ( "Reading input file \"$infilename\"" );
my $currcontig = 0;
my $nlines = 0;
my $nheaders = 0;
my $INF = stdopen ( "<", $infilename );
my $currkey;
while ( my $aline = <$INF> )
{
$nlines++;
$aline =~ s/[\r]//g; # remove DOS returns
if ( $aline =~ m/^>(.*)$/ )
{
my $hdr = $1;
$nheaders++;
$hdr =~ s/[\s\n]+$//g; # header remove newlines and trailing white space
if ( $trim ) { $hdr =~ s/\s.*//; }
( $currkey = $hdr ) =~ s/\s.*$//; # header up to first white space
# remove internal leading zeros in key, e.g. contig00001 becomes contig1
if ( $removezeroes )
{ $currkey =~ s/^([^\d]*)0+([^0])/$1$2/; }
$inhdr{$currkey} = $hdr; # ">" has been removed
push ( @originalorder, $currkey );
if ( ( scalar keys %inhdr ) <= 10 ) { debugmsg ( "Reading in sequence with key \"$currkey\", header \"$hdr\"" ); }
}
else
{
unless ( defined $currkey ) { die "Error, no current key, maybe this is not a fasta file. line $nlines=\"$aline\"\n"; }
$inseq{$currkey} .= $aline; # retains newlines
}
} # while <$INF>
stdclose ( $INF );
unless ( $quiet )
{ print "Input file \"$infilename\" read, " . commify($nlines) . " lines, " . commify($nheaders) . " sequences\n"; }
############################################################
# read quality file into array
############################################################
unless ( $noqual )
{
my $inqualname = qualname( $infilename );
if ( -e $inqualname )
{
debugmsg ( "Reading input quality file \"$inqualname\"" );
my $currcontig = 0;
my $nlines = 0;
my $nheaders = 0;
my $contigindex = 0;
my $INF = stdopen ( "<", $inqualname, "quality" );
while ( my $aline = <$INF> )
{
$nlines++;
$aline =~ s/[\r]//g; # remove DOS returns
if ( $aline =~ m/^>(.*)$/ )
{
my $hdr = $1;
$contigindex++;
$nheaders++;
$hdr =~ s/[\s\n]+$//g; # header remove newlines and trailing white space
if ( $trim ) { $hdr =~ s/\s.*//; }
( $currkey = $hdr ) =~ s/\s.*$//; # header up to first white space
# remove internal leading zeros in key, e.g. contig00001 becomes contig1
$currkey =~ s/^([^\d]*)0+([^0])/$1$2/;
$inqhdr{$currkey} = $hdr; # ">" has been removed
if ( ( scalar keys %inqhdr ) <= 10 ) { debugmsg ( "Reading in quality with key \"$currkey\", header \"$hdr\"" ); }
}
else
{
$inqseq{$currkey} .= $aline; # retains newlines
}
} # while <$INF>
stdclose ( $INF );
unless ( $quiet )
{ print "Quality file read, " . commify($nlines) . " lines, " . commify($nheaders) . " sequences\n"; }
$doqual = 1;
} # if quality file found
else # no quality file found
{
debugmsg( "No corresponding quality file \"$inqualname\" found" );
} # else no quality file found
} # unless ( $noqual )
} # foreach $infilename
############################################################
# random mode, create list of contigs
############################################################
if ( $random )
{
# # create a new list of just defined contig indices ( some contig numbers may be blank )
# my @definedcontigs = ();
# for ( my $i=0; $i<=$#inhdr; $i++ )
# {
# if ( defined($inhdr[$i]) )
# { push ( @definedcontigs, $i ) }
# } # for $i
# test to prevent getting more random contigs than are available
if ( ( scalar keys %inhdr ) < $random )
{ die "Error, requested more random contigs (".commify($random).") than are in the input file (".commify(scalar keys %inhdr).")\n"; }
# pick a contig at random until we have enough
my @allkeys = keys %inhdr;
for my $i ( 1 .. $random )
{
# select a random contig and save it to the @seq list
my $randomindex = int(rand(@allkeys));
my $key = $allkeys[$randomindex];
debugmsg ( "Selecting random contig \"$key\"" );
push ( @seq, $key );
# remove selected contig from array to prevent re-use
splice ( @allkeys, $randomindex, 1 );
} # for $i
} # if ( $random )
############################################################
# create output file
############################################################
{
debugmsg ( "Creating output FASTA file \"$outfilename\"" );
my $outfdata = "";
my $outqdata = "";
my @coordinatesdata = ();
my $currposition = 1;
my $nseq = 0;
my $nlines = 0;
my $OUTF = stdopen ( $append?">>":">", $outfilename );
my $OUTQ;
if ( $doqual )
{
$outqualname = qualname( $outfilename );
debugmsg ( "Creating output quality file \"$outqualname\"" );
$OUTQ = stdopen ( $append?">>":">", $outqualname, "quality" );
}
############################################################
# --seq mode
############################################################
my $LFT;
if ( $lifttable )
{ $LFT = stdopen( ">", $lifttable ); }
for my $i ( 0 .. $#seq )
{
my $aseqnum = $seq[$i];
# save full string for --annotation
my $ann = $aseqnum;
# remove subrange values if present
my @range = split ( /:/, $aseqnum );
$aseqnum = $range[0];
# strip off reverse complement flag if present
my $rc = 0;
if ( ( $aseqnum =~ m/^\-/ ) or ( $aseqnum =~ m/\-$/ ) ) { $rc = 1; }
$aseqnum =~ s/[\-\+]$//g;
$aseqnum =~ s/^[\-\+]//g;
# check for special spacer
unless ( $aseqnum =~ m/^S\d*$/i )
{
# check validity of number
unless ( defined $inseq{$aseqnum} )
{ die "Error, no such sequence \"$aseqnum\"\n"; }
# skip this one if it is on the exclude list
if ( $excludehash{$aseqnum} )
{
debugmsg ( "Specified sequence \"$aseqnum\" is on the exclude list, skipping it" );
next;
}
}
# get relevant sequence
my $aseq;
my $aqual = "";
if ( $aseqnum =~ m/^S(\d*)$/i ) # generate spacer
{
my $Nlen = $1;
unless ( $Nlen ) { $Nlen = $spacerlen; }
$aseq = "N"x$Nlen;
if ( $doqual ) { $aqual = "00 "x$Nlen; } # not tested this may be wrong @@@
$inhdr{$aseqnum} = $aseqnum; # for --lifttable
}
else
{
$aseq = $inseq{$aseqnum};
if ( $doqual ) { $aqual = $inqseq{$aseqnum}; }
}
# calculate sequence length for --startstop and for statistics line at end
my $seqlen = 0;
my $bseq = $aseq;
$bseq =~ s/[^A-Za-z]//g;
$seqlen = length ( $bseq );
# trim sequence and quality if subrange was specified
if ( ( $range[1] ) or ( $range[2] ) )
{
# validity checking
if ( ( $range[1] ) and ( $range[1] !~ m/^[\d\-]*$/ ) ) { die "Error, invalid character in subrange \"$range[1]\" for sequence $aseqnum is > sequence length $seqlen\n"; }
if ( ( $range[2] ) and ( $range[2] !~ m/^[\d\-]*$/ ) ) { die "Error, invalid character in subrange \"$range[2]\" for sequence $aseqnum is > sequence length $seqlen\n"; }
# e.g. 1:-1 is the whole string, -1:-1 is the last 1 character
unless ( $range[1] ) { $range[1] = 1; }
if ( $range[1] < 0 ) # i.e. right trim when negative start point
{ $range[1] = $seqlen + $range[1] + 1; } # remember that $range[1] is negative here
unless ( $range[2] ) { $range[2] = $seqlen; }
if ( $range[2] < 0 ) # i.e. right trim when negative start point
{ $range[2] = $seqlen + $range[2] + 1; } # remember that $range[2] is negative here
# range checking
if ( $range[1] > $seqlen ) { die "Error, start coordinate $range[1] for sequence $aseqnum is > sequence length $seqlen\n"; }
if ( $range[2] > $seqlen ) { die "Error, end coordinate $range[2] for sequence $aseqnum is > sequence length $seqlen\n"; }
# flanking sequence
if ( $flank )
{
$range[1] -= $flank;
if ( $range[1] < 1 ) { $range[1] = 1; }
$range[2] += $flank;
if ( $range[2] > $seqlen ) { $range[2] = $seqlen; }
}
$seqlen = ( $range[2] - $range[1] + 1 );
my $newseq = "";
my $i = 0;
while ( $aseq =~ m/([A-Za-z][^A-Za-z]*)/g )
{
$i++;
if ( $i >= $range[1] )
{
if ( $i <= $range[2] )
{ $newseq .= $1; }
else
{ last; }
}
}
$aseq = $newseq;
my $newqual = "";
$i = 0;
if ( $doqual )
{
while ( $aqual =~ m/(\d{2}[^\d]*)/g )
{
$i++;
if ( $i >= $range[1] )
{
if ( $i <= $range[2] )
{ $newseq .= $1; }
else
{ last; }
}
}
$aqual = $newqual;
} # if ( $doqual )
} # if subrange was specified
# annotation
$ann .= "(" . ($seqlen) . 'nt@' . ($currposition) . "-" . ($currposition+$seqlen-1) . ")";
debugmsg ( "Adding sequence \"$aseqnum\" " . ($rc?"=ReverseComplement":"") . ", " . commify($seqlen) . " nt" );
# strip leading and trailing white space from sequence
$aseq =~ s/^\s+//;
$aseq =~ s/\s+$//;
if ( $doqual )
{
$aqual =~ s/^\s+//;
$aqual =~ s/\s+$//;
}
# reverse complement it if appropriate
if ( $rc )
{
$aseq = revcomp ( $aseq );
if ( $doqual ) { $aqual = qualrev ( $aqual ); }
}
# print header to ouput ( unless concatenating )
$nseq++;
my $thishdr;
unless ( defined $oneseq )
{
if ( $aseqnum !~ m/^S\d*$/i ) # skip spacer for multi-sequence mode
{
unless ( defined $inhdr{$aseqnum} ) { die "Error, undefined header for sequence number \"$aseqnum\", sequence index $nseq\n"; }
$thishdr = ">" . replacewith($inhdr{$aseqnum});
if ( $makeunique )
{
$seen{$inhdr{$aseqnum}}++;
if ( $seen{$inhdr{$aseqnum}} > 1 )
{ $thishdr .= "." . $seen{$inhdr{$aseqnum}}; }
}
if ( $startstop )
{ $thishdr .= " " . ($currbp+1) . ".." . ($currbp+$seqlen); }
# append "rc" to end of FASTA header if we reverse complemented it
if ( $rc )
{ $thishdr .= "rc"; }
$thishdr .= "\n";
$outfdata .= $thishdr;
$nlines++;
if ( $doqual )
{ $outqdata .= $thishdr; }
}
}
# optional lift table
if ( $lifttable )
{
my $shortouthdr;
if ( $oneseq )
{ $shortouthdr = $oneseq; }
else
{ ( $shortouthdr = $thishdr ) =~ s/ .*//; }
( my $shortinhdr = $inhdr{$aseqnum} ) =~ s/ .*//;
my $s = ( $range[1] || 1 );
my $e = ( $range[2] || $seqlen );
if ( $rc ) { ( $s, $e ) = ( $e, $s ); }
print $LFT join( "\t", $shortouthdr, $currbp+1, $currbp+$seqlen,
$shortinhdr, $s, $e ), "\n";
}
if ( ( $startstop ) or ( $lifttable ) ) { $currbp += $seqlen; }
# store contig coordinates
if ( $coordinates )
{
push ( @coordinatesdata, join ( "\t", $aseqnum, $rc?"-":"+", $currposition, ( $currposition+$seqlen-1 ) ) );
}
$currposition += $seqlen;
# print sequence to output, and follow with a return
$outfdata .= $aseq . "\n";
# optional spacer for --onesequence mode for all but last sequence
if ( ( $spacer ) and ( $i < $#seq ) )
{ $outfdata .= 'N' x $spacer . "\n"; }
if ( $blankline ) { $outfdata .= "\n"; }
$nlines += ( $aseq =~ tr/\n// + 1 );
if ( $doqual )
{
$outqdata .= $aqual . "\n";
if ( $blankline ) { $outqdata .= "\n"; }
}
if ( $annotate )
{ $aseqnum = $ann }
} # for my $i ( 0 .. $#seq )
if ( $LFT ) { stdclose( $LFT ); }
############################################################
# --exclude mode
############################################################
if ( ( scalar @exclude ) and ( ! scalar @seq ) ) # if both --seq and --exclude were specified, we are done already
{
# loop through all input sequences in their original order
foreach my $key ( @originalorder )
{
# skip undefined sequences
next unless ( defined $inhdr{$key} );
# skip if on the exclude list
if ( $excludehash{$key} )
{
debugmsg ( "Sequence \"$key\" is on the exclude list, skipping it" );
next;
}
# get relevant sequence
my $aseq = $inseq{$key};
my $aqual = "";
if ( $doqual ) { $aqual = $inqseq{$key}; }
debugmsg ( "Adding sequence $key, " . commify(length($aseq)) . " nt" );
# strip leading and trailing white space from sequence
$aseq =~ s/^\s+//;
$aseq =~ s/\s+$//;
if ( $doqual )
{
$aqual =~ s/^\s+//;
$aqual =~ s/\s+$//;
}
# calculate sequence length for --startstop and for statistics line at end
my $seqlen = 0;
my $bseq = $aseq;
$bseq =~ s/[^A-Za-z]//g;
$seqlen = length ( $bseq );
if ( $coordinates )
{
push ( @coordinatesdata, join ( "\t", $key, "+", $currposition, ( $currposition+$seqlen-1 ) ) );
}
$currposition += $seqlen;
# print header to ouput ( unless concatenating )
$nseq++;
unless ( defined $oneseq )
{
$outfdata .= ">" . replacewith($inhdr{$key});
if ( $startstop )
{ $outfdata .= " " . ($currbp+1) . ".." . ($currbp+$seqlen); }
$outfdata .= "\n";
$nlines++;
if ( $doqual )
{
$outqdata .= ">" . replacewith($inqhdr{$key});
if ( $startstop )
{ $outqdata .= " " . ($currbp+1) . ".." . ($currbp+$seqlen); }
$outqdata .= "\n";
}
}
if ( $startstop ) { $currbp += $seqlen; }
# print sequence to output, and follow with a return
$outfdata .= $aseq . "\n";
if ( $blankline ) { $outfdata .= "\n"; }
$nlines += ( $aseq =~ tr/\n// + 1 );
if ( $doqual )
{
$outqdata .= $aqual . "\n";
if ( $blankline ) { $outqdata .= "\n"; }
}
} # for $key
} # if ( ( scalar @exclude ) and ( ! scalar @seq ) )
############################################################
# if --oneseq mode, print header now
############################################################
if ( defined $oneseq )
{
my $header = "";
if ( $oneseq )
{ $header = $oneseq; }
else
{ $header = "concatenated"; }
unless ( $trim )
{ $header .= " ".scalar(@seq)." sequences: ".join (",", @seq); }
print $OUTF ">", $header, "\n";
if ( $doqual ) { print $OUTQ ">", $header, "\n"; }
}
############################################################
# print saved data to output file
############################################################
print $OUTF wrapper($outfdata);
stdclose ( $OUTF );
unless ( $quiet )
{
print "Output file created, " . commify($nlines) . " lines, " . commify($nseq) . " sequences, ".commify($currposition-1)." nt\n";
if ( $annotate ) { debugmsg ( "++ " . join ( "\n++ ", @seq ) ); }
}
if ( $doqual )
{
print $OUTQ wrapper($outqdata);
stdclose ( $OUTQ );
}
if ( $coordinates )
{
debugmsg ( "Storing coordinates data to file \"$coordinates\"" );
my $OUTC = stdopen ( ">", $coordinates );
print $OUTC join ( "\n", @coordinatesdata ), "\n";
stdclose ( $OUTC );
unless ( $quiet )
{ print "Coordinates file created, " . commify(scalar @coordinatesdata) . " lines\n"; }
} # if ( $coordinates )
}
exit 0;
############################################################
sub wrapper { my ( $sequence ) = @_;
############################################################
if ( $wrap )
{
my $wrappedseq = "";
my $currseq = "";
foreach my $aline ( split ( /\n/, $sequence ) )
{
if ( $aline =~ m/^>/ )
{
if ( $wrap > 0 )
{ $currseq =~ s/(.{$wrap})/$1\n/g; }
else
{ $currseq =~ s/[\r\n]//g; }
if ( $currseq )
{
$wrappedseq .= $currseq;
unless ( $wrappedseq =~ m/\n$/ ) { $wrappedseq .= "\n"; }
}
$currseq = "";
$wrappedseq .= $aline . "\n"; # header line
}
else
{
if ( $wrap ) { $aline =~ s/\s//g; } # remove extra white space in any wrap mode
$currseq .= $aline;
}
} # foreach $aline
if ( $currseq ) # last sequence in file
{
if ( $wrap > 0 )
{ $currseq =~ s/(.{$wrap})/$1\n/g; }
else
{ $currseq =~ s/[\r\n]//g; }
$wrappedseq .= $currseq;
unless ( $wrappedseq =~ m/\n$/ ) { $wrappedseq .= "\n"; }
}
return $wrappedseq;
}
else
{ return $sequence; }
} # sub wrapper
############################################################
sub loadtsv { my ( $infilename, $delimiter ) = @_;
############################################################
# load a tab-delimited text file and return as a 2d array
# global variable $quiet controls verbosity
my @data;
$delimiter //= "\t"; # defaults to tab delimited
unless ( $quiet ) { print "Loading \"$infilename\" ".timestr()."\n"; }
my $INF = stdopen( "<", $infilename );
my $nlines = 0;
my $ncells = 0;
while ( my $aline = <$INF> )
{
$nlines++;
$aline =~ s/[\r\n]//g;
my @cols = split( /$delimiter/, $aline );
push( @data, \@cols );
$ncells += scalar( @cols );
}
stdclose( $INF );
unless ( $quiet )
{ print "Loaded ".commify($nlines)." lines, ".commify($ncells)." cells\n"; }
return( @data );
} # sub loadtsv
############################################################
sub convertgfftoseq { my ( $dataref, $col3match ) = @_;
############################################################
# converts a gff3 file loaded with loadtsv() into a structure
# to replicate the --seq parameter information
my @seqinfo;
my $nqueries = 0;
foreach my $rowref ( @$dataref )
{
next unless ( $rowref->[0] );
next if ( $rowref->[0] =~ m/^#/ );
next if ( ( $col3match ) and ( $rowref->[2] ne $col3match ) );
my $query = $rowref->[0];
if ( $rowref->[6] eq '-' ) { $query .= '-'; }
$query .= ':' . $rowref->[3] . ':' . $rowref->[4];
push( @seqinfo, $query );
debugmsg( "gff query \"$query\" added" );
$nqueries++;
}
unless ( $quiet ) { print "Loaded ".commify($nqueries)." sequence queries from gff file\n"; }
return @seqinfo;
} # sub convertgfftoseq
############################################################
sub replacewith { my ( $text ) = @_;
############################################################
if ( defined $replace )
{
if ( length($replace) == 0 )
{ die "Error, --replace string is zero length\n"; }
if ( length($with//"") )
{ $text =~ s/$replace/$with/; }
else # delete
{ $text =~ s/$replace//; }
}
return $text;
} # sub replacewith
############################################################
sub expandseq { my ( $debugtxt, @unexpanded ) = @_;
############################################################
# expand --seq or --exclude parameter ranges
# After parsing, all entries will be an optional "-" followed
# by contig number with leading zeroes stripped off
# e.g. 456 or -678
# if a subrange was indicated, that is still present unchanged
my @expanded = ();
my $index = 0; # for debug message
foreach my $aseq ( @unexpanded )
{
$index++;
if ( $index <= 10 ) { debugmsg ( "Parsing sequence \"$aseq\"" ); }
# split at commas or white space
foreach my $asubseq ( split ( /[,\s]+/, $aseq ) )
{
# skip null items
next unless ( $asubseq );
# remove subrange info and store separately
my @range = split ( /:/, $asubseq );
$asubseq = $range[0];
my $subrange = "";
if ( defined $range[1] ) { $subrange .= ":" . $range[1]; }
if ( defined $range[2] ) { $subrange .= ":" . $range[2]; }
# save reverse complement flag
my $revcomp = $fwdflag;
if ( ( $asubseq =~ m/^\-/ ) or ( $asubseq =~ m/\-$/ ) ) { $revcomp = '-'; }
# remove "-" or "+"
$asubseq =~ s/[\-\+]$//g;
$asubseq =~ s/^[\-\+]//g;
# expand ranges
if ( $asubseq =~ m/^(.*)(\d+)\.\.([^:]*)(\d+)/ )
{
my $start = $2;
my $end = $4;
if ( $removezeroes )
{
$start =~ s/^0+//;
$end =~ s/^0+//;
}
my $prefix = $1;
unless ( $prefix ) { $prefix = $globalprefix; }
for ( my $i=$start; $i<=$end; $i++ )
{
my $fullid = $revcomp.$prefix.$i.$subrange;
push ( @expanded, $fullid );
if ( $index <= 10 ) { debugmsg ( " Saving expanded sequence \"$fullid\"" ); }
}
}
elsif ( $asubseq =~ m/^S$/i )
{
push ( @expanded, $asubseq );
if ( $index <= 10 ) { debugmsg ( " Saving single spacer \"$asubseq\"" ); }
}
else # single contig
{
my $prefix = "";
my $number = "";
if ( $asubseq =~ m/^([^\d]*)(\d+)$/ )
{
$prefix = $1;
$number = $2;
}
else
{ $prefix = $asubseq; }
unless ( $prefix ) { $prefix = $globalprefix; }
# trim leading zeroes
if ( $removezeroes )
{ $number =~ s/^0+([1-9])/$1/; }
my $fullid = $revcomp.$prefix.$number.$subrange;
debugmsg ( "Fullid = \"$fullid\"" );
push ( @expanded, $fullid );
if ( $index <= 10 ) { debugmsg ( " Saving single contig \"$fullid\"" ); }
}
} # foreach my $asubseq
} # foreach my $aseq ( @unexpanded )
debugmsg ( "Original $debugtxt array contained ".commify(scalar(@unexpanded))." items, after expansion it contains ".commify(scalar(@expanded))." items" );
return @expanded;
} # sub expandseq
############################################################
sub debugmsg { my ( $text, $noreturn, $nolinenum ) = @_;
############################################################
if ( $debug )
{
my ($package, $filename, $line, $sub) = caller(0);
unless ( $nolinenum ) { $text = "Line $line: " . $text; }
if ( ! ( $noreturn ) ) { $text .= "\n"; }
print $text;
} # if ( $debug )
} # sub debugmsg
###############################################################
sub timestr {
###############################################################
@_ = localtime(shift || time);
return(sprintf("%04d/%02d/%02d %02d:%02d", $_[5]+1900, $_[4]+1, $_[3], @_[2,1]));
} # sub timestr
###############################################################
sub commify {
###############################################################
# http://perldoc.perl.org/perlfaq5.html#How-can-I-output-my-numbers-with-commas
local $_ = shift;
1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
return $_;
} # commify
############################################################
sub qualname { my ( $fastaname ) = @_;
############################################################
# convert a FASTA file name to the corresponding quality file name
my $qualname = $fastaname;
$qualname =~ s/\.[^\.]+$//; # remove existing extension whatever it is
$qualname .= ".qual"; # replace with ".qual";
} # sub qualname
############################################################
sub revcomp { my ( $dna ) = @_;
############################################################
# standard DNA reverse complement, including degenerate bases
my $revcomp = reverse ( $dna );
$revcomp =~ tr/AaCcTtGgMmRrYyKkVvHhDdBb/TtGgAaCcKkYyRrMmBbDdHhVv/;
return $revcomp;
} # sub revcomp
############################################################
sub qualrev { my ( $qual ) = @_;
############################################################
# reverse quality score order, retain returns in string
$qual =~ s/\n/ \n /g; # space on both sides of return so we can retain them
# convert to array, some elements will be returns, and reverse the order
my @qualarr = reverse( split ( /[ \t]+/, $qual ) );
# convert back to string, space between values
$qual = join ( " ", @qualarr );
# remove extra spaces around returns that we introduced earlier
$qual =~ s/ *\n *//g;
return $qual
} # sub revcomp