-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathbb.454contignet
More file actions
executable file
·1538 lines (1350 loc) · 64.2 KB
/
bb.454contignet
File metadata and controls
executable file
·1538 lines (1350 loc) · 64.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/perl
#-------------------------------------------------------------------------#
# Author: Douglas Senalik dsenalik@wisc.edu #
# http://www.vcru.wisc.edu/simonlab/sdata/software/index.html#contignet #
# Modified by: Simon Gladman simon.gladman@csiro.au #
# 2011 #
#-------------------------------------------------------------------------#
# "Black Box" program series
=bb
Create a network of all interconnected 454 contigs
=cut bb
use strict;
use warnings;
use Getopt::Long; # for getting command line parameters
# 1.0.1 - Dec 29, 2010
# 1.0.2 - Feb 15, 2011 - removed extra line that prevented count of nodes from working
# 1.0.3 - Mar 12, 2011 - Support paired end data output files, add "@" prefix for list files,
# add --nospline and --overlapmode parameters
# 1.0.3-Simon - May 18, 2011 - Added pseudo-links formed by paired end information instead of just
# newbler links
# 1.0.4 - September 2, 2011
# Allow "+" or "-" in contig numbers, but it is ignored
# Add parameter to specify output image type
# Correct error in --tag help description
# Add --label as a synonym for --tag
# Fix bug in dead end and recursion limit labeling
# 1.0.5 - October 18, 2011
# Allow keeping graphviz command file
# change default overlap mode to "false" (it was "none" before)
# to have same default output with newest version of neato
# make scaffold connections optional, add --scaffold to use scaffold connections
# 1.0.6 - November 4, 2011
# Filter out null contigs, excludes, etc. from command line
# Optional ABySS-Explorer output file added
# 1.0.7 - May 4, 2012
# Text output data file is now sorted by contig number
# change the --scaffold parameter to --pairlinks,
# since I may use --scaffold in the future for the actual scaffold section
# Show paired end links and labels ( --pairlinks ) in a different color
# Add support for flowthrough links ( --flowthrough ) in a third color
# Add support for flowbetween links ( --flowbetween ) in a fourth color
# Add paired end support to ABySS-Explorer ouput, and use exclusively the
# new ABySS-Explorer 1.3.0 .dot format
# 1.0.8 - August 3, 2013
# Error message if contig number does not exist in the assembly
# Add --nondirectdashed
my $version = "1.0.8";
############################################################
# configuration variables
############################################################
my $bpabbreviation = "nt"; # set to b.p., bp, nt, or even a null string, as you prefer
my $defaultmaxlevel = 2;
my $debuglimit = 1000; # how long print debug messages while extending
my $numdigits = 5; # contig numbers filled out with leading zeroes to this length
my $scalefactor = 0.01; # convert b.p. to graphviz length by multiplying by this value
my $defaultoverlapmode = "false"; # neato overlap mode in graph section, "true" to disable
my $defaultouttype = "png"; # default image type ( passed to neato )
my $dotheaderid = "adj"; # ABySS-Explorer .dot file header id
my $contiggraphtxt = "454ContigGraph.txt"; # this is defined by gsAssembler/Newbler
my $allcontigsfna = "454AllContigs.fna"; # this is defined by gsAssembler/Newbler
my $defaultminflowbetween = 1;
# default color definitions
my $deadendcolor = "tomato";
my $recursionlimitcolor = "gold";
my $normallinkcolor = ""; # leave blank for the default color of black
my $normalfontcolor = $normallinkcolor;
my $forcedlinkcolor = "red"; # links from --force parameter
my $forcedfontcolor = $forcedlinkcolor;
my $pairedendlinkcolor = "dodgerblue"; # links from paired end information
my $pairedendfontcolor = $pairedendlinkcolor;
my $flowbetweenlinkcolor = "purple"; # links from flowthrough "F" information
my $flowbetweenfontcolor = $flowbetweenlinkcolor;
my $flowthroughlinkcolor = "forestgreen"; # links from flowthrough "I" information
my $flowthroughfontcolor = $flowthroughlinkcolor;
my $abyssk = 1; # we don't use kmers, so set this to 1 so coverage will be direct conversion
my $abyssedge = 0; # negative kmer minus 1, thus zero
my $abyssevalue = 0; # we don't have a value for this, so just use zero always
############################################################
# global variables
############################################################
my $ansiup = "\033[1A"; # terminal control
(my $prognopath = $0) =~ s/^.*[\/\\]//;
my @contiglen = (); # contig length
my @contigcov = (); # contig average coverage
my %ends = (); # key is contig . "3'" or "5'" or "0'", with optional "p", "f", or "i",
# values are @[ contig, 3'|5', #reads, flag ]
my %pairs = (); #used for temporary storage of paired end information until link ends can be sorted.. [Simon Gladman - 2011]
my %deadends = (); # key is contig, value is # of ends with reads extending ( so 1 = dead end )
my %minrl = (); # key is contig, value is lowest recursion level seen for this contig
my %seen = (); # key is contig, value is >=1 if we already traversed, undefined if not
# actual value reflects recursion level when seen
my %edgeseen = (); # key is contig 3'or5' contig 3'or5', value = 1 or undefined
my %taghash = (); # key is contig, value is array of tags
my %colorhash = (); # key is contig, value is array of colors
my %excludehash = (); # key is contig, value is 1 to exclude
my %inverthash = (); # key is contig, value is 1 to exclude
my %data = (); # subset of %ends that will end up in graph
my %flowdata = (); # store flow through read data here
my $extensions = 0; # number of auto-extensions so far (for --extend)
my @extarr = (); # extensions will apply only to supplied contigs, not auto-added ones
my $infilename = ""; # 454ContigGraph.txt path and name
my @listofexcl = (); # list of excluded contigs, --listexcluded turns this on
my $deletecmdfile = 1; # becomes 0 if a command file name is explicitly specified
my $returncode = 0; # return code of this program
############################################################
# command line parameters
############################################################
my $indirname = ""; # input file name
my $outfilename = ""; # output file name
my $outtype = $defaultouttype; # output image file format
my $cmdfilename = ""; # graphviz .dot language command file name
my $imgfilename = ""; # image file name
my $outfastaname = ""; # create FASTA file of contigs in output
my $level = $defaultmaxlevel; # maximum number of levels of recursion
my $boldabove = 0;
my @contig = (); # starting contig(s)
my @tag = (); # tag certain contigs
my @color = (); # color certain contigs
my @exclude = (); # never go into these contigs (e.g. repeat regions)
my @invert = (); # contigs to plot backwards
my $len = 1; # neato len parameter
my $listexcluded = 0; # list contigs that have been excluded
my $extend = 0; # auto extend for best contig
my @forcelink = (); # force links where none may exist
my $showbp = 0; # include length in b.p. in graph
my $showcov = 0; # include contig average coverage in graph
my $lowlimit = 0; # ignore links with read limit < this
my $highlimit = 0; # ignore links with read number > this
my $nolabel = 0; # disable dead end and recursion limit labelling
my $overlapmode = $defaultoverlapmode;
my $nospline = 0; # to disable splines
my $scaffold = 0; # to enable scaffold connection information
my $pairlinks = 0; # to enable paired end read connection information
my $flowthrough = 0; # to enable flowthrough connection information
my $flowbetween; # to enable flow between connection information, has optional value also
my $alllinks = 0; # sets --pairlinks --flowthrough --flowbetween and --scaffold
my $nondirectdashed = ""; # all links except direct are dashed lines
my $abyssdotfile; # generate file for use with ABySS-Explorer
my $help = 0; # print help and exit
my $quiet = 0; # only show errors
my $debug = 0; # print extra debugging information
GetOptions (
"indir=s" => \$indirname, # string
"outfile=s" => \$outfilename, # string
"type=s" => \$outtype, # string
"cmdfile=s" => \$cmdfilename, # string
"imgfile=s" => \$imgfilename, # string
"fastaout=s" => \$outfastaname, # string
"abyssexplorer=s"=> \$abyssdotfile, # string
"level=i" => \$level, # integer
"boldabove=i" => \$boldabove, # integer
"contig=s" => \@contig, # string array
"exclude=s" => \@exclude, # string array
"invert=s" => \@invert, # string array
"tag|label=s" => \@tag, # string array
"color=s" => \@color, # string array
"forcelink=s" => \@forcelink, # string array
"len=s" => \$len, # real
"extend=i" => \$extend, # integer
"lowlimit=i" => \$lowlimit, # integer
"highlimit=i" => \$highlimit, # integer
"nolabel" => \$nolabel, # flag
"nospline" => \$nospline, # flag
"pairlinks" => \$pairlinks, # flag
"scaffold" => \$scaffold, # flag
"flowthrough" => \$flowthrough, # flag
"flowbetween:s" => \$flowbetween, # flag/string
"alllinks" => \$alllinks, # flag
"overlapmode=s" => \$overlapmode, # string
"listexcluded" => \$listexcluded, # flag
"showbp|shownt" => \$showbp, # flag
"showcoverage" => \$showcov, # flag
"nondirectdashed"=> \$nondirectdashed, # flag
"help" => \$help, # flag
"quiet" => \$quiet, # flag
"debug" => \$debug); # flag
# debug implies not quiet
if ( $debug ) { $quiet = 0; }
unless ( ( $indirname ) and ( $outfilename ) and ( scalar @contig ) ) { $help = 1; }
# changing meaning of --scaffold for future use
if ( $scaffold )
{ die "--scaffold has been changed to --pairlinks\n"; }
# $flowbetween is a flag with an optional value, set default value if no value was specified
if ( ( defined $flowbetween ) and ( $flowbetween eq "" ) ) { $flowbetween = $defaultminflowbetween; }
if ( $alllinks )
{
$pairlinks = 1;
$flowthrough = 1;
$flowbetween = $defaultminflowbetween;
$scaffold = 1; # future use
}
if ( $nondirectdashed ) { $nondirectdashed = " style=dashed"; } else { $nondirectdashed = ""; }
# allow specification of only the directory
unless ( ( -d $indirname ) or ( $help ) )
{
print "Error, specified input directory \"$indirname\" does not exist or is not a directory\n";
$help = 1;
}
$infilename = $indirname;
unless ( $infilename =~ m/\/$/ ) { $infilename .= "/"; }
$infilename .= $contiggraphtxt;
# make sure input file exists
unless ( ( -e $infilename ) or ( $help ) )
{
print "Error, input file \"$infilename\" does not exist\n";
$help = 1;
}
# if no --imgfile, create name based on --outfile
unless ( $imgfilename ) { $imgfilename = $outfilename . "." . $outtype; }
# if no --cmdfile, create name based on --outfile
if ( $cmdfilename )
{ $deletecmdfile = 0; }
else
{ $cmdfilename = $outfilename . ".graphviz"; }
if ( $outfilename =~ m/png$/i ) { warn "You probably do not want to append a .png extension on --outfile\n"; }
# OBSOLETE
# ABySS-Explorer idiosyncracy
#if ( ( $abyssexplorer ) and ( $abyssexplorer !~ m/-4.adj/ ) )
# { print "WARNING: ABySS-Explorer versions <= 1.0.1 require that the input file name ends in \"-4.adj\"\n"; }
############################################################
# print help screen
############################################################
if ( $help )
{
print "$prognopath version $version
Required parameters:
--indir=xxx path to 454 assembly directory
--outfile=xxx output text file of results
--contig=xxx[,xxx]...
one or more starting contig numbers,
separated by comma, or multiple --contig
parameters may be used. Use just the
numeric portion of the contig
Optional parameters:
--type=xxx output file format, default is \"$defaultouttype\"
( anything besides \"png\" is experimental )
--cmdfile=xxx graphviz command file in .dot language will be created
using this name. If not specified, a temporary command
file will be created, and it will be deleted when done
--imgfile=xxx graph image file will be created with
this name. If not specified, will be
--outfile with .${outtype} extension added
--fastaout=xxx create a FASTA file of all contigs in
the output, save in this file
--abyssexplorer=xxx Generate a .dot file that can be used for
visualization with ABySS-Explorer 1.3.0,
http://www.bcgsc.ca/platform/bioinfo/software/abyss-explorer";
# for compatibility, the file name must end in \"-4.adj\"
# If paired end information is available, and the
# --pairlinks parameter is used, corresponding
# \"-3.dist\" and \"-contigs.fa\" files will also be created
print "
--flowthrough include connection information derived from
reads that flow through more than two contigs
--flowbetween[=x] include connection information derived from
reads that flow from one contig into another
by default, if the distance value is zero, it will not be
shown, the optional value for this parameter is a minimum
distance, defaulting to $defaultminflowbetween, set to --flowbetween=0 to show
these links also
--pairlinks include connection information derived
from paired end reads, only applicable for assemblies
containing paired end reads
--alllinks sets --flowthrough, --flowbetween, and --pairlinks
--nondirectdashed all links except direct links are dashed lines
--tag=tagname,contig[,contig]...
list of 1 or more contigs will be given
this tag. Multiple --tag allowed.
tagname is a text label that will be shown
in the final image, e.g. --tag=\"ATP1,14,34\"
--label a synonym for --tag
--showbp show length in b.p. in graph
--shownt a synonym for --showbp
--showcoverage show average contig read coverage in graph
--color=colorname,contig[,contig]...
like --tag, but color the contig.
for list of valid color names see
http://www.graphviz.org/doc/info/colors.html
--forcelink=xxx-5:yyy-3 force a link where none exists
between specified ends, xxx and yyy are
contig numbers
--level=xxx maximum recursion level, default=$level
--boldabove=xxx lines with read coverage >= this value
will be drawn in bold. no default value
--exclude=xxx[,xxx]...
one or contigs to never traverse past,
for example a repeated region contig
--listexcluded print out a list of which excluded contigs
are being ignored
--invert=xxx[,xxx]...
one or more contigs to plot backwards on
the graph, i.e. 3' to 5' direction
--extend=xxx auto extension for the single best
path, value is maximum steps, default=$extend
--lowlimit=xxx ignore connections < this number of reads
--highlimit=xxx ignore connections > this number of reads
--len=xxx len parameter to neato, default=$len
--nolabel disable highlighting of dead ends, and limit
of recursion contigs
--overlapmode neato paramter, default is $overlapmode, one of
none, true, scale
--nospline disable spline when edges would overlap
--help print this screen
--quiet only print error messages
--debug print extra debugging information
In place of lists of contigs, you can use \@filename to read in
values for that parameter from a file, e.g. --exclude=\@excl.txt
This program requires that the graphviz program \"neato\" be
available in the default PATH. The graphviz web site is
http://www.graphviz.org/
";
exit 1;
} # if ( $help )
############################################################
# expand --contig lists separated by commas into single array
############################################################
{
my @tmp = ();
foreach my $acontig (@contig)
{
$acontig = expandatprefix ( $acontig );
push ( @tmp, split ( /\s*[,;]\s*/, $acontig ) );
}
# cleaning and validation
@contig = ();
foreach my $item ( @tmp )
{
$item = expandatprefix ( $item );
$item =~ s/^0+//; # remove leading zeroes
$item =~ s/\s//g; # remove any white space
$item =~ s/[\+\-]//g; # allow "+" or "-" in contig numbers, but it is ignored
if ( $item =~ m/[^\d]/ ) { die "Error, non-numeric character used for --contig \"$item\"\n"; }
unless ( $item =~ m/^$/ ) # skip null items
{ push ( @contig, $item ); }
}
debugmsg ( "Supplied contig list of ".scalar(@contig)." contigs = \"".join ("\" \"", @contig)."\"" );
}
############################################################
# expand --tag lists separated by commas into hash of arrays
############################################################
{
my $ntags = 0;
foreach my $atag (@tag)
{
my @tmp = split ( /\s*[,;]\s*/, $atag );
my $taglabel = shift ( @tmp );
foreach my $item ( @tmp )
{
$item = expandatprefix ( $item );
$item =~ s/^0+//; # remove leading zeroes
unless ( $item =~ m/^$/ ) # skip null items
{
push ( @{$taghash{$item}}, $taglabel );
$ntags++;
}
}
} # foreach (@tag)
debugmsg ( "Stored ".commify($ntags)." tags" );
}
############################################################
# expand --color lists separated by commas into hash of arrays
############################################################
{
my $ncolors = 0;
foreach my $acolor (@color)
{
my @tmp = split ( /\s*[,;]\s*/, $acolor );
my $colorlabel = shift ( @tmp );
foreach my $item ( @tmp )
{
$item = expandatprefix ( $item );
$item =~ s/^0+//; # remove leading zeroes
$item =~ s/[\-\+]//g; # remove plus or minus - has no meaning, but this is a convenience to be compatible with bb.fastareorder
unless ( $item =~ m/^$/ ) # skip null items
{
push ( @{$colorhash{$item}}, $colorlabel );
$ncolors++;
}
}
} # foreach (@color)
debugmsg ( "Stored ".commify($ncolors)." colors" );
}
############################################################
# expand --forcelink lists separated by commas into hash of arrays
############################################################
{
my $nforce = 0;
foreach my $aforce (@forcelink)
{
my @tmp = split ( /\s*[,;]\s*/, $aforce );
foreach my $link ( @tmp )
{
$link = expandatprefix ( $link );
my @parts = split ( /\s*[:-]\s*/, $link );
unless ( scalar @parts == 4 ) { die "Invalid format for --forcelink \"$link\"\n"; }
foreach ( @parts )
{
s/^0+//; # remove leading zeroes
s/'//g; # remove primes (they are optional)
}
# add this artificial link to the list
push ( @{$ends{$parts[0].$parts[1]."'"}}, [ $parts[2], $parts[3]."'", 0 ] );
push ( @{$ends{$parts[2].$parts[3]."'"}}, [ $parts[0], $parts[1]."'", 0 ] );
}
$nforce++;
} # foreach (@forcelink)
debugmsg ( "Stored ".commify($nforce)." forced links" );
}
############################################################
# convert --exclude lists to a simple hash
############################################################
{
foreach my $aexclude (@exclude)
{
foreach my $item ( split ( /\s*[,;]\s*/, $aexclude ) )
{
$item = expandatprefix ( $item );
# cleaning and validation
$item =~ s/^0+//; # remove leading zeroes
$item =~ s/\s//g; # remove any white space
unless ( $item =~ m/^$/ ) # skip null items
{
if ( $item =~ m/[^\d]/ ) { die "Error, non-numeric character used for --exclude \"$item\"\n"; }
$excludehash{$item} = 1;
}
}
}
debugmsg ( "Stored ".commify(scalar keys %excludehash)." exclude contigs" );
}
############################################################
# convert --invert lists to a simple hash
############################################################
{
foreach my $ainvert (@invert)
{
$ainvert = expandatprefix ( $ainvert );
foreach my $item ( split ( /\s*[,;]\s*/, $ainvert ) )
{
$item = expandatprefix ( $item );
# cleaning and validation
$item =~ s/^0+//; # remove leading zeroes
$item =~ s/\s//g; # remove any white space
unless ( $item =~ m/^$/ ) # skip null items
{
if ( $item =~ m/[^\d]/ ) { die "Error, non-numeric character used for --invert \"$item\"\n"; }
$inverthash{$item} = 1;
}
}
}
debugmsg ( "Stored ".commify(scalar keys %inverthash)." invert contigs" );
}
############################################################
# parse 454ContigGraph.txt
############################################################
# sample content: refer to this excellent description for more info:
# http://contig.wordpress.com/2010/04/13/newbler-output-iii-the-454contiggraph-txt-file
#1 contig00001 588 2.6
#2 contig00002 1072 6.8
#3 contig00003 644 4.1
#...
#C 7 3' 14770 5' 3
#C 12 3' 14824 5' 5
#C 12 3' 52148 5' 4
#...
#S 1 84148 1:+;2:+;gapOneNoEdges:186;3:+;4:+;5:+;6:+
#S 2 17530 7:+
#S 3 25222 8:+;9:-;10:-;11:+;12:+;14:+;gapMultiEdges:4733;15:+;gapMultiEdges:4241;16:+;17:+;19:+
#...
#I 7 aCAACaTTATCATTGtATTTatATTCcTGTTtGAGATACGTGTGGACAGAGAATGTTGGTTTTTTGGACTAGAATCGGATTTATCATTATTATAATGT...
#I 48 AGTTCGTCCTGGACGACTTGAGTT 11:19543-5'..16159-5';6:19543-5'..60104-5'
#I 50 GGgTAATAGTTGACCGTCTTACGAAATtGGCACATTTTCTTCCAATTAACGAGAAATCTTCggtAGACAGACTAGTTCATATGTATGTGCGtGAAATC...
#...
#F 7 - 14770/3/0.0
#F 8 56895/6/0.0;54006/2/231.5 -
#F 12 - 14824/5/0.0;52148/4/0.0
#...
#P 1 10130/2/0.0;33848/3/0.0;34537/2/397.0;25104/2/679.5;15/170/698.4;6/2/1600.0;209/175/2352.0;9364/2/3929.5;19/89/4351.2;16/128/5345.5;17/25/5380.7;14/15/603...
#P 2 1/284/927.3;20341/2/7324.0;23108/2/8174.5 4/210/2787.7;3/22/3918.7;39/2/4345.5
#P 3 4/183/1037.9;24637/4/6940.0 1/156/3004.0;2/22/3918.7;1675/2/4906.0;29922/3/5867.0;97/2/6139.5;7360/2/7464.0
{
my $lines = 0;
my $clines = 0;
my $ilines = 0;
my $flines = 0;
my $slines = 0;
my $plines = 0;
my $endsstored = 0;
open ( my $INF, "<", $infilename ) or die ( "Error opening input file \"$infilename\": $!\n" );
while ( my $aline = <$INF> )
{
$lines++;
# progress indicator
unless ( $quiet ) { if ( ( $lines % 1000 ) == 0 ) { print commify($lines), "\n", $ansiup; } }
$aline =~ s/[\r\n]//g;
my @cols = split ( /\t/, $aline );
if ( $cols[0] eq "C" )
{
$clines++;
# store both orientations in hash
my $keep = 1;
if ( $cols[5] < $lowlimit ) { $keep = 0; }
if ( ( $highlimit ) and ( $cols[5] > $highlimit ) ) { $keep = 0; }
if ( $keep )
{
# store data from each end, the three columns are [0]=nextcontig [1]=5'|3'|5'p|3'p|5'f|3'f|5'i... [2]=readnum
push ( @{$ends{$cols[1].$cols[2]}}, [ $cols[3], $cols[4], $cols[5] ] );
# and store reciprocal end
push ( @{$ends{$cols[3].$cols[4]}}, [ $cols[1], $cols[2], $cols[5] ] );
$endsstored+=2;
} # if $keep
if ( $endsstored < $debuglimit )
{
my $txt = $keep?" Storing":"Not storing";
debugmsg ( "$txt ends: \$ends{$cols[1]$cols[2]} [ $cols[3], $cols[4], $cols[5] ];" );
debugmsg ( " : \$ends{$cols[3]$cols[4]} [ $cols[1], $cols[2], $cols[5] ];" );
}
} # "C"
elsif ( $cols[0] eq "I" ) # flowthrough information
{
$ilines++;
if ( $flowthrough )
{
# cols[1] is contig number
# cols[2] is contig sequence ( if <= 256 b.p.)
# cols[3] is the through-flow information, a ";" delimited list of
# the format 15:1805-3'..207-3'
# cols[3] will sometimes be null
if ( $cols[3] )
{
my @parts = split ( /;/, $cols[3] );
foreach my $apart ( @parts )
{
my @subparts = split ( /[:\-\.]/, $apart );
# @subparts columns become [0]=15 [1]=1805 [2]=3' [3]=null [4]=207 [5]=3'
push ( @{$ends{$cols[1]."0'"."i"}}, [ $subparts[1], $subparts[2], $subparts[0] ] );
push ( @{$ends{$cols[1]."0'"."i"}}, [ $subparts[3], $subparts[4], $subparts[0] ] );
} # foreach @leftparts
} # if ( $cols[3] )
} # if ( $flowthrough )
} # "I"
elsif ( $cols[0] eq "F" ) # flowbetween information
{
$flines++;
if ( defined $flowbetween )
{
# cols[1] is contig number
# cols[2] is flow information for reads flowing from the 5' end of the contig
# cols[3] is flow information for reads flowing from the 3' end of the contig
my @leftparts = split ( /;/, $cols[2] );
my @rightparts = split ( /;/, $cols[3] );
# each part is of the format xx/yy/z.z where xx=contig yy=number of reads z.z=distance in b.p.
if ( $cols[2] ne "-" ) # "-" is the indicator for null entry
{
foreach my $apart ( @leftparts )
{
my @subparts = split ( /\//, $apart );
# this, and paired links is the only case where we save a fourth column in
# the %ends hash, a distance in b.p. value
# $flowbetween is also used as a minimum distance cutoff for filtering, so
# filter by this distance value, and ignore if too short
if ( $subparts[2] >= $flowbetween )
{ push ( @{$ends{$cols[1]."5'f"}}, [ $subparts[0], "0'", $subparts[1], $subparts[2] ] ); }
} # foreach @leftparts
} # if ne "-"
if ( $cols[3] ne "-" ) # "-" is the indicator for null entry
{
foreach my $apart ( @rightparts )
{
my @subparts = split ( /\//, $apart );
if ( $subparts[2] >= $flowbetween )
{ push ( @{$ends{$cols[1]."3'f"}}, [ $subparts[0], "0'", $subparts[1], $subparts[2] ] ); }
} # foreach @leftparts
} # if ne "-"
} # if ( defined $flowbetween )
} # "F"
elsif ( $cols[0] eq "S" )
{
$slines++;
} # "S"
elsif ( $cols[0] eq "P" )
{
$plines++;
if ( $pairlinks ) {
my $con_number = $cols[1];
my $fiveprime_connects = $cols[2];
my $threeprime_connects = $cols[3];
my @tmp = split ";", $fiveprime_connects;
foreach my $connection (@tmp){
#now split up the connection.
next if $connection eq "-";
my @x = split "/", $connection;
my $termcontig = $x[0];
my $num_connects = $x[1];
my $distance = $x[2];
if($num_connects >= $lowlimit){
#store in temp pairs var and search through later for termcontig details.
push( @{$pairs{$con_number}}, [ $termcontig, "5'", $num_connects, $distance ] );
$endsstored += 2;
if ( $endsstored < $debuglimit )
{
debugmsg ( "Storing ends: \$ends{".$con_number."3'p} [ $termcontig, 5', $num_connects];" );
debugmsg ( " : \$ends{".$termcontig."5'p} [ $con_number, 3', $num_connects];" );
}
}
}
@tmp = split ";", $threeprime_connects;
foreach my $connection (@tmp){
#now split up the connection
next if $connection eq "-";
my @x = split "/", $connection;
my $termcontig = $x[0];
my $num_connects = $x[1];
my $distance = $x[2];
if($num_connects >= $lowlimit){
#store in temp pairs var and search through later for termcontig details.
push( @{$pairs{$con_number}}, [ $termcontig, "3'", $num_connects, $distance ] );
$endsstored += 2;
if ( $endsstored < $debuglimit )
{
debugmsg ( "Storing ends: \$ends{".$con_number."3'p} [ $termcontig, 5', $num_connects];" );
debugmsg ( " : \$ends{".$termcontig."5'p} [ $con_number, 3', $num_connects];" );
}
}
}
} # if ( $pairlinks )
} # "P"
elsif ( $cols[0] =~ m/^\d+$/ ) # first section of file
{
$contiglen[$cols[0]] = $cols[2];
$contigcov[$cols[0]] = $cols[3];
} # section 1
else
{
die ( "Error on line $lines of file \"$infilename\", unknown type of content:\n$aline\n" );
}
} # while <$INF>
close $INF;
debugmsg ( commify($lines) . " lines read from input file \"$infilename\"" );
debugmsg ( commify($#contiglen) . " contig lengths were stored" );
debugmsg ( commify($clines) . " \"C\" lines were found" );
debugmsg ( commify($endsstored) . " ends were stored" );
debugmsg ( commify($ilines) . " \"I\" lines were found" );
debugmsg ( commify($flines) . " \"F\" lines were found" );
debugmsg ( commify($plines) . " \"P\" lines were found" );
debugmsg ( commify($slines) . " \"S\" lines were found (and ignored)" );
}
############################################################
# make paired end information links
############################################################
# Added by Simon Gladman - CSIRO - 2011
# Adds paired end links to the link data variable "%ends"
foreach my $key (keys %pairs){
my @contig = @{$pairs{$key}};
foreach my $tmp (@contig){
my @x = @{$tmp};
my $term_contig = $x[0];
my @tcontig; # 10/11/2011 is this a bug? occassional not defined state here
if ( $pairs{$term_contig} ) { @tcontig = @{$pairs{$term_contig}}; } # end of fix
#my @tcontig = @{$pairs{$term_contig}};
foreach my $ttmp (@tcontig){
my @y = @{$ttmp};
if($y[0] == $key){
push( @{$ends{$key."$x[1]"."p"}}, [ $term_contig, "$y[1]", $x[2], $x[3] ] );
push( @{$ends{$term_contig."$y[1]"."p"}}, [ $key, "$x[1]", $y[2], $y[3] ] );
last;
}
}
}
}
############################################################
# sort data
############################################################
# sort data so that higher read coverage links come first
# sorting is only needed if we will automatically extend network
if ( $extend )
{
# extend applies only to command line contigs and not auto-generated ones
foreach ( @contig )
{ push ( @extarr, $extend ) } # but the $extend value just evaluates to "true" later
debugmsg ( "--extend=$extend, Sorting data" );
foreach my $key ( keys %ends )
{
@{$ends{$key}} = sort {$b->[2] <=> $a->[2]} @{$ends{$key}};
} # foreach my $key ( keys %ends )
debugmsg ( "Extend contig list of ".scalar(@extarr)." contigs" );
} # if ( $extend )
############################################################
# dead end detection
############################################################
# dead ends are contigs which might have reads extending to another contig
# from either the 5' or 3' end, but not both. Both ends could be dead ends, too.
# %deadends{contigid} is # of ends extending, so 1 = dead end,
# 2 = continues both ends, 0=isolated contig without any connections
unless ( $nolabel )
{
debugmsg ( "Dead end detection" );
foreach my $key ( keys %ends )
{
( my $contig = $key ) =~ s/[530]'[pfis]?$//; # remove 5' or 3' or 5'p etc at end of string
# note that this version of $contig does not have leading zeroes
$deadends{$contig}++;
@{$ends{$key}} = sort {$b->[2] <=> $a->[2]} @{$ends{$key}};
} # foreach my $key ( keys %ends )
} # unless ( $nolabel )
############################################################
# construct network from starting point(s)
############################################################
my $totalnodes = 0;
my $index = 0;
foreach my $acontig (@contig)
{
my $dbgtxt = (defined $extarr[$index])?"user-defined":"auto-extend";
debugmsg ( "Contig #".($index+1)."=$dbgtxt, Recursion starting at \"$acontig\"" );
# when hit end of our specified contigs
unless ( defined $extarr[$index] )
{
if ( $extend ) { debugmsg ( "At end of specified contigs, turning off extend" ); }
$extend = 0;
}
recurse ( $acontig, "", "", 0, $extend );
$index++;
} # foreach my $acontig (@contig)
unless ( $quiet ) { print commify($totalnodes), " nodes present in output\n"; }
############################################################
sub recurse { my ( $startcontig, $camefrom, $fromend, $recurselevel, $followlevel ) = @_;
############################################################
unless ( $totalnodes > $debuglimit ) { debugmsg ( "recurse \"$startcontig\", camefrom=\"$camefrom\" end=\"$fromend\" recurselevel=$recurselevel" ); }
# data for this contig is in @ { %ends{contig . 5'|3'|5'p|3'p|5'f|3'f|5'i... } } [0]=nextcontig [1]=5'|3'|5'p|3'p"5'f|3'f|5'i... [2]=readnum
# store lowest recursion level seen for this contig
unless ( $nolabel )
{
if ( ( ! defined $minrl{$startcontig} ) or ( $recurselevel < $minrl{$startcontig} ) )
{ $minrl{$startcontig} = $recurselevel; }
} # unless ( $nolabel )
# here is the limit to recursion
my $stophere = 0;
if ( $recurselevel > $level )
{
unless ( $totalnodes > $debuglimit ) { debugmsg ( "recursion for \"$startcontig\" at limit level=$level, returning" ); }
$stophere = 1;
}
# return if this contig is on the exclude list
elsif ( $excludehash{$startcontig} )
{
unless ( $totalnodes > $debuglimit ) { debugmsg ( "contig \"$startcontig\" on exclude list, returning" ); }
# save information for list of excluded option
my @row = ( $startcontig, $camefrom, $fromend );
push ( @listofexcl, \@row );
return 0;
## was this wrong? always return if on exclude list $stophere = 1;
}
# skip return if follow allows it
if ( ( $stophere ) and ( $followlevel < 1 ) ) { return 0; }
# count nodes
unless ( defined $seen{$startcontig} )
{
if ( $startcontig =~ m/^\s*$/ ) { die "Error, null contig in sub recurse($startcontig, $camefrom, $fromend, $recurselevel, $followlevel)\n"; }
$seen{$startcontig} = 1;
$totalnodes++;
}
foreach my $end ( "5'", "3'", "5'p", "3'p", "5'f", "3'f", "5'i", "3'i", "0'" )
{
my $first = 1;
foreach my $contigref ( @{$ends{$startcontig.$end}} )
{
unless ( $totalnodes > $debuglimit ) { debugmsg ( "from \"$startcontig\" end \"$end\" find linked contig ".join(";",@$contigref) ); }
# always skip links back to where we just came from ( and don't clear $first flag )
if ( $contigref->[0] eq $camefrom )
{
unless ( $totalnodes > $debuglimit ) { debugmsg ( "\"$startcontig\": skipping link back to source contig \"$camefrom\"" ); }
next;
}
# skip data storing if this edge was already stored
unless ( defined $edgeseen{$startcontig.$end.$contigref->[0].$contigref->[1]} )
{
debugmsg ( "Storing in \@data at key \"$startcontig$end\": [ \"".join ( "\", \"", @$contigref )."\" ]" );
push ( @{$data{$startcontig.$end}}, $contigref ); # $contigref is array reference
if ( ( $first ) and ( $followlevel ) )
{
unless ( $totalnodes > $debuglimit ) { debugmsg ( "auto extension following contig \"$contigref->[0]\", \$extensions=$extensions" ); }
$extensions++;
$edgeseen{$startcontig.$end.$contigref->[0].$contigref->[1]} = $recurselevel;
recurse ( $contigref->[0], $startcontig, $end, $recurselevel, $followlevel-1 );
} # if
} # unless $edgeseen
else
{ debugmsg ( "Seen, not storing in \@data at key \"$startcontig$end\": [ \"".join ( "\", \"", @$contigref )."\" ]" ); }
# recurse if edge seen level is higher than current level or not seen before
if ( ( ! defined $edgeseen{$startcontig.$end.$contigref->[0].$contigref->[1]} ) or
( $edgeseen{$startcontig.$end.$contigref->[0].$contigref->[1]} > $recurselevel ) )
{
$edgeseen{$startcontig.$end.$contigref->[0].$contigref->[1]} = $recurselevel;
recurse ( $contigref->[0], $startcontig, $end, $recurselevel+1, 0 );
}
$first = 0;
} # foreach $contigref
} # foreach $end
} # sub recurse
############################################################
# consolidate pairs of flow between links
############################################################
collapseflowbetween();
############################################################
# create output table text file
############################################################
createoutputtable();
############################################################
# generate optional ABySS-Explorer .dot file
############################################################
if ( $abyssdotfile ) { createabyssfile(); }
############################################################
# generate optional FASTA file
############################################################
if ( $outfastaname ) { createfastafile(); }
############################################################
# create graphviz command file (.dot file)
############################################################
my $savednodes = 0;
my $savededges = 0;
debugmsg ( "Creating graphviz command file \"$cmdfilename\"" );
open ( my $OUTF, ">", $cmdfilename ) or die ( "Error creating graphviz command file \"$cmdfilename\": $!\n" );
print $OUTF "graph G\n";
print $OUTF " {\n";
print $OUTF " edge [len=$len];\n";
my $splinemode = $nospline?"false":"true";
print $OUTF " graph [overlap=$overlapmode,splines=$splinemode];\n";
print $OUTF " node [shape=plaintext];\n";
# table of nodes
print $OUTF "\n // Nodes\n";
foreach my $seencontig ( keys %seen )
{
my $contigid = sprintf ("%0${numdigits}d", $seencontig);
# box is proportional to size, except if contig is small,
# the box is still as large as the labels it contains
unless ( $contiglen[$seencontig] )
{ die "Error, no contig length for \"$seencontig\", maybe you specified a nonexistant contig\n"; }
my $scaledcontiglen = sprintf ( "%1d", $contiglen[$seencontig] * $scalefactor );
# define the box representing the contig, implemented as a table in HTML
print $OUTF " c$contigid [label=< <TABLE BORDER=\"1\" CELLBORDER=\"0\" CELLSPACING=\"0\" CELLPADDING=\"0\"";
if ( $colorhash{$seencontig}->[0] ) { print $OUTF " BGCOLOR=\"$colorhash{$seencontig}->[0]\""; }
# top row is always present
print $OUTF "><TR>";
if ( $inverthash{$seencontig} )
{ print $OUTF "<TD PORT=\"R\">3'</TD>"; }
else
{ print $OUTF "<TD PORT=\"L\">5'</TD>"; }
print $OUTF "<TD PORT=\"C\" WIDTH=\"$scaledcontiglen\"";
print $OUTF ">c$contigid</TD>";
if ( $inverthash{$seencontig} )
{ print $OUTF "<TD PORT=\"L\">5'</TD>"; }
else
{ print $OUTF "<TD PORT=\"R\">3'</TD>"; }
print $OUTF "</TR>";
# optional table row for contig size
if ( $showbp )
{ print $OUTF "<TR><TD COLSPAN=\"3\">",commify($contiglen[$contigid])," $bpabbreviation</TD></TR>"; }
# optional table row for coverage
if ( $showcov )
{ print $OUTF "<TR><TD COLSPAN=\"3\">cov=",commify($contigcov[$contigid]),"</TD></TR>"; }
# one or more optional rows for labels specified on the command line
if ( $taghash{$seencontig}->[0] )
{
foreach my $tag ( @{$taghash{$seencontig}} )
{ print $OUTF "<TR><TD COLSPAN=\"3\">$tag</TD></TR>"; }
}
# optional final rows for dead end contigs, or recursion limit contigs
unless ( $nolabel )
{
if ( ( ! defined $deadends{$seencontig} ) or ( $deadends{$seencontig} <= 1 ) )
{ print $OUTF "<TR><TD COLSPAN=\"3\" BGCOLOR=\"$deadendcolor\">Dead End</TD></TR>"; }
if ( ( defined $minrl{$seencontig} ) and ( $minrl{$seencontig} >= $level ) )
{ print $OUTF "<TR><TD COLSPAN=\"3\" BGCOLOR=\"$recursionlimitcolor\">Recursion limit</TD></TR>"; }
} # unless ( $nolabel )
# and the end of this huge mess of HTML
print $OUTF "</TABLE> >];\n";
} # foreach my $seencontig ( keys %seen )
debugmsg ( "Saved ".commify($savednodes)." nodes" );