#!/usr/bin/perl #AUTHOR # Marten Boetzer and Walter Pirovano (c) 2010 # SSAKE-based Scaffolding of Pre-Assembled Contigs after Extension (SSPACE) # walter.pirovano@baseclear.com #NAME # SSPACE Marten Boetzer - Walter Pirovano November 2010 #SYNOPSIS # SSAKE-based Scaffolding of Pre-Assembled Contigs after Extension (SSPACE) #DOCUMENTATION # README, MANUAL and TUTORIAL distributed with this software @ www.baseclear.com # Boetzer M, Henkel VJ, Jansen HJ, Butler D and Pirovano W. 2010. Scaffolding pre-assembled contigs using SSPACE. Bioinformatics. # http://www.baseclear.com/sequencing/data-analysis/bioinformatics-tools/ # We hope this code is useful to you -- Please send comments & suggestions to Walter.Pirovano@baseclear.com # If you use either the SSPACE code or ideas, please cite our work appropriately and accurately #LICENSE # SSPACE Copyright (c) 2010 BaseClear B.V. The Netherlands. All rights reserved. # SSAKE Copyright (c) 2006-2010 Canada's Michael Smith Genome Science Centre. All rights reserved. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # note: insert size and distance between pairing reads are used interchangeably #MAJOR CHANGES ON SSAKE V3.4 TO FORM SSPACE # -New scaffolding feature dealing with contigs having multiple alternatives # -Seperate scripts to decrease the memory usage # -Automatic filtering of reads and duplicate mate pairs # -Option for contig extension on unfiltered reads # -Removed tracking of reads during contig extension # -Mapping single reads to extended and non extended contigs # -Single reads mapped more than once to a contig are removed for scaffolding # -A summary file is generated containing detailed information about the scaffolding process # -An evidence file is generated which indicates the contigs present in the scaffolds # -Optional; Scaffolds and their contigs are visualised by generating a .dot file #CHANGES IN v1-1 # -Error on Bowtie-build solved by chmod Bowtie files use strict; use Storable; require "getopts.pl"; use File::Path; use File::Basename; #Specify path to DotLib use FindBin qw($Bin); use lib "$Bin/dotlib/"; use DotLib; use vars qw($opt_m $opt_o $opt_v $opt_p $opt_k $opt_a $opt_z $opt_s $opt_b $opt_d $opt_n $opt_l $opt_x $opt_u $opt_t); &Getopts('m:o:v:p:k:a:z:s:b:d:n:l:x:u:t:'); my ($base_overlap,$min_overlap,$verbose,$MIN_READ_LENGTH,$SEQ_SLIDE,$min_base_ratio,$min_links,$max_link_ratio,$unpaired_file,$max_trim,$base_name, $outdir,$max_count_trim,$min_tig_overlap, $doplot, $extending)= (20, 32, 0, 16, 1, 1, 5, 0.70, "no-u", 0, "standard_output",".", 10, 15, 0, 1); my $version = "[SSPACE_v1-1]"; my $seplines = ("-" x 60)."\n"; my ($MAX, $MAX_TOP, $TRACK_COUNT) = (0, 100, 1);# $MAX_TOP is the very maximum anchoring edge sequence that will be searched #-------------------------------------------------READ OPTIONS if(! ($opt_l)){ print "Usage: ".basename($0)." $version\n\n"; print "-l Library file containing two mate pate files with insert size, error and either mate pair or paired end indication.\n"; print "-s Fasta file containing contig sequences used for extension. Inserted pairs are mapped to extended and non-extended contigs (REQUIRED)\n"; print "-x Indicate whether to extend the contigs of -s using paired reads in -l. (-x 1=extension, -x 0=no extension, default -x 1)\n"; print "-m Minimum number of overlapping bases with the seed/contig during overhang consensus build up (default -m $min_overlap)\n"; print "-o Minimum number of reads needed to call a base during an extension (default -o $base_overlap)\n"; print "-t Trim up to -t base(s) on the contig end when all possibilities have been exhausted for an extension (default -t $max_trim, optional)\n"; print "-b Base name for your output files (optional)\n"; print "-d Directory for storing output files (optional)\n"; print "-v Runs in verbose mode (-v 1=yes, -v 0=no, default -v 0, optional)\n"; print "============ Options below only considered for scaffolding ============\n"; print "-k Minimum number of links (read pairs) to compute scaffold (default -k $min_links, optional)\n"; print "-a Maximum link ratio between two best contig pairs *higher values lead to least accurate scaffolding* (default -a $max_link_ratio, optional)\n"; print "-n Minimum overlap required between contigs to merge adjacent contigs in a scaffold (default -n $min_tig_overlap, optional)\n"; print "-u Fasta/fastq file containing unpaired sequence reads (optional)\n"; print "-p Make .dot file for visualisation (-p 1=yes, -p 0=no, default -p 0, optional)\n"; exit; } die "ERROR: Parameter -s is required. Please insert a contig .fasta file\n" if(!$opt_s); my $filecontig = $opt_s if($opt_s); $min_overlap = $opt_m if ($opt_m); $base_overlap = $opt_o if ($opt_o); $max_trim = $opt_t if ($opt_t); $verbose = $opt_v if ($opt_v); $min_links = $opt_k if ($opt_k); $max_link_ratio = $opt_a if ($opt_a); $base_name = $opt_b if($opt_b); $outdir = $opt_d if ($opt_d); $min_tig_overlap = $opt_n if($opt_n); $unpaired_file = $opt_u if($opt_u); $doplot = $opt_p if($opt_p); $extending = $opt_x if($opt_x eq 1); my $libraryfile = $opt_l if ($opt_l); #-------------------------------------------------CHECKING PARAMETERS die "ERROR: Invalid library file $libraryfile ...Exiting.\n" if(! -e $libraryfile); die "ERROR: Invalid contig file $filecontig ...Exiting.\n" if(! -e $filecontig); die "ERROR: -x must be either 0 or 1. Your inserted -x is $extending...Exiting.\n" if(!($extending == 0 || $extending == 1)); die "ERROR: -m must be a number between 15-50. Your inserted -m is $min_overlap ...Exiting.\n" if($min_overlap < 15 || $min_overlap > 50); die "ERROR: -o must be set to 1 or higher. Your inserted -o is $base_overlap ...Exiting.\n" if($base_overlap < 1); die "ERROR: -k must be an integer number. Your inserted -k is $min_links ...Exiting.\n" if(!($min_links * 1 eq $min_links)); die "ERROR: -a must be a number between 0.00 and 1.00. Your inserted -a is $max_link_ratio ...Exiting.\n" if($max_link_ratio < 0 ||$max_link_ratio > 1 || !($max_link_ratio * 1 eq $max_link_ratio)); die "ERROR: -n must be an integer number. Your inserted -n is $min_tig_overlap ...Exiting.\n" if(!($min_tig_overlap * 1 eq $min_tig_overlap)); die "ERROR: Invalid unpaired file $unpaired_file -- fatal\n" if(! -e $unpaired_file && $opt_u); die "ERROR: -p must be either 0 or 1. Your inserted -p is $doplot...Exiting.\n" if(!($doplot == 0 || $doplot == 1)); #-------------------------------------------------check library file; open(FILELIB, "< $libraryfile"); my ($min_allowed, $low_iz, $up_iz, $library, $fileA, $fileB, $insert_size, $insert_stdev, $reverse); my $countline=0; while(){ chomp; $countline++; my @line = split(/\s+/, $_); if($#line >= 0){ die "ERROR: Line $countline in your library file ($libraryfile) contains $#line spaces, which should be 5 spaces. Check that no spaces are within the file names.\n" if($#line != 5); ($library, $fileA, $fileB, $insert_size, $insert_stdev, $reverse) = split(/\s+/, $_); die "ERROR: Invalid file in library $library: $fileA -- fatal\n" if(! -e $fileA); die "ERROR: Invalid file in library $library: $fileB -- fatal\n" if(! -e $fileB); die "ERROR: Insert size should be higher than or equal to 0. Your library $library has insert size of $insert_size. Exiting.\n" if(!($insert_size>0) || !($insert_size * 1 eq $insert_size)); die "ERROR: Insert stdev must be a number > 0.00. Your library $library has insert size of $insert_stdev. Exiting.\n" if($insert_stdev < 0 || !($insert_stdev * 1 eq $insert_stdev)); die "ERROR: Reverse must be either 0 or 1. Your library $library has reverse complement of $reverse ...Exiting.\n" if(!($reverse == 0 || $reverse == 1)); } } close FILELIB; #-------------------------------------------------Make folder structure mkpath("$outdir/intermediate_results"); mkpath("$outdir/pairinfo"); mkpath("$outdir/reads"); mkpath("$outdir/bowtieoutput"); my $unpaired = 0; $unpaired = $unpaired_file if (-e $opt_u && $extending == 1); #-------------------------------------------------Print input parameters my $contig = "$outdir/intermediate_results/" . $base_name . ".extendedcontigs.fasta"; $contig = "$outdir/intermediate_results/" . $base_name . ".formattedcontigs.fasta" if($extending == 0); my $log = $outdir."/".$base_name . ".logfile.txt"; my $summaryfile = $outdir."/".$base_name.".summaryfile.txt"; open (LOG, ">$log") || die "Can't write to $log -- fatal\n"; open (SUMFILE, ">$summaryfile") || die "Can't open $summaryfile -- fatal\n"; close SUMFILE; my $init_message = "Your inserted inputs on $version at ".getDate().":\nRequired inputs: \n\t-l = $libraryfile\n\t-s = $filecontig\n\t-b = $base_name\n\n"; $init_message .= "Optional inputs:\n\t-x = $extending\n\t-k = $min_links\n"; $init_message .= "\t-a = $max_link_ratio\n\t-n = $min_tig_overlap\n\t-p = $doplot\n\n"; $init_message .= "Contig extension inputs:\n\t-o = $base_overlap\n\t-t = $max_trim\n\t-m = $min_overlap\n\n" if($extending == 1); &printMessage($init_message); close LOG; #-------------------------------------------------READING AND CONVERTING INPUT SEQUENCES system("perl $Bin/bin/readLibFiles.pl $libraryfile $base_name $extending $unpaired $outdir"); #-------------------------------------------------FORMATTING OR EXTENDING CONTIGS system("perl $Bin/bin/ExtendOrFormatContigs.pl $contig $base_name $extending $filecontig $MIN_READ_LENGTH $base_overlap $min_overlap $min_base_ratio $max_trim $verbose $Bin $outdir"); #--------------------------------------------------UPDATE SUMMARY FILE open (SUMFILE, ">>$summaryfile") || die "Can't open $summaryfile -- fatal\n"; open (LOG, ">>$log") || die "Can't write to $log -- fatal\n"; my $sumfile .= "\nSUMMARY: \n".$seplines."\tInserted contig file;\n"; #write summary of initial contigs $sumfile = &writesummaryfiles($filecontig, "contig", $sumfile); $sumfile .= "\tAfter extension;\n" if($extending == 1); #write summary of extended contigs $sumfile = &writesummaryfiles($contig, "contig", $sumfile) if($extending == 1); &FlushFiles(); close LOG; close SUMFILE; #--------------------------------------------------READ BOWTIE OUTPUT open(FILELIB, "< $libraryfile") || die "Can't open $libraryfile -- fatal\n"; my ($lib, $fileA, $fileB, $insert_size, $insert_stdev, $pair, $headscaffolds, $prevlib, $mergedtigs, $evidencefile); while(){ chomp; &FlushFiles(); ($lib, $fileA, $fileB, $insert_size, $insert_stdev) = split(/\s+/, $_); next if($lib eq $prevlib || $lib eq ''); $prevlib = $lib; $min_allowed = -1 * ($insert_stdev * $insert_size); open (LOG, ">>$log") || die "Can't write to $log -- fatal\n"; &printMessage("\nLIBRARY $lib\n".$seplines); close LOG; open (SUMFILE, ">>$summaryfile") || die "Can't open $summaryfile -- fatal\n"; print SUMFILE "\n\nLIBRARY $lib STATS:\n".("#" x 80),"\n"; close SUMFILE; my $scaffold = $outdir."/intermediate_results/" . $base_name . ".$lib.scaffolds"; $mergedtigs = $outdir."/intermediate_results/" . $base_name . ".$lib.scaffolds.fasta"; my $issues = $outdir."/pairinfo/" . $base_name . ".$lib.pairing_issues"; my $distribution = $outdir."/pairinfo/" . $base_name . ".$lib.pairing_distribution.csv"; #-------------------------------------------------MAPPING READ PAIRS USING FILTERED FASTA FILE my $outfile = $outdir."/"."reads/$base_name.$lib.filtered.readpairs.singles.fasta"; my $up_iz = ($insert_size - $min_allowed); mkpath("$outdir/tmp.$base_name"); my $newcontig = processContig($contig, $up_iz); system("perl $Bin/bin/mapWithBowtie.pl $base_name $newcontig $outfile $lib $Bin $outdir"); #-------------------------------------------------Scaffold the contigs and generate .scaffold file my $bowtiefile = $outdir."/bowtieoutput/" . $base_name . ".$lib.mapped"; if(-e $bowtiefile){ system("perl $Bin/bin/PairingAndScaffolding.pl $contig $base_name $issues $distribution $verbose $lib $insert_size $min_allowed $scaffold $min_links $max_link_ratio $outdir"); #retrieve the contigs that were stored my $contigstored = "$outdir/tmp.$base_name/contigs.stored"; my $contigs = retrieve("$contigstored"); #-------------------------------------------------Generate .fasta file and .evidence file with scaffolds open (LOG, ">>$log") || die "Can't write to $log -- fatal\n"; ($headscaffolds, $evidencefile) = &mergeContigs($scaffold, $contigs, $mergedtigs, 100, $verbose, $min_tig_overlap,$max_count_trim); $contig = $mergedtigs; #-------------------------------------------------write summary of scaffolds $sumfile .= "\tAfter scaffolding $lib:\n"; $sumfile = &writesummaryfiles($mergedtigs, "scaffold", $sumfile); #------------------------------------------------- open (SUMFILE, ">>$summaryfile") || die "Can't open $summaryfile -- fatal\n"; print SUMFILE ("#" x 80),"\n"; close SUMFILE; &printMessage("\n$seplines"); $contigs = (''); undef $contigs; }else{ open (LOG, ">>$log") || die "Can't write to $log -- fatal\n"; &printMessage("WARNING: No scaffolding, because no reads found on contigs\n"); } my $removedir = "$outdir/tmp.$base_name"; rmtree([$removedir, 'blurfl/quux']); #remove 'tmp' folder }#END OF LIBRARY LOOP #-------------------------------------------------END OF LIBRARIES. PRINT SUMMARY TO FILE AND END SESSION my $finalfile = $outdir."/".$base_name . ".final.scaffolds.fasta"; my $finalevfile = $outdir."/".$base_name . ".final.evidence"; open (EVID, $evidencefile); open (FINALEV, "> $finalevfile"); while(){ print FINALEV $_; } open (SCAF, $mergedtigs); open (FINAL, "> $finalfile"); while(){ print FINAL $_; } #make .dot file for visualisation &visualiseScaffolds($outdir."/".$base_name.".visual_scaffolds", $evidencefile) if($doplot); open (SUMFILE, ">>$summaryfile") || die "Can't open $summaryfile -- fatal\n"; &printMessage("\n=>".getDate().": Creating summary file ($summaryfile)\n"); print SUMFILE $sumfile.$seplines; &printMessage(("*" x 50)."\n\nProcess run succesfully on ".getDate()."\n\n\n"); close SCAF; close FINAL; close EVID; close FINALEV; close LOG; close SUMFILE; #END OF MAIN PROGRAM #----------------- ###FUNCTION TO STORE ONLY THE EDGES OF THE CONTIGS. ONLY THESE EDGES ARE MAPPED, SAVING TIME FOR BUILDING THE INDEX WITH BOWTIE, AND MAPPING THE READS TO THE CONTIGS sub processContig{ my ($contigfile, $max_dist) = @_; my $lower = ($max_dist+200); open(IN,$contigfile) || die "can't read $contigfile -- fatal\n"; my $contigfilesub = "$outdir/tmp.$base_name/subset_contigs.fasta"; open(OUT,">$contigfilesub") || die "can't write to $contigfilesub -- fatal\n"; my ($seq, $counter) = ('', 0); while(){ chomp; my $line = $_; $seq.= uc($line) if(eof(IN)); if (/\>(\S+)/ || eof(IN)){ if($seq ne ''){ $counter++; if(length($seq) > (($lower * 2)+100)){ my $upper = (length($seq) - ($lower)); my $first = substr($seq, 0, $lower); my $second = substr($seq, $upper); my $newseq = $first."NNN".$second; print OUT ">$counter\n$newseq\n"; } else{ print OUT ">$counter\n$seq\n"; } } $seq=''; }else{ $seq.=uc($line); } } close IN; close OUT; return $contigfilesub; } ###MAKE A .FASTA FILE OF THE FOUND SCAFFOLDS. EITHER MERGE TWO CONTIGS WHEN A OVERLAP OF -n EXISTS OR PLACE A GAP sub mergeContigs{ my ($scaffold, $contigs, $mergedtigs, $chunk, $verbose,$min_tig_overlap,$max_count_trim) = @_; &printMessage("\n=>".getDate().": Merging contigs and creating fasta file of scaffolds ($mergedtigs)\n"); open(IN,$scaffold) || die "can't read $scaffold -- fatal\n"; my $evidence_file = $mergedtigs; $evidence_file =~ s/.fasta/.evidence/; open(SCAFS,">$evidence_file") || die "can't write to $evidence_file -- fatal\n"; open(OUT,">$mergedtigs") || die "can't write to $mergedtigs -- fatal\n"; my $scafhashcount = keys ( %$headscaffolds ); my $scaffoldHashStart; my ($tot,$sct,$ct_merge) = (0,0,0); while(){### each line is a scaffold chomp; my $sc="";; my @a = split(/\,/); my @tig; if($a[2]=~/\_/){ @tig = split(/\_/,$a[2]); }else{ push @tig, $a[2]; } CounterPrint(++$sct); #$sct++; my ($ct,$tigsum,$mct) = (0,0,0); my ($prev,$word,$template) = ("NA","NA","NA"); my ($seq,$prevseq,$headconcat,$prevEstimatedDistance, $prevLinks) = ("","","",""); foreach my $t (@tig){### each contig $ct++; if($t=~/([fr])(\d+)z(\d+)(\S+)?/i){ my $orient = $1; my $tnum=$2; my $head = $orient . $tnum; my $search = "tig" . $tnum; my $other = $4; $tot+= $3; $tigsum +=$3; my ($estimatedDistance, $links) = ("", ""); $estimatedDistance = $1 if($other=~/m((\-)?\d+)/); $links = $1 if($other=~/k((\-)?\d+)/); print "\tSC $a[0] - TIG $ct. pattern: $t search: $search totalTigSize: $tot Orientation: $orient Gap/Overlap estimated distance: $estimatedDistance\n" if($verbose); my $count_trim = 0; $seq = $contigs->{$tnum}{'seq'}; $seq = reverseComplement($seq) if($orient eq "r"); chomp $seq; my $prev; if($scafhashcount >0){ $prev = $headscaffolds->{$tnum}{'head'}; $prev =~ s/^\n//; chomp $prev; delete $headscaffolds->{$tnum}; chomp $prev; if($orient eq "r"){ my @prevarray = split("\n", $prev); if($#prevarray >=0){ my $newprev=""; my ($tnum, $sizetig, $links, $gap, $prevline) = ("","","","",""); for(my $i = $#prevarray; $i >= 0; $i--){ my @info = split(/\|/, $prevarray[$i]); if($#info eq 1){ ($tnum, $sizetig) = split(/\|/, $prevarray[$i]); }else{ ($tnum, $sizetig, $links, $gap) = split(/\|/, $prevarray[$i]); } $tnum =~ tr/fr/rf/; $newprev .= $prevline."|".$links."|".$gap."\n" if($prevline ne ""); $prevline = $tnum."|".$sizetig; } $newprev .= $prevline; $prev = $newprev; } } } else{ $prev = "$orient"."_$search|size".length($seq); } $prev .= "|links$links|gaps$estimatedDistance" if($links ne ""); #print "$prev\n"; if($word ne "NA"){ ##### if(length($seq)<=$chunk){ $template = $seq; }else{ $template = substr($seq,0,$chunk); } ##### word search my $dynamic_word = $word; if($prevEstimatedDistance <= 0){ SCAN: until($template =~ /$dynamic_word/){ $dynamic_word = substr($dynamic_word,1,length($dynamic_word)); if(length($dynamic_word) < $min_tig_overlap){ $count_trim++; last SCAN if($count_trim >= $max_count_trim); $dynamic_word = substr($word,0,length($word)-$count_trim); } } } if($prevEstimatedDistance <= 0 && $seq =~ /^\S{0,$max_count_trim}$dynamic_word(.*)/){### will grab the left-most match which is ok my $tail = $1; my $all = "ERROR_"; while($prevseq =~ /^(.*)$dynamic_word/ig){ $all = $1; } print "$prevseq **** $all **** WORD:$word *** DWord:$dynamic_word *** COUNTTRIM:$count_trim\n" if($all=~/ERROR/); $prevseq = $all . lc($dynamic_word) . $tail; my $overlap = length($dynamic_word); $ct_merge++; print "$ct_merge. GROUNDS FOR MERGING ($overlap nt overlap) !!!\n" if($verbose); $headconcat .= "|merged$overlap"."\n".$prev; }else{ ### ADDED RLW 5.MAR.2010 if($prevEstimatedDistance <= 0){ $prevseq .= "n" . $seq }else{ $prevseq .= ("N" x $prevEstimatedDistance) . $seq; } $headconcat .= "\n".$prev; } }else{ $prevseq = $seq; $headconcat = "\n".$prev; $mct++; } ##### For the next search if(length($seq)<=$chunk){ $word = $seq; }else{ $word = substr($seq,length($seq)-$chunk/2,$chunk/2); ### this will be the next word to search with } $prevEstimatedDistance = $estimatedDistance; $prevLinks = $links; }#tig regex }#each tig my $scsz = length($prevseq); $scaffoldHashStart->{$sct}{'head'} = $headconcat; my @line = split(/\n/, $headconcat); print SCAFS ">$a[0].$mct|size$scsz|tigs".($#line)."$headconcat\n\n"; print OUT ">$a[0].$mct|size$scsz\n$prevseq\n"; $prevseq = ''; } close IN; close SCAFS; close OUT; CounterPrint(" "); undef $contigs; &FlushFiles(); return ($scaffoldHashStart, $evidence_file); } ###WRITE SUMMARY STATISTICS FOR ALL CONTIGS OR SCAFFOLDS sub writesummaryfiles{ my ($input_file, $insert, $sumfile) = @_; open (INFILE, $input_file) || die "Can't open input file $input_file.\n"; my ($counter, $sum, $seq, $name, $foundN50, $sumN50) = (0,0, "","", 0, 0); my (@line, @lengths); while () { s/\r\n/\n/; chomp; $seq.= $_ if(eof(INFILE)); if ($_ =~ /^[>]/ || eof(INFILE)) { if($counter > 0){ push(@lengths, length($seq)); $sum+= length($seq); ($seq) = ""; } $counter++; } else { $seq .= $_; } } $counter--; my $half_length = $sum/2; my @lengths2 = reverse sort { $a <=> $b } @lengths; for(my $i = 0; $i <= $#lengths && $foundN50 == 0; $i++) { $sumN50 += @lengths2[$i]; if($sumN50 >= $half_length){ $foundN50 = @lengths2[$i] if($sumN50 >= $half_length); last; } } $sumfile .= "\t\tTotal number of $insert"."s = $counter\n"; $sumfile .= "\t\tSum (bp) = ". $sum. "\n"; $sumfile .= "\t\tMax $insert size = ". @lengths2[0]."\n"; $sumfile .= "\t\tMin $insert size = ". @lengths2[$#lengths]."\n"; $sumfile .= "\t\tAverage $insert size = ".int($sum/$counter)."\n"; $sumfile .= "\t\tN50 = ". $foundN50. "\n\n"; close (INFILE); close OUTFILE; return $sumfile; } ###FUNCTION TO GENERATE A VISUALISATION OF THE SCAFFOLDS AND THEIR CONTIGS IN .DOT FORMAT sub visualiseScaffolds{ my ($dotname, $evidence) = @_; my ($filext, $sizecutoff) = (1, 5000000); mkpath('dotfiles'); my $filename2 = "dotfiles/$dotname.part".$filext.".dot"; &printMessage("\n=>".getDate().": Producing .dot file for visualisation ($filename2)\n"); open(IN,$evidence) || die "can't read $evidence -- fatal\n"; open(DOT, ">$filename2") || die "can't open $filename2 -- fatal\n"; printHeader(\*DOT, undef); my ($prevtig, $prevgap, $prevlinks, $prevratio, $scafcount) = ("","","", "",0); while(){ chomp; my $line = $_; my $filesize = -s $filename2; if ($line =~ /^[>]/){ endCluster(\*DOT) if($scafcount > 0); my $filesize = -s $filename2; if($filesize > $sizecutoff){ printFooter(\*DOT); close(DOT); $filext++; $filename2 = "$dotname.part".$filext.".dot"; open(DOT, ">$filename2") || die "can't open $filename2 -- fatal\n"; printHeader(\*DOT, undef); } $scafcount++; $line =~ tr/[>\|]/ /; startCluster(\*DOT, $scafcount, "$line"); ($prevtig, $prevgap, $prevlinks, $prevratio) = ("","","", ""); } elsif($line =~ /^[fr]/){ my @info = split(/\|/, $line); my ($tnum, $sizetig, $links, $gap); if($#info eq 1){ ($tnum, $sizetig) = split(/\|/, $line); }else{ ($tnum, $sizetig, $links, $gap) = split(/\|/, $line); } my ($orient, $tig) = split(/_/,$tnum); my $ori=-1; my ($other, $gap2) = split(/gaps/,$gap); my ($other, $links2) = split(/links/,$links); $ori = 1 if($orient eq "f"); printNode(\*DOT, $tig, "$tig ($sizetig)", $ori); printEdge(\*DOT, $prevtig, $tig, "gap = $prevgap links = $prevlinks", undef) if($prevtig ne ""); $prevtig = $tig; $prevgap = $gap2; $prevlinks = $links2; } } endCluster(\*DOT) if($scafcount > 0); printFooter(\*DOT); close(DOT); close IN; } ###FUNCTION TO REVERSE COMPLEMENT A SEQUENCE sub reverseComplement{ $_ = shift; tr/ATGC/TACG/; return (reverse()); } ###FUNCTION TO PRINT MESSAGES TO THE SCREEN AND TO THE LOG FILE sub printMessage{ my $message = shift; print $message; print LOG $message; } ###FUNCTION TO GET THE CURRENT DATE sub getDate{ my $date = scalar(localtime); return $date; } ###PRINTS A COUNTER ON THE SCREEN AND OVERWRITES PREVIOUS LINE sub CounterPrint{ my $countingMessager = shift; print "\r$countingMessager"; $|++; } ###FLUSHES THE SUMMARY AND LOG FILE sub FlushFiles{ select((select(SUMFILE), $| = 1)[0]); select((select(LOG), $| = 1)[0]); $|++; } #########END MAIN SCRIPT