# -*- perl -*- ########################################################################### # # Copyright (C) 1997-2015 Nigel P. Brown # # (i) License # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # # (ii) Contacts # # Project Admin: Nigel P. Brown # Email: biomview@gmail.com # Project URL: http://bio-mview.sourceforge.net # # (iii) Citation # # Please acknowledge use of this Program by citing the following reference in # any published work including web-sites: # # Brown, N.P., Leroy C., Sander C. (1998) MView: A Web compatible database # search or multiple alignment viewer. Bioinformatics. 14(4):380-381. # # and provide a link to the MView project URL given above under 'Contacts'. # ########################################################################### # $Id: FASTA1.pm,v 1.14 2015/06/14 17:09:04 npb Exp $ ########################################################################### # # Handles: fasta 1.x # # Acknowledgements. # # Christophe Leroy, 22/8/97 for original fasta1 parser. # ########################################################################### package NPB::Parse::Format::FASTA1; use NPB::Parse::Format::FASTA; use strict; use vars qw(@ISA @VERSIONS $NULL $ENTRY_START $ENTRY_END $HEADER_START $HEADER_END $RANK_START $RANK_END $TRAILER_START $TRAILER_END $MATCH_START $MATCH_END $SUM_START $SUM_END $ALN_START $ALN_END ); @ISA = qw(NPB::Parse::Format::FASTA); @VERSIONS = ( '1' => [ 'FASTA', ], ); $NULL = '^\s*$';#for emacs'; $ENTRY_START = '(?:' . '^\s*fasta.*searches a sequence data bank' . '|' . '^\s*\S+\s*[,:]\s+\d+\s+(?:aa|nt)' . ')'; $ENTRY_END = 'Library scan:'; $HEADER_START = $ENTRY_START; $HEADER_END = '^The best scores are:'; $RANK_START = $HEADER_END; $RANK_END = $NULL; $TRAILER_START = $ENTRY_END; $TRAILER_END = $ENTRY_END; $MATCH_START = '^\S{7}.*\d+\s+\d+\s+\d+\s*$';#for emacs'; $MATCH_END = "(?:$MATCH_START|$ENTRY_END)"; $SUM_START = $MATCH_START; $SUM_END = $NULL; $ALN_START = '^(?:\s+\d+\s+|\s+$)'; #the ruler $ALN_END = $MATCH_END; sub new { my $self=shift; $self->SUPER::new(@_) } ########################################################################### package NPB::Parse::Format::FASTA1::MATCH::ALN; use NPB::Parse::Math; use vars qw(@ISA); @ISA = qw(NPB::Parse::Format::FASTA::MATCH::ALN); sub new { my $type = shift; if (@_ < 2) { #at least two args, ($offset, $bytes are optional). NPB::Message::die($type, "new() invalid arguments (@_)"); } my ($parent, $text, $offset, $bytes) = (@_, -1, -1); my ($self, $line, $record); $self = new NPB::Parse::Record($type, $parent, $text, $offset, $bytes); $text = new NPB::Parse::Record_Stream($self); my ($query_start, $query_stop, $sbjct_start, $sbjct_stop) = (0,0,0,0); my ($query_leader,$query_trailer,$sbjct_leader,$sbjct_trailer) = (0,0,0,0); my ($query_length, $sbjct_length, $query_orient, $sbjct_orient); my ($query, $align, $sbjct, $len) = ('','',''); my ($x, $querykey, $sbjctkey, $depth); #first record my @tmp = (); #warn $text->inspect_stream; $line = $text->next_line(1); #initial empty line before the alignment ruler if ($line =~ /^$/) { $line = $text->next_line(1); } if ($line =~ /^\s*(\S+)\s+([^0-9 ]+)$/) { #no query ruler on first line, due to pure leading gaps seen #in ALIGN and ALIGN0 or very short sequences in LALIGN. #very short sequences can mean that start/stop is missing! $tmp[0] = ''; #missing ruler $tmp[1] = $line; #query sequence $tmp[2] = $text->next_line(1); #match pattern $tmp[3] = $text->next_line(1); #sbjct sequence $tmp[4] = $text->next_line(1); #sbjct ruler } elsif ($line =~ /^\s*\d+/) { #query ruler present on first line $tmp[0] = $line; #query ruler $tmp[1] = $text->next_line(1); #query sequence $tmp[2] = $text->next_line(1); #match pattern $tmp[3] = $text->next_line(1); #sbjct sequence $tmp[4] = $text->next_line(1); #sbjct ruler } elsif ($line =~ /^\s+$/) { #ruler has no numbers as sequence is too short $tmp[0] = ''; #missing ruler $tmp[1] = $text->next_line(1); #query sequence $tmp[2] = $text->next_line(1); #match pattern $tmp[3] = $text->next_line(1); #sbjct sequence $tmp[4] = $text->next_line(1); #sbjct ruler } else { $self->die("unexpected line: [$line]\n"); } #determine depth of sequence labels at left: take the shorter #since either sequence may begin with a gap $tmp[1] =~ /^(\s*\S+\s*)/; $depth = length $1; $tmp[3] =~ /^(\s*\S+\s*)/; $x = length $1; $depth = ($depth < $x ? $depth : $x); #recover sequence row names $querykey = substr($tmp[1], 0, $depth); $sbjctkey = substr($tmp[3], 0, $depth); #strip leading name $tmp[1] = substr($tmp[1], $depth); $tmp[2] = substr($tmp[2], $depth); $tmp[3] = substr($tmp[3], $depth); #warn "#0##$tmp[0]\n"; #warn "#1##$tmp[1]\n"; #warn "#2##$tmp[2]\n"; #warn "#3##$tmp[3]\n"; #warn "#4##$tmp[4]\n"; #query/sbjct/match lines #warn "|$tmp[1]|\n|$tmp[2]|\n|$tmp[3]|\n"; $len = max(length($tmp[1]), length($tmp[3])); $tmp[1] .= ' ' x ($len-length $tmp[1]) if length $tmp[1] < $len; $tmp[2] .= ' ' x ($len-length $tmp[2]) if length $tmp[2] < $len; $tmp[3] .= ' ' x ($len-length $tmp[3]) if length $tmp[3] < $len; $query = $tmp[1]; $align = $tmp[2]; $sbjct = $tmp[3]; #initialise query length $x = $tmp[1]; $x =~ tr/- //d; $query_length = length($x); #first query start/stop, if any if ($tmp[0] =~ /^\s*(\d+)/) { $query_start = $1; } if ($tmp[0] =~ /(\d+)\s*$/) { $query_stop = $1; } #compute length of query before query_start label $x = index($tmp[0], $query_start) + length($query_start) -1; $x = substr($tmp[1], 0, $x-$depth); #whole leader $x =~ tr/- //d; #leader minus gaps $query_leader = length($x); #initialise sbjct length $x = $tmp[3]; $x =~ tr/- //d; $sbjct_length = length($x); #first sbjct start/stop, if any if ($tmp[4] =~ /^\s*(\d+)/) { $sbjct_start = $1; } if ($tmp[4] =~ /(\d+)\s*$/) { $sbjct_stop = $1; } #compute length of sbjct before sbjct_start label $x = index($tmp[4], $sbjct_start) + length($sbjct_start) -1; $x = substr($tmp[3], 0, $x-$depth); #whole leader $x =~ tr/- //d; #leader minus gaps $sbjct_leader = length($x); #warn "ENTRY ($querykey, $sbjctkey)\n"; #warn "($query_length/$query_leader) ($sbjct_length/$sbjct_leader)\n"; #remaining records while (defined ($line = $text->next_line(1))) { next if $line =~ /^\s*$/; @tmp = (); if ($line =~ /^\s*\d+(?:\s|$)/) { #query ruler #warn "QUERY+RULER\n"; $tmp[0] = $line; #query ruler $line = $text->next_line(1); $tmp[1] = substr($line, $depth); #query sequence $line = $text->next_line(1); if ($line) { $tmp[2] = substr($line, $depth); #match pattern } else { $tmp[2] = ' ' x length $tmp[1]; } $line = $text->next_line(1); if ($line) { $tmp[3] = substr($line, $depth); #sbjct sequence } else { $tmp[3] = ' ' x length $tmp[1]; } $line = $text->next_line(1); if ($line) { $tmp[4] = $line; #sbjct ruler } else { $tmp[4] = ''; } } elsif (index($line, $sbjctkey) == 0) { #sbjct sequence #warn "SBJCT (####)\n"; if ($line) { $tmp[3] = substr($line, $depth); #sbjct sequence } else { $tmp[3] = ' ' x length $tmp[1]; } $line = $text->next_line(1); if ($line) { $tmp[4] = $line; #sbjct ruler } else { $tmp[4] = ''; } $tmp[0] = ''; $tmp[1] = ' ' x length $tmp[3]; $tmp[2] = ' ' x length $tmp[3]; } elsif (index($line, $querykey) == 0) { #query sequence (no ruler) #warn "QUERY (####)\n"; $tmp[1] = substr($line, $depth); #query sequence $line = $text->next_line(1); if ($line) { $tmp[2] = substr($line, $depth); #match pattern } else { $tmp[2] = ' ' x length $tmp[1]; } $line = $text->next_line(1); if ($line) { $tmp[3] = substr($line, $depth); #sbjct sequence } else { $tmp[3] = ' ' x length $tmp[1]; } $line = $text->next_line(1); if ($line) { $tmp[4] = $line; #sbjct ruler } else { $tmp[4] = ''; } $tmp[0] = ''; } else { $self->die("unexpected line: [$line]\n"); } #warn "#0##$tmp[0]\n"; #warn "#1##$tmp[1]\n"; #warn "#2##$tmp[2]\n"; #warn "#3##$tmp[3]\n"; #warn "#4##$tmp[4]\n"; #subsequent query start/stop, if any if ($query_start < 1 and $tmp[0] =~ /^\s*(\d+)/) { $query_start = $1; #compute length of query before query_start label $x = index($tmp[0], $query_start) + length($query_start) -1; $x = substr($tmp[1], 0, $x-$depth); #whole leader $x =~ tr/- //d; #leader minus gaps $query_leader = length($x); } if ($tmp[0] =~ /(\d+)\s*$/) { $query_stop = $1; #always update stop } #subsequent sbjct start/stop, if any if ($sbjct_start < 1 and $tmp[4] =~ /^\s*(\d+)/) { $sbjct_start = $1; } if ($tmp[4] =~ /(\d+)\s*$/) { $sbjct_stop = $1; #always update stop } #increment query length $x = $tmp[1]; $x =~ tr/- //d; $query_length += length($x); #increment sbjct length $x = $tmp[3]; $x =~ tr/- //d; $sbjct_length += length($x); #query/sbjct/match lines #warn "|$tmp[1]|\n|$tmp[2]|\n|$tmp[3]|\n"; $len = max(length($tmp[1]), length($tmp[3])); if (length $tmp[1] < $len) { $tmp[1] .= ' ' x ($len-length $tmp[1]); } if (length $tmp[2] < $len) { $tmp[2] .= ' ' x ($len-length $tmp[2]); } if (length $tmp[3] < $len) { $tmp[3] .= ' ' x ($len-length $tmp[3]); } $query .= $tmp[1]; $align .= $tmp[2]; $sbjct .= $tmp[3]; #warn "($query_length) ($sbjct_length)\n"; } #warn "$query\n$sbjct\n"; #warn $text->inspect_stream; #determine query orientation and start/stop #warn "QUERY ($query_start, $query_stop, $query_length, $query_leader)\n"; ($query_orient, $query_start, $query_stop) = $self->adjust_counts($query_start, $query_stop, $query_length, $query_leader, $self->query_base()); #determine sbjct orientation and start/stop #warn "SBJCT ($sbjct_start, $sbjct_stop, $sbjct_length, $sbjct_leader)\n"; ($sbjct_orient, $sbjct_start, $sbjct_stop) = $self->adjust_counts($sbjct_start, $sbjct_stop, $sbjct_length, $sbjct_leader, $self->sbjct_base()); #warn "EXIT ($query_orient, $query_start, $query_stop) ($sbjct_orient, $sbjct_start, $sbjct_stop)\n"; #query_leader/query_trailer $query =~ /^(\s*)/; $query_leader = length $1; $query =~ /(\s*)$/; $query_trailer = length $1; #sbjct_leader/sbjct_trailer $sbjct =~ /^(\s*)/; $sbjct_leader = length $1; $sbjct =~ /(\s*)$/; $sbjct_trailer = length $1; $self->{'query'} = $query; $self->{'align'} = $align; $self->{'sbjct'} = $sbjct; $self->{'query_orient'} = $query_orient; $self->{'query_start'} = $query_start; $self->{'query_stop'} = $query_stop; $self->{'query_leader'} = $query_leader; $self->{'query_trailer'} = $query_trailer; $self->{'sbjct_orient'} = $sbjct_orient; $self->{'sbjct_start'} = $sbjct_start; $self->{'sbjct_stop'} = $sbjct_stop; $self->{'sbjct_leader'} = $sbjct_leader; $self->{'sbjct_trailer'} = $sbjct_trailer; $self; } #Generic functions to fix start/stop positions suitable for any fasta member #that uses untranslated sequences where the stated sequence ranges use the #same numbering system as the displayed sequences. Must be overridden as #needed: # # fasta dna x dna or pro x pro (ok, base=1) # fasta[xy] dna x pro (override query, base=3) # tfast[axy] pro x dna (override sbjct, base=3) # sub query_base { return 1 } sub sbjct_base { return 1 } #determine orientation and adjust start/stop sub adjust_counts { my ($self, $start, $stop, $length, $leader, $base) = @_; my $orient = '+'; $orient = '-' if $start > $stop; $leader *= $base; $length *= $base; #counting untranslated sequence if ($orient eq '+') { $start -= $leader; $stop = $start + $length - 1; } else { $start += $leader; $stop = $start - $length + 1; } return ($orient, $start, $stop); } ########################################################################### 1;