#!/usr/bin/env ruby localdb = "sp" # database name from which homologues are collected # by locally installed blast. Leave this if you do # not use the '-l' option. mafftpath = "/opt/chipster/tools/mafft-7.130-without-extensions/bin/mafft" # path of mafft. "/usr/local/bin/mafft" # if mafft is in your command path, "mafft" is ok. blastpath = "blastall" # path of blastall. # if blastall is in your command path, "blastall" is ok. # mafft-homologs.rb v. 2.1 aligns sequences together with homologues # automatically collected from SwissProt via NCBI BLAST. # # mafft > 5.58 is required # # Usage: # mafft-homologs.rb [options] input > output # Options: # -a # the number of collected sequences (default: 50) # -e # threshold value (default: 1e-10) # -o "xxx" options for mafft # (default: " --op 1.53 --ep 0.123 --maxiterate 1000") # -l locally carries out blast searches instead of NCBI blast # (requires locally installed blast and a database) # -f outputs collected homologues also (default: off) # -w entire sequences are subjected to BLAST search # (default: well-aligned region only) require 'getopts' require 'tempfile' # mktemp GC.disable temp_vf = Tempfile.new("_vf").path temp_if = Tempfile.new("_if").path temp_pf = Tempfile.new("_pf").path temp_af = Tempfile.new("_af").path temp_qf = Tempfile.new("_qf").path temp_bf = Tempfile.new("_bf").path temp_rid = Tempfile.new("_rid").path temp_res = Tempfile.new("_res").path system( mafftpath + " --help > #{temp_vf} 2>&1" ) pfp = File.open( "#{temp_vf}", 'r' ) while pfp.gets break if $_ =~ /MAFFT v/ end pfp.close if( $_ ) then mafftversion = sub( /^\D*/, "" ).split(" ").slice(0).strip.to_s else mafftversion = "0" end if( mafftversion < "5.58" ) then puts "" puts "======================================================" puts "Install new mafft (v. >= 5.58)" puts "======================================================" puts "" exit end srand ( 0 ) def readfasta( fp, name, seq ) nseq = 0 tmpseq = "" while fp.gets if $_ =~ /^>/ then name.push( $_.sub(/>/,"").strip ) seq.push( tmpseq ) if nseq > 0 nseq += 1 tmpseq = "" else tmpseq += $_.strip end end seq.push( tmpseq ) return nseq end nadd = 50 eval = 1e-10 local = 0 fullout = 0 entiresearch = 0 corewin = 50 corethr = 0.3 mafftopt = " --op 1.53 --ep 0.123 --localpair --maxiterate 1000 --reorder " if getopts( "s", "f", "w", "l", "h", "e:", "a:", "o:", "c:", "d:" ) == nil || ARGV.length == 0 || $OPT_h then puts "Usage: #{$0} [-h -l -e# -a# -o\"[options for mafft]\"] input_file" exit end if $OPT_c then corewin = $OPT_c.to_i end if $OPT_d then corethr = $OPT_d.to_f end if $OPT_w entiresearch = 1 end if $OPT_f fullout = 1 end if $OPT_s fullout = 0 end if $OPT_l local = 1 end if $OPT_e then eval = $OPT_e.to_f end if $OPT_a then nadd = $OPT_a.to_i end if $OPT_o then mafftopt += " " + $OPT_o + " " end system "cat " + ARGV.to_s + " > #{temp_if}" ar = mafftopt.split(" ") nar = ar.length for i in 0..(nar-1) if ar[i] == "--seed" then system "cat #{ar[i+1]} >> #{temp_if}" end end nseq = 0 ifp = File.open( "#{temp_if}", 'r' ) while ifp.gets nseq += 1 if $_ =~ /^>/ end ifp.close if nseq >= 100 then STDERR.puts "The number of input sequences must be <100." exit elsif nseq == 1 then system( "cp #{temp_if}" + " #{temp_pf}" ) else STDERR.puts "Performing preliminary alignment .. " if entiresearch == 1 then # system( mafftpath + " --maxiterate 1000 --localpair #{temp_if} > #{temp_pf}" ) system( mafftpath + " --maxiterate 0 --retree 2 #{temp_if} > #{temp_pf}" ) else system( mafftpath + " --maxiterate 1000 --localpair --core --coreext --corethr #{corethr.to_s} --corewin #{corewin.to_s} #{temp_if} > #{temp_pf}" ) end end pfp = File.open( "#{temp_pf}", 'r' ) inname = [] inseq = [] slen = [] act = [] nin = 0 nin = readfasta( pfp, inname, inseq ) for i in 0..(nin-1) slen.push( inseq[i].gsub(/-/,"").length ) act.push( 1 ) end pfp.close pfp = File.open( "#{temp_if}", 'r' ) orname = [] orseq = [] nin = 0 nin = readfasta( pfp, orname, orseq ) pfp.close allen = inseq[0].length for i in 0..(nin-2) for j in (i+1)..(nin-1) next if act[i] == 0 next if act[j] == 0 pid = 0.0 total = 0 for a in 0..(allen-1) next if inseq[i][a,1] == "-" || inseq[j][a,1] == "-" total += 1 pid += 1.0 if inseq[i][a,1] == inseq[j][a,1] end pid /= total # puts "#{i.to_s}, #{j.to_s}, #{pid.to_s}" if pid > 0.5 then if slen[i] < slen[j] act[i] = 0 else act[j] = 0 end end end end #p act afp = File.open( "#{temp_af}", 'w' ) STDERR.puts "Searching .. \n" ids = [] add = [] sco = [] for i in 0..(nin-1) inseq[i].gsub!(/-/,"") afp.puts ">" + orname[i] afp.puts orseq[i] # afp.puts ">" + inname[i] # afp.puts inseq[i] STDERR.puts "Query (#{i+1}/#{nin})\n" + inname[i] if act[i] == 0 then STDERR.puts "Skip.\n\n" next end if local == 0 then command = "lynx -source 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?QUERY=" + inseq[i] + "&DATABASE=swissprot&HITLIST_SIZE=" + nadd.to_s + "&FILTER=L&EXPECT='" + eval.to_s + "'&FORMAT_TYPE=TEXT&PROGRAM=blastp&SERVICE=plain&NCBI_GI=on&PAGE=Proteins&CMD=Put' > #{temp_rid}" system command ridp = File.open( "#{temp_rid}", 'r' ) while ridp.gets break if $_ =~ / RID = (.*)/ end ridp.close rid = $1.strip STDERR.puts "Submitted to NCBI. rid = " + rid STDERR.printf "Waiting " while 1 STDERR.printf "." sleep 10 command = "lynx -source 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?RID=" + rid + "&DESCRIPTIONS=500&ALIGNMENTS=" + nadd.to_s + "&ALIGNMENT_TYPE=Pairwise&OVERVIEW=no&CMD=Get&FORMAT_TYPE=XML' > #{temp_res}" system command resp = File.open( "#{temp_res}", 'r' ) # resp.gets # if $_ =~ /WAITING/ then # resp.close # next # end while( resp.gets ) break if $_ =~ /QBlastInfoBegin/ end resp.gets if $_ =~ /WAITING/ then resp.close next else resp.close break end end else # puts "Not supported" # exit qfp = File.open( "#{temp_qf}", 'w' ) qfp.puts "> " qfp.puts inseq[i] qfp.close command = blastpath + " -p blastp -e #{eval} -b 1000 -m 7 -i #{temp_qf} -d #{localdb} > #{temp_res}" system command resp = File.open( "#{temp_res}", 'r' ) end STDERR.puts " Done.\n\n" resp = File.open( "#{temp_res}", 'r' ) while 1 while resp.gets break if $_ =~ /(.*)<\/Hit_id>/ || $_ =~ /()/ end id = $1 break if $_ =~ // # p id while resp.gets break if $_ =~ /(.*)<\/Hsp_bit-score>/ end score = $1.to_f # p score known = ids.index( id ) if known != nil then if sco[known] >= score then next else ids.delete_at( known ) add.delete_at( known ) sco.delete_at( known ) end end while resp.gets break if $_ =~ /(.*)<\/Hsp_hseq>/ end # break if $1 == nil target = $1.sub( /-/, "" ).sub( /U/, "X" ) # p target # STDERR.puts "adding 1 seq" ids.push( id ) sco.push( score ) add.push( target ) end resp.close end n = ids.length outnum = 0 while n > 0 && outnum < nadd m = rand( n ) afp.puts ">_addedbymaffte_" + ids[m] afp.puts add[m] ids.delete_at( m ) add.delete_at( m ) n -= 1 outnum += 1 end afp.close STDERR.puts "Performing alignment .. " system( mafftpath + mafftopt + " #{temp_af} > #{temp_bf}" ) STDERR.puts "done." bfp = File.open( "#{temp_bf}", 'r' ) outseq = [] outnam = [] readfasta( bfp, outnam, outseq ) bfp.close outseq2 = [] outnam2 = [] len = outseq.length for i in 0..(len-1) # p outnam[i] if fullout == 0 && outnam[i] =~ /_addedbymaffte_/ then next end outseq2.push( outseq[i] ) outnam2.push( outnam[i].sub( /_addedbymaffte_/, "_ho_" ) ) end nout = outseq2.length len = outseq[0].length p = len while p>0 p -= 1 allgap = 1 for j in 0..(nout-1) if outseq2[j][p,1] != "-" then allgap = 0 break end end if allgap == 1 then for j in 0..(nout-1) outseq2[j][p,1] = "" end end end for i in 0..(nout-1) puts ">" + outnam2[i] puts outseq2[i].gsub( /.{1,60}/, "\\0\n" ) end system( "rm -rf #{temp_if} #{temp_vf} #{temp_af} #{temp_bf} #{temp_pf} #{temp_qf} #{temp_res} #{temp_rid}" )