#!/usr/bin/env python '''manipulate blat PSL file.''' #import built-in modules import os,sys import re import string from optparse import OptionParser import warnings #import third-party modules #changes to the paths #changing history to this module __author__ = "Liguo Wang" __copyright__ = "Copyright 2010, Wei Li's Lab" __credits__ = [] __license__ = "GPL" __version__ = "1.0.1" __maintainer__ = "Liguo Wang" __email__ = "liguow@bcm.edu" __status__ = "Development" #Prototype or Production class PSL: '''manipulate PSL format file (blat output file)''' def __init__(self,inputfile_name,score=20,block=2): '''initialize this class arg1: inputfile psl file arg2: score cutoff. only report alignments with score >= this value. default=20 arg3: block count cutoff. only report alignments with block number <= this value. default=2 eg: a=PSL("filename",30,2) or a=PSL.PSL("filename",30,2) ''' self.__inputfile=open(inputfile_name,'r') self.__scoreCutoff=score #matched score smaller than this value will be removed self.__blockCutoff=block #matched blcok bigger than this value will be removed self.__pslLine=re.compile(r'^\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+[+-]') #use this re to remove head lines self.__pslSplit=re.compile(r'\s+') self.__blankLine=re.compile(r'\s*\n') self.__data=[] totalLine=0 usedLine=0 nonPslLine=0 field=[] while True: self.__line=self.__inputfile.readline() if self.__blankLine.match(self.__line): #skip blank line continue elif self.__pslLine.match(self.__line): totalLine=totalLine+1 field=self.__pslSplit.split(self.__line) if string.atoi(field[0])< self.__scoreCutoff: continue elif string.atoi(field[17]) > self.__blockCutoff: continue usedLine = usedLine+1 self.__data.append(self.__line) else: nonPslLine = nonPslLine+1 if not self.__line:break #end of file print "\nTotal: ", totalLine, "lines" print "Used: ", usedLine, "lines" print "Non-PSL: ", nonPslLine,"lines","\n" def head(self,limit=10): '''print out header lines of PSL file, default first 10 lines eg: a.head(50) ''' count=0 #count how many lines have been printed field=[] for line in self.__data: line=line.rstrip("\n") print line count=count+1 if count >= limit:break def psl2bedFile(self,output_file=None): '''transform psl format into bed format. creat 6 column bed files. col1: target name, typically a chromosome col2: start coordinate col3: end coordinate col4: name of query. if a query is split. sequential number will be added at the end of the name col5: score. "matchScore.mismatch" col6: strand ''' if output_file is None: sys.stdout=sys.__stdout__ else: outfile=open(output_file,'w') sys.stdout=outfile field=[] blockSize=[] blockStart=[] for line in self.__data: line=line.rstrip('\n') field=self.__pslSplit.split(line) if string.atoi(field[17]) == 1: print field[13],"\t",field[15],"\t",field[16],"\t",field[9],"\t",field[0]+'.'+field[1],"\t",field[8] else: blockSize=field[18].split(',') blockSize.pop(-1) blockStart=field[20].split(',') blockStart.pop(-1) for i in range(0,len(blockSize)): print field[13],"\t",blockStart[i],"\t",string.atoi(blockStart[i])+string.atoi(blockSize[i]),"\t",field[9]+'.'+str(i+1),"\t",field[0]+'.'+field[1],"\t",field[8] def main(): parser=OptionParser() parser.add_option('-i','--input_file',dest="inputFileName",help="Input file name") parser.add_option('-o','--output_file',dest="outputFileName",help="Output file name") parser.add_option('-s','--score',dest="scoreCutoff",help="blat mapping score cutoff",type="int",default=20) parser.add_option('-b','--block_num',dest="blockCutoff",help="block number cutoff",type="int",default=2) (options,args)=parser.parse_args() obj=PSL(options.inputFileName,options.scoreCutoff,options.blockCutoff) obj.psl2bedFile(options.outputFileName) if __name__== '__main__': main() else: print >>sys.stderr, "module " + __name__ + " imported!"