#!/usr/bin/env python #import built-in modules import os,sys import re import string from optparse import OptionParser import warnings import string import collections import math from string import maketrans #import gc from operator import itemgetter from itertools import groupby #import third-party modules import psyco_full from bx.bitset import * from bx.bitset_builders import * from bx.intervals import * from itertools import * import fasta import pysam #changes to the paths #changing history to this module #05/26/2011: suppport multiple spliced mapped reads __author__ = "Liguo Wang" __copyright__ = "Copyright 2010, Wei Li's Lab" __credits__ = [] __license__ = "GPL" __version__ = "1.1.0" __maintainer__ = "Liguo Wang" __email__ = "liguow@bcm.edu" __status__ = "Development" #Prototype or Production class ParseBED: '''manipulate BED (http://genome.ucsc.edu/FAQ/FAQformat.html) format file.''' def __init__(self,bedFile): '''This is constructor of ParseBED''' self.transtab = string.maketrans("ACGTNX","TGCANX") self.f=open(bedFile,'r') self.fileName=os.path.basename(bedFile) self.ABS_fileName=bedFile def groupingBED(infile,outfile=None,strand=True,boundary="utr"): '''Group overlapping bed entries together. When strand=Ture, overlapping bed entries within the same strand will be merged. When strand=False, all overlapping bed entries are merged, regardless of the strand. When boundary="utr" (recommended), UTR regions are considered in determining overlapping relationship. When boundary="cds", UTR resgions are NOT considered in determing overlapping relationship''' f=open(infile,'r') bed_line=re.compile(r'^\S+\s+\d+\s+\d+') groups = collections.defaultdict(list) #key is internal ID, value is list of bed lines groups_boundary_st = {} #key is internal ID, value is leftmost boundary groups_boundary_end = {} #key is internal ID, value is rightmost boundary Orig_bedNum=0 cluster_Num=0 #check if the input bed file is properly sorted print >>sys.stderr,"check if input bed file is sorted properly ...", i=0 for line in f: line=line.rstrip('\r\n') line=line.lstrip() if bed_line.match(line): fields=line.split() i +=1 if i==1: chrom = fields[0] txStart=int(fields[1]) if i>1: if ((fields[0] < chrom) or ((fields[0] == chrom) and (int(fields[1]) >sys.stderr, "File not properly sorted:" + line exit(1) chrom = fields[0] txStart=int(fields[1]) else: #well, file seems to be OK print >>sys.stderr,"OK!" f.seek(0) if strand: #want to consider strand information. for line in f: line=line.rstrip('\r\n') line=line.lstrip() fields=line.split() if bed_line.match(line) and len(fields)==12: if fields[5] == '-':continue #merge overlapping transcripts on plus strand Orig_bedNum +=1 if Orig_bedNum ==1: #this is first bed entry overlap_flag=0 #has nothing to overlap because it's first entry chrom = fields[0] strand = fields[5] elif Orig_bedNum > 1: #this is NOT first entry overlap_flag=1 #we suppose current line is overlapped with previous one. Unless it can prove it's NOT! if fields[0] != chrom: #not the same chromosome overlap_flag=0 if (boundary == "utr") and ( fields[0] == chrom) and (int(fields[1]) >= groups_boundary_end[cluster_Num]): overlap_flag=0 if (boundary == "cds") and ( fields[0] == chrom) and (int(fields[6]) > groups_boundary_end[cluster_Num]): overlap_flag=0 if overlap_flag ==0: cluster_Num +=1 groups[cluster_Num].append(line) if (boundary == "utr"): groups_boundary_st[cluster_Num]=int(fields[1]) groups_boundary_end[cluster_Num]=int(fields[2]) if (boundary == "cds"): groups_boundary_st[cluster_Num]=int(fields[6]) groups_boundary_end[cluster_Num]=int(fields[7]) strand = fields[5] chrom = fields[0] elif overlap_flag ==1: groups[cluster_Num].append(line) strand = fields[5] chrom = fields[0] if (boundary == "utr"): groups_boundary_st[cluster_Num]=min(groups_boundary_st[cluster_Num],int(fields[1])) groups_boundary_end[cluster_Num] = max(groups_boundary_end[cluster_Num],int(fields[2])) if (boundary == "cds"): groups_boundary_st[cluster_Num]=min(groups_boundary_st[cluster_Num],int(fields[6])) groups_boundary_end[cluster_Num] = max(groups_boundary_end[cluster_Num],int(fields[7])) else: print >>sys.stderr,"unknown line:" + line +'\n' f.seek(0) #merge overlapping transcripts on minus strand Orig_bedNum=0 for line in f: line=line.rstrip('\r\n') line=line.lstrip() fields=line.split() if bed_line.match(line) and len(fields)==12: if fields[5] == '+':continue #merge overlapping transcripts on plus strand Orig_bedNum +=1 if Orig_bedNum ==1: #this is first bed entry overlap_flag=0 #has nothing to overlap because it's first entry chrom = fields[0] strand = fields[5] elif Orig_bedNum > 1: #this is NOT first entry overlap_flag=1 #we suppose current line is overlapped with previous one. Unless it can prove it's NOT! if fields[0] != chrom: #not the same chromosome overlap_flag=0 if (boundary == "utr") and ( fields[0] == chrom) and (int(fields[1]) >= groups_boundary_end[cluster_Num]): overlap_flag=0 if (boundary == "cds") and ( fields[0] == chrom) and (int(fields[6]) > groups_boundary_end[cluster_Num]): overlap_flag=0 if overlap_flag ==0: cluster_Num +=1 groups[cluster_Num].append(line) if (boundary == "utr"): groups_boundary_st[cluster_Num]=int(fields[1]) groups_boundary_end[cluster_Num]=int(fields[2]) if (boundary == "cds"): groups_boundary_st[cluster_Num]=int(fields[6]) groups_boundary_end[cluster_Num]=int(fields[7]) strand = fields[5] chrom = fields[0] elif overlap_flag ==1: groups[cluster_Num].append(line) strand = fields[5] chrom = fields[0] if (boundary == "utr"): groups_boundary_st[cluster_Num]=min(groups_boundary_st[cluster_Num],int(fields[1])) groups_boundary_end[cluster_Num] = max(groups_boundary_end[cluster_Num],int(fields[2])) if (boundary == "cds"): groups_boundary_st[cluster_Num]=min(groups_boundary_st[cluster_Num],int(fields[6])) groups_boundary_end[cluster_Num] = max(groups_boundary_end[cluster_Num],int(fields[7])) else: print >>sys.stderr,"unknown line:" + line +'\n' f.seek(0) else: # NOT want to consider strand information. merge + and - together Orig_bedNum=0 cluster_Num=0 for line in f: line=line.rstrip('\r\n') line=line.lstrip() fields=line.split() if bed_line.match(line) and len(fields)==12: #if fields[5] == '-':continue #merge overlapping transcripts on plus strand Orig_bedNum +=1 if Orig_bedNum ==1: #this is first bed entry overlap_flag=0 #has nothing to overlap because it's first entry chrom = fields[0] strand = fields[5] elif Orig_bedNum > 1: #this is NOT first entry overlap_flag=1 #we suppose current line is overlapped with previous one. Unless it can prove it's NOT! if fields[0] != chrom: #not the same chromosome overlap_flag=0 if (boundary == "utr") and ( fields[0] == chrom) and (int(fields[1]) >= groups_boundary_end[cluster_Num]): overlap_flag=0 if (boundary == "cds") and ( fields[0] == chrom) and (int(fields[6]) > groups_boundary_end[cluster_Num]): overlap_flag=0 if overlap_flag ==0: cluster_Num +=1 groups[cluster_Num].append(line) if (boundary == "utr"): groups_boundary_st[cluster_Num]=int(fields[1]) groups_boundary_end[cluster_Num]=int(fields[2]) if (boundary == "cds"): groups_boundary_st[cluster_Num]=int(fields[6]) groups_boundary_end[cluster_Num]=int(fields[7]) strand = fields[5] chrom = fields[0] elif overlap_flag ==1: groups[cluster_Num].append(line) strand = fields[5] chrom = fields[0] if (boundary == "utr"): groups_boundary_st[cluster_Num]=min(groups_boundary_st[cluster_Num],int(fields[1])) groups_boundary_end[cluster_Num] = max(groups_boundary_end[cluster_Num],int(fields[2])) if (boundary == "cds"): groups_boundary_st[cluster_Num]=min(groups_boundary_st[cluster_Num],int(fields[6])) groups_boundary_end[cluster_Num] = max(groups_boundary_end[cluster_Num],int(fields[7])) else: print >>sys.stderr,"unknown line:" + line +'\n' f.seek(0) try: FO=open(outfile,'w') print >>sys.stderr, "Writing to " + outfile + "\n" for k in sorted(groups.iterkeys()): FO.write( "Group_"+str(k) + "\t" + str(groups_boundary_st[k]) +'\t' + str(groups_boundary_end[k]) + ":\n") for line in groups[k]: FO.write("\t"+line+"\n") except: #return (groups,groups_boundary_st,groups_boundary_end) return groups groupingBED=staticmethod(groupingBED) def complementBED(self,sizeFile,outfile=None): '''Return the complement regions of a bed file. Requires a genomeSizeFile that maps chromosome to sizes. In genomeSizeFile, each row contains two columns (separaed by white spaces): the 1st column is chromosome, and the 2nd column is size. An example of sizeFile: chr1 247249719 chr2 242951149 chr3 199501827 ... ''' if outfile is None: outfile=self.fileName + ".compl.bed" FO=open(outfile,'w') #read sizeFile chrSize={} for line in open(sizeFile): line=line.rstrip(' \n') line=line.lstrip() fields = line.split() if ((len(fields)) != 2):continue chrSize[fields[0]] = int(fields[1]) bitsets = binned_bitsets_from_file( open( self.ABS_fileName ) ) for chrom in chrSize: if chrom in bitsets: bits = bitsets[chrom] bits.invert() length = chrSize[chrom] end=0 while 1: start = bits.next_set( end ) if start == bits.size: break end = bits.next_clear( start ) if end > length: end = length FO.write(chrom + "\t" + str(start) + "\t" + str(end) + "\n") if end == length: break else: FO.write(chrom + "\t0\t" + str(chrSize[chrom]) + "\n") FO.close() self.f.seek(0) def bedToWig(self,outfile=None,log2scale=False,header=True): '''Transform bed file into wiggle format. Input bed must have at least 3 columns[chrom St End]. For bed12 file, intron regions are automatically excluded. NOTE bed is 0-based and half-open while wiggle is 1-based.''' if outfile is None: outfile = self.fileName + ".wig" FO=open(outfile,'w') wig=collections.defaultdict(dict) headline="track type=wiggle_0 name=" + outfile + " track_label description='' visibility=full color=255,0,0" print >>sys.stderr, "Writing wig file to \"",outfile,"\"..." for line in self.f: line=line.rstrip('\r\n') if line.startswith('#'):continue if line.startswith('track'):continue if line.startswith('browser'):continue fields=line.rstrip('\n').split() coverReg=[] if len(fields)<12 and len(fields)>2 : chrom=fields[0] coverReg = range(int(fields[1])+1,int(fields[2])+1) elif len(fields)==12: chrom=fields[0] exon_sizes = map(int,fields[10].rstrip(',\n').split(',')) exon_starts = map(int, fields[11].rstrip( ',\n' ).split( ',' ) ) for st,size in zip(exon_starts,exon_sizes): coverReg.extend(range(int(fields[1]) + st + 1, int(fields[1]) + st + size +1 )) else:continue for i in coverReg: if wig[chrom].has_key(i): wig[chrom][i] +=1 else: wig[chrom][i]=1 if header:FO.write(headline + "\n") for chr in sorted(wig.keys()): print >>sys.stderr, "Writing ",chr, " ..." FO.write('variableStep chrom='+chr+'\n') for coord in sorted(wig[chr]): if log2scale: FO.write("%d\t%5.3f\n" % (coord,math.log(wig[chr][coord],2))) else: FO.write("%d\t%d\n" % (coord,wig[chr][coord])) self.f.seek(0) FO.close() def bedToGFF(self,outfile=None): '''Transform bed file into GFF format. Borrowed from Galaxy with slight change''' input_name = self.ABS_fileName if outfile is None: output_name = self.fileName + ".GFF" else: output_name=outfile skipped_lines = 0 first_skipped_line = 0 out = open( output_name, 'w' ) out.write( "##gff-version 2\n" ) out.write( "##bed_to_gff_converter.py\n\n" ) i = 0 for i, line in enumerate( self.f ): complete_bed = False line = line.rstrip( '\r\n' ) if line and not line.startswith( '#' ) and not line.startswith( 'track' ) and not line.startswith( 'browser' ): try: elems = line.split( '\t' ) if len( elems ) == 12: complete_bed = True chrom = elems[0] if complete_bed: feature = "mRNA" else: try: feature = elems[3] except: feature = 'feature%d' % ( i + 1 ) start = int( elems[1] ) + 1 end = int( elems[2] ) try: score = elems[4] except: score = '0' try: strand = elems[5] except: strand = '+' try: group = elems[3] except: group = 'group%d' % ( i + 1 ) if complete_bed: out.write( '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s %s;\n' % ( chrom, feature, start, end, score, strand, feature, group ) ) else: out.write( '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s;\n' % ( chrom, feature, start, end, score, strand, group ) ) if complete_bed: # We have all the info necessary to annotate exons for genes and mRNAs block_count = int( elems[9] ) block_sizes = elems[10].split( ',' ) block_starts = elems[11].split( ',' ) for j in range( block_count ): exon_start = int( start ) + int( block_starts[j] ) exon_end = exon_start + int( block_sizes[j] ) - 1 out.write( '%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\texon %s;\n' % ( chrom, exon_start, exon_end, score, strand, group ) ) except: skipped_lines += 1 if not first_skipped_line: first_skipped_line = i + 1 else: skipped_lines += 1 if not first_skipped_line: first_skipped_line = i + 1 out.close() info_msg = "%i lines converted to GFF version 2. " % ( i + 1 - skipped_lines ) if skipped_lines > 0: info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line ) print info_msg def getUTR(self,utr=35): '''Extract UTR regions from input bed file (must be 12-column). output is 6-column bed format. When utr=35 [default], extract both 5' and 3' UTR. When utr=3, only extract 3' UTR. When utr=5, only extract 5' UTR''' ret_lst=[] for line in self.f: if line.startswith('#'):continue if line.startswith('track'):continue if line.startswith('browser'):continue fields=line.rstrip('\r\n').split() chrom=fields[0] geneName=fields[3] strand=fields[5] txStart=int(fields[1]) txEnd=int(fields[2]) cdsStart=int(fields[6]) cdsEnd=int(fields[7]) exon_start=map(int,fields[11].rstrip(',').split(',')) exon_start=map((lambda x: x + txStart),exon_start) exon_end=map(int,fields[10].rstrip(',').split(',')) exon_end=map((lambda x,y:x+y),exon_start,exon_end) if strand == '+': if (utr==35 or utr==5): for st,end in zip(exon_start,exon_end): if st < cdsStart: utr_st = st utr_end = min(end,cdsStart) ret_lst.append([chrom,utr_st,utr_end,geneName,'0',strand]) if (utr==35 or utr==3): for st,end in zip(exon_start,exon_end): if end > cdsEnd: utr_st = max(st, cdsEnd) utr_end = end ret_lst.append([chrom,utr_st,utr_end,geneName,'0',strand]) if strand == '-': if (utr==35 or utr==3): for st,end in zip(exon_start,exon_end): if st < cdsStart: utr_st = st utr_end = min(end,cdsStart) ret_lst.append([chrom,utr_st,utr_end,geneName,'0',strand]) if (utr==35 or utr==5): for st,end in zip(exon_start,exon_end): if end > cdsEnd: utr_st = max(st, cdsEnd) utr_end = end ret_lst.append([chrom,utr_st,utr_end,geneName,'0',strand]) self.f.seek(0) return ret_lst def getExon(self): '''Extract exon regions from input bed file (must be 12-column). output is 6-column Tab separated bed file, each row represents one exon''' ret_lst=[] for line in self.f: try: if line.startswith('#'):continue if line.startswith('track'):continue if line.startswith('browser'):continue fields=line.rstrip('\r\n').split() txStart=int(fields[1]) chrom=fields[0] strand=fields[5] geneName=fields[3] score=fields[4] exon_start=map(int,fields[11].rstrip(',').split(',')) exon_start=map((lambda x: x + txStart),exon_start) exon_end=map(int,fields[10].rstrip(',').split(',')) exon_end=map((lambda x,y:x+y),exon_start,exon_end) except: print >>sys.stderr,"[NOTE:input bed must be 12-column] skipped this line: " + line, continue if(strand == '-'): exonNum=len(exon_start) for st,end in zip(exon_start,exon_end): #FO.write(chrom + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_exon_" + str(exonNum) + "\t"+ str(score) + "\t" + strand + '\n') #exonNum -= 1 ret_lst.append([chrom,st,end]) else: exonNum=1 for st,end in zip(exon_start,exon_end): #FO.write(chrom + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_exon_" + str(exonNum) + "\t"+ str(score) +"\t" + strand + '\n') #exonNum += 1 ret_lst.append([chrom,st,end]) self.f.seek(0) return ret_lst def getTranscriptRanges(self): '''Extract exon regions from input bed file (must be 12-column). Return ranges of transcript''' mRNA_ranges = [] for line in self.f: try: if line.startswith('#'):continue if line.startswith('track'):continue if line.startswith('browser'):continue fields=line.rstrip('\r\n').split() txStart = int(fields[1]) txEnd = int(fields[2]) chrom = fields[0] strand = fields[5] geneName = fields[3] yield([chrom, txStart, txEnd, strand, geneName + ":" + chrom + ":" + str(txStart) + '-' + str(txEnd)]) except: print >>sys.stderr,"[NOTE:input bed must be 12-column] skipped this line: " + line, continue self.f.seek(0) def getCDSExon(self): '''Extract CDS exon regions from input bed file (must be 12-column).''' ret_lst=[] for f in self.f: f = f.strip().split() chrom = f[0] chrom_start = int(f[1]) name = f[4] strand = f[5] cdsStart = int(f[6]) cdsEnd = int(f[7]) blockCount = int(f[9]) blockSizes = [ int(i) for i in f[10].strip(',').split(',') ] blockStarts = [ chrom_start + int(i) for i in f[11].strip(',').split(',') ] # grab cdsStart - cdsEnd cds_exons = [] cds_seq = '' genome_seq_index = [] for base,offset in zip( blockStarts, blockSizes ): if (base + offset) < cdsStart: continue if base > cdsEnd: continue exon_start = max( base, cdsStart ) exon_end = min( base+offset, cdsEnd ) #cds_exons.append( (exon_start, exon_end) ) ret_lst.append([chrom,exon_start,exon_end]) self.f.seek(0) return ret_lst def truncate_bed(self, truncation_from=3, size = 250): ''' truncate bed from either the 5' end or 3' end. truncation_from: 3 or 5 size = truncation size (default 250 bp) ''' ret_lst=[] for f in self.f: gene_all_bases = [] chose_bases = [] f = f.strip().split() chrom = f[0] txStart = int(f[1]) txEnd = int(f[2]) score = f[4] name = f[3] strand = f[5] exon_start=map(int,f[11].rstrip(',').split(',')) exon_start=map((lambda x: x + txStart),exon_start) exon_end=map(int,f[10].rstrip(',').split(',')) exon_end=map((lambda x,y:x+y),exon_start,exon_end) for st,end in zip(exon_start,exon_end): gene_all_bases.extend(range(st+1,end+1)) #1-based, closed gene_all_bases = sorted(gene_all_bases) new_chrom = chrom #1 new_name = name #4 new_score = score #5 new_strand = strand #6 if truncation_from == 3: if strand == '+': chose_bases = gene_all_bases[-size:] elif strand == '-': chose_bases = gene_all_bases[:size] else: print >>sys.stderr, 'unknown strand ' + strand elif truncation_from == 5: if strand == '+': chose_bases = gene_all_bases[:size] elif strand == '-': chose_bases = gene_all_bases[-size:] else: print >>sys.stderr, 'unknown strand ' + strand else: print >>sys.stderr, "truncation_from takes '3' or '5'" new_tx_start = chose_bases[0]-1 #2 new_tx_end = chose_bases[-1] #3 new_thick_start = new_tx_start #7 new_thick_end = new_tx_end #8 new_igb = f[8] #9 exon_boundarie = [] for k, g in groupby(enumerate(chose_bases), lambda (i,x):i-x): group = map(itemgetter(1),g) exon_boundarie.append((group[0],group[-1])) #eg [(st1, end1), (st2, end2)] new_block_count = len(exon_boundarie) #10 tmp = [] for i,j in (exon_boundarie): tmp.append(str(j-i+1)) new_block_sizes = ','.join(tmp) #11 tmp = [] for i,j in (exon_boundarie): tmp.append(str(i - 1 - new_tx_start)) new_block_starts = ','.join(tmp) #12 ret_lst.append('\t'.join([str(i) for i in [new_chrom,new_tx_start,new_tx_end,new_name,new_score,new_strand,new_thick_start,new_thick_end,new_igb,new_block_count,new_block_sizes,new_block_starts]])) self.f.seek(0) return ret_lst def getIntron(self): '''Extract Intron regions from input bed file (must be 12-column). output is 6-column Tab separated bed file, each row represents one intron''' ret_lst=[] for line in self.f: try: if line.startswith('#'):continue if line.startswith('track'):continue if line.startswith('browser'):continue # Parse fields from gene tabls fields = line.split() chrom = fields[0] tx_start = int( fields[1] ) tx_end = int( fields[2] ) geneName = fields[3] strand = fields[5].replace(" ","_") cds_start = int( fields[6] ) cds_end = int( fields[7] ) if int(fields[9] ==1): continue exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map((lambda x: x + tx_start ), exon_starts) exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); intron_start = exon_ends[:-1] intron_end=exon_starts[1:] if(strand == '-'): intronNum=len(intron_start) for st,end in zip(intron_start,intron_end): #FO.write(chrom + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_intron_" + str(intronNum) + "\t0\t" + strand + '\n') #intronNum -= 1 ret_lst.append([chrom,st,end]) else: intronNum=1 for st,end in zip(intron_start,intron_end): #FO.write(chrom + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_intron_" + str(intronNum) + "\t0\t" + strand + '\n') #intronNum += 1 ret_lst.append([chrom,st,end]) except: print >>sys.stderr,"[NOTE:input bed must be 12-column] skipped this line: " + line, continue self.f.seek(0) return ret_lst def getSpliceJunctions(self): ''' Return splice junctions for each transcripts. Single exon gene will be skipped. ''' for line in self.f: try: tmp = [] if line.startswith('#'):continue if line.startswith('track'):continue if line.startswith('browser'):continue # Parse fields from gene tabls fields = line.split() chrom = fields[0] tx_start = int( fields[1] ) tx_end = int( fields[2] ) geneName = fields[3] if int(fields[9]) == 1: #skip single exon gene yield ((geneName, chrom, tx_start, tx_end, tmp)) continue exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map((lambda x: x + tx_start ), exon_starts) exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); intron_start = exon_ends[:-1] intron_end=exon_starts[1:] for st,end in zip(intron_start,intron_end): tmp.append(chrom + ':' + str(st) + '-' + str(end)) yield ((geneName, chrom, tx_start, tx_end, tmp)) except: print >>sys.stderr,"[NOTE:input bed must be 12-column] skipped this line: " + line, continue def getIntergenic(self,direction='up', size=1000): '''get intergenic regions. direction=up or down or both.''' ret_lst=[] for line in self.f: if line.startswith(('#','track','browser')): continue fields = line.split() chrom = fields[0] tx_start = int( fields[1] ) tx_end = int( fields[2] ) strand = fields[5] if(direction=="up" or direction=="both"): if strand=='-': region_st=tx_end region_end=tx_end +size else: region_st = max(tx_start-size,0) region_end=tx_start ret_lst.append([chrom,region_st,region_end]) if (direction=="down" or direction=="both"): if strand == '-': region_st = max(0,tx_start-size) region_end = tx_start else: region_st = tx_end region_end = tx_end+size ret_lst.append([chrom,region_st,region_end]) self.f.seek(0) return ret_lst def getBedinfor(self,outfile=None, reffa=None): '''Extract information (such as exonNumber, exonSize (min,max,mean)etc,.) from bed entries. ''' if reffa is not None: print >>sys.stderr, "Indexing " + reffa + ' ...', pysam.faidx(reffa) print >>sys.stderr, "Done!" transtab = maketrans("ACGTNX","TGCANX") if outfile is None:outfile = self.fileName + ".infor.xls" FO=open(outfile,'w') print >>sys.stderr,"writing feature information to " + outfile + " ..." intron_sizes=[] FO.write("geneID\t" + "Exon_Num\t" + "Exon_Len_Sum\t" + "Exon_Len_Min\t" + "Exon_Len_Max\t" + "Exon_Len_Avg\t") FO.write("Intron_Len_Sum\t" + "Intron_Len_Min\t" + "Intron_Len_Max\t" + "Intron_Len_Avg\t" + "3'UTR\t" + "5'UTR\t" + "GC\t" + "\n") for line in self.f: intron_sizes=[] mRNA_seq = '' GC_content = 0.0 utr5len = 0 utr3len = 0 try: if line.startswith(('#','track','browser')):continue # Parse fields from gene tabls fields = line.split() chrom = fields[0] tx_start = int( fields[1] ) tx_end = int( fields[2] ) geneName = fields[3] strand = fields[5].replace(" ","_") cds_start = int( fields[6] ) cds_end = int( fields[7] ) exon_num= int(fields[9]) exon_sizes = map(int,fields[10].rstrip(',\n').split(',')) exon_starts = map(int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map((lambda x: x + tx_start ), exon_starts) exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); intron_start = exon_ends[:-1] intron_end=exon_starts[1:] if reffa is not None: for st,end in zip(exon_starts, exon_ends): exon_coord = chrom + ':' + str(st +1) + '-' + str(end) tmp = pysam.faidx(reffa,exon_coord) mRNA_seq += ''.join([i.rstrip('\n\r') for i in tmp[1:]]) if strand == '-': mRNA_seq = mRNA_seq.upper().translate(transtab)[::-1] elif strand == '+': mRNA_seq = mRNA_seq.upper() GC_content = (mRNA_seq.count('C') + mRNA_seq.count('G'))*1.0/len(mRNA_seq) for st,end in zip(exon_starts,exon_ends): if strand == '+': if st < cds_start: # 5' UTR utr_st = st utr_end = min(end,cds_start) utr5len += (utr_end - utr_st) if end > cds_end: # 3' UTR utr_st = max(st, cds_end) utr_end = end utr3len += (utr_end - utr_st) if strand == '-': if st < cds_start: # 3' UTR utr_st = st utr_end = min(end,cds_start) utr3len += (utr_end - utr_st) if end > cds_end: # 5' UTR utr_st = max(st, cds_end) utr_end = end utr5len += (utr_end - utr_st) if cds_start == cds_end: utr3len = 0 utr5len = 0 except: print >>sys.stderr,"[NOTE:input bed must be 12-column] skipped this line: " + line, continue for st,end in zip(intron_start,intron_end): intron_sizes.append(end-st) FO.write(chrom +':'+str(tx_start+1)+':'+str(tx_end)+ ':' + geneName +'\t') FO.write(str(exon_num) + "\t" + str(sum(exon_sizes)) +'\t' +str(min(exon_sizes)) + '\t' + str(max(exon_sizes)) +'\t' + str(sum(exon_sizes)/exon_num) +"\t") if intron_sizes: FO.write(str(sum(intron_sizes)) +'\t' + str(min(intron_sizes)) + '\t' + str(max(intron_sizes)) +'\t' + str(sum(intron_sizes)/len(intron_sizes)) + "\t") else: FO.write('0' +'\t' + '0' + '\t' + '0' +'\t' + '0' + "\t") FO.write(str(utr3len) + '\t' + str(utr5len) + '\t' ) if reffa is not None: FO.write(str(GC_content) + '\n') else: FO.write("NA" + '\n') self.f.seek(0) FO.close() def filterBedbyIntronSize(self,outfile=None,min_intron=50): '''Filter bed files with intron size. Mamalian gene has minimum intron size ''' if outfile is None:outfile = self.fileName + ".filterIntron.xls" FO=open(outfile,'w') print >>sys.stderr,"writing feature information to " + outfile + " ..." intron_sizes=[] for line in self.f: intron_sizes=[] flag=0 try: if line.startswith(('#','track','browser')):continue # Parse fields from gene tabls fields = line.split() chrom = fields[0] tx_start = int( fields[1] ) tx_end = int( fields[2] ) geneName = fields[3] strand = fields[5].replace(" ","_") cds_start = int( fields[6] ) cds_end = int( fields[7] ) exon_num= int(fields[9]) exon_sizes = map(int,fields[10].rstrip(',\n').split(',')) exon_starts = map(int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map((lambda x: x + tx_start ), exon_starts) exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); intron_start = exon_ends[:-1] intron_end=exon_starts[1:] except: print >>sys.stderr,"[NOTE:input bed must be 12-column] skipped this line: " + line, continue if exon_num <=1: #intron size is 0 continue for st,end in zip(intron_start,intron_end): if end-st <=min_intron: flag=1 break if flag == 0: FO.write(line) self.f.seek(0) FO.close() def getAllUniqJunctions(self,outfile=None,flankSize=20): '''Extract unique (non-redundant) junctions from input bed file (must be 12-column). use flankSize to represent junction. Note that too large flankSize could exceed chromosome boundary and raise error when you convert bed to bigbed. output 12-column Tab separated bed file''' if outfile is None:outfile = self.fileName + ".uniqJunctions.bed" FO=open(outfile,'w') print >>sys.stderr,"writing unique junctions " + outfile + " ..." uniqJunc = collections.defaultdict(int) for line in self.f: #try: if line.startswith('#'):continue if line.startswith('track'):continue if line.startswith('browser'):continue fields = line.split() chrom = fields[0] tx_start = int( fields[1] ) tx_end = int( fields[2] ) geneName = fields[3] strand = fields[5].replace(" ","_") cds_start = int( fields[6] ) cds_end = int( fields[7] ) if int(fields[9] ==1): continue exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map((lambda x: x + tx_start ), exon_starts) exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); intron_start = exon_ends[:-1] intron_end=exon_starts[1:] for st,end in zip(intron_start,intron_end): key=chrom + str(st) + str(end) if not uniqJunc.has_key(key): FO.write(chrom + "\t" + str(st-flankSize) + "\t" + str(end+flankSize) + "\t" + geneName + "\t0\t" + strand + '\t' + str(st-flankSize) + "\t" + str(end+flankSize) + '\t0,0,0\t2\t' + str(flankSize)+','+str(flankSize)+'\t' + '0,' + str(end-st+flankSize) +'\n') uniqJunc[key]+=1 #except: # print >>sys.stderr,"[NOTE:input bed must be 12-column] skipped this line: " + line, # continue self.f.seek(0) FO.close() def collapseJunctionBed(self,outfile=None): '''Junctions spannig the same block will be merged. multiple spliced junctions will be reported as is without any changes''' if outfile is None:outfile = self.fileName + ".collapsed.bed" FO=open(outfile,'w') print >>sys.stderr,"\tCollapse junctions for " + self.fileName, tss_start =collections.defaultdict(list) tss_end =collections.defaultdict(list) exon_size1 =collections.defaultdict(list) exon_size2 =collections.defaultdict(list) block_start1 =collections.defaultdict(list) block_start2 =collections.defaultdict(list) count = collections.defaultdict(int) chrm=dict() strands=dict() for line in self.f: skip_flag=0 try: if line.startswith(('#','track','broser')):continue fields = line.rstrip().split() exonSize = map(int, fields[10].rstrip(',\n').split(',')) chrom = fields[0] tx_start = int( fields[1] ) tx_end = int( fields[2] ) geneName = fields[3] score = int(fields[4]) strand = fields[5].replace(" ","_") if int(fields[9]) ==1: #if bed has only 1 block print >>FO,line, continue if int(fields[9]) >=3: #if bed has more than 3 blocks print >>FO,line, continue except: print >>sys.stderr,"[NOTE:input bed must be 12-column] skipped this line: " + line, continue exonSize = map(int,fields[10].rstrip(',\n').split(',')) block_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map((lambda x: x + tx_start ), exon_starts) exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); intron_start = exon_ends[:-1] intron_end=exon_starts[1:] intronKey = chrom + ":" + str(intron_start[0]+1) + "-" + str(intron_end[0]) + ":" + strand count[intronKey] += score chrm[intronKey] = chrom strands[intronKey] = strand tss_start[intronKey].append(tx_start) tss_end[intronKey].append(tx_end) exon_size1[intronKey].append(exonSize[0]) exon_size2[intronKey].append(exonSize[1]) block_start1[intronKey].append(block_starts[0]) block_start2[intronKey].append(block_starts[1]) print >>sys.stderr,"Writing junctions to " + outfile + " ..." for key in count: print >>FO,chrm[key] + '\t' + str(min(tss_start[key])) + '\t' + str(max(tss_end[key])) + '\t' + "SR=" + str(count[key]) + '\t' + str(count[key]), print >>FO,'\t'+strands[key] + '\t' + str(min(tss_start[key])) + '\t' + str(max(tss_end[key])) + '\t', print >>FO, '255,0,0' + '\t2\t' + str(max(exon_size1[key])) + ',' + str(max(exon_size2[key])) + '\t', print >>FO, str(min(block_start1[key])) + ',' + str(max(block_start2[key])) self.f.seek(0) FO.close() def filterJunctionBed(self,outfile=None,overhang=8,min_intron=50,max_intron=1000000,cvg=1): '''filter junction bed file according to overhang size and supporting read''' if outfile is None:outfile = self.fileName + ".filter.bed" FO=open(outfile,'w') print >>sys.stderr,"\tfilter junctions ... " for line in self.f: skip_flag=0 if line.startswith(('#','track','broser','\n')):continue fields = line.rstrip().split() tx_start = int( fields[1] ) tx_end = int( fields[2] ) exonSize = map(int,fields[10].rstrip(',\n').split(',')) block_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map((lambda x: x + tx_start ), exon_starts) exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); intron_start = exon_ends[:-1] intron_end=exon_starts[1:] if fields[9] >=3: #pass for multiple spliced read print >>FO,line, continue if fields[4]>cvg: #pass for multi-read supporting junction print >>FO,line, continue for i in exonSize: if i < overhang: #filter out short overhang junctions skip_flag=1 break for st,end in zip(intron_start,intron_end): if (end - st) > max_intron or (end -st) < min_intron: skip_flag=1 break if skip_flag ==1: continue else: print >>FO,line self.f.seek(0) FO.close() def unionBED(self,outfile=None,outNameFile=None,boundary="utr",stranded=True): '''Collapse bed entries through UNION all overlapping exons. Just like "dense" display mode in UCSC genome browser. Bed entries will be merged if the following conditions are met: 1) Coordinates overlapped 2) on the same chromosome output two files, one bed file contains the merged bed entries, the other *.name.txt file contains which genes were merged together. NOTE: 1) input bed should 12-columns and sorted by chrom then by start position (use "sort -k1,1 -k2,2n myfile.bed >myfile.sorted.bed") 2) when stranded=True, only overlapping bed entries on the same strand will be merged. Otherwise, all overlapping bed entries will be meraged without considering strand information. ''' #open output file if outfile is None:outfile = self.fileName + ".merge.unionExon.bed" if outNameFile is None: outNameFile = self.fileName + ".name.txt" FO=open(outfile,'w') FName=open(outNameFile,'w') #some local variable within function bed_line=re.compile(r'^\S+\s+\d+\s+\d+') Merge_bed_TxStart=collections.defaultdict(list) Merge_bed_TxEnd=collections.defaultdict(list) Merge_bed_cdsStart=collections.defaultdict(list) Merge_bed_cdsEnd=collections.defaultdict(list) Merge_bed_ExonStart=collections.defaultdict(list) Merge_bed_ExonEnd=collections.defaultdict(list) Merge_bed_geneName=collections.defaultdict(list) Orig_bedNum=0 #number of bed entries before merging Merge_bedNum=0 #number of bed entries after merging Merge_bed_chr={} Merge_bed_strand={} final_exon_starts=[] final_exon_sizes=[] #check if the input bed file is properly sorted print >>sys.stderr,"check if input bed file is sorted properly ...", i=0 for line in self.f: line=line.strip() if bed_line.match(line): fields=line.split() i +=1 if i==1: chrom = fields[0] txStart=int(fields[1]) if i>1: if ((fields[0] < chrom) or ((fields[0] == chrom) and (int(fields[1]) >sys.stderr, "File not properly sorted:" + line exit(1) chrom = fields[0] txStart=int(fields[1]) else: #well, file seems to be OK print >>sys.stderr,"OK!" self.f.seek(0) print >>sys.stderr,"merge bed file to " + outfile + " ..." FO.write("track name=" + self.fileName + " description=" + ' \"' + "Overlapping entries in " + self.fileName + " were merged" + '\"'+'\n') if stranded: #want to consider strand information for line in self.f: line=line.strip() if bed_line.match(line): fields=line.split() if(len(fields)!=12): print >>sys.stderr, "bed file must be 12 columns speparated by Tab(whilte space)" +line if fields[5] == '-':continue #merge overlapping transcripts on plus strand Orig_bedNum +=1 if Orig_bedNum ==1: #this is first bed entry overlap_flag=0 #has nothing to overlap because it's first entry chrom = fields[0] strand = fields[5] txStart= int(fields[1]) txEnd= int(fields[2]) cdsStart= int(fields[6]) cdsEnd= int(fields[7]) geneName=fields[3] score=fields[4] txEnd_float=txEnd cdsEnd_float=cdsEnd elif Orig_bedNum > 1: #this is NOT first entry overlap_flag=1 #we suppose current line is overlapped with previous one. Unless it can prove it's NOT! if fields[0] != chrom: #not the same chromosome overlap_flag=0 txEnd_float = int(fields[2]) if (boundary == "utr") and ( fields[0] == chrom) and (int(fields[1]) >= txEnd_float): overlap_flag=0 if ((boundary == "cds") and ( fields[0] == chrom) and (int(fields[6]) > cdsEnd_float)): overlap_flag=0 txStart= int(fields[1]) txEnd= int(fields[2]) strand = fields[5] chrom = fields[0] cdsStart= int(fields[6]) cdsEnd= int(fields[7]) geneName=fields[3] score=fields[4] txEnd_float=max(txEnd_float,int(fields[2])) cdsEnd_float=max(cdsEnd_float,int(fields[7])) exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map((lambda x: x + txStart ), exon_starts) #0-based half open [) exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); #intron_start = exon_ends[:-1] #0-based half open [) #intron_end=exon_starts[1:] if overlap_flag ==0: Merge_bedNum +=1 Merge_bed_TxStart[Merge_bedNum].append(txStart) Merge_bed_TxEnd[Merge_bedNum].append(txEnd) Merge_bed_cdsStart[Merge_bedNum].append(cdsStart) Merge_bed_cdsEnd[Merge_bedNum].append(cdsEnd) Merge_bed_ExonStart[Merge_bedNum].extend(exon_starts) Merge_bed_ExonEnd[Merge_bedNum].extend(exon_ends) Merge_bed_geneName[Merge_bedNum].append(geneName) Merge_bed_chr[Merge_bedNum]=chrom Merge_bed_strand[Merge_bedNum]=strand elif overlap_flag ==1: Merge_bed_TxStart[Merge_bedNum].append(txStart) Merge_bed_TxEnd[Merge_bedNum].append(txEnd) Merge_bed_cdsStart[Merge_bedNum].append(cdsStart) Merge_bed_cdsEnd[Merge_bedNum].append(cdsEnd) Merge_bed_ExonStart[Merge_bedNum].extend(exon_starts) Merge_bed_ExonEnd[Merge_bedNum].extend(exon_ends) Merge_bed_geneName[Merge_bedNum].append(geneName) Merge_bed_chr[Merge_bedNum]=chrom Merge_bed_strand[Merge_bedNum]=strand self.f.seek(0) #merge overlapping transcripts on minus strand Orig_bedNum=0 for line in self.f: line=line.strip() if bed_line.match(line): fields=line.split() if(len(fields)!=12): print >>sys.stderr, "bed file must be 12 columns speparated by Tab(whilte space)" +line if fields[5] == '+':continue Orig_bedNum +=1 if Orig_bedNum ==1: #this is first bed entry overlap_flag=0 #has nothing to overlap because it's first entry chrom = fields[0] strand = fields[5] txStart= int(fields[1]) txEnd= int(fields[2]) cdsStart= int(fields[6]) cdsEnd= int(fields[7]) geneName=fields[3] score=fields[4] txEnd_float=txEnd cdsEnd_float=cdsEnd elif Orig_bedNum > 1: #this is NOT first entry overlap_flag=1 #we suppose current line is overlapped with previous one. Unless it can prove it's NOT! if fields[0] != chrom: #not the same chromosome overlap_flag=0 txEnd_float = int(fields[2]) if (boundary == "utr") and ( fields[0] == chrom) and (int(fields[1]) >= txEnd_float): overlap_flag=0 if ((boundary == "cds") and ( fields[0] == chrom) and (int(fields[6]) > cdsEdn_float)): overlap_flag=0 txStart= int(fields[1]) txEnd= int(fields[2]) strand = fields[5] chrom = fields[0] cdsStart= int(fields[6]) cdsEnd= int(fields[7]) geneName=fields[3] score=fields[4] txEnd_float=max(txEnd_float,int(fields[2])) cdsEnd_float=max(cdsEnd_float,int(fields[7])) exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map((lambda x: x + txStart ), exon_starts) #0-based half open [) exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); #intron_start = exon_ends[:-1] #0-based half open [) #intron_end=exon_starts[1:] if overlap_flag ==0: Merge_bedNum +=1 Merge_bed_TxStart[Merge_bedNum].append(txStart) Merge_bed_TxEnd[Merge_bedNum].append(txEnd) Merge_bed_cdsStart[Merge_bedNum].append(cdsStart) Merge_bed_cdsEnd[Merge_bedNum].append(cdsEnd) Merge_bed_ExonStart[Merge_bedNum].extend(exon_starts) Merge_bed_ExonEnd[Merge_bedNum].extend(exon_ends) Merge_bed_geneName[Merge_bedNum].append(geneName) Merge_bed_chr[Merge_bedNum]=chrom Merge_bed_strand[Merge_bedNum]=strand elif overlap_flag ==1: Merge_bed_TxStart[Merge_bedNum].append(txStart) Merge_bed_TxEnd[Merge_bedNum].append(txEnd) Merge_bed_cdsStart[Merge_bedNum].append(cdsStart) Merge_bed_cdsEnd[Merge_bedNum].append(cdsEnd) Merge_bed_ExonStart[Merge_bedNum].extend(exon_starts) Merge_bed_ExonEnd[Merge_bedNum].extend(exon_ends) Merge_bed_geneName[Merge_bedNum].append(geneName) Merge_bed_chr[Merge_bedNum]=chrom Merge_bed_strand[Merge_bedNum]=strand self.f.seek(0) else: # NOT want to consider strand information. merge + and - together Orig_bedNum=0 Merge_bedNum=0 for line in self.f: line=line.rstrip('\r\n') line=line.lstrip() if bed_line.match(line): fields=line.split() Orig_bedNum +=1 if Orig_bedNum ==1: #this is first bed entry overlap_flag=0 #has nothing to overlap because it's first entry chrom = fields[0] #strand = fields[5] txStart= int(fields[1]) txEnd= int(fields[2]) cdsStart= int(fields[6]) cdsEnd= int(fields[7]) geneName=fields[3] score=fields[4] txEnd_float=txEnd cdsEnd_float=cdsEnd elif Orig_bedNum > 1: #this is NOT first entry overlap_flag=1 #we suppose current line is overlapped with previous one. Unless it can prove it's NOT! if fields[0] != chrom: #not the same chromosome overlap_flag=0 txEnd_float = int(fields[2]) if (boundary == "utr") and ( fields[0] == chrom) and (int(fields[1]) >= txEnd_float): overlap_flag=0 if ((boundary == "cds") and ( fields[0] == chrom) and (int(fields[6]) > cdsEdn_float)): overlap_flag=0 txStart= int(fields[1]) txEnd= int(fields[2]) #strand = fields[5] chrom = fields[0] cdsStart= int(fields[6]) cdsEnd= int(fields[7]) geneName=fields[3] score=fields[4] txEnd_float=max(txEnd_float,int(fields[2])) cdsEnd_float=max(cdsEnd_float,int(fields[7])) exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map((lambda x: x + txStart ), exon_starts) #0-based half open [) exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends) #intron_start = exon_ends[:-1] #0-based half open [) #intron_end=exon_starts[1:] if overlap_flag ==0: Merge_bedNum +=1 Merge_bed_TxStart[Merge_bedNum].append(txStart) Merge_bed_TxEnd[Merge_bedNum].append(txEnd) Merge_bed_cdsStart[Merge_bedNum].append(cdsStart) Merge_bed_cdsEnd[Merge_bedNum].append(cdsEnd) Merge_bed_ExonStart[Merge_bedNum].extend(exon_starts) Merge_bed_ExonEnd[Merge_bedNum].extend(exon_ends) Merge_bed_geneName[Merge_bedNum].append(geneName) Merge_bed_chr[Merge_bedNum]=chrom Merge_bed_strand[Merge_bedNum]='+' elif overlap_flag ==1: Merge_bed_TxStart[Merge_bedNum].append(txStart) Merge_bed_TxEnd[Merge_bedNum].append(txEnd) Merge_bed_cdsStart[Merge_bedNum].append(cdsStart) Merge_bed_cdsEnd[Merge_bedNum].append(cdsEnd) Merge_bed_ExonStart[Merge_bedNum].extend(exon_starts) Merge_bed_ExonEnd[Merge_bedNum].extend(exon_ends) Merge_bed_geneName[Merge_bedNum].append(geneName) Merge_bed_chr[Merge_bedNum]=chrom Merge_bed_strand[Merge_bedNum]='+' #meged exon and output for id in sorted(Merge_bed_TxStart.iterkeys()): final_exon = {} #key is exon start, value is exon end final_exon_sizes=[] final_exon_starts=[] FName.write("BED_box_" + str(id) + "\t") FName.write(','.join(Merge_bed_geneName[id]) + "\n") for E_st,E_end in sorted(zip(Merge_bed_ExonStart[id],Merge_bed_ExonEnd[id])): startOfFirstExon = E_st endOfFirstExon = E_end final_exon[E_st] = E_end break for E_st,E_end in sorted(zip(Merge_bed_ExonStart[id][1:],Merge_bed_ExonEnd[id][1:])): if final_exon.has_key(E_st): #the current start position already there. We only need to compare end position if E_end > endOfFirstExon: final_exon[startOfFirstExon] = E_end endOfFirstExon = E_end else: continue else: #the current start postion is different if E_st <=endOfFirstExon: if E_end <= endOfFirstExon: continue elif E_end > endOfFirstExon: final_exon[startOfFirstExon] = E_end endOfFirstExon = E_end else: final_exon[E_st]=E_end startOfFirstExon=E_st endOfFirstExon=E_end for k in sorted(final_exon.iterkeys()): final_exon_sizes.append(final_exon[k]-k) final_exon_starts.append(k-(min(Merge_bed_TxStart[id]))) FO.write(Merge_bed_chr[id] + "\t" + str(min(Merge_bed_TxStart[id])) + "\t" + str(max(Merge_bed_TxEnd[id])) +"\t") #column 1,2,3 FO.write("BED_box_" + str(id) + "\t0\t") #column 4,5 FO.write(Merge_bed_strand[id] +"\t") #column 6 FO.write(str(min(Merge_bed_cdsStart[id])) + "\t" + str(max(Merge_bed_cdsEnd[id])) + "\t" + "255,0,0"+"\t") #column 7,8,9 FO.write(str(len(final_exon_starts)) +'\t') #column 10 FO.write(','.join(map(str,final_exon_sizes))+"\t") #column 11 FO.write(','.join(map(str,final_exon_starts)) + "\n") #column 12 self.f.seek(0) FO.close() def correctSplicingBed(self,genome,outfile,sp="GTAG,GCAG,ATAC"): '''input should be bed12 file representing splicing junctions. The function will compare the splicing motif to genome. To see if the direcion is correct or not. Only consider GT/AG GC/AG, AT/AC motifs. Multiple spliced reads are accepted''' fout=open(outfile,'w') motif=sp.upper().split(',') motif_rev = [m.translate(self.transtab)[::-1] for m in motif] print >>sys.stderr, "\tloading "+genome+'...' tmp=fasta.Fasta(genome) for line in self.f: if line.startswith('#'):continue if line.startswith('track'):continue if line.startswith('browser'):continue line=line.strip() fields=line.split() if (len(fields)<12): print >>fout,line continue chrom = fields[0] tx_start = int( fields[1] ) tx_end = int( fields[2] ) geneName = fields[3] strand = fields[5] cds_start = int( fields[6] ) cds_end = int( fields[7] ) exon_num= int(fields[9]) exon_sizes = map(int,fields[10].rstrip(',\n').split(',')) exon_starts = map(int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map((lambda x: x + tx_start ), exon_starts) exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); intron_start = exon_ends[:-1] intron_end=exon_starts[1:] splice_strand=[] for st,end in zip(intron_start,intron_end): splice_motif = tmp.fetchSeq(chrom, st, st+2) + tmp.fetchSeq(chrom, end-2, end) if splice_motif in motif: splice_strand.append('+') elif splice_motif in motif_rev: splice_strand.append('-') else: splice_strand.append('.') real_strand = set(splice_strand) if (len(real_strand) ==1): print >>fout,"\t".join((fields[0],fields[1],fields[2],fields[3],fields[4],real_strand.pop(),fields[6],fields[7],fields[8],fields[9],fields[10],fields[11])) else: print >>fout,"\t".join((fields[0],fields[1],fields[2],fields[3],fields[4],'.',fields[6],fields[7],fields[8],fields[9],fields[10],fields[11])) self.f.seek(0) fout.close() def nrBED(self,outfile=None): '''redundant bed entries (exactly the same gene structure) in bed12 file will be merged.''' if outfile is None: outfile = self.fileName + ".nr.bed" FO=open(outfile,'w') mergeGene=collections.defaultdict(list) print >>sys.stderr, "Removing redundcany from " + self.fileName + " ..." for line in self.f: if line.startswith('#'):continue if line.startswith('track'):continue if line.startswith('browser'):continue fields=line.rstrip().split() chrom = fields[0] # txStart= fields[1] # txEnd= fields[2] # geneName=fields[3] score=fields[4] strand = fields[5] # cdsStart= fields[6] # cdsEnd= fields[7] # blockCount = fields[9] # blockSize = fields[10] # blockStart = fields[11] # key = ":".join((chrom,txStart,txEnd,strand,cdsStart,cdsEnd,blockCount,blockSize,blockStart)) mergeGene[key].append(geneName) for k in mergeGene: fields=k.split(":") name=";".join(mergeGene[k]) FO.write(fields[0] +'\t' + fields[1] + '\t' + fields[2] + '\t' + name + '\t0\t' + fields[3] + '\t' + fields[4] +'\t'+fields[5] +'\t0,0,0\t'+fields[6] +'\t' + fields[7] +'\t'+fields[8] +'\n') self.f.seek(0) FO.close() class CompareBED: '''Compare two bed fies. Standard BED file has 12 fields. (http://genome.ucsc.edu/FAQ/FAQformat.html)''' def __init__(self,bedFileA,bedFileB): '''This is constructor of ParseBED. Must provide two bed files for comprison. 1st bed file is user input bed, while 2nd bed file is usually a reference gene model''' self.A_fh=open(bedFileA,'r') self.A_full_Name=bedFileA self.A_base_Name=os.path.basename(bedFileA) self.B_fh=open(bedFileB,'r') self.B_full_Name=bedFileB self.B_base_Name=os.path.basename(bedFileB) def annotateEvents(self,outfile=None): '''Compare bed file A to bed file B (usually a bed file for reference gene model). NOTE that only intron boundaries are compared. This function is useful if one want to find if a junctions is novel or not. bed file A will divided into two files: *.known.bed and *.novel.bed''' if outfile is None: KnownBed = self.A_base_Name + ".known.bed" NovelBed = self.A_base_Name + ".novel.bed" else: KnownBed = outfile + ".known.bed" NovelBed = outfile + ".novel.bed" KNO=open(KnownBed,'w') NOV=open(NovelBed,'w') #print >>sys.stderr,"writing intron to " + outfile + " ..." ref_blocks=collections.defaultdict(dict) print >>sys.stderr, "reading reference bed file",self.B_full_Name, " ... ", for line in self.B_fh: if line.startswith('#'):continue if line.startswith('track'):continue if line.startswith('browser'):continue # Parse fields from gene tabls fields = line.split() if(len(fields)<12): print >>sys.stderr, "Invalid bed line (skipped):",line, continue chrom = fields[0] tx_start = int( fields[1] ) tx_end = int( fields[2] ) if int(fields[9] ==1): continue exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map((lambda x: x + tx_start ), exon_starts) exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); intron_start = exon_ends[:-1] intron_end=exon_starts[1:] for i_st,i_end in zip (intron_start, intron_end): key_str=str(i_st) + '_' + str(i_end) ref_blocks[chrom][key_str]=i_end print >>sys.stderr,"Done" print >>sys.stderr, "processing",self.A_full_Name, "...", for line in self.A_fh: found=0 if line.startswith('#'):continue if line.startswith('track'):continue if line.startswith('browser'):continue # Parse fields from gene tabls fields = line.split() if(len(fields)<12): print >>sys.stderr, "Invalid bed line (skipped):",line, continue chrom = fields[0] tx_start = int( fields[1] ) tx_end = int( fields[2] ) if int(fields[9] ==1): continue exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map((lambda x: x + tx_start ), exon_starts) exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); intron_start = exon_ends[:-1] intron_end=exon_starts[1:] for i_st, i_end in sorted (zip (intron_start,intron_end)): key_str=str(i_st) + '_' + str(i_end) if (ref_blocks[chrom].has_key(key_str)): #found +=1 #if (found == len(intron_start)): print >>KNO, line, else: print >>NOV, line, print >>sys.stderr,"Done" def annotateSplicingSites(self,outfile=None): '''Compare bed file A to bed file B (usually a bed file for reference gene model). NOTE that only intron boundaries are compared. This function is useful if one want to find if a spilcing site is novel or not. bed file A will divided into two files: *.known.bed, *.35novel.bed, *.3novel.bed and *.5novel.bed NOTE: splicing junctions with multiple introns will be split''' if outfile is None: KnownBed = self.A_base_Name + ".known.bed" NovelBed5 = self.A_base_Name + ".5novel.bed" NovelBed3 = self.A_base_Name + ".3novel.bed" NovelBed35 = self.A_base_Name + ".35novel.bed" else: KnownBed = outfile + ".known.bed" NovelBed5 = outfile + ".5novel.bed" NovelBed3 = outfile + ".3novel.bed" NovelBed35 = outfile + ".35novel.bed" KNO=open(KnownBed,'w') N5=open(NovelBed5,'w') N3=open(NovelBed3,'w') N35=open(NovelBed35,'w') #print >>sys.stderr,"writing intron to " + outfile + " ..." refIntronStarts=collections.defaultdict(dict) refIntronEnds=collections.defaultdict(dict) print >>sys.stderr, "\treading reference bed file",self.B_full_Name, " ... ", for line in self.B_fh: if line.startswith('#'):continue if line.startswith('track'):continue if line.startswith('browser'):continue # Parse fields from gene tabls fields = line.split() if(len(fields)<12): print >>sys.stderr, "Invalid bed line (skipped):",line, continue chrom = fields[0] tx_start = int( fields[1] ) tx_end = int( fields[2] ) if int(fields[9] ==1): continue exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map((lambda x: x + tx_start ), exon_starts) exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); intron_start = exon_ends[:-1] intron_end=exon_starts[1:] for i_st,i_end in zip (intron_start, intron_end): refIntronStarts[chrom][i_st] =i_st refIntronEnds[chrom][i_end] =i_end print >>sys.stderr,"Done" print >>sys.stderr, "\tprocessing",self.A_full_Name, "...", for line in self.A_fh: found=0 if line.startswith('#'):continue if line.startswith('track'):continue if line.startswith('browser'):continue # Parse fields from gene tabls fields = line.split() if(len(fields)<12): print >>sys.stderr, "Invalid bed line (skipped):",line, continue chrom = fields[0] tx_start = int( fields[1] ) tx_end = int( fields[2] ) geneName = fields[3] score = fields[4] strand = fields[5] if int(fields[9] ==1): continue exon_sizes = map(int,fields[10].rstrip(',\n').split(',')) exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map((lambda x: x + tx_start ), exon_starts) exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); intron_start = exon_ends[:-1] intron_end = exon_starts[1:] counter=0 for i_st, i_end in zip (intron_start,intron_end): counter +=1 if(strand == '+' or strand == '.'): if (refIntronStarts[chrom].has_key(i_st) and refIntronEnds[chrom].has_key(i_end)): found =2 #known both elif (refIntronStarts[chrom].has_key(i_st) and not refIntronEnds[chrom].has_key(i_end)): found=5 # 5' splice site known, 3' splice site unkonwn elif (not refIntronStarts[chrom].has_key(i_st) and refIntronEnds[chrom].has_key(i_end)): found=3 # 5' splice site uknown, 3' splice site konwn else: found=10 elif(strand == '-'): if (refIntronStarts[chrom].has_key(i_st) and refIntronEnds[chrom].has_key(i_end)): found =2 #known elif (refIntronStarts[chrom].has_key(i_st) and not refIntronEnds[chrom].has_key(i_end)): found=3 # 5' splice site uknown, 3' splice site konwn elif (not refIntronStarts[chrom].has_key(i_st) and refIntronEnds[chrom].has_key(i_end)): found=5 # 5' splice site known, 3' splice site unkonwn else: found=10 else: continue if (found == 2): print >>KNO, '\t'.join((chrom,str(i_st - exon_sizes[counter-1]),str(i_end + exon_sizes[counter]),geneName + '_intron' + str(counter),score,strand,str(i_st - exon_sizes[counter-1]),str(i_end + exon_sizes[counter]),'0,255,0','2',str(exon_sizes[counter-1]) +','+str(exon_sizes[counter]),'0'+','+str(exon_sizes[counter-1] + i_end - i_st))) elif(found ==3): print >>N5, '\t'.join((chrom,str(i_st - exon_sizes[counter-1]),str(i_end + exon_sizes[counter]),geneName + '_intron' + str(counter),score,strand,str(i_st - exon_sizes[counter-1]),str(i_end + exon_sizes[counter]),'0,255,0','2',str(exon_sizes[counter-1]) +','+str(exon_sizes[counter]),'0'+','+str(exon_sizes[counter-1] + i_end - i_st))) elif(found ==5): print >>N3, '\t'.join((chrom,str(i_st - exon_sizes[counter-1]),str(i_end + exon_sizes[counter]),geneName + '_intron' + str(counter),score,strand,str(i_st - exon_sizes[counter-1]),str(i_end + exon_sizes[counter]),'0,255,0','2',str(exon_sizes[counter-1]) +','+str(exon_sizes[counter]),'0'+','+str(exon_sizes[counter-1] + i_end - i_st))) elif(found ==10): print >>N35, '\t'.join((chrom,str(i_st - exon_sizes[counter-1]),str(i_end + exon_sizes[counter]),geneName + '_intron' + str(counter),score,strand,str(i_st - exon_sizes[counter-1]),str(i_end + exon_sizes[counter]),'0,255,0','2',str(exon_sizes[counter-1]) +','+str(exon_sizes[counter]),'0'+','+str(exon_sizes[counter-1] + i_end - i_st))) print >>sys.stderr,"Done" def distribBed(self,outfile=None): '''Compare bed file A (usually a bed file of reads mapping results) to bed file B (usually a bed file for reference gene model). For each exon/intron of a gene, calculate how many reads mapped into it.''' if outfile is None: exon_count = self.B_base_Name + "_exon.count.bed" intron_count = self.B_base_Name + "_intron.count.bed" rscript=self.B_base_Name + ".piechart.r" rpdf=self.B_base_Name + ".piechart.pdf" else: exon_count = outfile + "_exon.count.bed" intron_count = outfile + "_intron.count.bed" rscript=outfile + ".piechart.r" rpdf=outfile + ".piechart.pdf" EXON_OUT = open(exon_count,'w') INTRON_OUT =open(intron_count,'w') R_OUT = open(rscript,'w') ranges={} intronReads=0 exonReads=0 intergenicReads=0 totalReads=0 splicedReads=0 #read SAM print >>sys.stderr, "reading "+ self.A_base_Name + '...', for line in self.A_fh: if line.startswith("track"):continue if line.startswith("#"):continue if line.startswith('browser'):continue fields=line.rstrip('\n ').split() totalReads +=1 if(len(fields)==12 and fields[9]>1): splicedReads +=1 continue else: chrom=fields[0].upper() mid = int(fields[1]) + int((int(fields[2])-int(fields[1]))/2) if chrom not in ranges: ranges[chrom] = Intersecter() else: ranges[chrom].add_interval( Interval( mid, mid ) ) self.A_fh.seek(0) print >>sys.stderr, "Done" #read refbed file print >>sys.stderr, "Assign reads to "+ self.B_base_Name + '...', for line in self.B_fh: try: if line.startswith('#'):continue if line.startswith('track'):continue if line.startswith('browser'):continue # Parse fields from gene tabls fields = line.split() chrom = fields[0].upper() tx_start = int( fields[1] ) tx_end = int( fields[2] ) geneName = fields[3] strand = fields[5].replace(" ","_") exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map((lambda x: x + tx_start ), exon_starts) exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); intron_starts = exon_ends[:-1] intron_ends=exon_starts[1:] except: print >>sys.stderr,"[NOTE:input bed must be 12-column] skipped this line: " + line, continue # assign reads to intron if(strand == '-'): intronNum=len(intron_starts) exonNum=len(exon_starts) for st,end in zip(intron_starts,intron_ends): if chrom in ranges: hits= len(ranges[chrom].find(st,end)) intronReads += hits INTRON_OUT.write(chrom + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_intron_" + str(intronNum) + "\t" + str(hits) + "\t" + strand + '\n') intronNum -= 1 for st,end in zip(exon_starts,exon_ends): if chrom in ranges: hits= len(ranges[chrom].find(st,end)) exonReads += hits EXON_OUT.write(chrom + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_exon_" + str(exonNum) + "\t" + str(hits) + "\t" + strand + '\n') exonNum -= 1 elif(strand == '+'): intronNum=1 exonNum=1 for st,end in zip(intron_starts,intron_ends): if chrom in ranges: hits= len(ranges[chrom].find(st,end)) intronReads += hits INTRON_OUT.write(chrom + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_intron_" + str(intronNum) + "\t" + str(hits) + "\t" + strand + '\n') intronNum += 1 for st,end in zip(exon_starts,exon_ends): if chrom in ranges: hits= len(ranges[chrom].find(st,end)) exonReads += hits EXON_OUT.write(chrom + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_exon_" + str(exonNum) + "\t" + str(hits) + "\t" + strand + '\n') exonNum += 1 intergenicReads=totalReads-exonReads-intronReads-splicedReads print >>sys.stderr, "Done." + '\n' print >>sys.stderr, "Total reads:\t" + str(totalReads) print >>sys.stderr, "Exonic reads:\t" + str(exonReads) print >>sys.stderr, "Intronic reads:\t" + str(intronReads) print >>sys.stderr, "Splicing reads:\t" + str(splicedReads) print >>sys.stderr, "Intergenic reads:\t" + str(intergenicReads) print >>sys.stderr,"writing R script ...", totalReads=float(totalReads) print >>R_OUT, "pdf('%s')" % rpdf print >>R_OUT, "dat=c(%d,%d,%d,%d)" % (exonReads,splicedReads,intronReads,intergenicReads) print >>R_OUT, "lb=c('exon(%.2f)','junction(%.2f)','intron(%.2f)','intergenic(%.2f)')" % (exonReads/totalReads,splicedReads/totalReads,intronReads/totalReads,intergenicReads/totalReads) print >>R_OUT, "pie(dat,labels=lb,col=rainbow(4),clockwise=TRUE,main='Total reads = %d')" % int(totalReads) print >>R_OUT, "dev.off()" print >>sys.stderr, "Done." self.B_fh.seek(0) def distribBedWithStrand(self,outfile=None,output=True): '''Compare bed file A (usually a bed file of reads mapping results) to bed file B (usually a bed file for reference gene model). NOTE: When assigning reads (from bed fileA) to gene (bed file B), program will consider strand information''' if output: if outfile is None: read_count = self.B_base_Name + "_count.xls" else: read_count = outfile + "_count.xls" READ_OUT = open(read_count,'w') Minus_ranges={} Plus_ranges={} unknown_ranges={} redat = collections.defaultdict(list) #read bed print >>sys.stderr, "\treading "+ self.A_base_Name + '...', for line in self.A_fh: if line.startswith(("track","#","browser")):continue fields=line.rstrip('\n ').split() #juntionID = fields[0] + ":" + fields[1] + ":" fields[2] + ":" + fields[5]; if(len(fields)<6): print >>sys.stderr,"[NOTE:input bed must be at least 6 columns] skipped this line: " + line, if (fields[5] == '-'): chrom=fields[0].upper() mid = int(fields[1]) + int((int(fields[2])-int(fields[1]))/2) if chrom not in Minus_ranges: Minus_ranges[chrom] = IntervalTree() else: Minus_ranges[chrom].insert(mid, mid,fields[4]) elif (fields[5] == '+'): chrom=fields[0].upper() mid = int(fields[1]) + int((int(fields[2])-int(fields[1]))/2) if chrom not in Plus_ranges: Plus_ranges[chrom] = IntervalTree() else: Plus_ranges[chrom].insert(mid, mid,fields[4]) else: chrom=fields[0].upper() mid = int(fields[1]) + int((int(fields[2])-int(fields[1]))/2) if chrom not in unknown_ranges: unknown_ranges[chrom] = IntervalTree() else: unknown_ranges[chrom].insert(mid, mid,fields[4]) self.A_fh.seek(0) print >>sys.stderr, "Done" #read refbed file print >>sys.stderr, "\tAssign reads to "+ self.B_base_Name + '...' for line in self.B_fh: try: if line.startswith(("track","#","browser")):continue # Parse fields from gene tabls fields = line.split() chrom = fields[0].upper() tx_start = int( fields[1] ) tx_end = int( fields[2] ) geneName = fields[3] strand = fields[5].replace(" ","_") except: print >>sys.stderr,"[NOTE:input bed must be 12-column] skipped this line: " + line, continue if output: READ_OUT.write(line.rstrip() + '\t') if chrom in Plus_ranges: tmp=Plus_ranges[chrom].find(tx_start,tx_end) if(len(tmp)>0): READ_OUT.write(','.join(tmp) + '\t') else: READ_OUT.write('0\t') if chrom in Minus_ranges: tmp=Minus_ranges[chrom].find(tx_start,tx_end) if (len(tmp)>0): READ_OUT.write(','.join(tmp) + '\t') else: READ_OUT.write('0\t') if chrom in unknown_ranges: tmp=unknown_ranges[chrom].find(tx_start,tx_end) if (len(tmp)>0): READ_OUT.write(','.join(tmp) + '\n') else: READ_OUT.write('0\n') else: key='\t'.join((chrom.lower(),str(tx_start),str(tx_end),geneName,'0',strand)) if chrom in Plus_ranges and len(Plus_ranges[chrom].find(tx_start,tx_end))>0: redat[key].append(','.join(Plus_ranges[chrom].find(tx_start,tx_end))) else: redat[key].append('0') if chrom in Minus_ranges and len(Minus_ranges[chrom].find(tx_start,tx_end))>0: redat[key].append(','.join(Minus_ranges[chrom].find(tx_start,tx_end))) else: redat[key].append('0') if chrom in unknown_ranges and len(unknown_ranges[chrom].find(tx_start,tx_end))>0: redat[key].append(','.join(unknown_ranges[chrom].find(tx_start,tx_end))) else: redat[key].append('0') if output is not True: return redat print >>sys.stderr, "Done." self.B_fh.seek(0) def distribSpliceSites(self,outfile=None,output=True): '''Compare bed file A (usually a bed file of splicing junctions) to bed file B (usually a bed file for reference gene model). NOTE: When assigning reads (from bed fileA) to gene (bed file B), program will consider strand information''' if output: if outfile is None: read_count = self.B_base_Name + "_count.xls" else: read_count = outfile + "_count.xls" READ_OUT = open(read_count,'w') Minus_ranges={} Plus_ranges={} unknown_ranges={} redat = collections.defaultdict(list) #read bed print >>sys.stderr, "\treading "+ self.A_base_Name + '...', for line in self.A_fh: if line.startswith(("track","#","browser")):continue fields=line.rstrip('\n ').split() juntionID = fields[0] + ":" + fields[1] + ":" + fields[2] + ":" + fields[5] if(len(fields)<12): print >>sys.stderr,"[NOTE:input bed must be at least 12 columns] skipped this line: " + line, continue chrom = fields[0].upper() tx_start = int( fields[1] ) tx_end = int( fields[2] ) exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) exon_starts = map((lambda x: x + tx_start ), exon_starts) exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); intron_starts = exon_ends[:-1] intron_ends=exon_starts[1:] for st,end in zip(intron_starts,intron_ends): junctionID=chrom + ":" + str(st) + ":" + str(end) if (fields[5] == '-'): if chrom not in Minus_ranges: Minus_ranges[chrom] = IntervalTree() else: Minus_ranges[chrom].insert(st, st,(junctionID,fields[4])) Minus_ranges[chrom].insert(end, end,(junctionID,fields[4])) elif (fields[5] == '+'): if chrom not in Plus_ranges: Plus_ranges[chrom] = IntervalTree() else: Plus_ranges[chrom].insert(st, st,(junctionID,fields[4])) Plus_ranges[chrom].insert(end, end,(junctionID,fields[4])) else: if chrom not in unknown_ranges: unknown_ranges[chrom] = IntervalTree() else: unknown_ranges[chrom].insert(st, st,(junctionID,fields[4])) unknown_ranges[chrom].insert(end, end,(junctionID,fields[4])) self.A_fh.seek(0) print >>sys.stderr, "Done" #read refbed file print >>sys.stderr, "\tAssign reads to "+ self.B_base_Name + '...' for line in self.B_fh: try: if line.startswith(("track","#","browser")):continue # Parse fields from gene tabls fields = line.split() chrom = fields[0].upper() tx_start = int( fields[1] ) tx_end = int( fields[2] ) geneName = fields[3] strand = fields[5].replace(" ","_") except: print >>sys.stderr,"[NOTE:input bed must be 12-column] skipped this line: " + line, continue if output: READ_OUT.write(line.rstrip() + '\t') if chrom in Plus_ranges: tmp=Plus_ranges[chrom].find(tx_start,tx_end) locus=[] junction_list=[] if(len(tmp)>0): for j in tmp: if j[0] not in junction_list: locus.append(j[1]) junction_list.append(j[0]) READ_OUT.write(','.join(locus) + '\t') else:READ_OUT.write('0\t') else:READ_OUT.write('0\t') if chrom in Minus_ranges: tmp=Minus_ranges[chrom].find(tx_start,tx_end) locus=[] junction_list=[] if(len(tmp)>0): for j in tmp: if j[0] not in junction_list: locus.append(j[1]) junction_list.append(j[0]) READ_OUT.write(','.join(locus) + '\t') else:READ_OUT.write('0\t') else:READ_OUT.write('0\t') if chrom in unknown_ranges: tmp=unknown_ranges[chrom].find(tx_start,tx_end) locus=[] junction_list=[] if(len(tmp)>0): for j in tmp: if j[0] not in junction_list: locus.append(j[1]) junction_list.append(j[0]) READ_OUT.write(','.join(locus) + '\n') else:READ_OUT.write('0\n') else:READ_OUT.write('0\n') else: key='\t'.join((chrom.lower(),str(tx_start),str(tx_end),geneName,'0',strand)) if chrom in Plus_ranges: tmp = Plus_ranges[chrom].find(tx_start,tx_end) locus=[] junction_list=[] if(len(tmp)>0): for j in tmp: if j[0] not in junction_list: locus.append(j[1]) junction_list.append(j[0]) redat[key].append(','.join(locus)) else:redat[key].append('0') else:redat[key].append('0') if chrom in Minus_ranges: tmp = Minus_ranges[chrom].find(tx_start,tx_end) locus=[] junction_list=[] if(len(tmp)>0): for j in tmp: if j[0] not in junction_list: locus.append(j[1]) junction_list.append(j[0]) redat[key].append(','.join(locus)) else:redat[key].append('0') else:redat[key].append('0') if chrom in unknown_ranges: tmp = unknown_ranges[chrom].find(tx_start,tx_end) locus=[] junction_list=[] if(len(tmp)>0): for j in tmp: if j[0] not in junction_list: locus.append(j[1]) junction_list.append(j[0]) redat[key].append(','.join(locus)) else:redat[key].append('0') else:redat[key].append('0') if output is not True: return redat print >>sys.stderr, "Done." self.B_fh.seek(0) def findClosestTSS(self,outfile=None,downStream=50000,upStream=50000): '''For each entry in input bed file (1st bed file), find the nearest gene (2nd bed file) based on TSS. Genes shared the same TSS will be grouped together. NOTE: gene is represented by its TSS, input bed entry is represented by its middle point. NOTE: this is peak centered, for each peak find the nearest gene(s)''' if outfile is None: outfileName = self.A_base_Name + ".nearestTSS.xls" else: outfileName = outfile + ".nearestTSS.xls" OUT=open(outfileName,'w') ranges={} tss_group={} tss_group_num=0 #read reference bed file print >>sys.stderr, "Reading "+ self.B_base_Name + '...', for line in self.B_fh: if line.startswith(("#","track","browser")):continue fields=line.rstrip('\n ').split() chrom = fields[0] geneName = fields[3] if (len(fields) >=6): if (fields[5] == '-'): tss_st = int(fields[2]) - 1 tss_end = int(fields[2]) else: tss_st = int(fields[1]) tss_end =int(fields[1]) + 1 elif (len(fields) < 6 and len(fields) >=3 ): tss_st = int(fields[1]) tss_end =int(fields[1]) + 1 else: print >>sys.stderr, "reference bed file must be at least 3 columns" sys.exit() key=chrom + ":" + str(tss_st) + ':' + str(tss_end) if key not in tss_group: tss_group[key] = geneName + ";" else: tss_group[key] += geneName + ";" for key in tss_group: tss_group_num +=1 chrom,tss_st,tss_end = key.split(":") #print chrom + '\t'+ tss_st +'\t' + tss_end if chrom not in ranges: ranges[chrom] = IntervalTree() ranges[chrom].insert_interval( Interval( int(tss_st), int(tss_end),value=tss_group[key] + '\t' + key) ) self.B_fh.seek(0) print >>sys.stderr, "Done. Total " + str(tss_group_num) + " TSS groups" #a=ranges['chr1'].find(1000000,1200000) #print a #read input bed file print >>sys.stderr, "Find nearest TSS(s) for "+ self.A_base_Name + '...', for line in self.A_fh: if line.startswith(("#","track","browser")):continue fields=line.rstrip('\n ').split() if (len(fields)>=6): chain = fields[5] elif (len(fields) < 6 and len(fields) >=3 ): chain = '+' else: print >>sys.stderr, "Input bed file must be at least 3 columns" sys.exit() chrom = fields[0] bed_st = int(fields[1]) + int((int(fields[2]) - int(fields[1]))/2) bed_end = bed_st +1 up=ranges[chrom].upstream_of_interval(Interval(bed_st,bed_end,strand=chain),num_intervals=1,max_dist=upStream) down=ranges[chrom].downstream_of_interval(Interval(bed_st,bed_end,strand=chain),num_intervals=1,max_dist=downStream) if (len(up) >0): up_name=up[0].value up_dist=abs(up[0].end - bed_end) else: up_name="NA\tNA" up_dist="NA" if (len(down) >0): down_name=down[0].value down_dist=abs(down[0].end - bed_end) else: down_name="NA\tNA" down_dist="NA" print >>OUT, line.rstrip() + "\t" + up_name + "\t" + str(up_dist) + '\t' + down_name + "\t" + str(down_dist) self.A_fh.seek(0) print >>sys.stderr, "Done." def findClosestTTS(self,outfile=None,downStream=50000,upStream=50000): '''For each entry in input bed file (1st bed file), find the nearest gene (2nd bed file) based on TTS. Genes shared the same TTS will be grouped together. NOTE: gene is represented by its TTS, input bed entry is represented by its middle point.''' if outfile is None: outfileName = self.A_base_Name + ".nearestTTS.xls" else: outfileName = outfile + ".nearestTTS.xls" OUT=open(outfileName,'w') ranges={} tts_group={} tts_group_num=0 #read reference bed file print >>sys.stderr, "Reading "+ self.B_base_Name + '...', for line in self.B_fh: if line.startswith(("#","track","browser")):continue fields=line.rstrip('\n ').split() chrom = fields[0] geneName = fields[3] if (len(fields) >=6): if (fields[5] == '-'): tts_st = int(fields[1]) tts_end = int(fields[1]) + 1 else: tts_st = int(fields[2]) - 1 tts_end =int(fields[2]) elif (len(fields) < 6 and len(fields) >=3 ): tts_st = int(fields[2]) - 1 tts_end =int(fields[2]) else: print >>sys.stderr, "reference bed file must be at least 3 columns" sys.exit() key=chrom + ":" + str(tts_st) + ':' + str(tts_end) if key not in tts_group: tts_group[key] = geneName + ";" else: tts_group[key] += geneName + ";" for key in tts_group: tts_group_num +=1 chrom,tts_st,tts_end = key.split(":") #print chrom + '\t'+ tts_st +'\t' + tts_end if chrom not in ranges: ranges[chrom] = IntervalTree() ranges[chrom].insert_interval( Interval( int(tts_st), int(tts_end),value=tts_group[key] + '\t' + key) ) self.B_fh.seek(0) print >>sys.stderr, "Done. Total " + str(tts_group_num) + " TTS groups" #a=ranges['chr1'].find(1000000,1200000) #print a #read input bed file print >>sys.stderr, "Find nearest TTS(s) for "+ self.A_base_Name + '...', for line in self.A_fh: if line.startswith(("#","track","browser")):continue fields=line.rstrip('\n ').split() if (len(fields)>=6): chain = fields[5] elif (len(fields) < 6 and len(fields) >=3 ): chain = '+' else: print >>sys.stderr, "Inut bed file must be at least 3 columns" sys.exit() chrom = fields[0] bed_st = int(fields[1]) + int((int(fields[2]) - int(fields[1]))/2) bed_end = bed_st +1 up=ranges[chrom].upstream_of_interval(Interval(bed_st,bed_end,strand=chain),num_intervals=1,max_dist=upStream) down=ranges[chrom].downstream_of_interval(Interval(bed_st,bed_end,strand=chain),num_intervals=1,max_dist=downStream) if (len(up) >0): up_name=up[0].value up_dist=abs(up[0].end - bed_end) else: up_name="NA\tNA" up_dist="NA" if (len(down) >0): down_name=down[0].value down_dist=abs(down[0].end - bed_end) else: down_name="NA\tNA" down_dist="NA" print >>OUT, line.rstrip() + "\t" + up_name + "\t" + str(up_dist) + '\t' + down_name + "\t" + str(down_dist) self.A_fh.seek(0) print >>sys.stderr, "Done." def findClosestPeak(self,mod, downStream=50000,upStream=50000): '''For each entry in second bed file (reference gene model) find the closest peak defined in the fist bed file''' mode={ 0:'TSS-up, TSS-down', 1:'TSS-up, TES-down', 2:'TES-up, TES-down', 3:'CDSS-up, CDSS-down', 4:'CDSS-up, CDSE-down', 5:'CDSE-up, CDSE-down' } ranges={} #read peak bed file print >>sys.stderr, "\tReading "+ self.A_base_Name + '...', for line in self.A_fh: if line.startswith(("#","track","browser")):continue if not line.strip(): continue line = line.strip('\n') fields=line.split() if len(fields)>=3 and len(fields) < 6: chrom = fields[0] start = fields[1] end = fields[2] peak_id = "ID=" + chrom + ":" + start + '-' + end + ":" + 'Score=0' #middle = int(int(start) + (int(start) + int(end))/2) if len(fields)>=6: chrom = fields[0] start = fields[1] end = fields[2] peak_id = "ID=" + chrom + ":" + start + '-' + end + ":" + 'Score=' + fields[4] #middle = int(int(start) + (int(start) + int(end))/2) if chrom not in ranges: ranges[chrom] = IntervalTree() ranges[chrom].insert_interval( Interval(int(start), int(end), value = peak_id) ) print >>sys.stderr, "Done" #read reference gene model bed file line_id=0 nearest_peak={} print >>sys.stderr, "\tFind nearest peaks for "+ self.B_base_Name + '...', for line in self.B_fh: if line.startswith(("#","track","browser")):continue if not line.strip():continue fields=line.rstrip('\n ').split() line_id +=1 try: strand = fields[5] chrom = fields[0] if strand == '-': TSS = int(fields[2]) TES = int(fields[1]) CDSS = int(fields[7]) CDSE = int(fields[6]) elif strand != '-': strand='+' TSS = int(fields[1]) TES = int(fields[2]) CDSS = int(fields[6]) CDSE = int(fields[7]) geneName=fields[3] score=fields[4] blockCount = fields[9] blockSize = fields[10] blockStart = fields[11] #line_id = geneName except: print >>sys.stderr, "Reference gene model must 12 column BED files" sys.exit(1) if chrom in ranges: if mod==0: #TSS-up, TSS-down hits = ranges[chrom].find(TSS - upStream, TSS + downStream ) elif mod==1: # TSS-up, TES-down if strand == '+': hits = ranges[chrom].find(TSS - upStream, TES + downStream) if strand == '-': hits = ranges[chrom].find(TSS + upStream, TES - downStream) elif mod==2: #TES-up, TES-down hits = ranges[chrom].find(TES - upStream, TES + downStream ) elif mod==3: #CDSS-up, CDSS-down hits = ranges[chrom].find(CDSS - upStream, CDSS + downStream ) elif mod==4: #CDSS-up, CDSE-down if strand == '+': hits = ranges[chrom].find(CDSS - upStream, CDSE + downStream) if strand == '-': hits = ranges[chrom].find(CDSS + upStream, CDSE - downStream) elif mod==5: #CDSE-up, CDSE-down hits = ranges[chrom].find(CDSE - upStream, CDSE + downStream ) else: print >>sys.stderr, "unknow arguments for 'mod'" return None if (len(hits) >0): val = ','.join([i.value for i in hits]) nearest_peak[line_id]=val else: nearest_peak[line_id] ="NA" else: nearest_peak[line_id] = "NA" print >>sys.stderr, "Done." return nearest_peak #FO=open("aaa",'w') #for k,v in nearest_peak.items(): # print >>FO, str(k) + '\t' + str(v) def bestMatch(self): '''Exon chain comparison. Given a bed entry in bedFileA, find the best-matched gene from bedFileB. If multiple genes from bedFileB matched equally good. Randomly report one.''' #read reference gene model ref_ranges={} ref_cdsRange={} print >>sys.stderr, "\tReading reference gene model " + self.B_base_Name + '...' for line in self.B_fh: boundaries=set() #try: if line.startswith('#'):continue if line.startswith('track'):continue if line.startswith('browser'):continue fields=line.rstrip('\r\n').split() chrom = fields[0] txStart = int(fields[1]) txEnd = fields[2] geneName = fields[3] score = fields[4] strand = fields[5] cdsStart = fields[6] cdsEnd = fields[7] geneID = chrom + ":" + str(txStart) + "-" + txEnd + ":" + strand + ':' + geneName #boundaries.add(int(cdsStart)) #boundaries.add(int(cdsEnd)) ref_cdsRange[geneID] = [int(cdsStart),int(cdsEnd)] exon_start=map(int,fields[11].rstrip(',').split(',')) exon_start=map((lambda x: x + txStart),exon_start) exon_end=map(int,fields[10].rstrip(',').split(',')) exon_end=map((lambda x,y:x+y),exon_start,exon_end) #except: # print >>sys.stderr,"[NOTE:input bed must be 12-column] skipped this line: " + line, # continue for st,end in zip(exon_start,exon_end): boundaries.add(st) boundaries.add(end) chrom = chrom + ':' + strand if chrom not in ref_ranges: ref_ranges[chrom] = IntervalTree() ref_ranges[chrom].insert(int(txStart), int(txEnd),{geneID : boundaries}) print >>sys.stderr, "\tReading " + self.A_base_Name + '...' #read bed file utr_diff_score = -0.5 cds_diff_score = -1 common_score = 8 line_id=0 ret_dict={} for line in self.A_fh: boundaries=set() try: if line.startswith('#'):continue if line.startswith('track'):continue if line.startswith('browser'):continue if not line.strip():continue line_id += 1 fields=line.rstrip('\r\n').split() chrom = fields[0] txStart = int(fields[1]) txEnd = fields[2] geneName = fields[3] score = fields[4] strand = fields[5] cdsStart = fields[6] cdsEnd = fields[7] geneID = chrom + ":" + str(txStart) + "-" + txEnd + ":" + strand + ':' + geneName #boundaries.add(int(cdsStart)) #boundaries.add(int(cdsEnd)) exon_start=map(int,fields[11].rstrip(',').split(',')) exon_start=map((lambda x: x + txStart),exon_start) exon_end=map(int,fields[10].rstrip(',').split(',')) exon_end=map((lambda x,y:x+y),exon_start,exon_end) except: print >>sys.stderr,"[NOTE:input bed must be 12-column] skipped this line: " + line, continue for st,end in zip(exon_start,exon_end): boundaries.add(st) boundaries.add(end) chrom = chrom + ':' + strand #print line.strip() + '\t', if chrom in ref_ranges: overlap_genes = ref_ranges[chrom].find(int(txStart),int(txEnd)) if len(overlap_genes)==0: #not overlap with any gene status = "novel_gene(non-overlap with known gene)" else: #overlap with known gene diff_dict=collections.defaultdict(int) #note the higher score, the better similarity #which ref gene is best match for i in overlap_genes: status = '' ref_gene_marks = i.values()[0] ref_gene_id = i.keys()[0] input_gene_marks = boundaries ref_gene_uniq = ref_gene_marks.difference(input_gene_marks) input_gene_uniq = input_gene_marks.difference(ref_gene_marks) common = input_gene_marks.intersection(ref_gene_marks) if ref_gene_marks.__eq__(input_gene_marks): #boundaries 100% match to known exon chain status = 'complete_match' + '_' + ref_gene_id break for bd in common: diff_dict[ref_gene_id] += common_score for bd in input_gene_uniq: if bd >= int(cdsStart) or bd <= int(cdsEnd): diff_dict[ref_gene_id] += cds_diff_score else: diff_dict[ref_gene_id] += utr_diff_score for bd in ref_gene_uniq: if bd >= ref_cdsRange[ref_gene_id][0] or bd <= ref_cdsRange[ref_gene_id][1]: diff_dict[ref_gene_id] += cds_diff_score else: diff_dict[ref_gene_id] += utr_diff_score if status.find('complete_match')==-1: bestID = max(diff_dict,key=diff_dict.get) for i in overlap_genes: if i.keys()[0] == bestID: ref_best_match_set = i.values()[0] break #we found a ref gene best match to input gene input_gene_marks = boundaries i_union = ref_best_match_set.union(input_gene_marks) i_intersection = ref_best_match_set.intersection(input_gene_marks) ref_gene_uniq = ref_best_match_set.difference(input_gene_marks) input_gene_uniq = input_gene_marks.difference(ref_best_match_set) if len(i_intersection) == 0: status = "novel_gene(overlap with known gene)" + bestID else: tmp=set() for bd in ref_gene_uniq: if bd >= ref_cdsRange[bestID][0] or bd <= ref_cdsRange[bestID][1]: tmp.add('different CDS') #status = "partial_match (different CDS)" + bestID else: tmp.add('different UTR') #status = "partial_match (different UTR)" + bestID status = 'partial_match (' + ';'.join(tmp) + ')' else: status = 'unknown' ret_dict[line_id] = status return ret_dict def unionBed3(lst): '''Take the union of 3 column bed files. return a new list''' bitsets = binned_bitsets_from_list(lst) ret_lst=[] for chrom in bitsets: bits = bitsets[chrom] end = 0 while 1: start = bits.next_set( end ) if start == bits.size: break end = bits.next_clear( start ) ret_lst.append([chrom, start, end]) bitsets=dict() return ret_lst def intersectBed3(lst1,lst2): '''Take the intersection of two bed files (3 column bed files)''' bits1 = binned_bitsets_from_list(lst1) bits2 = binned_bitsets_from_list(lst2) bitsets = dict() ret_lst = [] for key in bits1: if key in bits2: bits1[key].iand( bits2[key] ) bitsets[key] = bits1[key] for chrom in bitsets: bits = bitsets[chrom] end = 0 while 1: start = bits.next_set( end ) if start == bits.size: break end = bits.next_clear( start ) ret_lst.append([chrom, start, end]) bits1.clear() bits2.clear() bitsets.clear() return ret_lst def subtractBed3(lst1,lst2): '''subtrack lst2 from lst1''' bitsets1 = binned_bitsets_from_list(lst1) bitsets2 = binned_bitsets_from_list(lst2) ret_lst=[] for chrom in bitsets1: if chrom not in bitsets1: continue bits1 = bitsets1[chrom] if chrom in bitsets2: bits2 = bitsets2[chrom] bits2.invert() bits1.iand( bits2 ) end=0 while 1: start = bits1.next_set( end ) if start == bits1.size: break end = bits1.next_clear( start ) ret_lst.append([chrom,start,end]) bitsets1 = dict() bitsets2 = dict() return ret_lst def tillingBed(chrName,chrSize,stepSize=10000): '''tilling whome genome into small sizes''' #tilling genome for start in xrange(0,chrSize,stepSize): end = start + stepSize if end < chrSize: yield (chrName,start,end) else: yield (chrName,start,chrSize)