import sys, itertools, optparse, warnings optParser = optparse.OptionParser( usage = "python %prog [options] ", description= "This script counts how many reads in fall onto each exonic " + "part given in and outputs a list of counts in " + ", for further analysis with the DEXSeq Bioconductor package. " + "Notes: Use dexseq_prepare_annotation.py to produce . " + " may be '-' to indicate standard input.", epilog = "Written by Simon Anders (sanders@fs.tum.de) and Alejandro Reyes (reyes@embl.de), " + "European Molecular Biology Laboratory (EMBL). (c) 2010-2013. Released under the " + " terms of the GNU General Public License v3. Part of the 'DEXSeq' package." ) optParser.add_option( "-p", "--paired", type="choice", dest="paired", choices = ( "no", "yes" ), default = "no", help = "'yes' or 'no'. Indicates whether the data is paired-end (default: no)" ) optParser.add_option( "-s", "--stranded", type="choice", dest="stranded", choices = ( "yes", "no", "reverse" ), default = "yes", help = "'yes', 'no', or 'reverse'. Indicates whether the data is " + "from a strand-specific assay (default: yes ). " + "Be sure to switch to 'no' if you use a non strand-specific RNA-Seq library " + "preparation protocol. 'reverse' inverts strands and is needed for certain " + "protocols, e.g. paired-end with circularization." ) optParser.add_option( "-a", "--minaqual", type="int", dest="minaqual", default = 10, help = "skip all reads with alignment quality lower than the given " + "minimum value (default: 10)" ) optParser.add_option( "-f", "--format", type="choice", dest="alignment", choices=("sam", "bam"), default="sam", help = "'sam' or 'bam'. Format of (default: sam)" ) optParser.add_option( "-r", "--order", type="choice", dest="order", choices=("pos", "name"), default="name", help = "'pos' or 'name'. Sorting order of (default: name). Paired-end sequencing " + "data must be sorted either by position or by read name, and the sorting order " + "must be specified. Ignored for single-end data." ) if len( sys.argv ) == 1: optParser.print_help() sys.exit(1) (opts, args) = optParser.parse_args() if len( args ) != 3: sys.stderr.write( sys.argv[0] + ": Error: Please provide three arguments.\n" ) sys.stderr.write( " Call with '-h' to get usage information.\n" ) sys.exit( 1 ) try: import HTSeq except ImportError: sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" ) sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" ) sys.exit(1) gff_file = args[0] sam_file = args[1] out_file = args[2] stranded = opts.stranded == "yes" or opts.stranded == "reverse" reverse = opts.stranded == "reverse" is_PE = opts.paired == "yes" alignment = opts.alignment minaqual = opts.minaqual order = opts.order if alignment == "bam": try: import pysam except ImportError: sys.stderr.write( "Could not import pysam, which is needed to process BAM file (though\n" ) sys.stderr.write( "not to process text SAM files). Please install the 'pysam' library from\n" ) sys.stderr.write( "https://code.google.com/p/pysam/\n" ) sys.exit(1) if sam_file == "-": sam_file = sys.stdin # Step 1: Read in the GFF file as generated by aggregate_genes.py # and put everything into a GenomicArrayOfSets features = HTSeq.GenomicArrayOfSets( "auto", stranded=stranded ) for f in HTSeq.GFF_Reader( gff_file ): if f.type == "exonic_part": f.name = f.attr['gene_id'] + ":" + f.attr['exonic_part_number'] features[f.iv] += f # initialise counters num_reads = 0 counts = {} counts[ '_empty' ] = 0 counts[ '_ambiguous' ] = 0 counts[ '_lowaqual' ] = 0 counts[ '_notaligned' ] = 0 # put a zero for each feature ID for iv, s in features.steps(): for f in s: counts[ f.name ] = 0 #We need this little helper below: def reverse_strand( s ): if s == "+": return "-" elif s == "-": return "+" else: raise SystemError, "illegal strand" def update_count_vector( counts, rs ): if( type(rs) == str): counts[ rs ] += 1 else: for f in rs: counts[f.name] += 1 return counts def map_read_pair(af, ar): rs = set() if ar != None and ar.optional_field("NH") > 1: return '_notunique' if af != None and af.optional_field("NH") > 1: return '_notunique' if af and ar and not af.aligned and not ar.aligned: return '_notaligned' if af and ar and not af.aQual < minaqual and ar.aQual < minaqual: return '_lowaqual' if af and af.aligned and af.aQual >= minaqual and af.iv.chrom in features.chrom_vectors.keys(): for cigop in af.cigar: if cigop.type != "M": continue if reverse: cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) for iv, s in features[cigop.ref_iv].steps(): rs = rs.union( s ) if ar and ar.aligned and ar.aQual >= minaqual and ar.iv.chrom in features.chrom_vectors.keys(): for cigop in ar.cigar: if cigop.type != "M": continue if not reverse: cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) for iv, s in features[cigop.ref_iv].steps(): rs = rs.union( s ) set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] ) if len( set_of_gene_names ) == 0: return '_empty' elif len( set_of_gene_names ) > 1: return '_ambiguous' else: return rs def clean_read_queue( queue, current_position ): clean_queue = dict( queue ) for i in queue: if queue[i].mate_start.pos < current_position: warnings.warn( "Read "+ i + " claims to have an aligned mate that could not be found." ) del clean_queue[i] return clean_queue if alignment == "sam": reader = HTSeq.SAM_Reader else: if HTSeq.__version__ < '0.5.4p4': raise SystemError, "If you are using alignment files in a bam format, please update your HTSeq to 0.5.4p4 or higher" reader = HTSeq.BAM_Reader # Now go through the aligned reads num_reads = 0 if not is_PE: for a in reader( sam_file ): if a.optional_field("NH") > 1: continue if not a.aligned: counts[ '_notaligned' ] += 1 continue if a.aQual < minaqual: counts[ '_lowaqual' ] += 1 continue rs = set() for cigop in a.cigar: if cigop.type != "M": continue if reverse: cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) for iv, s in features[cigop.ref_iv].steps( ): rs = rs.union( s ) set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] ) if len( set_of_gene_names ) == 0: counts[ '_empty' ] += 1 elif len( set_of_gene_names ) > 1: counts[ '_ambiguous' ] +=1 else: for f in rs: counts[ f.name ] += 1 num_reads += 1 if num_reads % 100000 == 0: sys.stderr.write( "%d reads processed.\n" % num_reads ) else: # paired-end alignments = dict() if order == "name": for af, ar in HTSeq.pair_SAM_alignments( reader( sam_file ) ): if af == None and ar.mate_aligned: continue elif ar == None and af.mate_aligned: continue else: rs = map_read_pair( af, ar ) if rs == '_notunique': continue counts = update_count_vector( counts, rs ) num_reads += 1 if num_reads % 100000 == 0: sys.stderr.write( "%d reads processed.\n" % num_reads ) else: processed_chromosomes = dict() num_reads = 0 current_chromosome='' current_position='' for a in reader( sam_file ): if current_chromosome != a.iv.chrom: if current_chromosome in processed_chromosomes: raise SystemError, "A chromosome that had finished to be processed before was found again in the alignment file, is your alignment file properly sorted by position?" processed_chromosomes[current_chromosome] = 1 alignments = clean_read_queue( alignments, current_position ) del alignments alignments = dict() if current_chromosome == a.iv.chrom and a.iv.start < current_position: raise SystemError, "Current read position is smaller than previous reads, is your alignment file properly sorted by position?" current_chromosome = a.iv.chrom current_position = a.iv.start if a.read.name and a.mate_aligned: if a.read.name in alignments: b = alignments[ a.read.name ] if a.pe_which == "first" and b.pe_which == "second": af=a ar=b else: af=b ar=a rs = map_read_pair(af, ar) del alignments[ a.read.name ] if rs == '_notunique': continue counts = update_count_vector(counts, rs) else: alignments[ a.read.name ] = a else: if a.pe_which == "first": rs = map_read_pair(a, None) else: rs = map_read_pair(None, a) if rs == '_notunique': continue counts = update_count_vector(counts, rs) num_reads += 1 if num_reads % 200000 == 0: alignments = clean_read_queue( alignments, current_position ) sys.stderr.write( "%d reads processed.\n" % (num_reads / 2) ) # Step 3: Write out the results fout = open( out_file, "w" ) for fn in sorted( counts.keys() ): fout.write( "%s\t%d\n" % ( fn, counts[fn] ) ) fout.close()