#!/usr/bin/env python # HTSeq_QA.py # # (c) Simon Anders, European Molecular Biology Laboratory, 2010 # released under GNU General Public License import sys, time, os.path, optparse from itertools import * import numpy import HTSeq def main(): try: import matplotlib except ImportError: sys.stderr.write("This script needs the 'matplotlib' library, which ") sys.stderr.write("was not found. Please install it." ) matplotlib.use('PDF') from matplotlib import pyplot # **** Parse command line **** optParser = optparse.OptionParser( usage = "%prog [options] read_file", description= "This script take a file with high-throughput sequencing reads " + "(supported formats: SAM, Solexa _export.txt, FASTQ, Solexa " + "_sequence.txt) and performs a simply quality assessment by " + "producing plots showing the distribution of called bases and " + "base-call quality scores by position within the reads. The " + "plots are output as a PDF file.", epilog = "Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology " + " Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General " + " Public License v3. Part of the 'HTSeq' framework, version %s." % HTSeq.__version__ ) optParser.add_option( "-t", "--type", type="choice", dest="type", choices = ("sam", "bam", "solexa-export", "fastq", "solexa-fastq"), default = "sam", help="type of read_file (one of: sam [default], bam, " + "solexa-export, fastq, solexa-fastq)" ) optParser.add_option( "-o", "--outfile", type="string", dest="outfile", help="output filename (default is .pdf)" ) optParser.add_option( "-r", "--readlength", type="int", dest="readlen", help="the maximum read length (when not specified, the script guesses from the file" ) optParser.add_option( "-g", "--gamma", type="float", dest="gamma", default = 0.3, help="the gamma factor for the contrast adjustment of the quality score plot" ) optParser.add_option( "-n", "--nosplit", action="store_true", dest="nosplit", help="do not split reads in unaligned and aligned ones" ) optParser.add_option( "-m", "--maxqual", type="int", dest="maxqual", default=41, help="the maximum quality score that appears in the data (default: 41)" ) if len( sys.argv ) == 1: optParser.print_help() sys.exit(1) (opts, args) = optParser.parse_args() if len( args ) != 1: sys.stderr.write( sys.argv[0] + ": Error: Please provide one argument (the read_file).\n" ) sys.stderr.write( " Call with '-h' to get usage information.\n" ) sys.exit( 1 ) readfilename = args[0] if opts.type == "sam": readfile = HTSeq.SAM_Reader( readfilename ) isAlnmntFile = True elif opts.type == "bam": readfile = HTSeq.BAM_Reader( readfilename ) isAlnmntFile = True elif opts.type == "solexa-export": readfile = HTSeq.SolexaExportReader( readfilename ) isAlnmntFile = True elif opts.type == "fastq": readfile = HTSeq.FastqReader( readfilename ) isAlnmntFile = False elif opts.type == "solexa-fastq": readfile = HTSeq.FastqReader( readfilename, "solexa" ) isAlnmntFile = False else: sys.error( "Oops." ) twoColumns = isAlnmntFile and not opts.nosplit if opts.outfile is None: outfilename = os.path.basename( readfilename ) + ".pdf" else: outfilename = opts.outfile # **** Get read length **** if opts.readlen is not None: readlen = opts.readlen else: readlen = 0 if isAlnmntFile: reads = ( a.read for a in readfile ) else: reads = readfile for r in islice( reads, 10000 ): if len( r ) > readlen: readlen = len( r ) max_qual = opts.maxqual gamma = opts.gamma # **** Initialize count arrays **** base_arr_U = numpy.zeros( ( readlen, 5 ), numpy.int ) qual_arr_U = numpy.zeros( ( readlen, max_qual+1 ), numpy.int ) if twoColumns: base_arr_A = numpy.zeros( ( readlen, 5 ), numpy.int ) qual_arr_A = numpy.zeros( ( readlen, max_qual+1 ), numpy.int ) # **** Main counting loop **** i = 0 try: for a in readfile: if isAlnmntFile: r = a.read else: r = a if twoColumns and (isAlnmntFile and a.aligned): r.add_bases_to_count_array( base_arr_A ) r.add_qual_to_count_array( qual_arr_A ) else: r.add_bases_to_count_array( base_arr_U ) r.add_qual_to_count_array( qual_arr_U ) i += 1 if i % 200000 == 0: print i, "reads processed" except: sys.stderr.write( "Error occured in: %s\n" % readfile.get_line_number_string() ) raise print i, "reads processed" # **** Normalize result **** def norm_by_pos( arr ): arr = numpy.array( arr, numpy.float ) arr_n = ( arr.T / arr.sum( 1 ) ).T arr_n[ arr == 0 ] = 0 return arr_n def norm_by_start( arr ): arr = numpy.array( arr, numpy.float ) arr_n = ( arr.T / arr.sum( 1 )[ 0 ] ).T arr_n[ arr == 0 ] = 0 return arr_n base_arr_U_n = norm_by_pos( base_arr_U ) qual_arr_U_n = norm_by_start( qual_arr_U ) nreads_U = base_arr_U[0,:].sum() if twoColumns: base_arr_A_n = norm_by_pos( base_arr_A ) qual_arr_A_n = norm_by_start( qual_arr_A ) nreads_A = base_arr_A[0,:].sum() # **** Make plot **** def plot_bases( arr ): xg = numpy.arange( readlen ) pyplot.plot( xg, arr[ : , 0 ], marker='.', color='red') pyplot.plot( xg, arr[ : , 1 ], marker='.', color='darkgreen') pyplot.plot( xg, arr[ : , 2 ], marker='.',color='lightgreen') pyplot.plot( xg, arr[ : , 3 ], marker='.',color='orange') pyplot.plot( xg, arr[ : , 4 ], marker='.',color='grey') pyplot.axis( (0, readlen-1, 0, 1 ) ) pyplot.text( readlen*.70, .9, "A", color="red" ) pyplot.text( readlen*.75, .9, "C", color="darkgreen" ) pyplot.text( readlen*.80, .9, "G", color="lightgreen" ) pyplot.text( readlen*.85, .9, "T", color="orange" ) pyplot.text( readlen*.90, .9, "N", color="grey" ) pyplot.figure() pyplot.subplots_adjust( top=.85 ) pyplot.suptitle( os.path.basename(readfilename), fontweight='bold' ) if twoColumns: pyplot.subplot( 221 ) plot_bases( base_arr_U_n ) pyplot.ylabel( "proportion of base" ) pyplot.title( "non-aligned reads\n%.0f%% (%.3f million)" % ( 100. * nreads_U / (nreads_U+nreads_A), nreads_U / 1e6 ) ) pyplot.subplot( 222 ) plot_bases( base_arr_A_n ) pyplot.title( "aligned reads\n%.0f%% (%.3f million)" % ( 100. * nreads_A / (nreads_U+nreads_A), nreads_A / 1e6 ) ) pyplot.subplot( 223 ) pyplot.pcolor( qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens, norm=pyplot.normalize( 0, 1 ) ) pyplot.axis( (0, readlen-1, 0, max_qual+1 ) ) pyplot.xlabel( "position in read" ) pyplot.ylabel( "base-call quality score" ) pyplot.subplot( 224 ) pyplot.pcolor( qual_arr_A_n.T ** gamma, cmap=pyplot.cm.Greens, norm=pyplot.normalize( 0, 1 ) ) pyplot.axis( (0, readlen-1, 0, max_qual+1 ) ) pyplot.xlabel( "position in read" ) else: pyplot.subplot( 211 ) plot_bases( base_arr_U_n ) pyplot.ylabel( "proportion of base" ) pyplot.title( "%.3f million reads" % ( nreads_U / 1e6 ) ) pyplot.subplot( 212 ) pyplot.pcolor( qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens, norm=pyplot.normalize( 0, 1 ) ) pyplot.axis( (0, readlen-1, 0, max_qual+1 ) ) pyplot.xlabel( "position in read" ) pyplot.ylabel( "base-call quality score" ) pyplot.savefig( outfilename ) if __name__ == "__main__": main()