# Time-stamp: <2016-02-15 16:12:30 Tao Liu> """Description: MACS 2 main executable Copyright (c) 2008,2009 Yong Zhang, Tao Liu Copyright (c) 2010,2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: release candidate @version: $Id$ @author: Yong Zhang, Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import os import sys import logging from time import strftime import tempfile # ------------------------------------ # own python modules # ------------------------------------ from MACS2.OptValidator import opt_validate from MACS2.OutputWriter import * from MACS2.Prob import binomial_cdf_inv from MACS2.PeakModel import PeakModel,NotEnoughPairsException from MACS2.PeakDetect import PeakDetect from MACS2.Constants import * # ------------------------------------ # Main function # ------------------------------------ def check_names(treat, control, error_stream): """check common chromosome names""" tchrnames = set(treat.get_chr_names()) cchrnames = set(control.get_chr_names()) commonnames = tchrnames.intersection(cchrnames) if len(commonnames)==0: error_stream("No common chromosome names can be found from treatment and control! Check your input files! MACS will quit...") error_stream("Chromosome names in treatment: %s" % ",".join(sorted(tchrnames))) error_stream("Chromosome names in control: %s" % ",".join(sorted(cchrnames))) sys.exit() def run( args ): """The Main function/pipeline for MACS. """ # Parse options... options = opt_validate( args ) # end of parsing commandline options info = options.info warn = options.warn debug = options.debug error = options.error #0 output arguments info("\n"+options.argtxt) options.PE_MODE = options.format in ('BAMPE','BEDPE') if options.PE_MODE: tag = 'fragment' # call things fragments not tags else: tag = 'tag' tempfile.tempdir = options.tempdir #1 Read tag files info("#1 read %s files...", tag) if options.PE_MODE: (treat, control) = load_frag_files_options (options) else: (treat, control) = load_tag_files_options (options) if control is not None: check_names(treat, control, error) info("#1 %s size = %d", tag, options.tsize) tagsinfo = "# %s size is determined as %d bps\n" % (tag, options.tsize) t0 = treat.total tagsinfo += "# total %ss in treatment: %d\n" % (tag, t0) info("#1 total %ss in treatment: %d", tag, t0) # not ready yet # options.filteringmodel = True # if options.filteringmodel: # treat.separate_dups() # t0 = treat.total + treat.dups.total # t1 = treat.total # info("#1 Redundant rate of treatment: %.2f", float(t0 - t1) / t0) # tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(t0-t1)/t0) # elif options.keepduplicates != "all": if options.keepduplicates != "all": if options.keepduplicates == "auto": info("#1 calculate max duplicate %ss in single position based on binomial distribution...", tag) treatment_max_dup_tags = cal_max_dup_tags(options.gsize,t0) info("#1 max_dup_tags based on binomial = %d" % (treatment_max_dup_tags)) else: info("#1 user defined the maximum %ss...", tag) treatment_max_dup_tags = int(options.keepduplicates) if options.PE_MODE: info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags) else: info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags) treat.separate_dups(treatment_max_dup_tags) # changed 5-29 # treat.filter_dup(treatment_max_dup_tags) t1 = treat.total info("#1 %ss after filtering in treatment: %d", tag, t1) tagsinfo += "# %ss after filtering in treatment: %d\n" % (tag, t1) if options.PE_MODE: tagsinfo += "# maximum duplicate fragments in treatment = %d\n" % (treatment_max_dup_tags) else: tagsinfo += "# maximum duplicate tags at the same position in treatment = %d\n" % (treatment_max_dup_tags) info("#1 Redundant rate of treatment: %.2f", float(t0 - t1) / t0) tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(t0-t1)/t0) else: t1 = t0 if control is not None: c0 = control.total tagsinfo += "# total %ss in control: %d\n" % (tag, c0) info("#1 total %ss in control: %d", tag, c0) # not ready yet #if options.filteringmodel: # control.separate_dups() # c0 = treat.total + treat.dups.total # c1 = treat.total # info("#1 Redundant rate of treatment: %.2f", float(c0 - c1) / c0) # tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(c0-c1)/c0) #elif options.keepduplicates != "all": if options.keepduplicates != "all": if options.keepduplicates == "auto": info("#1 for control, calculate max duplicate %ss in single position based on binomial distribution...", tag) control_max_dup_tags = cal_max_dup_tags(options.gsize,c0) info("#1 max_dup_tags based on binomial = %d" % (control_max_dup_tags)) else: info("#1 user defined the maximum %ss...", tag) control_max_dup_tags = int(options.keepduplicates) if options.PE_MODE: info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags) else: info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags) # control.filter_dup(treatment_max_dup_tags) control.separate_dups(treatment_max_dup_tags) # changed 5-29 c1 = control.total info("#1 %ss after filtering in control: %d", tag, c1) tagsinfo += "# %ss after filtering in control: %d\n" % (tag, c1) if options.PE_MODE: tagsinfo += "# maximum duplicate fragments in control = %d\n" % (treatment_max_dup_tags) else: tagsinfo += "# maximum duplicate tags at the same position in control = %d\n" % (treatment_max_dup_tags) info("#1 Redundant rate of control: %.2f" % (float(c0-c1)/c0)) tagsinfo += "# Redundant rate in control: %.2f\n" % (float(c0-c1)/c0) else: c1 = c0 info("#1 finished!") #2 Build Model info("#2 Build Peak Model...") if options.nomodel: info("#2 Skipped...") if options.PE_MODE: #options.shiftsize = 0 options.d = options.tsize else: options.d=options.extsize if options.shift > 0: info("#2 Sequencing ends will be shifted towards 3' by %d bp(s)" % (options.shift)) elif options.shift < 0: info("#2 Sequencing ends will be shifted towards 5' by %d bp(s)" % (options.shift * -1)) info("#2 Use %d as fragment length" % (options.d)) options.scanwindow=2*options.d # remove the effect of --bw else: try: peakmodel = PeakModel(treatment = treat, max_pairnum = MAX_PAIRNUM, opt = options ) info("#2 finished!") debug("#2 Summary Model:") debug("#2 min_tags: %d" % (peakmodel.min_tags)) debug("#2 d: %d" % (peakmodel.d)) debug("#2 scan_window: %d" % (peakmodel.scan_window)) info("#2 predicted fragment length is %d bps" % peakmodel.d) info("#2 alternative fragment length(s) may be %s bps" % ','.join(map(str,peakmodel.alternative_d))) info("#2.2 Generate R script for model : %s" % (options.modelR)) model2r_script(peakmodel,options.modelR,options.name) options.d = peakmodel.d options.scanwindow= 2*options.d if options.d <= 2*options.tsize: warn("#2 Since the d (%.0f) calculated from paired-peaks are smaller than 2*tag length, it may be influenced by unknown sequencing problem!" % (options.d)) if options.onauto: options.d=options.extsize options.scanwindow=2*options.d warn("#2 MACS will use %d as EXTSIZE/fragment length d. NOTE: if the d calculated is still acceptable, please do not use --fix-bimodal option!" % (options.d)) else: warn("#2 You may need to consider one of the other alternative d(s): %s" % ','.join(map(str,peakmodel.alternative_d))) warn("#2 You can restart the process with --nomodel --extsize XXX with your choice or an arbitrary number. Nontheless, MACS will continute computing.") except NotEnoughPairsException: if not options.onauto: sys.exit(1) warn("#2 Skipped...") options.d=options.extsize options.scanwindow=2*options.d warn("#2 Since --fix-bimodal is set, MACS will use %d as fragment length" % (options.d)) #3 Call Peaks info("#3 Call peaks...") if options.nolambda: info("# local lambda is disabled!") # decide options.tocontrol according to options.tolarge if control and options.PE_MODE: c1 = c1 * 2 if control: if options.downsample: # use random sampling to balance treatment and control info("#3 User prefers to use random sampling instead of linear scaling.") if t1 > c1: info("#3 MACS is random sampling treatment %ss...", tag) if options.seed < 0: warn("#3 Your results may not be reproducible due to the random sampling!") else: info("#3 Random seed (%d) is used." % options.seed) treat.sample_num(c1, options.seed) info("#3 %d Tags from treatment are kept", treat.total) elif c1 > t1: info("#3 MACS is random sampling control %ss...", tag) if options.seed < 0: warn("#3 Your results may not be reproducible due to the random sampling!") else: info("#3 Random seed (%d) is used." % options.seed) control.sample_num(t1, options.seed) info("#3 %d %ss from control are kept", control.total, tag) # set options.tocontrol although it would;t matter now options.tocontrol = False else: if options.tolarge: if t1 > c1: # treatment has more tags than control, since tolarge is # true, we will scale control to treatment. options.tocontrol = False else: # treatment has less tags than control, since tolarge is # true, we will scale treatment to control. options.tocontrol = True else: if t1 > c1: # treatment has more tags than control, since tolarge is # false, we will scale treatment to control. options.tocontrol = True else: # treatment has less tags than control, since tolarge is # false, we will scale control to treatment. options.tocontrol = False peakdetect = PeakDetect(treat = treat, control = control, opt = options ) peakdetect.call_peaks() # filter out low FE peaks peakdetect.peaks.filter_fc( fc_low = options.fecutoff ) #call refinepeak if needed. # if options.refine_peaks: # info("#3 now put back duplicate reads...") # treat.addback_dups() # info("#3 calculate reads balance to refine peak summits...") # refined_peaks = treat.refine_peak_from_tags_distribution ( peakdetect.peaks, options.d, 0 ) # info("#3 reassign scores for newly refined peak summits...") # peakdetect.peaks = peakdetect.scoretrack.reassign_peaks( refined_peaks ) # replace # #info("#3 write to file: %s ..." % options.name+"_refined_peaks.encodePeak" ) # #refinedpeakfile = open(options.name+"_refined_peaks.encodePeak", "w") # #refined_peaks.write_to_narrowPeak (refinedpeakfile, name_prefix="%s_refined_peak_", name=options.name, score_column=score_column, trackline=options.trackline ) #diag_result = peakdetect.diag_result() #4 output #4.1 peaks in XLS info("#4 Write output xls file... %s" % (options.peakxls)) ofhd_xls = open( options.peakxls, "w" ) ofhd_xls.write("# This file is generated by MACS version %s\n" % (MACS_VERSION)) ofhd_xls.write(options.argtxt+"\n") ofhd_xls.write(tagsinfo) if options.shift > 0: ofhd_xls.write("# Sequencing ends will be shifted towards 3' by %d bp(s)\n" % (options.shift)) elif options.shift < 0: ofhd_xls.write("# Sequencing ends will be shifted towards 5' by %d bp(s)\n" % (options.shift * -1)) ofhd_xls.write("# d = %d\n" % (options.d)) try: ofhd_xls.write("# alternative fragment length(s) may be %s bps\n" % ','.join(map(str,peakmodel.alternative_d))) except: # when --nomodel is used, there is no peakmodel object. Simply skip this line. pass if options.nolambda: ofhd_xls.write("# local lambda is disabled!\n") # pass write method so we can print too, and include name peakdetect.peaks.write_to_xls(ofhd_xls, name = options.name) ofhd_xls.close() #4.2 peaks in BED if options.log_pvalue: score_column = "pscore" elif options.log_qvalue: score_column = "qscore" #4.2 peaks in narrowPeak if not options.broad: #info("#4 Write peak bed file... %s" % (options.peakbed)) #ofhd_bed = open(options.peakbed,"w") #peakdetect.peaks.write_to_bed (ofhd_bed, name_prefix="%s_peak_", name = options.name, description="Peaks for %s (Made with MACS v2, " + strftime("%x") + ")", score_column=score_column, trackline=options.trackline) #ofhd_bed.close() info("#4 Write peak in narrowPeak format file... %s" % (options.peakNarrowPeak)) ofhd_bed = open( options.peakNarrowPeak, "w" ) peakdetect.peaks.write_to_narrowPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, score_column=score_column, trackline=options.trackline ) ofhd_bed.close() #4.2-2 summits in BED info("#4 Write summits bed file... %s" % (options.summitbed)) ofhd_summits = open( options.summitbed, "w" ) peakdetect.peaks.write_to_summit_bed (ofhd_summits, name_prefix="%s_peak_", name=options.name, description="Summits for %s (Made with MACS v2, " + strftime("%x") + ")", score_column=score_column, trackline=options.trackline ) ofhd_summits.close() #4.2 broad peaks in bed12 or gappedPeak else: info("#4 Write broad peak in broadPeak format file... %s" % (options.peakBroadPeak)) ofhd_bed = open( options.peakBroadPeak, "w" ) peakdetect.peaks.write_to_broadPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, description=options.name, trackline=options.trackline) ofhd_bed.close() info("#4 Write broad peak in bed12/gappedPeak format file... %s" % (options.peakGappedPeak)) ofhd_bed = open( options.peakGappedPeak, "w" ) peakdetect.peaks.write_to_gappedPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, description=options.name, trackline=options.trackline) ofhd_bed.close() info("Done!") def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ): """Calculate the maximum duplicated tag number based on genome size, total tag number and a p-value based on binomial distribution. Brute force algorithm to calculate reverse CDF no more than MAX_LAMBDA(100000). """ return binomial_cdf_inv(1-p,tags_number,1.0/genome_size) def load_frag_files_options ( options ): """From the options, load treatment fragments and control fragments (if available). """ options.info("#1 read treatment fragments...") tp = options.parser(options.tfile[0], buffer_size=options.buffer_size) treat = tp.build_petrack() #treat.sort() if len(options.tfile) > 1: # multiple input for tfile in options.tfile[1:]: tp = options.parser(tfile, buffer_size=options.buffer_size) treat = tp.append_petrack( treat ) #treat.sort() treat.finalize() options.tsize = tp.d if options.cfile: options.info("#1.2 read input fragments...") cp = options.parser(options.cfile[0], buffer_size=options.buffer_size) control = cp.build_petrack() control_d = cp.d #control.sort() if len(options.cfile) > 1: # multiple input for cfile in options.cfile[1:]: cp = options.parser(cfile, buffer_size=options.buffer_size) control = cp.append_petrack( control ) #control.sort() control.finalize() else: control = None options.info("#1 mean fragment size is determined as %d bp from treatment" % options.tsize) # options.info("#1 fragment size variance is determined as %d bp from treatment" % tp.variance) if control is not None: options.info("#1 note: mean fragment size in control is %d bp -- value ignored" % control_d) return (treat, control) def load_tag_files_options ( options ): """From the options, load treatment tags and control tags (if available). """ options.info("#1 read treatment tags...") tp = options.parser(options.tfile[0], buffer_size=options.buffer_size) if not options.tsize: # override tsize if user specified --tsize ttsize = tp.tsize() options.tsize = ttsize treat = tp.build_fwtrack() #treat.sort() if len(options.tfile) > 1: # multiple input for tfile in options.tfile[1:]: tp = options.parser(tfile, buffer_size=options.buffer_size) treat = tp.append_fwtrack( treat ) #treat.sort() treat.finalize() if options.cfile: options.info("#1.2 read input tags...") control = options.parser(options.cfile[0], buffer_size=options.buffer_size).build_fwtrack() #control.sort() if len(options.cfile) > 1: # multiple input for cfile in options.cfile[1:]: cp = options.parser(cfile, buffer_size=options.buffer_size) control = cp.append_fwtrack( control ) #control.sort() control.finalize() else: control = None options.info("#1 tag size is determined as %d bps" % options.tsize) return (treat, control)