# Time-stamp: <2015-07-27 13:52:39 Tao Liu> """Description: MACS 2 main executable Copyright (c) 2008,2009 Yong Zhang, Tao Liu Copyright (c) 2010,2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: release candidate @version: $Id$ @author: Yong Zhang, Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import os import sys import logging from time import strftime # ------------------------------------ # own python modules # ------------------------------------ from MACS2.IO import cBedGraphIO from MACS2.IO.cDiffScore import DiffScoreTrackI from MACS2.IO.cPeakIO import PeakIO from MACS2.OptValidator import diff_opt_validate from MACS2.OutputWriter import * from MACS2.Prob import binomial_cdf_inv from MACS2.PeakModel import PeakModel,NotEnoughPairsException from MACS2.PeakDetect import PeakDetect from MACS2.Constants import * # ------------------------------------ # constants # ------------------------------------ logging.basicConfig(level=20, format='%(levelname)-5s @ %(asctime)s: %(message)s ', datefmt='%a, %d %b %Y %H:%M:%S', stream=sys.stderr, filemode="w" ) # ------------------------------------ # Misc functions # ------------------------------------ error = logging.critical # function alias warn = logging.warning debug = logging.debug info = logging.info # ------------------------------------ # Main function # ------------------------------------ def run( args ): """The Differential function/pipeline for MACS. """ # Parse options... options = diff_opt_validate( args ) #0 output arguments # info("\n"+options.argtxt) ofile_prefix = options.name # check if tag files exist with open(options.t1bdg) as f: pass with open(options.c1bdg) as f: pass with open(options.t2bdg) as f: pass with open(options.c2bdg) as f: pass if not options.peaks1 == '': info("Read peaks for condition 1...") p1io = PeakIO() with open(options.peaks1, 'rU') as f: p1io.read_from_xls(f) if not options.peaks2 == '': info("Read peaks for condition 2...") p2io = PeakIO() with open(options.peaks2, 'rU') as f: p2io.read_from_xls(f) #1 Read tag files info("Read and build treatment 1 bedGraph...") t1bio = cBedGraphIO.bedGraphIO(options.t1bdg) t1btrack = t1bio.build_bdgtrack() info("Read and build control 1 bedGraph...") c1bio = cBedGraphIO.bedGraphIO(options.c1bdg) c1btrack = c1bio.build_bdgtrack() if len(options.depth) >=2: depth1 = options.depth[0] depth2 = options.depth[1] else: depth1 = options.depth[0] depth2 = depth1 info("Read and build treatment 2 bedGraph...") t2bio = cBedGraphIO.bedGraphIO(options.t2bdg) t2btrack = t2bio.build_bdgtrack() info("Read and build control 2 bedGraph...") c2bio = cBedGraphIO.bedGraphIO(options.c2bdg) c2btrack = c2bio.build_bdgtrack() #3 Call Peaks diffscore = DiffScoreTrackI( t1btrack, c1btrack, t2btrack, c2btrack, depth1, depth2 ) diffscore.finalize() if options.call_peaks: diffscore.set_track_score_method(options.track_score_method) info("Calling peaks") if options.track_score_method == 'p': diffscore.call_peaks(cutoff = options.peaks_log_pvalue, min_length = options.pminlen) elif options.track_score_method == 'q': diffscore.call_peaks(cutoff = options.peaks_log_qvalue, min_length = options.pminlen) else: raise NotImplementedError else: info("Using existing peaks") diffscore.store_peaks(p1io, p2io) info("Rebuilding chromosomes") diffscore.rebuild_chromosomes() diffscore.annotate_peaks() info("Calling differentially occupied peaks") if options.score_method == 'p': diffscore.call_diff_peaks(cutoff = options.log_pvalue, min_length = options.dminlen, score_method = options.score_method) if options.score_method == 'q': diffscore.call_diff_peaks(cutoff = options.log_qvalue, min_length = options.dminlen, score_method = options.score_method) # diffscore.print_some_peaks() # diffscore.print_diff_peaks() info("Write output xls and BED files...") ofhd_xls = open( os.path.join( options.outdir, options.peakxls), "w" ) ofhd_xls.write("# This file is generated by MACS version, using the diffpeak module %s\n" % (MACS_VERSION)) ofhd_xls.write( options.argtxt+"\n" ) ofhd_bed = open( os.path.join( options.outdir, options.peakbed), "w" ) # pass write method so we can print too, and include name diffscore.write_peaks(xls=ofhd_xls, bed=ofhd_bed, name = options.name, name_prefix="%s_peak_", description="Peaks for %s (Made with MACS v2, " + strftime("%x") + ")", trackline=options.trackline) ofhd_xls.close() ofhd_bed.close() if diffscore.has_peakio(): info("Write annotated peak xls files...") ofhd_xls1 = open( os.path.join( options.outdir, options.peak1xls), "w" ) ofhd_xls1.write("# This file is generated by MACS version, using the diffpeak module %s\n" % (MACS_VERSION)) ofhd_xls1.write(options.argtxt+"\n") ofhd_xls2 = open( os.path.join( options.outdir, options.peak2xls), "w" ) ofhd_xls2.write("# This file is generated by MACS version, using the diffpeak module %s\n" % (MACS_VERSION)) ofhd_xls2.write(options.argtxt+"\n") diffscore.write_peaks_by_summit(ofhd_xls1, ofhd_xls2, name = options.name, name_prefix="%s_peak_") ofhd_xls1.close() ofhd_xls2.close() if options.store_bdg: info("#4 Write output bedgraph files...") ofhd_logLR = open( os.path.join( options.outdir, options.bdglogLR), "w" ) ofhd_pvalue = open( os.path.join( options.outdir, options.bdgpvalue), "w" ) ofhd_logFC = open( os.path.join( options.outdir, options.bdglogFC), "w" ) diffscore.write_bedgraphs(logLR=ofhd_logLR, pvalue=ofhd_pvalue, logFC=ofhd_logFC, name = options.name, description=" for %s (Made with MACS v2, " + strftime("%x") + ")", trackline=options.trackline) ofhd_logLR.close() ofhd_pvalue.close() ofhd_logFC.close() def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ): """Calculate the maximum duplicated tag number based on genome size, total tag number and a p-value based on binomial distribution. Brute force algorithm to calculate reverse CDF no more than MAX_LAMBDA(100000). """ return binomial_cdf_inv(1-p,tags_number,1.0/genome_size) def load_frag_files_options ( options ): """From the options, load treatment fragments and control fragments (if available). """ options.info("#1 read treatment fragments...") tp = options.parser(options.tfile[0]) treat = tp.build_petrack() treat.sort() if len(options.tfile) > 1: # multiple input for tfile in options.tfile[1:]: tp = options.parser(tfile) treat = tp.append_petrack( treat ) treat.sort() options.tsize = tp.d if options.cfile: options.info("#1.2 read input fragments...") cp = options.parser(options.cfile[0]) control = cp.build_petrack() control_d = cp.d control.sort() if len(options.cfile) > 1: # multiple input for cfile in options.cfile[1:]: cp = options.parser(cfile) control = cp.append_petrack( control ) control.sort() else: control = None options.info("#1 mean fragment size is determined as %d bp from treatment" % options.tsize) # options.info("#1 fragment size variance is determined as %d bp from treatment" % tp.variance) if control is not None: options.info("#1 note: mean fragment size in control is %d bp -- value ignored" % control_d) return (treat, control) def load_tag_files_options ( options ): """From the options, load treatment tags and control tags (if available). """ options.info("#1 read treatment tags...") tp = options.parser(options.tfile[0]) if not options.tsize: # override tsize if user specified --tsize ttsize = tp.tsize() options.tsize = ttsize treat = tp.build_fwtrack() treat.sort() if len(options.tfile) > 1: # multiple input for tfile in options.tfile[1:]: tp = options.parser(tfile) treat = tp.append_fwtrack( treat ) treat.sort() if options.cfile: options.info("#1.2 read input tags...") control = options.parser(options.cfile[0]).build_fwtrack() control.sort() if len(options.cfile) > 1: # multiple input for cfile in options.cfile[1:]: cp = options.parser(cfile) control = cp.append_fwtrack( control ) control.sort() else: control = None options.info("#1 tag size is determined as %d bps" % options.tsize) return (treat, control)