\documentclass{article} \title{Reproducibility of peaks: is one lane enough?} \usepackage[text={178mm,230mm},centering]{geometry} \usepackage{Sweave} \SweaveOpts{keep.source=TRUE,eps=FALSE,pdf=TRUE,width=9,height=11,prefix.string=figs/figs-peakreproducibility} \setkeys{Gin}{width=0.98\textwidth} \begin{document} \maketitle \raggedright <>= library(chipseq) library(lattice) library(latticeExtra) combineLaneReads <- function(laneList, chromList = names(laneList[[1]])) { names(chromList) = chromList ##to get the return value named lapply(chromList, function(chr) { list("+" = unlist(lapply(laneList, function(x) x[[chr]][["+"]]), use.names = FALSE), "-" = unlist(lapply(laneList, function(x) x[[chr]][["-"]]), use.names = FALSE)) }) } load("myodMyo.rda") load("myodFibro.rda") load("pairedReads.rda") set.seed(20081008) findPeaks <- function(x, g = extendReads(x), lower = 8, give.islands = TRUE, df = TRUE) { if (give.islands) ## return the whole island when a peak is found { s <- slice(coverage(g, width = max(end(g))), lower = 1) s <- s[viewMaxs(s) >= lower] } else s <- slice(coverage(g, width = max(end(g))), lower = lower) if (df) data.frame(start = start(s), end = end(s)) else s } findIntersections <- function(peaks) { ## peaks = data.frame(start, end, lane). peaks$lane <- as.factor(peaks$lane) peaks <- peaks[order(peaks$start), ] allpeaks <- with(peaks, IRanges(start, end)) sapply(levels(peaks$lane), function(x) { subpeaks <- with(subset(peaks, lane == x), IRanges(start, end)) findOverlaps(allpeaks, subpeaks, multiple = FALSE) }, simplify = TRUE) } ## foo <- findIntersections(myodMyoPeaks1) ## levelplot(head(is.na(foo[, c(1, 3, 5, 2, 4, 6, 7)]), 1000), aspect = "fill") captureRecaptureChr <- function(peaks, chr = "chr1") { peaks <- subset(peaks, chromosome == chr, select = -chromosome) ## peaks = data.frame(start, end, lane). peaks$lane <- as.factor(peaks$lane) peaks <- peaks[order(peaks$start), ] ## For each pair of lanes, we want ## c(number of peaks in 1, ## number of peaks in 2, ## number of 2 peaks overlapping a 1 peak, ## number of 1 peaks overlapping a 2 peak) ## all this is available if we compute 'number of j peaks overlapping a i peak' for all (i, j) doPair <- function(u, v) { peaks1 <- with(subset(peaks, lane == u), IRanges(start, end)) peaks2 <- with(subset(peaks, lane == v), IRanges(start, end)) if (length(peaks1) && length(peaks2)) sum(!is.na(findOverlaps(peaks2, peaks1, multiple = FALSE))) else 0 } lanes <- levels(peaks$lane) nlanes <- length(lanes) est <- common <- matrix(NA, nrow = nlanes, ncol = nlanes, dimnames = list(lanes, lanes)) for (i in seq_len(nlanes)) for (j in seq_len(nlanes)) common[i, j] <- doPair(lanes[i], lanes[j]) common } captureRecapture <- function(peaks, combine = TRUE, verbose = interactive()) { peaks$lane <- as.factor(peaks$lane) chroms <- unique(as.character(peaks$chromosome)) ans <- if (combine) with(peaks, matrix(0, nrow = nlevels(lane), ncol = nlevels(lane), dimnames = list(levels(lane), levels(lane)))) else with(peaks, array(0, dim = c(nlevels(lane), nlevels(lane), length(chroms)), dimnames = list(levels(lane), levels(lane), chroms))) for (chr in chroms) { if (verbose) message("Processing ", chr) if (combine) ans <- ans + captureRecaptureChr(peaks, chr) else ans[,,chr] <- captureRecaptureChr(peaks, chr) } ans } divideSample <- function(x, prop = 0.5) { ## take a single 'lane' of data, divide (per chromosome/strand) ## into 2 groups by proportion p ans <- list(lane1 = x, lane2 = x) for (chr in names(x)) for (strand in names(x[[chr]])) { n <- length(x[[chr]][[strand]]) if (n == 0) next m <- ceiling(prop * n) # m is at least 1... id <- sample(n, size = m) # so indexing by -id is ok. ans$lane1[[chr]][[strand]] <- x[[chr]][[strand]][id] ans$lane2[[chr]][[strand]] <- x[[chr]][[strand]][-id] } ans } propsWithError <- function(x, n) { p <- x / n psd <- sqrt(p * (1-p) / n) data.frame(chr = reorder(names(p), p), prop = p, lower = p - 3 * psd, upper = p + 3 * psd) } @ \section*{Capture-recapture estimates for one chromosome} <<>>= myodMyoPeaks <- summarizeReads(myodMyo, summary.fun = findPeaks, lower = 8) common <- captureRecaptureChr(myodMyoPeaks, "chr1") common <- common[c(1, 3, 5, 2, 4, 6, 7), c(1, 3, 5, 2, 4, 6, 7)] common ## estimated true number of peaks based on lane pair est.npeaks <- function(x, i = TRUE) outer(diag(x)[i], diag(x)[i], "*") / x[i, i] round(est.npeaks(common)) @ \section*{Capture-recapture estimates after combining chromosomes} <<>>= myodMyoCommon <- captureRecapture(myodMyoPeaks) round(est.npeaks(myodMyoCommon, c("1", "3", "6"))) round(est.npeaks(myodMyoCommon, c("2", "4", "7"))) @ With a higher cutoff: <<>>= myodMyoPeaks <- summarizeReads(myodMyo, summary.fun = findPeaks, lower = 12) myodMyoCommon <- captureRecapture(myodMyoPeaks[-7]) round(est.npeaks(myodMyoCommon, c("1", "3", "6"))) round(est.npeaks(myodMyoCommon, c("2", "4", "7"))) @ Reported number of peaks from combining lanes: <<>>= combinedMyo <- list(cblasts = combineLaneReads(myodMyo[c("1","3","6")]), ctubes = combineLaneReads(myodMyo[c("2","4","7")])) combinedMyoPeaks <- summarizeReads(combinedMyo, summary.fun = findPeaks, lower = 8) combinedMyoCommon <- captureRecapture(combinedMyoPeaks) round(est.npeaks(combinedMyoCommon)) @ \section*{Fibroblast estimates combining chromosomes} <<>>= myodFibroPeaks <- summarizeReads(myodFibro, summary.fun = findPeaks, lower = 8) myodFibroCommon <- captureRecapture(myodFibroPeaks) round(est.npeaks(myodFibroCommon, c("1", "3", "6"))) round(est.npeaks(myodFibroCommon, c("2", "4", "7"))) @ Combining lanes in fibroblast run: <<>>= combinedFibro <- list(fibro = combineLaneReads(myodFibro[c("1","3","6")]), fibroMyoD = combineLaneReads(myodFibro[c("2","4","7")])) combinedFibroPeaks <- summarizeReads(combinedFibro, summary.fun = findPeaks, lower = 12) combinedFibroCommon <- captureRecapture(combinedFibroPeaks) round(est.npeaks(combinedFibroCommon)) @ \section*{Comparing with paired end run} This at least compares similar samples (combining 3 antibodies, either virtually or actually). <<>>= combinedRuns <- list(fibroMyoD1 = combinedFibro$fibroMyoD, fibroMyoD3 = pairedReads$"1", myotubes1 = combinedMyo$ctubes, myotubes3 = pairedReads$"2") combinedRunsPeaks <- summarizeReads(combinedRuns, summary.fun = findPeaks, lower = 8) combinedRunsCommon <- captureRecapture(combinedRunsPeaks) combinedRunsCommon round(est.npeaks(combinedRunsCommon, 1:2)) round(est.npeaks(combinedRunsCommon, 3:4)) @ \section*{Estimated proportion by chromosome} <<>>= combinedRunsCommon <- captureRecapture(combinedRunsPeaks, combine = FALSE) rbind(combinedRunsCommon[1, 1, ], combinedRunsCommon[2, 2, ]) rbind(combinedRunsCommon[3, 3, ], combinedRunsCommon[4, 4, ]) ## paired-end peaks / combined peaks by chrom, fibro+MyoD combinedRunsCommon[1, 2, ] / combinedRunsCommon[1, 1, ] ## paired-end peaks / combined peaks by chrom, myotubes combinedRunsCommon[3, 4, ] / combinedRunsCommon[3, 3, ] ## other way round combinedRunsCommon[2, 1, ] / combinedRunsCommon[2, 2, ] combinedRunsCommon[4, 3, ] / combinedRunsCommon[4, 4, ] @ \newpage <>= plot(segplot(chr ~ lower + upper, data = propsWithError(combinedRunsCommon[1, 2, ], combinedRunsCommon[1, 1, ]), center = prop, draw.bands = FALSE)) @ \newpage <>= plot(segplot(chr ~ lower + upper, data = propsWithError(combinedRunsCommon[3, 4, ], combinedRunsCommon[3, 3, ]), center = prop, draw.bands = FALSE)) @ \newpage \section*{Combined lanes: subsampling} <<>>= npeaks.sub <- function(prop) { round(est.npeaks(captureRecapture(summarizeReads(divideSample(combinedMyo$ctubes, prop = prop), summary.fun = findPeaks, lower = 8)))) } npeaks.sub(0.25) npeaks.sub(0.25) npeaks.sub(0.33) npeaks.sub(0.33) npeaks.sub(0.5) npeaks.sub(0.5) @ \newpage \section*{Error rates using combined lanes as gold standard} Assuming that the combined lanes give all the true peaks (for some cutoff), we can try to vary the cutoff in the paired-end sample to get estimates of error rates. <<>>= ctube.peaks.8 <- lapply(combinedMyo$ctubes, findPeaks, lower = 8, df = FALSE) ctube.peaks.10 <- lapply(combinedMyo$ctubes, findPeaks, lower = 10, df = FALSE) commonPeaks <- function(lower = 10, x = pairedReads$"2", ref.peaks) { x.peaks <- lapply(x, findPeaks, lower = lower, df = FALSE) ## print(sapply(x.peaks, length)) count.overlap <- function(chr) { if (length(ref.peaks[[chr]]) && length(x.peaks[[chr]])) { ov.fp <- findOverlaps(x.peaks[[chr]], ref.peaks[[chr]], multiple = FALSE) ov.fn <- findOverlaps(ref.peaks[[chr]], x.peaks[[chr]], multiple = FALSE) } else ov.fp <- ov.fn <- integer(0) c(FP = sum(is.na(ov.fp)), total = length(x.peaks[[chr]]), FN = sum(is.na(ov.fn)), total.ref = length(ref.peaks[[chr]])) } ans <- as.data.frame(t(sapply(names(ref.peaks), count.overlap, simplify = TRUE))) ans <- within(ans, { FDR <- FP / total SD.FDR <- sqrt(FDR * (1 - FDR) / total) FNR <- FN / total.ref SD.FNR <- sqrt(FNR * (1 - FNR) / total.ref) }) ans$chr <- I(rownames(ans)) ans } ER.df.8 <- do.call(make.groups, lapply({ x <- 3:12; names(x) <- as.character(x); x }, function(i) { if (interactive()) message("Processing cutoff ", i) commonPeaks(i, ref.peaks = ctube.peaks.8) })) ER.df.10 <- do.call(make.groups, lapply({ x <- 3:12; names(x) <- as.character(x); x }, function(i) { if (interactive()) message("Processing cutoff ", i) commonPeaks(i, ref.peaks = ctube.peaks.10) })) @ \newpage <>= plot(segplot(jitter(as.numeric(as.character(which))) ~ I(FDR-3*SD.FDR) + I(FDR+3*SD.FDR), data = ER.df.8, horizontal = FALSE, ## center = FDR, draw.bands = FALSE, main = "Myotubes False discovery rate, using combined depth >= 8 as gold standard", panel = function(...) { panel.grid(h = -12, v = 0, col = "darkgrey") panel.segplot(...) })) @ \newpage <>= plot(segplot(jitter(as.numeric(as.character(which))) ~ I(FDR-3*SD.FDR) + I(FDR+3*SD.FDR), data = ER.df.10, horizontal = FALSE, ## center = FDR, draw.bands = FALSE, main = "Myotubes False discovery rate, using combined depth >= 10 as gold standard", panel = function(...) { panel.grid(h = -12, v = 0, col = "darkgrey") panel.segplot(...) })) @ \newpage <>= plot(segplot(jitter(as.numeric(as.character(which))) ~ I(FNR-3*SD.FNR) + I(FNR+3*SD.FNR), data = ER.df.8, horizontal = FALSE, ## center = FNR, draw.bands = FALSE, main = "Myotubes False negative rate, using combined depth >= 8 as gold standard", panel = function(...) { panel.grid(h = -12, v = 0, col = "darkgrey") panel.segplot(...) })) @ \newpage <>= plot(segplot(jitter(as.numeric(as.character(which))) ~ I(FNR-3*SD.FNR) + I(FNR+3*SD.FNR), data = ER.df.10, horizontal = FALSE, ## center = FNR, draw.bands = FALSE, main = "False negative rate, using combined depth >= 10 as gold standard", panel = function(...) { panel.grid(h = -12, v = 0, col = "darkgrey") panel.segplot(...) })) @ \newpage <>= plot(xyplot(1-FNR ~ FDR, data = ER.df.10, type = c("p", "g"))) @ \newpage \section*{A regression-like approach} Consider all peaks in combined lane. Compute maximum depth over the same regions in paired-end sample. <<>>= depthUnderPeaks <- function(x = pairedReads$"2", ref.peaks) { get.depth <- function(chr) { ref <- ref.peaks[[chr]] g <- extendReads(x[[chr]]) cov <- coverage(g, width = max(end(g), end(ref))) vcopy <- Views(cov, ref) data.frame(ref = viewMaxs(ref), obs = viewMaxs(vcopy)) } do.call(make.groups, sapply(names(ref.peaks), get.depth, simplify = FALSE)) } depth.paired <- depthUnderPeaks(pairedReads$"2", ref.peaks = ctube.peaks.8) xtabs(~ (ref >= 8) + (obs >= 8), depth.paired) xtabs(~ (ref >= 10) + (obs >= 8), depth.paired) with(depth.paired, xyplot(jitter(obs) ~ jitter(ref), pch = ".", cex = 2, subset = (obs < quantile(obs, 0.99) & ref < quantile(ref, 0.99)), panel = function(...) { if (panel.number() == 1) panel.smoothScatter(...) panel.abline(h = c(8, 10), v = c(8, 10), col = "darkgrey") if (panel.number() == 2) panel.xyplot(...) }, aspect = "iso")[c(1, 1)]) ## trans.scales <- ## function(x, trans = sqrt, n = 5, ## at = pretty(range(x), n = n), ## ...) ## { ## list(at = trans(at), labels = as.character(at)) ## } ## ticks <- c(0, 8, 10, 20, 50, 100, 200, 500) ## with(depth.paired, ## xyplot(sqrt(jitter(obs)) ~ sqrt(jitter(ref)), pch = ".", cex = 3, ## panel = function(...) { ## panel.abline(h = asinh(c(8, 10)), ## v = asinh(c(8, 10)), ## col = "darkgrey") ## panel.xyplot(...) ## }, ## aspect = "iso", ## default.scales = list(x = trans.scales(ref, at = ticks), ## y = trans.scales(obs, at = ticks)))) @ \newpage <>= plot(trellis.last.object()) @ \section*{Error rates combining fibroblasts+MyoD lanes} We can do a similar analysis using the fibroblasts+MyoD lanes: treating the three lanes from the first run as a gold standard, and using the paired-end sample to get estimates of error rates. <<>>= cfibro.peaks.8 <- lapply(combinedFibro$fibroMyoD, findPeaks, lower = 8, df = FALSE) cfibro.peaks.10 <- lapply(combinedFibro$fibroMyoD, findPeaks, lower = 10, df = FALSE) ER.df.8 <- do.call(make.groups, lapply({ x <- 3:12; names(x) <- as.character(x); x }, function(i) { if (interactive()) message("Processing cutoff ", i) commonPeaks(i, x = pairedReads$"1", ref.peaks = cfibro.peaks.8) })) ER.df.10 <- do.call(make.groups, lapply({ x <- 3:12; names(x) <- as.character(x); x }, function(i) { if (interactive()) message("Processing cutoff ", i) commonPeaks(i, x = pairedReads$"1", ref.peaks = cfibro.peaks.10) })) @ \newpage <>= plot(segplot(jitter(as.numeric(as.character(which))) ~ I(FDR-3*SD.FDR) + I(FDR+3*SD.FDR), data = ER.df.8, horizontal = FALSE, ## center = FDR, draw.bands = FALSE, main = "Fibro+MyoD False discovery rate, using combined depth >= 8 as gold standard", panel = function(...) { panel.grid(h = -12, v = 0, col = "darkgrey") panel.segplot(...) })) @ \newpage <>= plot(segplot(jitter(as.numeric(as.character(which))) ~ I(FDR-3*SD.FDR) + I(FDR+3*SD.FDR), data = ER.df.10, horizontal = FALSE, ## center = FDR, draw.bands = FALSE, main = "Fibro+MyoD False discovery rate, using combined depth >= 10 as gold standard", panel = function(...) { panel.grid(h = -12, v = 0, col = "darkgrey") panel.segplot(...) })) @ \newpage <>= plot(segplot(jitter(as.numeric(as.character(which))) ~ I(FNR-3*SD.FNR) + I(FNR+3*SD.FNR), data = ER.df.8, horizontal = FALSE, ## center = FNR, draw.bands = FALSE, main = "Fibro+MyoD False negative rate, using combined depth >= 8 as gold standard", panel = function(...) { panel.grid(h = -12, v = 0, col = "darkgrey") panel.segplot(...) })) @ \newpage <>= plot(segplot(jitter(as.numeric(as.character(which))) ~ I(FNR-3*SD.FNR) + I(FNR+3*SD.FNR), data = ER.df.10, horizontal = FALSE, ## center = FNR, draw.bands = FALSE, main = "Fibro+MyoD False negative rate, using combined depth >= 10 as gold standard", panel = function(...) { panel.grid(h = -12, v = 0, col = "darkgrey") panel.segplot(...) })) @ \newpage <>= plot(xyplot(1-FNR ~ FDR, data = ER.df.10, type = c("p", "g"))) @ \newpage \section*{A regression-like approach} Consider all peaks in combined lane. Compute maximum depth over the same regions in paired-end sample. <<>>= depth.paired <- depthUnderPeaks(pairedReads$"1", ref.peaks = cfibro.peaks.8) xtabs(~ (ref >= 8) + (obs >= 8), depth.paired) xtabs(~ (ref >= 10) + (obs >= 8), depth.paired) with(depth.paired, xyplot(jitter(obs) ~ jitter(ref), pch = ".", cex = 2, subset = (obs < quantile(obs, 0.99) & ref < quantile(ref, 0.99)), panel = function(...) { if (panel.number() == 1) panel.smoothScatter(...) panel.abline(h = c(8, 10), v = c(8, 10), col = "darkgrey") if (panel.number() == 2) panel.xyplot(...) }, aspect = "iso")[c(1, 1)]) ## trans.scales <- ## function(x, trans = sqrt, n = 5, ## at = pretty(range(x), n = n), ## ...) ## { ## list(at = trans(at), labels = as.character(at)) ## } ## ticks <- c(0, 8, 10, 20, 50, 100, 200, 500) ## with(depth.paired, ## xyplot(sqrt(jitter(obs)) ~ sqrt(jitter(ref)), pch = ".", cex = 3, ## panel = function(...) { ## panel.abline(h = asinh(c(8, 10)), ## v = asinh(c(8, 10)), ## col = "darkgrey") ## panel.xyplot(...) ## }, ## aspect = "iso", ## default.scales = list(x = trans.scales(ref, at = ticks), ## y = trans.scales(obs, at = ticks)))) @ \newpage <>= plot(trellis.last.object()) @ \end{document}