\documentclass{article} \title{Some Island-based Summaries of the ChIP-Seq Data} \usepackage[text={178mm,230mm},centering]{geometry} \usepackage{Sweave} \SweaveOpts{keep.source=TRUE,eps=FALSE,pdf=TRUE,width=9,height=10,prefix.string=figs/figs-islands} \setkeys{Gin}{width=0.98\textwidth} \begin{document} \maketitle \raggedright <>= library(chipseq) library(lattice) library(hexbin) library(latticeExtra) load("myodFibro.rda") load("myodMyo.rda") load("pairedReads.rda") set.seed(20081008) @ <>= logscale.components <- function(axis = c("x", "y"), base = 2) { axis <- match.arg(axis) switch(axis, x = function(...) { ans <- xscale.components.default(...) ans$bottom$labels$labels <- base^(ans$bottom$labels$at) ans }, y = function(...) { ans <- yscale.components.default(...) ans$left$labels$labels <- base^(ans$left$labels$at) ans }) } ## common infrastructure to summarize reads in the form of nested lists: ## reads.list=list("1" = list("chr1" = list("+"=..., "-"=...), ## "chr2"=...), ## "2" = list(...)) summarizeLane <- function(clist, summary.fun, ...) { ## clist is a list at the lane level, with one list("+"=, "-"=) for each chromsome ans <- do.call(lattice::make.groups, lapply(clist, summary.fun, ...)) names(ans)[names(ans) == "which"] <- "chromosome" ## cbind(chr = factor(colnames(ans), levels = colnames(ans)), as.data.frame(t(ans))) ans } summarizeReads <- function(reads.list, lanes = c("1", "2", "3", "4", "6", "7", "8"), ..., verbose = FALSE) { if (verbose) cat(paste("Processing lanes", paste(lanes, collapse = ",")), fill = TRUE) ans <- do.call(lattice::make.groups, lapply(reads.list[lanes], summarizeLane, ...)) names(ans)[names(ans) == "which"] <- "lane" ans } ## different summary.fun can give different useful summaries countSummary <- function(x) { ## x is a list at the lane->chromosome level, with components "+" and "-" npos <- length(x$"+") nneg <- length(x$"-") data.frame(n = npos + nneg, d = npos - nneg, r = npos / nneg) } ## get islands. But this needs more care; ## coverage by chromosome, possibly different species. ## We'll avoid this by just using the furthest hit ## library("BSgenome.Mmusculus.UCSC.mm9") ## library("BSgenome.Hsapiens.UCSC.hg18") ## mouse.seqlens <- seqlengths(Mmusculus) ## human.seqlens <- seqlengths(Hsapiens) sliceSummary <- function(x, lower = 1, viewSummary = list(sums = viewSums, maxs = viewMaxs)) ## x is a list at the lane->chromosome level, with components "+" and "-" { g <- extendReads(x) cov <- coverage(g, width = max(end(g) + 400L)) s <- slice(cov, lower = lower) ans <- data.frame(start = start(s), end = end(s)) if (is.list(viewSummary)) { for (nm in names(viewSummary)) ans[[nm]] <- viewSummary[[nm]](s) } else ans[["summary"]] <- viewSummary(s) ans } @ \section*{Distribution of reads} We summarize the reads in terms of number of reads by chromosome and lane, also including the difference and ratio between matches in the positive and negative strands. <<>>= readCountSummary <- make.groups(myodFibro = summarizeReads(myodFibro, summary.fun = countSummary), myodMyo = summarizeReads(myodMyo, summary.fun = countSummary), pairedReads = summarizeReads(pairedReads, summary.fun = countSummary)) str(readCountSummary) ## dotplot(chromosome ~ log2(n) | lane, readCountSummary, groups = which, ## par.settings = simpleTheme(pch = 16), type = "o", ## auto.key = TRUE) @ The number of reads is as we would expect. The number of paired-end reads is comparable to the second (myoblasts and myotubes) run. <<>>= dotplot(xtabs(n ~ lane + which, readCountSummary), type = "o", ylab = "Lane", auto.key = list(space = "right"), par.settings = simpleTheme(pch = 16)) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage A more detailed look. <<>>= useOuterStrips(dotplot(chromosome ~ (n) | which + lane, readCountSummary, as.table = TRUE)) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage The difference in the number of reads between strands is interesting. <<>>= dotplot(chromosome ~ d | lane + which, readCountSummary, xlab = "Positive strand matches - negative strand matches", panel = function(...) { panel.abline(v = 0, col = "grey", lwd = 3) panel.dotplot(...) }) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage \section*{Island comparisons} <<>>= ## this takes a while to run, so will cache it (20,428,526 rows) if (file.exists("readSliceSummary.rda")) load("readSliceSummary.rda") else { readSliceSummary <- make.groups(myodFibro = summarizeReads(myodFibro, summary.fun = sliceSummary), myodMyo = summarizeReads(myodMyo, summary.fun = sliceSummary), pairedReads = summarizeReads(pairedReads, summary.fun = sliceSummary)) save(readSliceSummary, file = "readSliceSummary.rda") } str(readSliceSummary) @ \newpage A summary of islands by lanes. <<>>= readSliceSummary.10 <- subset(readSliceSummary, maxs >= 10) useOuterStrips(hexbinplot(log2(sums) ~ log2(maxs) | which + lane, readSliceSummary.10, aspect = 0.7, main = "islands with depth >= 10", type = "g", trans = sqrt, inv = function(x) x^2)) ## xyplot(log2(sums) ~ log2(maxs) | lane + which, readSliceSummary.10, ## panel = panel.smoothScatter) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage <<>>= useOuterStrips(hexbinplot(log2(end - start) ~ log2(maxs) | which + lane, readSliceSummary.10, aspect = 0.7, main = "islands with depth >= 10", type = "g", trans = sqrt, inv = function(x) x^2)) ## xyplot(log2(sums) ~ log2(maxs) | lane + which, readSliceSummary.10, ## panel = panel.smoothScatter) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage Distribution of reads in islands with depth 10 or more. <<>>= ntabdf <- as.data.frame(xtabs(~round(sums/200L) + lane + which, readSliceSummary.10)) ntabdf$nreads <- as.numeric(as.character(ntabdf[[1]])) useOuterStrips(xyplot(log2(Freq) ~ log2(nreads) | which + lane, ntabdf, type = c("p", "g"), pch = 20, xscale.components = logscale.components("x", 2), yscale.components = logscale.components("y", 2), scales = list(x = list(rot = 90)))) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage \section*{Comparison between myotubes and myoblasts} <<>>= library("BSgenome.Mmusculus.UCSC.mm9") mouse.chromlens <- seqlengths(Mmusculus) seqRanges <- lapply(myodMyo, extendReads) cblasts <- combineLanes(seqRanges[c("1","3","6")]) ctubes <- combineLanes(seqRanges[c("2","4","7")]) if (TRUE) # subsample { ss <- laneSubsample(cblasts, ctubes) cblasts <- ss$lane1 ctubes <- ss$lane2 } covblasts <- laneCoverage(cblasts, mouse.chromlens) covtubes <- laneCoverage(ctubes, mouse.chromlens) system.time( islandSummaries <- list(blasts = do.call(make.groups, islandSummary(islands(covblasts))), tubes = do.call(make.groups, islandSummary(islands(covtubes)))) ) islandSummaries <- lapply(islandSummaries, function(x) { names(x)[names(x) == "which"] <- "chromosome" x }) system.time( islandSummaries <- with(islandSummaries, make.groups(blasts, tubes)) ) depth.dist <- xtabs(~cut(maxdepth, c(0, 10, 15, 30, Inf)) + which, islandSummaries) names(dimnames(depth.dist))[1] <- "maxdepth" depth.dist islandSummaries.10 <- subset(islandSummaries, maxdepth >= 10) @ \newpage <<>>= hexbinplot(log2(clones) ~ log2(maxdepth) | which, islandSummaries.10, subset = maxdepth < 3000, aspect = 0.7, main = "islands with depth >= 10", type = c("g", "r"), xscale.components = logscale.components("x", 2), yscale.components = logscale.components("y", 2), ## scales = list(x = list(rot = 90)), trans = sqrt, inv = function(x) x^2) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage <<>>= hexbinplot(log2(end-start) ~ log2(maxdepth) | which, islandSummaries.10, subset = maxdepth < 3000, aspect = 0.7, main = "islands with depth >= 10", type = c("g", "r"), xscale.components = logscale.components("x", 2), yscale.components = logscale.components("y", 2), ## scales = list(x = list(rot = 90)), trans = sqrt, inv = function(x) x^2) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage <<>>= myo.ntabdf <- as.data.frame(xtabs(~round(clones) + which, islandSummaries.10)) myo.ntabdf$nreads <- as.numeric(as.character(myo.ntabdf[[1]])) xyplot(log2(Freq) ~ log2(nreads), myo.ntabdf, type = c("p", "g"), par.settings = simpleTheme(pch = 20), groups = which, auto.key = TRUE, xscale.components = logscale.components("x", 2), yscale.components = logscale.components("y", 2), scales = list(x = list(rot = 90))) ## xyplot(log2(Freq) ~ log2(nreads) | which, ntabdf, ## type = c("p", "g"), pch = 20, ## xscale.components = logscale.components("x", 2), ## yscale.components = logscale.components("y", 2), ## scales = list(x = list(rot = 90))) ## ## all = combineLanes(list(cblasts, ctubes)) ## blastIslands = islands(covblasts) ## blastIcts = readsPerIsland(blastIslands) ## blastSummaries = islandSummary(blastIslands) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \end{document}