\documentclass{article} \title{Singleton Islands in the ChIP-Seq Data} \usepackage[text={178mm,230mm},centering]{geometry} \usepackage{Sweave} \SweaveOpts{keep.source=TRUE,eps=FALSE,pdf=TRUE,width=9,height=10,prefix.string=figs/figs-singletons} \setkeys{Gin}{width=0.98\textwidth} \begin{document} \maketitle \raggedright <>= library(chipseq) library(lattice) library(simplehmm) ## library(hexbin) ## library(latticeExtra) library("BSgenome.Mmusculus.UCSC.mm9") mouse.seqlens <- seqlengths(Mmusculus) ## to get these, run ReadAndSaveData.R (on a machine which has the raw data) load("myodFibro.rda") load("myodMyo.rda") load("pairedReads.rda") load("simulatedReadsSampled.rda") # loads variable 'simulatedReads' set.seed(20081008) @ <>= count.hmm <- function(ref.locs, obs.locs, target.hits = 30, initial.states = c(20, 40), niter = 100) { chromosomes <- names(obs.locs) ref.hits <- sapply(ref.locs, length) obs.hits <- sapply(obs.locs, length) rel <- obs.hits / ref.hits getIntervals <- function(reflocs, obslocs, target = 15, prop = length(obslocs) / length(reflocs)) { nbins <- ceiling(length(reflocs) * prop / target) quantile(reflocs, prob = ppoints(nbins, a = 1), names = FALSE) } bin.list <- sapply(chromosomes, function(chr, ...) getIntervals(ref.locs[[chr]], obs.locs[[chr]], ...), target = target.hits, prop = median(rel), simplify = FALSE) ## str(bin.list) datahits.list <- sapply(chromosomes, function(chr) as.numeric(table(cut(x = obs.locs[[chr]], breaks = bin.list[[chr]], labels = NULL))), simplify = FALSE) ## str(datahits.list) fm <- coverageHmm(datahits.list, binlist = bin.list, family = hmm.family("nbinom", mu = 1, states.scale = initial.states, size = 20, states.free = TRUE)) fm <- update(fm, iterations = niter, verbose = FALSE) fm } combineLocs <- function(...) { dots <- list(...) nms <- names(dots[[1]]) sapply(nms, function(nm) { sort(unlist(lapply(dots, "[[", nm), use.names = FALSE)) }, simplify = FALSE) } @ \section*{Singleton islands} <<>>= if (file.exists("singletonLocs.rda")) load("singletonLocs.rda") else { myodMyoLocs <- summarizeReads(myodMyo) myodFibroLocs <- summarizeReads(myodFibro) pairedReadsLocs <- summarizeReads(pairedReads) save(myodFibroLocs, myodMyoLocs, pairedReadsLocs, file = "singletonLocs.rda") } counts <- make.groups(Fibro = count.singletons(myodFibroLocs), Myo = count.singletons(myodMyoLocs), Paired = count.singletons(pairedReadsLocs)) obs.locs <- myodMyoLocs[["8"]] ref.locs <- myodFibroLocs[["8"]] fm <- count.hmm(ref.locs, obs.locs, target.hits = 50, initial.states = c(40, 50, 60), niter = 10) write.hmm(fm, bins = fm$binlist, combine = TRUE, file = "hmmout_myotube.csv") fm <- update(fm, iterations = 70, verbose = FALSE) fm <- update(fm, iterations = 2, verbose = TRUE) summary(fm) @ \newpage Overall counts: <<>>= dotplot(chromosome ~ count/1e3 | lane, data = counts, groups = which, auto.key = list(columns = 2), xlab = "Number of singleton islands (1000)") @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage \section*{Comparison of reference lanes} <<>>= xyplot(fm, decode = "viterbi", xlab = "Location (Mb)", col = c('darkgrey', 'black'), ylim = c(0, 100), scales = list(x = list(tick.number = 20, axs = "r")), main = "Decoded path (Viterbi)", lty = 1, strip.left = TRUE) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage <<>>= barchart(fm, strip.left = TRUE, xlab = "Location (Mb)", scales = list(x = list(tick.number = 20)), color = list(lightgreen = 1, pink = 3)) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage \section*{Comparison between fibroblasts+MyoD and myoblasts} <<>>= ref.locs <- do.call(combineLocs, myodFibroLocs[c("2", "4", "7")]) obs.locs <- do.call(combineLocs, myodMyoLocs[c("1", "3", "6")]) fm <- count.hmm(ref.locs, obs.locs, target.hits = 60, initial.states = c(50, 60, 70), niter = 10) fm <- update(fm, iterations = 70, verbose = FALSE) fm <- update(fm, iterations = 2, verbose = TRUE) summary(fm) @ \newpage <<>>= xyplot(fm, decode = "viterbi", xlab = "Location (Mb)", col = c('darkgrey', 'black'), ylim = c(0, 100), scales = list(x = list(tick.number = 20, axs = "r")), main = "Decoded path (Viterbi)", lty = 1, strip.left = TRUE) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage <<>>= barchart(fm, strip.left = TRUE, xlab = "Location (Mb)", scales = list(x = list(tick.number = 20)), color = list(lightgreen = 1, pink = 3)) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage <<>>= ## FIXME: have simpler interface write.hmm(fm, bins = fm$binlist, combine = TRUE, decoded = "viterbi", file = "hmmout_tmp.csv") fm.decoded <- read.table("hmmout_tmp.csv", sep=",", header = TRUE) fm.decoded.split <- split(fm.decoded, fm.decoded$chr) for (chrom in names(fm.decoded.split)) { print(chrom) v <- with(fm.decoded.split[[chrom]], Views(Mmusculus[[chrom]], start = start, end = end)) fm.decoded.split[[chrom]]$GC <- rowSums(alphabetFrequency(v, baseOnly = TRUE, freq = TRUE)[, c("G", "C")]) } fm.decoded <- do.call(rbind, fm.decoded.split) qqmath(~GC, fm.decoded, groups = factor(decoded), f.value = ppoints(100), auto.key = list(title="HMM state (myoblasts w.r.t. fibro+MyoD)"), type = c("p", "g")) ## densityplot(~GC, fm.decoded, groups = factor(decoded), ## plot.points = FALSE, auto.key = TRUE) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage \section*{Comparison of myotubes and myoblasts} <<>>= ref.locs <- do.call(combineLocs, myodMyoLocs[c("1", "3", "6")]) obs.locs <- do.call(combineLocs, myodMyoLocs[c("2", "4", "7")]) fm <- count.hmm(ref.locs, obs.locs, target.hits = 60, initial.states = c(50, 60, 70), niter = 10) fm <- update(fm, iterations = 70, verbose = FALSE) fm <- update(fm, iterations = 2, verbose = TRUE) summary(fm) @ \newpage <<>>= xyplot(fm, decode = "viterbi", xlab = "Location (Mb)", col = c('darkgrey', 'black'), ylim = c(0, 100), scales = list(x = list(tick.number = 20, axs = "r")), main = "Decoded path (Viterbi)", lty = 1, strip.left = TRUE) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage <<>>= barchart(fm, strip.left = TRUE, xlab = "Location (Mb)", scales = list(x = list(tick.number = 20)), color = list(lightgreen = 1, pink = 3)) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage \section*{Comparison of fibroblasts+MyoD and myotubes in paired-end run} <<>>= ref.locs <- pairedReadsLocs[["1"]] obs.locs <- pairedReadsLocs[["2"]] fm <- count.hmm(ref.locs, obs.locs, target.hits = 40, initial.states = c(30, 50), niter = 10) fm <- update(fm, iterations = 100, verbose = FALSE) fm <- update(fm, iterations = 2, verbose = TRUE) summary(fm) @ \newpage <<>>= xyplot(fm, decode = "viterbi", xlab = "Location (Mb)", col = c('darkgrey', 'black'), ylim = c(0, 100), scales = list(x = list(tick.number = 20, axs = "r")), main = "Decoded path (Viterbi)", lty = 1, strip.left = TRUE) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage <<>>= barchart(fm, strip.left = TRUE, xlab = "Location (Mb)", scales = list(x = list(tick.number = 20)), color = list(lightgreen = 1, pink = 2)) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage We can use intersection between singleton sets to estimate the effective genome size. <<>>= effective.lengths <- data.frame(official = mouse.seqlens[paste("chr", 1:19, sep = "")], estimated.1 = effective.glength.byChr(myodMyoLocs$"1", myodMyoLocs$"2"), estimated.2 = effective.glength.byChr(myodMyoLocs$"3", myodMyoLocs$"4"), estimated.3 = effective.glength.byChr(myodMyoLocs$"6", myodMyoLocs$"7")) effective.lengths <- within(effective.lengths, { ratio <- (estimated.1 + estimated.2 + estimated.3) / (3 * official) }) effective.lengths @ <<>>= effective.lengths <- within(effective.lengths, { ratio.1 <- estimated.1 / official ratio.2 <- estimated.2 / official ratio.3 <- estimated.3 / official }) effective.lengths$chr <- reorder(factor(rownames(effective.lengths)), with(effective.lengths, ratio.1 + ratio.2 + ratio.3)) dotplot(chr ~ ratio.1 + ratio.2 + ratio.3, effective.lengths) @ \newpage \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage A more systematic comparison of different estimates may expose the extent to which singleton islands are non-random. <<>>= official <- mouse.seqlens[paste("chr", 1:19, sep = "")] effective.ratios <- list(myo.1.3 = effective.glength.byChr(myodMyoLocs$"1", myodMyoLocs$"3") / official, myo.1.6 = effective.glength.byChr(myodMyoLocs$"1", myodMyoLocs$"6") / official, myo.1.2 = effective.glength.byChr(myodMyoLocs$"1", myodMyoLocs$"2") / official, myo.1.4 = effective.glength.byChr(myodMyoLocs$"1", myodMyoLocs$"4") / official, myo.1.paired.2 = effective.glength.byChr(myodMyoLocs$"1", pairedReadsLocs$"2") / official, myo.1.fibro.1 = effective.glength.byChr(myodMyoLocs$"1", myodFibroLocs$"1") / official, myo.1.fibro.3 = effective.glength.byChr(myodMyoLocs$"1", myodFibroLocs$"3") / official, myo.1.fibro.2 = effective.glength.byChr(myodMyoLocs$"1", myodFibroLocs$"2") / official, myo.1.fibro.4 = effective.glength.byChr(myodMyoLocs$"1", myodFibroLocs$"4") / official, myo.1.paired.1 = effective.glength.byChr(myodMyoLocs$"1", pairedReadsLocs$"1") / official) effective.ratios.df <- do.call(make.groups, effective.ratios) effective.ratios.df$chr <- factor(names(official)) effective.ratios.df <- within(effective.ratios.df, { chr <- reorder(chr, data, median) }) ## dotplot(chr ~ data, effective.ratios.df, groups = which, ## par.settings = simpleTheme(pch = 16, ## col = rep(trellis.par.get("superpose.line")$col[1:6], c(2, 2, 1, 2, 2, 1))), ## type = c("p", "a"), ## auto.key = list(columns = 2, lines = TRUE, points = FALSE, type = "o")) dotplot(chr ~ data | !(which %in% c("myo.1.3", "myo.1.6", "myo.1.2", "myo.1.4", "myo.1.paired.2")), effective.ratios.df, groups = which, layout = c(2, 1), strip = FALSE, par.settings = simpleTheme(pch = 16, col = rep(trellis.par.get("superpose.line")$col[1:6], c(2, 2, 1, 2, 2, 1))), type = c("p", "a"), auto.key = list(columns = 2, lines = TRUE, points = FALSE, type = "o")) @ \newpage \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage <<>>= effective.ratios <- list(myo.2.1 = effective.glength.byChr(myodMyoLocs$"2", myodMyoLocs$"1") / official, myo.2.3 = effective.glength.byChr(myodMyoLocs$"2", myodMyoLocs$"3") / official, myo.2.4 = effective.glength.byChr(myodMyoLocs$"2", myodMyoLocs$"4") / official, myo.2.7 = effective.glength.byChr(myodMyoLocs$"2", myodMyoLocs$"7") / official, myo.2.paired.2 = effective.glength.byChr(myodMyoLocs$"2", pairedReadsLocs$"2") / official, myo.2.fibro.1 = effective.glength.byChr(myodMyoLocs$"2", myodFibroLocs$"1") / official, myo.2.fibro.3 = effective.glength.byChr(myodMyoLocs$"2", myodFibroLocs$"3") / official, myo.2.fibro.2 = effective.glength.byChr(myodMyoLocs$"2", myodFibroLocs$"2") / official, myo.2.fibro.4 = effective.glength.byChr(myodMyoLocs$"2", myodFibroLocs$"4") / official, myo.2.paired.1 = effective.glength.byChr(myodMyoLocs$"2", pairedReadsLocs$"1") / official) effective.ratios.df <- do.call(make.groups, effective.ratios) effective.ratios.df$chr <- factor(names(official)) effective.ratios.df <- within(effective.ratios.df, { chr <- reorder(chr, data, median) }) ## dotplot(chr ~ data, effective.ratios.df, groups = which, ## par.settings = simpleTheme(pch = 16, ## col = rep(trellis.par.get("superpose.line")$col[1:6], c(2, 2, 1, 2, 2, 1))), ## type = c("p", "a"), ## auto.key = list(columns = 2, lines = TRUE, points = FALSE, type = "o")) dotplot(chr ~ data | !(which %in% c("myo.2.1", "myo.2.3", "myo.2.4", "myo.2.7", "myo.2.paired.2")), effective.ratios.df, groups = which, layout = c(2, 1), strip = FALSE, par.settings = simpleTheme(pch = 16, col = rep(trellis.par.get("superpose.line")$col[1:6], c(2, 2, 1, 2, 2, 1))), type = c("p", "a"), auto.key = list(columns = 2, lines = TRUE, points = FALSE, type = "o")) @ \newpage \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage \section*{Comparison between myoblasts and simulation} <<>>= comparison <- "Myoblasts vs simulation" ref.locs <- combineLocs(simulatedReads[1:19]) obs.locs <- do.call(combineLocs, myodMyoLocs[c("1", "3", "6")]) fm <- count.hmm(ref.locs, obs.locs, target.hits = 60, initial.states = c(50, 60, 70), niter = 10) fm <- update(fm, iterations = 70, verbose = FALSE) fm <- update(fm, iterations = 2, verbose = TRUE) summary(fm) @ \newpage <<>>= xyplot(fm, decode = "viterbi", xlab = "Location (Mb)", sub = comparison, col = c('darkgrey', 'black'), ylim = c(0, 100), scales = list(x = list(tick.number = 20, axs = "r")), main = "Decoded path (Viterbi)", lty = 1, strip.left = TRUE) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage <<>>= barchart(fm, strip.left = TRUE, xlab = "Location (Mb)", sub = comparison, scales = list(x = list(tick.number = 20)), color = list(lightgreen = 1, pink = 3)) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage \section*{Comparison between myotubes and simulation} <<>>= comparison <- "Myotubes vs simulation" ref.locs <- combineLocs(simulatedReads[1:19]) obs.locs <- do.call(combineLocs, myodMyoLocs[c("2", "4", "7")]) fm <- count.hmm(ref.locs, obs.locs, target.hits = 60, initial.states = c(50, 60, 70), niter = 10) fm <- update(fm, iterations = 70, verbose = FALSE) fm <- update(fm, iterations = 2, verbose = TRUE) summary(fm) write.hmm(fm, bins = fm$binlist, combine = TRUE, file = "hmmout_myotube.csv") @ \newpage <<>>= xyplot(fm, decode = "viterbi", xlab = "Location (Mb)", sub = comparison, col = c('darkgrey', 'black'), ylim = c(0, 100), scales = list(x = list(tick.number = 20, axs = "r")), main = "Decoded path (Viterbi)", lty = 1, strip.left = TRUE) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage <<>>= barchart(fm, strip.left = TRUE, xlab = "Location (Mb)", sub = comparison, scales = list(x = list(tick.number = 20)), color = list(lightgreen = 1, pink = 3)) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage \section*{Comparison between fibrobalsts+MyoD and simulation} <<>>= comparison <- "fibroblasts+MyoD vs simulation" ref.locs <- combineLocs(simulatedReads[1:19]) obs.locs <- do.call(combineLocs, myodFibroLocs[c("2", "4", "7")]) fm <- count.hmm(ref.locs, obs.locs, target.hits = 60, initial.states = c(50, 60, 70), niter = 10) fm <- update(fm, iterations = 70, verbose = FALSE) fm <- update(fm, iterations = 2, verbose = TRUE) summary(fm) @ \newpage <<>>= xyplot(fm, decode = "viterbi", xlab = "Location (Mb)", sub = comparison, col = c('darkgrey', 'black'), ylim = c(0, 100), scales = list(x = list(tick.number = 20, axs = "r")), main = "Decoded path (Viterbi)", lty = 1, strip.left = TRUE) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \newpage <<>>= barchart(fm, strip.left = TRUE, xlab = "Location (Mb)", sub = comparison, scales = list(x = list(tick.number = 20)), color = list(lightgreen = 1, pink = 3)) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} <<>>= ## FIXME: have simpler interface write.hmm(fm, bins = fm$binlist, combine = TRUE, decoded = "viterbi", file = "hmmout_fibrosim.csv") fm.decoded <- read.table("hmmout_fibrosim.csv", sep=",", header = TRUE) fm.decoded.split <- split(fm.decoded, fm.decoded$chr) library(BSgenome.Mmusculus.UCSC.mm9) for (chrom in names(fm.decoded.split)) { print(chrom) v <- with(fm.decoded.split[[chrom]], Views(Mmusculus[[chrom]], start = start, end = end)) fm.decoded.split[[chrom]]$GC <- rowSums(alphabetFrequency(v, baseOnly = TRUE, freq = TRUE)[, c("G", "C")]) } fm.decoded <- do.call(rbind, fm.decoded.split) qqmath(~GC, fm.decoded, groups = factor(decoded), f.value = ppoints(100), auto.key = list(title="HMM state (fibro+MyoD rel. to simulation)"), type = c("p", "g")) ## densityplot(~GC, fm.decoded, groups = factor(decoded), ## plot.points = FALSE, auto.key = TRUE) @ \begin{center} <>= plot(trellis.last.object()) @ \end{center} \end{document}