\documentclass{article}
\title{Are the different antibodies largely equivalent?}

\usepackage[text={178mm,230mm},centering]{geometry}
\usepackage{Sweave}

\SweaveOpts{keep.source=TRUE,eps=FALSE,pdf=TRUE,width=9,height=10,prefix.string=figs/figs-islands}
\setkeys{Gin}{width=0.98\textwidth}

\begin{document}

\maketitle

\raggedright

<<setup,echo=FALSE,results=hide>>=

library(lattice)
library(chipseq)
library(chipseqData)
library(BSgenome.Mmusculus.UCSC.mm9)


library(robustbase)
## fcorrelation <- function(x) { covMcd(x, cor = TRUE)$cor }
## OR: fcorrelation <- cor
fcorrelation <- function(x) { cor(x, method = "spearman") }


data(solexa54)
data(myodMyo)
data(myodFibro)

set.seed(20081008)


## promoter information

data(geneMouse)
gregions <- genomic_regions(genes = geneMouse, proximal = 1000)
gregions <- subset(gregions, chrom %in% paste("chr", 1:19, sep = ""))
gregions$chrom <- gregions$chrom[drop = TRUE]

gpromoters <- gregions[c("chrom", "promoter.start", "promoter.end")]
names(gpromoters) <- c("chr", "start", "end")
gpromoters.split <- split(gpromoters, gpromoters$chr)


summarizeData <-
    function(edata, peak.ref, peak.cutoff = 6, ref = peak.ref, ref.cutoff = peak.cutoff,
             include = names(edata))
{
    peaks <-
        gdApply(edata[[peak.ref]],
                function(g, cutoff = peak.cutoff) {
                    print(length(g))
                    IntervalTree(slice(coverage(g, width = max(end(g)) + 100L), lower = cutoff))
                })
    ## accumulate per-peak information
    peakSummary <-
        sapply(names(peaks),
               function(chr) {
                   print(chr)
                   chrpeaks <- peaks[[chr]]
                   in.promoter <- !is.na(findOverlaps(chrpeaks,
                       with(gpromoters.split[[chr]], IRanges(start, end)),
                       multiple = FALSE))
                   countOverlapping <- function(x)
                   {
                       as.numeric(as.table(t(findOverlaps(edata[[x]][[chr]], chrpeaks, multiple = TRUE))))
                   }
                   ans <- data.frame(start = start(chrpeaks),
                                     end = end(chrpeaks),
                                     promoter = ifelse(in.promoter, "In promoter", "Not in promoter"))
                   for (nm in names(edata))
                       ans[[nm]] <- countOverlapping(nm)
                   ans
               }, simplify = FALSE)
    peakSummary.df <- do.call(make.groups, peakSummary)
    rownames(peakSummary.df) <- NULL
    computeRates <- function(cutoff = 5)
    {
        mytab <- function(x) table(factor(x, levels = c(FALSE, TRUE)))
        ## dsub <- peakSummary.df ## [peakSummary.df[[ref]] >= ref.cutoff, ]
        dsub.promoter <- subset(peakSummary.df, promoter == "In promoter")
        dsub <- subset(peakSummary.df, promoter != "In promoter")
        props <- c(sapply(include, function(x) prop.table(mytab(dsub[[x]] >= cutoff))[2]),
                   sapply(include, function(x) prop.table(mytab(dsub.promoter[[x]] >= cutoff))[2]))
        counts <- c(sapply(include, function(x) mytab(dsub[[x]] >= cutoff)[2]),
                    sapply(include, function(x) mytab(dsub.promoter[[x]] >= cutoff)[2]))
        data.frame(cutoff = cutoff, ref = ref, ref.cutoff = ref.cutoff,
                   promoter = rep(c("Not promoter", "Promoter"), each = length(include)),
                   sample = rep(include, 2),
                   proportion = props, counts = counts,
                   stringsAsFactors = FALSE)
    }
    props <- do.call(rbind, lapply(1:15, computeRates))
    list(peakSummary = peakSummary.df, props = props,
         peak.cutoff = peak.cutoff, peak.ref = peak.ref,
         include = include, nreads.ref = sum(unlist(lapply(edata[[peak.ref]], length))))
}


plotSamples <- function(ereads, peak.ref, peak.cutoff = 12, ...)
{
    foo <- 
        summarizeData(ereads, peak.ref = peak.ref, peak.cutoff = peak.cutoff,
                      include = names(ereads))
    with(foo, 
         xyplot(proportion ~ cutoff | promoter, data = props, type = c("g", "o"), 
                groups = factor(sample, levels = include),
                auto.key = list(lines = TRUE, points = FALSE, columns = 3),
                xlab = "cutoff",
                ylab = "Proportion of peaks with number of overlapping reads >= cutoff ",
                main = sprintf("%s peaks, depth >= %g [%d reads, %d peaks]", peak.ref, peak.cutoff, nreads.ref, nrow(peakSummary)), ...))
}


## all.reads <- 
##     GenomeDataList(list(ctubes = combineLaneReads(myodMyo[c("2", "4", "7")]),
##                         cblasts = combineLaneReads(myodMyo[c("1", "3", "6")]),
##                         cfibromyod = combineLaneReads(myodFibro[c("2", "4", "7")])))
                        
## ereads <- gdApply(all.reads,
##                   function(x, seqLen = 200) {
##                       sort(extendReads(x, seqLen = seqLen))
##                   })

## all.reads <- myodFibro[c(1, 3, 5, 2, 4, 6, 7)]
## ereads <- gdApply(all.reads,
##                   function(x, seqLen = 200) {
##                       sort(extendReads(x, seqLen = seqLen))
##                   })

@ 


<<eval=FALSE,echo=FALSE>>=

## seemed interesting, but can't really say what caused a departure from the null 
## (could be quality, could be different sets of peaks)


peakProfile <- function(ereads, chr = "chr1", 
                        chrlens = seqlengths(Mmusculus), 
                        cutoffs = c(12, 20),
                        nrep = 10L,
                        ...)
{
    ## take random subsets of 'combined' of size 'length(ref)',
    ## and compute number of peaks as function of cutoff.
    ## Provides null reference for same thing in 'ref'.
    refs <- lapply(ereads, "[[", chr)
    comb <- sort(do.call(c, refs))
    all.refs.df <- 
        lapply(refs, function(ref.chr) {
            nref <- length(ref.chr)
            ncomb <- length(comb)
            npeaks <-
                replicate(nrep,
                      {
                          sub <- comb[sort(sample.int(ncomb, nref))]
                          cov <- coverage(sub, width = chrlens[chr])
                          data.frame(cutoff = cutoffs,
                                     npeaks = sapply(cutoffs, function(lower) length(slice(cov, lower = lower))),
                                     type = "subsample")
                      }, simplify = FALSE)
            npeaks.ref <- {
                cov <- coverage(ref.chr, width = chrlens[chr])
                data.frame(cutoff = cutoffs,
                           npeaks = sapply(cutoffs, function(lower) length(slice(cov, lower = lower))),
                           type = "observed")
            }
            do.call(rbind, c(npeaks, list(npeaks.ref)))
        })
    stripplot(reorder(which, npeaks) ~ npeaks | factor(sprintf("cutoff = %g", cutoff)), 
              do.call(make.groups, all.refs.df), 
              jitter = TRUE,
              groups = type, pch = c(1, 16),
              xlab = sprintf("Number of peaks (%s)", chr),
              ...)
}

all.reads <- c(myodMyo)
names(all.reads)[1:7] <- c("blast_1", "tube_1", "blast_2", "tube_2", "blast_3", "tube_3", "preimmune")
ereads <- gdApply(all.reads,
                  function(x, seqLen = 200) {
                      sort(extendReads(x, seqLen = seqLen))
                  })
peakProfile(ereads[c(2, 4, 6)], chr = "chr5")


@ 


<<myodmyo,echo=FALSE,results=hide>>=


## all.reads <- myodFibro[c(1, 3, 5, 2, 4, 6, 7)]
## names(all.reads)[1:7] <- 
##     c("fibro_7311", "fibro_6975", "fibro_6196", 
##       "fibromyod_7311", "fibromyod_6975", "fibromyod_6196", "beadonly")
## ereads <- gdApply(all.reads,
##                   function(x, seqLen = 200) {
##                       sort(extendReads(x, seqLen = seqLen))
##                   })

## plot(plotSamples(ereads, peak.ref = "fibromyod_6196", peak.cutoff = 20,
##                  par.settings = simpleTheme(col = c(2, 2, 2, 4, 4, 4, 3), 
##                                             lty = 1:3, lwd = 2)))


all.reads <- myodMyo[c(1, 3, 5, 2, 4, 6, 7)]
names(all.reads)[1:7] <- 
    c("blast_7311", "blast_6975", "blast_6196", 
      "tube_7311", "tube_6975", "tube_6196", "preimmune")
ereads <- gdApply(all.reads,
                  function(x, seqLen = 200) {
                      sort(extendReads(x, seqLen = seqLen))
                  })

@ 


We try to address the question in two ways.  The first one is as
follows:
\begin{itemize}
\item Choose one lane as the reference, and determine peaks (depth of
  20 and 12 used here)
\item For various other lanes, draw the folowing curve:
  \begin{itemize}
  \item For a given cutoff $k$, compute the proportion of peaks which
    have at least $k$ overlapping reads.
  \item Plot the proportion as a function of $k$.
  \end{itemize}
\end{itemize}
The results are shown in pages 2-13.  For example, the first three
plots use the three C2C12 myotube lanes as reference.  The first thing
we see is that for the pre-immune control lane, the proportion drops
to almost 0 for cutoffs beyond 5, which is reassuring.  In the second
plot, using antibody 6975 as reference, we see that the other myotube
lanes (as well as the other myoblast lanes) have fairly high overlap.
This suggests that most peaks in myotube 6975 are also enriched in the
other lanes.  Similarly, the plot for antibody 6196 suggests that most
peaks there are also enriched in antibody 7311, but not in antibody
6975.  The plot for antibody 7311 suggests that some peaks there
missing from both the other antibodies, more so in 6975 than 6196.
Together, these suggest that the set of 6975 peaks is a subset of 6196
peaks, which is a further subset of 7311 peaks.
\vspace{4mm}

Myotubes seem generally more similar to each other than to the
myoblasts.
\vspace{4mm}

The other approach is described later (page 14).


\newpage

\begin{center}
<<fig=TRUE,echo=FALSE,results=hide>>=
plot(plotSamples(ereads, peak.ref = "tube_7311", peak.cutoff = 20,
                 par.settings = simpleTheme(col = c(2, 2, 2, 4, 4, 4, 3), 
                                            lty = 1:3, lwd = 2)))
@ 
\end{center}


\newpage

\begin{center}
<<fig=TRUE,echo=FALSE,results=hide>>=
plot(plotSamples(ereads, peak.ref = "tube_6975", peak.cutoff = 20,
                 par.settings = simpleTheme(col = c(2, 2, 2, 4, 4, 4, 3), 
                                            lty = 1:3, lwd = 2)))
@ 
\end{center}


\newpage

\begin{center}
<<fig=TRUE,echo=FALSE,results=hide>>=
plot(plotSamples(ereads, peak.ref = "tube_6196", peak.cutoff = 20,
                 par.settings = simpleTheme(col = c(2, 2, 2, 4, 4, 4, 3), 
                                            lty = 1:3, lwd = 2)))
@ 
\end{center}


\newpage

\begin{center}
<<fig=TRUE,echo=FALSE,results=hide>>=
plot(plotSamples(ereads, peak.ref = "blast_7311", peak.cutoff = 20,
                 par.settings = simpleTheme(col = c(2, 2, 2, 4, 4, 4, 3), 
                                            lty = 1:3, lwd = 2)))
@ 
\end{center}


\newpage

\begin{center}
<<fig=TRUE,echo=FALSE,results=hide>>=
plot(plotSamples(ereads, peak.ref = "blast_6975", peak.cutoff = 20,
                 par.settings = simpleTheme(col = c(2, 2, 2, 4, 4, 4, 3), 
                                            lty = 1:3, lwd = 2)))
@ 
\end{center}


\newpage

\begin{center}
<<fig=TRUE,echo=FALSE,results=hide>>=
plot(plotSamples(ereads, peak.ref = "blast_6196", peak.cutoff = 20,
                 par.settings = simpleTheme(col = c(2, 2, 2, 4, 4, 4, 3), 
                                            lty = 1:3, lwd = 2)))
@ 
\end{center}


%% now for depth 12

\newpage

\begin{center}
<<fig=TRUE,echo=FALSE,results=hide>>=
plot(plotSamples(ereads, peak.ref = "tube_7311", peak.cutoff = 12,
                 par.settings = simpleTheme(col = c(2, 2, 2, 4, 4, 4, 3), 
                                            lty = 1:3, lwd = 2)))
@ 
\end{center}


\newpage

\begin{center}
<<fig=TRUE,echo=FALSE,results=hide>>=
plot(plotSamples(ereads, peak.ref = "tube_6975", peak.cutoff = 12,
                 par.settings = simpleTheme(col = c(2, 2, 2, 4, 4, 4, 3), 
                                            lty = 1:3, lwd = 2)))
@ 
\end{center}


\newpage

\begin{center}
<<fig=TRUE,echo=FALSE,results=hide>>=
plot(plotSamples(ereads, peak.ref = "tube_6196", peak.cutoff = 12,
                 par.settings = simpleTheme(col = c(2, 2, 2, 4, 4, 4, 3), 
                                            lty = 1:3, lwd = 2)))
@ 
\end{center}


\newpage

\begin{center}
<<fig=TRUE,echo=FALSE,results=hide>>=
plot(plotSamples(ereads, peak.ref = "blast_7311", peak.cutoff = 12,
                 par.settings = simpleTheme(col = c(2, 2, 2, 4, 4, 4, 3), 
                                            lty = 1:3, lwd = 2)))
@ 
\end{center}


\newpage

\begin{center}
<<fig=TRUE,echo=FALSE,results=hide>>=
plot(plotSamples(ereads, peak.ref = "blast_6975", peak.cutoff = 12,
                 par.settings = simpleTheme(col = c(2, 2, 2, 4, 4, 4, 3), 
                                            lty = 1:3, lwd = 2)))
@ 
\end{center}


\newpage

\begin{center}
<<fig=TRUE,echo=FALSE,results=hide>>=
plot(plotSamples(ereads, peak.ref = "blast_6196", peak.cutoff = 12,
                 par.settings = simpleTheme(col = c(2, 2, 2, 4, 4, 4, 3), 
                                            lty = 1:3, lwd = 2)))
@ 
\end{center}


<<echo=FALSE>>=

## Another (simple) idea: look at peaks in combined data, then compute 
## correlation in number of overlapping reads


overlapData <-
    function(edata, peak.cutoff = 6, chromosomes = names(edata[[1]]),
             chrlens = seqlengths(Mmusculus))
{
    pb <- txtProgressBar(1, length(chromosomes), style = 3)
    pbi <<- 1
    comb <- 
        sapply(chromosomes, 
               function(chr) 
           {
               setTxtProgressBar(pb, pbi)
               pbi <<- pbi + 1
               components <- lapply(edata, "[[", chr)
               sort(do.call(c, components))
           }, simplify = FALSE)
    pbi <<- 1
    peaks <-
        sapply(chromosomes, 
               function(chr) 
           {
               setTxtProgressBar(pb, pbi)
               pbi <<- pbi + 1
               IntervalTree(slice(coverage(comb[[chr]], width = chrlens[chr]), lower = peak.cutoff))
           }, simplify = FALSE)
    ## accumulate per-peak information
    pbi <<- 1
    peakSummary <-
        sapply(names(peaks),
               function(chr) {
                   setTxtProgressBar(pb, pbi)
                   pbi <<- pbi + 1
                   ## print(chr)
                   chrpeaks <- peaks[[chr]]
                   in.promoter <- !is.na(findOverlaps(chrpeaks,
                       with(gpromoters.split[[chr]], IRanges(start, end)),
                       multiple = FALSE))
                   countOverlapping <- function(x)
                   {
                       as.numeric(as.table(t(findOverlaps(
                           edata[[x]][[chr]], chrpeaks, multiple = TRUE))))
                   }
                   ans <- data.frame(start = start(chrpeaks),
                                     end = end(chrpeaks),
                                     promoter = ifelse(in.promoter, "In promoter", "Not in promoter")) 
                   for (nm in names(edata))
                       ans[[nm]] <- sqrt(countOverlapping(nm))
                   ans
               }, simplify = FALSE)
    peakSummary.df <- do.call(make.groups, peakSummary)
    rownames(peakSummary.df) <- NULL
    ## compute correlations for subsamples
    fake.edata <- function() 
    {
        x <- edata
        for (chr in chromosomes)
        {
            lens <- cumsum(c(0, unlist(lapply(x, function(u) length(u[[chr]])))))
            i <- sample.int(length(comb[[chr]]), length(comb[[chr]]))
            for (k in seq_len(length(x))) 
            {
                x[[k]][[chr]] <- sort(comb[[chr]][i[ seq(lens[k]+1, lens[k+1]) ]])
            }
        }
        x
    }
    corSummary <- function()
    {
        pbi <<- 1
        fdata <- fake.edata()
        simPeakSummary <-
            sapply(names(peaks),
                   function(chr) {
                       setTxtProgressBar(pb, pbi)
                       pbi <<- pbi + 1
                       ## generate fake dataset similar to edata
                       chrpeaks <- peaks[[chr]]
                       in.promoter <- !is.na(findOverlaps(chrpeaks,
                           with(gpromoters.split[[chr]], IRanges(start, end)),
                           multiple = FALSE))
                       countOverlapping <- function(x)
                       {
                           as.numeric(as.table(t(findOverlaps(
                               fdata[[x]][[chr]], chrpeaks, multiple = TRUE))))
                       }
                       ans <- data.frame(promoter = ifelse(in.promoter, "In promoter", "Not in promoter"))
                       for (nm in names(fdata))
                           ans[[nm]] <- sqrt(countOverlapping(nm))
                       ans
                   }, simplify = FALSE)
        simPeakSummary.df <- do.call(rbind, simPeakSummary)
        
        simdsub.promoter <- subset(simPeakSummary.df, promoter == "In promoter")[names(edata)]
        simdsub.rest <- subset(simPeakSummary.df, promoter != "In promoter")[names(edata)]
        list(rest = fcorrelation(simdsub.rest),
             promoter = fcorrelation(simdsub.promoter))
    }    
    cors <- replicate(10, corSummary(), simplify = FALSE)
    close(pb)

    dsub.promoter <- subset(peakSummary.df, promoter == "In promoter")[names(edata)]
    dsub.rest <- subset(peakSummary.df, promoter != "In promoter")[names(edata)]
    
    list(peakSummary = peakSummary.df, 
         cors.obs = list(rest = fcorrelation(dsub.rest),
                         promoter = fcorrelation(dsub.promoter)),
         cors.null = cors, 
         peak.cutoff = peak.cutoff)
}


extract.cors <- function(x)
{
    cor.names <- 
        with(x$cors.obs, 
             outer(rownames(rest), colnames(rest), FUN = paste, sep = ":")[lower.tri(rest)])
    null.cors <- 
        sapply(x$cors.null, 
               function(u) {
                   with(u, c(rest[lower.tri(rest)], promoter[lower.tri(promoter)]))
               })
    colnames(null.cors) <- sprintf("null_%g", seq_len(ncol(null.cors)))
    obs.cors <- with(x$cors.obs, c(rest[lower.tri(rest)], promoter[lower.tri(promoter)]))
    data.frame(cors = c(obs.cors, null.cors), 
               sample = rep(c("observed", "subsampled"), c(length(obs.cors), length(null.cors))),
               type = c(sprintf("rest_%s", cor.names), sprintf("promoter_%s", cor.names)))
}


@ 


\newpage


One limitation of the first approach is that the results are affected
by sample size; in particular, more reads would move the curves up,
and it is not clear how their shapes would change.  This is not easy
to address, but we provide a second method which is hopefully more
robust to sample size changes.
\vspace{4mm}

The idea is as follows: When comparing several lanes, 
\begin{itemize}
\item Determine peaks in the combined data.
\item For each peak, compute number of overlapping reads
\item Compute pairwise correlations
\end{itemize}
The correlations are generally high, suggesting that there is
substantial overlap in the peak sets.  To determine if the overlap is
comparable to what we would expect under replication, we generate
observations from a reference distribution as follows:
\begin{itemize}
\item Start with all reads in the combined data
\item Randomly subsample reads and assign to each lane keeping the
  total number of reads unchanged (do this on a per-chromosome basis).
\item Compute correlations as above from this fake dataset.
\item Repeat this several times (10 in the examples shown here).
\end{itemize}
We then plot the reference values and the actual observed value.  In
summary, the observed correlations are usually smaller than what we
woould expect if the lanes were exact replicates, but the differences
are not big.  So, even though there are some differences in the sets
of binding sites enriched in the different lanes, they are largely
similar, and we are justified in combining data from the three
antibodies.
\vspace{4mm}

Similar plots for the primary mouse myotubes are shown later (page 17
onwards).


\newpage

<<echo=FALSE,results=hide>>=

all.reads <- myodMyo[c(1, 3, 5, 2, 4, 6, 7)]
names(all.reads)[1:7] <- 
    c("blast_7311", "blast_6975", "blast_6196", 
      "tube_7311", "tube_6975", "tube_6196", "preimmune")
ereads <- gdApply(all.reads,
                  function(x, seqLen = 200) {
                      sort(extendReads(x, seqLen = seqLen))
                  })
foo <- overlapData(ereads[4:6], peak.cutoff = 20)


@ 


\begin{center}
<<fig=TRUE,echo=FALSE,results=hide,height=5>>=
plot(splom(~peakSummary[-c(1, 2, 3, ncol(foo$peakSummary))] | peakSummary$promoter,
           data = foo, pch = ".", cex = 2, 
           main = sprintf("Combined data, peaks of depth >= %g", foo$peak.cutoff),
           xlab = "Scatter plot matrix of sqrt(number of overlapping reads)"))
@ 
<<fig=TRUE,echo=FALSE,results=hide,height=3>>=
plot(stripplot(type ~ cors, data = extract.cors(foo), groups = sample, jitter = TRUE, pch = c(16, 1),
               xlab = "Pairwise correlation of sqrt(number of overlapping reads), observed and subsampled",
               main = sprintf("Combined data, peaks of depth >= %g", foo$peak.cutoff)))
@ 
\end{center}


\newpage

<<echo=FALSE,results=hide>>=
foo <- overlapData(ereads[1:3], peak.cutoff = 20)
@ 


\begin{center}
<<fig=TRUE,echo=FALSE,results=hide,height=5>>=
plot(splom(~peakSummary[-c(1, 2, 3, ncol(foo$peakSummary))] | peakSummary$promoter,
           data = foo, pch = ".", cex = 2, 
           main = sprintf("Combined data, peaks of depth >= %g", foo$peak.cutoff),
           xlab = "Scatter plot matrix of sqrt(number of overlapping reads)"))
@ 
<<fig=TRUE,echo=FALSE,results=hide,height=3>>=
plot(stripplot(type ~ cors, data = extract.cors(foo), groups = sample, jitter = TRUE, pch = c(16, 1),
               xlab = "Pairwise correlation of sqrt(number of overlapping reads), observed and subsampled",
               main = sprintf("Combined data, peaks of depth >= %g", foo$peak.cutoff)))
@ 
\end{center}


\newpage

The next few pages give similar plots for the Primary mouse myotubes.
The first two plots seem to say that 6196 peaks are a (small) subset
of 6975 peaks.  However, note the very large difference in the number
of peaks (even though the number of reads is comparable), suggesting
some non-biological problem in the 6196 lane.


\newpage


<<echo=FALSE,results=hide>>=

all.reads <- solexa54[c("7", "8")]
names(all.reads) <- c("real_6975", "real_6196")
ereads <- gdApply(all.reads,
                  function(x, seqLen = 200) {
                      sort(extendReads(x, seqLen = seqLen))
                  })
foo <- overlapData(ereads, peak.cutoff = 20)


@ 

\begin{center}
<<fig=TRUE,echo=FALSE,results=hide>>=
plot(plotSamples(ereads, peak.ref = names(ereads)[1], peak.cutoff = 20, ylim = extendrange(c(0, 1))))
@ 
\end{center}

\newpage


\begin{center}
<<fig=TRUE,echo=FALSE,results=hide>>=
plot(plotSamples(ereads, peak.ref = names(ereads)[2], peak.cutoff = 20, ylim = extendrange(c(0, 1))))
@ 
\end{center}


\newpage

\begin{center}
<<fig=TRUE,echo=FALSE,results=hide,height=5>>=

## xyplot(real_6975^2 ~ real_6196^2 | ifelse(promoter, "In promoter", "Not in promoter"),
##        data = foo$peakSummary, pch = ".", cex = 2, 
##        main = sprintf("Combined data, peaks of depth >= %g", foo$peak.cutoff),
##        xlab = "Number of overlapping reads in real_6196",
##        ylab = "Number of overlapping reads in real_6975", alpha = 0.5)

## xtabs(~I(real_6975^2 >= 20) + I(real_6196^2 >= 20),
##       data = foo$peakSummary)

## xyplot(real_6975^2 ~ real_6196^2 | ifelse(promoter, "In promoter", "Not in promoter"),
##        data = foo$peakSummary, pch = ".", cex = 2, 
##        panel = panel.smoothScatter,
##        main = sprintf("Combined data, peaks of depth >= %g", foo$peak.cutoff),
##        xlab = "Number of overlapping reads in real_6196",
##        ylab = "Number of overlapping reads in real_6975")


plot(splom(~peakSummary[-c(1, 2, 3, ncol(peakSummary))] | peakSummary$promoter,
           data = foo, pch = ".", cex = 2, 
           main = sprintf("Combined data, peaks of depth >= %g", foo$peak.cutoff),
           xlab = "Scatter plot matrix of sqrt(number of overlapping reads)"))

@ 
<<fig=TRUE,echo=FALSE,results=hide,height=3>>=
plot(stripplot(type ~ cors, data = extract.cors(foo), groups = sample, jitter = TRUE, pch = c(16, 1),
               xlab = "Pairwise correlation of sqrt(number of overlapping reads), observed and subsampled",
               main = sprintf("Combined data, peaks of depth >= %g", foo$peak.cutoff)))
@ 
\end{center}


\newpage

To address Yi's question: ``Could you compare the real 6196tube and
real 6975tube to the 3 separate antibody lanes from C2 myotubes? The
figures on pages 18-20 indicate that real 6196 peaks are a subset of
real 6975 peaks. I am wondering if that is because real 6975 is just
too much better than all other lanes we have run so far, or real 6196
is not of high quality.''
\vspace{4mm}
Also included is the combined Fibroblast+MyoD.


<<echo=FALSE,results=hide>>=

all.reads <- c(myodMyo[c("2", "4", "7")], 
               solexa54[c("7", "8")],
               GenomeDataList(list(cfibromyod = combineLaneReads(myodFibro[c("2", "4", "7")]))))
names(all.reads) <- c("tube_7311", "tube_6975", "tube_6196", 
                      "real_6975", "real_6196", "cfibromyod")
ereads <- gdApply(all.reads,
                  function(x, seqLen = 200) {
                      sort(extendReads(x, seqLen = seqLen))
                  })

@ 

\newpage

\begin{center}
<<fig=TRUE,echo=FALSE,results=hide>>=
plot(plotSamples(ereads, peak.ref = names(ereads)[1], peak.cutoff = 20, ylim = extendrange(c(0, 1))))
@ 
\end{center}


\newpage

\begin{center}
<<fig=TRUE,echo=FALSE,results=hide>>=
plot(plotSamples(ereads, peak.ref = names(ereads)[2], peak.cutoff = 20, ylim = extendrange(c(0, 1))))
@ 
\end{center}


\newpage

\begin{center}
<<fig=TRUE,echo=FALSE,results=hide>>=
plot(plotSamples(ereads, peak.ref = names(ereads)[3], 
                 peak.cutoff = 20, ylim = extendrange(c(0, 1))))
@ 
\end{center}


%%%%%%%%%%%%%%%


\newpage

(Not directly relevant) Comparison of Fibroblast+MyoD with myotubes
using correlation.


<<echo=FALSE,results=hide>>=

all.reads <- 
    GenomeDataList(list(ctubes = combineLaneReads(myodMyo[c("2", "4", "7")]),
                        cfibromyod = combineLaneReads(myodFibro[c("2", "4", "7")])))

                        ## primary_6975 = solexa54[["7"]]))
ereads <- gdApply(all.reads,
                  function(x, seqLen = 200) {
                      sort(extendReads(x, seqLen = seqLen))
                  })
foo <- overlapData(ereads, peak.cutoff = 20)

@ 


\begin{center}
<<fig=TRUE,echo=FALSE,results=hide,height=4>>=
plot(splom(~peakSummary[-c(1, 2, 3, ncol(peakSummary))] | peakSummary$promoter,
           data = foo, pch = ".", cex = 2, 
           main = sprintf("Combined data, peaks of depth >= %g", foo$peak.cutoff),
           xlab = "Scatter plot matrix of sqrt(number of overlapping reads)"))

@ 
<<fig=TRUE,echo=FALSE,results=hide,height=4>>=
plot(splom(~peakSummary[-c(1, 2, 3, ncol(peakSummary))] | peakSummary$promoter,
           data = foo, pch = ".", cex = 2, 
           panel = panel.smoothScatter,
           main = sprintf("Combined data, peaks of depth >= %g", foo$peak.cutoff),
           xlab = "Scatter plot matrix of sqrt(number of overlapping reads)"))

@ 
<<fig=TRUE,echo=FALSE,results=hide,height=2>>=
plot(stripplot(type ~ cors, data = extract.cors(foo), groups = sample, jitter = TRUE, pch = c(16, 1),
               xlab = "Pairwise correlation of sqrt(number of overlapping reads), observed and subsampled",
               main = sprintf("Combined data, peaks of depth >= %g", foo$peak.cutoff)))
@ 
\end{center}


\end{document}