pdf("fibroblast_comparison.pdf", width = 12, height = 10)

## are fibroblasts a good model system (for what)?

## real C2C12 mouse muscles is the gold standard
##  - C2C12 cell line myotube is a cell-line equivalent [?]
##  - fibroblasts (MEF) cell-line is a preferable model system.
##  - is it sufficiently good?

## Strategy:

##  - find real C2C12 peaks
##  - look at coverage under myotube (cell line) and fibroblasts (cell line)

library(lattice)
library(chipseq)
library(chipseqData)

data(solexa54)
data(myodMyo)
data(myodFibro)


## digression: are three antibodies more or less similar?

if (FALSE)
{

    library(BSgenome.Mmusculus.UCSC.mm9)
    
    data(myodMyo)
    all.reads <- GenomeDataList(c(as.list(myodMyo), 
                                  list(ctubes = combineLaneReads(myodMyo[c("2", "4", "7")]),
                                       cblasts = combineLaneReads(myodMyo[c("1", "3", "6")]))))
    names(all.reads)[1:7] <- c("blast_1", "tube_1", "blast_2", "tube_2", "blast_3", "tube_3", "preimmune")
    ereads <- gdApply(all.reads,
                      function(x, seqLen = 200) {
                          sort(extendReads(x, seqLen = seqLen))
                      })


    peakProfile <- function(ref = ereads$tube_1, combined = ereads$ctubes,
                            chr = "chr1", chrlens = seqlengths(Mmusculus), ...)
    {
        ## take random subsets of 'combined' of size 'length(ref)',
        ## and compute number of peaks as function of cutoff.
        ## Provides null reference for same thing in 'ref'.

        ref.chr <- ref[[chr]]
        comb.chr <- combined[[chr]]
        nref <- length(ref.chr)
        ncomb <- length(comb.chr)
        cutoffs <- 5:20
        npeaks <-
            replicate(10,
                  {
                      cat(".")
                      sub <- comb.chr[sort(sample.int(ncomb, nref))]
                      cov <- coverage(sub, width = chrlens[chr])
                      data.frame(cutoff = cutoffs,
                                 npeaks = sapply(cutoffs, function(lower) length(slice(cov, lower = lower))),
                                 type = "simulation")
                  }, simplify = FALSE)
        npeaks.ref <- {
            cov <- coverage(ref.chr, width = chrlens[chr])
            data.frame(cutoff = cutoffs,
                       npeaks = sapply(cutoffs, function(lower) length(slice(cov, lower = lower))),
                       type = "observed")
        }
        npeaks.df <- do.call(rbind, c(npeaks, list(npeaks.ref)))
        stripplot(factor(cutoff) ~ npeaks, npeaks.df, jitter = TRUE,
                  groups = type, pch = c(1, 16),
                  ...)
    }
    
    
    peakProfile(ref = ereads$tube_1, combined = ereads$ctubes, chr = "chr5",
                scales = list(x = list(log = 2)))
    peakProfile(ref = ereads$tube_2, combined = ereads$ctubes, chr = "chr5")
    peakProfile(ref = ereads$tube_3, combined = ereads$ctubes, chr = "chr5")


}


## promoter information

data(geneMouse)
gregions <- genomic_regions(genes = geneMouse, proximal = 1000)
gregions <- subset(gregions, chrom %in% paste("chr", 1:19, sep = ""))
gregions$chrom <- gregions$chrom[drop = TRUE]

gpromoters <- gregions[c("chrom", "promoter.start", "promoter.end")]
names(gpromoters) <- c("chr", "start", "end")
gpromoters.split <- split(gpromoters, gpromoters$chr)

## samples being compared

combined <- combineLaneReads(c(solexa54[c("7", "8")],
                               myodMyo[c("2", "4", "7")],
                               myodFibro[c("2", "4", "7")]))

all.reads <- GenomeDataList(list(realMouse.6975 = solexa54[["7"]],
                                 realMouse.6196 = solexa54[["8"]],
                                 ctubes = combineLaneReads(myodMyo[c("2", "4", "7")]),
                                 cfibromyod = combineLaneReads(myodFibro[c("2", "4", "7")]),
                                 combined = combined,
                                 cfibro = combineLaneReads(myodFibro[c("1", "3", "6")]),
                                 preimmune = myodMyo[["8"]]))


ereads <- gdApply(all.reads,
                  function(x, seqLen = 200) {
                      sort(extendReads(x, seqLen = seqLen))
                  })

## number of reads and number of peaks

do.call(cbind, lapply(ereads[1:2], function(x) unlist(lapply(x, length)) / 1e3))

countPeaks <- function(x, lower = c(10))
{
    cov <- coverage(x, width = max(end(x)) + 500) 
    sapply(lower, function(i) length(slice(cov, lower = i)))
}

do.call(cbind, lapply(ereads[1:2], function(x) unlist(lapply(x, countPeaks, lower = 10))))


## 


summarizeData <-
    function(edata, peak.ref, peak.cutoff = 6, ref = peak.ref, ref.cutoff = peak.cutoff,
             include = names(edata))
{
    peaks <-
        gdApply(edata[[peak.ref]],
                function(g, cutoff = peak.cutoff) {
                    print(length(g))
                    IntervalTree(slice(coverage(g, width = max(end(g)) + 100L), lower = cutoff))
                })
    ## accumulate per-peak information
    peakSummary <-
        sapply(names(peaks),
               function(chr) {
                   print(chr)
                   chrpeaks <- peaks[[chr]]
                   in.promoter <-
                     chrpeaks %over% with(gpromoters.split[[chr]], IRanges(start, end))
                   countOverlapping <- function(x)
                   {
                       as.numeric(as.table(t(findOverlaps(edata[[x]][[chr]], chrpeaks))))
                   }
                   ans <- data.frame(start = start(chrpeaks),
                                     end = end(chrpeaks),
                                     promoter = in.promoter)
                   for (nm in names(edata))
                       ans[[nm]] <- countOverlapping(nm)
                   ans
               }, simplify = FALSE)
    peakSummary.df <- do.call(make.groups, peakSummary)
    rownames(peakSummary.df) <- NULL
    computeRates <- function(cutoff = 5)
    {
        mytab <- function(x) table(factor(x, levels = c(FALSE, TRUE)))
        dsub <- peakSummary.df ## [peakSummary.df[[ref]] >= ref.cutoff, ]
        dsub.promoter <- subset(dsub, promoter)
        props <- c(sapply(include, function(x) prop.table(mytab(dsub[[x]] >= cutoff))[2]),
                   sapply(include, function(x) prop.table(mytab(dsub.promoter[[x]] >= cutoff))[2]))
        counts <- c(sapply(include, function(x) mytab(dsub[[x]] >= cutoff)[2]),
                    sapply(include, function(x) mytab(dsub.promoter[[x]] >= cutoff)[2]))
        data.frame(cutoff = cutoff, ref = ref, ref.cutoff = ref.cutoff,
                   promoter = rep(c("All", "Promoter"), each = length(include)),
                   sample = rep(include, 2),
                   proportion = props, counts = counts,
                   stringsAsFactors = FALSE)
    }
    props <- do.call(rbind, lapply(1:15, computeRates))
    list(peakSummary = peakSummary.df, props = props,
         peak.cutoff = peak.cutoff, peak.ref = peak.ref,
         include = include)
}

fibroPeakSummary.6 <- summarizeData(ereads, peak.ref = "cfibromyod", peak.cutoff = 6,
                                    include = setdiff(names(ereads), c("combined")))

xyplot(proportion ~ cutoff | promoter, fibroPeakSummary.6$props, type = c("g", "o"), 
       groups = sample, auto.key = list(lines = TRUE, points = FALSE, columns = 2),
       ylab = "Proportion of peaks with >= cutoff reads",
       main = "fibroblasts+MyoD peaks, depth >= 6")


fibroPeakSummary.12 <- summarizeData(ereads, peak.ref = "cfibromyod", peak.cutoff = 12,
                                     include = setdiff(names(ereads), c("combined")))
fibroPeakSummary.20 <- summarizeData(ereads, peak.ref = "cfibromyod", peak.cutoff = 20,
                                     include = setdiff(names(ereads), c("combined")))

ctubePeakSummary.12 <- summarizeData(ereads, peak.ref = "ctubes", peak.cutoff = 10,
                                     include = setdiff(names(ereads), c("combined")))


png("sample_comparison_%03d.png", width = 600, height = 400)

xyplot(proportion ~ cutoff | promoter, fibroPeakSummary.12$props, type = c("g", "o"), 
       groups = sample, auto.key = list(lines = TRUE, points = FALSE, columns = 2),
       ylab = "Proportion of peaks with number of overlapping reads >= cutoff ",
       main = "fibroblasts+MyoD peaks, depth >= 12")

xyplot(proportion ~ cutoff | promoter, fibroPeakSummary.20$props, type = c("g", "o"), 
       groups = sample, auto.key = list(lines = TRUE, points = FALSE, columns = 2),
       ylab = "Proportion of peaks with number of overlapping reads >= cutoff ",
       main = "fibroblasts+MyoD peaks, depth >= 20")

## xyplot(counts ~ cutoff | promoter, fibroPeakSummary.10$props, type = c("g", "o"), 
##        groups = sample, auto.key = list(lines = TRUE, points = FALSE, columns = 2),
##        ylab = "Proportion of peaks with number of overlapping reads >= cutoff ",
##        main = "fibroblasts+MyoD peaks, depth >= 10")


xyplot(proportion ~ cutoff | promoter, ctubePeakSummary.12$props, type = c("g", "o"), 
       groups = sample, auto.key = list(lines = TRUE, points = FALSE, columns = 2),
       ylab = "Proportion of peaks with number of overlapping reads >= cutoff ",
       main = "Myotube peaks, depth >= 10")

dev.off()


splom(~data.frame(asinh(realMouse), asinh(ctubes), asinh(cfibromyod)),
      data = realMousePeakSummary.df,
      varnames = c("realMouse", "myotubes", "fibroblasts"),
      main = "asinh(number of reads overlapping real_mouse peaks with depth >= 6)",
      panel = panel.smoothScatter)

splom(~data.frame(asinh(realMouse), asinh(ctubes), asinh(cfibromyod)),
      data = realMousePeakSummary.df,
      subset = promoter,
      varnames = c("realMouse", "myotubes", "fibroblasts"),
      main = "asinh(number of reads overlapping real_mouse peaks with depth >= 6, promoters only)",
      panel = panel.smoothScatter)


if (FALSE)
{
    FUN <- sqrt
    FUN <- log1p
    FUN <- asinh
    xyplot(FUN(ctubes) + FUN(cfibromyod) ~ FUN(realMouse), data = realMousePeakSummary.df,
           outer = TRUE,
           panel = panel.smoothScatter)
    xyplot(FUN(ctubes) + FUN(cfibromyod) ~ FUN(realMouse), data = realMousePeakSummary.df,
           outer = TRUE, pch = ".", cex = 2)
}


## #####


## computeRates <-
##     function(data,
##              ref = c("realMouse", "ctubes", "cfibromyod"),
##              cutoff = 5, ref.cutoff = 6)
## {
##     ref <- match.arg(ref)
##     rest <- setdiff(c("realMouse", "ctubes", "cfibromyod"),
##                     ref)
##     dsub <- data[data[[ref]] >= ref.cutoff, ]
##     dsub.promoter <- subset(dsub, promoter)
##     props <- c(sapply(rest, function(x) prop.table(table(dsub[[x]] >= cutoff))[2]),
##                sapply(rest, function(x) prop.table(table(dsub.promoter[[x]] >= cutoff))[2]))
##     counts <- c(sapply(rest, function(x) table(dsub[[x]] >= cutoff)[2]),
##                 sapply(rest, function(x) table(dsub.promoter[[x]] >= cutoff)[2]))
##     data.frame(cutoff = cutoff, ref = ref, ref.cutoff = ref.cutoff,
##                promoter = rep(c("All", "Promoter"), each = length(rest)),
##                sample = rep(rest, 2),
##                proportion = props, counts = counts,
##                stringsAsFactors = FALSE)
## }


## number of reads and number of peaks for three myotube lanes


ereads <- gdApply(myodMyo[c("2", "4", "7")],
                  function(x, seqLen = 200) {
                      sort(extendReads(x, seqLen = seqLen))
                  })

nreads <- do.call(cbind, lapply(ereads, function(x) unlist(lapply(x, length)) / 1e3))

countPeaks <- function(x, lower = c(10))
{
    cov <- coverage(x, width = max(end(x)) + 500) 
    sapply(lower, function(i) length(slice(cov, lower = i)))
}

npeaks <- do.call(cbind, lapply(ereads, function(x) unlist(lapply(x, countPeaks, lower = 10))))

colnames(nreads) <- colnames(npeaks) <- c("myotube.7311", "myotube.6975", "myotube.6196")

nreads
npeaks
apply(nreads, 2, sum)
apply(npeaks, 2, sum)