pdf("fibroblast_comparison.pdf", width = 12, height = 10) ## are fibroblasts a good model system (for what)? ## real C2C12 mouse muscles is the gold standard ## - C2C12 cell line myotube is a cell-line equivalent [?] ## - fibroblasts (MEF) cell-line is a preferable model system. ## - is it sufficiently good? ## Strategy: ## - find real C2C12 peaks ## - look at coverage under myotube (cell line) and fibroblasts (cell line) library(lattice) library(chipseq) library(chipseqData) data(solexa54) data(myodMyo) data(myodFibro) ## digression: are three antibodies more or less similar? if (FALSE) { library(BSgenome.Mmusculus.UCSC.mm9) data(myodMyo) all.reads <- GenomeDataList(c(as.list(myodMyo), list(ctubes = combineLaneReads(myodMyo[c("2", "4", "7")]), cblasts = combineLaneReads(myodMyo[c("1", "3", "6")])))) names(all.reads)[1:7] <- c("blast_1", "tube_1", "blast_2", "tube_2", "blast_3", "tube_3", "preimmune") ereads <- gdApply(all.reads, function(x, seqLen = 200) { sort(extendReads(x, seqLen = seqLen)) }) peakProfile <- function(ref = ereads$tube_1, combined = ereads$ctubes, chr = "chr1", chrlens = seqlengths(Mmusculus), ...) { ## take random subsets of 'combined' of size 'length(ref)', ## and compute number of peaks as function of cutoff. ## Provides null reference for same thing in 'ref'. ref.chr <- ref[[chr]] comb.chr <- combined[[chr]] nref <- length(ref.chr) ncomb <- length(comb.chr) cutoffs <- 5:20 npeaks <- replicate(10, { cat(".") sub <- comb.chr[sort(sample.int(ncomb, nref))] cov <- coverage(sub, width = chrlens[chr]) data.frame(cutoff = cutoffs, npeaks = sapply(cutoffs, function(lower) length(slice(cov, lower = lower))), type = "simulation") }, simplify = FALSE) npeaks.ref <- { cov <- coverage(ref.chr, width = chrlens[chr]) data.frame(cutoff = cutoffs, npeaks = sapply(cutoffs, function(lower) length(slice(cov, lower = lower))), type = "observed") } npeaks.df <- do.call(rbind, c(npeaks, list(npeaks.ref))) stripplot(factor(cutoff) ~ npeaks, npeaks.df, jitter = TRUE, groups = type, pch = c(1, 16), ...) } peakProfile(ref = ereads$tube_1, combined = ereads$ctubes, chr = "chr5", scales = list(x = list(log = 2))) peakProfile(ref = ereads$tube_2, combined = ereads$ctubes, chr = "chr5") peakProfile(ref = ereads$tube_3, combined = ereads$ctubes, chr = "chr5") } ## promoter information data(geneMouse) gregions <- genomic_regions(genes = geneMouse, proximal = 1000) gregions <- subset(gregions, chrom %in% paste("chr", 1:19, sep = "")) gregions$chrom <- gregions$chrom[drop = TRUE] gpromoters <- gregions[c("chrom", "promoter.start", "promoter.end")] names(gpromoters) <- c("chr", "start", "end") gpromoters.split <- split(gpromoters, gpromoters$chr) ## samples being compared combined <- combineLaneReads(c(solexa54[c("7", "8")], myodMyo[c("2", "4", "7")], myodFibro[c("2", "4", "7")])) all.reads <- GenomeDataList(list(realMouse.6975 = solexa54[["7"]], realMouse.6196 = solexa54[["8"]], ctubes = combineLaneReads(myodMyo[c("2", "4", "7")]), cfibromyod = combineLaneReads(myodFibro[c("2", "4", "7")]), combined = combined, cfibro = combineLaneReads(myodFibro[c("1", "3", "6")]), preimmune = myodMyo[["8"]])) ereads <- gdApply(all.reads, function(x, seqLen = 200) { sort(extendReads(x, seqLen = seqLen)) }) ## number of reads and number of peaks do.call(cbind, lapply(ereads[1:2], function(x) unlist(lapply(x, length)) / 1e3)) countPeaks <- function(x, lower = c(10)) { cov <- coverage(x, width = max(end(x)) + 500) sapply(lower, function(i) length(slice(cov, lower = i))) } do.call(cbind, lapply(ereads[1:2], function(x) unlist(lapply(x, countPeaks, lower = 10)))) ## summarizeData <- function(edata, peak.ref, peak.cutoff = 6, ref = peak.ref, ref.cutoff = peak.cutoff, include = names(edata)) { peaks <- gdApply(edata[[peak.ref]], function(g, cutoff = peak.cutoff) { print(length(g)) IntervalTree(slice(coverage(g, width = max(end(g)) + 100L), lower = cutoff)) }) ## accumulate per-peak information peakSummary <- sapply(names(peaks), function(chr) { print(chr) chrpeaks <- peaks[[chr]] in.promoter <- chrpeaks %over% with(gpromoters.split[[chr]], IRanges(start, end)) countOverlapping <- function(x) { as.numeric(as.table(t(findOverlaps(edata[[x]][[chr]], chrpeaks)))) } ans <- data.frame(start = start(chrpeaks), end = end(chrpeaks), promoter = in.promoter) for (nm in names(edata)) ans[[nm]] <- countOverlapping(nm) ans }, simplify = FALSE) peakSummary.df <- do.call(make.groups, peakSummary) rownames(peakSummary.df) <- NULL computeRates <- function(cutoff = 5) { mytab <- function(x) table(factor(x, levels = c(FALSE, TRUE))) dsub <- peakSummary.df ## [peakSummary.df[[ref]] >= ref.cutoff, ] dsub.promoter <- subset(dsub, promoter) props <- c(sapply(include, function(x) prop.table(mytab(dsub[[x]] >= cutoff))[2]), sapply(include, function(x) prop.table(mytab(dsub.promoter[[x]] >= cutoff))[2])) counts <- c(sapply(include, function(x) mytab(dsub[[x]] >= cutoff)[2]), sapply(include, function(x) mytab(dsub.promoter[[x]] >= cutoff)[2])) data.frame(cutoff = cutoff, ref = ref, ref.cutoff = ref.cutoff, promoter = rep(c("All", "Promoter"), each = length(include)), sample = rep(include, 2), proportion = props, counts = counts, stringsAsFactors = FALSE) } props <- do.call(rbind, lapply(1:15, computeRates)) list(peakSummary = peakSummary.df, props = props, peak.cutoff = peak.cutoff, peak.ref = peak.ref, include = include) } fibroPeakSummary.6 <- summarizeData(ereads, peak.ref = "cfibromyod", peak.cutoff = 6, include = setdiff(names(ereads), c("combined"))) xyplot(proportion ~ cutoff | promoter, fibroPeakSummary.6$props, type = c("g", "o"), groups = sample, auto.key = list(lines = TRUE, points = FALSE, columns = 2), ylab = "Proportion of peaks with >= cutoff reads", main = "fibroblasts+MyoD peaks, depth >= 6") fibroPeakSummary.12 <- summarizeData(ereads, peak.ref = "cfibromyod", peak.cutoff = 12, include = setdiff(names(ereads), c("combined"))) fibroPeakSummary.20 <- summarizeData(ereads, peak.ref = "cfibromyod", peak.cutoff = 20, include = setdiff(names(ereads), c("combined"))) ctubePeakSummary.12 <- summarizeData(ereads, peak.ref = "ctubes", peak.cutoff = 10, include = setdiff(names(ereads), c("combined"))) png("sample_comparison_%03d.png", width = 600, height = 400) xyplot(proportion ~ cutoff | promoter, fibroPeakSummary.12$props, type = c("g", "o"), groups = sample, auto.key = list(lines = TRUE, points = FALSE, columns = 2), ylab = "Proportion of peaks with number of overlapping reads >= cutoff ", main = "fibroblasts+MyoD peaks, depth >= 12") xyplot(proportion ~ cutoff | promoter, fibroPeakSummary.20$props, type = c("g", "o"), groups = sample, auto.key = list(lines = TRUE, points = FALSE, columns = 2), ylab = "Proportion of peaks with number of overlapping reads >= cutoff ", main = "fibroblasts+MyoD peaks, depth >= 20") ## xyplot(counts ~ cutoff | promoter, fibroPeakSummary.10$props, type = c("g", "o"), ## groups = sample, auto.key = list(lines = TRUE, points = FALSE, columns = 2), ## ylab = "Proportion of peaks with number of overlapping reads >= cutoff ", ## main = "fibroblasts+MyoD peaks, depth >= 10") xyplot(proportion ~ cutoff | promoter, ctubePeakSummary.12$props, type = c("g", "o"), groups = sample, auto.key = list(lines = TRUE, points = FALSE, columns = 2), ylab = "Proportion of peaks with number of overlapping reads >= cutoff ", main = "Myotube peaks, depth >= 10") dev.off() splom(~data.frame(asinh(realMouse), asinh(ctubes), asinh(cfibromyod)), data = realMousePeakSummary.df, varnames = c("realMouse", "myotubes", "fibroblasts"), main = "asinh(number of reads overlapping real_mouse peaks with depth >= 6)", panel = panel.smoothScatter) splom(~data.frame(asinh(realMouse), asinh(ctubes), asinh(cfibromyod)), data = realMousePeakSummary.df, subset = promoter, varnames = c("realMouse", "myotubes", "fibroblasts"), main = "asinh(number of reads overlapping real_mouse peaks with depth >= 6, promoters only)", panel = panel.smoothScatter) if (FALSE) { FUN <- sqrt FUN <- log1p FUN <- asinh xyplot(FUN(ctubes) + FUN(cfibromyod) ~ FUN(realMouse), data = realMousePeakSummary.df, outer = TRUE, panel = panel.smoothScatter) xyplot(FUN(ctubes) + FUN(cfibromyod) ~ FUN(realMouse), data = realMousePeakSummary.df, outer = TRUE, pch = ".", cex = 2) } ## ##### ## computeRates <- ## function(data, ## ref = c("realMouse", "ctubes", "cfibromyod"), ## cutoff = 5, ref.cutoff = 6) ## { ## ref <- match.arg(ref) ## rest <- setdiff(c("realMouse", "ctubes", "cfibromyod"), ## ref) ## dsub <- data[data[[ref]] >= ref.cutoff, ] ## dsub.promoter <- subset(dsub, promoter) ## props <- c(sapply(rest, function(x) prop.table(table(dsub[[x]] >= cutoff))[2]), ## sapply(rest, function(x) prop.table(table(dsub.promoter[[x]] >= cutoff))[2])) ## counts <- c(sapply(rest, function(x) table(dsub[[x]] >= cutoff)[2]), ## sapply(rest, function(x) table(dsub.promoter[[x]] >= cutoff)[2])) ## data.frame(cutoff = cutoff, ref = ref, ref.cutoff = ref.cutoff, ## promoter = rep(c("All", "Promoter"), each = length(rest)), ## sample = rep(rest, 2), ## proportion = props, counts = counts, ## stringsAsFactors = FALSE) ## } ## number of reads and number of peaks for three myotube lanes ereads <- gdApply(myodMyo[c("2", "4", "7")], function(x, seqLen = 200) { sort(extendReads(x, seqLen = seqLen)) }) nreads <- do.call(cbind, lapply(ereads, function(x) unlist(lapply(x, length)) / 1e3)) countPeaks <- function(x, lower = c(10)) { cov <- coverage(x, width = max(end(x)) + 500) sapply(lower, function(i) length(slice(cov, lower = i))) } npeaks <- do.call(cbind, lapply(ereads, function(x) unlist(lapply(x, countPeaks, lower = 10)))) colnames(nreads) <- colnames(npeaks) <- c("myotube.7311", "myotube.6975", "myotube.6196") nreads npeaks apply(nreads, 2, sum) apply(npeaks, 2, sum)