##FIRST LET'S SEE WHAT THE P-VALUE PROBLEM LOOKS LIKE ##FIXME: just ran the first simulation set1 - need to do set2 again ##and start looking at the appropriate set of adjustments library("GOstats") library("hgu95av2.db") w1 <- as.list(hgu95av2LOCUSID) w2 <- unique(unlist(w1)) set.seed(123) myLL <- sample(w2, 100) xx <- hyperGTest(myLL) xx$numLL xx$numInt [1] 60 sum(xx$pvalues < 0.01) [1] 9 rval = NULL for(i in 1:10) { mx = sample(w2, 100) xx = hyperGTest(mx) rval[[i]] = list(nI = xx$numInt,gC= xx$goCounts,pV= xx$pvalues) } nI = sapply(rval, function(x) x$nI) gC = sapply(rval, function(x) length(x$gC)) countpvs = function(il, pv=0.05) sapply(il, function(x) sum(x$pV < pv)) nPV = countpvs(rval) sapply(rval, function(x) max(x$pV)) ##analysis analyzeRun = function(x, pv=0.05) { nI = sapply(x, function(y) y$nI) gC = sapply(x, function(y) length(y$gC)) nPV = countpvs(x, pv) return(list(nI=nI, gC=gC, pvs = nPV)) } ##for set1 set1 = genPVs(100,100) s1a = analyzeRun(set1) mean(s1a$pvs/s1a$gC) #should be about 0.05, but it isn't s2a = analyzeRun(set1, 0.01) mean(s2a$pvs/s2a$gC) ##is pretty close to 0.01 s3a = analyzeRun(set1, 0.1) mean(s3a$pvs/s3a$gC) ##about 3 times what it should be #for set2 set2 = genPVs(100,100) s2.05 = analyzeRun(set2) mean(s2.05$pvs/s2.05$gC) #should be about 0.05, but it isn't s2.01 = analyzeRun(set2, 0.01) mean(s2.01$pvs/s2.01$gC) ##is pretty close to 0.01 s2.1 = analyzeRun(set2, 0.1) mean(s2.1$pvs/s2.1$gC) ##about 3 times what it should be ##look at int g size set3 = genPVs(500, 75) ##run a simulation genPVs = function(nsim, nsamp, verbose=TRUE) { rval = NULL for(i in 1:nsim) { mx = sample(w2, nsamp) rval[[i]] = hyperGTest(mx) if( verbose ) print(paste("finished", i, "of", nsim)) } return(rval) } set.seed(455) set1 = genPVs(100, 100) ################# ## compare dists ################# data(Ndists) data(Bdists) bothBad = (Ndists==Inf & Bdists==Inf) badN = (Ndists==Inf & Bdists !=Inf) badB = (Bdists==Inf & Ndists !=Inf)