### library(Biostrings) # for mergeIUPACLetters() ### Produce a data frame with 3 columns: ### 1. RefSNP_id ("rs" prefix removed) ### 2. alleles_as_ambig: alleles as an IUPAC letter (complemented if strand is -) ### 3. loc: the current chr_pos col cookRawSNPs <- function(rawsnps) { ids <- rawsnps$RefSNP_id if (!all(substr(ids, 1, 2) == "rs")) stop("some RefSNP ids do not start with \"rs\"") ids <- substr(ids, 3, nchar(ids)) alleles <- gsub("/", "", rawsnps$alleles, fixed=TRUE) in_minus_strand <- which(rawsnps$strand == "-") alleles[in_minus_strand] <- chartr("ACGT", "TGCA", alleles[in_minus_strand]) alleles_as_ambig <- mergeIUPACLetters(alleles) ans <- data.frame(RefSNP_id=ids, alleles_as_ambig=alleles_as_ambig, loc=rawsnps$chr_pos, stringsAsFactors=FALSE) ans <- ans[order(ans$loc), ] row.names(ans) <- NULL ans } ### Return indices of SNPs that are hidden by a strictly less specific ### SNP occurring at the same location. Also return indices of ### SNPs occurring at the same location with incompatible reported ### alleles. hiddenByLessSpecificSNP <- function(snplocs) { loc_dups <- duplicated(snplocs$loc) dups <- duplicated(snplocs[ , c("alleles_as_ambig", "loc")]) locs0 <- unique(snplocs$loc[which(loc_dups & !dups)]) ans <- integer(0) for (loc in locs0) { have_this_loc <- which(snplocs$loc == loc) ambigs <- snplocs$alleles_as_ambig[have_this_loc] less_specific <- mergeIUPACLetters(paste(ambigs, collapse="")) ans <- c(ans, which(snplocs$loc == loc & snplocs$alleles_as_ambig != less_specific)) } ans } ### 'shortseqnames' must be a single string (e.g. "20 21 22") loadAndserializeSNPs <- function(path, shortseqnames, chr_prefix="ch") { seqnames <- paste(chr_prefix, strsplit(shortseqnames, " ", fixed=TRUE)[[1]], sep="") SNPcount <- integer(0) COLNAMES <- c("RefSNP_id", "alleles", "avg_het", "se_het", "chr", "chr_pos", "strand") for (seqname in seqnames) { cat("Loading raw SNPs for ", seqname, " ...\n", sep="") file <- file.path(path, paste(seqname, "_rawsnps.txt", sep="")) rawsnps <- read.table(file, quote="", col.names=COLNAMES, na.strings="?", stringsAsFactors=FALSE) snplocs <- cookRawSNPs(rawsnps) ## No need to drop inconsistent or redundant SNPs #to_drop <- hiddenByLessSpecificSNP(snplocs) #if (length(to_drop) != 0) { # cat("Dropping ", length(to_drop), " inconsistent or redundant SNPs ...\n", sep="") # snplocs <- snplocs[-to_drop, ] # row.names(snplocs) <- NULL #} SNPcount <- c(SNPcount, nrow(snplocs)) objname <- paste(seqname, "_snplocs", sep="") assign(objname, snplocs, envir=.GlobalEnv) cat("Saving ", objname, " (data frame with ", nrow(snplocs), " SNPs) ...\n", sep="") save(list=objname, file=paste(objname, ".rda", sep=""), envir=.GlobalEnv) } names(SNPcount) <- seqnames cat("Saving the SNPcount table ...\n") assign("SNPcount", SNPcount, envir=.GlobalEnv) save(list="SNPcount", file="SNPcount.rda", envir=.GlobalEnv) cat("DONE.\n") }