## The snp135common track must be downloaded directly, else it will timeout: ## #wget ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp135Common.txt.gz #zcat snp135Common.txt.gz | cut -f2,3,4,5,7,14 | gzip > snp135Common.BioC.txt.gz ## ## All operations below work on the above summarized version of the track. ## require(GenomicRanges) df2GR <- function(df, keepColumns=FALSE, ignoreStrand=FALSE){ # {{{ KDH require(GenomicRanges) stopifnot(class(df) == "data.frame") subs <- list(chromStart='start', chromEnd='end', chrom='chr', seqnames='chr') for(s in names(subs)) names(df) = gsub(s, subs[[s]], names(df), ignore=TRUE) stopifnot(all(c("start", "end") %in% names(df))) if('genome' %in% names(attributes(df))) g <- attr(df, 'genome') else g <- NULL if(substr(df$chr, 1, 3)[1] != 'chr') df$chr <- paste('chr', df$chr, sep='') df <- subset(df, !is.na(start) & !is.na(end)) if(!ignoreStrand && ("strand" %in% names(df))) { if(is.numeric(df$strand)) df$strand <- strandMe(df$strand) GR <- with(df, GRanges(chr, IRanges(start=start, end=end), strand=strand)) } else { GR <- with(df, GRanges(chr, IRanges(start=start, end=end))) } if('name' %in% names(df)) { names(GR) <- df$name df$name <- NULL } else { names(GR) <- rownames(df) } if(keepColumns) { skipped = c("rangename","chr","start","end","width","strand") elementMetadata(GR) <- as(df[, setdiff(names(df), skipped), drop=F], "DataFrame") } if('X' %in% names(elementMetadata(GR))) { if(all(is.na(GR$X))) { GR$X <- NULL } else { names(elementMetadata(GR))[which(names(elementMetadata(GR))=='X')]='score' } } if(!is.null(g)) genome(GR) <- g return(GR) } # }}} ## Create a GenomicRanges (GR) object of the snp135 common track ## snp135common.df <- read.table("snp135Common.BioC.txt.gz") names(snp135common.df) <- c('chrom','chromStart','chromEnd', 'name','strand','score') snp135common.GR <- df2GR(snp135common.df, keepColumns=TRUE) genome(snp135common.GR)<-'hg19' show(snp135common.GR) ## Validate this using the SNPlocs.Hsapiens.dbSNP.20111119 package ## library(SNPlocs.Hsapiens.dbSNP.20111119) rsidsToGRanges(c('rs55998931','rs71262673')) snp135common.GR[c('rs55998931','rs71262673')] ranges(snp135common.GR[c('rs55998931','rs71262673')])==ranges(rsidsToGRanges(c('rs55998931','rs71262673'))) strand(snp135common.GR) <- '*' ## UCSC strand annotation seems to be misleading ## UCSC's Single Nucleotide Polymorphisms are 2nt wide. This will not do. ## snp135common.GR <- resize(snp135common.GR, width=1, fix='end') if(!all(ranges(snp135common.GR[c('rs55998931','rs71262673')])==ranges(rsidsToGRanges(c('rs55998931','rs71262673'))))) message("UCSC SNPs are still 2nt wide...") ## ## Fixed, now save it: ## FDb.UCSC.snp135common.hg19 <- GenomicRangesToFeatureDb( snp135common, URL='http://genome.ucsc.edu/', tableName='snp135common', src='UCSC data table', label='Common (MAF > 0.01) SNPs from dbSNP build 135' ) saveDb(FDb.UCSC.snp135common.hg19, file='FDb.UCSC.snp135common.hg19.sqlite')