This post presents R code to retrieve SNPs in promoters for a list of genes. You provide your list of genes to the “gene” variable and then use biomaRt (mus_musculus) to get the transcriptional start sites (TSS) for each transcript for your list of genes. The code then uses the TSS info to search in promoter regions (defined here as [-1000, +200 bp] relative to TSS) for SNPs for each gene using biomaRt – my code looks for SNPs distinguishing C57BL/6J and CastEiJ mouse strains, but this can be easily altered in the getSnps function by changing “sp$cast_eij” to another strain name (use listAttributes(snpmart) in biomaRt).
You can view the promoters found to have SNPs in the “dataSnps” variable. Select the transcript SNP set you are interested in and run the final “getflank” function to get the surrounding sequence information using the mouse genome data provided in the BSgenome library. The code finds 100 bp of flanking sequence around each SNP site and this can adjust with the “offset” variable in “getflank” function. The “final” output provides row names and writes a tab delimited file that opens easily in Excel.
This is a quick way to get SNP data in promoter regions for genes of interest.
library(biomaRt) library(BSgenome.Mmusculus.UCSC.mm10) setwd("") #######################INPUT GENES OF INTEREST#### genes = c("Igf2","H19", "Igf2R", "Rasgrf", "Magel2") #######################GET TSS############################# #get transcriptional start sites for all genes of interest ensembl = useMart("ensembl") ensembl = useDataset("mmusculus_gene_ensembl", mart=ensembl) tss <- getBM(attributes=c('ensembl_gene_id', 'ensembl_transcript_id', 'chromosome_name','transcript_start', 'transcript_end','mgi_symbol'), filters = c("mgi_symbol"), values=genes, mart=ensembl) ########################GET SNPS IN PROMOTER########################### #get snps in promoter region : promoter region is defined as [-1000,+200] relative to transcriptional start site snpmart = useMart("snp") snpmart = useDataset("mmusculus_snp", mart=snpmart) getSnps <- function(x){ txstart = as.numeric(x[4]) txend = as.numeric(x[5]) id = x[1] txid = x[2] name = x[6] chr = x[3] if ( txstart > txend ) { getBM(attributes=c('refsnp_id', 'chr_name', 'chrom_start', 'allele', 'c57bl_6j', 'cast_eij'), filters=c("chr_name", "chrom_start", "chrom_end"), values=list( chr, (txstart - 200), (txstart + 1000)), mart = snpmart)->sp sp[sp$cast_eij %in% c("A","T","G","C"), ]->dataSnps } else if ( txstart < txend ) { getBM(attributes=c('refsnp_id', 'chr_name', 'chrom_start', 'allele', 'c57bl_6j', 'cast_eij'), filters=c("chr_name", "chrom_start", "chrom_end"), values=list( 2, ( txstart - 1000 ), ( txstart +200 )), mart = snpmart)->sp sp[sp$cast_eij %in% c("A","T","G","C"), ]->dataSnps } if ( nrow(dataSnps) > 1 ) { cbind(id,txid,name,dataSnps, txstart)->results return(results) } } snps<-apply(tss, 1, getSnps) dataSnps <- snps[!sapply(snps, is.null)]#All SNP Info with NULLS removed ###########################GET FLANKING SEQUENCE FOR EACH SNP########################### #Get sequence for snps dataSnps #view output dataSnps[[19]] -> d #select SNP set for transcript of interest getflank <- function(x) { id = x[1] txid = x[2] name = x[3] position = as.numeric(x[6]) alleles = paste("[",x[7]," > ",x[9],"]", sep="") chr = paste("chr", x[5], sep="") txstart = x[10] offset = 100 leftflank <- getSeq(BSgenome.Mmusculus.UCSC.mm10,chr,position-offset,position-1) rightflank <- getSeq(BSgenome.Mmusculus.UCSC.mm10,chr,position+1,position+offset) paste(leftflank,alleles,rightflank,sep="")->seq cbind(id, txid, name, txstart, position, alleles, chr, seq)->out return(out) } final <- apply(d, 1, getflank) r<- c("id","txid", "name", "txstart", "snp position", "Ref > Alt", "chr", "Seq") rownames(final) <- r write.table(final, file = "SNPSandFlankingSEQinPROMOTERS_Gene.txt", sep="\t")