SNP impacting hormonal binding motifs in function of strain-specific RNA responses in male germ cells.

Published: 06-11-2018 | Version 1 | DOI: 10.17632/3s94xbbtjx.1
Contributors:
Ludwig Stenz,
Julien Prados

Description

This dataset contains: 1) FVB/N and C57BL/6J alleles that differ by SNPs (obtained from the Sanger Institute) and that affect FOXA1, FOXA2, FOXA3, ESR1 and ANDR motifs according to HOCOMOCO position weight matrices (PWMs): FOXA1_MOUSE.H11MO.0.A, FOXA2_MOUSE.H11MO.0.A, FOXA3_MOUSE.H11MO.0.A, ESR1_MOUSE.H11MO.1.A and ANDR_MOUSE.H11MO.1.A; 2) strain-specific RNA responses to in utero exposure to di(2-ethylhexyl)phthalate (DEHP) in male germ cells, assessed using all-RNA-seq (GEO series accession number GSE107839).

Files

Steps to reproduce

# Steps to reproduce (run in an R session).

# 1) Work on SNPs ----

# Required packages. library() stops with an error when a package is missing,
# whereas require() only returns FALSE and lets the script run on.
library(VariantAnnotation)
library(BSgenome.Mmusculus.UCSC.mm10)
library(EnsDb.Mmusculus.v79)

# Load FVB mutations. The URL must not carry surrounding whitespace.
# VCF.fvb retains the unfiltered object; vcf.fvb is filtered below.
vcf.fvb <- VCF.fvb <- readVcf(
  "ftp://ftp-mouse.sanger.ac.uk/REL-1505-SNPs_Indels/strain_specific_vcfs/FVB_NJ.mgp.v5.snps.dbSNP142.vcf.gz"
)
nrow(vcf.fvb) # 5556605

# Keep only high quality SNPs (records whose FILTER field is "PASS").
vcf.fvb <- vcf.fvb[rowRanges(vcf.fvb)$FILTER == "PASS"]
# Optional, stricter filters left disabled by the authors:
# vcf.fvb <- vcf.fvb[rowRanges(vcf.fvb)$QUAL >= 228]
# vcf.fvb <- vcf.fvb[elementNROWS(rowRanges(vcf.fvb)$ALT) == 1]
nrow(vcf.fvb) # 4823906

# Retrieve the reference sequence surrounding each SNP. The window is built
# with promoters(upstream = 14, downstream = 15) so that the SNP is expected
# at position 15 of the extracted sequence (checked just below).
rowRanges(vcf.fvb)$ref.seq <- local({
  mot.region <- promoters(rowRanges(vcf.fvb), upstream = 14, downstream = 15)
  # The BSgenome object is queried with UCSC-style sequence names.
  seqlevelsStyle(mot.region) <- "UCSC"
  getSeq(BSgenome.Mmusculus.UCSC.mm10, mot.region)
})

# Verification of sequences: base 15 of each window should equal REF.
table(subseq(rowRanges(vcf.fvb)$ref.seq, 15, 15) == rowRanges(vcf.fvb)$REF)

# Allele construction: copy the reference window and substitute base 15 with
# the alternative allele.
# NOTE(review): unlist() yields exactly one allele per record only if every
# record has a single ALT allele; the disabled elementNROWS() filter above
# would enforce that -- confirm for this VCF.
rowRanges(vcf.fvb)$alt.seq <- rowRanges(vcf.fvb)$ref.seq
subseq(rowRanges(vcf.fvb)$alt.seq, 15, 15) <- unlist(rowRanges(vcf.fvb)$ALT)

# 2) Function to screen PWM matrices on reconstructed alleles ----

# Score a set of sequences against a PWM in a single matchPWM() call.
#
# All sequences are concatenated into one DNAString, matchPWM() is run once
# on the concatenation, and each hit is mapped back to the sequence that
# fully contains it.
#
# Arguments:
#   seq  the sequences to scan (one element per sequence).
#   pwm  position weight matrix passed to matchPWM().
#   ...  further arguments forwarded to matchPWM().
#
# Returns a numeric vector with one entry per element of `seq`: the highest
# score among the hits lying entirely within that sequence, or 0 when the
# sequence has no such hit.
vmatchPWM <- function(seq, pwm, ...) {
  S <- DNAString(paste0(seq, collapse = ""))
  # Ranges of the individual sequences inside the concatenation.
  j <- successiveIRanges(nchar(seq))
  i <- matchPWM(pwm, S, with.score = TRUE, ...)
  # type = "within" discards hits that straddle two adjacent sequences.
  h <- findOverlaps(i, j, type = "within")
  mcols(h)$score <- mcols(i)$score[queryHits(h)]
  # Assign in increasing score order: when a sequence has several hits the
  # last assignment wins, so the highest score is the one that is kept.
  o <- order(mcols(h)$score)
  score <- numeric(subjectLength(h))
  score[subjectHits(h)[o]] <- mcols(h)$score[o]
  score
}

# 3) Reshaping the CuffDiff table ----

# NOTE(review): `cuff.all` is not defined in this script; presumably it is a
# CuffDiff result set loaded beforehand -- confirm.
FPKM <- fpkm(genes(cuff.all))
FPKM <- subset(FPKM, quant_status == "OK")
# One row per gene, one set of columns per sample.
FPKM <- reshape(
  FPKM,
  idvar = "gene_id",
  timevar = "sample_name",
  direction = "wide",
  drop = c("conf_hi", "conf_lo", "stdev", "quant_status")
)
# Round-trip through a tab-delimited file; FPKM.txt is left on disk.
write.table(FPKM, file = "FPKM.txt", sep = "\t")
FPKM <- read.table("FPKM.txt", sep = "\t", header = TRUE)

# 4) Merging FPKM and SNP data ----

# NOTE(review): `are` is not defined in this script; it must provide a
# `nearest_gene_name` column -- confirm where it is built.
# all.x = TRUE keeps every row of `are`, even without a matching gene_id.
ARE.FPKM <- merge(
  are, FPKM,
  by.x = "nearest_gene_name", by.y = "gene_id",
  all.x = TRUE
)

# 5) Compute the strain-specific answer ----

# Difference between the FVB and the C57 log2(FPKM + 1) changes from CTL to
# D300.
ARE.FPKM$D300.effect <-
  (log2(ARE.FPKM$fpkm.FVB_D300_F1 + 1) - log2(ARE.FPKM$fpkm.FVB_CTL_F1 + 1)) -
  (log2(ARE.FPKM$fpkm.C57_D300_F1 + 1) - log2(ARE.FPKM$fpkm.C57_CTL_F1 + 1))