######################################################################################### # # ### BSA SNP Supplies ### # # # This file contains the scripts for the functions that power the BSAsnp() function # function that performs BSA using SNP chips with the help of a few user inputs. # If you are trying to perform BSA, you may have opened # this file by mistake, please open "BSAsnp.R" instead. # # # # Becker, July 2010 ######################################################################################### ######################################################################################### # This section defines the map.graph function for the final mapping and plots. map.graph <- function(p1,p2,a1,a2,file.stem, plotcM=F){ colvan <- na.omit(new.snp.data[,c(p1,p2)]) # get the relevant columns rownames(colvan) <- new.snp.coord sense.mat <- do.call('rbind', strsplit(rownames(sense), "_")) # create a matrix sense.coord <- paste("X",sense.mat[,1],"_",sense.mat[,2], sep = "") # reformat naming scheme markers <- colvan[colvan[,1] != colvan[,2],] # get the markers sense.m <- sense[sense.coord %in% rownames(markers),] # find the sense markers antisense.m <- antisense[sense.coord %in% rownames(markers),] # the antisense markers have the same index as sense mark.coord <- as.data.frame(sense.mat[sense.coord %in% rownames(markers),c(1:2)], stringsAsFactors = FALSE) mark.coord[,1] <- as.numeric(mark.coord[,1]) mark.coord[,2] <- as.numeric(mark.coord[,2]) colnames(mark.coord) <- c("Chr","bp") #Now make cM like Wolyn, et al if(plotcM){chr.len <- round(c(96.6, 83.3, 103.9, 64.5, 97.7)) #kosambi map distances # assume equal recombination rate across chromosome # assign cM from physical position chr.bp <- tapply(mark.coord$bp,mark.coord$Chr,max) cM <- list() for (i in 1:5) cM[[i]] <- mark.coord$bp[mark.coord$Chr == i]*chr.len[i]/chr.bp[i] mark.coord$cM <- unlist(cM)} #now do the comparison sense.comp <- sense.m[,a1] - sense.m[,a2] #Hk -Lk antisense.comp <- antisense.m[,a1] - antisense.m[,a2] #Hk - Lk smooth.val <- 0.25 #for the loess smooth #do sense smooth.dif <- list() for (i in 1:5){ bulk1 <- cbind(sense.comp[mark.coord$Chr==i], mark.coord[mark.coord$Chr==i,]) smooth.dif[[i]] <- loess((bulk1[,1]) ~ bulk1$bp ,span = smooth.val)$fitted }#close chrom loop #now do antisense smooth.dif.anti <- list() for (i in 1:5){ bulk1 <- cbind(antisense.comp[mark.coord$Chr==i], mark.coord[mark.coord$Chr==i,]) smooth.dif.anti[[i]] <- loess((bulk1[,1]) ~ bulk1$bp ,span = smooth.val)$fitted }#close chrom loop ################################################ #Now do the control experiment with the probes that aren't different num.mark <- nrow(markers) # get the number of markers, use the same # for the controls con.markers <- colvan[colvan[,1] == colvan[,2],] con.markers <-con.markers[sample(1:nrow(con.markers), nrow(markers)),] con.sense.m <- sense[sense.coord %in% rownames(con.markers),] con.antisense.m <- antisense[sense.coord %in% rownames(con.markers),] con.mark.coord <- as.data.frame(sense.mat[sense.coord %in% rownames(con.markers),c(1:2)], stringsAsFactors = FALSE) con.mark.coord[,1] <- as.numeric(con.mark.coord[,1]) con.mark.coord[,2] <- as.numeric(con.mark.coord[,2]) colnames(con.mark.coord) <- c("Chr","bp") if(plotcM){chr.len <- round(c(96.6, 83.3, 103.9, 64.5, 97.7)) #kosambi map distances # assume equal recombination rate across chromosome # assign cM from physical position chr.bp <- tapply(con.mark.coord$bp,con.mark.coord$Chr,max) cM.con <- list() for (i in 1:5) cM.con[[i]] <- con.mark.coord$bp[con.mark.coord$Chr == i]*chr.len[i]/chr.bp[i] con.mark.coord$cM <- unlist(cM.con)} #now do the comparison con.sense.comp <- con.sense.m[,a1] - con.sense.m[,a2] #HP -LP con.antisense.comp <- con.antisense.m[,a1] - con.antisense.m[,a2] #HP - LP smooth.val <- 0.25 # for the loess smooth # do sense con.smooth.dif <- list() for (i in 1:5){ bulk1 <- cbind(con.sense.comp[con.mark.coord$Chr==i], con.mark.coord[con.mark.coord$Chr==i,]) con.smooth.dif[[i]] <- loess((bulk1[,1]) ~ bulk1$bp ,span = smooth.val)$fitted }# close chrom loop #now do antisense con.smooth.dif.anti <- list() for (i in 1:5){ bulk1 <- cbind(con.antisense.comp[con.mark.coord$Chr==i], con.mark.coord[con.mark.coord$Chr==i,]) con.smooth.dif.anti[[i]] <- loess((bulk1[,1]) ~ bulk1$bp ,span = smooth.val)$fitted }# close chrom loop ### make the spreadsheet to save the sense data: traceout.1 <- data.frame("chr1",mark.coord$bp[mark.coord$Chr==1]/1e6, mark.coord$bp[mark.coord$Chr==1]/1e6,smooth.dif[1]) traceout.2 <- data.frame("chr2",(30432385+mark.coord$bp[mark.coord$Chr==2])/1e6, mark.coord$bp[mark.coord$Chr==2]/1e6,smooth.dif[2]) traceout.3 <- data.frame("chr3",(50136912+mark.coord$bp[mark.coord$Chr==3])/1e6, mark.coord$bp[mark.coord$Chr==3]/1e6,smooth.dif[3]) traceout.4 <- data.frame("chr4",(73607619+mark.coord$bp[mark.coord$Chr==4])/1e6, mark.coord$bp[mark.coord$Chr==4]/1e6,smooth.dif[4]) traceout.5 <- data.frame("chr5",(92192531+mark.coord$bp[mark.coord$Chr==5])/1e6, mark.coord$bp[mark.coord$Chr==5]/1e6,smooth.dif[5]) colnames(traceout.1) <- c("chromosome","tot_bp","bp","diff") colnames(traceout.2) <- c("chromosome","tot_bp","bp","diff") colnames(traceout.3) <- c("chromosome","tot_bp","bp","diff") colnames(traceout.4) <- c("chromosome","tot_bp","bp","diff") colnames(traceout.5) <- c("chromosome","tot_bp","bp","diff") final.table <- rbind(traceout.1,traceout.2,traceout.3,traceout.4,traceout.5) ### add cM if they're interested. if(plotcM){final.table$cM <- unlist(cM)} ### save the spreadsheet t.file <- paste(file.stem, "snpprobemapping.csv", sep = "_") write.table(final.table,file = t.file, sep = ",", row.names=F) ### now get the graph limits g.max <- max(c(unlist(smooth.dif.anti),unlist(smooth.dif),unlist(con.smooth.dif.anti),unlist(con.smooth.dif))) g.min <- min(c(unlist(smooth.dif.anti),unlist(smooth.dif),unlist(con.smooth.dif.anti),unlist(con.smooth.dif))) ### prepare to save the graphs in a pdf fn <- paste(file.stem, "snpprobemapping.pdf", sep = "_") pdf(fn, width = 11, height = 8.5) par(mfrow=c(1,5)) ### choose whether to plot by cM or Mb, then make the plots if(plotcM){ for (i in 1:5){ skim=seq.int(1, nrow(mark.coord[mark.coord$Chr==i,]),4) plot((mark.coord$cM[mark.coord$Chr==i])[skim],(smooth.dif[[i]])[skim],type="l",ylim=c(g.min, g.max), ylab = "allele frequency difference", xlab = paste("cM chromosome",i), main = file.stem) lines((mark.coord$cM[mark.coord$Chr==i])[skim],(smooth.dif.anti[[i]])[skim], col = 'blue') lines((con.mark.coord$cM[con.mark.coord$Chr==i])[skim],(con.smooth.dif.anti[[i]])[skim], col = 'yellow') lines((con.mark.coord$cM[con.mark.coord$Chr==i])[skim],(con.smooth.dif[[i]])[skim], col = 'brown') abline(h=0) }#close chrom loop }else{ for (i in 1:5){ skim=seq.int(1, nrow(mark.coord[mark.coord$Chr==i,]),4) plot((mark.coord$bp[mark.coord$Chr==i]/1e6)[skim],(smooth.dif[[i]])[skim],type="l",ylim=c(g.min, g.max), ylab = "allele frequency difference", xlab = paste("Mb chromosome",i), main = file.stem) lines((mark.coord$bp[mark.coord$Chr==i]/1e6)[skim],(smooth.dif.anti[[i]])[skim], col = 'blue') lines((con.mark.coord$bp[con.mark.coord$Chr==i]/1e6)[skim],(con.smooth.dif.anti[[i]])[skim], col = 'yellow') lines((con.mark.coord$bp[con.mark.coord$Chr==i]/1e6)[skim],(con.smooth.dif[[i]])[skim], col = 'brown') abline(h=0) }#close chrom loop }#close if/else dev.off() #shut that pdf } #end of function loop ######################################################################################### ######################################################################################### # This section defines the BSAsnp function that runs the user-driven process of BSA # using SNP chips. BSAsnp <- function(){ # find which OS-dependent readcel function to use. os.type= readline("What type of OS are you using? \nEnter 1 for PC, Enter 2 for Mac, Linux, or Unix: ") # choose which readcel function to use, with default as Mac/Unix/Linux: if (as.numeric(os.type)==1){ #set the memory limit higher memory.limit(size = 3095) ### readcel.R for spatial correction library(affy) library(stats) readcel <- function(cel.files, probe.number, array.size, filter.size ){ mprobe.mean <- matrix(NA, nr=probe.number, nc=length(cel.files) ) for (i in 1:length(cel.files) ) { probe.mean <- log(intensity(read.affybatch(filenames=cel.files [i] ) ) ) mprobe.mean[,i] <- matrix(probe.mean, nc=array.size, byrow=T) [array.size:1,] [probe.ok] } # calculate median probe within array meds <- apply(mprobe.mean,2,median) mprobe.resid <- t( t(mprobe.mean) - meds) filter.size <- 2*trunc(filter.size /2)+1 # force to be odd so the edge will be same length = filter.size/2 fe <- (filter.size-1)/2 # filter edge mprobe.bg <- mprobe.resid # do spatial correction for (i in 1:length(cel.files)) { sp.mat <- matrix(0,nr= array.size,nc=array.size) sp.mat[probe.ok] <- mprobe.resid[,i] sp.mat[!probe.ok] <- median(mprobe.resid[,i]) sp.mat.sum <- apply(sp.mat,2,filter,rep(1,filter.size)) # from start to end, every 51 probes sum in moving sp.mat.sum <- apply(sp.mat.sum,1,filter,rep(1,filter.size)) sp.mat.count <- apply(probe.ok,2,filter,rep(1,filter.size)) sp.mat.count <- apply(sp.mat.count,1,filter,rep(1,filter.size)) #how many probes have been added up to contribute the sp.mat.sum spatial.corrected <- t(sp.mat.sum/sp.mat.count) # fill in edges, with the background of edge boarder spatial.corrected[1:fe,] <- rep(spatial.corrected[fe+1, ], each = fe) spatial.corrected[(array.size+1 - fe):array.size, ] <- rep(spatial.corrected[array.size - fe, ], each = fe) spatial.corrected[,1:(fe)] <- spatial.corrected[,fe+1] spatial.corrected[,(array.size+1 - fe):array.size] <- spatial.corrected[,array.size - fe] mprobe.bg[,i] <- spatial.corrected[probe.ok] #bitmap( paste("spatialResid", cel.files[i], ".png", sep="" ) ) #image(1:array.size, 1:array.size, spatial.corrected, # main = paste(cel.files[i], "spatialResid", filter.size,sep="") ) #dev.off() } mprobe.mean <- (mprobe.mean - mprobe.bg)[chrom.order,] return (mprobe.mean) } }else{ ### readcel.R for spatial correction on Macs and everything other than PCs library(affy) library(stats) readcel <- function(cel.files, probe.number, array.size, filter.size ){ mprobe.mean <- matrix(NA, nr=probe.number, nc=length(cel.files) ) for (i in 1:length(cel.files) ) { probe.mean <- log(intensity(read.affybatch(filenames=cel.files [i] ) ) ) mprobe.mean[,i] <- matrix(probe.mean, nc=array.size, byrow=T) [array.size:1,] [probe.ok] } # calculate median probe within array meds <- apply(mprobe.mean,2,median) mprobe.resid <- t( t(mprobe.mean) - meds) filter.size <- 2*trunc(filter.size /2)+1 # force to be odd so the edge will be same length = filter.size/2 fe <- (filter.size-1)/2 # filter edge mprobe.bg <- mprobe.resid # do spatial correction for (i in 1:length(cel.files)) { sp.mat <- matrix(0,nr= array.size,nc=array.size) sp.mat[probe.ok] <- mprobe.resid[,i] sp.mat[!probe.ok] <- median(mprobe.resid[,i]) sp.mat.sum <- apply(sp.mat,2,filter,rep(1,filter.size)) # from start to end, every 51 probes sum in moving sp.mat.sum <- apply(sp.mat.sum,1,filter,rep(1,filter.size)) sp.mat.count <- apply(probe.ok,2,filter,rep(1,filter.size)) sp.mat.count <- apply(sp.mat.count,1,filter,rep(1,filter.size)) #how many probes have been added up to contribute the sp.mat.sum spatial.corrected <- t(sp.mat.sum/sp.mat.count) # fill in edges, with the background of edge boarder spatial.corrected[1:fe,] <- rep(spatial.corrected[fe+1, ], each = fe) spatial.corrected[(array.size+1 - fe):array.size, ] <- rep(spatial.corrected[array.size - fe, ], each = fe) spatial.corrected[,1:(fe)] <- spatial.corrected[,fe+1] spatial.corrected[,(array.size+1 - fe):array.size] <- spatial.corrected[,array.size - fe] mprobe.bg[,i] <- spatial.corrected[probe.ok] #bitmap( paste("spatialResid", cel.files[i], ".png", sep="" ) ) #image(1:array.size, 1:array.size, spatial.corrected, # main = paste(cel.files[i], "spatialResid", filter.size,sep="") ) #dev.off() }#close reading loop mprobe.mean <- (mprobe.mean - mprobe.bg)[chrom.order,] return (mprobe.mean) }#close readcel function }#close if/else #set your working directory to the supplementary directory supp.dir=readline("Enter the path to your supplementary directory: ") cat("Please wait while supplementary data loads.") #be polite. setwd(supp.dir) #load the supplementary files load("snp.RData", .GlobalEnv) #annotation for SNP probes from Affymetrix load("snp.calls1020.RData", .GlobalEnv) #polymorphism data from Nordborg Lab # new.snp.coord has to be in the Global Environment for map.graph to work assign("new.snp.coord", paste("X",new.snp.data[,1],"_",new.snp.data[,2], sep = ""), envir=.GlobalEnv) cat("\nLoad complete.") #set your working directory to the files directory files.dir= readline("Enter the path to the directory containing your .CEL files: ") setwd(files.dir) cel.files <- list.celfiles() # find files # in case they gave the wrong directory, ask for a new one: while(length(cel.files)==0){ files.dir= readline("There are no .CEL files in that directory. Enter another directory: ") setwd(files.dir) cel.files <- list.celfiles() } # find out what arrays to use: cat("\n", "Available .CEL files:", "\n") for (i in 1:length(cel.files)){cat(i,"-", cel.files[i],"\n")} a1= as.numeric(readline("Enter the number shown above corresponding to the first array to be compared: ")) a2= as.numeric(readline("Enter the number shown above corresponding to the second array to be compared: ")) cat("Find the ecotype ID # for the outcross parent from this site: \nhttp://arabidopsis.usc.edu/Accession/#By%20Name\n\n") cat("Here are a few common ID numbers for convenience:\n", "Bil-7 id 6901\nCAM-16 id 23\nCol-0 id 6909\nCS28181 id 6744\nCvi-0 id 6911\nEden-1 id 6009\nHod id 8235\nKr-0 id 7201 (aka CS28419)\nLer-1 id 6932\nVan-0 id 6977\n", sep="") # find the column in new.snp.data that corresponds to the outcross parent. eco.id.num = readline("Enter the ecotype ID # for the outcross parent: ") p2=which(colnames(new.snp.data) == paste("X", eco.id.num, sep="")) # in case they gave the wrong ecotype ID number while(length(p2)==0){ eco.id.num= readline("That ecotype ID number can't be found in our polymorphism data.\nTry an alternate ID #:") p2=which(colnames(new.snp.data) == paste("X", eco.id.num, sep=""))} #get the cM/Mb decision cM.decider= readline("How should the difference in allele frequency be plotted? Enter 1 for vs. centimorgans, Enter 2 for vs. mega base pairs: ") if (as.numeric(cM.decider)!=1){plotcMtag=F}else{plotcMtag=T} #have the default be centimorgans #get the file.stem file.stem= readline("Enter the name to be given to the output files: ") cat("Please wait while analysis is performed.")#be polite #read/correct .CEL files library(affy) ord <- rep( c(1,2,3,4), nrow(snp)/4 ) # set up the matrix for readcel snp <- cbind( snp, ord) array.size <- 1612 probe.ok <- matrix(FALSE, nr=array.size, nc=array.size) probe.ok.chrom <- matrix(NA, nr=array.size, nc=array.size) probe.ok.position <- matrix(NA, nr=array.size, nc=array.size) probe.ok.ord <- matrix(NA, nr=array.size, nc=array.size) probe.ok[cbind(snp$xpos+1, snp$ypos+1)] <- TRUE probe.ok.chrom[cbind(snp$xpos + 1, snp$ypos+1)] <- as.numeric( as.character(snp$chr) ) probe.ok.position [cbind(snp$xpos + 1, snp$ypos+1)] <- as.numeric( as.character( snp$bp) ) probe.ok.ord [cbind(snp$xpos + 1, snp$ypos+1)] <- as.numeric( as.character(snp$ord) ) probe.ok.chrom.valid <- probe.ok.chrom[probe.ok] probe.ok.position.valid <- probe.ok.position [probe.ok] probe.ok.ord.valid <- probe.ok.ord [probe.ok] chrom.order <- order(probe.ok.chrom.valid, probe.ok.position.valid, probe.ok.ord.valid) # only read in the files you want to analyze- note the indexing vector mprobe.mean <- readcel(cel.files=cel.files, probe.number=nrow(snp), array.size=1612, filter.size=81) colnames (mprobe.mean) <- sub(".CEL", "", cel.files)#name rows rownames (mprobe.mean) <- as.character(snp$snpid)#name cols #sort & subtract signals #sense and antisense have to be in the Global Environment for map.graph to work. assign("antisense", mprobe.mean[seq(1, nrow(snp), 4),]-mprobe.mean[seq(3, nrow(snp),4),], envir=.GlobalEnv) assign("sense", mprobe.mean[seq(2, nrow(snp),4),]-mprobe.mean[seq(4, nrow(snp),4),], envir=.GlobalEnv) p1 <-552 # the column in "new.snp.data" that corresponds to the ref strain (i.e. Col-0) map.graph(p1,p2,a1,a2,file.stem, plotcM=plotcMtag) #execute the function, note we choose arrays 1 and 2 b/c of use of the indexing vector above #let them know you're done. cat("\nAnalysis complete. Look in your .CEL files directory for results.\n\n") }#end function loop