#!/usr/bin/env Rscript
suppressPackageStartupMessages(require("Biostrings"))
suppressPackageStartupMessages(require("BSgenome"))
suppressPackageStartupMessages(require("optparse"))


############################################################ crispRtool #######################################################
## Script to search off-target hits in the genome for a given list of single-guide RNAs (sgRNA). 
## Samuel Lessard
## Lettre Lab
## v. 1.02
## Last updated 2017-02-14
##
## Usage: Rscript crispRtool.Rscript [options]



################################################################# PARSE ARGUMENTS ##########################################################

scores=	c(0,0,0.014,0,0,0.395,0.317,0,0.389,0.079,0.445,0.508,0.613,0.851,0.732,0.828,0.615,0.804,0.685,0.583)

option_list <- list(
 make_option(c("-i", "--input"), type="character", default="", help="Input file of sgRNAs in FASTA format. Sequences must be 20bp long and PAMs should not be included. If this option is omitted, random sequences will be analyzed."),
 make_option(c("-o", "--outname"), type="character", default="output.txt", help="Output base name [default \"%default\"]."),
 make_option(c("-p", "--pam"), type="character", default="NGG",help="PAM sequence [default \"%default\"]. Multiple PAMs can be analyzed by passing a comma-separated list of motifs (eg. NGG,NGA)."),
 make_option(c("-s", "--outscore"), type="numeric", default=5, help="Minimum individal score for off-targets to be outputed [default %default]."),
 make_option(c("-n", "--nmm"), type="integer", default=4, help="Maximum number of mismatches allowed in off-targets [default %default]."),
 make_option(c("-a", "--assembly"), type="character", default="BSgenome.Hsapiens.UCSC.hg38.masked", help="Masked genome to be searched for off-targets [default %default]. Active masks are for assembly gaps and intra-contig ambiguities. Repeats are not masked. Genome must be a BSgenome package. The script will try to install the package if it is not installed."),
 make_option(c("-r", "--random"), type="integer", default=5, help="Number of random guides to analyze [default %default]. Ignored if input argument is present.")
)


###################################################1############## FUNCTIONS ###########################################################

# Function to filter matches based on given PAM(s)
filterMatches<-function(matches,pam,guide,genome_seq,reverse=FALSE,nmm=4){
		## Get all matches
		all_matches<-suppressWarnings(Views(genome_seq, start=start(matches), end=end(matches)))
		
		## Get PAM of matches
		if (reverse==TRUE){
			suppressWarnings(pam_seqs<-DNAStringSet(Views(genome_seq, start=start(matches)-nchar(pam[1]), end=end(matches)-20)))
		}else{	
			suppressWarnings(pam_seqs<-DNAStringSet(Views(genome_seq, start=start(matches)+20, end=end(matches)+nchar(pam[1]))))
		}
		
		## Extract matches with correct PAM 
		good_matches<-rep(0,length(all_matches))
		for (j in length(pam)){
			good_matches<-good_matches+elementLengths(vmatchPattern(pam[j],pam_seqs,fixed=FALSE))
		}
		all_matches<-all_matches[which(good_matches>0),]
		
		## Remove matches with to much ambigious nucleotides (>nmm)
		if (length(matches)>0){
			all_matches<-all_matches[which(rowSums(alphabetFrequency(all_matches)[,c("A","C","G","T"),drop=FALSE])>=20-nmm),]
		}
		all_matches	
}




#Calculate scores of matches
getScores<-function(mismatches,scores=scores){
	if (length(mismatches)>0){
		ind_scores<-sapply(mismatches,function(mm){
			n= length(mm)
			d=19
			# Mismatch distance penality
			if (n>1){
				d=mean(dist(mm))
			}
			sc=100
			if (n>0){
				# Score formula
				sc=prod(sapply(mm,function(x){1-scores[x]}))*(1/(((19-d)/19)*4+1))*(1/n^2)*100
			}
			sc
		})
		ind_scores
	}else{
		NULL
	}
}

# Function to output matches to file
writeMatches<-function(sgrna_name,sgrna_seq,chr,matches,scores,strand="+",append=TRUE,outscore=5,outname="results.txt"){
	written=append
	matches<-matches[which(scores>outscore),]
	scores<-scores[which(scores>outscore)]
	if(length(scores>0)){
		matches<-data.frame(sgRNA_ID=sgrna_name,sgRNA_Seq=as.character(sgrna_seq),chr=chr,start=start(matches),end=end(matches),strand=strand,match_Seq=as.character(matches),score=scores)
		write.table(file=outname,matches,quote=FALSE,row.names=FALSE,append=append,col.names=FALSE,sep="\t")
		written=TRUE
	}
	written
}

writeHeader<-function(outname="results.txt"){
	matches<-data.frame(sgRNA_ID="sgRNA_ID",sgRNA_Seq="sgRNA_seq",chr="chr",start="start",end="end",strand="strand",match_Seq="match_seq",score="score")
	write.table(file=outname,matches,quote=FALSE,row.names=FALSE,append=FALSE,col.names=FALSE,sep="\t")
	written=TRUE
}


# Function to prepare summary output file
prepareOutput<-function(sgRNA_list,nmm){
	output<-as.data.frame(sgRNA_list)
	output<-data.frame(sgRNA_ID=rownames(output),sgRNA_Seq=output$x,nontargeting_score=0,best_match_position=NA,best_match_sequence=NA,best_match_score=NA,targeting_guide_score=0,total=0)
	for (i in 0:nmm){
		output[[paste("N_",i,sep="")]]=NA
	}
	output
}

mismatch2=function(sgrna,matches){
	if (length(matches)>0){
		mms=sapply(1:20,function(i){
			ref=names(IUPAC_CODE_MAP[which(grepl(as.character(subseq(sgrna,i,i)),IUPAC_CODE_MAP))])
			sapply(as.character(subseq(DNAStringSet(matches),i,i)),function(x){!any(x==ref)})
		})
		if (is.null(nrow(mms))){
			mms=list(which(mms))
			return(mms)
		}else{
			mms=apply(mms,1,which)
			if (is.matrix(mms)){
				mms=as.list(as.data.frame(mms))
			}
			return(mms)
		}
	}else{
		return(NULL)
	}
	
}

runAnalysisBSgenome	<-function(sgRNA_list,genome,pam,nmm,scores,outname,outscore){
		# Create sgRNA dictionnary
		sg_dict<-PDict(sgRNA_list,max.mismatch=nmm)
		rsg_dict<-PDict(reverseComplement(sgRNA_list),max.mismatch=nmm)

		## Search genome for off-targets
		params <- new("BSParams", X = genome, FUN = matchPDict,exclude = c("M","_"))
		all_forward_matches<-bsapply(params, pdict = sg_dict,max.mismatch=nmm,fixed="pattern")
		all_reverse_matches<-bsapply(params, pdict = rsg_dict,max.mismatch=nmm,fixed="pattern")

		out_matches = paste(outname,".matches.txt",sep="")
		append=writeHeader(outname=out_matches)
		append=TRUE
		
		# Iterate forward and reverse matches
		iterate_matches = mapply(function(forward_matches,reverse_matches,chr){
			
			# Iterate each guide and get scores
			iterate_guides = mapply(function(guide_forward_matches,guide_reverse_matches,dictg,rdictg,guide_name){
				
				# Create list to store results
				results = list()
				results$total_scores=NULL
                results$guide_score=100
                results$total_mm<-rep(0,nmm+1)
                results$best_score = 0
                results$best_sequence = NA
				results$best_position = NA

				# Get reverse complent of PAM
				rc_pam<-as.character(reverseComplement(DNAStringSet(pam)))
				
				# Filter matches
				fmatches<-filterMatches(guide_forward_matches,pam=pam,reverse=FALSE,genome_seq=genome[[chr]],nmm=nmm)
                rmatches<-filterMatches(guide_reverse_matches,pam=rc_pam,reverse=TRUE,genome_seq=genome[[chr]],nmm=nmm)

				rseqs = as.character(rmatches)
				fseqs = as.character(fmatches)

                if (length(fmatches)>0 | length(rmatches)>0){
					# Find mismatches
					fmms<-mismatch2(dictg,fmatches) # Faster than mismatch function
					rmms<-mismatch2(rdictg,rmatches)

                    # Get Scores
		            forward_scores<-getScores(fmms,scores=scores)
					reverse_scores<-getScores(rmms,scores=rev(scores))
					
					all_scores = c(reverse_scores,forward_scores)
                    
					## Calculate total number of mismatches
					all.mms<-c(rmms,fmms)
                    for (z in 0:nmm){
                        results$total_mm[z+1]=results$total_mm[z+1] +length(all.mms[which(lapply(all.mms,length)==z)])
                    }
					
					# Store results
					if (length(all_scores)>0){
							bm<-order(-all_scores)[1]
							if(all_scores[bm]>results$best_score){
								results$best_score=all_scores[bm]
								results$best_match=c(rmatches,fmatches)[bm]
								results$best_sequence=as.character(results$best_match)
								results$best_position=paste(paste("",chr,sep=""),start(results$best_match),end(results$best_match),sep="_")
				
							}

							if (length(fmatches)>0){
								append=writeMatches(guide_name,dictg,chr,fmatches,forward_scores,strand="+",append=append,outscore=outscore,outname=out_matches)
							}
							if (length(rmatches)>0){
								append=writeMatches(guide_name,dictg,chr,rmatches,reverse_scores,strand="-",append=append,outscore=outscore,outname=out_matches)

							}

							## Gather all scores for each guide and each chromosomes
							results$total_scores=all_scores
                    }
                }
				results
				
			},forward_matches,reverse_matches,sg_dict,rsg_dict,names(sg_dict),SIMPLIFY=FALSE)
			iterate_guides
		},all_forward_matches,all_reverse_matches,as.list(names(all_forward_matches)),SIMPLIFY=FALSE)

		#Aggregate results
		output = aggregate(sg_dict,iterate_matches,sgRNA_list,nmm)
		
}
	
	
	
aggregate<-function(dict,iterate_matches,sgRNA_list,nmm){
	## Prepare output files
	output<-prepareOutput(sgRNA_list,nmm)
	for (i in 1:length(dict)){
		total_scores=NULL
		guide_score=100
		total_mm<-rep(0,nmm+1)
		best_score = 0
		best_sequence = NA
		best_position = NA

		for (chr in 1:length(iterate_matches)){
			total_scores = c(total_scores,iterate_matches[[chr]][[i]]$total_scores)
			
			for (z in 0:nmm){
				total_mm[z+1]=total_mm[z+1]+iterate_matches[[chr]][[i]]$total_mm[z+1]
			}
			
			if(iterate_matches[[chr]][[i]]$best_score > best_score){
				best_score = iterate_matches[[chr]][[i]]$best_score
				best_sequence = iterate_matches[[chr]][[i]]$best_sequence
				best_position = iterate_matches[[chr]][[i]]$best_position
			}
		}
		
		## Store aggregated results
		output$score[i]=guide_score
		for (z in 0:nmm){
				output[[paste("N",z,sep="_")]][i]= total_mm[z+1]
		}
		
		output$best_match_position[i]=best_position
		output$best_match_sequence[i]=best_sequence
		output$best_match_score[i]=best_score
		output$Targeting_guide_score[i] = ifelse(total_mm[1]>0,100/sum(total_scores)*100,100/(100+sum(total_scores)) * 100)
	}
		output
}


# Function to check if genome assembly is installed
checkAssembly<-function(assembly){
	
	if (!assembly %in% installed.packages()[,"Package"]){
		genomes<-available.genomes()
		if (!assembly %in% genomes){
			genomes<-genomes[grep("masked",genomes)]
			genomes<-paste(genomes,collapse="\n")
			cat(sprintf("Error: %s not available. Available genomes are:\n\n%s\n",assembly,genomes))
			stop()
		}
		cat(paste(assembly, " not installed. Trying to install the package.\n"))
		source("https://bioconductor.org/biocLite.R")
		biocLite(assembly)
	}
}

makeRandomSet<-function(n){
	set<-sapply(1:n,function(x){paste(sample(DNA_ALPHABET[1:4],20,replace=T),collapse="")})
	set<-DNAStringSet(set)
	names(set)<-sapply(1:n,function(x){paste("random",x,sep="_")})
	set
}

readGuides<-function(input=NULL,n=NULL){
	if (!is.null(n)){
		sgRNA_list=makeRandomSet(n)
	}else{
		if( file.access(input) == -1) {
			stop(sprintf("Specified file ( %s ) does not exist", input))
		}
		sgRNA_list=readDNAStringSet(input, "fasta")
		if (any(width(sgRNA_list)!=20)){
			stop("All guides must have 20bp!\n")
		}
	}
	cat(paste("Testing ",length(sgRNA_list)," guides\n",sep=""))
	sgRNA_list
}

checkArgs<-function(opt){
	if(opt$nmm < 0 | opt$nmm > 5){
		stop("Number of mismatches must be between 0 and 5!\n")
	}
	if(opt$outscore < 0 | opt$outscore > 100){
		stop("Outscore must be between 0 and 100!\n")
	}
	if(opt$input=="" & opt$random<=0){
		stop("No guides to analyze (random guide option <=0 and no input file)\n")
	}
	checkAssembly(opt$assembly)
}

################################################################# MAIN ##########################################################


# Check options and arguments
opt <- parse_args(OptionParser(option_list=option_list))

# Get sgRNA list
sgRNA_list=NULL
if (opt$input == ""){
	sgRNA_list=readGuides(n=opt$random)
}else{		
	input <- opt$input
	sgRNA_list=readGuides(input=input)
}

checkArgs(opt)

outname=opt$outname
outscore=opt$outscore
pam=opt$pam
nmm=opt$nmm
assembly=opt$assembly


#Retreive genome
require(assembly,character.only=TRUE) 			
genome=get(assembly) 	

#Retreive PAMs
if(grepl(",",pam)){pam=unlist(strsplit(pam,","))}

#Run the analysis
cat("Searching for all genome-wide matches (this may take while)...\n")
results=NULL
results = runAnalysisBSgenome(sgRNA_list,genome,pam,nmm,scores,outname,outscore)


#Output summary file
summaryName<-paste(outname,".summary.txt",sep="")
write.table(results,summaryName,quote=F,row.names=F)
cat(paste("Summary results written to [",outname,".summary.txt]\n",sep=""))
cat(paste("Individual marches written to [",outname,".matches.txt]\n",sep=""))

