#!/usr/bin/env bash
#you must compile velvet prior to running sisrs - see the manual for appropriate compilation parameters
#you must install bowtie2
#you must install samtools
#this script uses python2.7 - it has not been tested with python3
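#run format: ./sisrs <command> [flags]   (run ./sisrs -h for the list of commands and flags)
#run with non-defaults: ./sisrs sites -g 50000000 -r ./reference.fasta -p 40 -m 4 -f ./fastq_data/
#paired reads must have filenames containing R1 or R2; other fastq files are treated as unpaired reads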

VERSION=1.0

# Print the usage text (commands, flags, defaults and examples) to stdout.
# Reads the global VERSION. The flag list mirrors the getopts string used by
# the option parser: -a is the assembler name and -f is the folder of reads,
# so the examples pass the data folder with -f.
help() {
    cat <<ENDLINE

 SISRS: Site Identification from Short Read Sequences
 Version $VERSION
 Copyright (c) 2013-2015 Rachel Schwartz <Rachel.Schwartz@asu.edu>
 https://github.com/rachelss/SISRS

Usage:

 sisrs command options

By default, SISRS assumes that

 * A reference genome is not available.
 * The K-mer size to be used by Velvet in contig assembly is 21.
 * Only one processor is available.
 * Files are in fastq format.
 * A site is only required to have data for two species to be included
   in the final alignment.
 * Folders containing reads are in the present working directory.
 * A minimum of three reads are required to call the base at a site
   for a taxon.

Commands:
 sites : produce an alignment of sites from raw reads
 alignContigs : run sisrs skipping the composite genome assembly
 mapContigs : run sisrs, also skipping alignment of reads to composite genome
 identifyFixedSites : run sisrs, also skipping mapping of contigs to a reference
 outputAlignment : get sisrs alignment from sites id'd for individual species
 loci : produce a set of aligned loci based on the most variable regions of
        the composite genome

Flags:

 -g : MANDATORY if running sisrs from the beginning - the approximate genome size
      - this will reduce the size of the composite assembly by using a subset
      of reads to approximate 10x coverage
 -p : use this number of processors
 -r : the path to the reference genome in fasta format
 -k : k-mer size (for assembly)
 -f : the folder containing the folders of reads
 -a : the name of the assembler (default: Velvet)
 -n : the number of reads required to call a base at a site
 -t : the threshold for calling a site; e.g. 0.99 means that >99% of
      bases for that taxon must be one allele; only recommended for
      low ploidy with <3 individuals
 -m : the number of species that are allowed to have missing data at
      a site
 -l : the number of alleles for sisrs loci
 -d : print debugging information (file and folder lists) at the end of the run
 -h : show this help message

 Example command:
 sisrs sites -g 50000000 -p 40 -m 4 -f test_data

 Example command:
 sisrs loci -g 50000000 -p 40 -m 4 -f test_data

ENDLINE
}

#use defaults or input values
KMER=21             #-k: k-mer size handed to velveth
MINREAD=3           #-n: reads required to call a base
PROCESSORS=1        #-p: number of processors
MAINFOLDER=.        #-f: folder containing the folders of reads
THRESHOLD=1         #-t: threshold for calling a site
ALLELES=1           #-l: number of alleles for sisrs loci
DEBUG=0             #-d: set to 1 to print the debug summary at the end of the run
ASSEMBLER=Velvet    #-a: assembler name
CMD=$1              #the first argument is the command; flags start at the second argument

#pipeline steps, in the order they are run
cmds=(buildContigs alignContigs mapContigs identifyFixedSites outputAlignment)

#Show the help unless the command is exactly "sites", "loci" or a pipeline step.
#An exact comparison is used because a regex test against the joined list also
#accepts an empty command and any substring of a step name.
CMDOK=0
for KNOWNCMD in sites loci "${cmds[@]}"; do
    if [[ "${CMD}" == "${KNOWNCMD}" ]]; then
        CMDOK=1
        break
    fi
done
if [[ ${CMDOK} -ne 1 ]]; then
    help
    exit 0
fi

OPTIND=2 # tells getopts to start parsing at the second argument

while getopts g:r:k:p:m:a:f:l:n:t:hd option
do
    case "${option}"
    in
        g) GENOMESIZE=${OPTARG};;
        r) REFFILE=${OPTARG};;
        k) KMER=${OPTARG};;
        p) PROCESSORS=${OPTARG};;
        m) MISSING=${OPTARG};;
        f) MAINFOLDER=${OPTARG};;
        a) ASSEMBLER=${OPTARG};;
        l) ALLELES=${OPTARG};;
        n) MINREAD=${OPTARG};;
        t) THRESHOLD=${OPTARG};;
        d) DEBUG=1;;
        h) help; exit;;
        \? ) echo "Unknown option" >&2; exit 1;;
    esac
done

DIRS="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"    #where is the sisrs file?
DIR=$( dirname "${DIRS}" )                                  #go one level up from location of sisrs file
#remove trailing / if necessary, but keep a bare "/" so the path never becomes empty
while [[ "${MAINFOLDER}" == */ && "${MAINFOLDER}" != "/" ]]; do
    MAINFOLDER=${MAINFOLDER%/}
done

#check that the external tools are on the PATH before doing any work
if [[ "${ASSEMBLER}" == "Velvet" ]]; then       #string comparison; -eq would compare the names as arithmetic expressions
    command -v velveth &>/dev/null || { echo "Velvet must be installed to run SISRS. The installation folder must be in your path.    Aborting." >&2; exit 1; }
fi
command -v bowtie2 &>/dev/null || { echo "Bowtie2 must be installed to run SISRS. The installation folder must be in your path. Aborting." >&2; exit 1; }
command -v samtools &>/dev/null || { echo "Samtools must be installed to run SISRS. The installation folder must be in your path. Aborting." >&2; exit 1; }
command -v parallel &>/dev/null || { echo "GNU parallel must be installed to run SISRS. The installation folder must be in your path. Aborting." >&2; exit 1; }

# Print the sorted, de-duplicated parent directories of the given paths, one per line.
_sisrs_unique_dirs(){
    local p
    for p in "$@"; do
        dirname "${p}"
    done | sort -u
}

# Locate the read files under MAINFOLDER and publish them as global arrays.
# Globals read:    MAINFOLDER, MISSING
# Globals written: FILELIST    - files named *R1*fastq (forward reads of pairs)
#                  FOLDERLIST  - sorted unique folders holding FILELIST entries
#                  FILELISTA   - every *fastq file found
#                  FILELISTU   - *fastq files whose path contains none of
#                                R1, R2, shuffled, sampled (unpaired reads)
#                  FOLDERLISTU - sorted unique folders holding FILELISTU entries
#                  FOLDERLISTA - sorted unique union of both folder lists
#                  MISSING     - if it was empty: number of folders minus 2
# Paths are read NUL-delimited from find and folder names line by line, so
# names containing spaces or glob characters stay single array elements.
get_fastqa(){
    local path

    #get paired fastq
    FILELIST=()
    while IFS= read -r -d '' path; do
        FILELIST+=("${path}")
    done < <(find "${MAINFOLDER}" -name "*R1*fastq" -print0)
    FOLDERLIST=()
    while IFS= read -r path; do
        FOLDERLIST+=("${path}")
    done < <(_sisrs_unique_dirs "${FILELIST[@]}")

    #get unpaired fastq
    FILELISTA=()
    FILELISTU=()
    while IFS= read -r -d '' path; do
        FILELISTA+=("${path}")
        #the patterns are tested against the whole path, not only the file name
        if [[ ${path} != *R1* ]] && [[ ${path} != *R2* ]] && [[ ${path} != *shuffled* ]] && [[ ${path} != *sampled* ]]; then
            FILELISTU+=("${path}")
        fi
    done < <(find "${MAINFOLDER}" -name "*fastq" -print0)
    FOLDERLISTU=()
    while IFS= read -r path; do
        FOLDERLISTU+=("${path}")
    done < <(_sisrs_unique_dirs "${FILELISTU[@]}")

    #all folders with reads, paired or unpaired
    FOLDERLISTA=()
    while IFS= read -r path; do
        if [[ -n "${path}" ]]; then     #printf emits one empty line when both lists are empty
            FOLDERLISTA+=("${path}")
        fi
    done < <(printf '%s\n' "${FOLDERLISTU[@]}" "${FOLDERLIST[@]}" | sort -u)

    if [ -z "$MISSING" ]; then MISSING=$((${#FOLDERLISTA[@]} - 2)); fi    #minimum of data for 2 species unless otherwise specified
}

# Subsample the reads and assemble the composite genome with velveth/velvetg.
# Globals read: GENOMESIZE, FOLDERLISTA, FILELIST, FILELISTU, PROCESSORS, DIR,
#               MAINFOLDER, KMER, REFFILE
# Output: ${MAINFOLDER}/velvetoutput (written by velveth and velvetg)
# Exits with status 1 when no genome size was given or no read folders exist.
buildContigs(){
    local numer denom leftreads unpaired reads_desc ref_desc
    local have_paired=0 have_unpaired=0
    local -a velveth_args

    if [[ -z "${GENOMESIZE}" ]]; then
        echo "You must enter the approximate size of the genome / transcriptome. For more information run sisrs -h" >&2
        exit 1
    fi
    if [[ ${#FOLDERLISTA[@]} -eq 0 ]]; then      #would otherwise divide by zero below
        echo "No folders containing fastq files were found in ${MAINFOLDER}. Aborting." >&2
        exit 1
    fi

    echo "Subsampling data"
    numer=$((10*GENOMESIZE))
    denom=$((100*2*${#FOLDERLISTA[@]}))      #assumes 100bp reads - add option in future
    leftreads=$((numer/denom))
    parallel --jobs "${PROCESSORS}" "python ${DIR}/libexec/sisrs/sub_sample_for_velvet_unshuff.py ${leftreads} {}" ::: "${FOLDERLISTA[@]}"   #subsample data and shuffle

    #unpaired reads are not subsampled: hard-link them under the subsampled_ name velveth is given
    for unpaired in "${FILELISTU[@]}"; do
        ln "${unpaired}" "$(dirname "${unpaired}")/subsampled_$(basename "${unpaired}")"
    done

    #which read types go to velveth: paired reads are left out only when
    #unpaired files exist and no paired files were found
    if [[ ${#FILELISTU[@]} -gt 0 ]]; then have_unpaired=1; fi
    if [[ ${have_unpaired} -eq 0 || ${#FILELIST[@]} -gt 0 ]]; then have_paired=1; fi

    if [[ ${have_paired} -eq 1 && ${have_unpaired} -eq 1 ]]; then
        reads_desc="PE and SE reads"
        ref_desc=", and reference"
    elif [[ ${have_unpaired} -eq 1 ]]; then
        reads_desc="SE reads"
        ref_desc=" and reference"
    else
        reads_desc="PE reads"
        ref_desc=" and reference"
    fi

    #make reference contigs - velveth
    velveth_args=("${MAINFOLDER}/velvetoutput" "${KMER}" -create_binary)
    if [[ -n "${REFFILE}" ]]; then
        velveth_args+=(-fasta -reference "${REFFILE}")
    else
        ref_desc=""
    fi
    velveth_args+=(-fastq)
    if [[ ${have_paired} -eq 1 ]]; then
        velveth_args+=(-shortPaired "${MAINFOLDER}"/*/*shuffled*fastq)
    fi
    if [[ ${have_unpaired} -eq 1 ]]; then
        velveth_args+=(-short "${MAINFOLDER}"/*/subsampled*fastq)
    fi
    echo "Running Velvet with ${reads_desc}${ref_desc}"
    velveth "${velveth_args[@]}"
    echo ==== Velveth is finished ====

    #run velvet - output contigs
    #"auto" must be a word of its own: a "#" glued to it is not a shell comment
    #and would be handed to velvetg as part of the value.
    #NOTE(review): -min_contig_lgth 100 is kept because it was effectively being
    #passed before; drop it if the intent was to disable it.
    velvetg "${MAINFOLDER}/velvetoutput" -exp_cov auto -cov_cutoff auto -min_contig_lgth 100
    echo ==== Velvetg is finished ===
}


# Index the assembled contigs and align every read file to them.
# Globals read: MAINFOLDER, PROCESSORS, FILELIST (forward reads of pairs; the
#               mate name is the path with its first "R1" replaced by "R2"),
#               FILELISTU (unpaired reads)
# For each input the output prefix is the read path without its extension
# (paired inputs also drop the first "R1"). The samtools sort output uses that
# prefix, and bowtie2's stdout/stderr are copied to <prefix>_stdout.log and
# <prefix>_stderr.log by tee.
# NOTE(review): "samtools sort <in> <prefix>" looks like the pre-1.0 samtools
# calling convention - confirm the installed samtools accepts it.
alignContigs(){
    local contigs="${MAINFOLDER}/velvetoutput/contigs"
    local reads mate prefix

    bowtie2-build "${contigs}.fa" "${contigs}"      #index contigs

    #align reads to contigs
    for reads in "${FILELIST[@]}"; do
        prefix=${reads/R1/}         #includes folder but not the read or the extension
        prefix=${prefix%.*}
        mate=${reads/R1/R2}
        echo "==== Aligning FILELIST ${prefix} ===="
        #N=1 allows a mismatch #x The basename of the index for the reference genome
        bowtie2 -p "${PROCESSORS}" -N 1 --local -x "${contigs}" -1 "${reads}" -2 "${mate}" \
            > >(tee "${prefix}_stdout.log") 2> >(tee "${prefix}_stderr.log" >&2) \
            | samtools view -Su -F 4 - \
            | samtools sort - "${prefix}"
    done
    for reads in "${FILELISTU[@]}"; do
        prefix=${reads%.*}          #includes folder but not the extension
        echo "==== Aligning FILELISTU ${prefix} ===="
        #N=1 allows a mismatch #x The basename of the index for the reference genome
        bowtie2 -p "${PROCESSORS}" -N 1 --local -x "${contigs}" -U "${reads}" \
            > >(tee "${prefix}_stdout.log") 2> >(tee "${prefix}_stderr.log" >&2) \
            | samtools view -Su -F 4 - \
            | samtools sort - "${prefix}"
    done
    echo ==== Done Aligning ====
}
    
# Index the per-sample BAM files and, when a reference was given, map the
# assembled contigs to it.
# Globals read: FILELIST, FILELISTU, PROCESSORS, REFFILE, MAINFOLDER
# The BAM name for each read file is its path with the first "R1" and the
# extension removed plus ".bam", the same prefix alignContigs gives to
# samtools sort. The names are computed here and handed to GNU parallel as
# plain arguments, so no shell quoting has to survive inside the job template.
mapContigs(){
    local reads prefix ref_base
    local -a bam_files=()

    #index bam
    for reads in "${FILELIST[@]}" "${FILELISTU[@]}"; do
        prefix=${reads/R1/}
        bam_files+=("${prefix%.*}.bam")
    done
    parallel --jobs "${PROCESSORS}" samtools index {} ::: "${bam_files[@]}"
    echo ==== Done Indexing Bam Files ====

    #map contigs to reference
    if [ -n "${REFFILE}" ]; then
        #index/log prefix is the reference path without its extension; the strip
        #is skipped when the file name has no dot so that a dot in a directory
        #component (./ref, ../ref) is never taken for the extension
        case "${REFFILE##*/}" in
            *.*) ref_base=${REFFILE%.*} ;;
            *)   ref_base=${REFFILE} ;;
        esac
        bowtie2-build "${REFFILE}" "${ref_base}"        #bowtie2-build [options]* <reference_in> <bt2_base>
        #bowtie2 -x <ref_base> -U <fq files> -S <output sam>
        bowtie2 -p "${PROCESSORS}" -N 1 -x "${ref_base}" -f -U "${MAINFOLDER}/velvetoutput/contigs.fa" \
            -S "${MAINFOLDER}/velvetoutput/align_contigs.sam" \
            > >(tee "${ref_base}_stdout.log") 2> >(tee "${ref_base}_stderr.log" >&2)
        echo ==== Done Mapping Contigs ====
    fi
}

# Run get_pruned_dict.py once per read folder through GNU parallel.
# Globals read: DIR, MINREAD, THRESHOLD, PROCESSORS, FOLDERLISTA
identifyFixedSites(){
    #job template for parallel: {} is replaced by one entry of FOLDERLISTA per job;
    #the script receives the folder, the minimum read count and the threshold
    local prune_job="${DIR}/libexec/sisrs/get_pruned_dict.py {} ${MINREAD} ${THRESHOLD}"

    parallel --jobs "${PROCESSORS}" "${prune_job}" ::: "${FOLDERLISTA[@]}"
    echo "==== Done Identifying Fixed Sites Without Error ===="
}

# Run get_alignment.py to write the final alignment.
# Globals read: DIR, MISSING, REFFILE, MAINFOLDER
# The script receives, in order: the number of species allowed to be missing,
# the reference path (the placeholder X when no reference was given) and the
# main folder. Every argument is quoted so that paths containing spaces stay
# single arguments.
outputAlignment(){
    "${DIR}/libexec/sisrs/get_alignment.py" "${MISSING}" "${REFFILE:-X}" "${MAINFOLDER}"
}

# Run the SISRS pipeline steps listed in cmds, in order.
# Arguments: $1 - optional name of the step to start from (defaults to CMD)
# Globals read: cmds, CMD
# When the name is exactly one of the entries of cmds, execution starts at that
# step and continues to the end of the list; for any other name (e.g. "sites"
# or "loci") every step is run from the beginning.
runSISRS(){
    local target=${1:-${CMD}}
    local start=0
    local idx

    #get pos in list of commands
    for (( idx = 0; idx < ${#cmds[@]}; idx++ )); do
        if [[ "${cmds[idx]}" == "${target}" ]]; then
            start=${idx}
            break
        fi
    done

    #run from selected opt to end
    for (( idx = start; idx < ${#cmds[@]}; idx++ )); do
        "${cmds[idx]}"
    done
}

####################LOCI#######################

# Build the list of loci and extract their sequences from the contigs.
# Globals read: MAINFOLDER, DIR
# Step 1 writes ${MAINFOLDER}/loci.txt: every match of the contig-name pattern
# in alignment.nex is extracted, runs of identical consecutive matches are
# counted (uniq -c is applied without a prior sort), and the names are listed
# from the highest to the lowest count.
# Step 2 runs get_seq.py with contigs.fa, loci.txt and the output path
# ref_genes.fa. The script path is quoted so an install path with spaces works.
copyRefContigs(){
    grep -oe "[A-Z]*_*[0-9]*_*len[^/]*" "${MAINFOLDER}/alignment.nex" \
        | uniq -c \
        | sort -k1 -nr \
        | awk '{print $2}' > "${MAINFOLDER}/loci.txt"
    python "${DIR}/libexec/gfr/get_seq.py" "${MAINFOLDER}/velvetoutput/contigs.fa" "${MAINFOLDER}/loci.txt" "${MAINFOLDER}/ref_genes.fa"
}

# Align every sample's reads to the conserved contigs (ref_genes.fa).
# Globals read: MAINFOLDER, PROCESSORS, FOLDERLISTA, FOLDERLIST, FOLDERLISTU
# One sorted BAM per read file is written into "<folder>_loci/", the same
# directory runGFR creates and gfr_mergebam reads. The folder entries already
# start with MAINFOLDER (get_fastqa takes them from find's output), so
# MAINFOLDER must not be prepended a second time.
gfr_aligntoConservedContigs(){
    local species_dir reads log_prefix out_prefix

    bowtie2-build "${MAINFOLDER}"/ref_genes.fa "${MAINFOLDER}"/ref_genes    #build index for conserved contigs
    for species_dir in "${FOLDERLISTA[@]}"; do rm -f "${species_dir}"_loci/*bam; done

    for species_dir in "${FOLDERLIST[@]}"; do
        for reads in "${species_dir}"/*R1*.fastq; do
            # output sorted bam file w/o unaligned reads - lots per folder
            log_prefix=${reads/.fastq/}
            out_prefix="${species_dir}_loci/$(basename "${log_prefix}")"
            echo "Aligning ${reads} to conserved contigs"
            bowtie2 -p "${PROCESSORS}" -N 1 --local -x "${MAINFOLDER}"/ref_genes -1 "${reads}" -2 "${reads/R1/R2}" \
                > >(tee "${log_prefix}_stdout.log") 2> >(tee "${log_prefix}_stderr.log" >&2) \
                | samtools view -Su -F 4 - \
                | samtools sort - "${out_prefix}"
        done
    done

    for species_dir in "${FOLDERLISTU[@]}"; do
        #NOTE(review): only files with "reads" in their name are aligned here,
        #while get_fastqa treats any fastq without R1/R2/shuffled/sampled as
        #unpaired - confirm that this narrower pattern is intended.
        for reads in "${species_dir}"/*reads*fastq; do
            log_prefix=${reads/.fastq/}
            out_prefix="${species_dir}_loci/$(basename "${log_prefix}")"
            bowtie2 -p "${PROCESSORS}" -N 1 --local -x "${MAINFOLDER}"/ref_genes -U "${reads}" \
                > >(tee "${log_prefix}_stdout.log") 2> >(tee "${log_prefix}_stderr.log" >&2) \
                | samtools view -Su -F 4 - \
                | samtools sort - "${out_prefix}"
        done
    done
}

# Produce "<folder>_loci/merged.bam" for every folder in FOLDERLISTA.
# Globals read: FOLDERLISTA, PROCESSORS
# A folder with exactly one *bam file gets merged.bam as a hard link to it;
# folders with several are merged with samtools merge through GNU parallel;
# folders with none are skipped. The files are counted with a glob instead of
# parsing ls output, so empty folders produce no error output and folder names
# with spaces work.
gfr_mergebam(){
    local species_dir candidate
    local -a bams
    local -a tomerge=()

    for species_dir in "${FOLDERLISTA[@]}"; do
        rm -f "${species_dir}_loci/merged.bam"        # remove existing merged.bam file
        bams=()
        for candidate in "${species_dir}"_loci/*bam; do
            #an unmatched glob is left as the literal pattern, which does not exist
            if [[ -e "${candidate}" ]]; then
                bams+=("${candidate}")
            fi
        done
        if [[ ${#bams[@]} -eq 1 ]]; then      #if only one file just link
            ln "${bams[0]}" "${species_dir}_loci/merged.bam"
        elif [[ ${#bams[@]} -gt 1 ]]; then
            tomerge+=("${species_dir}_loci")
        fi
    done
    if [[ ${#tomerge[@]} -gt 0 ]]; then
        parallel -j "${PROCESSORS}" samtools merge '{}/merged.bam {}/*.bam' ::: "${tomerge[@]}"
    fi
}

# Turn each folder's merged.bam into per-contig SAM files and run
# make_alignment_from_sam.py on each of them.
# Globals read: FOLDERLISTA, PROCESSORS, DIR, ALLELES
# Steps: remove SAM files left in "<folder>_loci/", convert merged.bam to
# merged.sam, split it with separate_sam_by_contig.py, delete merged.sam, then
# process every remaining *sam file with make_alignment_from_sam.py.
gfr_getAlleles(){
    local species_dir sam

    #files are removed one at a time: there can be lots of sam files per folder
    for species_dir in "${FOLDERLISTA[@]}"; do
        for sam in "${species_dir}"_loci/*.sam; do
            rm -f "${sam}"
        done
    done

    parallel -j "${PROCESSORS}" "samtools view {}_loci/merged.bam > {}_loci/merged.sam" ::: "${FOLDERLISTA[@]}"     #convert to sam file
    parallel -j "${PROCESSORS}" "python ${DIR}/libexec/gfr/separate_sam_by_contig.py {}_loci/merged.sam" ::: "${FOLDERLISTA[@]}"   #separate by gene -> lots of sam files per folder
    for species_dir in "${FOLDERLISTA[@]}"; do
        rm -f "${species_dir}_loci/merged.sam"
    done

    for species_dir in "${FOLDERLISTA[@]}"; do
        find "${species_dir}_loci/" -type f -name '*sam' | parallel -j "${PROCESSORS}" "python ${DIR}/libexec/gfr/make_alignment_from_sam.py {} ${ALLELES}"
    done

#    parallel -j "${PROCESSORS}" 'for j in {}_loci/*sam; do' "python ${DIR}/libexec/gfr/make_alignment_from_sam.py" '$j' "$ALLELES; done" ::: "${FOLDERLISTA[@]}"
}

# Collect the per-species sequences, split them by gene and align each gene.
# Globals read: FOLDERLISTA, PROCESSORS, DIR, ALLELES
# Works in the "loci" directory relative to the current working directory.
gfr_getAlignments(){
    local species_dir species

    #concatenate fa files -> one fa file per species in loci/, named after the folder's basename ({/})
    parallel -j "${PROCESSORS}" 'for j in {}_loci/*fa; do cat $j >> loci/{/}.fa; done'  ::: "${FOLDERLISTA[@]}"

    python "${DIR}/libexec/gfr/genealign_from_allgenes.py" "${ALLELES}"  #make files for each gene containing seq for each species
    for species_dir in "${FOLDERLISTA[@]}"; do
        #rename the species files so the '*fa' search below skips them; the name
        #must be the folder's basename to match the file written through {/} above
        species=$(basename "${species_dir}")
        mv "loci/${species}.fa" "loci/${species}.fasta"
    done
    find loci/ -type f -name '*fa' | parallel -j "${PROCESSORS}" "mafft {} > {.}_align.fa"
}

# Run the loci (GFR) part of the pipeline.
# Globals read: FOLDERLISTA
# Makes sure a "<folder>_loci" directory exists for every read folder, runs the
# alignment, merge and allele steps, prepares an empty-of-*fa "loci" directory
# in the current working directory, and finally builds the alignments.
runGFR(){
    local species_dir stale

    for species_dir in "${FOLDERLISTA[@]}"; do
        mkdir -p "${species_dir}_loci"
    done

    gfr_aligntoConservedContigs
    gfr_mergebam
    gfr_getAlleles

    #start from a loci directory without *fa files from a previous run
    mkdir -p loci
    for stale in loci/*fa; do
        rm -f "${stale}"
    done

    gfr_getAlignments
}

####################RUN####################
get_fastqa

#CMD names a single pipeline step only when it is exactly one of the entries of
#cmds; a regex test against the joined list would also accept an empty command
#and any substring of a step name
CMDISSTEP=0
for STEPNAME in "${cmds[@]}"; do
    if [[ "${CMD}" == "${STEPNAME}" ]]; then
        CMDISSTEP=1
        break
    fi
done

if [[ "${CMD}" == "sites" ]] || [[ ${CMDISSTEP} -eq 1 ]]; then
    # Run SISRS only
    echo "**** SISRS ****"
    runSISRS
elif [[ "${CMD}" == "loci" ]]; then
    # Run SISRS with loci (GFR). ref_genes.fa is needed first: if it is missing it
    # is built from alignment.nex, and if alignment.nex is missing as well SISRS is
    # run to produce it.
    echo "**** SISRS loci ****"
    if [[ ! -f "${MAINFOLDER}/ref_genes.fa" ]]; then
        if [[ ! -f "${MAINFOLDER}/alignment.nex" ]]; then
            # need to rerun SISRS from scratch to get ref_genes.fa
            echo "** Missing alignment and reference, running SISRS"
            runSISRS
        else
            echo "** Missing reference, copying from previous SISRS session"
        fi
        copyRefContigs  # Get ref_genes.fa from the alignment
    fi

    # get individual loci
    runGFR
else
    echo "You must specify sisrs sites or loci" >&2
    exit 1
fi

#debug summary, printed after the run when -d was given
if [[ "${DEBUG}" -eq 1 ]]; then
    echo "DIRS          = ${DIRS}"
    echo "DIR           = ${DIR}"
    echo "MAINFOLDER    = ${MAINFOLDER}"
    echo "FILELIST      = ${FILELIST[@]}"
    echo "FOLDERLIST    = ${FOLDERLIST[@]}"
    echo "FILELISTA     = ${FILELISTA}"        #first element only
    echo "FILELISTU     = ${FILELISTU[@]}"
    echo "FILELISTA[@]  = ${FILELISTA[@]}"
    echo "#FILELISTU[@] = ${#FILELISTU[@]}"
    echo "FOLDERLISTA   = ${FOLDERLISTA}"      #first element only
    echo "MISSING       = ${MISSING}"
fi