# snakemake --cores 1 -s workflows/reference/Snakefile -r -p --use-conda

from os.path import join, basename

# Locations and download metadata of every reference resource, all taken from
# the Snakemake config. Names without a suffix are paths on disk; the
# *_url / *_md5 / *_basename companions hold the configured download url, the
# expected md5 checksum and the file name component of that url.
REF_DIR = config["ref_dir"]
AZURE_KEY = config["azure_public_key"]
DATASET_KEY = config["1000g_public_key"]

_reference = config["reference"]


def _ref_file(resource):
    """Return the configured `file` entry of `resource` joined onto REF_DIR."""
    return join(REF_DIR, _reference[resource]["file"])


# --- paths of the resources inside REF_DIR ---
GRCH37_FASTA = _ref_file("GRCh37_fasta")
GRCH37_FASTA_FAI = _ref_file("GRCh37_fasta_fai")
GENETIC_MAP = _ref_file("GENETIC_MAP")
GENETIC_MAP_GRCH37 = _ref_file("genetic_map_GRCh37")
REF_VCF = _ref_file("vcfRef")
REF_HAPS = _ref_file("refHaps")
LIFT_CHAIN = _ref_file("lift_chain")
CMMAP = _ref_file("cmmap")
SITE_1000GENOME = _ref_file("SITE_1000GENOME")
HAPMAP_PED = _ref_file("hapmap_ped")
HAPMAP_MP = _ref_file("hapmap_mp")
HAPMAP_FAM = _ref_file("hapmap_fam")
AFFYMETRIX_CHIP = _ref_file("affymetrix_chip")
PEDSIM_MAP = _ref_file("pedsim_map")

# --- download metadata, one group per resource ---
GRCH37_FASTA_url = _reference["GRCh37_fasta"]["url"]
GRCH37_FASTA_md5 = _reference["GRCh37_fasta"]["md5"]
GRCH37_FASTA_basename = basename(GRCH37_FASTA_url)

GRCH37_FASTA_FAI_url = _reference["GRCh37_fasta_fai"]["url"]
GRCH37_FASTA_FAI_md5 = _reference["GRCh37_fasta_fai"]["md5"]

GENETIC_MAP_url = _reference["GENETIC_MAP"]["url"]
GENETIC_MAP_md5 = _reference["GENETIC_MAP"]["md5"]

GENETIC_MAP_GRCH37_url = _reference["genetic_map_GRCh37"]["url"]
GENETIC_MAP_GRCH37_md5 = _reference["genetic_map_GRCh37"]["md5"]
# the downloaded archive is stored in REF_DIR under its remote file name
GENETIC_MAP_GRCH37_filename = join(REF_DIR, basename(GENETIC_MAP_GRCH37_url))

REF_VCF_url = _reference["vcfRef"]["url"]
REF_VCF_md5 = _reference["vcfRef"]["md5"]

LIFT_CHAIN_url = _reference["lift_chain"]["url"]
LIFT_CHAIN_basename = basename(LIFT_CHAIN_url)
LIFT_CHAIN_md5 = _reference["lift_chain"]["md5"]

CMMAP_url = _reference["cmmap"]["url"]
CMMAP_basename = basename(CMMAP_url)
CMMAP_md5 = _reference["cmmap"]["md5"]

SITE_1000GENOME_url = _reference["SITE_1000GENOME"]["url"]
SITE_1000GENOME_md5 = _reference["SITE_1000GENOME"]["md5"]
SITE_1000GENOME_basename = basename(SITE_1000GENOME_url)

HAPMAP_FAM_url = _reference["hapmap_fam"]["url"]
HAPMAP_FAM_md5 = _reference["hapmap_fam"]["md5"]

AFFYMETRIX_CHIP_url = _reference["affymetrix_chip"]["url"]
AFFYMETRIX_CHIP_md5 = _reference["affymetrix_chip"]["md5"]

PEDSIM_MAP_url = _reference["pedsim_map"]["url"]
PEDSIM_MAP_basename = basename(PEDSIM_MAP_url)
PEDSIM_MAP_md5 = _reference["pedsim_map"]["md5"]

# Picard is referenced by its configured file name as-is (not joined onto REF_DIR)
PICARD_url = _reference["picard"]["url"]
PICARD = _reference["picard"]["file"]

# autosomes 1..22 as strings, used to expand the per-chromosome paths
CHROMOSOMES = list(map(str, range(1, 23)))

# Target rule. The resources listed in both branches are always requested.
# When phasing or imputation is enabled in the config, the Affymetrix chip
# file, the per-chromosome reference VCFs and the per-chromosome reference
# haplotypes are requested in addition.
need_phase = config['phase']
need_imputation = config['impute']

if need_phase or need_imputation:
    rule all:
        input:
            GRCh37_fasta = GRCH37_FASTA,
            GRCh37_fasta_dict = expand("{ref}.dict", ref=GRCH37_FASTA),
            GENETIC_MAP = GENETIC_MAP,
            genetic_map_GRCh37 = expand(GENETIC_MAP_GRCH37, chrom=CHROMOSOMES),
            cmmap = expand(CMMAP, chrom=CHROMOSOMES),
            lift_chain = LIFT_CHAIN,
            SITE_1000GENOME = SITE_1000GENOME,
            pedsim_map = PEDSIM_MAP,
            # only inputs backed by a constant and a producing rule in this file are
            # listed; referencing an undefined name here aborts parsing with NameError
            affymetrix_chip = AFFYMETRIX_CHIP,
            vcfRef = expand(REF_VCF, chrom=CHROMOSOMES),
            refHaps = expand(REF_HAPS, chrom=CHROMOSOMES)
else:
    rule all:
        input:
            GRCh37_fasta = GRCH37_FASTA,
            GRCh37_fasta_dict = expand("{ref}.dict", ref=GRCH37_FASTA),
            GENETIC_MAP = GENETIC_MAP,
            genetic_map_GRCh37 = expand(GENETIC_MAP_GRCH37, chrom=CHROMOSOMES),
            cmmap = expand(CMMAP, chrom=CHROMOSOMES),
            lift_chain = LIFT_CHAIN,
            SITE_1000GENOME = SITE_1000GENOME,
            pedsim_map = PEDSIM_MAP,
            # the reference panel is not needed without phasing/imputation
            vcfRef = [],
            refHaps = []


# Download the GRCh37 FASTA and its .fai index and verify both against the md5
# sums from the config (DATASET_KEY is appended to each url).
# The FASTA is saved in REF_DIR under the file name of its url and unpacked in
# place with `gzip -d`.
# NOTE(review): this presumably leaves the unpacked file at GRCH37_FASTA, i.e.
# the remote name is expected to be the configured file name plus ".gz" - confirm
# against the config.
# errexit is switched off (`set +e`) because, per the inline comment, gzip exits
# non-zero with "trailing garbage ignored". As a consequence only the explicit
# md5 checks can fail this rule; otherwise the script always ends with `exit 0`.
rule download_GRCh37_fasta:
    output:
        GRCh37_fasta = GRCH37_FASTA,
        GRCh37_fasta_fai = GRCH37_FASTA_FAI
    conda:
        "../../envs/download.yaml"
    log:
        "logs/ref/download_GRCh37_fasta.log"
    shell:
        """
            set +e
            set -x

            wget "{GRCH37_FASTA_url}{DATASET_KEY}" -O {REF_DIR}/{GRCH37_FASTA_basename} --tries 50 |& tee -a {log}           
            sum=$(md5sum {REF_DIR}/{GRCH37_FASTA_basename} | cut -d " " -f1)
            if [ $sum != {GRCH37_FASTA_md5} ]; then
                echo "md5 sum of GRCH37_FASTA is invalid" |& tee -a {log}
                exit 1
            fi
            # gzip exit with "decompression OK, trailing garbage ignored". need to silence
            gzip -d {REF_DIR}/{GRCH37_FASTA_basename} |& tee -a {log}
            
            wget "{GRCH37_FASTA_FAI_url}{DATASET_KEY}" -O {GRCH37_FASTA_FAI} --tries 50 |& tee -a {log}
            sum2=$(md5sum {GRCH37_FASTA_FAI} | cut -d " " -f1)
            if [ $sum2 != {GRCH37_FASTA_FAI_md5} ]; then 
                echo "md5 sum of GRCH37_FASTA_FAI is invalid" |& tee -a {log}
                exit 1
            fi           
            exit 0
        """

# Create the Picard sequence dictionary ("<fasta>.dict") for the GRCh37 FASTA.
# No conda environment is declared for this rule, so `wget` and `java` are
# taken from whatever environment Snakemake itself runs in.
rule create_fasta_dict:
    log:
        "logs/ref/create_fasta_dict.log"
    input:
        GRCh37_fasta = GRCH37_FASTA
    output:
        expand("{ref}.dict", ref=GRCH37_FASTA)
    shell:
        """
            # fetch the Picard jar (resumable, up to 50 attempts); it is expected to land at PICARD
            wget --tries 50 -c {PICARD_url} |& tee -a {log}

            java -jar {PICARD} CreateSequenceDictionary -O {output} -R {input.GRCh37_fasta} |& tee -a {log}

            # the jar is only needed for this step
            rm {PICARD}
        """

# Download the genetic map to GENETIC_MAP and verify it against the configured md5.
rule download_GENETIC_MAP:
    output:
        GENETIC_MAP = GENETIC_MAP
    log:
        "logs/ref/download_GENETIC_MAP.log"
    conda:
        "../../envs/download.yaml"
    shell:
        """
            set -ex

            # resumable download, up to 50 attempts; wget output is appended to the rule log
            wget --tries 50 -c -O {GENETIC_MAP} {GENETIC_MAP_url} |& tee -a {log}

            # compare the checksum of the downloaded file with the expected one
            map_md5=$(md5sum {GENETIC_MAP} | cut -d " " -f1)
            if [ $map_md5 != {GENETIC_MAP_md5} ]; then
                echo "md5 sum of GENETIC_MAP is invalid" |& tee -a {log}
                exit 1
            fi
        """

# Download the GRCh37 genetic map archive, verify its md5 and unpack it
# (without README.txt) into REF_DIR/genetic_map_GRCh37. One rule run produces
# the files of all chromosomes in CHROMOSOMES.
rule download_genetic_map_GRCh37:
    output:
        genetic_map_GRCh37 = expand(GENETIC_MAP_GRCH37, chrom=CHROMOSOMES)
    conda:
        "../../envs/download.yaml"
    log:
        "logs/ref/download_genetic_map_GRCh37.log"
    shell:
        """
            set -e
            set -x

            wget {GENETIC_MAP_GRCH37_url} --tries 50 -O {GENETIC_MAP_GRCH37_filename} |& tee -a {log}
            sum=$(md5sum {GENETIC_MAP_GRCH37_filename} | cut -d " " -f1)
            # quoted so that an empty checksum is reported as a mismatch instead of a test syntax error
            if [ "$sum" != "{GENETIC_MAP_GRCH37_md5}" ]; then
                echo "md5 sum of genetic_map_GRCh37 is invalid" |& tee -a {log}
                exit 1
            fi
            # tar -C does not create its target directory; create it explicitly instead of
            # relying on the declared outputs living in exactly this folder
            mkdir -p {REF_DIR}/genetic_map_GRCh37 |& tee -a {log}
            tar -zxvf {GENETIC_MAP_GRCH37_filename} -C {REF_DIR}/genetic_map_GRCh37 --exclude=README.txt |& tee -a {log}
            rm {GENETIC_MAP_GRCH37_filename} |& tee -a {log}
            # consider postprocessing? https://github.com/arq5x/gemini/blob/master/gemini/annotation_provenance/make-recombination.sh
        """

# Download the reference BCF of one chromosome, verify its md5, assign every
# variant the id CHROM:POS:REF:FIRST_ALT and index the result.
# NOTE(review): `threads: workflow.cores` reserves all cores although the
# commands do not use {threads} - presumably to serialise these downloads; confirm.
rule download_vcfRef:
    output:
        vcfRef = REF_VCF,
        vcfRef_idx = REF_VCF + '.csi'
    wildcard_constraints:
        chrom="((2[0-2])|(1[0-9])|([1-9]))" # all numbers from 1 to 22
    conda:
        "../../envs/download.yaml"
    threads:
        workflow.cores
    log:
        "logs/ref/download_vcfRef{chrom}.log"
    shell:
        """
            set -e
            set -x
            # REF_VCF_md5 becomes a bash array; the entry for a chromosome sits at index chrom - 1
            md5array=({REF_VCF_md5})
            mkdir -p {REF_DIR}/1000genome/bcf/
            # keep this variable: presumably the configured url refers to it as $chrom
            chrom={wildcards.chrom}
            wget "{REF_VCF_url}{DATASET_KEY}" -O {REF_DIR}/1000genome/bcf/1000genome_chr{wildcards.chrom}.tmp.bcf --tries 50 -c |& tee -a {log}
            sum=$(md5sum {REF_DIR}/1000genome/bcf/1000genome_chr{wildcards.chrom}.tmp.bcf | cut -d " " -f1)
            # quoted so that an empty value is reported as a mismatch instead of a test syntax error
            if [ "$sum" != "${{md5array[{wildcards.chrom}-1]}}" ]; then
                echo "md5 sum of vcfRef is invalid" |& tee -a {log}
                exit 1
            fi
            # write real compressed BCF (-O b) so that the content matches the .bcf file name
            bcftools annotate --set-id '%CHROM:%POS:%REF:%FIRST_ALT' {REF_DIR}/1000genome/bcf/1000genome_chr{wildcards.chrom}.tmp.bcf -O b -o {REF_DIR}/1000genome/bcf/1000genome_chr{wildcards.chrom}.bcf |& tee -a {log}
            bcftools index {REF_DIR}/1000genome/bcf/1000genome_chr{wildcards.chrom}.bcf |& tee -a {log}
            rm {REF_DIR}/1000genome/bcf/1000genome_chr{wildcards.chrom}.tmp.bcf
        """

# Build the Minimac reference haplotypes of one chromosome from its reference BCF.
# The BCF is first filtered into a temporary bgzipped VCF, which Minimac3
# processes with {threads} cpus and which is removed afterwards.
rule make_refHaps:
    input:
        vcfRef = REF_VCF
    output:
        refHaps = REF_HAPS
    wildcard_constraints:
        chrom="((2[0-2])|(1[0-9])|([1-9]))" # all numbers from 1 to 22
    threads:
        workflow.cores
    log:
        "logs/ref/make_refHaps{chrom}.log"
    conda:
        "../../envs/download.yaml"
    shell:
        """
            set -ex

            # keep sites with single-base REF and ALT and an allele count of at least 251
            bcftools view -O z -o {REF_DIR}/1000genome/chr{wildcards.chrom}.ac251.vcf.gz --min-ac 251 -i 'strlen(REF)=1 & strlen(ALT)=1' {REF_DIR}/1000genome/bcf/1000genome_chr{wildcards.chrom}.bcf |& tee -a {log}

            Minimac3-omp --cpus {threads} --refHaps {REF_DIR}/1000genome/chr{wildcards.chrom}.ac251.vcf.gz --processReference --prefix {REF_DIR}/Minimac/{wildcards.chrom}.1000g.Phase3.v5.With.Parameter.Estimates |& tee -a {log}

            # the filtered VCF was only an intermediate
            rm {REF_DIR}/1000genome/chr{wildcards.chrom}.ac251.vcf.gz |& tee -a {log}
        """


# Download the gzipped liftover chain to LIFT_CHAIN, verify its md5 and rewrite
# it with every "chr" removed. The unpacked intermediate is hard-coded as
# REF_DIR/hg38ToHg19.over.chain, so LIFT_CHAIN is expected to be that name plus ".gz".
rule download_lift_chain:
    output:
        lift_chain = LIFT_CHAIN
    log:
        "logs/ref/download_lift_chain.log"
    conda:
        "../../envs/download.yaml"
    shell:
        """
            set -ex

            wget --tries 50 -O {LIFT_CHAIN} {LIFT_CHAIN_url} |& tee -a {log}
            chain_md5=$(md5sum {LIFT_CHAIN} | cut -d " " -f1)
            if [ $chain_md5 != {LIFT_CHAIN_md5} ]; then
                echo "md5 sum of LIFT_CHAIN is invalid" |& tee -a {log}
                exit 1
            fi

            # unpack, strip the chr prefixes in place, then compress back to the declared output
            gzip -d {LIFT_CHAIN} |& tee -a {log}
            sed -e s/chr//g -i {REF_DIR}/hg38ToHg19.over.chain |& tee -a {log}
            gzip -c {REF_DIR}/hg38ToHg19.over.chain > {LIFT_CHAIN}
            rm {REF_DIR}/hg38ToHg19.over.chain |& tee -a {log}
        """

# Download the zip archive with the per-chromosome cM maps into REF_DIR, verify
# its md5, unpack it there and delete the archive. One rule run produces the
# files of all chromosomes in CHROMOSOMES.
rule download_cmmap:
    output:
        cmmap = expand(CMMAP, chrom=CHROMOSOMES)
    log:
        "logs/ref/download_cmmap.log"
    conda:
        "../../envs/download.yaml"
    shell:
        """
            set -ex

            # found only processed maps from dropbox
            # TODO: need to find original and process it accordingly
            wget --tries 50 -c -P {REF_DIR} {CMMAP_url} |& tee -a {log}

            archive_md5=$(md5sum {REF_DIR}/{CMMAP_basename} | cut -d " " -f1)
            if [ $archive_md5 != {CMMAP_md5} ]; then
                echo "md5 sum of cmmap is invalid" |& tee -a {log}
                exit 1
            fi

            unzip {REF_DIR}/{CMMAP_basename} -d {REF_DIR} |& tee -a {log}
            rm {REF_DIR}/{CMMAP_basename} |& tee -a {log}
        """

# Download the 1000 Genomes sites file, verify its md5 and reduce it to a
# headerless plain-text VCF of biallelic SNPs at SITE_1000GENOME; the raw
# download is deleted afterwards.
rule download_SITE_1000GENOME:
    output:
        SITE_1000GENOME=SITE_1000GENOME
    log:
        "logs/ref/download_SITE_1000GENOME.log"
    conda:
        "../../envs/download.yaml"
    shell:
        """
            set -ex

            mkdir -p {REF_DIR}/1000genome/allele_info |& tee -a {log}
            wget --tries 50 -O "{REF_DIR}/1000genome/allele_info/{SITE_1000GENOME_basename}" "{SITE_1000GENOME_url}{DATASET_KEY}" |& tee -a {log}

            sites_md5=$(md5sum "{REF_DIR}/1000genome/allele_info/{SITE_1000GENOME_basename}" | cut -d " " -f1)
            if [ $sites_md5 != {SITE_1000GENOME_md5} ]; then
                echo "md5 sum of SITE_1000GENOME is invalid" |& tee -a {log}
                exit 1
            fi

            # exactly two alleles, SNPs only, sites with missing genotypes excluded, no header
            bcftools view --no-header -m2 -M2 -v snps -g ^miss -Ov -o {SITE_1000GENOME} "{REF_DIR}/1000genome/allele_info/{SITE_1000GENOME_basename}" |& tee -a {log}
            rm "{REF_DIR}/1000genome/allele_info/{SITE_1000GENOME_basename}" |& tee -a {log}
        """

# Download the HapMap .fam file to HAPMAP_FAM and verify it against the configured md5.
rule download_hapmap_fam:
    output:
        hapmap_fam=HAPMAP_FAM
    log:
        "logs/ref/download_hapmap_fam.log"
    conda:
        "../../envs/download.yaml"
    shell:
        """
            # up to 50 download attempts; wget output is appended to the rule log
            wget --tries 50 -O {HAPMAP_FAM} {HAPMAP_FAM_url} |& tee -a {log}

            fam_md5=$(md5sum {HAPMAP_FAM} | cut -d " " -f1)
            if [ $fam_md5 != {HAPMAP_FAM_md5} ]; then
                echo "md5 sum of HAPMAP_FAM is invalid" |& tee -a {log}
                exit 1
            fi
        """

# Download the Affymetrix chip file to AFFYMETRIX_CHIP, verify its md5 and
# fetch the matching .tbi index next to it.
rule download_affymetrix_chip:
    output:
        affymetrix_chip=AFFYMETRIX_CHIP
    log:
        "logs/ref/download_affymetrix_chip.log"
    conda:
        "../../envs/download.yaml"
    shell:
        """
            mkdir -p {REF_DIR}/1000genome/affymetrix_chip |& tee -a {log}
            wget --tries 50 -O {AFFYMETRIX_CHIP} "{AFFYMETRIX_CHIP_url}{DATASET_KEY}" |& tee -a {log}

            chip_md5=$(md5sum {AFFYMETRIX_CHIP} | cut -d " " -f1)
            if [ $chip_md5 != {AFFYMETRIX_CHIP_md5} ]; then
                echo "md5 sum of AFFYMETRIX_CHIP is invalid" |& tee -a {log}
                exit 1
            fi

            # the index is fetched as well, but it is neither checksummed nor declared as an output
            wget --tries 50 -c -O {AFFYMETRIX_CHIP}.tbi "{AFFYMETRIX_CHIP_url}.tbi{DATASET_KEY}" |& tee -a {log}
        """

# Download the genetic map archive into REF_DIR, verify its md5, unpack it and
# merge the per-chromosome male/female tables from Refined_genetic_map_b37 into
# one tab-separated file at PEDSIM_MAP with the header
# "#chr pos male_cM female_cM".
# For chromosomes 1-22 the male and female files are pasted side by side; the
# first line of each pasted table is skipped, rows are kept only where columns
# 2 and 6 are equal (presumably the positions of the two files - confirm), the
# columns 1, 2, 4 and 8 are written and a leading "chr" is stripped.
# NOTE(review): the shell block is a regular (non-raw) Python string, so \t, \n
# and the trailing backslashes are presumably interpreted by Python before bash
# sees the command - the outcome should be the same, but confirm before editing.
# NOTE(review): only the archive is removed at the end; the unpacked
# Refined_genetic_map_b37 directory stays in REF_DIR.
rule download_pedsim_map:
    output:
        pedsim_map=PEDSIM_MAP
    conda:
        "../../envs/download.yaml"
    log:
        "logs/ref/download_pedsim_map.log"
    shell:
        """
            wget {PEDSIM_MAP_url} -P {REF_DIR} --tries 50 -c  |& tee -a {log}
            sum=$(md5sum {REF_DIR}/{PEDSIM_MAP_basename} | cut -d " " -f1)
            if [ $sum != {PEDSIM_MAP_md5} ]; then
                echo "md5 sum of PEDSIM_MAP is invalid" |& tee -a {log}
                exit 1
            fi
            tar xvzf {REF_DIR}/{PEDSIM_MAP_basename} -C {REF_DIR} |& tee -a {log}
            printf "#chr\tpos\tmale_cM\tfemale_cM\n" > {PEDSIM_MAP}
            for chr in {{1..22}}; do
            paste {REF_DIR}/Refined_genetic_map_b37/male_chr$chr.txt {REF_DIR}/Refined_genetic_map_b37/female_chr$chr.txt \
                | awk -v OFS="\t" 'NR > 1 && $2 == $6 {{print $1,$2,$4,$8}}' \
                | sed 's/^chr//' >> {PEDSIM_MAP};
            done
            rm -f {REF_DIR}/{PEDSIM_MAP_basename} |& tee -a {log}
        """
