from os.path import join

# Workflow-level settings. `config` is the Snakemake configuration mapping.
flow = config["flow"]  # selects which relatives rule file is included further down
# Not referenced by the rules in this file; presumably read by the included
# rule files — TODO confirm.
is_client = False
# Optional key: falls back to False when the config does not provide it.
use_simulated_ibd = config.get("use_simulated_ibd", False)

# Directory that holds every reference resource listed below.
REF_DIR = config["ref_dir"]


def _reference_file(entry):
    """Return the path inside REF_DIR of the file configured for reference *entry*.

    The file name is read from config["reference"][entry]["file"].
    """
    return join(REF_DIR, config["reference"][entry]["file"])


# One constant per reference resource; each key must exist in config["reference"].
GRCH37_FASTA = _reference_file("GRCh37_fasta")
GENETIC_MAP = _reference_file("GENETIC_MAP")
GENETIC_MAP_GRCH37 = _reference_file("genetic_map_GRCh37")
REF_VCF = _reference_file("vcfRef")
REF_HAPS = _reference_file("refHaps")
LIFT_CHAIN = _reference_file("lift_chain")
CMMAP = _reference_file("cmmap")
SITE_1000GENOME = _reference_file("SITE_1000GENOME")
HAPMAP_PED = _reference_file("hapmap_ped")
HAPMAP_MP = _reference_file("hapmap_mp")
HAPMAP_FAM = _reference_file("hapmap_fam")
AFFYMETRIX_CHIP = _reference_file("affymetrix_chip")
PEDSIM_MAP = _reference_file("pedsim_map")


# Sample identifiers "0" through "9".
SAMPLES = list(map(str, range(10)))
# Chromosome labels "1" through "22"; used to expand per-chromosome file names.
CHROMOSOMES = list(map(str, range(1, 23)))

# Extensions of the three files that make up a PLINK binary fileset.
PLINK_FORMATS = ["bed", "bim", "fam"]

# Pipeline switches copied from the workflow config; all four keys are required.
# None of them is referenced by the rules in this file; presumably the included
# rule files read them — TODO confirm.
assembly = config["assembly"]
need_phase = config["phase"]
need_imputation = config["impute"]
need_remove_imputation = config["remove_imputation"]

# Preferred memory budget, in GB, for memory-intensive jobs.
_IDEAL_LARGE_MEM_GB = 20


def _mem_gb_for_ram_hungry_jobs():
    """Return the memory, in GB, to request for memory-intensive jobs.

    The result is the preferred budget (_IDEAL_LARGE_MEM_GB) capped by the
    configured config["mem_gb"].
    """
    configured_gb = config["mem_gb"]
    if configured_gb < _IDEAL_LARGE_MEM_GB:
        return configured_gb
    return _IDEAL_LARGE_MEM_GB


#CHIP_DATA_LINK = 'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/supporting/hd_genotype_chip/ALL.chip.omni_broad_sanger_combined.20140818.snps.genotypes.vcf.gz'
#CHIP_INDEX_LINK = 'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/supporting/hd_genotype_chip/ALL.chip.omni_broad_sanger_combined.20140818.snps.genotypes.vcf.gz.tbi'

# Aggregate target rule: it produces nothing itself and only lists the final
# result files (relatives table, accuracy plot, IBD overlap table) as inputs.
rule all:
    input:
        "results/relatives.tsv",
        "results/accuracy.png",
        "results/overlaps.tsv"


# Per-chromosome intersection of the reference VCF with the chip genotypes.
# Only sites present in both inputs are kept (-n=2) and the records written are
# those of the first file on the command line, the reference VCF (-w1).
rule intersect:
    input:
        hd_genotype_chip=AFFYMETRIX_CHIP,
        vcfRef=REF_VCF,
    output:
        "pedsim/phased/chr{chrom}.phased.vcf.gz"
    conda:
        "../../envs/bcftools.yaml"
    shell:
        """
            bcftools isec -n=2 -w1 -r {wildcards.chrom} -O z -o {output} {input.vcfRef} {input.hd_genotype_chip}
        """


# Concatenate the per-chromosome VCFs into a single background VCF, restricted
# to the samples listed in config['sim_samples_file'].
rule merge_background:
    input:
        data=expand("pedsim/phased/chr{chrom}.phased.vcf.gz", chrom=CHROMOSOMES),
        eu=config['sim_samples_file']
    output:
        "pedsim/phased/background.vcf.gz"
    params:
        # Scratch file listing the VCFs handed to `bcftools concat -f`; it is
        # rewritten from scratch on every run and is not a declared output.
        list="pedsim/phased/phased.merge.list"
    conda:
        "../../envs/bcftools.yaml"
    shell:
        """
            # for now just skip empty files
            true > {params.list} && \
            for i in {input.data}; do
                # quote the path so it is never word-split or glob-expanded
                if [ -s "$i" ]
                then
                    echo "$i" >> {params.list}
                fi
            done
            bcftools concat -f {params.list} | bcftools view --force-samples --samples-file {input.eu} -O z -o {output}
        """


# Run pedsim on the merged background VCF. All three declared outputs share the
# `pedsim/simulated/data` prefix passed via -o.
rule simulate:
    input:
        bg=rules.merge_background.output,
        _map=PEDSIM_MAP,
        _def=config["sim_params_file"],
        intf="params/nu_p_campbell.tsv",
    output:
        vcf="pedsim/simulated/data.vcf.gz",
        seg="pedsim/simulated/data.seg",
        fam="pedsim/simulated/data-everyone.fam",
    params:
        # output prefix; pedsim appends the suffixes listed under `output`
        prefix="pedsim/simulated/data"
    conda:
        "../../envs/ped-sim.yaml"
    shell:
        """
            pedsim -d {input._def} -m {input._map} -i {input.bg} -o {params.prefix} --intf {input.intf} --fam
        """

# Round-trip the simulated VCF through a PLINK binary fileset and back to a
# bgzipped VCF. NOTE(review): the rule name suggests the aim is to drop or
# shuffle the phase information from the simulation — confirm.
rule shuffle_phase:
    input:
        vcf=rules.simulate.output.vcf
    output:
        vcf="pedsim/simulated/data_shuffled.vcf.gz",
        # intermediate bed/bim/fam fileset, removed once no longer needed
        plink=temp(expand("plink/data_shuffled.{ext}", ext=PLINK_FORMATS))
    params:
        # --out prefixes; plink appends the file extensions itself
        plink="plink/data_shuffled",
        vcf="pedsim/simulated/data_shuffled"
    conda:
        "../../envs/plink.yaml"
    shell:
        """
            plink --vcf {input.vcf} --make-bed --out {params.plink}
            plink --bfile {params.plink} --export vcf bgz --out {params.vcf}
        """

# Run scripts/postprocess.py on the shuffled VCF and the simulated .fam file to
# produce the re-headed kinship and fam files plus `input.vcf.gz`.
# NOTE(review): `input.vcf.gz` is presumably the entry point of the included
# preprocessing rules — confirm.
rule postprocess:
    input:
        vcf=rules.shuffle_phase.output.vcf,
        # Refer to the producing rule's declared output instead of repeating
        # its path, so the two rules cannot drift apart.
        fam=rules.simulate.output.fam
    output:
        kin='pedsim/simulated/reheaded_data.kinship',
        vcf='input.vcf.gz',
        fam='pedsim/simulated/reheaded_data.fam'
    conda:
        "../../envs/postprocess.yaml"
    script:
        "../../scripts/postprocess.py"

include: "../../rules/preprocessing.smk"

# Include the relatives-detection rules that match the configured flow.
if flow == 'ibis':
    include: "../../rules/relatives_ibis.smk"
elif flow == 'germline':
    include: "../../rules/relatives.smk"
elif flow == 'ibis_king':
    include: "../../rules/relatives_ibis_king.smk"
else:
    # Fail fast with a clear message: without one of the includes above no
    # relatives rules are loaded, and the workflow would only break later with
    # an error that does not mention the flow setting.
    raise ValueError(
        "Unsupported flow {!r}; expected one of 'ibis', 'germline', 'ibis_king'".format(flow)
    )

# Run scripts/evaluate.py on the detected relatives and the post-processed
# .fam file to produce the accuracy plots and the updated relatives table.
rule evaluate_accuracy:
    input:
        rel="results/relatives.tsv",
        fam=rules.postprocess.output.fam,
    output:
        accuracy="results/accuracy.png",
        pr="results/precision_recall.png",
        conf_matrix="results/confusion_matrix.png",
        updated_rel="results/updated_relatives.tsv",
        pedigree_plot="results/pedigree_plot.png",
    params:
        source="both",
        # path is only supplied for the ibis flow; None for every other flow
        po_fs_plot="results/po_fs_plot.png" if flow == "ibis" else None,
    log:
        "logs/evaluation/accuracy.log"
    conda:
        "../../envs/evaluation.yaml"
    script:
        "../../scripts/evaluate.py"

# Run scripts/split_map.py on the preprocessed .bim file; the rule declares one
# `cm2/chr<N>.cm.map` output per entry of CHROMOSOMES.
rule split_map_for_ibd:
    input:
        bim="preprocessed/data.bim"
    output:
        expand("cm2/chr{chrom}.cm.map", chrom=CHROMOSOMES)
    params:
        # directory part of the output paths above
        cm_dir="cm2"
    conda:
        "../../envs/evaluation.yaml"
    script:
        "../../scripts/split_map.py"

# Run scripts/evaluate_ibd.py on the pedsim segment file, the detected IBD
# segments and the per-chromosome cM maps.
# `aggregate_input` is not defined in this file; presumably it comes from the
# included relatives rule file — TODO confirm.
rule evaluate_ibd:
    input:
        pedsim=rules.simulate.output.seg,
        ibd=aggregate_input,  # only segments of first up to 2K samples
        cm=expand("cm2/chr{chrom}.cm.map", chrom=CHROMOSOMES),
    output:
        overlap="results/overlaps.tsv",
        seg_accuracy="results/seg_accuracy.png",
        total_len_accuracy="results/total_len_accuracy.png",
    params:
        cm_dir="cm2",
        is_rapid_ibd=False,
    conda:
        "../../envs/evaluation.yaml"
    script:
        "../../scripts/evaluate_ibd.py"