# Aggregate target listing every Auspice JSON this workflow produces.
# When this is the first rule in the Snakefile it is also the default target.
# The input names are reused by the export rules (rules.all.input.<name>) to
# declare their outputs, so renaming an entry here requires updating them too.
rule all:
    input:
        auspice_v1_tree = "auspice/tb_tree.json",
        auspice_v1_meta = "auspice/tb_meta.json",
        auspice_v1_seq = "auspice/tb_seq.json",
        auspice_noauth_tree = "auspice/tb-noauth_tree.json",
        auspice_noauth_meta = "auspice/tb-noauth_meta.json",
        auspice_v2 = "auspice/tb_main.json"

# Paths of the input data files shared by the rules below.
# Tuning parameters that belong to a single rule are defined in that rule's
# own `params` section instead.

# This rule has no input, output or command; it is only used as a namespace
# whose params hold the data file locations.
rule config:
    params:
        seq = "data/tb.vcf.gz",
        meta = "data/meta.tsv",
        no_auth_meta = "data/meta-noAuth.tsv",
        exclude = "data/dropped_strains.txt",
        mask = "data/Locus_to_exclude_Mtb.bed",
        ref = "data/ref.fasta",
        drms = "data/DRMs.txt",
        sites = "data/drm_sites.txt",
        generef = "data/Mtb_H37Rv_NCBI_Annot.gff",
        genes = "data/genes.txt",
        colors = "data/color.tsv",
        config = "data/config.json",
        no_auth_config = "data/no-auth-config.json",
        geo_info = "data/lat_longs.tsv",
        clades = "data/clades.tsv"

# Alias so rules can write config.x rather than rules.config.params.x.
# NOTE(review): this rebinds the module-level name `config`, which Snakemake
# also uses for its built-in configuration dictionary - confirm nothing in the
# workflow relies on values passed via --config / --configfile.
config = rules.config.params
# End of config definition.

# Subsample the input VCF with `augur filter`: strains listed in the exclude
# file are dropped and the remainder is subsampled per `group_by` metadata
# value using `sequences_per_group`.
rule filter:
    input:
        seq = config.seq,
        meta = config.meta,
        exclude = config.exclude
    output:
        "results/filtered.vcf.gz"
    params:
        # Only settings that are actually passed to the command are listed here.
        sequences_per_group = 10,
        group_by = "year"
    shell:
        """
        augur filter --sequences {input.seq} --metadata {input.meta} \
            --output {output} \
            --exclude {input.exclude} \
            --group-by {params.group_by} \
            --sequences-per-group {params.sequences_per_group} \
            --no-probabilistic-sampling
        """

# Run `augur mask` on the filtered VCF using the BED file from config.mask.
rule mask:
    input:
        seq = rules.filter.output,
        mask = config.mask
    output:
        "results/masked.vcf.gz"
    shell:
        # Adjacent literals are concatenated into a single command line.
        "augur mask"
        " --sequences {input.seq}"
        " --output {output}"
        " --mask {input.mask}"

# Build the raw tree from the masked VCF with `augur tree`, passing the
# reference FASTA and the sites file as --exclude-sites.
rule tree:
    input:
        aln = rules.mask.output,
        ref = config.ref,
        sites = config.sites
    output:
        "results/tree_raw.nwk"
    params:
        method = "fasttree"
    shell:
        # Adjacent literals are concatenated into a single command line.
        "augur tree"
        " --exclude-sites {input.sites}"
        " --alignment {input.aln}"
        " --vcf-reference {input.ref}"
        " --output {output}"
        " --method {params.method}"

# Refine the raw tree into a time tree with `augur refine`.
# Writes the refined Newick tree and a node-data JSON (branch lengths).
rule refine:
    input:
        tree = rules.tree.output,
        aln = rules.mask.output,
        metadata = config.meta,
        ref = config.ref
    output:
        tree = "results/tree.nwk",
        node_data = "results/branch_lengths.json",
    params:
        # All numeric settings passed to `augur refine` are kept together here
        # so they can be adjusted without editing the command itself.
        root = 'min_dev',
        clock_rate = 1e-7,
        clock_std = 3e-8,
        coalescent = 0.0001
    shell:
        """
        augur refine --tree {input.tree} --alignment {input.aln} --metadata {input.metadata} \
            --output-tree {output.tree} --output-node-data {output.node_data} --vcf-reference {input.ref} \
            --timetree --root {params.root} --coalescent {params.coalescent} \
            --clock-rate {params.clock_rate} --clock-std-dev {params.clock_std}
        """

# Run `augur ancestral` on the refined tree and the masked VCF.
# Produces nucleotide mutation node data (JSON) and a VCF written via
# --output-vcf, both of which are consumed by later rules.
rule ancestral:
    input:
        tree = rules.refine.output.tree,
        alignment = rules.mask.output,
        ref = config.ref
    output:
        nt_data = "results/nt_muts.json",
        vcf_out = "results/nt_muts.vcf"
    params:
        # Value passed to --inference.
        inference = "joint"
    shell:
        """
        augur ancestral --tree {input.tree} --alignment {input.alignment} \
            --output-node-data {output.nt_data} --inference {params.inference} \
            --output-vcf {output.vcf_out} --vcf-reference {input.ref}
        """

# Run `augur translate` for the genes listed in config.genes, using the VCF
# written by the ancestral rule and the GFF annotation as reference sequence.
# Produces amino-acid mutation node data plus a translations VCF and its
# matching reference FASTA.
rule translate:
    input:
        tree = rules.refine.output.tree,
        ref = config.ref,
        gene_ref = config.generef,
        vcf = rules.ancestral.output.vcf_out,
        genes = config.genes
    output:
        aa_data = "results/aa_muts.json",
        vcf_out = "results/translations.vcf",
        fasta_out = "results/translations_reference.fasta"
    shell:
        """
        augur translate --tree {input.tree} --genes {input.genes} --vcf-reference {input.ref} \
            --ancestral-sequences {input.vcf} --output-node-data {output.aa_data} --reference-sequence {input.gene_ref} \
            --alignment-output {output.vcf_out} --vcf-reference-output {output.fasta_out}
        """

# Genes for which per-gene FASTA files can be reconstructed.
# To build all of them, run 'snakemake reconstruct_all'.
genes = ["cycA", "esxU", "rpoC", "Rv2752c"]

# Run `augur reconstruct-sequences` for a single gene; the gene name is taken
# from the {gene} wildcard in the requested output path.
rule reconstruct:
    input:
        tree = rules.refine.output.tree,
        aa_muts = rules.translate.output.aa_data,
        vcf_translate_ref = rules.translate.output.fasta_out
    params:
        # Forwards the wildcard value so the command can use {params.gene}.
        gene = "{gene}"
    output:
        "results/translate_{gene}.fasta"
    shell:
        """
        augur reconstruct-sequences --tree {input.tree} --gene {params.gene} \
            --mutations {input.aa_muts} \
            --vcf-aa-reference {input.vcf_translate_ref} --output {output}
        """

# Aggregate target: requests one reconstructed FASTA per entry in `genes`.
rule reconstruct_all:
    input:
        expand("results/translate_{gene}.fasta", gene=genes)

# Run `augur clades` on the refined tree, passing both the nucleotide and
# amino-acid mutation node data and the clade definitions from config.clades.
# Writes the resulting clade node data JSON.
rule clades:
    input:
        tree = rules.refine.output.tree,
        aa_muts = rules.translate.output.aa_data,
        nuc_muts = rules.ancestral.output.nt_data,
        clades = config.clades
    output:
        clade_data = "results/clades.json"
    shell:
        """
        augur clades --tree {input.tree} \
            --mutations {input.nuc_muts} {input.aa_muts} \
            --output-node-data {output.clade_data} --clades {input.clades}
        """

# Run `augur traits` on the refined tree for the metadata columns listed in
# params.traits and write the result as node data JSON.
rule traits:
    input:
        tree = rules.refine.output.tree,
        meta = config.meta
    output:
        "results/traits.json"
    params:
        # Space-separated metadata column names passed to --columns.
        traits = "location cluster"
    shell:
        # Adjacent literals are concatenated into a single command line.
        "augur traits"
        " --tree {input.tree}"
        " --metadata {input.meta}"
        " --columns {params.traits}"
        " --output-node-data {output}"

# Export the v1 (tree / meta / sequence) Auspice JSONs with `augur export v1`
# and validate the tree and meta files afterwards.
# Output paths are taken from rule all so the two stay in sync.
rule export_v1:
    message: "Exporting data files for auspice using nextflu compatible schema"
    input:
        tree = rules.refine.output.tree,
        metadata = config.meta,
        branch_lengths = rules.refine.output.node_data,
        traits = rules.traits.output,
        nt_muts = rules.ancestral.output.nt_data,
        aa_muts = rules.translate.output.aa_data,
        color_defs = config.colors,
        config = config.config,
        geo_info = config.geo_info,
        clades = rules.clades.output.clade_data,
        ref = config.ref,
        translations = rules.translate.output.fasta_out
    output:
        tree = rules.all.input.auspice_v1_tree,
        meta = rules.all.input.auspice_v1_meta,
        seq = rules.all.input.auspice_v1_seq
    shell:
        """
        augur export v1 \
            --tree {input.tree} \
            --metadata {input.metadata} \
            --reference {input.ref} --reference-translations {input.translations} \
            --node-data {input.branch_lengths} {input.traits} {input.aa_muts} {input.nt_muts} {input.clades} \
            --auspice-config {input.config} \
            --colors {input.color_defs} \
            --lat-longs {input.geo_info} \
            --output-tree {output.tree} \
            --output-meta {output.meta} \
            --output-sequence {output.seq}
        augur validate export-v1 {output.meta} {output.tree}
        """

# Same export as export_v1 but using the no-auth metadata and Auspice config
# (config.no_auth_meta / config.no_auth_config). Only the tree and meta JSONs
# are written; no sequence JSON is produced by this rule.
rule no_auth_export_v1:
    message: "Exporting data files for auspice using nextflu compatible schema"
    input:
        tree = rules.refine.output.tree,
        metadata = config.no_auth_meta,
        branch_lengths = rules.refine.output.node_data,
        traits = rules.traits.output,
        nt_muts = rules.ancestral.output.nt_data,
        aa_muts = rules.translate.output.aa_data,
        color_defs = config.colors,
        config = config.no_auth_config,
        geo_info = config.geo_info,
        clades = rules.clades.output.clade_data,
        ref = config.ref,
        translations = rules.translate.output.fasta_out
    output:
        tree = rules.all.input.auspice_noauth_tree,
        meta = rules.all.input.auspice_noauth_meta
    shell:
        """
        augur export v1 \
            --tree {input.tree} \
            --metadata {input.metadata} \
            --reference {input.ref} --reference-translations {input.translations} \
            --node-data {input.branch_lengths} {input.traits} {input.aa_muts} {input.nt_muts} {input.clades} \
            --auspice-config {input.config} \
            --colors {input.color_defs} \
            --lat-longs {input.geo_info} \
            --output-tree {output.tree} \
            --output-meta {output.meta}
        augur validate export-v1 {output.meta} {output.tree}
        """


# Export the single-file v2 Auspice JSON with `augur export v2`.
# Title, maintainers and geo resolutions are supplied on the command line
# from params rather than from an Auspice config file.
rule export_v2:
    message: "Exporting data files for auspice using nextstrain schema v2"
    input:
        tree = rules.refine.output.tree,
        metadata = config.meta,
        branch_lengths = rules.refine.output.node_data,
        traits = rules.traits.output,
        nt_muts = rules.ancestral.output.nt_data,
        aa_muts = rules.translate.output.aa_data,
        color_defs = config.colors,
        geo_info = config.geo_info,
        clades = rules.clades.output.clade_data,
        # NOTE(review): `ref` and `translations` are not referenced by the
        # command below; they only act as dependencies - confirm whether they
        # are still needed.
        ref = config.ref,
        translations = rules.translate.output.fasta_out
    output:
        main = rules.all.input.auspice_v2
    params:
        # Values carry their own single quotes so the shell passes each one
        # as a single argument.
        title = '\'TB outbreak in Nunavik, Canada\'',
        maints = "'Emma Hodcroft <https://neherlab.org/emma-hodcroft.html>' 'John Brown <http://www.google.com>'",
        geo = 'location',
        # extra_traits = 'host',
        # panels = 'tree map entropy frequencies'
    shell:
        """
        augur export v2 \
            --tree {input.tree} \
            --metadata {input.metadata} \
            --node-data {input.branch_lengths} {input.traits} {input.aa_muts} {input.nt_muts} {input.clades} \
            --colors {input.color_defs} \
            --lat-longs {input.geo_info} \
            --output {output.main} \
            --title {params.title} \
            --maintainers {params.maints} \
            --geo-resolutions {params.geo}
        """

# Delete the generated output directories (intermediate results and the
# exported Auspice JSONs). The directory names are the positional params,
# which are expanded space-separated into both the message and the command.
rule clean:
    message: "Removing directories: {params}"
    params:
        "results",
        "auspice"
    shell:
        "rm -rfv {params}"
