## Copyright Broad Institute, 2017
##
## This WDL workflow runs GATK4 Mutect 2 on a single tumor-normal pair or on a single tumor sample,
## and performs additional filtering and functional annotation tasks.
##
## Main requirements/expectations :
## - One analysis-ready BAM file (and its index) for each sample
##
## Description of inputs:
##
## ** Runtime ** 
## gatk_docker, oncotator_docker: docker images to use for GATK 4 Mutect2 and for Oncotator
## preemptible_attempts: how many preemptions to tolerate before switching to a non-preemptible machine (on Google)
## max_retries: how many times to retry failed tasks -- very important on the cloud when there are transient errors
## gatk_override: (optional) local file or Google bucket path to a GATK 4 java jar file to be used instead of the GATK 4 jar
##                in the docker image.  This must be supplied when running in an environment that does not support docker
##                (e.g. SGE cluster on a Broad on-prem VM)
##
## ** Workflow options **
## intervals: genomic intervals (will be used for scatter)
## scatter_count: number of parallel jobs to generate when scattering over intervals
## artifact_modes: (optional) types of artifacts to consider in the orientation bias filter
## m2_extra_args, m2_extra_filtering_args: additional arguments for Mutect2 calling and filtering (optional)
## split_intervals_extra_args: additional arguments for splitting intervals before scattering (optional)
## run_orientation_bias_filter: (optional) if true, run the orientation bias filter, which is the GATK implementation of
##         D-ToxoG with modifications to allow multiple artifact modes.
##         For more information on D-ToxoG, see https://software.broadinstitute.org/cancer/cga/dtoxog
## run_orientation_bias_mixture_model_filter: (optional) if true, filter orientation bias sites with the read orientation artifact mixture model.
##         This is the recommended orientation bias filter, particularly for data sequenced on Illumina NovaSeq.
##         If set to true, artifact_modes will be ignored, as the model learns the artifact modes on its own.
##         While we offer both options, there's no need to run both the mixture model filter and the one based on D-ToxoG. 
## run_oncotator: if true, annotate the M2 VCFs using oncotator (to produce a TCGA MAF).  Important:  This requires a
##                   docker image and should  not be run in environments where docker is unavailable (e.g. SGE cluster on
##                   a Broad on-prem VM).  Access to docker hub is also required, since the task downloads a public docker image.
##                   (optional, false by default)
##
## ** Primary inputs **
## ref_fasta, ref_fai, ref_dict: reference genome, index, and dictionary
## tumor_bam, tumor_bai: BAM and index for the tumor sample
## normal_bam, normal_bai: (optional) BAM and index for the normal sample; omit both to run in tumor-only mode
##
## ** Primary resources ** (optional but strongly recommended)
## pon, pon_index: optional panel of normals in VCF format containing probable technical artifacts (false positives)
## gnomad, gnomad_index: optional database of known germline variants (see http://gnomad.broadinstitute.org/downloads)
## variants_for_contamination, variants_for_contamination_index: VCF of common variants with allele frequencies for calculating contamination
##
## ** Secondary resources ** (for optional tasks)
## onco_ds_tar_gz, default_config_file: Oncotator datasources and config file
## sequencing_center, sequence_source: metadata for Oncotator
## filter_oncotator_maf: Whether the MAF generated by oncotator should have the filtered variants removed. Default: true
## realignment_index_bundle: resource for FilterAlignmentArtifacts, which runs if and only if it is specified.  Generated by BwaMemIndexImageCreator.
##
## Funcotator parameters (see Funcotator help for more details).
## funco_reference_version: "hg19" for hg19 or b37.  "hg38" for hg38.  Default: "hg19"
## funco_transcript_selection_list: Transcripts (one GENCODE ID per line) to give priority during selection process.
## funco_transcript_selection_mode: How to select transcripts in Funcotator.  ALL, CANONICAL, or BEST_EFFECT
## funco_data_sources_tar_gz:  Funcotator datasources tar gz file.  Bucket location is recommended when running on the cloud.
## funco_annotation_defaults:  Default values for annotations, when values are unspecified.  Specified as  <ANNOTATION>:<VALUE>.  For example:  "Center:Broad"
## funco_annotation_overrides:  Values for annotations that take precedence even when values are specified.  Specified as  <ANNOTATION>:<VALUE>.  For example:  "Center:Broad"
## funcotator_excluded_fields:  Annotations that should not appear in the output (VCF or MAF).  Specified as  <ANNOTATION>.  For example:  "ClinVar_ALLELEID"
##
## Outputs :
## - One VCF file and its index with primary filtering applied; secondary filtering and functional annotation if requested; a bamout.bam
##   file of reassembled reads if requested
##
## Cromwell version support
## - Successfully tested on v29
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the docker
## pages at https://hub.docker.com/r/broadinstitute/* for detailed licensing information
## pertaining to the included programs.
workflow Mutect2 {
    # Overview of the call graph defined below:
    #   1. SplitIntervals shards the calling intervals into scatter_count pieces.
    #   2. M2 is scattered over the shards; the per-shard VCFs are merged by MergeVCFs and, when
    #      make_bamout is true, the per-shard bamouts are merged by MergeBamOuts.
    #   3. Optional side analyses: CollectSequencingArtifactMetrics (orientation bias filter),
    #      CollectF1R2Counts + LearnReadOrientationModel (orientation bias mixture model) and
    #      CalculateContamination (when variants_for_contamination is supplied).
    #   4. Filter is always run on the merged VCF; FilterByOrientationBias and FilterAlignmentArtifacts
    #      are chained after it when enabled.
    #   5. Optional functional annotation of the last filtered VCF with oncotate_m2 and/or FuncotateMaf.

    # Mutect2 inputs
    File? intervals
    File ref_fasta
    File ref_fai
    File ref_dict
    File tumor_bam
    File tumor_bai
    File? normal_bam
    File? normal_bai
    File? pon
    File? pon_index
    Int scatter_count
    File? gnomad
    File? gnomad_index
    File? variants_for_contamination
    File? variants_for_contamination_index
    File? realignment_index_bundle
    String? realignment_extra_args
    Boolean? run_orientation_bias_filter
    # The orientation bias filter only runs when requested AND at least one artifact mode is in effect
    Boolean run_ob_filter = select_first([run_orientation_bias_filter, false]) && (length(select_first([artifact_modes, ["G/T", "C/T"]])) > 0)
    Boolean? run_orientation_bias_mixture_model_filter
    Boolean run_ob_mm_filter = select_first([run_orientation_bias_mixture_model_filter, false])
    File? ob_mm_filter_training_intervals
    Array[String]? artifact_modes
    File? tumor_sequencing_artifact_metrics
    String? m2_extra_args
    String? m2_extra_filtering_args
    String? split_intervals_extra_args
    Boolean? make_bamout
    Boolean make_bamout_or_default = select_first([make_bamout, false])
    Boolean? compress_vcfs
    Boolean compress = select_first([compress_vcfs, false])
    File? gga_vcf
    File? gga_vcf_idx

    # oncotator inputs
    Boolean? run_oncotator
    Boolean run_oncotator_or_default = select_first([run_oncotator, false])
    File? onco_ds_tar_gz
    String? onco_ds_local_db_dir
    String? sequencing_center
    String? sequence_source
    File? default_config_file
    String? oncotator_extra_args

    # funcotator inputs
    Boolean? run_funcotator
    Boolean run_funcotator_or_default = select_first([run_funcotator, false])
    String? funco_reference_version
    File? funco_data_sources_tar_gz
    String? funco_transcript_selection_mode
    File? funco_transcript_selection_list
    Array[String]? funco_annotation_defaults
    Array[String]? funco_annotation_overrides
    Array[String]? funcotator_excluded_fields
    String? funcotator_extra_args

    File? gatk_override

    # runtime
    String gatk_docker
    String basic_bash_docker = "ubuntu:16.04"
    String? oncotator_docker
    String oncotator_docker_or_default = select_first([oncotator_docker, "broadinstitute/oncotator:1.9.9.0"])
    Boolean? filter_oncotator_maf
    Boolean filter_oncotator_maf_or_default = select_first([filter_oncotator_maf, true])
    Boolean? filter_funcotations
    Boolean filter_funcotations_or_default = select_first([filter_funcotations, true])

    Int? preemptible_attempts
    Int? max_retries

    # Use as a last resort to increase the disk given to every task in case of ill behaving data
    Int? emergency_extra_disk

    # Disk sizes (in GB, rounded up) used for dynamic sizing
    Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_dict, "GB") + size(ref_fai, "GB"))
    Int tumor_bam_size = ceil(size(tumor_bam, "GB") + size(tumor_bai, "GB"))
    Int gnomad_vcf_size = if defined(gnomad) then ceil(size(gnomad, "GB") + size(gnomad_index, "GB")) else 0
    Int normal_bam_size = if defined(normal_bam) then ceil(size(normal_bam, "GB") + size(normal_bai, "GB")) else 0

    # If no tar is provided, the task downloads one from the Broad's FTP server, so reserve a fixed 100 GB;
    # otherwise reserve three times the size of the supplied tarball
    Int onco_tar_size = if defined(onco_ds_tar_gz) then ceil(size(onco_ds_tar_gz, "GB") * 3) else 100
    Int funco_tar_size = if defined(funco_data_sources_tar_gz) then ceil(size(funco_data_sources_tar_gz, "GB") * 3) else 100
    Int gatk_override_size = if defined(gatk_override) then ceil(size(gatk_override, "GB")) else 0

    # This is added to every task as padding, should increase if systematically you need more disk for every call
    Int disk_pad = 10 + gatk_override_size + select_first([emergency_extra_disk,0])

    # These are multipliers to multiply input sizes by, to make sure we have enough disk to accommodate possible output sizes
    # Large is for Bams/WGS vcfs
    # Small is for metrics/other vcfs
    Float large_input_to_output_multiplier = 2.25
    Float small_input_to_output_multiplier = 2.0

    # logic about output file names -- these are the names *without* .vcf extensions
    String output_basename = basename(tumor_bam, ".bam")
    String unfiltered_name = output_basename + "-unfiltered"
    String filtered_name = output_basename + "-filtered"
    # NOTE(review): funcotated_name and output_vcf_name are not referenced by any call in this workflow;
    # they are kept so that existing inputs JSON files that set them remain valid.
    String funcotated_name = output_basename + "-funcotated"

    String output_vcf_name = basename(tumor_bam, ".bam") + ".vcf"


    call SplitIntervals {
        input:
            intervals = intervals,
            ref_fasta = ref_fasta,
            ref_fai = ref_fai,
            ref_dict = ref_dict,
            scatter_count = scatter_count,
            split_intervals_extra_args = split_intervals_extra_args,
            gatk_override = gatk_override,
            gatk_docker = gatk_docker,
            preemptible_attempts = preemptible_attempts,
            max_retries = max_retries,
            disk_space = ref_size + ceil(size(intervals, "GB") * small_input_to_output_multiplier) + disk_pad
    }

    # Per-shard output allowance: the tumor BAM size split evenly across shards (integer division)
    Int m2_output_size = tumor_bam_size / scatter_count
    scatter (subintervals in SplitIntervals.interval_files ) {
        call M2 {
            input:
                intervals = subintervals,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                tumor_bam = tumor_bam,
                tumor_bai = tumor_bai,
                normal_bam = normal_bam,
                normal_bai = normal_bai,
                pon = pon,
                pon_index = pon_index,
                gnomad = gnomad,
                gnomad_index = gnomad_index,
                preemptible_attempts = preemptible_attempts,
                max_retries = max_retries,
                m2_extra_args = m2_extra_args,
                make_bamout = make_bamout_or_default,
                # Only defined when the mixture model filter is enabled (see the run_ob_mm_filter block below)
                artifact_prior_table = LearnReadOrientationModel.artifact_prior_table,
                compress = compress,
                gga_vcf = gga_vcf,
                gga_vcf_idx = gga_vcf_idx,
                gatk_override = gatk_override,
                gatk_docker = gatk_docker,
                disk_space = tumor_bam_size + normal_bam_size + ref_size + gnomad_vcf_size + m2_output_size + disk_pad
        }

        # Collected outside the scatter as Array[Float] and summed to size the merge tasks' disks
        Float sub_vcf_size = size(M2.unfiltered_vcf, "GB")
        Float sub_bamout_size = size(M2.output_bamOut, "GB")
    }

    call SumFloats as SumSubVcfs {
        input:
            sizes = sub_vcf_size,
            preemptible_attempts = preemptible_attempts,
            max_retries = max_retries
    }

    call MergeVCFs {
        input:
            input_vcfs = M2.unfiltered_vcf,
            input_vcf_indices = M2.unfiltered_vcf_index,
            output_name = unfiltered_name,
            compress = compress,
            gatk_override = gatk_override,
            gatk_docker = gatk_docker,
            preemptible_attempts = preemptible_attempts,
            max_retries = max_retries,
            disk_space = ceil(SumSubVcfs.total_size * large_input_to_output_multiplier) + disk_pad
    }

    if (make_bamout_or_default) {
        call SumFloats as SumSubBamouts {
            input:
                sizes = sub_bamout_size,
                preemptible_attempts = preemptible_attempts,
                max_retries = max_retries
        }

        call MergeBamOuts {
            input:
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                bam_outs = M2.output_bamOut,
                # NOTE(review): when compress is true the merged VCF is named *.vcf.gz, so the ".vcf" suffix
                # is not stripped here and the bamout name keeps the full VCF file name as its prefix.
                output_vcf_name = basename(MergeVCFs.merged_vcf, ".vcf"),
                gatk_override = gatk_override,
                gatk_docker = gatk_docker,
                disk_space = ceil(SumSubBamouts.total_size * large_input_to_output_multiplier) + disk_pad,
                # Forward the workflow-level setting like every other call; without this the task silently
                # falls back to its own default number of preemptible attempts.
                preemptible_attempts = preemptible_attempts,
                max_retries = max_retries
        }
    }

    # Metrics are only collected when the filter is on and the user did not supply precomputed metrics
    if (run_ob_filter && !defined(tumor_sequencing_artifact_metrics)) {
        call CollectSequencingArtifactMetrics {
            input:
                gatk_docker = gatk_docker,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                preemptible_attempts = preemptible_attempts,
                max_retries = max_retries,
                tumor_bam = tumor_bam,
                tumor_bai = tumor_bai,
                gatk_override = gatk_override,
                disk_space = tumor_bam_size + ref_size + disk_pad
        }
    }

    if (run_ob_mm_filter) {
        call CollectF1R2Counts {
            input:
                gatk_docker = gatk_docker,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                preemptible_attempts = preemptible_attempts,
                tumor_bam = tumor_bam,
                tumor_bai = tumor_bai,
                gatk_override = gatk_override,
                disk_space = tumor_bam_size + ref_size + disk_pad,
                # Dedicated training intervals take precedence over the calling intervals
                intervals = if defined(ob_mm_filter_training_intervals) then ob_mm_filter_training_intervals else intervals,
                max_retries = max_retries
        }

        call LearnReadOrientationModel {
            input:
                alt_table = CollectF1R2Counts.alt_table,
                ref_histogram = CollectF1R2Counts.ref_histogram,
                alt_histograms = CollectF1R2Counts.alt_histograms,
                tumor_sample = CollectF1R2Counts.tumor_sample,
                gatk_override = gatk_override,
                gatk_docker = gatk_docker,
                preemptible_attempts = preemptible_attempts,
                max_retries = max_retries
        }
    }

    if (defined(variants_for_contamination)) {
        call CalculateContamination {
            input:
                gatk_override = gatk_override,
                intervals = intervals,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                preemptible_attempts = preemptible_attempts,
                max_retries = max_retries,
                gatk_docker = gatk_docker,
                tumor_bam = tumor_bam,
                tumor_bai = tumor_bai,
                normal_bam = normal_bam,
                normal_bai = normal_bai,
                variants_for_contamination = variants_for_contamination,
                variants_for_contamination_index = variants_for_contamination_index,
                disk_space = tumor_bam_size + normal_bam_size + ceil(size(variants_for_contamination, "GB") * small_input_to_output_multiplier) + disk_pad
        }
    }

    call Filter {
        input:
            gatk_override = gatk_override,
            gatk_docker = gatk_docker,
            intervals = intervals,
            unfiltered_vcf = MergeVCFs.merged_vcf,
            unfiltered_vcf_index = MergeVCFs.merged_vcf_index,
            output_name = filtered_name,
            compress = compress,
            preemptible_attempts = preemptible_attempts,
            max_retries = max_retries,
            # Both contamination outputs are undefined when CalculateContamination did not run
            contamination_table = CalculateContamination.contamination_table,
            maf_segments = CalculateContamination.maf_segments,
            m2_extra_filtering_args = m2_extra_filtering_args,
            disk_space = ceil(size(MergeVCFs.merged_vcf, "GB") * small_input_to_output_multiplier) + disk_pad
    }

    if (run_ob_filter) {
        # Get the metrics either from the workflow input or CollectSequencingArtifactMetrics if no workflow input is provided
        File input_artifact_metrics = select_first([tumor_sequencing_artifact_metrics, CollectSequencingArtifactMetrics.pre_adapter_metrics])

        call FilterByOrientationBias {
            input:
                gatk_override = gatk_override,
                input_vcf = Filter.filtered_vcf,
                input_vcf_index = Filter.filtered_vcf_index,
                output_name = filtered_name,
                compress = compress,
                gatk_docker = gatk_docker,
                preemptible_attempts = preemptible_attempts,
                max_retries = max_retries,
                pre_adapter_metrics = input_artifact_metrics,
                artifact_modes = artifact_modes,
                disk_space = ceil(size(Filter.filtered_vcf, "GB") * small_input_to_output_multiplier) + ceil(size(input_artifact_metrics, "GB")) + disk_pad
        }
    }

    if (defined(realignment_index_bundle)) {
        # Use the most downstream filtered VCF produced so far
        File realignment_filter_input = select_first([FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])
        File realignment_filter_input_idx = select_first([FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index])
        call FilterAlignmentArtifacts {
            input:
                gatk_override = gatk_override,
                bam = tumor_bam,
                bai = tumor_bai,
                realignment_index_bundle = select_first([realignment_index_bundle]),
                realignment_extra_args = realignment_extra_args,
                gatk_docker = gatk_docker,
                max_retries = max_retries,
                compress = compress,
                output_name = filtered_name,
                input_vcf = realignment_filter_input,
                input_vcf_idx = realignment_filter_input_idx
        }
    }

    if (run_oncotator_or_default) {
        File oncotate_vcf_input = select_first([FilterAlignmentArtifacts.filtered_vcf, FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])
        call oncotate_m2 {
            input:
                m2_vcf = oncotate_vcf_input,
                onco_ds_tar_gz = onco_ds_tar_gz,
                onco_ds_local_db_dir = onco_ds_local_db_dir,
                sequencing_center = sequencing_center,
                sequence_source = sequence_source,
                default_config_file = default_config_file,
                # Every shard reports the same sample names, so the first shard's values are used
                case_id = M2.tumor_sample[0],
                control_id = M2.normal_sample[0],
                oncotator_docker = oncotator_docker_or_default,
                preemptible_attempts = preemptible_attempts,
                max_retries = max_retries,
                disk_space = ceil(size(oncotate_vcf_input, "GB") * large_input_to_output_multiplier) + onco_tar_size + disk_pad,
                filter_maf = filter_oncotator_maf_or_default,
                oncotator_extra_args = oncotator_extra_args
        }
    }

    if (run_funcotator_or_default) {
        File funcotate_vcf_input = select_first([FilterAlignmentArtifacts.filtered_vcf, FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])
        File funcotate_vcf_input_index = select_first([FilterAlignmentArtifacts.filtered_vcf_index, FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index])
        call FuncotateMaf {
            input:
                input_vcf = funcotate_vcf_input,
                input_vcf_idx = funcotate_vcf_input_index,
                ref_fasta = ref_fasta,
                ref_fasta_index = ref_fai,
                ref_dict = ref_dict,
                reference_version = select_first([funco_reference_version, "hg19"]),
                data_sources_tar_gz = funco_data_sources_tar_gz,
                case_id = M2.tumor_sample[0],
                control_id = M2.normal_sample[0],
                transcript_selection_mode = funco_transcript_selection_mode,
                transcript_selection_list = funco_transcript_selection_list,
                annotation_defaults = funco_annotation_defaults,
                annotation_overrides = funco_annotation_overrides,
                gatk_docker = gatk_docker,
                gatk_override = gatk_override,
                filter_funcotations = filter_funcotations_or_default,
                funcotator_excluded_fields = funcotator_excluded_fields,
                sequencing_center = sequencing_center,
                sequence_source = sequence_source,
                # Size the disk from the Funcotator datasources tarball, not the Oncotator one
                disk_space_gb = ceil(size(funcotate_vcf_input, "GB") * large_input_to_output_multiplier) + funco_tar_size + disk_pad,
                max_retries = max_retries,
                extra_args = funcotator_extra_args
        }
    }

    output {
        # The final VCF is the output of the last filtering step that actually ran
        File filtered_vcf = select_first([FilterAlignmentArtifacts.filtered_vcf, FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])
        File filtered_vcf_index = select_first([FilterAlignmentArtifacts.filtered_vcf_index, FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index])
        # The remaining outputs are only defined when the corresponding optional step ran
        File? contamination_table = CalculateContamination.contamination_table
        File? oncotated_m2_maf = oncotate_m2.oncotated_m2_maf
        File? funcotated_maf = FuncotateMaf.funcotated_output
        File? preadapter_detail_metrics = CollectSequencingArtifactMetrics.pre_adapter_metrics
        File? bamout = MergeBamOuts.merged_bam_out
        File? bamout_index = MergeBamOuts.merged_bam_out_index
        File? maf_segments = CalculateContamination.maf_segments
    }
}

task SplitIntervals {
    # Splits the calling intervals into scatter_count interval files with GATK SplitIntervals so that
    # Mutect2 can be scattered over them. When no intervals file is given, no -L argument is passed.
    #
    # Output: interval_files -- every *.intervals file produced by the tool.

    # inputs
    File? intervals
    File ref_fasta
    File ref_fai
    File ref_dict
    Int scatter_count
    String? split_intervals_extra_args

    File? gatk_override

    # runtime
    String gatk_docker
    Int? mem
    Int? preemptible_attempts
    Int? max_retries
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 3500
    # Leave 500 MB of the machine memory outside the Java heap
    Int command_mem = machine_mem - 500

    command {
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        mkdir interval-files
        gatk --java-options "-Xmx${command_mem}m" SplitIntervals \
            -R ${ref_fasta} \
            ${"-L " + intervals} \
            -scatter ${scatter_count} \
            -O interval-files \
            ${split_intervals_extra_args}
        cp interval-files/*.intervals .
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        # Default to 3 retries, matching the other tasks in this file; an explicit max_retries still wins
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        # The shards are copied into the working directory by the command so this glob can find them
        Array[File] interval_files = glob("*.intervals")
    }
}

task M2 {
    # Runs GATK Mutect2 on one interval shard.
    #
    # The command first resolves the tumor sample name with GetSampleName and, if the normal BAM file
    # exists, the normal sample name as well; both are then passed to Mutect2 (-tumor / -normal).
    # Optional resources (germline resource, panel of normals, intervals, given alleles,
    # orientation bias priors) are only added to the command line when the matching input is defined.
    #
    # Outputs:
    #   unfiltered_vcf / unfiltered_vcf_index -- "output.vcf[.gz]" and its ".idx" / ".tbi" index
    #   output_bamOut -- "bamout.bam"; created empty up front, so it exists even when make_bamout is false
    #   tumor_sample / normal_sample -- contents of tumor_name.txt / normal_name.txt
    #                                   (normal_name.txt holds an empty line when no normal BAM exists)

    # inputs
    # NOTE(review): the *_index / *_idx / *_bai / ref_fai / ref_dict inputs are not referenced in the
    # command; presumably they are declared so they get localized next to their data files -- confirm.
    File? intervals
    File ref_fasta
    File ref_fai
    File ref_dict
    File tumor_bam
    File tumor_bai
    File? normal_bam
    File? normal_bai
    File? pon
    File? pon_index
    File? gnomad
    File? gnomad_index
    String? m2_extra_args
    Boolean? make_bamout
    Boolean compress
    File? gga_vcf
    File? gga_vcf_idx
    File? artifact_prior_table

    # Output file names depend on whether compressed output was requested
    String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"

    File? gatk_override

    # runtime
    String gatk_docker
    Int? mem
    Int? preemptible_attempts
    Int? max_retries
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 3500
    # Leave 500 MB of the machine memory outside the Java heap
    Int command_mem = machine_mem - 500


    command <<<
        set -e

        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        # We need to create these files regardless, even if they stay empty
        touch bamout.bam
        echo "" > normal_name.txt

        gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${tumor_bam} -O tumor_name.txt -encode
        tumor_command_line="-I ${tumor_bam} -tumor `cat tumor_name.txt`"

        if [[ -f "${normal_bam}" ]]; then
            gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${normal_bam} -O normal_name.txt -encode
            normal_command_line="-I ${normal_bam} -normal `cat normal_name.txt`"
        fi

        gatk --java-options "-Xmx${command_mem}m" Mutect2 \
            -R ${ref_fasta} \
            $tumor_command_line \
            $normal_command_line \
            ${"--germline-resource " + gnomad} \
            ${"-pon " + pon} \
            ${"-L " + intervals} \
            ${"--genotyping-mode GENOTYPE_GIVEN_ALLELES --alleles " + gga_vcf} \
            -O "${output_vcf}" \
            ${true='--bam-output bamout.bam' false='' make_bamout} \
            ${"--orientation-bias-artifact-priors " + artifact_prior_table} \
            ${m2_extra_args}
    >>>

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        File unfiltered_vcf = "${output_vcf}"
        File unfiltered_vcf_index = "${output_vcf_index}"
        File output_bamOut = "bamout.bam"
        String tumor_sample = read_string("tumor_name.txt")
        String normal_sample = read_string("normal_name.txt")
    }
}

task MergeVCFs {
    # Combines the per-shard VCFs into a single VCF with GATK MergeVcfs.
    #
    # Outputs:
    #   merged_vcf       -- "<output_name>.vcf", or "<output_name>.vcf.gz" when compress is true
    #   merged_vcf_index -- the merged VCF name plus ".idx", or ".tbi" when compress is true

    # inputs
    Array[File] input_vcfs
    # NOTE(review): the indices are not referenced in the command; presumably they are declared so
    # they get localized next to the VCFs -- confirm.
    Array[File] input_vcf_indices
    String output_name
    Boolean compress
    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"

    File? gatk_override

    # runtime
    String gatk_docker
    Int? mem
    Int? preemptible_attempts
    Int? max_retries
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 3500
    # Leave 1000 MB of the machine memory outside the Java heap
    Int command_mem = machine_mem - 1000

    # using MergeVcfs instead of GatherVcfs so we can create indices
    # WARNING 2015-10-28 15:01:48 GatherVcfs  Index creation not currently supported when gathering block compressed VCFs.
    command {
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
        gatk --java-options "-Xmx${command_mem}m" MergeVcfs -I ${sep=' -I ' input_vcfs} -O ${output_vcf}
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        File merged_vcf = "${output_vcf}"
        File merged_vcf_index = "${output_vcf_index}"
    }
}

task MergeBamOuts {
    # Combines the per-shard Mutect2 bamouts into one coordinate-sorted, indexed BAM:
    # GatherBamFiles concatenates the shards, SortSam sorts the result and BuildBamIndex indexes it.
    #
    # Outputs:
    #   merged_bam_out       -- "<output_vcf_name>.out.bam"
    #   merged_bam_out_index -- "<output_vcf_name>.out.bai"

    # inputs
    File ref_fasta
    File ref_fai
    File ref_dict
    # The "+" quantifier requires a non-empty array, matching the assumption stated in the command block
    Array[File]+ bam_outs
    # Used as the file name prefix of the merged BAM and its index
    String output_vcf_name

    File? gatk_override

    # runtime
    String gatk_docker
    Int? mem
    Int? preemptible_attempts
    Int? max_retries
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 7000
    # Leave 1000 MB of the machine memory outside the Java heap
    Int command_mem = machine_mem - 1000

    command <<<
        # This command block assumes that there is at least one file in bam_outs.
        #  Do not call this task if len(bam_outs) == 0
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
        gatk --java-options "-Xmx${command_mem}m" GatherBamFiles \
            -I ${sep=" -I " bam_outs} -O unsorted.out.bam -R ${ref_fasta}

        # We must sort because adjacent scatters may have overlapping (padded) assembly regions, hence
        # overlapping bamouts

        gatk --java-options "-Xmx${command_mem}m" SortSam -I unsorted.out.bam \
            -O ${output_vcf_name}.out.bam \
            --SORT_ORDER coordinate -VALIDATION_STRINGENCY LENIENT
        gatk --java-options "-Xmx${command_mem}m" BuildBamIndex -I ${output_vcf_name}.out.bam -VALIDATION_STRINGENCY LENIENT
    >>>

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        File merged_bam_out = "${output_vcf_name}.out.bam"
        File merged_bam_out_index = "${output_vcf_name}.out.bai"
    }
}

task CollectSequencingArtifactMetrics {
    # Runs GATK CollectSequencingArtifactMetrics on the tumor BAM with the output prefix "gatk".
    #
    # Output: pre_adapter_metrics -- "gatk.pre_adapter_detail_metrics"

    # inputs
    File ref_fasta
    File ref_fai
    File tumor_bam
    File tumor_bai

    File? gatk_override

    # runtime
    String gatk_docker
    Int? mem
    Int? preemptible_attempts
    Int? max_retries
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 7000
    # Leave 1000 MB of the machine memory outside the Java heap
    Int command_mem = machine_mem - 1000

    command {
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
        gatk --java-options "-Xmx${command_mem}m" CollectSequencingArtifactMetrics \
            -I ${tumor_bam} -O "gatk" -R ${ref_fasta} -VALIDATION_STRINGENCY LENIENT
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        # File name follows from the "gatk" output prefix passed via -O in the command
        File pre_adapter_metrics = "gatk.pre_adapter_detail_metrics"
    }
}

# Data collection step of the orientation bias mixture model, which is the recommended orientation bias filter as of September 2018
task CollectF1R2Counts {
    # Resolves the tumor sample name with GetSampleName, then runs GATK CollectF1R2Counts on the
    # tumor BAM (restricted to `intervals` when given). The three output files are named after the
    # first line of tumor_name.txt.
    #
    # Outputs:
    #   alt_table      -- first file matching "*-alt.tsv"
    #   ref_histogram  -- first file matching "*-ref.metrics"
    #   alt_histograms -- first file matching "*-alt-depth1.metrics"
    #   tumor_sample   -- contents of tumor_name.txt

    # input
    File ref_fasta
    File ref_fai
    File ref_dict
    File tumor_bam
    File tumor_bai

    File? gatk_override
    File? intervals

    # runtime
    Int? max_retries
    String gatk_docker
    Int? mem
    Int? preemptible_attempts
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 7000
    # Leave 1000 MB of the machine memory outside the Java heap
    Int command_mem = machine_mem - 1000

    command {
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
        
        # Get the sample name. The task M2 retrieves this information too, but it must be done separately here
        # to avoid a cyclic dependency
        gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${tumor_bam} -O tumor_name.txt -encode
        tumor_name=$(head -n 1 tumor_name.txt)

        gatk --java-options "-Xmx${command_mem}m" CollectF1R2Counts \
        -I ${tumor_bam} -R ${ref_fasta} \
        ${"-L " + intervals} \
        -alt-table "$tumor_name-alt.tsv" \
        -ref-hist "$tumor_name-ref.metrics" \
        -alt-hist "$tumor_name-alt-depth1.metrics"
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        # The sample name is only known at run time, so the files are located by glob on their suffixes
        File alt_table = glob("*-alt.tsv")[0]
        File ref_histogram = glob("*-ref.metrics")[0]
        File alt_histograms = glob("*-alt-depth1.metrics")[0]
        String tumor_sample = read_string("tumor_name.txt")
    }
}

# Learning step of the orientation bias mixture model, which is the recommended orientation bias filter as of September 2018
#
# Runs GATK LearnReadOrientationModel on the tables produced by the data collection step and
# writes "<tumor_sample>-artifact-prior-table.tsv".
task LearnReadOrientationModel {
    # Table passed as -alt-table
    File alt_table
    # Histogram passed as -ref-hist
    File ref_histogram
    # Optional histogram; -alt-hist is only added to the command line when this is supplied
    File? alt_histograms

    File? gatk_override
    # NOTE(review): intervals is not referenced by the command below; it is kept so existing callers keep working.
    File? intervals
    # Used as the prefix of the output table's file name
    String tumor_sample

    # runtime
    Int? max_retries
    String gatk_docker
    Int? mem
    Int? preemptible_attempts
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 8000
    Int command_mem = machine_mem - 1000

    command {
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        # alt_histograms is optional: concatenating inside the placeholder drops the whole flag when it
        # is undefined, instead of emitting a bare "-alt-hist" that would swallow the following "-O".
        gatk --java-options "-Xmx${command_mem}m" LearnReadOrientationModel \
        -alt-table ${alt_table} \
        -ref-hist ${ref_histogram} \
        ${"-alt-hist " + alt_histograms} \
        -O "${tumor_sample}-artifact-prior-table.tsv"
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        File artifact_prior_table = "${tumor_sample}-artifact-prior-table.tsv"
    }

}

# Summarizes pileups at the given variant sites for the tumor (and for the normal, when a normal BAM
# file is present) and runs GATK CalculateContamination on them.
# Outputs: tumor pileups table, contamination table and tumor segmentation table.
task CalculateContamination {
    # inputs
    # Optional intervals; when given they are intersected with the variant sites
    File? intervals
    File ref_fasta
    File ref_fai
    File ref_dict
    File tumor_bam
    File tumor_bai
    File? normal_bam
    File? normal_bai
    # NOTE(review): declared optional but used unconditionally by the command; callers are expected to supply it
    File? variants_for_contamination
    File? variants_for_contamination_index

    File? gatk_override

    # runtime
    Int? preemptible_attempts
    Int? max_retries
    String gatk_docker
    Int? disk_space
    Int? mem

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 3000
    Int command_mem = machine_mem - 500

    command {
        set -e

        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        # Explicitly empty for tumor-only runs so that no -matched argument is passed below.
        NORMAL_CMD=""

        if [[ -f "${normal_bam}" ]]; then
            # The reference is passed here as well, matching the tumor invocation below.
            gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${normal_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \
                -V ${variants_for_contamination} -L ${variants_for_contamination} -O normal_pileups.table
            NORMAL_CMD="-matched normal_pileups.table"
        fi

        gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \
            -V ${variants_for_contamination} -L ${variants_for_contamination} -O pileups.table
        gatk --java-options "-Xmx${command_mem}m" CalculateContamination -I pileups.table -O contamination.table --tumor-segmentation segments.table $NORMAL_CMD
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        # Request the full machine memory; command_mem is only the JVM heap cap and is deliberately smaller.
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
    }

    output {
        File pileups = "pileups.table"
        File contamination_table = "contamination.table"
        File maf_segments = "segments.table"
    }
}

# Applies GATK FilterMutectCalls to an unfiltered VCF and returns the filtered VCF with its index.
task Filter {
    # calls to filter
    File unfiltered_vcf
    File unfiltered_vcf_index
    # NOTE(review): intervals is declared but not referenced by the command below
    File? intervals

    # optional filtering inputs; each one is forwarded only when supplied
    File? contamination_table
    File? maf_segments
    String? m2_extra_filtering_args

    # output naming: ".vcf.gz" with ".tbi" index when compress is true, ".vcf" with ".idx" otherwise
    String output_name
    Boolean compress
    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"

    File? gatk_override

    # runtime settings
    String gatk_docker
    Int? cpu
    Int? mem
    Int? disk_space
    Int? preemptible_attempts
    Int? max_retries
    Boolean use_ssd = false

    # mem is expressed in GB (7 GB when unset); everything below works in MB.
    Int machine_mem = select_first([mem, 7]) * 1000
    # The JVM heap cap is kept 500 MB below the memory requested for the machine.
    Int command_mem = machine_mem - 500

    command <<<
        set -e

        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        gatk --java-options "-Xmx${command_mem}m" FilterMutectCalls \
            -V ${unfiltered_vcf} \
            -O ${output_vcf} \
            ${"--contamination-table " + contamination_table} \
            ${"--tumor-segmentation " + maf_segments} \
            ${m2_extra_filtering_args}
    >>>

    runtime {
        docker: gatk_docker
        cpu: select_first([cpu, 1])
        memory: machine_mem + " MB"
        bootDiskSizeGb: 12
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
    }

    output {
        File filtered_vcf = "${output_vcf}"
        File filtered_vcf_index = "${output_vcf_index}"
    }
}

# Runs GATK FilterByOrientationBias on a VCF, using the pre-adapter metrics file and one -AM
# argument per artifact mode. Returns the filtered VCF and its index.
task FilterByOrientationBias {
    # input
    File? gatk_override
    File input_vcf
    File input_vcf_index
    String output_name
    Boolean compress
    # ".vcf.gz" with ".tbi" index when compress is true, ".vcf" with ".idx" otherwise
    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_index = output_vcf +  if compress then ".tbi" else ".idx"
    File pre_adapter_metrics
    Array[String]? artifact_modes

    # If artifact modes is passed in to the task as [], this task will fail.
    # When artifact_modes is not supplied at all, "G/T" and "C/T" are used.
    Array[String] final_artifact_modes = select_first([artifact_modes, ["G/T", "C/T"]])

    # runtime
    Int? preemptible_attempts
    Int? max_retries
    String gatk_docker
    Int? disk_space
    Int? mem
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 7000
    Int command_mem = machine_mem - 500

    command {
        set -e

        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        gatk --java-options "-Xmx${command_mem}m" FilterByOrientationBias \
            -V ${input_vcf} \
            -AM ${sep=" -AM " final_artifact_modes} \
            -P ${pre_adapter_metrics} \
            -O ${output_vcf}
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        # Request the full machine memory; command_mem is only the JVM heap cap and is deliberately smaller.
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        File filtered_vcf = "${output_vcf}"
        File filtered_vcf_index = "${output_vcf_index}"
    }
}

# Runs GATK FilterAlignmentArtifacts on a VCF, given the reads and a BWA-MEM index image.
# Returns the filtered VCF and its index.
task FilterAlignmentArtifacts {
    #input
    File? gatk_override
    File input_vcf
    File input_vcf_idx
    File bam
    File bai
    String output_name
    Boolean compress
    # ".vcf.gz" with ".tbi" index when compress is true, ".vcf" with ".idx" otherwise
    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_index = output_vcf +  if compress then ".tbi" else ".idx"
    # Passed to the tool as --bwa-mem-index-image
    File realignment_index_bundle
    # Extra arguments inserted verbatim into the command line when supplied
    String? realignment_extra_args

    # runtime
    String gatk_docker
    Int? mem
    Int? preemptible_attempts
    Int? max_retries
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 9000
    Int command_mem = machine_mem - 500

    command {
        set -e

        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        gatk --java-options "-Xmx${command_mem}m" FilterAlignmentArtifacts \
            -V ${input_vcf} \
            -I ${bam} \
            --bwa-mem-index-image ${realignment_index_bundle} \
            ${realignment_extra_args} \
            -O ${output_vcf}
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        # Request the full machine memory; command_mem is only the JVM heap cap and is deliberately smaller.
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        File filtered_vcf = "${output_vcf}"
        File filtered_vcf_index = "${output_vcf_index}"
    }
}

# Annotates a Mutect2 VCF with Oncotator and writes a TCGA MAF named "<case_id>.maf.annotated".
# The datasource directory (onco_dbdir) comes from, in order of preference: an existing local
# directory, a supplied .tar.gz archive, or a download from the Broad FTP site.
task oncotate_m2 {
    # inputs
    File m2_vcf
    # Optional datasource archive; only used when its path ends in ".tar.gz"
    File? onco_ds_tar_gz
    # Optional datasource directory; only used when it exists as a directory where the command runs
    String? onco_ds_local_db_dir
    # Optional path to the oncotator executable (defaults to /root/oncotator_venv/bin/oncotator)
    String? oncotator_exe
    # Written to the "Center" and "source" annotations; "Unknown" when not supplied
    String? sequencing_center
    String? sequence_source
    File? default_config_file
    # case_id becomes the tumor_barcode annotation and the output file prefix
    String case_id
    # control_id becomes the normal_barcode annotation
    String? control_id
    # Extra arguments appended verbatim to the oncotator command line
    String? oncotator_extra_args

    # runtime
    String oncotator_docker
    Int? mem
    Int? preemptible_attempts
    Int? max_retries
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # filter_maf defaults to true; when true the filter columns are collapsed and pruned
    Boolean? filter_maf
    Boolean is_filter_maf = select_first([filter_maf, true])
    String filter_maf_args = if (is_filter_maf) then " --collapse-filter-cols --prune-filter-cols " else ""

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 3500
    # NOTE(review): command_mem is not referenced anywhere else in this task
    Int command_mem = machine_mem - 500

    command <<<
        # fail if *any* command below (not just the last) doesn't return 0, in particular if wget fails
        set -e

        # local db dir is a directory and has been specified
        if [[ -d "${onco_ds_local_db_dir}" ]]; then
            echo "Using local db-dir: ${onco_ds_local_db_dir}"
            echo "THIS ONLY WORKS WITHOUT DOCKER!"
            ln -s ${onco_ds_local_db_dir} onco_dbdir
        elif [[ "${onco_ds_tar_gz}" == *.tar.gz ]]; then
            echo "Using given tar file: ${onco_ds_tar_gz}"
            mkdir onco_dbdir
            tar zxvf ${onco_ds_tar_gz} -C onco_dbdir --strip-components 1
        else
            echo "Downloading and installing oncotator datasources from Broad FTP site..."
            # Download and untar the db-dir
            wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/oncotator/oncotator_v1_ds_April052016.tar.gz
            tar zxvf oncotator_v1_ds_April052016.tar.gz
            ln -s oncotator_v1_ds_April052016 onco_dbdir
        fi

        ${default="/root/oncotator_venv/bin/oncotator" oncotator_exe} --db-dir onco_dbdir/ -c $HOME/tx_exact_uniprot_matches.AKT1_CRLF2_FGFR1.txt  \
            -v ${m2_vcf} ${case_id}.maf.annotated hg19 -i VCF -o TCGAMAF --skip-no-alt --collapse-number-annotations --log_name oncotator.log \
            -a Center:${default="Unknown" sequencing_center} \
            -a source:${default="Unknown" sequence_source} \
            -a normal_barcode:${control_id} \
            -a tumor_barcode:${case_id} \
            ${"--default_config " + default_config_file} \
            ${filter_maf_args} \
            ${oncotator_extra_args}
    >>>

    runtime {
        docker: oncotator_docker
        memory: machine_mem + " MB"
        bootDiskSizeGb: 12
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        # The annotated MAF written by the oncotator command above
        File oncotated_m2_maf="${case_id}.maf.annotated"
    }
}

# Calculates sum of a list of floats
#
# sizes:      the values to add up; an empty list yields a total of 0
# total_size: the sum, parsed from the python process's stdout
task SumFloats {
    Array[Float] sizes

    # Runtime parameters
    Int? preemptible_attempts
    Int? max_retries

    command <<<
        # sum() over a list literal (rather than joining the values with "+") keeps the expression
        # valid when sizes is empty, and the parenthesized print works under Python 2 and 3 alike.
        python -c "print(sum([${sep="," sizes}]))"
    >>>

    output {
        Float total_size = read_float(stdout())
    }

    runtime {
        docker: "python:2.7"
        disks: "local-disk " + 10 + " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
    }
}

# Annotates a VCF with GATK Funcotator and writes "<input_vcf basename without .vcf>.maf.annotated".
# Data sources come from the supplied archive when it exists; otherwise the default data sources
# are downloaded from the Broad FTP site.
task FuncotateMaf {
    # reference
    File ref_fasta
    File ref_fasta_index
    File ref_dict
    String reference_version

    # variants to annotate
    File input_vcf
    File input_vcf_idx
    File? interval_list

    # sample annotations written as annotation defaults
    String case_id
    String? control_id
    String? sequencing_center
    String? sequence_source

    # Funcotator options
    String output_format = "MAF"
    File? data_sources_tar_gz
    String? transcript_selection_mode
    File? transcript_selection_list
    Array[String]? annotation_defaults
    Array[String]? annotation_overrides
    Array[String]? funcotator_excluded_fields
    Boolean filter_funcotations
    String? extra_args

    # Leading flag for each optional list; the remaining repeats are produced by sep= in the command.
    String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else ""
    String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else ""
    String excluded_fields_args = if defined(funcotator_excluded_fields) then " --exclude-field " else ""
    String filter_funcotations_args = if (filter_funcotations) then " --remove-filtered-variants " else ""
    String final_output_filename = basename(input_vcf, ".vcf") + ".maf.annotated"

    # runtime settings
    String gatk_docker
    File? gatk_override
    Int? cpu
    Int? mem
    Int? disk_space_gb
    Int? preemptible_attempts
    Int? max_retries
    Boolean use_ssd = false

    # Update this when a new version of the data sources is released.
    String default_datasources_version = "funcotator_dataSources.v1.4.20180615"

    # These two defaults may need adjusting to the task requirements.
    Int default_ram_mb = 3000
    # WARNING: the workflow should compute the disk space and pass it in as disk_space_gb.
    Int default_disk_space_gb = 100

    # mem is expressed in GB; the command and the runtime memory value are in MB.
    Int machine_mem = if !defined(mem) then default_ram_mb else mem * 1000
    Int command_mem = machine_mem - 1000

    command <<<
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        DATA_SOURCES_TAR_GZ=${data_sources_tar_gz}
        if [[ -e $DATA_SOURCES_TAR_GZ ]] ; then
            # An archive is available: unpack it into a local directory.
            mkdir datasources_dir
            tar zxvf ${data_sources_tar_gz} -C datasources_dir --strip-components 1
            DATA_SOURCES_FOLDER="$PWD/datasources_dir"
        else
            # No archive on disk: download and unpack the default data sources.
            echo "Data sources gzip does not exist: $DATA_SOURCES_TAR_GZ"
            echo "Downloading default data sources..."
            wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/funcotator/${default_datasources_version}.tar.gz
            tar -zxf ${default_datasources_version}.tar.gz
            DATA_SOURCES_FOLDER=${default_datasources_version}
        fi

        gatk --java-options "-Xmx${command_mem}m" Funcotator \
            --data-sources-path $DATA_SOURCES_FOLDER \
            --ref-version ${reference_version} \
            --output-file-format ${output_format} \
            -R ${ref_fasta} \
            -V ${input_vcf} \
            -O ${final_output_filename} \
            ${"-L " + interval_list} \
            ${"--transcript-selection-mode " + transcript_selection_mode} \
            ${"--transcript-list " + transcript_selection_list} \
            --annotation-default normal_barcode:${control_id} \
            --annotation-default tumor_barcode:${case_id} \
            --annotation-default Center:${default="Unknown" sequencing_center} \
            --annotation-default source:${default="Unknown" sequence_source} \
            ${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \
            ${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides} \
            ${excluded_fields_args}${default="" sep=" --exclude-field " funcotator_excluded_fields} \
            ${filter_funcotations_args} \
            ${extra_args}
    >>>

    runtime {
        docker: gatk_docker
        cpu: select_first([cpu, 1])
        memory: machine_mem + " MB"
        bootDiskSizeGb: 20
        disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 3])
        maxRetries: select_first([max_retries, 3])
    }

    output {
        File funcotated_output = "${final_output_filename}"
    }
}