## Copyright Broad Institute, 2017
##
## This WDL workflow runs GATK4 Mutect 2 on a single tumor-normal pair or on a single tumor sample,
## and performs additional filtering and functional annotation tasks.
##
## Main requirements/expectations :
## - One analysis-ready BAM file (and its index) for each sample
##
## Description of inputs:
##
## ** Runtime **
## gatk_docker, oncotator_docker: docker images to use for GATK 4 Mutect2 and for Oncotator
## preemptible_attempts: how many preemptions to tolerate before switching to a non-preemptible machine (on Google)
## max_retries: how many times to retry failed tasks -- very important on the cloud when there are transient errors
## gatk_override: (optional) local file or Google bucket path to a GATK 4 java jar file to be used instead of the GATK 4 jar
##                in the docker image.  This must be supplied when running in an environment that does not support docker
##                (e.g. SGE cluster on a Broad on-prem VM)
##
## ** Workflow options **
## intervals: genomic intervals (will be used for scatter)
## scatter_count: number of parallel jobs to generate when scattering over intervals
## artifact_modes: (optional) types of artifacts to consider in the orientation bias filter
## m2_extra_args, m2_extra_filtering_args: additional arguments for Mutect2 calling and filtering (optional)
## split_intervals_extra_args: additional arguments for splitting intervals before scattering (optional)
## run_orientation_bias_filter: (optional) if true, run the orientation bias filter, which is the GATK implementation of
##                              D-ToxoG with modifications to allow multiple artifact modes.
##                              For more information on D-ToxoG, see https://software.broadinstitute.org/cancer/cga/dtoxog
## run_orientation_bias_mixture_model_filter: (optional) if true, filter orientation bias sites with the read orientation artifact mixture model.
##                              This is the recommended orientation bias filter, particularly for data sequenced on Illumina NovaSeq.
##                              If set to true, artifact_modes will be ignored, as the model learns the artifact modes on its own.
##                              While we offer both options, there's no need to run both the mixture model filter and the one based on D-ToxoG.
## run_oncotator: if true, annotate the M2 VCFs using oncotator (to produce a TCGA MAF).  Important:  This requires a
##                docker image and should not be run in environments where docker is unavailable (e.g. SGE cluster on
##                a Broad on-prem VM).  Access to docker hub is also required, since the task downloads a public docker image.
##                (optional, false by default)
##
## ** Primary inputs **
## ref_fasta, ref_fai, ref_dict: reference genome, index, and dictionary
## tumor_bam, tumor_bai: BAM and index for the tumor sample
## normal_bam, normal_bai: BAM and index for the normal sample
##
## ** Primary resources ** (optional but strongly recommended)
## pon, pon_index: optional panel of normals in VCF format containing probable technical artifacts (false positives)
## gnomad, gnomad_index: optional database of known germline variants (see http://gnomad.broadinstitute.org/downloads)
## variants_for_contamination, variants_for_contamination_index: VCF of common variants with allele frequencies for calculating contamination
##
## ** Secondary resources ** (for optional tasks)
## onco_ds_tar_gz, default_config_file: Oncotator datasources and config file
## sequencing_center, sequence_source: metadata for Oncotator and Funcotator
## filter_oncotator_maf: Whether the MAF generated by oncotator should have the filtered variants removed. Default: true
## realignment_index_bundle: resource for FilterAlignmentArtifacts, which runs if and only if it is specified.  Generated by BwaMemIndexImageCreator.
##
## Funcotator parameters (see Funcotator help for more details).
## funco_reference_version: "hg19" for hg19 or b37.  "hg38" for hg38.  Default: "hg19"
## funco_transcript_selection_list: Transcripts (one GENCODE ID per line) to give priority during selection process.
## funco_transcript_selection_mode: How to select transcripts in Funcotator.  ALL, CANONICAL, or BEST_EFFECT
## funco_data_sources_tar_gz: Funcotator datasources tar gz file.  Bucket location is recommended when running on the cloud.
## funco_annotation_defaults: Default values for annotations, when values are unspecified.  Specified as <ANNOTATION>:<VALUE>.  For example: "Center:Broad"
## funco_annotation_overrides: Values for annotations, even when values are already specified.  Specified as <ANNOTATION>:<VALUE>.  For example: "Center:Broad"
## funcotator_excluded_fields: Annotations that should not appear in the output (VCF or MAF).  Specified as <ANNOTATION>.  For example: "ClinVar_ALLELEID"
##
## Outputs :
## - One VCF file and its index with primary filtering applied; secondary filtering and functional annotation if requested; a bamout.bam
##   file of reassembled reads if requested
##
## Cromwell version support
## - Successfully tested on v29
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the docker
## pages at https://hub.docker.com/r/broadinstitute/* for detailed licensing information
## pertaining to the included programs.

# Top-level workflow: scatters Mutect2 over split intervals, merges the per-interval VCFs (and bamouts),
# applies FilterMutectCalls plus the optional orientation-bias / alignment-artifact filters, and
# optionally annotates the final VCF with Oncotator and/or Funcotator.
workflow Mutect2 {
    # Mutect2 inputs
    File? intervals
    File ref_fasta
    File ref_fai
    File ref_dict
    File tumor_bam
    File tumor_bai
    File? normal_bam
    File? normal_bai
    File? pon
    File? pon_index
    Int scatter_count
    File? gnomad
    File? gnomad_index
    File? variants_for_contamination
    File? variants_for_contamination_index
    File? realignment_index_bundle
    String? realignment_extra_args
    Boolean? run_orientation_bias_filter
    # The D-ToxoG-style filter only runs when requested AND at least one artifact mode is in effect
    Boolean run_ob_filter = select_first([run_orientation_bias_filter, false]) && (length(select_first([artifact_modes, ["G/T", "C/T"]])) > 0)
    Boolean? run_orientation_bias_mixture_model_filter
    Boolean run_ob_mm_filter = select_first([run_orientation_bias_mixture_model_filter, false])
    File? ob_mm_filter_training_intervals
    Array[String]? artifact_modes
    File? tumor_sequencing_artifact_metrics
    String? m2_extra_args
    String? m2_extra_filtering_args
    String? split_intervals_extra_args
    Boolean? make_bamout
    Boolean make_bamout_or_default = select_first([make_bamout, false])
    Boolean? compress_vcfs
    Boolean compress = select_first([compress_vcfs, false])
    File? gga_vcf
    File? gga_vcf_idx

    # oncotator inputs
    Boolean? run_oncotator
    Boolean run_oncotator_or_default = select_first([run_oncotator, false])
    File? onco_ds_tar_gz
    String? onco_ds_local_db_dir
    String? sequencing_center
    String? sequence_source
    File? default_config_file
    String? oncotator_extra_args

    # funcotator inputs
    Boolean? run_funcotator
    Boolean run_funcotator_or_default = select_first([run_funcotator, false])
    String? funco_reference_version
    File? funco_data_sources_tar_gz
    String? funco_transcript_selection_mode
    File? funco_transcript_selection_list
    Array[String]? funco_annotation_defaults
    Array[String]? funco_annotation_overrides
    Array[String]? funcotator_excluded_fields
    String? funcotator_extra_args

    File? gatk_override

    # runtime
    String gatk_docker
    String basic_bash_docker = "ubuntu:16.04"
    String? oncotator_docker
    String oncotator_docker_or_default = select_first([oncotator_docker, "broadinstitute/oncotator:1.9.9.0"])
    Boolean? filter_oncotator_maf
    Boolean filter_oncotator_maf_or_default = select_first([filter_oncotator_maf, true])
    Boolean? filter_funcotations
    Boolean filter_funcotations_or_default = select_first([filter_funcotations, true])

    Int? preemptible_attempts
    Int? max_retries

    # Use as a last resort to increase the disk given to every task in case of ill behaving data
    Int? emergency_extra_disk

    # Disk sizes used for dynamic sizing
    Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_dict, "GB") + size(ref_fai, "GB"))
    Int tumor_bam_size = ceil(size(tumor_bam, "GB") + size(tumor_bai, "GB"))
    Int gnomad_vcf_size = if defined(gnomad) then ceil(size(gnomad, "GB") + size(gnomad_index, "GB")) else 0
    Int normal_bam_size = if defined(normal_bam) then ceil(size(normal_bam, "GB") + size(normal_bai, "GB")) else 0

    # If no tar is provided, the task downloads one from broads ftp server
    Int onco_tar_size = if defined(onco_ds_tar_gz) then ceil(size(onco_ds_tar_gz, "GB") * 3) else 100
    Int funco_tar_size = if defined(funco_data_sources_tar_gz) then ceil(size(funco_data_sources_tar_gz, "GB") * 3) else 100
    Int gatk_override_size = if defined(gatk_override) then ceil(size(gatk_override, "GB")) else 0

    # This is added to every task as padding, should increase if systematically you need more disk for every call
    Int disk_pad = 10 + gatk_override_size + select_first([emergency_extra_disk,0])

    # These are multipliers to multiply inputs by to make sure we have enough disk to accommodate for possible output sizes
    # Large is for Bams/WGS vcfs
    # Small is for metrics/other vcfs
    Float large_input_to_output_multiplier = 2.25
    Float small_input_to_output_multiplier = 2.0

    # logic about output file names -- these are the names *without* .vcf extensions
    String output_basename = basename(tumor_bam, ".bam")
    String unfiltered_name = output_basename + "-unfiltered"
    String filtered_name = output_basename + "-filtered"
    String funcotated_name = output_basename + "-funcotated"

    String output_vcf_name = basename(tumor_bam, ".bam") + ".vcf"

    call SplitIntervals {
        input:
            intervals = intervals,
            ref_fasta = ref_fasta,
            ref_fai = ref_fai,
            ref_dict = ref_dict,
            scatter_count = scatter_count,
            split_intervals_extra_args = split_intervals_extra_args,
            gatk_override = gatk_override,
            gatk_docker = gatk_docker,
            preemptible_attempts = preemptible_attempts,
            max_retries = max_retries,
            disk_space = ref_size + ceil(size(intervals, "GB") * small_input_to_output_multiplier) + disk_pad
    }

    Int m2_output_size = tumor_bam_size / scatter_count

    scatter (subintervals in SplitIntervals.interval_files ) {
        call M2 {
            input:
                intervals = subintervals,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                tumor_bam = tumor_bam,
                tumor_bai = tumor_bai,
                normal_bam = normal_bam,
                normal_bai = normal_bai,
                pon = pon,
                pon_index = pon_index,
                gnomad = gnomad,
                gnomad_index = gnomad_index,
                preemptible_attempts = preemptible_attempts,
                max_retries = max_retries,
                m2_extra_args = m2_extra_args,
                make_bamout = make_bamout_or_default,
                artifact_prior_table = LearnReadOrientationModel.artifact_prior_table,
                compress = compress,
                gga_vcf = gga_vcf,
                gga_vcf_idx = gga_vcf_idx,
                gatk_override = gatk_override,
                gatk_docker = gatk_docker,
                disk_space = tumor_bam_size + normal_bam_size + ref_size + gnomad_vcf_size + m2_output_size + disk_pad
        }

        Float sub_vcf_size = size(M2.unfiltered_vcf, "GB")
        Float sub_bamout_size = size(M2.output_bamOut, "GB")
    }

    call SumFloats as SumSubVcfs {
        input:
            sizes = sub_vcf_size,
            preemptible_attempts = preemptible_attempts,
            max_retries = max_retries
    }

    call MergeVCFs {
        input:
            input_vcfs = M2.unfiltered_vcf,
            input_vcf_indices = M2.unfiltered_vcf_index,
            output_name = unfiltered_name,
            compress = compress,
            gatk_override = gatk_override,
            gatk_docker = gatk_docker,
            preemptible_attempts = preemptible_attempts,
            max_retries = max_retries,
            disk_space = ceil(SumSubVcfs.total_size * large_input_to_output_multiplier) + disk_pad
    }

    if (make_bamout_or_default) {
        call SumFloats as SumSubBamouts {
            input:
                sizes = sub_bamout_size,
                preemptible_attempts = preemptible_attempts,
                max_retries = max_retries
        }

        call MergeBamOuts {
            input:
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                bam_outs = M2.output_bamOut,
                output_vcf_name = basename(MergeVCFs.merged_vcf, ".vcf"),
                gatk_override = gatk_override,
                gatk_docker = gatk_docker,
                disk_space = ceil(SumSubBamouts.total_size * large_input_to_output_multiplier) + disk_pad,
                max_retries = max_retries
        }
    }

    if (run_ob_filter && !defined(tumor_sequencing_artifact_metrics)) {
        call CollectSequencingArtifactMetrics {
            input:
                gatk_docker = gatk_docker,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                preemptible_attempts = preemptible_attempts,
                max_retries = max_retries,
                tumor_bam = tumor_bam,
                tumor_bai = tumor_bai,
                gatk_override = gatk_override,
                disk_space = tumor_bam_size + ref_size + disk_pad
        }
    }

    if (run_ob_mm_filter) {
        call CollectF1R2Counts {
            input:
                gatk_docker = gatk_docker,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                preemptible_attempts = preemptible_attempts,
                tumor_bam = tumor_bam,
                tumor_bai = tumor_bai,
                gatk_override = gatk_override,
                disk_space = tumor_bam_size + ref_size + disk_pad,
                intervals = if defined(ob_mm_filter_training_intervals) then ob_mm_filter_training_intervals else intervals,
                max_retries = max_retries
        }

        call LearnReadOrientationModel {
            input:
                alt_table = CollectF1R2Counts.alt_table,
                ref_histogram = CollectF1R2Counts.ref_histogram,
                alt_histograms = CollectF1R2Counts.alt_histograms,
                tumor_sample = CollectF1R2Counts.tumor_sample,
                gatk_override = gatk_override,
                gatk_docker = gatk_docker,
                preemptible_attempts = preemptible_attempts,
                max_retries = max_retries
        }
    }

    if (defined(variants_for_contamination)) {
        call CalculateContamination {
            input:
                gatk_override = gatk_override,
                intervals = intervals,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                preemptible_attempts = preemptible_attempts,
                max_retries = max_retries,
                gatk_docker = gatk_docker,
                tumor_bam = tumor_bam,
                tumor_bai = tumor_bai,
                normal_bam = normal_bam,
                normal_bai = normal_bai,
                variants_for_contamination = variants_for_contamination,
                variants_for_contamination_index = variants_for_contamination_index,
                disk_space = tumor_bam_size + normal_bam_size + ceil(size(variants_for_contamination, "GB") * small_input_to_output_multiplier) + disk_pad
        }
    }

    call Filter {
        input:
            gatk_override = gatk_override,
            gatk_docker = gatk_docker,
            intervals = intervals,
            unfiltered_vcf = MergeVCFs.merged_vcf,
            unfiltered_vcf_index = MergeVCFs.merged_vcf_index,
            output_name = filtered_name,
            compress = compress,
            preemptible_attempts = preemptible_attempts,
            max_retries = max_retries,
            contamination_table = CalculateContamination.contamination_table,
            maf_segments = CalculateContamination.maf_segments,
            m2_extra_filtering_args = m2_extra_filtering_args,
            disk_space = ceil(size(MergeVCFs.merged_vcf, "GB") * small_input_to_output_multiplier) + disk_pad
    }

    if (run_ob_filter) {
        # Get the metrics either from the workflow input or CollectSequencingArtifactMetrics if no workflow input is provided
        File input_artifact_metrics = select_first([tumor_sequencing_artifact_metrics, CollectSequencingArtifactMetrics.pre_adapter_metrics])

        call FilterByOrientationBias {
            input:
                gatk_override = gatk_override,
                input_vcf = Filter.filtered_vcf,
                input_vcf_index = Filter.filtered_vcf_index,
                output_name = filtered_name,
                compress = compress,
                gatk_docker = gatk_docker,
                preemptible_attempts = preemptible_attempts,
                max_retries = max_retries,
                pre_adapter_metrics = input_artifact_metrics,
                artifact_modes = artifact_modes,
                disk_space = ceil(size(Filter.filtered_vcf, "GB") * small_input_to_output_multiplier) + ceil(size(input_artifact_metrics, "GB")) + disk_pad
        }
    }

    if (defined(realignment_index_bundle)) {
        File realignment_filter_input = select_first([FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])
        File realignment_filter_input_idx = select_first([FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index])

        call FilterAlignmentArtifacts {
            input:
                gatk_override = gatk_override,
                bam = tumor_bam,
                bai = tumor_bai,
                realignment_index_bundle = select_first([realignment_index_bundle]),
                realignment_extra_args = realignment_extra_args,
                gatk_docker = gatk_docker,
                max_retries = max_retries,
                compress = compress,
                output_name = filtered_name,
                input_vcf = realignment_filter_input,
                input_vcf_idx = realignment_filter_input_idx
        }
    }

    if (run_oncotator_or_default) {
        File oncotate_vcf_input = select_first([FilterAlignmentArtifacts.filtered_vcf, FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])

        call oncotate_m2 {
            input:
                m2_vcf = oncotate_vcf_input,
                onco_ds_tar_gz = onco_ds_tar_gz,
                onco_ds_local_db_dir = onco_ds_local_db_dir,
                sequencing_center = sequencing_center,
                sequence_source = sequence_source,
                default_config_file = default_config_file,
                case_id = M2.tumor_sample[0],
                control_id = M2.normal_sample[0],
                oncotator_docker = oncotator_docker_or_default,
                preemptible_attempts = preemptible_attempts,
                max_retries = max_retries,
                disk_space = ceil(size(oncotate_vcf_input, "GB") * large_input_to_output_multiplier) + onco_tar_size + disk_pad,
                filter_maf = filter_oncotator_maf_or_default,
                oncotator_extra_args = oncotator_extra_args
        }
    }

    if (run_funcotator_or_default) {
        File funcotate_vcf_input = select_first([FilterAlignmentArtifacts.filtered_vcf, FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])
        File funcotate_vcf_input_index = select_first([FilterAlignmentArtifacts.filtered_vcf_index, FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index])

        call FuncotateMaf {
            input:
                input_vcf = funcotate_vcf_input,
                input_vcf_idx = funcotate_vcf_input_index,
                ref_fasta = ref_fasta,
                ref_fasta_index = ref_fai,
                ref_dict = ref_dict,
                reference_version = select_first([funco_reference_version, "hg19"]),
                data_sources_tar_gz = funco_data_sources_tar_gz,
                case_id = M2.tumor_sample[0],
                control_id = M2.normal_sample[0],
                transcript_selection_mode = funco_transcript_selection_mode,
                transcript_selection_list = funco_transcript_selection_list,
                annotation_defaults = funco_annotation_defaults,
                annotation_overrides = funco_annotation_overrides,
                gatk_docker = gatk_docker,
                gatk_override = gatk_override,
                filter_funcotations = filter_funcotations_or_default,
                funcotator_excluded_fields = funcotator_excluded_fields,
                sequencing_center = sequencing_center,
                sequence_source = sequence_source,
                # Size the disk from the Funcotator datasources tarball (funco_tar_size), not the Oncotator one
                disk_space_gb = ceil(size(funcotate_vcf_input, "GB") * large_input_to_output_multiplier) + funco_tar_size + disk_pad,
                max_retries = max_retries,
                extra_args = funcotator_extra_args
        }
    }

    output {
        # The final VCF comes from the last filtering step that actually ran
        File filtered_vcf = select_first([FilterAlignmentArtifacts.filtered_vcf, FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])
        File filtered_vcf_index = select_first([FilterAlignmentArtifacts.filtered_vcf_index, FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index])
        File? contamination_table = CalculateContamination.contamination_table

        File? oncotated_m2_maf = oncotate_m2.oncotated_m2_maf
        File? funcotated_maf = FuncotateMaf.funcotated_output
        File? preadapter_detail_metrics = CollectSequencingArtifactMetrics.pre_adapter_metrics
        File? bamout = MergeBamOuts.merged_bam_out
        File? bamout_index = MergeBamOuts.merged_bam_out_index
        File? maf_segments = CalculateContamination.maf_segments
    }
}

# Splits the (optional) intervals into scatter_count interval files with GATK SplitIntervals.
task SplitIntervals {
    # inputs
    File? intervals
    File ref_fasta
    File ref_fai
    File ref_dict
    Int scatter_count
    String? split_intervals_extra_args

    File? gatk_override

    # runtime
    String gatk_docker
    Int? mem
    Int? preemptible_attempts
    Int? max_retries
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 3500
    Int command_mem = machine_mem - 500

    command {
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        mkdir interval-files
        gatk --java-options "-Xmx${command_mem}m" SplitIntervals \
            -R ${ref_fasta} \
            ${"-L " + intervals} \
            -scatter ${scatter_count} \
            -O interval-files \
            ${split_intervals_extra_args}
        cp interval-files/*.intervals .
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 0])
        cpu: select_first([cpu, 1])
    }

    output {
        Array[File] interval_files = glob("*.intervals")
    }
}

# Runs Mutect2 on one interval shard; also records the tumor (and, if present, normal) sample names.
task M2 {
    # inputs
    File? intervals
    File ref_fasta
    File ref_fai
    File ref_dict
    File tumor_bam
    File tumor_bai
    File? normal_bam
    File? normal_bai
    File? pon
    File? pon_index
    File? gnomad
    File? gnomad_index
    String? m2_extra_args
    Boolean? make_bamout
    Boolean compress
    File? gga_vcf
    File? gga_vcf_idx
    File? artifact_prior_table

    String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"

    File? gatk_override

    # runtime
    String gatk_docker
    Int? mem
    Int? preemptible_attempts
    Int? max_retries
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 3500
    Int command_mem = machine_mem - 500

    command <<<
        set -e

        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        # We need to create these files regardless, even if they stay empty
        touch bamout.bam
        echo "" > normal_name.txt

        gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${tumor_bam} -O tumor_name.txt -encode
        tumor_command_line="-I ${tumor_bam} -tumor `cat tumor_name.txt`"

        if [[ -f "${normal_bam}" ]]; then
            gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${normal_bam} -O normal_name.txt -encode
            normal_command_line="-I ${normal_bam} -normal `cat normal_name.txt`"
        fi

        gatk --java-options "-Xmx${command_mem}m" Mutect2 \
            -R ${ref_fasta} \
            $tumor_command_line \
            $normal_command_line \
            ${"--germline-resource " + gnomad} \
            ${"-pon " + pon} \
            ${"-L " + intervals} \
            ${"--genotyping-mode GENOTYPE_GIVEN_ALLELES --alleles " + gga_vcf} \
            -O "${output_vcf}" \
            ${true='--bam-output bamout.bam' false='' make_bamout} \
            ${"--orientation-bias-artifact-priors " + artifact_prior_table} \
            ${m2_extra_args}
    >>>

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        File unfiltered_vcf = "${output_vcf}"
        File unfiltered_vcf_index = "${output_vcf_index}"
        File output_bamOut = "bamout.bam"
        String tumor_sample = read_string("tumor_name.txt")
        String normal_sample = read_string("normal_name.txt")
    }
}

# Merges the per-shard VCFs into a single indexed VCF.
task MergeVCFs {
    # inputs
    Array[File] input_vcfs
    Array[File] input_vcf_indices
    String output_name
    Boolean compress
    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"

    File? gatk_override

    # runtime
    String gatk_docker
    Int? mem
    Int? preemptible_attempts
    Int? max_retries
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 3500
    Int command_mem = machine_mem - 1000

    # using MergeVcfs instead of GatherVcfs so we can create indices
    # WARNING 2015-10-28 15:01:48 GatherVcfs Index creation not currently supported when gathering block compressed VCFs.
    command {
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
        gatk --java-options "-Xmx${command_mem}m" MergeVcfs -I ${sep=' -I ' input_vcfs} -O ${output_vcf}
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        File merged_vcf = "${output_vcf}"
        File merged_vcf_index = "${output_vcf_index}"
    }
}

# Gathers the per-shard bamouts, coordinate-sorts the result and builds its index.
task MergeBamOuts {
    # inputs
    File ref_fasta
    File ref_fai
    File ref_dict
    Array[File]+ bam_outs
    String output_vcf_name

    File? gatk_override

    # runtime
    String gatk_docker
    Int? mem
    Int? preemptible_attempts
    Int? max_retries
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 7000
    Int command_mem = machine_mem - 1000

    command <<<
        # This command block assumes that there is at least one file in bam_outs.
        #  Do not call this task if len(bam_outs) == 0
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
        gatk --java-options "-Xmx${command_mem}m" GatherBamFiles \
            -I ${sep=" -I " bam_outs} -O unsorted.out.bam -R ${ref_fasta}

        # We must sort because adjacent scatters may have overlapping (padded) assembly regions, hence
        # overlapping bamouts
        gatk --java-options "-Xmx${command_mem}m" SortSam -I unsorted.out.bam \
            -O ${output_vcf_name}.out.bam \
            --SORT_ORDER coordinate -VALIDATION_STRINGENCY LENIENT
        gatk --java-options "-Xmx${command_mem}m" BuildBamIndex -I ${output_vcf_name}.out.bam -VALIDATION_STRINGENCY LENIENT
    >>>

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        File merged_bam_out = "${output_vcf_name}.out.bam"
        File merged_bam_out_index = "${output_vcf_name}.out.bai"
    }
}

# Collects sequencing artifact metrics on the tumor BAM; the pre-adapter detail metrics feed FilterByOrientationBias.
task CollectSequencingArtifactMetrics {
    # inputs
    File ref_fasta
    File ref_fai
    File tumor_bam
    File tumor_bai

    File? gatk_override

    # runtime
    String gatk_docker
    Int? mem
    Int? preemptible_attempts
    Int? max_retries
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 7000
    Int command_mem = machine_mem - 1000

    command {
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
        gatk --java-options "-Xmx${command_mem}m" CollectSequencingArtifactMetrics \
            -I ${tumor_bam} -O "gatk" -R ${ref_fasta} -VALIDATION_STRINGENCY LENIENT
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        File pre_adapter_metrics = "gatk.pre_adapter_detail_metrics"
    }
}

# Data collection step of the orientation bias mixture model, which is the recommended orientation bias filter as of September 2018
task CollectF1R2Counts {
    # input
    File ref_fasta
    File ref_fai
    File ref_dict
    File tumor_bam
    File tumor_bai

    File? gatk_override
    File? intervals

    # runtime
    Int? max_retries
    String gatk_docker
    Int? mem
    Int? preemptible_attempts
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 7000
    Int command_mem = machine_mem - 1000

    command {
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        # Get the sample name. The task M2 retrieves this information too, but it must be done separately here
        # to avoid a cyclic dependency
        gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${tumor_bam} -O tumor_name.txt -encode
        tumor_name=$(head -n 1 tumor_name.txt)

        gatk --java-options "-Xmx${command_mem}m" CollectF1R2Counts \
            -I ${tumor_bam} -R ${ref_fasta} \
            ${"-L " + intervals} \
            -alt-table "$tumor_name-alt.tsv" \
            -ref-hist "$tumor_name-ref.metrics" \
            -alt-hist "$tumor_name-alt-depth1.metrics"
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        File alt_table = glob("*-alt.tsv")[0]
        File ref_histogram = glob("*-ref.metrics")[0]
        File alt_histograms = glob("*-alt-depth1.metrics")[0]
        String tumor_sample = read_string("tumor_name.txt")
    }
}

# Learning step of the orientation bias mixture model, which is the recommended orientation bias filter as of September 2018
task LearnReadOrientationModel {
    File alt_table
    File ref_histogram
    File? alt_histograms

    File? gatk_override
    File? intervals
    String tumor_sample

    # runtime
    Int? max_retries
    String gatk_docker
    Int? mem
    Int? preemptible_attempts
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 8000
    Int command_mem = machine_mem - 1000

    # alt_histograms is optional, so its flag is emitted only when a file is supplied;
    # otherwise a dangling "-alt-hist" with no value would be passed to the tool.
    command {
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        gatk --java-options "-Xmx${command_mem}m" LearnReadOrientationModel \
            -alt-table ${alt_table} \
            -ref-hist ${ref_histogram} \
            ${"-alt-hist " + alt_histograms} \
            -O "${tumor_sample}-artifact-prior-table.tsv"
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        File artifact_prior_table = "${tumor_sample}-artifact-prior-table.tsv"
    }
}

# Builds pileup summaries for the tumor (and the normal, when given) and estimates contamination from them.
task CalculateContamination {
    # inputs
    File? intervals
    File ref_fasta
    File ref_fai
    File ref_dict
    File tumor_bam
    File tumor_bai
    File? normal_bam
    File? normal_bai
    File? variants_for_contamination
    File? variants_for_contamination_index

    File? gatk_override

    # runtime
    Int? preemptible_attempts
    Int? max_retries
    String gatk_docker
    Int? disk_space
    Int? mem

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 3000
    Int command_mem = machine_mem - 500

    command {
        set -e

        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        if [[ -f "${normal_bam}" ]]; then
            gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -I ${normal_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \
                -V ${variants_for_contamination} -L ${variants_for_contamination} -O normal_pileups.table
            NORMAL_CMD="-matched normal_pileups.table"
        fi

        gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \
            -V ${variants_for_contamination} -L ${variants_for_contamination} -O pileups.table
        gatk --java-options "-Xmx${command_mem}m" CalculateContamination -I pileups.table -O contamination.table --tumor-segmentation segments.table $NORMAL_CMD
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        # Request the full machine memory; command_mem is only the JVM heap (-Xmx) and needs headroom above it
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
    }

    output {
        File pileups = "pileups.table"
        File contamination_table = "contamination.table"
        File maf_segments = "segments.table"
    }
}

# Runs FilterMutectCalls on the merged VCF, using the contamination table and tumor segmentation when available.
task Filter {
    # inputs
    File? intervals
    File unfiltered_vcf
    File unfiltered_vcf_index
    String output_name
    Boolean compress
    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"
    File? contamination_table
    File? maf_segments
    String? m2_extra_filtering_args

    File? gatk_override

    # runtime
    String gatk_docker
    Int? mem
    Int? preemptible_attempts
    Int? max_retries
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 7000
    Int command_mem = machine_mem - 500

    command {
        set -e

        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        gatk --java-options "-Xmx${command_mem}m" FilterMutectCalls -V ${unfiltered_vcf} \
            -O ${output_vcf} \
            ${"--contamination-table " + contamination_table} \
            ${"--tumor-segmentation " + maf_segments} \
            ${m2_extra_filtering_args}
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        File filtered_vcf = "${output_vcf}"
        File filtered_vcf_index = "${output_vcf_index}"
    }
}

# Applies the D-ToxoG-style orientation bias filter using pre-adapter metrics and the chosen artifact modes.
task FilterByOrientationBias {
    # input
    File? gatk_override
    File input_vcf
    File input_vcf_index
    String output_name
    Boolean compress
    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"
    File pre_adapter_metrics
    Array[String]? artifact_modes

    # If artifact modes is passed in to the task as [], this task will fail.
    Array[String] final_artifact_modes = select_first([artifact_modes, ["G/T", "C/T"]])

    # runtime
    Int? preemptible_attempts
    Int? max_retries
    String gatk_docker
    Int? disk_space
    Int? mem
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 7000
    Int command_mem = machine_mem - 500

    command {
        set -e

        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        gatk --java-options "-Xmx${command_mem}m" FilterByOrientationBias \
            -V ${input_vcf} \
            -AM ${sep=" -AM " final_artifact_modes} \
            -P ${pre_adapter_metrics} \
            -O ${output_vcf}
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        # Request the full machine memory; command_mem is only the JVM heap (-Xmx) and needs headroom above it
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        File filtered_vcf = "${output_vcf}"
        File filtered_vcf_index = "${output_vcf_index}"
    }
}

# Runs FilterAlignmentArtifacts on the filtered VCF against the tumor BAM using a BWA-MEM index image.
task FilterAlignmentArtifacts {
    #input
    File? gatk_override
    File input_vcf
    File input_vcf_idx
    File bam
    File bai
    String output_name
    Boolean compress
    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"
    File realignment_index_bundle
    String? realignment_extra_args

    # runtime
    String gatk_docker
    Int? mem
    Int? preemptible_attempts
    Int? max_retries
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 9000
    Int command_mem = machine_mem - 500

    command {
        set -e

        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        gatk --java-options "-Xmx${command_mem}m" FilterAlignmentArtifacts \
            -V ${input_vcf} \
            -I ${bam} \
            --bwa-mem-index-image ${realignment_index_bundle} \
            ${realignment_extra_args} \
            -O ${output_vcf}
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        # Request the full machine memory; command_mem is only the JVM heap (-Xmx) and needs headroom above it
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        File filtered_vcf = "${output_vcf}"
        File filtered_vcf_index = "${output_vcf_index}"
    }
}

task oncotate_m2 {
    # inputs
    File m2_vcf
    File? onco_ds_tar_gz
    String? onco_ds_local_db_dir
    String? oncotator_exe
    String? sequencing_center
    String? sequence_source
    File? default_config_file
    String case_id
    String? control_id
    String? oncotator_extra_args

    # runtime
    String oncotator_docker
    Int? mem
    Int? preemptible_attempts
    Int? max_retries
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    Boolean? filter_maf
    Boolean is_filter_maf = select_first([filter_maf, true])
    String filter_maf_args = if (is_filter_maf) then " --collapse-filter-cols --prune-filter-cols " else ""

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 3500
    Int command_mem = machine_mem - 500

    command <<<
        # fail if *any* command below (not just the last) doesn't return 0, in particular if wget fails
        set -e

        # local db dir is a directory and has been specified
        if [[ -d "${onco_ds_local_db_dir}" ]]; then
            echo "Using local db-dir: ${onco_ds_local_db_dir}"
            echo "THIS ONLY WORKS WITHOUT DOCKER!"
ln -s ${onco_ds_local_db_dir} onco_dbdir elif [[ "${onco_ds_tar_gz}" == *.tar.gz ]]; then echo "Using given tar file: ${onco_ds_tar_gz}" mkdir onco_dbdir tar zxvf ${onco_ds_tar_gz} -C onco_dbdir --strip-components 1 else echo "Downloading and installing oncotator datasources from Broad FTP site..." # Download and untar the db-dir wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/oncotator/oncotator_v1_ds_April052016.tar.gz tar zxvf oncotator_v1_ds_April052016.tar.gz ln -s oncotator_v1_ds_April052016 onco_dbdir fi ${default="/root/oncotator_venv/bin/oncotator" oncotator_exe} --db-dir onco_dbdir/ -c $HOME/tx_exact_uniprot_matches.AKT1_CRLF2_FGFR1.txt \ -v ${m2_vcf} ${case_id}.maf.annotated hg19 -i VCF -o TCGAMAF --skip-no-alt --collapse-number-annotations --log_name oncotator.log \ -a Center:${default="Unknown" sequencing_center} \ -a source:${default="Unknown" sequence_source} \ -a normal_barcode:${control_id} \ -a tumor_barcode:${case_id} \ ${"--default_config " + default_config_file} \ ${filter_maf_args} \ ${oncotator_extra_args} >>> runtime { docker: oncotator_docker memory: machine_mem + " MB" bootDiskSizeGb: 12 disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) maxRetries: select_first([max_retries, 3]) cpu: select_first([cpu, 1]) } output { File oncotated_m2_maf="${case_id}.maf.annotated" } } # Calculates sum of a list of floats task SumFloats { Array[Float] sizes # Runtime parameters Int? preemptible_attempts Int? 
max_retries command <<< python -c "print ${sep="+" sizes}" >>> output { Float total_size = read_float(stdout()) } runtime { docker: "python:2.7" disks: "local-disk " + 10 + " HDD" preemptible: select_first([preemptible_attempts, 10]) maxRetries: select_first([max_retries, 3]) } } task FuncotateMaf { # inputs File ref_fasta File ref_fasta_index File ref_dict File input_vcf File input_vcf_idx String reference_version String output_format = "MAF" String? sequencing_center String? sequence_source String case_id String? control_id File? data_sources_tar_gz String? transcript_selection_mode File? transcript_selection_list Array[String]? annotation_defaults Array[String]? annotation_overrides Array[String]? funcotator_excluded_fields Boolean filter_funcotations File? interval_list String? extra_args # ============== # Process input args: String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else "" String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else "" String filter_funcotations_args = if (filter_funcotations) then " --remove-filtered-variants " else "" String excluded_fields_args = if defined(funcotator_excluded_fields) then " --exclude-field " else "" String final_output_filename = basename(input_vcf, ".vcf") + ".maf.annotated" # ============== # runtime String gatk_docker File? gatk_override Int? mem Int? preemptible_attempts Int? max_retries Int? disk_space_gb Int? cpu Boolean use_ssd = false # This should be updated when a new version of the data sources is released String default_datasources_version = "funcotator_dataSources.v1.4.20180615" # You may have to change the following two parameter values depending on the task requirements Int default_ram_mb = 3000 # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). 
Int default_disk_space_gb = 100 # Mem is in units of GB but our command and memory runtime values are in MB Int machine_mem = if defined(mem) then mem *1000 else default_ram_mb Int command_mem = machine_mem - 1000 command <<< set -e export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} DATA_SOURCES_TAR_GZ=${data_sources_tar_gz} if [[ ! -e $DATA_SOURCES_TAR_GZ ]] ; then # We have to download the data sources: echo "Data sources gzip does not exist: $DATA_SOURCES_TAR_GZ" echo "Downloading default data sources..." wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/funcotator/${default_datasources_version}.tar.gz tar -zxf ${default_datasources_version}.tar.gz DATA_SOURCES_FOLDER=${default_datasources_version} else # Extract the tar.gz: mkdir datasources_dir tar zxvf ${data_sources_tar_gz} -C datasources_dir --strip-components 1 DATA_SOURCES_FOLDER="$PWD/datasources_dir" fi gatk --java-options "-Xmx${command_mem}m" Funcotator \ --data-sources-path $DATA_SOURCES_FOLDER \ --ref-version ${reference_version} \ --output-file-format ${output_format} \ -R ${ref_fasta} \ -V ${input_vcf} \ -O ${final_output_filename} \ ${"-L " + interval_list} \ ${"--transcript-selection-mode " + transcript_selection_mode} \ ${"--transcript-list " + transcript_selection_list} \ --annotation-default normal_barcode:${control_id} \ --annotation-default tumor_barcode:${case_id} \ --annotation-default Center:${default="Unknown" sequencing_center} \ --annotation-default source:${default="Unknown" sequence_source} \ ${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \ ${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides} \ ${excluded_fields_args}${default="" sep=" --exclude-field " funcotator_excluded_fields} \ ${filter_funcotations_args} \ ${extra_args} >>> runtime { docker: gatk_docker bootDiskSizeGb: 20 memory: machine_mem + " MB" disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if 
use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 3]) maxRetries: select_first([max_retries, 3]) cpu: select_first([cpu, 1]) } output { File funcotated_output = "${final_output_filename}" } }