#!/usr/bin/env cwl-runner

cwlVersion: v1.0
class: Workflow
requirements:
  ScatterFeatureRequirement: {}
  SchemaDefRequirement:
    types:
      - $import: ../types/FASTQReadPairType.yml
inputs:
  # Intervals should come from capture kit in bed format
  intervals: File[]?
  interval_padding: int?
  # target intervals in picard interval_list format (created from intervals bed file)
  target_interval_list: File
  # bait intervals in picard interval_list format
  bait_interval_list: File
  # Read samples, fastq format
  # NOTE: GATK best practices recommends unmapped SAM/BAM files
  read_pair:
    type: ../types/FASTQReadPairType.yml#FASTQReadPairType
  # reference genome, fasta
  # NOTE: GATK can't handle compressed fasta reference genome
  reference_genome:
    type: File
    secondaryFiles:
    - .amb
    - .ann
    - .bwt
    - .pac
    - .sa
    - .fai
    - ^.dict
  # Number of threads to use for mapping
  threads: int
  # Read Group annotations
  # Can be the project name
  library: string
  # e.g. Illumina
  platform: string
  known_sites:
    type: File[] # vcf files of known sites, with indexing
    secondaryFiles:
    - .idx
  resource_dbsnp:
    type: File
    secondaryFiles:
    - .idx
outputs:
  fastqc_reports:
    type: File[]
    outputSource: qc/output_qc_report
  trim_reports:
    type: File[]
    outputSource: trim/trim_reports
  markduplicates_bam:
    type: File
    outputSource: mark_duplicates/output_dedup_bam_file
  # Recalibration
  recalibration_table:
    type: File
    outputSource: recalibrate_01_analyze/output_recalibration_report
  recalibrated_reads:
    type: File
    outputSource: recalibrate_02_apply_bqsr/output_recalibrated_bam
  raw_variants:
    type: File
    outputSource: variant_calling/output_variants
    doc: "VCF file from per sample variant calling"
  haplotypes_bam:
    type: File
    outputSource: variant_calling/output_bam
    doc: "BAM file containing assembled haplotypes and locally realigned reads"
steps:
  file_pair_details:
    run: ../tools/extract-named-file-pair-details.cwl
    in:
       read_pair: read_pair
       library: library
       platform: platform
    out:
       - reads
       - read_pair_name
       - read_group_header
  generate_sample_filenames:
    run: ../tools/generate-sample-filenames.cwl
    in:
      sample_name: file_pair_details/read_pair_name
    out:
      - combined_reads_output_filenames
      - mapped_reads_output_filename
      - sorted_reads_output_filename
      - dedup_reads_output_filename
      - dedup_metrics_output_filename
      - recal_reads_output_filename
      - recal_table_output_filename
      - raw_variants_output_filename
      - haplotypes_bam_output_filename
  combine_reads:
    run: ../tools/concat-gz-files.cwl
    requirements:
      - class: ResourceRequirement
        coresMin: 1
        ramMin: 1024
    scatter: [files, output_filename]
    scatterMethod: dotproduct
    in:
       files: file_pair_details/reads
       output_filename: generate_sample_filenames/combined_reads_output_filenames
    out:
       - output
  qc:
    run: ../tools/fastqc.cwl
    requirements:
      - class: ResourceRequirement
        coresMin: 4
        ramMin: 3072
    scatter: input_fastq_file
    in:
      input_fastq_file: combine_reads/output
      threads:
        default: 4
    out:
      - output_qc_report
  trim:
    run: ../tools/trim_galore.cwl
    requirements:
      - class: ResourceRequirement
        coresMin: 4
        ramMin: 8192
    in:
      reads: combine_reads/output
      paired:
        default: true
    out:
      - trimmed_reads
      - trim_reports
  map:
    run: ../tools/gitc-bwa-mem-samtools.cwl
    requirements:
      - class: ResourceRequirement
        coresMin: $(inputs.threads)
        ramMin: 16384
        outdirMin: 12288
        tmpdirMin: 12288
    in:
      reads: trim/trimmed_reads
      reference: reference_genome
      read_group_header: file_pair_details/read_group_header
      output_filename: generate_sample_filenames/mapped_reads_output_filename
      threads: threads
    out:
      - output
  sort:
    run: ../tools/GATK4-SortSam.cwl
    requirements:
      - class: ResourceRequirement
        ramMin: 16384
    in:
      input_file: map/output
      output_sorted_bam_filename: generate_sample_filenames/sorted_reads_output_filename
      sort_order: { default: "coordinate" }
      java_opt: { default: "-Xms4g" }
    out:
      - output_sorted_bam
  mark_duplicates:
    run: ../tools/GATK4-MarkDuplicates.cwl
    requirements:
      - class: ResourceRequirement
        ramMin: 16384
        outdirMin: 12288
        tmpdirMin: 12288
    in:
      input_file: sort/output_sorted_bam
      output_filename: generate_sample_filenames/dedup_reads_output_filename
      metrics_filename: generate_sample_filenames/dedup_metrics_output_filename
      validation_stringency: { default: "SILENT" }
      assume_sort_order: { default: "coordinate" }
      create_index: { default: "true" }
      optical_duplicate_pixel_distance: { default: 2500 }
      java_opt: { default: "-Xms4g" }
    out:
      - output_dedup_bam_file
      - output_metrics_file
  # Now recalibrate
  recalibrate_01_analyze:
    run: ../tools/GATK4-BaseRecalibrator.cwl
    requirements:
      - class: ResourceRequirement
        ramMin: 8192
    in:
      reference: reference_genome
      input_bam: mark_duplicates/output_dedup_bam_file
      output_recalibration_report_filename: generate_sample_filenames/recal_table_output_filename
      known_sites: known_sites
      intervals: intervals
      interval_padding: interval_padding
      java_opt: { default: "-Xms4g" }
    out:
      - output_recalibration_report
  recalibrate_02_apply_bqsr:
    run: ../tools/GATK4-ApplyBQSR.cwl
    requirements:
      - class: ResourceRequirement
        ramMin: 6144
    in:
      reference: reference_genome
      input_bam: mark_duplicates/output_dedup_bam_file
      output_recalibrated_bam_filename: generate_sample_filenames/recal_reads_output_filename
      intervals: intervals
      interval_padding: interval_padding
      bqsr_report: recalibrate_01_analyze/output_recalibration_report
      add_output_sam_program_record: { default: true }
      java_opt: { default: "-Xms3g" }
    out:
      - output_recalibrated_bam
  variant_calling:
    run: ../tools/GATK4-HaplotypeCaller.cwl
    requirements:
      - class: ResourceRequirement
        coresMin: 1
        ramMin: 14336
    in:
      reference: reference_genome
      input_bam: recalibrate_02_apply_bqsr/output_recalibrated_bam
      # Naming your output file using the .g.vcf extension will automatically set the appropriate values  for --variant_index_type and --variant_index_parameter
      output_variants_filename: generate_sample_filenames/raw_variants_output_filename
      output_bam_filename: generate_sample_filenames/haplotypes_bam_output_filename
      intervals: intervals
      interval_padding: interval_padding
      annotation_groups: { default: ['StandardAnnotation','AS_StandardAnnotation'] }
      emit_ref_confidence: { default: "GVCF" }
      java_opt: { default: "-Xms7g" }
    out:
      - output_variants
      - output_bam