#!/usr/bin/env cwl-runner

# Exome preprocessing and per-sample variant calling workflow.
# Pipeline order, as wired in `steps`: concatenate the gzipped FASTQ files,
# run FastQC and Trim Galore, map with bwa-mem, sort and mark duplicates with
# Picard, collect hybrid-selection metrics, recalibrate base qualities with
# GATK (BaseRecalibrator + PrintReads), then call variants with GATK
# HaplotypeCaller.
cwlVersion: v1.0
class: Workflow
requirements:
  # Needed because the combine_reads and qc steps use `scatter`.
  ScatterFeatureRequirement: {}
  # Imports the FASTQReadPairType definition used as the type of `read_pair`.
  SchemaDefRequirement:
    types:
      - $import: ../types/FASTQReadPairType.yml
inputs:
  # Intervals should come from the capture kit, in BED format.
  # Passed as `intervals` to the three GATK steps.
  intervals: File[]?
  # Target intervals in Picard interval_list format (created from the
  # intervals BED file); consumed by the collect_hs_metrics step.
  target_interval_list: File
  # Bait intervals in Picard interval_list format; consumed by the
  # collect_hs_metrics step.
  bait_interval_list: File
  # Optional interval padding value; forwarded to the BaseRecalibrator and
  # HaplotypeCaller steps.
  interval_padding: int?
  # Read samples, FASTQ format.
  # NOTE: GATK best practices recommend unmapped SAM/BAM files.
  read_pair:
    type: ../types/FASTQReadPairType.yml#FASTQReadPairType
  # Reference genome, FASTA format.
  # NOTE: GATK can't handle a compressed FASTA reference genome.
  reference_genome: File
  # Number of threads to use for mapping; the map step also uses it as its
  # coresMin.
  threads: int
  # Read group annotations.
  # Library: can be the project name.
  library: string
  # Platform, e.g. Illumina.
  platform: string
  # GATK jar file, handed to every GATK step.
  GATKJar: File
  # VCF files of known sites, with indexing; consumed by the
  # recalibrate_01_analyze (BaseRecalibrator) step.
  knownSites: File[]
  # dbSNP resource; in this workflow it is passed as `dbsnp` to the
  # variant_calling (HaplotypeCaller) step.
  resource_dbsnp: File
outputs:
  # Quality control and trimming
  fastqc_reports:
    type: File[]
    outputSource: qc/output_qc_report
    doc: "FastQC reports, one per combined FASTQ file"
  hs_metrics:
    type: File
    outputSource: collect_hs_metrics/output_hs_metrics_file
    doc: "Hybrid selection metrics from Picard CollectHsMetrics"
  trim_reports:
    type: File[]
    outputSource: trim/trim_reports
    doc: "Trimming reports from Trim Galore"
  # Alignment post-processing
  markduplicates_bam:
    type: File
    outputSource: mark_duplicates/output_dedup_bam_file
    doc: "BAM file produced by Picard MarkDuplicates"
  # Recalibration
  recalibration_table:
    type: File
    outputSource: recalibrate_01_analyze/output_baseRecalibrator
    doc: "Recalibration table from GATK BaseRecalibrator"
  recalibrated_reads:
    type: File
    outputSource: recalibrate_02_apply/output_printReads
    doc: "Reads file written by GATK PrintReads using the recalibration table"
  # Variant calling
  raw_variants:
    type: File
    outputSource: variant_calling/output_HaplotypeCaller
    doc: "VCF file from per sample variant calling"
  haplotypes_bam:
    type: File
    outputSource: variant_calling/output_HaplotypesBam
    doc: "BAM file containing assembled haplotypes and locally realigned reads"

steps:
  # Takes the read_pair input together with the library and platform
  # annotations and exposes the read files, the read pair name and a
  # read group header for the downstream steps.
  file_pair_details:
    run: ../tools/extract-named-file-pair-details.cwl
    in:
      read_pair: read_pair
      library: library
      platform: platform
    out:
      - reads
      - read_pair_name
      - read_group_header
  # Derives every intermediate and final output filename from the read pair
  # name, so each later step receives its output filename from here instead
  # of building one itself.
  generate_sample_filenames:
    run: ../tools/generate-sample-filenames.cwl
    in:
      sample_name: file_pair_details/read_pair_name
    out:
      # Consumed by combine_reads (scattered alongside the read files).
      - combined_reads_output_filenames
      # Consumed by map, sort and mark_duplicates respectively.
      - mapped_reads_output_filename
      - sorted_reads_output_filename
      - dedup_reads_output_filename
      - dedup_metrics_output_filename
      # Consumed by the two recalibration steps.
      - recal_reads_output_filename
      - recal_table_output_filename
      # Consumed by variant_calling.
      - raw_variants_output_filename
      - haplotypes_bam_output_filename
      # Consumed by collect_hs_metrics.
      - hs_metrics_output_filename
  # Concatenates gzipped read files. The dotproduct scatter pairs the i-th
  # entry of `files` with the i-th entry of `output_filename`, so both arrays
  # must have the same length.
  # NOTE(review): presumably one entry per read direction - confirm against
  # extract-named-file-pair-details.cwl.
  combine_reads:
    run: ../tools/concat-gz-files.cwl
    scatter: [files, output_filename]
    scatterMethod: dotproduct
    in:
      files: file_pair_details/reads
      output_filename: generate_sample_filenames/combined_reads_output_filenames
    out:
      - output
  # FastQC over the concatenated FASTQ files, one scattered job per file.
  # The default of 4 threads matches the 4 cores requested below.
  qc:
    run: ../tools/fastqc.cwl
    scatter: input_fastq_file
    in:
      input_fastq_file: combine_reads/output
      threads: {default: 4}
    out: [output_qc_report]
    requirements:
      - class: ResourceRequirement
        coresMin: 4
        ramMin: 2500
  # Trim Galore on the concatenated reads, with `paired` defaulted to true.
  # Produces the trimmed reads used for mapping plus the trimming reports.
  trim:
    run: ../tools/trim_galore.cwl
    in:
      reads: combine_reads/output
      paired: {default: true}
    out: [trimmed_reads, trim_reports]
    requirements:
      - class: ResourceRequirement
        coresMin: 4
        ramMin: 8000
  # Maps the trimmed reads against the reference with the bwa-mem/samtools
  # tool, tagging them with the read group header.
  # coresMin is a parameter reference to the `threads` input, so the core
  # reservation follows the thread count handed to the tool.
  map:
    run: ../tools/bwa-mem-samtools.cwl
    in:
      threads: threads
      reference: reference_genome
      reads: trim/trimmed_reads
      read_group_header: file_pair_details/read_group_header
      output_filename: generate_sample_filenames/mapped_reads_output_filename
    out: [output]
    requirements:
      - class: ResourceRequirement
        coresMin: $(inputs.threads)
        ramMin: 16000
        outdirMin: 12000
        tmpdirMin: 12000
  # Picard SortSam on the mapped reads.
  sort:
    run: ../tools/picard-SortSam.cwl
    in:
      output_filename: generate_sample_filenames/sorted_reads_output_filename
      input_file: map/output
    out: [sorted]
    requirements:
      - class: ResourceRequirement
        coresMin: 1
        ramMin: 4000
        outdirMin: 12000
        tmpdirMin: 12000
  # Picard MarkDuplicates on the sorted reads; emits the resulting BAM and a
  # metrics file.
  mark_duplicates:
    run: ../tools/picard-MarkDuplicates.cwl
    in:
      metrics_filename: generate_sample_filenames/dedup_metrics_output_filename
      output_filename: generate_sample_filenames/dedup_reads_output_filename
      input_file: sort/sorted
    out: [output_dedup_bam_file, output_metrics_file]
    requirements:
      - class: ResourceRequirement
        coresMin: 1
        ramMin: 4000
        outdirMin: 12000
        tmpdirMin: 12000
  # Picard CollectHsMetrics on the MarkDuplicates BAM, using the target and
  # bait interval lists supplied as workflow inputs.
  collect_hs_metrics:
    run: ../tools/picard-CollectHsMetrics.cwl
    in:
      output_filename: generate_sample_filenames/hs_metrics_output_filename
      bait_intervals: bait_interval_list
      target_intervals: target_interval_list
      reference_sequence: reference_genome
      input_file: mark_duplicates/output_dedup_bam_file
    out: [output_hs_metrics_file]
    requirements:
      - class: ResourceRequirement
        coresMin: 1
        ramMin: 4000
        outdirMin: 12000
        tmpdirMin: 12000
  # Recalibration, part 1: GATK BaseRecalibrator builds the recalibration
  # table from the MarkDuplicates BAM and the known sites.
  # The default of 8 cpu_threads matches the 8 cores requested below.
  recalibrate_01_analyze:
    run: ../tools/GATK-BaseRecalibrator.cwl
    in:
      GATKJar: GATKJar
      reference: reference_genome
      inputBam_BaseRecalibrator: mark_duplicates/output_dedup_bam_file
      knownSites: knownSites
      intervals: intervals
      interval_padding: interval_padding
      cpu_threads: {default: 8}
      outputfile_BaseRecalibrator: generate_sample_filenames/recal_table_output_filename
    out: [output_baseRecalibrator]
    requirements:
      - class: ResourceRequirement
        coresMin: 8
        ramMin: 4096
  # Recalibration, part 2: GATK PrintReads rewrites the MarkDuplicates BAM
  # using the table produced by recalibrate_01_analyze.
  # NOTE(review): unlike recalibrate_01_analyze and variant_calling, this
  # step is not given interval_padding - confirm that this is intentional.
  recalibrate_02_apply:
    run: ../tools/GATK-PrintReads.cwl
    in:
      GATKJar: GATKJar
      reference: reference_genome
      inputBam_printReads: mark_duplicates/output_dedup_bam_file
      input_baseRecalibrator: recalibrate_01_analyze/output_baseRecalibrator
      intervals: intervals
      cpu_threads: {default: 8}
      outputfile_printReads: generate_sample_filenames/recal_reads_output_filename
    out: [output_printReads]
    requirements:
      - class: ResourceRequirement
        coresMin: 8
        ramMin: 4096
  # Per-sample variant calling with GATK HaplotypeCaller on the recalibrated
  # reads. emitRefConfidence defaults to GVCF, and the assembled-haplotypes
  # BAM is written alongside the variants.
  variant_calling:
    run: ../tools/GATK-HaplotypeCaller.cwl
    requirements:
      - class: ResourceRequirement
        coresMin: 1
        ramMin: 16384
    in:
      GATKJar: GATKJar
      inputBam_HaplotypeCaller: recalibrate_02_apply/output_printReads
      intervals: intervals
      interval_padding: interval_padding
      reference: reference_genome
      group:
        default:
          - StandardAnnotation
          - AS_StandardAnnotation
      dbsnp: resource_dbsnp
      emitRefConfidence:
        default: "GVCF"
      # Naming your output file using the .g.vcf extension will automatically
      # set the appropriate values for --variant_index_type and
      # --variant_index_parameter.
      outputfile_HaplotypeCaller: generate_sample_filenames/raw_variants_output_filename
      bamOutput: generate_sample_filenames/haplotypes_bam_output_filename
    out:
      - output_HaplotypeCaller
      - output_HaplotypesBam