cwlVersion: v1.0
class: Workflow


requirements:
  - class: StepInputExpressionRequirement
  - class: InlineJavascriptRequirement
  - class: MultipleInputFeatureRequirement


'sd:upstream':
  first_biological_condition:
    - "chipseq-se.cwl"
    - "chipseq-pe.cwl"
    - "trim-chipseq-se.cwl"
    - "trim-chipseq-pe.cwl"
    - "trim-atacseq-se.cwl"
    - "trim-atacseq-pe.cwl"
  second_biological_condition:
    - "chipseq-se.cwl"
    - "chipseq-pe.cwl"
    - "trim-chipseq-se.cwl"
    - "trim-chipseq-pe.cwl"
    - "trim-atacseq-se.cwl"
    - "trim-atacseq-pe.cwl"
  genome_indices:
    - "genome-indices.cwl"


inputs:

  alias:
    type: string
    label: "Experiment short name/Alias"
    sd:preview:
      position: 1

  bambai_pair_cond_1:
    type: File[]
    secondaryFiles:
      - .bai
    format: "http://edamontology.org/format_2572"
    label: "Biological condition 1"
    doc: "Coordinate sorted BAM alignment and index BAI files for the first biological condition"
    'sd:upstreamSource': "first_biological_condition/bambai_pair"
    'sd:localLabel': true

  bambai_pair_cond_2:
    type: File[]
    secondaryFiles:
      - .bai
    format: "http://edamontology.org/format_2572"
    label: "Biological condition 2"
    doc: "Coordinate sorted BAM alignment and index BAI files for the second biological condition"
    'sd:upstreamSource': "second_biological_condition/bambai_pair"
    'sd:localLabel': true

  alias_cond_1:
    type: string?
    default: "condition_1"
    label: "Name for condition 1"
    doc: "Name to be displayed for condition 1"
    'sd:layout':
      advanced: true

  alias_cond_2:
    type: string?
    default: "condition_2"
    label: "Name for condition 2"
    doc: "Name to be displayed for condition 2"
    'sd:layout':
      advanced: true

  chrom_length_file:
    type: File
    format: "http://edamontology.org/format_2330"
    label: "Chromosome length file"
    doc: "Chromosome length file"
    'sd:upstreamSource': "genome_indices/chrom_length"

  annotation_file:
    type: File
    label: "Genome annotation"
    format: "http://edamontology.org/format_3475"
    doc: "Genome annotation file in TSV format"
    'sd:upstreamSource': "genome_indices/annotation"

  merge_peaks:
    type: boolean?
    default: true
    label: "Merge peaks closer than fragment size"
    doc: "Merge peaks which have a distance less than the estimated mean fragment size (recommended for histone data)"
    'sd:layout':
      advanced: true

  remove_duplicates:
    type: boolean?
    default: false
    label: "Remove the duplicate reads"
    doc: "Remove the duplicate reads"
    'sd:layout':
      advanced: true

  housekeeping_genes_bed_file:
    type: File?
    format: "http://edamontology.org/format_3003"
    label: "Housekeeping genes file"
    doc: "Define housekeeping genes (BED format) used for normalizing"
    'sd:layout':
     advanced: true

  deadzones_bed_file:
    type: File?
    format: "http://edamontology.org/format_3003"
    label: "Dead zones file"
    doc: "Define blacklisted genomic regions avoided for analysis"
    'sd:layout':
     advanced: true

  pvalue_cutoff:
    type: float?
    default: 0.1
    label: "P-value cutoff for peak detection"
    doc: "P-value cutoff for peak detection. Call only peaks with p-value lower than cutoff. [default: 0.1]"
    'sd:layout':
      advanced: true

  bin_size:
    type: int?
    default: 100
    label: "Size of underlying bins for creating the signal"
    doc: "Size of underlying bins for creating the signal"
    'sd:layout':
      advanced: true

  # no_correction:
  #   type: boolean?
  #   default: false
  #   label: "Skip p-value correction"
  #   doc: "Do not use multipe test correction for p-values (Benjamini/Hochberg)"
  #   'sd:layout':
  #     advanced: true

  extension_size:
    type:
      - "null"
      - string
      - int[]
    label: "Comma-separated list of read extension sizes (provide value for every sample)"
    doc: |
      Read's extension size for BAM files (comma separated list for each BAM file in config file).
      If option is not chosen, estimate extension sizes
    'sd:layout':
      advanced: true


outputs:

  diffpeaks_bed_file:
    type: File
    format: "http://edamontology.org/format_3004"
    label: "Estimated differential peaks"
    doc: "Estimated differential peaks, bigBed"
    outputSource: bed_to_bigbed/bigbed_file
    'sd:visualPlugins':
    - igvbrowser:
        tab: 'IGV Genome Browser'
        id: 'igvbrowser'
        type: 'annotation'
        format: 'bigbed'
        name: "Differential peaks"
        height: 120

  diffpeaks_annotated_file:
    type: File
    format: "http://edamontology.org/format_3475"
    label: "Estimated differential peaks with assigned genes"
    doc: "File contains nearest gene information for the differential peaks BED file generated by rgt-THOR"
    outputSource: restore_columns/output_file
    'sd:visualPlugins':
    - syncfusiongrid:
        tab: 'Differential Peak Calling'
        Title: 'Differential Peaks rgt-THOR Results'

  cond_1_bigwig_file:
    type: File[]
    format: "http://edamontology.org/format_3006"
    label: "First biological condition ChIP-seq signals"
    doc: "Postprocessed ChIP-seq signals from the first biological condition samples"
    outputSource: thor/cond_1_bigwig_file
    'sd:visualPlugins':
    - igvbrowser:
        tab: 'IGV Genome Browser'
        id: 'igvbrowser'
        type: 'wig'
        name: "Biological condition 1"
        height: 120

  cond_2_bigwig_file:
    type: File[]
    format: "http://edamontology.org/format_3006"
    label: "Second biological condition ChIP-seq signals"
    doc: "Postprocessed ChIP-seq signals from the second biological condition samples"
    outputSource: thor/cond_2_bigwig_file
    'sd:visualPlugins':
    - igvbrowser:
        tab: 'IGV Genome Browser'
        id: 'igvbrowser'
        type: 'wig'
        name: "Biological condition 2"
        height: 120

  thor_stderr_log:
    type: File
    format: "http://edamontology.org/format_2330"
    label: "rgt-THOR stderr log"
    doc: "rgt-THOR stderr log"
    outputSource: thor/stderr_log


steps:

  thor:
    run: ../tools/rgt-thor.cwl
    in:
      bambai_pair_cond_1: bambai_pair_cond_1
      bambai_pair_cond_2: bambai_pair_cond_2
      chrom_length_file: chrom_length_file
      merge_peaks: merge_peaks
      housekeeping_genes_bed_file: housekeeping_genes_bed_file
      deadzones_bed_file: deadzones_bed_file
      pvalue_cutoff: pvalue_cutoff
      extension_size: extension_size
      # no_correction: no_correction
      remove_duplicates: remove_duplicates
      bin_size: bin_size
    out:
      - diffpeaks_bed_file
      - cond_1_bigwig_file
      - cond_2_bigwig_file
      - stderr_log

  filter_columns:
    run: ../tools/custom-bash.cwl
    in:
      input_file: thor/diffpeaks_bed_file
      script:
        default: >
          cat $0 | awk
          'BEGIN {print "chr\tstart\tend\tlength\tabs_summit\tpileup\t-log10(pvalue)\tfold_enrichment\t-log10(qvalue)\tname"}
          {print $1"\t"$2"\t"$3"\t"$3-$2+1"\t0\t"NR"\t0\t0\t0\t0"}' > `basename $0`
    out: [output_file]

  assign_genes:
      run: ../tools/iaintersect.cwl
      in:
        input_filename: filter_columns/output_file
        annotation_filename: annotation_file
        promoter_bp:
          default: 1000
      out: [result_file]

  restore_columns:
    run: ../tools/custom-bash.cwl
    in:
      input_file: [assign_genes/result_file, thor/diffpeaks_bed_file]
      param: [alias_cond_1, alias_cond_2]
      script:
        default: |
          NAME_1=$2
          NAME_2=$3
          cat $0 | grep -v "start" | sort -k 11n > sorted_iaintersect_result.tsv
          cat $1 | tr ";" "\t" > thor_result.tsv
          echo -e "refseq_id\tgene_id\ttxStart\ttxEnd\tstrand\tchrom\tstart\tend\tlength\tregion\tname\tscore\tcondition\tcolor\t${NAME_1}_counts\t${NAME_2}_counts\t-log10(pvalue)" > `basename $0`;
          cat sorted_iaintersect_result.tsv | paste - thor_result.tsv | cut -f 1-9,15,19-21,24,26-28 >> `basename $0`
          rm sorted_iaintersect_result.tsv thor_result.tsv
    out: [output_file]

  sort_bed:
    run: ../tools/linux-sort.cwl
    in:
      unsorted_file: thor/diffpeaks_bed_file
      key:
        default: ["1,1","2,2n"]
    out: [sorted_file]

  bed_to_bigbed:
    run: ../tools/ucsc-bedtobigbed.cwl
    in:
      input_bed: sort_bed/sorted_file
      bed_type:
        default: "bed4+7"
      chrom_length_file: chrom_length_file
      output_filename:
        source: sort_bed/sorted_file
        valueFrom: $(self.basename + ".bigBed")
    out: [bigbed_file]


$namespaces:
  s: http://schema.org/

$schemas:
- http://schema.org/docs/schema_org_rdfa.html

s:name: "THOR - differential peak calling of ChIP-seq signals with replicates"
label: "THOR - differential peak calling of ChIP-seq signals with replicates"
s:alternateName: "THOR is an HMM-based approach to detect and analyze differential peaks in two sets of ChIP-seq data from distinct biological conditions with replicates"

s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/rgt-thor.cwl
s:codeRepository: https://github.com/datirium/workflows
s:license: http://www.apache.org/licenses/LICENSE-2.0

s:isPartOf:
  class: s:CreativeWork
  s:name: Common Workflow Language
  s:url: http://commonwl.org/

s:creator:
  - class: s:Organization
    s:legalName: "Cincinnati Children's Hospital Medical Center"
    s:location:
    - class: s:PostalAddress
      s:addressCountry: "USA"
      s:addressLocality: "Cincinnati"
      s:addressRegion: "OH"
      s:postalCode: "45229"
      s:streetAddress: "3333 Burnet Ave"
      s:telephone: "+1(513)636-4200"
    s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png"
    s:department:
    - class: s:Organization
      s:legalName: "Allergy and Immunology"
      s:department:
      - class: s:Organization
        s:legalName: "Barski Research Lab"
        s:member:
        - class: s:Person
          s:name: Michael Kotliar
          s:email: mailto:michael.kotliar@cchmc.org
          s:sameAs:
          - id: http://orcid.org/0000-0002-6486-3898

doc: |
  What is THOR?
  --------------

  THOR is an HMM-based approach to detect and analyze differential peaks in two sets of ChIP-seq data
  from distinct biological conditions with replicates. THOR performs genomic signal processing, peak
  calling and p-value calculation in an integrated framework.

  For more information please refer to:
  -------------------------------------

  Allhoff, M., Sere K., Freitas, J., Zenke, M., Costa, I.G. (2016), Differential Peak Calling of ChIP-seq
  Signals with Replicates with THOR, Nucleic Acids Research, epub gkw680.