# CWL v1.0 workflow description (see `steps` for the processing chain).
cwlVersion: v1.0
class: Workflow


# Workflow-level feature requirements.
# NOTE(review): no step visible in this file uses `valueFrom`, a JavaScript
# expression, or a multi-source input, so these may only matter to the
# referenced tools - confirm before removing any of them.
requirements:
  - class: StepInputExpressionRequirement
  - class: InlineJavascriptRequirement
  - class: MultipleInputFeatureRequirement


# Declares "genome-indices.cwl" under the name `genome_indices`; the
# `sd:upstreamSource` values of chrom_length_file and genome_fasta_file
# refer to it as "genome_indices/<output>".
'sd:upstream':
  genome_indices:
    - "genome-indices.cwl"


inputs:

  # Short human-readable experiment name; not consumed by any step below.
  alias:
    type: string
    label: "Experiment short name/Alias"
    'sd:preview':
      position: 1

  # Regions of interest; cleaned up and deduplicated by the make_unique step.
  # Declared as EDAM format_3003 although the label also allows CSV input.
  regions_file:
    type: File
    format: "http://edamontology.org/format_3003"
    label: "Regions file. Headerless BED file with minimum [chrom start end] columns. Optionally, CSV"
    doc: "Regions of interest. Formatted as headerless BED file with minimum [chrom start end] columns. Optionally, CSV"

  # Motif collection handed to find_motifs; optional, defaults to "vertebrates".
  motifs_db:
    type:
      - "null"
      - type: enum
        symbols:
          - "vertebrates"
          - "insects"
          - "worms"
          - "plants"
          - "yeast"
          - "all"
    default: "vertebrates"
    label: "Set motifs DB to check against"
    doc: "Set motifs DB to check against"

  # Taken from the genome_indices upstream (chrom_length); used by the
  # bedtools_slop and bedtools_shuffle steps.
  chrom_length_file:
    type: File
    format: "http://edamontology.org/format_2330"
    label: "Chromosome length file"
    doc: "Chromosome length file"
    'sd:upstreamSource': "genome_indices/chrom_length"

  # Taken from the genome_indices upstream (fasta_output); used by both
  # bedtools_get_fasta_* steps.
  genome_fasta_file:
    type: File
    format: "http://edamontology.org/format_1929"
    label: "Reference genome FASTA file"
    doc: "Reference genome FASTA file. Includes all chromosomes in a single file"
    'sd:upstreamSource': "genome_indices/fasta_output"

  # Advanced option forwarded to find_motifs; de novo search is skipped by default.
  skip_denovo:
    type: boolean?
    default: true
    label: "Skip de novo motif enrichment"
    doc: "Skip de novo motif enrichment"
    'sd:layout':
      advanced: true

  # Advanced option forwarded to find_motifs; known-motif search runs by default.
  skip_known:
    type: boolean?
    default: false
    label: "Skip known motif enrichment"
    doc: "Skip known motif enrichment"
    'sd:layout':
      advanced: true

  # Advanced option forwarded to find_motifs; hypergeometric is the default.
  use_binomial:
    type: boolean?
    default: false
    label: "Use binomial distribution instead of hypergeometric to calculate p-values"
    doc: "Use binomial distribution instead of hypergeometric to calculate p-values"
    'sd:layout':
      advanced: true

  # Advanced option; in this workflow only find_motifs receives it.
  threads:
    type: int?
    default: 4
    label: "Threads number"
    doc: "Number of threads for those steps that support multithreading"
    'sd:layout':
      advanced: true


# Every workflow output is taken directly from the find_motifs step.
outputs:

  # Archive of the HOMER results; no `format` is declared for it.
  homer_found_motifs:
    type: File
    outputSource: find_motifs/compressed_results_folder
    label: "Compressed file with Homer motifs"
    doc: "Homer motifs"

  homer_stdout_log:
    type: File
    format: "http://edamontology.org/format_2330"
    outputSource: find_motifs/stdout_log
    label: "Homer stdout log"
    doc: "Homer stdout log"

  # Optional (File?).
  # NOTE(review): presumably absent when skip_known is set - confirm in
  # homer-find-motifs.cwl.
  homer_known_motifs:
    type: File?
    format: "http://edamontology.org/format_2331"
    outputSource: find_motifs/known_motifs
    label: "Known motifs html file"
    doc: "Known motifs html file"

  # Optional (File?).
  # NOTE(review): presumably absent when skip_denovo is set (the default) -
  # confirm in homer-find-motifs.cwl.
  homer_denovo_motifs:
    type: File?
    format: "http://edamontology.org/format_2331"
    outputSource: find_motifs/denovo_motifs
    label: "de novo motifs html file"
    doc: "de novo motifs html file"

  homer_stderr_log:
    type: File
    format: "http://edamontology.org/format_2330"
    outputSource: find_motifs/stderr_log
    label: "Homer stderr log"
    doc: "Homer stderr log"


steps:

  # Background steps 1-2: normalise the user-supplied regions file and drop
  # duplicates. The pipeline strips carriage returns, turns commas into tabs
  # (so CSV input is accepted), keeps the first three columns, removes empty
  # lines and sorts uniquely by chrom / start / end. The result is written to
  # the working directory under the input file's basename.
  # "$0" is the input file - presumably custom-bash.cwl runs the script as
  # `bash -c <script> <input_file>`; confirm against the tool.
  # "$0" is quoted everywhere, including inside the basename substitution, so
  # that a path containing whitespace cannot split the redirect target.
  make_unique:
    run: ../tools/custom-bash.cwl
    in:
      input_file: regions_file
      script:
        default: |
          cat "$0" | tr -d '\r' | tr "," "\t" | cut -f 1-3 | awk NF | sort -u -k1,1 -k2,2n -k3,3n > "$(basename "$0")"
    out:
      - output_file

  # Background step 3: extend every deduplicated region by 20000 (the
  # bi_direction default) in both directions; the chromosome length file is
  # passed to the tool alongside the regions.
  bedtools_slop:
    run: ../tools/bedtools-slop.cwl
    in:
      bed_file: make_unique/output_file
      chrom_length_file: chrom_length_file
      bi_direction:
        default: 20000
    out:
      - extended_bed_file

  # Sorts the extended regions by column 1, then numerically by column 2,
  # before they are handed to bedtools_merge. Despite the step name this runs
  # the generic linux-sort.cwl tool.
  bedtools_sort:
    run: ../tools/linux-sort.cwl
    in:
      unsorted_file: bedtools_slop/extended_bed_file
      key:
        default: ["1,1","2,2n"]
    out:
      - sorted_file

  # Background step 4: merge the overlapping extended regions coming from
  # bedtools_sort.
  bedtools_merge:
    run: ../tools/bedtools-merge.cwl
    in:
      bed_file: bedtools_sort/sorted_file
    out:
      - merged_bed_file

  # Background step 5: subtract the original (non-extended) regions from the
  # merged extended ones. The result is the set of intervals that
  # bedtools_shuffle is allowed to place regions into.
  # NOTE(review): assumes `reduced_bed_file` is the file being subtracted from
  # and `subtracted_bed_file` the one removed - confirm in bedtools-subtract.cwl.
  bedtools_subtract:
    run: ../tools/bedtools-subtract.cwl
    in:
      reduced_bed_file: bedtools_merge/merged_bed_file
      subtracted_bed_file: make_unique/output_file
    out:
      - difference_bed_file

  # Background step 6: randomly place the deduplicated regions inside the
  # intervals produced by bedtools_subtract (passed as incl_bed_file).
  bedtools_shuffle:
    run: ../tools/bedtools-shuffle.cwl
    in:
      bed_file: make_unique/output_file
      chrom_length_file: chrom_length_file
      incl_bed_file: bedtools_subtract/difference_bed_file
      no_overlapping:
        default: true
      max_tries:
        default: 10000
      # Fixed seed - presumably keeps the shuffle reproducible between runs;
      # confirm in bedtools-shuffle.cwl.
      seed:
        default: 123456789
    out:
      - shuffled_bed_file

  # Extracts sequences for the deduplicated input regions from the reference
  # genome; the result is the target set given to find_motifs.
  bedtools_get_fasta_target:
    run: ../tools/bedtools-getfasta.cwl
    in:
      intervals_file: make_unique/output_file
      genome_fasta_file: genome_fasta_file
    out:
      - sequences_file

  # Background step 7: extracts sequences for the randomly shuffled regions
  # from the reference genome; the result is the background set given to
  # find_motifs.
  bedtools_get_fasta_background:
    run: ../tools/bedtools-getfasta.cwl
    in:
      intervals_file: bedtools_shuffle/shuffled_bed_file
      genome_fasta_file: genome_fasta_file
    out:
      - sequences_file

  # Runs the HOMER motif search on the target sequences against the generated
  # background sequences. The user-facing options (skip_denovo, skip_known,
  # use_binomial, motifs_db, threads) are passed through unchanged, and all
  # five outputs are exposed as workflow outputs.
  find_motifs:
    run: ../tools/homer-find-motifs.cwl
    in:
      target_fasta_file: bedtools_get_fasta_target/sequences_file
      background_fasta_file: bedtools_get_fasta_background/sequences_file
      skip_denovo: skip_denovo
      skip_known: skip_known
      use_binomial: use_binomial
      motifs_db: motifs_db
      threads: threads
    out:
      - compressed_results_folder
      - known_motifs
      - denovo_motifs
      - stdout_log
      - stderr_log


# Namespace prefix and RDF schema used by the `s:` metadata keys below.
$namespaces:
  s: http://schema.org/

$schemas:
- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf

# Human-readable workflow title, repeated in the schema.org name fields.
label: "Motif Finding with HOMER with random background regions"
s:name: "Motif Finding with HOMER with random background regions"
s:alternateName: "Motif Finding with HOMER with random background regions"

# NOTE(review): the download URL names "homer-motif-analysis.cwl", while the
# label above says "with random background regions" - confirm that it points
# at this workflow and not at a sibling one.
s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/homer-motif-analysis.cwl
s:codeRepository: https://github.com/datirium/workflows
s:license: http://www.apache.org/licenses/LICENSE-2.0

s:isPartOf:
  class: s:CreativeWork
  s:name: Common Workflow Language
  s:url: http://commonwl.org/

# Authorship: organization -> department -> lab -> person.
s:creator:
  - class: s:Organization
    s:legalName: "Cincinnati Children's Hospital Medical Center"
    s:location:
    - class: s:PostalAddress
      s:addressCountry: "USA"
      s:addressLocality: "Cincinnati"
      s:addressRegion: "OH"
      s:postalCode: "45229"
      s:streetAddress: "3333 Burnet Ave"
      s:telephone: "+1(513)636-4200"
    s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png"
    s:department:
    - class: s:Organization
      s:legalName: "Allergy and Immunology"
      s:department:
      - class: s:Organization
        s:legalName: "Barski Research Lab"
        s:member:
        - class: s:Person
          s:name: Michael Kotliar
          s:email: mailto:michael.kotliar@cchmc.org
          s:sameAs:
          - id: http://orcid.org/0000-0002-6486-3898


# doc:
#   $include: ../descriptions/homer-motif-analysis.md


doc: |
  Motif Finding with HOMER with random background regions
  ---------------------------------------------------
  
  HOMER contains a novel motif discovery algorithm that was designed for regulatory element analysis
  in genomics applications (DNA only, no protein). It is a differential motif discovery algorithm,
  which means that it takes two sets of sequences and tries to identify the regulatory elements that
  are specifically enriched in one set relative to the other. It uses ZOOPS scoring (zero or one
  occurrence per sequence) coupled with the hypergeometric enrichment calculations (or binomial) to
  determine motif enrichment. HOMER also tries its best to account for sequence bias in the dataset.
  It was designed with ChIP-Seq and promoter analysis in mind, but can be applied to pretty much any
  nucleic acids motif finding problem.

  Here is how we generate background for Motifs Analysis
  -------------------------------------
  1. Take input file with regions in the form of "chr" "start" "end"
  2. Sort and remove duplicates from this regions file
  3. Extend each region by 20 kb in both directions
  4. Merge all overlapping extended regions
  5. Subtract the original (non-extended) regions from the merged extended ones
  6. Randomly distribute the original (non-extended) regions within the regions
     that we got as a result of the previous step
  7. Get fasta file from these randomly distributed regions (from the previous step). Use it as background

  For more information please refer to:
  -------------------------------------
  [Official documentation](http://homer.ucsd.edu/homer/motif/)