cwlVersion: v1.0
class: Workflow


# CWL feature flags enabled for this workflow.
requirements:
# NOTE(review): every step visible in this file runs a ../tools/*.cwl file;
# confirm whether any of them is itself a Workflow, otherwise this may be unused.
- class: SubworkflowFeatureRequirement
# Enables `valueFrom` on step inputs (used by run_gseapy/gene_set_database).
- class: StepInputExpressionRequirement
# Enables the JavaScript ternary expression inside that `valueFrom`.
- class: InlineJavascriptRequirement
# Enables the list-valued `source` on run_gseapy/gene_set_database.
- class: MultipleInputFeatureRequirement


# Declares the upstream alias "deseq_experiment" that the sd:upstreamSource
# entries of read_counts_file and phenotypes_file refer to.
sd:upstream:
  deseq_experiment: 'deseq.cwl'

inputs:

  # Required free-text short name for the experiment.
  alias_name:
    label: 'Experiment short name/Alias'
    type: string
    # presumably controls where this field appears in the platform preview - confirm
    'sd:preview':
      position: 1

  # Expression dataset; sourced from the read_counts_file output of the
  # upstream "deseq_experiment".
  read_counts_file:
    label: 'DESeq experiment'
    doc: 'Input gene expression dataset file in txt or gct format. Same with GSEA'
    type: File
    format: 'http://edamontology.org/format_3709'
    sd:upstreamSource: 'deseq_experiment/read_counts_file'
    sd:localLabel: true

  # Phenotype (class vector) file; sourced from the phenotypes_file output of
  # the upstream "deseq_experiment".
  phenotypes_file:
    label: 'DESeq experiment'
    doc: 'Input class vector (phenotype) file in CLS format. Same with GSEA'
    type: File
    format: 'http://edamontology.org/format_2330'
    sd:upstreamSource: 'deseq_experiment/phenotypes_file'
    sd:localLabel: true

  # Name of a predefined gene set collection (optional enum, defaults to the
  # hallmark sets). The run_gseapy step forwards this value only when
  # gene_set_database_file is not set.
  gene_set_database:
    type:
    - "null"
    - type: enum
      name: "genesetdatabase"
      symbols:
      - H_hallmark_gene_sets
      - C1_positional_gene_sets
      - C2_curated_gene_sets
      - C3_regulatory_target_gene_sets
      - C4_computational_gene_sets
      - C5_ontology_gene_sets
      - C6_oncogenic_signature_gene_sets
      - C7_immunologic_signature_gene_sets
      - C8_cell_type_signature_gene_sets
      - KEGG_2021_Human
      - Reactome_2022
      - WikiPathways_2019_Human
    default: "H_hallmark_gene_sets"
    label: "Gene set database. Ignored if GMT file is provided"
    doc: "Gene set database"

  # Optional user-supplied gene sets. When set, the run_gseapy step passes this
  # file on instead of the gene_set_database name.
  gene_set_database_file:
    label: 'Gene set database file in GMT format'
    doc: 'Gene set database file in GMT (Gene Matrix Transposed) format'
    type: File?
    format: 'http://edamontology.org/format_2330'
    default: null

  # Optional enum choosing between gene_set and phenotype permutation.
  permutation_type:
    label: 'Permutation type'
    doc: 'Permutation type. Default: gene_set'
    type:
    - 'null'
    - name: 'permutationtype'
      type: enum
      symbols:
      - gene_set
      - phenotype
    default: 'gene_set'

  # Optional integer, forwarded unchanged to run_gseapy.
  permutation_count:
    label: 'Number of random permutations'
    doc: 'Number of random permutations. For calculating esnulls. Default: 1000'
    type: int?
    default: 1000

  # Optional lower size bound; flagged as an advanced setting in sd:layout.
  min_gene_set_size:
    label: 'Min size of input genes presented in Gene Sets'
    doc: 'Min size of input genes presented in Gene Sets. Default: 15'
    type: int?
    default: 15
    sd:layout:
      advanced: true

  # Optional upper size bound; flagged as an advanced setting in sd:layout.
  max_gene_set_size:
    label: 'Max size of input genes presented in Gene Sets'
    doc: 'Max size of input genes presented in Gene Sets. Default: 500'
    type: int?
    default: 500
    sd:layout:
      advanced: true

  # Optional enum selecting the ranking metric. The doc text states the same
  # default as the `default` key (signal_to_noise) so the two cannot disagree.
  ranking_metrics:
    type:
    - "null"
    - type: enum
      name: "rankingmetrics"
      symbols:
      - signal_to_noise
      - t_test
      - ratio_of_classes
      - diff_of_classes
      - log2_ratio_of_classes
    default: "signal_to_noise"
    label: "Methods to calculate correlations of ranking metrics"
    doc: "Methods to calculate correlations of ranking metrics. Default: signal_to_noise"

  # Optional flag, forwarded unchanged to run_gseapy; off unless set.
  ascending_rank_sorting:
    label: 'Ascending rank metric sorting order'
    doc: 'Ascending rank metric sorting order. Default: False'
    type: boolean?
    default: false

  # Optional number of top graphs; flagged as an advanced setting in sd:layout.
  graphs_count:
    label: 'Numbers of top graphs produced'
    doc: 'Numbers of top graphs produced. Default: 20'
    type: int?
    default: 20
    sd:layout:
      advanced: true

  # Optional random seed; flagged as an advanced setting in sd:layout.
  # The texts previously claimed "Default: None" although the declared
  # default is 123; the doc now states the real default and the label no
  # longer repeats it (matching the other input labels).
  seed:
    type: int?
    default: 123
    label: "Random seed"
    doc: "Random seed. Default: 123"
    'sd:layout':
      advanced: true

  # Optional thread count; flagged as an advanced setting in sd:layout.
  threads:
    label: 'Number of threads'
    doc: 'Number of threads for those steps that support multithreading'
    type: int?
    default: 4
    sd:layout:
      advanced: true


outputs:

  # Tab-separated report written by the convert_to_tsv step.
  gseapy_enrichment_report:
    label: 'Enrichment report'
    doc: 'Enrichment report'
    type: File
    format: 'http://edamontology.org/format_3475'
    outputSource: convert_to_tsv/output_file
    sd:visualPlugins:
    - syncfusiongrid:
        tab: 'Gene Set Enrichment'
        Title: 'Gene Set Enrichment'

  # Archive of enrichment plots, taken from the rename_enrichment_plots step.
  gseapy_enrichment_plots:
    label: 'Compressed TAR with enrichment plots'
    doc: 'Compressed TAR with enrichment plots'
    type: File
    outputSource: rename_enrichment_plots/target_file

  # Archive of enrichment heatmaps, taken from the rename_enrichment_heatmaps step.
  gseapy_enrichment_heatmaps:
    label: 'Compressed TAR with enrichment heatmaps'
    doc: 'Compressed TAR with enrichment heatmaps'
    type: File
    outputSource: rename_enrichment_heatmaps/target_file

  # Captured stdout of the run_gseapy step.
  gseapy_stdout_log:
    label: 'GSEApy stdout log'
    doc: 'GSEApy stdout log'
    type: File
    format: 'http://edamontology.org/format_2330'
    outputSource: run_gseapy/stdout_log

  # Captured stderr of the run_gseapy step.
  gseapy_stderr_log:
    label: 'GSEApy stderr log'
    doc: 'GSEApy stderr log'
    type: File
    format: 'http://edamontology.org/format_2330'
    outputSource: run_gseapy/stderr_log

  # Summary file produced by the report_summary step and attached to the
  # markdownView plugin's 'Overview' tab. Label and doc were previously
  # "Enrichment report", identical to gseapy_enrichment_report, so the two
  # outputs could not be told apart.
  summary_report:
    type: File
    format: "http://edamontology.org/format_3835"
    label: "Summary report"
    doc: "Summary report"
    outputSource: report_summary/summary_file
    'sd:visualPlugins':
    - markdownView:
        tab: 'Overview'

  # Captured stderr of the report_summary step.
  summary_stderr_log:
    label: 'stderr log'
    doc: 'stderr log'
    type: File
    format: 'http://edamontology.org/format_2330'
    outputSource: report_summary/log_file_stderr

  # Captured stdout of the report_summary step.
  summary_stdout_log:
    label: 'stdout log'
    doc: 'stdout log'
    type: File
    format: 'http://edamontology.org/format_2330'
    outputSource: report_summary/log_file_stdout


steps:

  # Invokes ../tools/gseapy.cwl with the workflow-level inputs.
  run_gseapy:
    run: ../tools/gseapy.cwl
    in:
      read_counts_file: read_counts_file
      phenotypes_file: phenotypes_file
      gene_set_database:
        # self[0] is the gene_set_database name, self[1] the optional
        # gene_set_database_file; the file is passed on whenever it is set.
        source:
        - gene_set_database
        - gene_set_database_file
        valueFrom: '$(self[1]?self[1]:self[0])'
      permutation_type: permutation_type
      permutation_count: permutation_count
      min_gene_set_size: min_gene_set_size
      max_gene_set_size: max_gene_set_size
      ranking_metrics: ranking_metrics
      ascending_rank_sorting: ascending_rank_sorting
      graphs_count: graphs_count
      seed: seed
      threads: threads
    out:
    - enrichment_report
    - enrichment_plots
    - enrichment_heatmaps
    - stdout_log
    - stderr_log

  # Rewrites the GSEApy enrichment report with tabs in place of commas and
  # stores it under the same base name with a "tsv" suffix instead of "csv".
  convert_to_tsv:
    run: ../tools/custom-bash.cwl
    in:
      input_file: run_gseapy/enrichment_report
      script:
        # The script reads its input path from "$0" (presumably
        # custom-bash.cwl passes input_file that way - confirm).
        # Every expansion is quoted so a path containing whitespace cannot
        # split the basename arguments or make the redirection ambiguous.
        # NOTE(review): tr replaces every comma, including any inside quoted
        # fields - confirm the report never contains such fields.
        default: |
          tr "," "\t" < "$0" > "$(basename "$0" csv)tsv"
    out: [output_file]

  # Packaging chain for the enrichment plots: collect the files into a
  # folder, compress that folder, then give the archive a fixed name.
  enrichment_plots_to_folder:
    run: ../tools/files-to-folder.cwl
    in:
      input_files: run_gseapy/enrichment_plots
    out:
    - folder

  compress_enrichment_plots:
    run: ../tools/tar-compress.cwl
    in:
      folder_to_compress: enrichment_plots_to_folder/folder
    out:
    - compressed_folder

  rename_enrichment_plots:
    run: ../tools/rename.cwl
    in:
      source_file: compress_enrichment_plots/compressed_folder
      target_filename:
        default: 'enrichment_plots.tar.gz'
    out:
    - target_file

  # Packaging chain for the enrichment heatmaps: collect the files into a
  # folder, compress that folder, then give the archive a fixed name.
  enrichment_heatmaps_to_folder:
    run: ../tools/files-to-folder.cwl
    in:
      input_files: run_gseapy/enrichment_heatmaps
    out:
    - folder

  compress_enrichment_heatmaps:
    run: ../tools/tar-compress.cwl
    in:
      folder_to_compress: enrichment_heatmaps_to_folder/folder
    out:
    - compressed_folder

  rename_enrichment_heatmaps:
    run: ../tools/rename.cwl
    in:
      source_file: compress_enrichment_heatmaps/compressed_folder
      target_filename:
        default: 'enrichment_heatmaps.tar.gz'
    out:
    - target_file

  # Invokes ../tools/gseapy-reportsummary.cwl on the two workflow input files
  # and the tab-separated report from convert_to_tsv.
  report_summary:
    run: ../tools/gseapy-reportsummary.cwl
    in:
      enrichment_report: convert_to_tsv/output_file
      read_counts_file: read_counts_file
      phenotypes_file: phenotypes_file
    out: [summary_file, log_file_stderr, log_file_stdout]


# Prefix used by the schema.org metadata keys (s:...) in this file.
$namespaces:
  s: 'http://schema.org/'

# Schema document for the prefix declared above.
$schemas:
- 'https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf'

# Human-readable workflow name, repeated for the CWL label and schema.org keys.
label: 'GSEApy - Gene Set Enrichment Analysis in Python'
s:name: 'GSEApy - Gene Set Enrichment Analysis in Python'
s:alternateName: 'GSEApy - Gene Set Enrichment Analysis in Python'

# Source location and licence of this workflow file.
s:downloadUrl: 'https://raw.githubusercontent.com/datirium/workflows/master/workflows/gseapy.cwl'
s:codeRepository: 'https://github.com/datirium/workflows'
s:license: 'http://www.apache.org/licenses/LICENSE-2.0'

s:isPartOf:
  class: s:CreativeWork
  s:name: 'Common Workflow Language'
  s:url: 'http://commonwl.org/'

# Authorship metadata: organization -> department -> lab -> person.
s:creator:
- class: s:Organization
  s:legalName: "Cincinnati Children's Hospital Medical Center"
  s:logo: 'https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png'
  s:location:
  - class: s:PostalAddress
    s:streetAddress: '3333 Burnet Ave'
    s:addressLocality: 'Cincinnati'
    s:addressRegion: 'OH'
    s:postalCode: '45229'
    s:addressCountry: 'USA'
    s:telephone: '+1(513)636-4200'
  s:department:
  - class: s:Organization
    s:legalName: 'Allergy and Immunology'
    s:department:
    - class: s:Organization
      s:legalName: 'Barski Research Lab'
      s:member:
      - class: s:Person
        s:name: 'Michael Kotliar'
        s:email: 'mailto:misha.kotliar@gmail.com'
        s:sameAs:
        - id: 'http://orcid.org/0000-0002-6486-3898'


# doc:
#   $include: ../descriptions/gseapy.md


doc: |
  GSEAPY: Gene Set Enrichment Analysis in Python
  ==============================================

  Gene Set Enrichment Analysis is a computational method that determines whether an a priori
  defined set of genes shows statistically significant, concordant differences between two
  biological states (e.g. phenotypes).

  GSEA requires as input an expression dataset, which contains expression profiles for multiple samples.
  While the software supports multiple input file formats for these datasets, the tab-delimited GCT format
  is the most common. The first column of the GCT file contains feature identifiers (gene ids or symbols in
  the case of data derived from RNA-Seq experiments). The second column contains a description of the feature;
  this column is ignored by GSEA and may be filled with “NA”s. Subsequent columns contain the expression
  values for each feature, with one sample's expression value per column. It is important to note that there
  are no hard and fast rules regarding how a GCT file's expression values are derived. The important point is
  that they are comparable to one another across features within a sample and comparable to one another
  across samples. Tools such as DESeq2 can be made to produce properly normalized data (normalized counts)
  which are compatible with GSEA.

  Documents
  ==============================================
  - GSEA Home Page: https://www.gsea-msigdb.org/gsea/index.jsp
  - Results Interpretation: https://www.gsea-msigdb.org/gsea/doc/GSEAUserGuideTEXT.htm#_Interpreting_GSEA_Results
  - GSEAPY FAQ: https://gseapy.readthedocs.io/en/latest/faq.html
  - GSEAPY Docs: https://gseapy.readthedocs.io/en/latest/introduction.html

  References
  ==============================================
  - Subramanian, Tamayo, et al. (2005, PNAS), https://www.pnas.org/content/102/43/15545
  - Mootha, Lindgren, et al. (2003, Nature Genetics), http://www.nature.com/ng/journal/v34/n3/abs/ng1180.html