# CWL v1.0 workflow description.
cwlVersion: v1.0
class: Workflow


# Optional CWL features declared for this workflow.
# NOTE(review): no step visible in this file uses valueFrom, inline
# expressions or multi-source inputs - confirm whether the referenced
# tools depend on these declarations before removing any of them.
requirements:
  - class: SubworkflowFeatureRequirement
  - class: StepInputExpressionRequirement
  - class: InlineJavascriptRequirement
  - class: MultipleInputFeatureRequirement


# Declares the "genome_indices" upstream alias and the workflow it refers to.
# The 'sd:upstreamSource' values in the inputs section use this alias as
# their prefix ("genome_indices/<name>").
# NOTE(review): presumably resolved by the hosting platform - confirm.
'sd:upstream':
  genome_indices:
    - "cellranger-mkref.cwl"


inputs:

  # Short human-readable name of the experiment.
  alias:
    type: string
    label: "Experiment short name/Alias"
    # Key is quoted like every other 'sd:' extension key in this file.
    'sd:preview':
      position: 1

  # Genome indices directory, wired to the "arc_indices_folder" item of the
  # "genome_indices" upstream through 'sd:upstreamSource'.
  indices_folder:
    type: Directory
    label: "Genome Type"
    doc: "Cell Ranger ARC generated genome indices folder"
    'sd:upstreamSource': "genome_indices/arc_indices_folder"
    'sd:localLabel': true

  # Gene expression (GEX) reads: two required files, R1 and R2.
  # All FASTQ inputs below share the same EDAM format IRI.
  gex_fastq_file_r1:
    type: File
    format: "http://edamontology.org/format_1930"
    label: "GEX FASTQ file R1 (optionally compressed)"
    doc: "GEX FASTQ file R1 (optionally compressed)"

  gex_fastq_file_r2:
    type: File
    format: "http://edamontology.org/format_1930"
    label: "GEX FASTQ file R2 (optionally compressed)"
    doc: "GEX FASTQ file R2 (optionally compressed)"

  # ATAC reads: three required files, R1, R2 and R3.
  atac_fastq_file_r1:
    type: File
    format: "http://edamontology.org/format_1930"
    label: "ATAC FASTQ file R1 (optionally compressed)"
    doc: "ATAC FASTQ file R1 (optionally compressed)"

  atac_fastq_file_r2:
    type: File
    format: "http://edamontology.org/format_1930"
    label: "ATAC FASTQ file R2 (optionally compressed)"
    doc: "ATAC FASTQ file R2 (optionally compressed)"

  atac_fastq_file_r3:
    type: File
    format: "http://edamontology.org/format_1930"
    label: "ATAC FASTQ file R3 (optionally compressed)"
    doc: "ATAC FASTQ file R3 (optionally compressed)"

  # Optional switch (off by default) forwarded to generate_counts_matrix.
  exclude_introns:
    type: boolean?
    default: false
    label: "Disable counting of intronic reads"
    doc: |
      Disable counting of intronic reads. In this mode, only reads that are exonic
      and compatible with annotated splice junctions in the reference are counted.
      Note: using this mode will reduce the UMI counts in the feature-barcode matrix
    'sd:layout':
      advanced: true

  # Thread count shared by the FastQC steps and generate_counts_matrix.
  threads:
    type: int?
    default: 4
    label: "Number of threads"
    doc: "Number of threads for those steps that support multithreading"
    'sd:layout':
      advanced: true

  # Memory cap in GB; generate_counts_matrix receives it as both
  # memory_limit and virt_memory_limit.
  # NOTE(review): the label repeats "Genome Type" from indices_folder,
  # presumably so both upstream-sourced inputs are presented together -
  # confirm this is intentional.
  memory_limit:
    type: int?
    default: 20
    label: "Genome Type"
    doc: |
      Maximum memory used (GB).
      The same as was used for generating indices.
      The same will be applied to virtual memory
    'sd:upstreamSource': "genome_indices/memory_limit"
    'sd:localLabel': true


outputs:

  # FastQC HTML reports, one per input FASTQ file. Each one is exposed as a
  # link on the 'Overview' tab through the linkList visual plugin.
  fastqc_report_gex_fastq_r1:
    type: File
    outputSource: run_fastqc_for_gex_fastq_r1/html_file
    label: "FastQC report for GEX FASTQ file R1"
    doc: |
      FastQC report for GEX FASTQ file R1
    'sd:visualPlugins':
    - linkList:
        tab: 'Overview'
        target: "_blank"

  fastqc_report_gex_fastq_r2:
    type: File
    outputSource: run_fastqc_for_gex_fastq_r2/html_file
    label: "FastQC report for GEX FASTQ file R2"
    doc: |
      FastQC report for GEX FASTQ file R2
    'sd:visualPlugins':
    - linkList:
        tab: 'Overview'
        target: "_blank"

  fastqc_report_atac_fastq_r1:
    type: File
    outputSource: run_fastqc_for_atac_fastq_r1/html_file
    label: "FastQC report for ATAC FASTQ file R1"
    doc: |
      FastQC report for ATAC FASTQ file R1
    'sd:visualPlugins':
    - linkList:
        tab: 'Overview'
        target: "_blank"

  fastqc_report_atac_fastq_r2:
    type: File
    outputSource: run_fastqc_for_atac_fastq_r2/html_file
    label: "FastQC report for ATAC FASTQ file R2"
    doc: |
      FastQC report for ATAC FASTQ file R2
    'sd:visualPlugins':
    - linkList:
        tab: 'Overview'
        target: "_blank"

  fastqc_report_atac_fastq_r3:
    type: File
    outputSource: run_fastqc_for_atac_fastq_r3/html_file
    label: "FastQC report for ATAC FASTQ file R3"
    doc: |
      FastQC report for ATAC FASTQ file R3
    'sd:visualPlugins':
    - linkList:
        tab: 'Overview'
        target: "_blank"

  # Summary report from generate_counts_matrix, linked on the 'Overview' tab.
  web_summary_report:
    type: File
    outputSource: generate_counts_matrix/web_summary_report
    label: "Cell Ranger summary"
    doc: |
      Cell Ranger summary
    'sd:visualPlugins':
    - linkList:
        tab: 'Overview'
        target: "_blank"

  # Metrics CSV; the collect_statistics step also reads this same file.
  metrics_summary_report:
    type: File
    outputSource: generate_counts_matrix/metrics_summary_report
    label: "Run summary metrics in CSV format"
    doc: |
      Run summary metrics in CSV format

  # Per-barcode metrics passed through from generate_counts_matrix.
  barcode_metrics_report:
    type: File
    outputSource: generate_counts_matrix/barcode_metrics_report
    label: "ATAC and GEX barcode metrics in CSV format"
    doc: |
      ATAC and GEX read count summaries generated for every
      barcode observed in the experiment. The columns contain
      the paired ATAC and Gene Expression barcode sequences,
      ATAC and Gene Expression QC metrics for that barcode,
      as well as whether this barcode was identified as a
      cell-associated partition by the pipeline.

  # Alignment outputs. Each is declared as a single File although the label
  # mentions BAM+BAI.
  # NOTE(review): presumably the index travels as a secondary file defined
  # in the tool - confirm in cellranger-arc-count.cwl.
  gex_possorted_genome_bam_bai:
    type: File
    outputSource: generate_counts_matrix/gex_possorted_genome_bam_bai
    label: "Aligned to the genome indexed reads GEX BAM+BAI files"
    doc: |
      GEX position-sorted reads aligned to the genome and transcriptome
      annotated with barcode information in BAM format

  atac_possorted_genome_bam_bai:
    type: File
    outputSource: generate_counts_matrix/atac_possorted_genome_bam_bai
    label: "Aligned to the genome indexed reads ATAC BAM+BAI files"
    doc: |
      ATAC position-sorted reads aligned to the genome annotated with
      barcode information in BAM format

  # Folder outputs in this group come from the tar-compress steps
  # (compress_*), while the *_h5 outputs come straight from
  # generate_counts_matrix.
  filtered_feature_bc_matrix_folder:
    type: File
    outputSource: compress_filtered_feature_bc_matrix_folder/compressed_folder
    label: "Compressed folder with filtered feature-barcode matrices"
    doc: |
      Filtered feature barcode matrix stored as a CSC sparse matrix in MEX format.
      The rows consist of all the gene and peak features concatenated together
      (identical to raw feature barcode matrix) and the columns are restricted to
      those barcodes that are identified as cells.

  filtered_feature_bc_matrix_h5:
    type: File
    outputSource: generate_counts_matrix/filtered_feature_bc_matrix_h5
    label: "Filtered feature-barcode matrices in HDF5 format"
    doc: |
      Filtered feature barcode matrix stored as a CSC sparse matrix in hdf5 format.
      The rows consist of all the gene and peak features concatenated together
      (identical to raw feature barcode matrix) and the columns are restricted to
      those barcodes that are identified as cells.

  raw_feature_bc_matrices_folder:
    type: File
    outputSource: compress_raw_feature_bc_matrices_folder/compressed_folder
    label: "Compressed folder with unfiltered feature-barcode matrices"
    doc: |
      Raw feature barcode matrix stored as a CSC sparse matrix in MEX format.
      The rows consist of all the gene and peak features concatenated together
      and the columns consist of all observed barcodes with non-zero signal for
      either ATAC or gene expression.

  raw_feature_bc_matrices_h5:
    type: File
    outputSource: generate_counts_matrix/raw_feature_bc_matrices_h5
    label: "Unfiltered feature-barcode matrices in HDF5 format"
    doc: |
      Raw feature barcode matrix stored as a CSC sparse matrix in hdf5 format.
      The rows consist of all the gene and peak features concatenated together
      and the columns consist of all observed barcodes with non-zero signal for
      either ATAC or gene expression.

  # Compressed copy of the secondary analysis folder.
  secondary_analysis_report_folder:
    type: File
    outputSource: compress_secondary_analysis_report_folder/compressed_folder
    label: "Compressed folder with secondary analysis results"
    doc: |
      Various secondary analyses that utilize the ATAC data, the GEX data, and their
      linkage: dimensionality reduction and clustering results for the ATAC and GEX
      data, differential expression, and differential accessibility for all clustering
      results above and linkage between ATAC and GEX data.

  # The three outputs below are passed through unchanged from
  # generate_counts_matrix.
  gex_molecule_info_h5:
    type: File
    outputSource: generate_counts_matrix/gex_molecule_info_h5
    label: "GEX molecule-level information for aggregating samples into larger datasets"
    doc: |
      Count and barcode information for every GEX molecule observed in the experiment
      in hdf5 format

  loupe_browser_track:
    type: File
    outputSource: generate_counts_matrix/loupe_browser_track
    label: "Loupe Browser visualization file with all the analysis outputs"
    doc: |
      Loupe Browser visualization file with all the analysis outputs

  atac_fragments_file:
    type: File
    outputSource: generate_counts_matrix/atac_fragments_file
    label: "Count and barcode information for every ATAC fragment in TSV format"
    doc: |
      Count and barcode information for every ATAC fragment observed in
      the experiment in TSV format.
  
  # ATAC peak-level outputs passed through from generate_counts_matrix.
  atac_peaks_bed_file:
    type: File
    outputSource: generate_counts_matrix/atac_peaks_bed_file
    label: "Identified peaks in BED format"
    doc: |
      Locations of open-chromatin regions identified in this sample.
      These regions are referred to as "peaks".

  # The only output wired to the igvbrowser visual plugin; it is configured
  # there as a 'wig' track named "ATAC cut sites".
  atac_cut_sites_bigwig_file:
    type: File
    outputSource: generate_counts_matrix/atac_cut_sites_bigwig_file
    label: "Observed transposition sites in bigWig format"
    doc: |
      Genome track of observed transposition sites in the experiment
      smoothed at a resolution of 400 bases in BIGWIG format.
    'sd:visualPlugins':
    - igvbrowser:
        tab: 'IGV Genome Browser'
        id: 'igvbrowser'
        type: 'wig'
        name: "ATAC cut sites"
        height: 120

  atac_peak_annotation_file:
    type: File
    outputSource: generate_counts_matrix/atac_peak_annotation_file
    label: "Annotations of peaks based on genomic proximity in TSV format"
    doc: |
      Annotations of peaks based on genomic proximity alone.
      Note that these are not functional annotations and they
      do not make use of linkage with GEX data.

  # Captured stdout/stderr of the generate_counts_matrix step.
  generate_counts_matrix_stdout_log:
    type: File
    outputSource: generate_counts_matrix/stdout_log
    label: stdout log generated by cellranger-arc count
    doc: |
      stdout log generated by cellranger-arc count

  generate_counts_matrix_stderr_log:
    type: File
    outputSource: generate_counts_matrix/stderr_log
    label: stderr log generated by cellranger-arc count
    doc: |
      stderr log generated by cellranger-arc count

  # Markdown file written by the inline collect_statistics tool and shown on
  # the 'Overview' tab through the markdownView visual plugin.
  collected_statistics:
    type: File
    outputSource: collect_statistics/collected_statistics
    label: "Collected statistics in Markdown format"
    doc: "Collected statistics in Markdown format"
    'sd:visualPlugins':
    - markdownView:
        tab: 'Overview'

  # compressed_html_data_folder:
  #   type: File
  #   outputSource: compress_html_data_folder/compressed_folder
  #   label: "Compressed folder with CellBrowser formatted results"
  #   doc: |
  #     Compressed folder with CellBrowser formatted results

  # html_data_folder:
  #   type: Directory
  #   outputSource: cellbrowser_build/html_data
  #   label: "Folder with not compressed CellBrowser formatted results"
  #   doc: |
  #     Folder with not compressed CellBrowser formatted results

  # cellbrowser_report:
  #   type: File
  #   outputSource: cellbrowser_build/index_html_file
  #   label: "CellBrowser formatted Cellranger report"
  #   doc: |
  #     CellBrowser formatted Cellranger report
  #   'sd:visualPlugins':
  #   - linkList:
  #       tab: 'Overview'
  #       target: "_blank"


steps:

  # One extract-fastq.cwl run per input FASTQ (two GEX, three ATAC). Each
  # takes the workflow input as compressed_file and exposes fastq_file to
  # the FastQC and counting steps.
  extract_gex_fastq_r1:
    run: ../tools/extract-fastq.cwl
    in:
      compressed_file: gex_fastq_file_r1
    out: [fastq_file]

  extract_gex_fastq_r2:
    run: ../tools/extract-fastq.cwl
    in:
      compressed_file: gex_fastq_file_r2
    out: [fastq_file]

  extract_atac_fastq_r1:
    run: ../tools/extract-fastq.cwl
    in:
      compressed_file: atac_fastq_file_r1
    out: [fastq_file]

  extract_atac_fastq_r2:
    run: ../tools/extract-fastq.cwl
    in:
      compressed_file: atac_fastq_file_r2
    out: [fastq_file]

  extract_atac_fastq_r3:
    run: ../tools/extract-fastq.cwl
    in:
      compressed_file: atac_fastq_file_r3
    out: [fastq_file]


  # Quality control: fastqc.cwl is run on every extracted FASTQ file with
  # the shared "threads" setting; only the HTML report is collected.
  run_fastqc_for_gex_fastq_r1:
    run: ../tools/fastqc.cwl
    in:
      reads_file: extract_gex_fastq_r1/fastq_file
      threads: threads
    out: [html_file]

  run_fastqc_for_gex_fastq_r2:
    run: ../tools/fastqc.cwl
    in:
      reads_file: extract_gex_fastq_r2/fastq_file
      threads: threads
    out: [html_file]

  run_fastqc_for_atac_fastq_r1:
    run: ../tools/fastqc.cwl
    in:
      reads_file: extract_atac_fastq_r1/fastq_file
      threads: threads
    out: [html_file]

  run_fastqc_for_atac_fastq_r2:
    run: ../tools/fastqc.cwl
    in:
      reads_file: extract_atac_fastq_r2/fastq_file
      threads: threads
    out: [html_file]

  run_fastqc_for_atac_fastq_r3:
    run: ../tools/fastqc.cwl
    in:
      reads_file: extract_atac_fastq_r3/fastq_file
      threads: threads
    out: [html_file]


  # Main counting step: runs cellranger-arc-count.cwl on the five extracted
  # FASTQ files against the upstream genome indices.
  generate_counts_matrix:
    run: ../tools/cellranger-arc-count.cwl
    in:
      gex_fastq_file_r1: extract_gex_fastq_r1/fastq_file
      gex_fastq_file_r2: extract_gex_fastq_r2/fastq_file
      atac_fastq_file_r1: extract_atac_fastq_r1/fastq_file
      atac_fastq_file_r2: extract_atac_fastq_r2/fastq_file
      atac_fastq_file_r3: extract_atac_fastq_r3/fastq_file
      indices_folder: indices_folder
      exclude_introns: exclude_introns
      threads: threads
      # The same workflow input feeds both memory limits.
      memory_limit: memory_limit
      virt_memory_limit: memory_limit
    out:
    - web_summary_report
    - metrics_summary_report
    - barcode_metrics_report
    - gex_possorted_genome_bam_bai
    - atac_possorted_genome_bam_bai
    # The three *_folder outputs are consumed by the compress_* steps.
    - filtered_feature_bc_matrix_folder
    - filtered_feature_bc_matrix_h5
    - raw_feature_bc_matrices_folder
    - raw_feature_bc_matrices_h5
    - secondary_analysis_report_folder
    - gex_molecule_info_h5
    - loupe_browser_track
    - atac_fragments_file
    - atac_peaks_bed_file
    - atac_cut_sites_bigwig_file
    - atac_peak_annotation_file
    - stdout_log
    - stderr_log

  # Each folder produced by generate_counts_matrix goes through
  # tar-compress.cwl so that it can be published as a single File output.
  compress_filtered_feature_bc_matrix_folder:
    run: ../tools/tar-compress.cwl
    in:
      folder_to_compress: generate_counts_matrix/filtered_feature_bc_matrix_folder
    out: [compressed_folder]

  compress_raw_feature_bc_matrices_folder:
    run: ../tools/tar-compress.cwl
    in:
      folder_to_compress: generate_counts_matrix/raw_feature_bc_matrices_folder
    out: [compressed_folder]

  compress_secondary_analysis_report_folder:
    run: ../tools/tar-compress.cwl
    in:
      folder_to_compress: generate_counts_matrix/secondary_analysis_report_folder
    out: [compressed_folder]

  # Inline tool that turns the metrics summary CSV into a Markdown bullet
  # list written to collected_statistics.md.
  collect_statistics:
    run:
      cwlVersion: v1.0
      class: CommandLineTool
      hints:
      - class: DockerRequirement
        dockerPull: rackspacedot/python37
      inputs:
        # Python program handed to "python3 -c" (position 5). The metrics
        # file follows it on the command line (position 6), so the program
        # sees its path as sys.argv[1].
        script:
          type: string?
          default: |
            #!/usr/bin/env python3
            import sys, csv
            # newline="" is the mode the csv module expects for input files
            with open(sys.argv[1], "r", newline="") as input_stream:
              rows = [row for row in csv.reader(input_stream) if row]
            # First row holds the metric names, the last row their values
            if len(rows) < 2:
              sys.exit("Metrics summary must contain a header row and a values row")
            keys, values = rows[0], rows[-1]
            with open("collected_statistics.md", "w") as output_stream:
              output_stream.write("### Cell Ranger ARC Statistics\n")
              for k, v in zip(keys, values):
                output_stream.write("- "+k+": "+v+"\n")
          inputBinding:
            position: 5
        metrics_summary_report:
          type: File
          inputBinding:
            position: 6
      outputs:
        collected_statistics:
          type: File
          outputBinding:
            # Match the report by name: a bare "*" stops resolving to a
            # single File as soon as anything else appears in the output
            # directory.
            glob: "collected_statistics.md"
      baseCommand: ["python3", "-c"]
    in:
      metrics_summary_report: generate_counts_matrix/metrics_summary_report
    out:
    - collected_statistics

  # Disabled: needs to be updated to display data from the Cell Ranger ARC pipeline
  # cellbrowser_build:
  #   run: ../tools/cellbrowser-build-cellranger.cwl
  #   in:
  #     secondary_analysis_report_folder: generate_counts_matrix/secondary_analysis_report_folder
  #     filtered_feature_bc_matrix_folder: generate_counts_matrix/filtered_feature_bc_matrix_folder
  #   out:
  #   - html_data
  #   - index_html_file

  # compress_html_data_folder:
  #   run: ../tools/tar-compress.cwl
  #   in:
  #     folder_to_compress: cellbrowser_build/html_data
  #   out:
  #   - compressed_folder


# Binds the "s:" prefix used by the metadata fields below to schema.org.
$namespaces:
  s: http://schema.org/

# RDF schema backing the schema.org terms used below.
$schemas:
- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf

# Display names of the workflow.
s:name: "Cell Ranger ARC Count Gene Expression + ATAC"
label: "Cell Ranger ARC Count Gene Expression + ATAC"
s:alternateName: "Counts ATAC and gene expression reads from a single 10x Genomics Cell Ranger Multiome ATAC + Gene Expression library"

# Provenance: download location, source repository and license.
s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/cellranger-arc-count.cwl
s:codeRepository: https://github.com/datirium/workflows
s:license: http://www.apache.org/licenses/LICENSE-2.0

s:isPartOf:
  class: s:CreativeWork
  s:name: Common Workflow Language
  s:url: http://commonwl.org/

# Authorship, nested as organization -> department -> lab -> person.
s:creator:
- class: s:Organization
  s:legalName: "Cincinnati Children's Hospital Medical Center"
  s:location:
  - class: s:PostalAddress
    s:addressCountry: "USA"
    s:addressLocality: "Cincinnati"
    s:addressRegion: "OH"
    s:postalCode: "45229"
    s:streetAddress: "3333 Burnet Ave"
    s:telephone: "+1(513)636-4200"
  s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png"
  s:department:
  - class: s:Organization
    s:legalName: "Allergy and Immunology"
    s:department:
    - class: s:Organization
      s:legalName: "Barski Research Lab"
      s:member:
      - class: s:Person
        s:name: Michael Kotliar
        s:email: mailto:misha.kotliar@gmail.com
        s:sameAs:
        - id: http://orcid.org/0000-0002-6486-3898


doc: |
  Cell Ranger ARC Count Gene Expression + ATAC
  ============================================