cwlVersion: v1.0
class: Workflow


requirements:
  - class: SubworkflowFeatureRequirement
  - class: StepInputExpressionRequirement
  - class: InlineJavascriptRequirement
    expressionLib:
    - var get_root = function(basename) {
          return basename.split('.').slice(0,1).join('.');
      };


'sd:metadata':
  - "../metadata/rnaseq-header.cwl"

'sd:upstream':
  genome_indices: "genome-indices.cwl"


inputs:

# General inputs

  star_indices_folder:
    type: Directory
    label: "STAR indices folder"
    'sd:upstreamSource': "genome_indices/star_indices"
    doc: "Path to STAR generated indices"

  bowtie_indices_folder:
    type: Directory
    label: "BowTie Ribosomal Indices"
    'sd:upstreamSource': "genome_indices/ribosomal_indices"
    doc: "Path to Bowtie generated indices"

  chrom_length_file:
    type: File
    label: "Chromosome length file"
    format: "http://edamontology.org/format_2330"
    'sd:upstreamSource': "genome_indices/chrom_length"
    doc: "Chromosome length file"

  annotation_file:
    type: File
    label: "Annotation file"
    format:
      - "http://edamontology.org/format_2306"
      - "http://edamontology.org/format_3475"
    'sd:upstreamSource': "genome_indices/annotation"
    doc: "GTF or TAB-separated annotation file"

  fastq_file:
    type: File
    label: "FASTQ input file"
    format: "http://edamontology.org/format_1930"
    doc: "Reads data in a FASTQ format"

# Advanced inputs

  exclude_chr:
    type: string?
    'sd:layout':
      advanced: true
    label: "Chromosome to be excluded in rpkm calculation"
    doc: "Chromosome to be excluded in rpkm calculation"

  clip_3p_end:
    type: int?
    default: 0
    'sd:layout':
      advanced: true
    label: "Clip from 3p end"
    doc: "Number of bases to clip from the 3p end"

  clip_5p_end:
    type: int?
    default: 0
    'sd:layout':
      advanced: true
    label: "Clip from 5p end"
    doc: "Number of bases to clip from the 5p end"

# System dependent

  threads:
    type: int?
    default: 1
    'sd:layout':
      advanced: true
    label: "Number of threads"
    doc: "Number of threads for those steps that support multi-threading"


outputs:

  bigwig:
    type: File
    format: "http://edamontology.org/format_3006"
    label: "BigWig file"
    doc: "Generated BigWig file"
    outputSource: bam_to_bigwig/bigwig_file
    'sd:visualPlugins':
    - igvbrowser:
        tab: 'IGV Genome Browser'
        id: 'igvbrowser'
        type: 'wig'
        name: "BigWig Track"
        height: 120

  star_final_log:
    type: File
    format: "http://edamontology.org/format_2330"
    label: "STAR final log"
    doc: "STAR Log.final.out"
    outputSource: star_aligner/log_final

  star_out_log:
    type: File?
    format: "http://edamontology.org/format_2330"
    label: "STAR log out"
    doc: "STAR Log.out"
    outputSource: star_aligner/log_out

  star_progress_log:
    type: File?
    format: "http://edamontology.org/format_2330"
    label: "STAR progress log"
    doc: "STAR Log.progress.out"
    outputSource: star_aligner/log_progress

  star_stdout_log:
    type: File?
    format: "http://edamontology.org/format_2330"
    label: "STAR stdout log"
    doc: "STAR Log.std.out"
    outputSource: star_aligner/log_std

  star_sj_log:
    type: File?
    format: "http://edamontology.org/format_2330"
    label: "STAR sj log"
    doc: "STAR SJ.out.tab"
    outputSource: star_aligner/log_sj

  fastx_statistics:
    type: File
    format: "http://edamontology.org/format_2330"
    label: "FASTQ statistics"
    doc: "fastx_quality_stats generated FASTQ file quality statistics file"
    outputSource: fastx_quality_stats/statistics_file
    'sd:visualPlugins':
    - line:
        tab: 'QC Plots'
        Title: 'Base frequency plot'
        xAxisTitle: 'Nucleotide position'
        yAxisTitle: 'Frequency'
        colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"]
        data: [$13, $14, $15, $16, $17]
    - boxplot:
        tab: 'QC Plots'
        Title: 'Quality Control'
        xAxisTitle: 'Nucleotide position'
        yAxisTitle: 'Quality score'
        colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"]
        data: [$11, $7, $8, $9, $12]

  bambai_pair:
    type: File
    format: "http://edamontology.org/format_2572"
    label: "Coordinate sorted BAM alignment file (+index BAI)"
    doc: "Coordinate sorted BAM file and BAI index file"
    outputSource: samtools_sort_index_2/bam_bai_pair
    'sd:visualPlugins':
    - igvbrowser:
        tab: 'IGV Genome Browser'
        id: 'igvbrowser'
        optional: true
        type: 'alignment'
        format: 'bam'
        name: "BAM Track"
        displayMode: "SQUISHED"

  bowtie_log:
    type: File
    format: "http://edamontology.org/format_2330"
    label: "Bowtie alignment log"
    doc: "Bowtie alignment log file"
    outputSource: bowtie_aligner/log_file

  # rpkm_isoforms:
  #   type: File
  #   format: "http://edamontology.org/format_3752"
  #   label: "RPKM, grouped by isoforms"
  #   doc: "Calculated rpkm values, grouped by isoforms"
  #   outputSource: rpkm_calculation/isoforms_file

  rpkm_genes:
    type: File
    format: "http://edamontology.org/format_3475"
    label: "raw reads grouped by gene name"
    doc: "raw reads grouped by gene name"
    outputSource: group_isoforms/genes_file
    'sd:visualPlugins':
    - syncfusiongrid:
        tab: 'Gene Expression'
        Title: 'raw reads grouped by gene name'

  rpkm_common_tss:
    type: File
    format: "http://edamontology.org/format_3475"
    label: "raw reads grouped by common TSS"
    doc: "raw reads grouped by common TSS"
    outputSource: group_isoforms/common_tss_file

  get_stat_log:
    type: File?
    label: "YAML formatted combined log"
    format: "http://edamontology.org/format_3750"
    doc: "YAML formatted combined log"
    outputSource: get_stat/collected_statistics_yaml

  get_stat_markdown:
    type: File?
    label: "Markdown formatted combined log"
    format: "http://edamontology.org/format_3835"
    doc: "Markdown formatted combined log"
    outputSource: get_stat/collected_statistics_md
    'sd:visualPlugins':
    - markdownView:
        tab: 'Overview'

  get_formatted_stats:
    type: File?
    label: "Bowtie, STAR and GEEP mapping stats"
    format: "http://edamontology.org/format_2330"
    doc: "Processed and combined Bowtie & STAR aligner and GEEP logs"
    outputSource: get_stat/collected_statistics_tsv
    'sd:visualPlugins':
    - tableView:
        vertical: true
        tab: 'Overview'
    'sd:preview':
      'sd:visualPlugins':
      - pie:
          colors: ['#b3de69', '#99c0db', '#fdc381', '#fb8072', '#778899']
          data: [$2, $3, $4, $5, $6]

  bam_statistics_report:
    type: File
    label: "BAM statistics report"
    format: "http://edamontology.org/format_2330"
    doc: "BAM statistics report (right after alignment and sorting)"
    outputSource: get_bam_statistics/log_file


  trim_report:
    type: File
    label: "cutadapt report"
    doc: "cutadapt generated log"
    outputSource: umisep_cutadapt/report_file

  umi_tools_dedup_stdout:
    type: File
    label: "umi_tools dedup stdout log"
    doc: "umi_tools dedup stdout log"
    outputSource: umi_tools_dedup/stdout_log

  umi_tools_dedup_stderr:
    type: File
    label: "umi_tools dedup stderr log"
    doc: "umi_tools dedup stderr log"
    outputSource: umi_tools_dedup/stderr_log

  umi_tools_dedup_stats:
    type:
      - "null"
      - File[]
    label: "umi_tools dedup stats"
    doc: "umi_tools dedup stats"
    outputSource: umi_tools_dedup/output_stats

  # trim_report:
  #   type: File
  #   label: "TrimGalore report"
  #   doc: "TrimGalore generated log"
  #   outputSource: trim_fastq/report_file


steps:

  extract_fastq:
    run: ../tools/extract-fastq.cwl
    in:
      compressed_file: fastq_file
    out: [fastq_file]

  # trim_fastq:
  #   run: ../tools/trimgalore.cwl
  #   in:
  #     input_file: extract_fastq/fastq_file
  #     dont_gzip:
  #       default: true
  #     length:
  #       default: 30
  #   out:
  #     - trimmed_file
  #     - report_file

  umisep_cutadapt:
    in:
      input_file: extract_fastq/fastq_file
    out:
      - trimmed_file
      - report_file
    run:
      cwlVersion: v1.0
      class: CommandLineTool
      hints:
      - class: DockerRequirement
        dockerPull: scidap/trimgalore:v0.6.6
      inputs:
        bash_script:
          type: string?
          default: |
            #!/bin/bash

            FILE=$0
            BASENAME=$(basename "$FILE")

            cat ${FILE} | awk '
            NR%4==1{ rd_name=$1; rd_info=$2 }
            NR%4==2{ umi=substr($1,1,10); rd_seq=substr($1,11) }
            NR%4==0{ print rd_name"_"umi" "rd_info; print rd_seq; print "+"; print substr($1,11) }' |
            cutadapt -m 20 -O 20 -a "polyA=A{20}" -a "QUALITY=G{20}" -n 2 - |
            cutadapt  -m 20 -O 3 --nextseq-trim=10  -a "r1adapter=A{18}AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=3;max_error_rate=0.100000" - |
            cutadapt -m 20 -O 3 -a "r1polyA=A{18}" - |
            cutadapt -m 20 -O 20 -g "r1adapter=AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=20" --discard-trimmed -o trimmed_${BASENAME} -

          inputBinding:
            position: 1
          doc: |
            Bash function to run awk & cutadapt from Lexogen with all input parameters or skip it if trigger is false
        input_file:
          type:
            - File
          inputBinding:
            position: 2
          doc: |
            Input FASTQ file
      outputs:
        trimmed_file:
          type: File
          outputBinding:
            glob: "trimmed_*"
        report_file:
          type: stderr
      baseCommand: [bash, '-c']
      stderr: umisep_cutadapt.log

  rename:
    run: ../tools/rename.cwl
    in:
      source_file: umisep_cutadapt/trimmed_file
      target_filename:
        source: extract_fastq/fastq_file
        valueFrom: $(self.basename)
    out:
      - target_file

  star_aligner:
    run: ../tools/star-alignreads.cwl
    in:
      readFilesIn: rename/target_file
      genomeDir: star_indices_folder
      outFilterMultimapNmax:
        default: 1
      outFilterMismatchNmax:
        default: 5
      alignSJDBoverhangMin:
        default: 1
      seedSearchStartLmax:
        default: 15
      clip3pNbases: clip_3p_end
      clip5pNbases: clip_5p_end
      threads: threads
    out:
      - aligned_file
      - log_final
      - uniquely_mapped_reads_number
      - log_out
      - log_progress
      - log_std
      - log_sj

  fastx_quality_stats:
    run: ../tools/fastx-quality-stats.cwl
    in:
      input_file: rename/target_file
    out: [statistics_file]

  samtools_sort_index_1:
    run: ../tools/samtools-sort-index.cwl
    in:
      sort_input: star_aligner/aligned_file
      sort_output_filename:
        source: rename/target_file
        valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'.bam')
      threads: threads
    out: [bam_bai_pair]

  umi_tools_dedup:
    run: ../tools/umi-tools-dedup.cwl
    in:
      bam_file: samtools_sort_index_1/bam_bai_pair
      multimapping_detection_method:
        default: "NH"
    out: [dedup_bam_file, stdout_log, stderr_log, output_stats]

  samtools_sort_index_2:
    run: ../tools/samtools-sort-index.cwl
    in:
      sort_input: umi_tools_dedup/dedup_bam_file
      sort_output_filename:
        source: rename/target_file
        valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'.bam')
      threads: threads
    out: [bam_bai_pair]

  bam_to_bigwig:
    run: ../tools/bam-bedgraph-bigwig.cwl
    in:
      bam_file: samtools_sort_index_2/bam_bai_pair
      chrom_length_file: chrom_length_file
      mapped_reads_number: star_aligner/uniquely_mapped_reads_number
#     fragmentsize is not set (STAR gives only read length). It will be calculated automatically by bedtools genomecov.
    out: [bigwig_file]

  bowtie_aligner:
    run: ../tools/bowtie-alignreads.cwl
    in:
      upstream_filelist: rename/target_file
      indices_folder: bowtie_indices_folder
      clip_3p_end: clip_3p_end
      clip_5p_end: clip_5p_end
      v:
        default: 3
      m:
        default: 1
      best:
        default: true
      strata:
        default: true
      sam:
        default: true
      threads: threads
    out: [log_file]

  rpkm_calculation:
    run: ../tools/geep.cwl
    in:
      bam_file: samtools_sort_index_2/bam_bai_pair
      annotation_file: annotation_file
      rpkm_threshold:
        default: 0
      max_cycles:
        default: 0
      exclude_chr: exclude_chr
      threads: threads
    out: [isoforms_file]

  group_isoforms:
    in:
      isoforms_file: rpkm_calculation/isoforms_file
    out:
      - genes_file
      - common_tss_file
      - error_file
    run:
      cwlVersion: v1.0
      class: CommandLineTool
      hints:
        - class: DockerRequirement
          dockerPull: biowardrobe2/scidap-deseq:v0.0.20
      inputs:
        bash_script:
          type: string?
          default: |
            #!/bin/bash

            FILE=$0
            BASENAME=$(basename "$FILE")

            get_gene_n_tss.R --isoforms "${FILE}"

            sed -ibak 's/[[:space:]]\{1,\}[^[:space:]]\{1,\}$//' "${BASENAME}.genes.tsv"
            sed -ibak 's/[[:space:]]\{1,\}[^[:space:]]\{1,\}$//' "${BASENAME}.common_tss.tsv"
            rm -f ./*bak
          inputBinding:
            position: 1
          doc: |
            Bash function to run awk & cutadapt from Lexogen with all input parameters or skip it if trigger is false
        isoforms_file:
          type: File
          inputBinding:
            position: 5
      outputs:
        genes_file:
          type: File
          outputBinding:
            glob: $(inputs.genes_filename?inputs.genes_filename:"*genes.tsv")
          doc: "Output TSV gene expression file"
        common_tss_file:
          type: File
          outputBinding:
            glob: $(inputs.common_tss_file?inputs.common_tss_file:"*common_tss.tsv")
          doc: "Output TSV common tss expression file"
        error_file:
          type: stderr
      baseCommand: [bash, '-c']
      stderr: group_isoforms_error.log

  get_bam_statistics:
    run: ../tools/samtools-stats.cwl
    in:
      bambai_pair: samtools_sort_index_2/bam_bai_pair
      output_filename:
        source: samtools_sort_index_2/bam_bai_pair
        valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt")
    out: [log_file]

  get_stat:
      run: ../tools/collect-statistics-rna-quantseq.cwl
      in:
        # trimgalore_report_fastq_1: trim_fastq/report_file
        star_alignment_report: star_aligner/log_final
        bowtie_alignment_report: bowtie_aligner/log_file
        bam_statistics_report: get_bam_statistics/log_file
        isoforms_file: rpkm_calculation/isoforms_file
      out: [collected_statistics_yaml, collected_statistics_tsv, collected_statistics_md]


$namespaces:
  s: http://schema.org/

$schemas:
- http://schema.org/version/9.0/schemaorg-current-http.rdf

s:name: "QuantSeq 3' mRNA-Seq single-read"
label: "QuantSeq 3' mRNA-Seq single-read"
s:alternateName: "Run QuantSeq 3' mRNA-Seq basic analysis with single-end data file"

s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/trim-quantseq-mrnaseq-se.cwl
s:codeRepository: https://github.com/datirium/workflows
s:license: http://www.apache.org/licenses/LICENSE-2.0

s:isPartOf:
  class: s:CreativeWork
  s:name: Common Workflow Language
  s:url: http://commonwl.org/

s:creator:
  - class: s:Organization
    s:legalName: "Datirium, LLC"
    s:member:
      - class: s:Person
        s:name: Artem BArski
        s:email: mailto:Artem.Barski@datirum.com
      - class: s:Person
        s:name: Andrey Kartashov
        s:email: mailto:Andrey.Kartashov@datirium.com
        s:sameAs:
          - id: http://orcid.org/0000-0001-9102-5681

# doc:
#   $include: ../descriptions/trim-quantseq-mrnaseq-se.md


doc: |
  ### Pipeline for Lexogen's QuantSeq 3' mRNA-Seq Library Prep Kit FWD for Illumina

  [Lexogen original documentation](https://www.lexogen.com/quantseq-3mrna-sequencing/)

  * Cost-saving and streamlined globin mRNA depletion during QuantSeq library preparation
  * Genome-wide analysis of gene expression
  * Cost-efficient alternative to microarrays and standard RNA-Seq
  * Down to 100 pg total RNA input
  * Applicable for low quality and FFPE samples
  * Single-read sequencing of up to 9,216 samples/lane
  * Dual indexing and Unique Molecular Identifiers (UMIs) are available

  ### QuantSeq 3’ mRNA-Seq Library Prep Kit FWD for Illumina

  The QuantSeq FWD Kit is a library preparation protocol designed to generate Illumina compatible libraries of sequences close to the 3’ end of polyadenylated RNA.

  QuantSeq FWD contains the Illumina Read 1 linker sequence in the second strand synthesis primer,
  hence NGS reads are generated towards the poly(A) tail, directly reflecting the mRNA sequence (see workflow).
  This version is the recommended standard for gene expression analysis.
  Lexogen furthermore provides a high-throughput version with optional dual indexing (i5 and i7 indices) allowing up to 9,216 samples to be multiplexed in one lane.

  #### Analysis of Low Input and Low Quality Samples

  The required input amount of total RNA is as low as 100 pg.
  QuantSeq is suitable to reproducibly generate libraries from low quality RNA, including FFPE samples.
  See Fig.1 and 2 for a comparison of two different RNA qualities (FFPE and fresh frozen cryo-block) of the same sample.

  ![Fig 1](https://www.lexogen.com/wp-content/uploads/2017/02/Correlation_Samples.jpg)

  Figure 1 | Correlation of gene counts of FFPE and cryo samples.

  ![Fig 2](https://www.lexogen.com/wp-content/uploads/2017/02/Venn_diagrams.jpg)

  Figure 2 | Venn diagrams of genes detected by QuantSeq at a uniform read depth of 2.5 M reads in FFPE and cryo samples with 1, 5, and 10 reads/gene thresholds.

  #### Mapping of Transcript End Sites

  By using longer reads QuantSeq FWD allows to exactly pinpoint the 3’ end of poly(A) RNA (see Fig. 3) and therefore obtain accurate information about the 3’ UTR.

  ![Figure 3](https://www.lexogen.com/wp-content/uploads/2017/02/Read_Coverage.jpg)

  Figure 3 | QuantSeq read coverage versus normalized transcript length of NGS libraries derived from FFPE-RNA (blue) and cryo-preserved RNA (red).


  ### Current workflow should be used only with the single-end RNA-Seq data. It performs the following steps:

  1. Separates UMIes and trims adapters from input FASTQ file
  2. Uses ```STAR``` to align reads from input FASTQ file according to the predefined reference indices; generates unsorted BAM file and alignment statistics file
  3. Uses ```fastx_quality_stats``` to analyze input FASTQ file and generates quality statistics file
  4. Uses ```samtools sort``` and generates coordinate sorted BAM(+BAI) file pair from the unsorted BAM file obtained on the step 2 (after running STAR)
  5. Uses ```umi_tools dedup``` and generates final filtered sorted BAM(+BAI) file pair
  6. Generates BigWig file on the base of sorted BAM file
  7. Maps input FASTQ file to predefined rRNA reference indices using ```bowtie``` to define the level of rRNA contamination; exports resulted statistics to file
  8. Calculates isoform expression level for the sorted BAM file and GTF/TAB annotation file using GEEP reads-counting utility; exports results to file