# QuantSeq 3' FWD, FWD-UMI or REV workflow for single-read mRNA-Seq data.
#
# Data flow (as wired in `steps` below):
#   extract_fastq
#     -> trim_fastq + bypass_trim   (when use_umi is false)
#     -> umisep_cutadapt            (when use_umi is true)
#   -> rename -> star_aligner -> samtools_sort_index_1
#   -> umi_tools_dedup (when use_umi is true) -> samtools_sort_index_2
#   -> bam_to_bigwig / calculate_expression / htseq_calculate_expression /
#      get_bam_statistics -> group_isoforms / get_stat
#   rename -> fastx_quality_stats, bowtie_aligner
#
# NOTE(review): cwlVersion is a development tag. `when` and `pickValue` used
# below are also part of the final v1.2 release - confirm whether the target
# runner can be switched to "v1.2".
cwlVersion: v1.2.0-dev4
class: Workflow


requirements:
  - class: SubworkflowFeatureRequirement
  - class: StepInputExpressionRequirement
  - class: MultipleInputFeatureRequirement
  - class: InlineJavascriptRequirement
    expressionLib:
    # get_root("a.b.c") -> "a": everything before the first dot of the basename
    - var get_root = function(basename) { return basename.split('.').slice(0,1).join('.'); };


'sd:metadata':
  - "../metadata/rnaseq-header.cwl"

'sd:upstream':
  genome_indices: "genome-indices.cwl"


inputs:

  star_indices_folder:
    type: Directory
    label: "STAR indices folder"
    'sd:upstreamSource': "genome_indices/star_indices"
    doc: "Path to STAR generated indices"

  bowtie_indices_folder:
    type: Directory
    label: "BowTie Ribosomal Indices"
    'sd:upstreamSource': "genome_indices/ribosomal_indices"
    doc: "Path to Bowtie generated indices"

  chrom_length_file:
    type: File
    label: "Chromosome length file"
    format: "http://edamontology.org/format_2330"
    'sd:upstreamSource': "genome_indices/chrom_length"
    doc: "Chromosome length file"

  annotation_file:
    type: File
    label: "Annotation file"
    format:
      - "http://edamontology.org/format_2306"
      - "http://edamontology.org/format_3475"
    'sd:upstreamSource': "genome_indices/annotation"
    doc: "GTF or TAB-separated annotation file"

  annotation_gtf_file:
    type: File
    label: "GTF annotation file"
    format: "http://edamontology.org/format_2306"
    'sd:upstreamSource': "genome_indices/annotation_gtf"
    doc: "GTF annotation file"

  fastq_file:
    type: File
    label: "FASTQ input file"
    format: "http://edamontology.org/format_1930"
    doc: "Reads data in a FASTQ format"

  # Selects between the UMI branch (umisep_cutadapt + umi_tools_dedup)
  # and the non-UMI branch (trim_fastq + bypass_trim).
  use_umi:
    type: boolean?
    default: true
    'sd:layout':
      advanced: true
    label: "Use UMIs"
    doc: "Use UMIs (for FWD-UMI libraries)"

  min_length:
    type: int?
    default: 30
    'sd:layout':
      advanced: true
    label: "Set minimum length for trimmed reads when running FWD/REV pipeline. Shorter reads get discarded. Set 0 to disable"
    doc: |
      Set minimum length for trimmed reads when running FWD/REV (not UMI) pipeline.
      Shorter reads get discarded. Applied only when running trim_fastq step.
      For FWD-UMI pipeline we use cutadapt instead of TrimGalore, so this input is not used

  exclude_chr:
    type: string?
    default: ""
    'sd:layout':
      advanced: true
    label: "Comma-separated list of chromosomes to be excluded from gene expression calculation"
    doc: "Comma-separated list of chromosomes to be excluded from gene expression calculation"

  clip_3p_end:
    type: int?
    default: 0
    'sd:layout':
      advanced: true
    label: "Clip N bp from 3p end"
    doc: "Number of bp to clip from the 3p end"

  clip_5p_end:
    type: int?
    default: 0
    'sd:layout':
      advanced: true
    label: "Clip N bp from 5p end"
    doc: "Number of bp to clip from the 5p end"

  threads:
    type: int?
    default: 1
    'sd:layout':
      advanced: true
    label: "Number of threads"
    doc: "Number of threads for those steps that support multi-threading"


outputs:

  bigwig:
    type: File
    format: "http://edamontology.org/format_3006"
    label: "BigWig file"
    doc: "Generated BigWig file"
    outputSource: bam_to_bigwig/bigwig_file
    'sd:visualPlugins':
    - igvbrowser:
        tab: 'IGV Genome Browser'
        id: 'igvbrowser'
        type: 'wig'
        name: "BigWig Track"
        height: 120

  star_final_log:
    type: File
    format: "http://edamontology.org/format_2330"
    label: "STAR final log"
    doc: "STAR Log.final.out"
    outputSource: star_aligner/log_final

  star_out_log:
    type: File?
    format: "http://edamontology.org/format_2330"
    label: "STAR log out"
    doc: "STAR Log.out"
    outputSource: star_aligner/log_out

  star_progress_log:
    type: File?
    format: "http://edamontology.org/format_2330"
    label: "STAR progress log"
    doc: "STAR Log.progress.out"
    outputSource: star_aligner/log_progress

  star_stdout_log:
    type: File?
    format: "http://edamontology.org/format_2330"
    label: "STAR stdout log"
    doc: "STAR Log.std.out"
    outputSource: star_aligner/log_std

  star_sj_log:
    type: File?
    format: "http://edamontology.org/format_2330"
    label: "STAR sj log"
    doc: "STAR SJ.out.tab"
    outputSource: star_aligner/log_sj

  fastx_statistics:
    type: File
    format: "http://edamontology.org/format_2330"
    label: "FASTQ statistics"
    doc: "fastx_quality_stats generated FASTQ file quality statistics file"
    outputSource: fastx_quality_stats/statistics_file
    'sd:visualPlugins':
    - line:
        tab: 'QC Plots'
        Title: 'Base frequency plot'
        xAxisTitle: 'Nucleotide position'
        yAxisTitle: 'Frequency'
        colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"]
        data: [$13, $14, $15, $16, $17]
    - boxplot:
        tab: 'QC Plots'
        Title: 'Quality Control'
        xAxisTitle: 'Nucleotide position'
        yAxisTitle: 'Quality score'
        colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"]
        data: [$11, $7, $8, $9, $12]

  bambai_pair:
    type: File
    format: "http://edamontology.org/format_2572"
    label: "Coordinate sorted BAM alignment file (+index BAI)"
    doc: "Coordinate sorted BAM file and BAI index file"
    outputSource: samtools_sort_index_2/bam_bai_pair
    'sd:visualPlugins':
    - igvbrowser:
        tab: 'IGV Genome Browser'
        id: 'igvbrowser'
        optional: true
        type: 'alignment'
        format: 'bam'
        name: "BAM Track"
        displayMode: "SQUISHED"

  bowtie_log:
    type: File
    format: "http://edamontology.org/format_2330"
    label: "Bowtie alignment log"
    doc: "Bowtie alignment log file"
    outputSource: bowtie_aligner/log_file

  rpkm_genes:
    type: File
    format: "http://edamontology.org/format_3475"
    label: "raw reads grouped by gene name"
    doc: "raw reads grouped by gene name"
    outputSource: group_isoforms/genes_file
    'sd:visualPlugins':
    - syncfusiongrid:
        tab: 'Gene Expression'
        Title: 'raw reads grouped by gene name'

  reads_per_gene_htseq_count:
    type: File
    format: "http://edamontology.org/format_3475"
    label: "Gene expression from htseq-count (reads per gene)"
    doc: "Gene expression from htseq-count (reads per gene)"
    outputSource: htseq_calculate_expression/gene_expression_report

  rpkm_common_tss:
    type: File
    format: "http://edamontology.org/format_3475"
    label: "raw reads grouped by common TSS"
    doc: "raw reads grouped by common TSS"
    outputSource: group_isoforms/common_tss_file

  get_stat_log:
    type: File?
    format: "http://edamontology.org/format_3750"
    label: "YAML formatted combined log"
    doc: "YAML formatted combined log"
    outputSource: get_stat/collected_statistics_yaml

  get_stat_markdown:
    type: File?
    label: "Markdown formatted combined log"
    format: "http://edamontology.org/format_3835"
    doc: "Markdown formatted combined log"
    outputSource: get_stat/collected_statistics_md
    'sd:visualPlugins':
    - markdownView:
        tab: 'Overview'

  get_formatted_stats:
    type: File?
    label: "Bowtie, STAR and GEEP mapping stats"
    format: "http://edamontology.org/format_2330"
    doc: "Processed and combined Bowtie & STAR aligner and GEEP logs"
    outputSource: get_stat/collected_statistics_tsv
    'sd:visualPlugins':
    - tableView:
        vertical: true
        tab: 'Overview'
    'sd:preview':
      'sd:visualPlugins':
      - pie:
          colors: ['#b3de69', '#99c0db', '#fdc381', '#fb8072', '#778899']
          data: [$2, $3, $4, $5, $6]

  bam_statistics_report:
    type: File
    label: "BAM statistics report"
    format: "http://edamontology.org/format_2330"
    doc: "BAM statistics report (right after alignment and sorting)"
    outputSource: get_bam_statistics/log_file

  # The outputs below come from conditional steps, hence the optional types:
  # a skipped step yields null for all of its outputs.
  trimgalore_report:
    type: File?
    format: "http://edamontology.org/format_2330"
    label: "Adapter trimming report from TrimGalore. Even if it was eventually bypassed"
    doc: "Adapter trimming report from TrimGalore. Even if it was eventually bypassed"
    outputSource: trim_fastq/report_file

  cutadapt_report:
    type: File?
    format: "http://edamontology.org/format_2330"
    label: "Adapter trimming report from Cutadapt"
    doc: "Adapter trimming report from Cutadapt"
    outputSource: umisep_cutadapt/report_file

  umi_tools_dedup_stdout:
    type: File?
    format: "http://edamontology.org/format_2330"
    label: "umi_tools dedup stdout log"
    doc: "umi_tools dedup stdout log"
    outputSource: umi_tools_dedup/stdout_log

  umi_tools_dedup_stderr:
    type: File?
    format: "http://edamontology.org/format_2330"
    label: "umi_tools dedup stderr log"
    doc: "umi_tools dedup stderr log"
    outputSource: umi_tools_dedup/stderr_log

  umi_tools_dedup_stats:
    type:
    - "null"
    - File[]
    label: "umi_tools dedup stats"
    doc: "umi_tools dedup stats"
    outputSource: umi_tools_dedup/output_stats


steps:

  extract_fastq:
    run: ../tools/extract-fastq.cwl
    in:
      compressed_file: fastq_file
    out:
    - fastq_file

  # Non-UMI branch: TrimGalore trimming.
  trim_fastq:
    # `when` is evaluated as a JavaScript expression (InlineJavascriptRequirement
    # is declared above), so `!` is plain boolean negation.
    when: $(!inputs.use_umi)
    run: ../tools/trimgalore.cwl
    in:
      use_umi: use_umi                                 # need it for "when"
      input_file: extract_fastq/fastq_file
      dont_gzip:
        default: true                                  # saves time
      length: min_length
    out:
    - trimmed_file
    - report_file

  # Non-UMI branch: chooses between the original and the trimmed FASTQ.
  # Guarded with the same condition as trim_fastq: when trim_fastq is skipped its
  # outputs are null, and `rename` below expects exactly one non-null source.
  bypass_trim:
    when: $(!inputs.use_umi)
    run: ../tools/bypass-trimgalore-se.cwl
    in:
      use_umi: use_umi                                 # need it for "when"
      original_fastq_file: extract_fastq/fastq_file
      trimmed_fastq_file: trim_fastq/trimmed_file
      trimming_report_file: trim_fastq/report_file
      min_reads_count:
        default: 100  # any small number should be good, as we are catching the case when TrimGalore discarded all reads
    out:
    - selected_fastq_file

  # UMI branch: moves the first 10 bases of each read (the UMI) into the read
  # name with awk, then runs a chain of cutadapt passes.
  umisep_cutadapt:
    when: $(inputs.use_umi)
    in:
      use_umi: use_umi                                 # need it for "when"
      input_file: extract_fastq/fastq_file
    out:
    - trimmed_file
    - report_file
    run:
      cwlVersion: v1.0
      class: CommandLineTool
      hints:
      - class: DockerRequirement
        dockerPull: scidap/trimgalore:v0.6.6
      inputs:
        # Executed as `bash -c <bash_script> <input_file>`, so the FASTQ path
        # is available to the script as $0.
        bash_script:
          type: string?
          default: |
            #!/bin/bash
            FILE=$0
            BASENAME=$(basename "$FILE")
            cat "${FILE}" |
            awk '
              NR%4==1{ rd_name=$1; rd_info=$2 }
              NR%4==2{ umi=substr($1,1,10); rd_seq=substr($1,11) }
              NR%4==0{ print rd_name"_"umi" "rd_info; print rd_seq; print "+"; print substr($1,11) }' |
            cutadapt -m 20 -O 20 -a "polyA=A{20}" -a "QUALITY=G{20}" -n 2 - |
            cutadapt -m 20 -O 3 --nextseq-trim=10 -a "r1adapter=A{18}AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=3;max_error_rate=0.100000" - |
            cutadapt -m 20 -O 3 -a "r1polyA=A{18}" - |
            cutadapt -m 20 -O 20 -g "r1adapter=AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=20" --discard-trimmed -o "trimmed_${BASENAME}" -
          inputBinding:
            position: 1
          doc: "Bash function to run awk & cutadapt from Lexogen"
        input_file:
          type: File
          inputBinding:
            position: 2
          doc: "Input FASTQ file"
      outputs:
        trimmed_file:
          type: File
          outputBinding:
            glob: "trimmed_*"
        # stderr of the whole pipeline is captured as the report
        report_file:
          type: stderr
      baseCommand: [bash, '-c']
      stderr: umisep_cutadapt_report.txt

  # Gives whichever branch produced a FASTQ the basename of the extracted input.
  rename:
    run: ../tools/rename.cwl
    in:
      source_file:
        source:
        - umisep_cutadapt/trimmed_file
        - bypass_trim/selected_fastq_file
        pickValue: the_only_non_null                   # should be always only one non-null value
      target_filename:
        source: extract_fastq/fastq_file
        valueFrom: $(self.basename)
    out:
    - target_file

  star_aligner:
    run: ../tools/star-alignreads.cwl
    in:
      readFilesIn: rename/target_file
      genomeDir: star_indices_folder
      outFilterMultimapNmax:
        default: 1
      outFilterMismatchNmax:
        default: 5
      alignSJDBoverhangMin:
        default: 1
      seedSearchStartLmax:
        default: 15
      clip3pNbases: clip_3p_end
      clip5pNbases: clip_5p_end
      threads: threads
    out:
    - aligned_file
    - log_final
    - uniquely_mapped_reads_number
    - log_out
    - log_progress
    - log_std
    - log_sj

  fastx_quality_stats:
    run: ../tools/fastx-quality-stats.cwl
    in:
      input_file: rename/target_file
    out: [statistics_file]

  samtools_sort_index_1:
    run: ../tools/samtools-sort-index.cwl
    in:
      sort_input: star_aligner/aligned_file
      sort_output_filename:
        source: rename/target_file
        # last path component of the location, last extension replaced by ".bam"
        valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'.bam')
      threads: threads
    out: [bam_bai_pair]

  # UMI branch: deduplication of the sorted alignments.
  umi_tools_dedup:
    when: $(inputs.use_umi)
    run: ../tools/umi-tools-dedup.cwl
    in:
      use_umi: use_umi                                 # need it for "when"
      bam_file: samtools_sort_index_1/bam_bai_pair
      multimapping_detection_method:
        default: "NH"
    out:
    - dedup_bam_file
    - output_stats
    - stdout_log
    - stderr_log

  samtools_sort_index_2:                               # easier to run it twice even if umi_tools_dedup was skipped
    run: ../tools/samtools-sort-index.cwl
    in:
      sort_input:
        source:
        - umi_tools_dedup/dedup_bam_file               # will be selected first (if not null)
        - samtools_sort_index_1/bam_bai_pair
        pickValue: first_non_null
      sort_output_filename:
        source: rename/target_file
        # last path component of the location, last extension replaced by ".bam"
        valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'.bam')
      threads: threads
    out: [bam_bai_pair]

  bam_to_bigwig:
    run: ../tools/bam-bedgraph-bigwig.cwl
    in:
      bam_file: samtools_sort_index_2/bam_bai_pair
      chrom_length_file: chrom_length_file
      mapped_reads_number: star_aligner/uniquely_mapped_reads_number
    out: [bigwig_file]

  bowtie_aligner:
    run: ../tools/bowtie-alignreads.cwl
    in:
      upstream_filelist: rename/target_file
      indices_folder: bowtie_indices_folder
      clip_3p_end: clip_3p_end
      clip_5p_end: clip_5p_end
      v:
        default: 3
      m:
        default: 1
      best:
        default: true
      strata:
        default: true
      sam:
        default: true
      threads: threads
    out: [log_file]

  calculate_expression:
    run: ../tools/geep.cwl
    in:
      bam_file: samtools_sort_index_2/bam_bai_pair
      annotation_file: annotation_file
      rpkm_threshold:
        default: 0
      max_cycles:
        default: 0
      exclude_chr: exclude_chr
      threads: threads
    out: [isoforms_file]

  # Groups isoform-level expression by gene and by common TSS, then strips the
  # last whitespace-separated column from both reports with sed.
  group_isoforms:
    in:
      isoforms_file: calculate_expression/isoforms_file
    out:
    - genes_file
    - common_tss_file
    run:
      cwlVersion: v1.0
      class: CommandLineTool
      hints:
      - class: DockerRequirement
        dockerPull: biowardrobe2/scidap-deseq:v0.0.20
      inputs:
        # Executed as `bash -c <bash_script> <isoforms_file>`, so the isoforms
        # path is available to the script as $0.
        bash_script:
          type: string?
          default: |
            #!/bin/bash
            FILE=$0
            BASENAME=$(basename "$FILE")
            get_gene_n_tss.R --isoforms "${FILE}" --gene grouped.genes.tsv --tss grouped.common_tss.tsv
            sed -ibak 's/[[:space:]]\{1,\}[^[:space:]]\{1,\}$//' grouped.genes.tsv
            sed -ibak 's/[[:space:]]\{1,\}[^[:space:]]\{1,\}$//' grouped.common_tss.tsv
            rm -f ./*bak
          inputBinding:
            position: 1
          doc: "Bash function to run R script to group expression by genes and common TSS"
        isoforms_file:
          type: File
          inputBinding:
            position: 5
      outputs:
        # Neither *_filename input is declared on this tool, so both globs
        # resolve to their wildcard fallback.
        genes_file:
          type: File
          outputBinding:
            glob: $(inputs.genes_filename?inputs.genes_filename:"*genes.tsv")
          doc: "Output TSV gene expression file"
        common_tss_file:
          type: File
          outputBinding:
            glob: $(inputs.common_tss_filename?inputs.common_tss_filename:"*common_tss.tsv")
          doc: "Output TSV common tss expression file"
      baseCommand: [bash, '-c']

  get_bam_statistics:
    run: ../tools/samtools-stats.cwl
    in:
      bambai_pair: samtools_sort_index_2/bam_bai_pair
      output_filename:
        source: samtools_sort_index_2/bam_bai_pair
        valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt")
    out: [log_file]

  get_stat:
    run: ../tools/collect-statistics-rna-quantseq.cwl
    in:
      star_alignment_report: star_aligner/log_final
      bowtie_alignment_report: bowtie_aligner/log_file
      bam_statistics_report: get_bam_statistics/log_file
      isoforms_file: calculate_expression/isoforms_file
    out:
    - collected_statistics_yaml
    - collected_statistics_tsv
    - collected_statistics_md

  htseq_calculate_expression:
    run: ../tools/htseq-count.cwl
    in:
      alignment_bam_file: samtools_sort_index_2/bam_bai_pair
      annotation_gtf_file: annotation_gtf_file
    out:
    - gene_expression_report


$namespaces:
  s: http://schema.org/

$schemas:
- http://schema.org/version/9.0/schemaorg-current-http.rdf

s:name: "QuantSeq 3' FWD, FWD-UMI or REV for single-read mRNA-Seq data"
label: "QuantSeq 3' FWD, FWD-UMI or REV for single-read mRNA-Seq data"
s:alternateName: "Runs QuantSeq 3' FWD, FWD-UMI or REV analysis for single-read mRNA-Seq data"

s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/trim-quantseq-mrnaseq-se.cwl
s:codeRepository: https://github.com/datirium/workflows
s:license: http://www.apache.org/licenses/LICENSE-2.0

s:isPartOf:
  class: s:CreativeWork
  s:name: Common Workflow Language
  s:url: http://commonwl.org/

s:creator:
- class: s:Organization
  s:legalName: "Datirium, LLC"
  s:member:
  - class: s:Person
    s:name: Artem Barski
    s:email: mailto:Artem.Barski@datirium.com
  - class: s:Person
    s:name: Andrey Kartashov
    s:email: mailto:Andrey.Kartashov@datirium.com
    s:sameAs:
    - id: http://orcid.org/0000-0001-9102-5681
  - class: s:Person
    s:name: Michael Kotliar
    s:email: mailto:misha.kotliar@gmail.com
    s:sameAs:
    - id: http://orcid.org/0000-0002-6486-3898


# doc:
#   $include: ../descriptions/trim-quantseq-mrnaseq-se.md


doc: |
  ### Devel version of QuantSeq 3' FWD, FWD-UMI or REV for single-read mRNA-Seq data