# EMG core analysis workflow.
#
# Stages, in the order the steps are declared below:
#   1. QC statistics over the input sequences.
#   2. Ribosomal ncRNA search, followed by SSU / LSU / 5S coordinate
#      extraction and sequence fetching from the indexed reads.
#   3. SSU classification, OTU counting, visualisation and BIOM conversion
#      to both HDF5 and JSON.
#   4. Search for other ncRNAs using a second set of covariance models.
#   5. ORF prediction, reformatting, functional analysis, summary statistics
#      and sequence categorisation.
cwlVersion: v1.0
class: Workflow
label: EMG core analysis

requirements:
  - class: StepInputExpressionRequirement
  - class: SubworkflowFeatureRequirement
  - class: SchemaDefRequirement
    types:
      - $import: ../tools/FragGeneScan-model.yaml
      - $import: ../tools/InterProScan-apps.yaml
      - $import: ../tools/InterProScan-protein_formats.yaml
      - $import: ../tools/esl-reformat-replace.yaml
      - $import: ../tools/biom-convert-table.yaml
      - $import: ../tools/trimmomatic-sliding_window.yaml
      - $import: ../tools/trimmomatic-end_mode.yaml
      - $import: ../tools/trimmomatic-phred.yaml

inputs:
  # Label passed to the OTU count conversion step.
  sequencing_run_id:
    type: string
  # Sequences that every analysis branch starts from.
  input_sequences:
    type: File
    format: edam:format_1929  # FASTA
  # Covariance models (and clan file) for the ribosomal ncRNA search.
  ncRNA_ribosomal_models:
    type: File[]
  ncRNA_ribosomal_model_clans:
    type: File
  # Covariance models (and clan file) for the non-ribosomal ncRNA search.
  ncRNA_other_models:
    type: File[]
  ncRNA_other_model_clans:
    type: File
  # Model selection forwarded to the ORF prediction sub-workflow.
  fraggenescan_model:
    type: ../tools/FragGeneScan-model.yaml#model
  # Reference database and taxonomy used for SSU classification.
  mapseq_ref:
    type: File
    format: edam:format_1929  # FASTA
    secondaryFiles: .mscluster
  mapseq_taxonomy:
    type: File
  # Configuration handed to the functional analysis sub-workflow.
  go_summary_config:
    type: File

outputs:
  # Sequence QC statistics (all produced by the sequence_stats step)
  qc_stats_summary:
    type: File
    outputSource: sequence_stats/summary_out
  qc_stats_seq_len_pcbin:
    type: File
    outputSource: sequence_stats/seq_length_pcbin
  qc_stats_seq_len_bin:
    type: File
    outputSource: sequence_stats/seq_length_bin
  qc_stats_seq_len:
    type: File
    outputSource: sequence_stats/seq_length_out
  qc_stats_nuc_dist:
    type: File
    outputSource: sequence_stats/nucleotide_distribution_out
  qc_stats_gc_pcbin:
    type: File
    outputSource: sequence_stats/gc_sum_pcbin
  qc_stats_gc_bin:
    type: File
    outputSource: sequence_stats/gc_sum_bin
  qc_stats_gc:
    type: File
    outputSource: sequence_stats/gc_sum_out

  # Taxonomic analysis: extracted SSU sequences and their classification
  SSU_sequences:
    type: File
    outputSource: extract_SSUs/sequences
  ssu_classifications:
    type: File
    outputSource: classify_SSUs/classifications

  # Same extraction, repeated for LSU
  LSU_sequences:
    type: File
    outputSource: extract_LSUs/sequences

  # Same extraction, repeated for 5S
  5S_sequences:
    type: File
    outputSource: extract_5Ss/sequences

  # Predicted proteins and their annotations
  predicted_CDS:
    type: File
    outputSource: ORF_prediction/predicted_CDS_aa

  # GO terms (full and slimmed) plus the functional annotations
  go_summary:
    type: File
    outputSource: functional_analysis/go_summary
  go_summary_slim:
    type: File
    outputSource: functional_analysis/go_summary_slim
  functional_annotations:
    type: File
    outputSource: functional_analysis/functional_annotations

  # Taxonomic visualisation and OTU count tables
  ssu_otu_visualization:
    type: File
    outputSource: visualize_otu_counts/otu_visualization
  ssu_otu_counts_hdf5:
    type: File
    outputSource: convert_otu_counts_to_hdf5/result
  ssu_otu_counts_json:
    type: File
    outputSource: convert_otu_counts_to_json/result

  # TODO - repeat taxonomy for LSU

  # Non-coding RNA analysis
  other_ncRNAs:
    type: File
    outputSource: find_other_ncRNAs/matches

  # TODO - Extract these into a single file
  # TODO - check all the outputs

  # Sequence categorisation
  # Global summary files
  match_count:
    type: int
    outputSource: ipr_stats/match_count
  CDS_with_match_count:
    type: int
    outputSource: ipr_stats/CDS_with_match_count
  reads_with_match_count:
    type: int
    outputSource: ipr_stats/reads_with_match_count
  stats_reads:
    type: File
    outputSource: ipr_stats/reads
  numberReadsWithOrf:
    type: int
    outputSource: orf_stats/numberReadsWithOrf
  numberOrfs:
    type: int
    outputSource: orf_stats/numberOrfs
  readsWithOrf:
    type: File
    outputSource: orf_stats/readsWithOrf
  interproscan:
    type: File
    outputSource: categorisation/interproscan
  no_functions_seqs:
    type: File
    outputSource: categorisation/no_functions_seqs
  pCDS_seqs:
    type: File
    outputSource: categorisation/pCDS_seqs

steps:
  # Sequence QC statistics
  sequence_stats:
    run: ../tools/qc-stats.cwl
    in:
      QCed_reads: input_sequences
    out:
      - summary_out
      - seq_length_pcbin
      - seq_length_bin
      - seq_length_out
      - nucleotide_distribution_out
      - gc_sum_pcbin
      - gc_sum_bin
      - gc_sum_out

  # Ribosomal ncRNA identification
  find_ribosomal_ncRNAs:
    run: cmsearch-multimodel.cwl
    in:
      query_sequences: input_sequences
      covariance_models: ncRNA_ribosomal_models
      clan_info: ncRNA_ribosomal_model_clans
    out:
      - matches

  # Index the input once; the SSU, LSU and 5S fetch steps all read from it.
  index_reads:
    run: ../tools/esl-sfetch-index.cwl
    in:
      sequences: input_sequences
    out:
      - sequences_with_index

  # SSU classification
  get_SSU_coords:
    run: ../tools/SSU-from-tablehits.cwl
    in:
      table_hits: find_ribosomal_ncRNAs/matches
    out:
      - SSU_coordinates

  extract_SSUs:
    run: ../tools/esl-sfetch-manyseqs.cwl
    in:
      indexed_sequences: index_reads/sequences_with_index
      names: get_SSU_coords/SSU_coordinates
      names_contain_subseq_coords:
        default: true
    out:
      - sequences

  classify_SSUs:
    run: ../tools/mapseq.cwl
    in:
      sequences: extract_SSUs/sequences
      database: mapseq_ref
      taxonomy: mapseq_taxonomy
    out:
      - classifications

  # LSU extraction (classification is still a TODO, see outputs)
  get_LSU_coords:
    run: ../tools/LSU-from-tablehits.cwl
    in:
      table_hits: find_ribosomal_ncRNAs/matches
    out:
      - LSU_coordinates

  extract_LSUs:
    run: ../tools/esl-sfetch-manyseqs.cwl
    in:
      indexed_sequences: index_reads/sequences_with_index
      names: get_LSU_coords/LSU_coordinates
      names_contain_subseq_coords:
        default: true
    out:
      - sequences

  # Visualisation of the taxonomic classification
  convert_classifications_to_otu_counts:
    run: ../tools/mapseq2biom.cwl
    in:
      otu_table: mapseq_taxonomy
      label: sequencing_run_id
      query: classify_SSUs/classifications
    out:
      - otu_counts
      - krona_otu_counts

  visualize_otu_counts:
    run: ../tools/krona.cwl
    in:
      otu_counts: convert_classifications_to_otu_counts/krona_otu_counts
    out:
      - otu_visualization

  # The same OTU counts are converted twice: once to HDF5, once to JSON.
  convert_otu_counts_to_hdf5:
    run: ../tools/biom-convert.cwl
    in:
      biom: convert_classifications_to_otu_counts/otu_counts
      hdf5:
        default: true
      table_type:
        default: OTU table
    out:
      - result

  convert_otu_counts_to_json:
    run: ../tools/biom-convert.cwl
    in:
      biom: convert_classifications_to_otu_counts/otu_counts
      json:
        default: true
      table_type:
        default: OTU table
    out:
      - result

  # 5S extraction
  get_5S_coords:
    run: ../tools/5S-from-tablehits.cwl
    in:
      table_hits: find_ribosomal_ncRNAs/matches
    out:
      - 5S_coordinates

  extract_5Ss:
    run: ../tools/esl-sfetch-manyseqs.cwl
    in:
      indexed_sequences: index_reads/sequences_with_index
      names: get_5S_coords/5S_coordinates
      names_contain_subseq_coords:
        default: true
    out:
      - sequences

  # Find other ubiquitous ncRNAs
  find_other_ncRNAs:
    run: cmsearch-multimodel.cwl
    in:
      query_sequences: input_sequences
      covariance_models: ncRNA_other_models
      clan_info: ncRNA_other_model_clans
    out:
      - matches

  # TODO - need to extract ncRNA sequences
  # TODO - need to think about summary file for ncRNAs
  # TODO - Extract tRNAs and then run them through tRNAScan-se
  # TODO - Longer term ITS1 identification
  # TODO - Remove ORFs that overlap with ncRNA predictions >4 bp

  # Protein identification and tidying up
  ORF_prediction:
    run: orf_prediction.cwl
    in:
      sequence: input_sequences
      completeSeq:
        default: false
      model: fraggenescan_model
    out:
      - predicted_CDS_aa

  # Replaces every '*' in the predicted CDS with 'X' before functional analysis.
  remove_asterisks_and_reformat:
    run: ../tools/esl-reformat.cwl
    in:
      sequences: ORF_prediction/predicted_CDS_aa
      replace:
        default:
          find: '*'
          replace: X
    out:
      - reformatted_sequences

  # Can we go full fat InterPro in the future?
  functional_analysis:
    doc: |
      Matches are generated against predicted CDS, using a sub set of databases
      (Pfam, TIGRFAM, PRINTS, PROSITE patterns, Gene3d) from InterPro.
    run: functional_analysis.cwl
    in:
      predicted_CDS: remove_asterisks_and_reformat/reformatted_sequences
      go_summary_config: go_summary_config
    out:
      - functional_annotations
      - go_summary
      - go_summary_slim

  # Sequence categorisation & summary steps.
  # NOTE(review): id_list is emitted here but nothing in this workflow
  # consumes it - confirm whether it is still needed.
  ipr_stats:
    run: ../tools/ipr_stats.cwl
    in:
      iprscan: functional_analysis/functional_annotations
    out:
      - match_count
      - CDS_with_match_count
      - reads_with_match_count
      - reads
      - id_list

  # Statistics are taken from the raw predicted CDS, not the reformatted ones.
  orf_stats:
    run: ../tools/orf_stats.cwl
    in:
      orfs: ORF_prediction/predicted_CDS_aa
    out:
      - numberReadsWithOrf
      - numberOrfs
      - readsWithOrf

  # NOTE(review): seqs is wired to the extracted SSU sequences, while both id
  # sets derive from ORFs predicted on input_sequences. Presumably seqs should
  # be input_sequences - verify against create_categorisations.cwl.
  categorisation:
    run: ../tools/create_categorisations.cwl
    in:
      seqs: extract_SSUs/sequences
      ipr_idset: ipr_stats/reads
      cds_idset: orf_stats/readsWithOrf
    out:
      - interproscan
      - pCDS_seqs
      - no_functions_seqs

$namespaces:
  edam: http://edamontology.org/
  s: http://schema.org/
$schemas:
  - http://edamontology.org/EDAM_1.16.owl
  - https://schema.org/docs/schema_org_rdfa.html

s:license: "https://www.apache.org/licenses/LICENSE-2.0"
s:copyrightHolder: "EMBL - European Bioinformatics Institute"