#!/usr/bin/env cwl-runner label: "Bacterial Annotation, pass 2, blastp-based functional annotation (first pass)" cwlVersion: v1.2 class: Workflow requirements: - class: SubworkflowFeatureRequirement - class: MultipleInputFeatureRequirement inputs: scatter_gather_nchunks: type: string # This LDS2 resource needs to be fixed by removing absolute path from files lds2: label: "Extract ORF Proteins/lds2" type: File # therefore it should always come with original ASN.1 file with seq-entries proteins: label: "Extract ORF Proteins/proteins" type: File prot_ids_A: label: "Extract ORF Proteins/prot_ids" type: File prot_ids_B1: label: "Get off-frame ORFs/prot_ids" type: File prot_ids_B2: label: "AntiFam tainted proteins I/oseqids" type: File blastdb: label: "Input blastdb databases" type: string[] blast_rules_db: label: "parameter to store the literal 'blast_rules_db'" type: string default: blast_rules_db identification_db_dir: label: "Create identification BLASTdb" type: Directory annotation: label: "Get ORFs/outseq" type: File raw_seqs: label: "Prepare_Unannotated_Sequences/sequences" type: File plasmids: label: "Prepare_Unannotated_Sequences/plasmids" type: File sequence_cache: type: Directory # cluster_blastp_wnode_output: # shortcut to bypass cluster_blastp # type: Directory unicoll_cache: type: Directory taxid: type: int blast_hits_cache: type: File? taxon_db: type: File genus_list: type: int[] outputs: aligns: label: "goes to protein_alignment/Seed Search Compartments/compartments" type: File outputSource: Map_Naming_Hits/aligns steps: Remove_off_frame_ORFs: label: "Remove off-frame ORFs" run: ../progs/set_operation.cwl # validated in: A: source: [prot_ids_A] linkMerge: merge_flattened B: source: [prot_ids_B1, prot_ids_B2] linkMerge: merge_flattened operation: default: '-' # subracts B from A out: [output] # does not go out Separate_Plasmid_ORFs: label: "Separate Plasmid ORFs" run: ../progs/separate_plasmid_orfs.cwl in: orf_ids: Remove_off_frame_ORFs/output orf_entries: annotation raw_seqs: raw_seqs # Prepare_Unannotated_Sequences/sequences plasmids: plasmids out: [chomosome_orfs, plasmid_orfs] Find_Naming_Protein_Hits_for_struct: # this is used now for chromosomes only label: "Find Naming Protein Hits for struct" run: ../task_types/tt_blastp_wnode_naming.cwl in: scatter_gather_nchunks: scatter_gather_nchunks ids: source: [Separate_Plasmid_ORFs/chomosome_orfs] linkMerge: merge_flattened lds2: lds2 proteins: proteins blastdb_dir: identification_db_dir blastdb: source: #will this fly? - blastdb # Get_Proteins/selected_blastdb - blast_rules_db linkMerge: merge_flattened # cluster_blastp_wnode_output: cluster_blastp_wnode_output # shortcut affinity: default: subject asn_cache: [sequence_cache, unicoll_cache] max_batch_length: default: 10000 nogenbank: default: true align_filter: default: 'score>0 && pct_identity_gapopen_only > 35' allow_intersection: default: false comp_based_stats: default: F compart: default: false dbsize: default: '6000000000' evalue: default: 0.1 extra_coverage: default: 20 max_jobs: default: 1 max_target_seqs: default: 50 no_merge: default: true ofmt: default: asn-binary seg: default: '30 2.2 2.5' soft_masking: default: 'yes' threshold: default: 21 top_by_score: default: 10 word_size: default: 6 taxid: taxid genus_list: genus_list blast_hits_cache: source: blast_hits_cache blast_type: default: 'orf' taxon_db: taxon_db out: [blast_align] Find_Struct_Protein_Hits_for_Plasmids: label: "Find Struct Protein Hits for Plasmids" # i.e. ORFs run: ../task_types/tt_blastp_wnode_struct.cwl in: scatter_gather_nchunks: scatter_gather_nchunks ids: source: [Separate_Plasmid_ORFs/plasmid_orfs] linkMerge: merge_flattened lds2: lds2 proteins: proteins blastdb_dir: identification_db_dir blastdb: # whole naming databases is used for plasmids default: [blastdb, blast_rules_db] # cluster_blastp_wnode_output: cluster_blastp_wnode_output # shortcut affinity: default: subject asn_cache: [sequence_cache, unicoll_cache] max_batch_length: default: 10000 nogenbank: default: true align_filter: default: 'score>0 && pct_identity_gapopen_only > 35' allow_intersection: default: false comp_based_stats: default: F compart: default: false dbsize: default: '6000000000' evalue: default: 0.1 # Find_Struct_Protein_Hits_for_Plasmids extra_coverage: default: 29 max_jobs: default: 1 max_target_seqs: default: 50 no_merge: default: true ofmt: default: asn-binary seg: default: '30 2.2 2.5' soft_masking: default: 'yes' threshold: default: 21 top_by_score: default: 10 word_size: default: 6 taxid: taxid genus_list: genus_list blast_hits_cache: source: blast_hits_cache blast_type: default: 'orf' taxon_db: taxon_db out: [blast_align] Map_Naming_Hits: label: "Map Naming Hits" run: bacterial_hit_mapping.cwl in: hmm_hits: source: - Find_Naming_Protein_Hits_for_struct/blast_align - Find_Struct_Protein_Hits_for_Plasmids/blast_align linkMerge: merge_flattened sequences: annotation align_fmt: default: seq-align-set asn_cache: [sequence_cache, unicoll_cache] expansion_ratio: default: 1.1 nogenbank: default: true no_compart: default: false outfile: default: "mapped-aligns.asn" # bogus because requirements from this are imported down proteins: proteins out: [aligns]