#!/usr/bin/env cwl-runner
#
# CWL v1.0 Workflow: bacterial annotation, pass 3.
#
# Data flow, as wired in `steps` below:
#   1. Extract proteins from the `models1` input and search them against the
#      AntiFam HMM library; the hits are reduced to a seq-id list that is
#      subtracted from the extracted seq-ids (the "good" ab initio proteins).
#   2. Filter `models1` down to those ids, then pick best evidence alignments
#      from `hmm_aligns` + `prot_aligns` against the combined annotations.
#   3. Run GeneMark and its post-processing step, then merge the resulting
#      models with the filtered ab initio annotations.
#   4. Extract proteins from the merged annotation, search them against the
#      naming HMMs, assign HMM names, and name by WP.
#
# Requires SubworkflowFeatureRequirement (steps under ../task_types/) and
# MultipleInputFeatureRequirement (several step inputs list multiple sources).
label: "Bacterial Annotation, pass 3, structural annotation, functional annotation: ab initio GeneMark, by WP, by HMM (second pass)"
cwlVersion: v1.0
class: Workflow

requirements:
  - class: SubworkflowFeatureRequirement
  - class: MultipleInputFeatureRequirement

inputs:
  # HMM library directory passed as `hmm_path` to the AntiFam search step.
  AntiFamLib:
    type: Directory
  uniColl_cache:
    type: Directory
  sequence_cache:
    type: Directory
  hmm_aligns:
    type: File
    label: "Map HMM Hits/align"
  prot_aligns:
    type: File
    label: "Filter Protein Alignments I/align"
  annotation:
    type: File
    label: "Resolve Annotation Conflicts/annotation"
  models1:
    type: File
    label: "Run GeneMark Training/models"
  raw_seqs:
    type: File
    label: "Prepare Unannotated Sequences/raw_seqs"
  # ${GP_HOME}/etc/thresholds.xml
  thresholds:
    type: File
  # /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/home/badrazat/local-install/2018-05-17/third-party/data/BacterialPipeline/uniColl/ver-3.2/naming.sqlite
  naming_sqlite:
    type: File
  # Run GeneMark Training/hmm_params (EXTERNAL, put to input/)
  # Optional: the trailing `?` allows this input to be omitted.
  hmm_params:
    type: File?
  # /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/home/badrazat/local-install/2018-05-17/third-party/data/BacterialPipeline/Selenoproteins/selenoproteins
  selenoproteins:
    type: Directory
  selenocysteines_db:
    type: string
    default: blastdb
  # ${GP_HOME}/third-party/data/BacterialPipeline/uniColl/ver-3.2/naming_hmms_combined.mft
  naming_hmms_combined:
    type: Directory
  hmms_tab:
    type: File
  # input/wp-hashes.sqlite
  wp_hashes:
    type: File
  # input/taxonomy.sqlite3
  taxon_db:
    type: File
  genemark_path:
    type: Directory
  # Forwarded unchanged to both tt_hmmsearch_wnode steps.
  scatter_gather_nchunks:
    type: string

steps:
  # Runs protein_extract on the `models1` input.
  Extract_ab_initio_Proteins:
    label: "Extract ab initio Proteins"
    run: ../progs/protein_extract.cwl
    in:
      input: models1
      nogenbank:
        default: true
    out: [proteins, lds2, seqids]

  # HMM search of the extracted proteins against the AntiFam library.
  Search_ab_initio_for_AntiFam:
    label: "Search ab initio for AntiFam"
    run: ../task_types/tt_hmmsearch_wnode.cwl
    in:
      # this comes always with lds2. LDS2 refers to proteins
      proteins: Extract_ab_initio_Proteins/proteins
      hmm_path: AntiFamLib
      seqids: Extract_ab_initio_Proteins/seqids
      lds2: Extract_ab_initio_Proteins/lds2
      # hmms_tab: hmms_tab  # goes eventually to -fam parameter
      # -fam is empty here
      asn_cache: sequence_cache
      scatter_gather_nchunks: scatter_gather_nchunks
    out: [hmm_hits]

  # Reduces the AntiFam hits to a seq-id list (`oseqids`).
  ab_initio_AntiFam_tainted_proteins:
    label: "ab initio AntiFam tainted proteins"
    run: ../progs/reduce.cwl
    in:
      aligns: Search_ab_initio_for_AntiFam/hmm_hits
    out: [oseqids]

  # Set difference: extracted seq-ids (A) minus AntiFam-tainted seq-ids (B).
  Good_ab_initio_proteins:
    label: "Good ab initio proteins"
    run: ../progs/set_operation.cwl
    in:
      A:
        source: [Extract_ab_initio_Proteins/seqids]
        linkMerge: merge_flattened
      B:
        source: [ab_initio_AntiFam_tainted_proteins/oseqids]
        linkMerge: merge_flattened
      operation:
        default: '-'  # subtracts B from A
    out: [output]

  # Filters `models1` to the ids produced by Good_ab_initio_proteins.
  Good_ab_initio_annotations:
    label: "Good ab initio annotations"
    run: ../progs/bact_filter_preserved.cwl
    in:
      annotation: models1
      ifmt:
        default: seq-entry
      only_those_ids: Good_ab_initio_proteins/output
      nogenbank:
        default: true
    out: [out_annotation]  # goes out -o

  Find_Best_Evidence_Alignments:
    label: "Find Best Evidence Alignments"
    run: ../progs/bact_best_evidence_alignments.cwl
    in:
      annotation: [annotation, Good_ab_initio_annotations/out_annotation]
      # ${GP_cache_dir},${GP_HOME}/third-party/data/BacterialPipeline/uniColl/ver-3.2/cache
      # type: Directory[]
      asn_cache: [uniColl_cache, sequence_cache]
      # -input-manifest
      # type: File[]
      # source:
      # linkMerge: merge_flattened
      align: [hmm_aligns, prot_aligns]
      max_overlap:
        default: 120
      output_align_name:
        default: best_aligns.asn
      start_stop_allowance:
        default: 60
      thr: thresholds
      unicoll_sqlite: naming_sqlite
      nogenbank:
        default: true
    out: [out_align]  # -o

  Run_GeneMark:
    label: "Run GeneMark"
    run: ../progs/genemark.cwl
    in:
      # so far, the whole node!
      alignments: Find_Best_Evidence_Alignments/out_align
      # Resolve Annotation Conflicts/annotation (EXTERNAL, put to input/)
      annotation: annotation
      # ${GP_cache_dir},${GP_HOME}/third-party/data/BacterialPipeline/uniColl/ver-3.2/cache
      # type: Directory[]
      asn_cache: [uniColl_cache, sequence_cache]
      # ${GP_HOME}/third-party/GeneMark
      # type: Directory
      genemark_path: genemark_path
      hmm_params: hmm_params
      marked_annotation_name:
        default: marked-annotation.asn
      min_seq_len:
        default: 200
      preliminary_models_name:  # -out
        default: preliminary-models.asn
      sequences: raw_seqs
      thr: thresholds
      # type: Directory
      tmp_dir_name:
        default: workdir
      nogenbank:
        default: true
    out: [marked_annotation, preliminary_models]  # all internal!

  # Consumes both Run_GeneMark outputs and emits the final `models`.
  Run_GeneMark_Post:
    label: "Run GeneMark (genemark_post)"
    run: ../progs/genemark_post.cwl
    in:
      abs_short_model_limit:
        default: 60
      # ${GP_cache_dir},${GP_HOME}/third-party/data/BacterialPipeline/uniColl/ver-3.2/cache
      # type: Directory[]
      asn_cache: [uniColl_cache, sequence_cache]
      genemark_annot: Run_GeneMark/preliminary_models
      max_overlap:
        default: 120
      max_unannotated_region:
        default: 5000
      models_name:  # -out
        default: models.asn
      out_product_ids_name:
        default: all-proteins.ids
      product_id_prefix:
        default: 'PGAP'
      pre_annot: Run_GeneMark/marked_annotation
      selenocysteines: selenoproteins
      selenocysteines_db: selenocysteines_db
      short_model_limit:
        default: 180
      unicoll_sqlite: naming_sqlite
      nogenbank:
        default: true
    out: [models]

  # Merges the GeneMark post-processed models with the filtered ab initio
  # annotations.
  PGAP_plus_ab_initio:
    label: "PGAP + ab initio"
    run: ../progs/bact_entries_merge.cwl
    in:
      annotation:
        source:
          - Run_GeneMark_Post/models
        linkMerge: merge_flattened
      ab_initio:
        source:
          - Good_ab_initio_annotations/out_annotation
        linkMerge: merge_flattened
    out: [out_annotation]

  # Runs protein_extract on the merged annotation.
  Extract_Model_Proteins:
    label: "Extract Model Proteins"
    run: ../progs/protein_extract.cwl
    in:
      input: PGAP_plus_ab_initio/out_annotation
      nogenbank:
        default: true
    out: [proteins, lds2, seqids]

  Search_Naming_HMMs:
    label: "Search Naming HMMs"
    run: ../task_types/tt_hmmsearch_wnode.cwl
    in:
      proteins: Extract_Model_Proteins/proteins
      hmm_path: naming_hmms_combined  # naming_hmms_combined.mft converted to Directory
      seqids: Extract_Model_Proteins/seqids
      lds2: Extract_Model_Proteins/lds2
      hmms_tab: hmms_tab  # goes eventually to -fam parameter
      asn_cache: sequence_cache
      scatter_gather_nchunks: scatter_gather_nchunks
    out: [hmm_hits]

  Assign_Naming_HMM_to_Proteins:
    label: "Assign Naming HMM to Proteins"
    run: ../progs/assign_hmm.cwl
    in:
      input: Search_Naming_HMMs/hmm_hits
      db: naming_sqlite
    out: [assignments]

  Name_by_WPs:
    label: "Name by WPs"
    run: ../progs/identify_wp.cwl
    in:
      wp_hashes: wp_hashes
      taxon_db: taxon_db
      ifmt:
        default: seq-entries
      lds2: Extract_Model_Proteins/lds2
      proteins: Extract_Model_Proteins/proteins
      sequences: PGAP_plus_ab_initio/out_annotation  # -input
      fast:
        default: true
    # -onames, there is also prot2wp, but it goes only to tax check,
    # which we dropped in the first round
    out: [out_names]

outputs:
  # long output names are preliminary.
  # after the list is complete, drop the long prefixes
  Find_Best_Evidence_Alignments_aligns:
    # sink: Generate Annotation Reports/cluster_prot_aligns (EXTERNAL, put to output/)
    # sink: Validate Annotation/cluster_best_mft (EXTERNAL, put to output/)
    label: "goes to protein_alignment/Seed Search Compartments/compartments"
    type: File
    outputSource: Find_Best_Evidence_Alignments/out_align
  Run_GeneMark_Post_models:
    type: File
    outputSource: Run_GeneMark_Post/models
  Extract_Model_Proteins_seqids:
    type: File
    outputSource: Extract_Model_Proteins/seqids
  Extract_Model_Proteins_lds2:
    type: File
    outputSource: Extract_Model_Proteins/lds2
  Extract_Model_Proteins_proteins:
    type: File
    outputSource: Extract_Model_Proteins/proteins
  Search_Naming_HMMs_hmm_hits:
    type: File
    outputSource: Search_Naming_HMMs/hmm_hits
  Assign_Naming_HMM_to_Proteins_assignments:
    type: File
    outputSource: Assign_Naming_HMM_to_Proteins/assignments
  Name_by_WPs_names:
    type: File
    outputSource: Name_by_WPs/out_names
  PGAP_plus_ab_initio_annotation:
    type: File
    outputSource: PGAP_plus_ab_initio/out_annotation