#!/usr/bin/env cwl-runner
# Workflow-level metadata for the pass-4 (second blastp pass) annotation workflow.
cwlVersion: v1.0
class: Workflow
label: 'Bacterial Annotation, pass 4, blastp-based functional annotation (second pass)'
requirements:
    # Permits a step's `run` target to itself be a Workflow.
    # NOTE(review): whether any referenced .cwl file is a Workflow cannot be
    # seen from this file - confirm before removing.
    - class: SubworkflowFeatureRequirement
    # Needed because several step inputs below list more than one `source`
    # (optionally combined with `linkMerge`).
    - class: MultipleInputFeatureRequirement

# Workflow inputs. Each comment records the note from the original author (if
# any) and which step(s) in this file consume the value.
inputs:
    # Forwarded unchanged to Find_Naming_Protein_Hits.
    scatter_gather_nchunks:
        type: string
    # Extract Model Proteins/lds2
    lds2:
        type: File
    # Extract Model Proteins/proteins
    proteins:
        type: File
    # Good, AntiFam filtered annotations; the `input` of Add_Names_to_Proteins.
    annotation:
        type: File
    # Fed to Find_Naming_Protein_Hits as `ids`.
    Good_AntiFam_filtered_proteins_gilist:
        type: File
    # The two caches below are passed together as `asn_cache` to several steps.
    sequence_cache:
        type: Directory
    uniColl_cache:
        type: Directory
    # NamingDatabase; `namedb_dir` of Assign_Clusters_to_Proteins.
    naming_blast_db:
        type: Directory
    # See bacterial_annot_pass3; passed to steps as `unicoll_sqlite`.
    naming_sqlite:
        type: File
    # XML assignments
    hmm_assignments:
        type: File
    # XML assignments
    wp_assignments:
        type: File
    # From pass 3; the `input` of Prepare_SPARCLBL_input.
    Extract_Model_Proteins_prot_ids:
        type: File
    # ${GP_HOME}/third-party/data/CDD/cdd - ; `b` of the sparclbl step.
    CDDdata:
        type: Directory
    # ${GP_HOME}/third-party/data/cdd_add ; `d` of the sparclbl step.
    CDDdata2:
        type: Directory
    # `thr` of Bacterial_Annot_Filter.
    thresholds:
        type: File
    # defline_cleanup_rules: ${GP_HOME}/etc/product_rules.prt
    defline_cleanup_rules:
        type: File
    # blast_rules_db_dir and identification_db_dir are merged into `blastdb_dir`
    # of Find_Naming_Protein_Hits.
    blast_rules_db_dir:
        type: Directory
    # NOTE(review): no step in this file uses this input as a `source`; kept
    # because it is part of the workflow's interface - confirm whether it
    # should be wired into Find_Naming_Protein_Hits/blastdb.
    blast_rules_db:
        type: string
    identification_db_dir:
        type: Directory
    # cached for intermediate testing
    # cached_Find_Naming_Protein_Hits:
    #    type: File
    # The remaining four inputs are all consumed by Find_Naming_Protein_Hits.
    taxid:
        type: int
    # Optional (note the trailing `?`).
    blast_hits_cache:
        type: File?
    taxon_db:
        type: File
    genus_list:
        type: int[]
steps:
    # First step of the chain: runs tt_blastp_wnode_naming.cwl. Its single
    # output, blast_align, is consumed by Find_best_protein_hits and is not a
    # workflow output.
    Find_Naming_Protein_Hits:
        label: "Find Naming Protein Hits"
        run: ../task_types/tt_blastp_wnode_naming.cwl
        in:
            # --- values wired from workflow inputs ---
            scatter_gather_nchunks: scatter_gather_nchunks
            ids:
                source:
                    - Good_AntiFam_filtered_proteins_gilist
                linkMerge: merge_flattened
            lds2: lds2
            proteins: proteins
            # production
            blastdb_dir:
                source:
                    - blast_rules_db_dir
                    - identification_db_dir
                linkMerge: merge_flattened
            asn_cache:
                - sequence_cache
                - uniColl_cache
            taxid: taxid
            genus_list: genus_list
            blast_hits_cache:
                source: blast_hits_cache
            taxon_db: taxon_db
            # cluster_blastp_wnode_output: cluster_blastp_wnode_output # shortcut

            # --- literal parameters ---
            # These are literal strings, not references to workflow inputs.
            # NOTE(review): the workflow input `blast_rules_db` is not used
            # here - confirm the literal name is intended.
            blastdb:
                default:
                    - blastdb
                    - blast_rules_db
            affinity:
                default: subject
            align_filter:
                default: 'score>0 && pct_identity_gapopen_only > 35'
            allow_intersection:
                default: true
            # batch-size:
            #    default: 1
            blast_type:
                default: 'predicted-protein'
            comp_based_stats:
                default: 'F'
            compart:
                default: true
            # Quoted: passed as a string, not an integer.
            dbsize:
                default: '6000000000'
            delay:
                default: 0
            evalue:
                default: 0.1
            # extra_coverage:  # application default
            max_batch_length:
                default: 10000
            max_jobs:
                default: 1
            max_target_seqs:
                default: 50
            no_merge:
                default: false
            nogenbank:
                default: true
            ofmt:
                default: asn-binary
            seg:
                default: '30 2.2 2.5'
            # Quoted so that it stays the string "yes" rather than a boolean.
            soft_masking:
                default: 'yes'
            threshold:
                default: 21
            top_by_score:
                default: 10
            word_size:
                default: 6
        out:
            - blast_align # does not go out
    # Runs align_filter.cwl over the alignments produced by
    # Find_Naming_Protein_Hits; output `o` feeds Assign_Clusters_to_Proteins_sort.
    Find_best_protein_hits:
        label: "Find best protein hits"
        run: ../progs/align_filter.cwl
        in:
            # input: cached_Find_Naming_Protein_Hits # for shortcuts
            input: Find_Naming_Protein_Hits/blast_align
            asn_cache:
                - sequence_cache
                - uniColl_cache
            # Literal filter expression handed to the tool as-is.
            filter:
                default: 'subject_coverage >= 10'
            ifmt:
                default: seq-align-set
            nogenbank:
                default: true
        out:
            - o
    # Runs align_sort.cwl on the filtered hits; `output` becomes the `hits`
    # input of Assign_Clusters_to_Proteins.
    Assign_Clusters_to_Proteins_sort:
        label: "Assign Clusters to Proteins"
        run: ../progs/align_sort.cwl
        in:
            input: Find_best_protein_hits/o
            ifmt:
                default: seq-align-set
            # Comma-separated key list, passed as one string.
            k:
                default: 'query,subject,-score,-num_ident,query_align_len,subject_align_len,query_start,subject_start'
            limit_mem:
                default: '13G'
            nogenbank:
                default: true
            # internal: tmp
        out:
            - output
    # Runs assign_cluster.cwl on the sorted hits. The resulting
    # protein_assignments (XML) are consumed by Prepare_SPARCLBL_input and
    # Add_Names_to_Proteins; they are not a workflow output.
    Assign_Clusters_to_Proteins:
        label: "Assign Clusters to Proteins"
        run: ../progs/assign_cluster.cwl
        in:
            asn_cache: [sequence_cache, uniColl_cache]
            lds2: lds2
            proteins: proteins
            comp_based_stats:
                default: F
            cutoff:
                default: 0.5
            hfmt:
                default: seq-align
            hits: Assign_Clusters_to_Proteins_sort/output
            margin:
                default: 0.05
            namedb_dir: naming_blast_db # NamingDatabase
            namedb:
                default: blastdb
            # Quoted on purpose: a bare `no` is a boolean false under YAML 1.1
            # loaders, whereas the string "no" is what is meant here (compare
            # the quoted 'yes' used for soft_masking in Find_Naming_Protein_Hits).
            seg:
                default: 'no'
            sure_cutoff:
                default: 0.15
            task:
                default: blastp
            threshold:
                default: 21
            unicoll_sqlite: naming_sqlite
            word_size:
                default: 6
            nogenbank:
                default: true
        out: [protein_assignments] # XML format; does not go out of the workflow
    # Runs prepare_sparclbl_input.cwl. Neither output leaves the workflow:
    # prot_ids goes to the gp_fetch_sequences step and precedences to the
    # sparclbl step.
    Prepare_SPARCLBL_input:
        label: "Prepare SPARCLBL input"
        run: ../progs/prepare_sparclbl_input.cwl
        in:
            # pass 3
            input: Extract_Model_Proteins_prot_ids
            # Three assignment sources flattened into a single list.
            other_assignments:
                source:
                    - Assign_Clusters_to_Proteins/protein_assignments
                    - hmm_assignments
                    - wp_assignments
                linkMerge: merge_flattened
            unicoll_sqlite: naming_sqlite
        out:
            - prot_ids
            - precedences
    # First of three steps sharing the "Assign SPARCL Architecture Names to
    # Proteins" label: runs gp_fetch_sequences.cwl on the prot_ids prepared by
    # Prepare_SPARCLBL_input; out_proteins feeds the asn2fasta step.
    Assign_SPARCL_Architecture_Names_to_Proteins_gp_fetch_sequences:
        label: "Assign SPARCL Architecture Names to Proteins"
        run: ../progs/gp_fetch_sequences.cwl
        in:
            input: Prepare_SPARCLBL_input/prot_ids
            lds2: lds2
            proteins: proteins
            # Disabled: it is unclear why PGAP has this.
            # asn_cache: [full_id_cache]
            #    linkMerge: merge_flattened
        out:
            - out_proteins
    # Second SPARCL step: runs asn2fasta.cwl on the fetched proteins; the
    # `fasta` output is the `s` input of the sparclbl step.
    Assign_SPARCL_Architecture_Names_to_Proteins_asn2fasta:
        label: "Assign SPARCL Architecture Names to Proteins"
        run: ../progs/asn2fasta.cwl
        in:
            i: Assign_SPARCL_Architecture_Names_to_Proteins_gp_fetch_sequences/out_proteins
            # Literal settings for the tool.
            fasta_name:
                default: proteins.fa
            prots_only:
                default: true
            serial:
                default: binary
            type:
                default: seq-entry
        out:
            - fasta
    # Third SPARCL step: runs sparclbl.cwl. Its protein_assignments are
    # consumed by Add_Names_to_Proteins and do not leave the workflow.
    Assign_SPARCL_Architecture_Names_to_Proteins_sparclbl:
        label: "Assign SPARCL Architecture Names to Proteins"
        run: ../progs/sparclbl.cwl
        in:
            # Data wired from earlier steps and workflow inputs.
            s: Assign_SPARCL_Architecture_Names_to_Proteins_asn2fasta/fasta
            p: Prepare_SPARCLBL_input/precedences
            b: CDDdata
            d: CDDdata2
            # number_of_blast_processes
            m:
                default: 20
            # max_files_per_proc
            n:
                default: 500
            x:
                default: 1
        out:
            - protein_assignments
    # Runs add_prot_names_to_annot.cwl over the filtered annotation together
    # with all four assignment sources; out_annotation feeds
    # Bacterial_Annot_Filter.
    Add_Names_to_Proteins:
        label: "Add Names to Proteins"
        run: ../progs/add_prot_names_to_annot.cwl
        in:
            # Good, AntiFam filtered annotations
            input: annotation
            proteins:
                - Assign_Clusters_to_Proteins/protein_assignments
                - Assign_SPARCL_Architecture_Names_to_Proteins_sparclbl/protein_assignments
                - hmm_assignments
                - wp_assignments
            # let's ditch full_id_cache for now
            # asn_cache: [sequence_cache, full_id_cache]
            asn_cache:
                source:
                    - sequence_cache
                linkMerge: merge_flattened
            # ${GP_HOME}/etc/product_rules.prt
            defline_cleanup_rules: defline_cleanup_rules
            unicoll_sqlite: naming_sqlite
            nogenbank:
                default: true
            submission_mode_genbank:
                default: true
        out:
            - out_annotation
    # Last step: runs bact_annot_filter.cwl on the named annotation. Its
    # out_annotation is the workflow's only output.
    Bacterial_Annot_Filter:
        label: "Bacterial Annot Filter"
        run: ../progs/bact_annot_filter.cwl
        in:
            abs_short_model_limit:
                default: 60
            asn_cache:
                source: [sequence_cache]
                linkMerge: merge_flattened
            input:
                source: [Add_Names_to_Proteins/out_annotation]
                linkMerge: merge_flattened
            long_model_limit:
                default: 1000000 # 1,000,000
            max_overlap:
                default: 120
            max_unannotated_region:
                default: 5000
            short_model_limit:
                default: 180
            thr: thresholds
            # Previously misspelled `nogebank`, so the value was bound to a
            # step input that matches no `nogenbank` parameter; spelled here
            # the same way as in every other step of this workflow.
            nogenbank:
                default: true
        out:
            - out_annotation # this goes out
            # - good_proteins # internal to taxcheck
    # this is later.
    # WP_Tax_Check:
    #     label: "WP Tax Check"
    #    run: ../progs/wp_taxcheck.cwl    
    #    in:
    #        __input__: Bacterial_Annot_Filter/good_proteins
            
# The workflow exposes a single result: the annotation produced by the final
# Bacterial_Annot_Filter step.
outputs:
    out_annotation:
        outputSource: Bacterial_Annot_Filter/out_annotation
        type: File