#!/usr/bin/env cwl-runner
label: "Bacterial Annotation, pass 2, blastp-based functional annotation (first pass)"
cwlVersion: v1.0
class: Workflow
requirements:
    - class: SubworkflowFeatureRequirement
    - class: MultipleInputFeatureRequirement

inputs:
    scatter_gather_nchunks: 
        type: string
    # This LDS2 resource needs to be fixed by removing absolute path from files
    lds2: 
        label: "Extract ORF Proteins/lds2"
        type: File
    # therefore it should always come with original ASN.1 file with seq-entries
    proteins: 
        label: "Extract ORF Proteins/proteins"
        type: File
    prot_ids_A: 
        label: "Extract ORF Proteins/prot_ids"
        type: File
    prot_ids_B1: 
        label: "Get off-frame ORFs/prot_ids"
        type: File
    prot_ids_B2: 
        label: "AntiFam tainted proteins I/oseqids"
        type: File
    blast_rules_db_dir: 
        label: "Get BLAST Rules db const"
        type: Directory
    blast_rules_db: 
        label: "Name of blast_rules_db"
        type: string
    identification_db_dir:
        label: "Create identification BLASTdb"
        type: Directory
    annotation:
        label: "Get ORFs/outseq"
        type: File
    sequence_cache: 
        type: Directory
    # cluster_blastp_wnode_output: # shortcut to bypass cluster_blastp
    #    type: Directory
    unicoll_cache: 
        type: Directory
    taxid:
      type: int
    blast_hits_cache: 
      type: File?
    taxon_db: 
      type: File
    genus_list: 
      type: int[]
outputs:
    aligns: 
        label: "goes to protein_alignment/Seed Search Compartments/compartments"
        type: File
        outputSource: Map_Naming_Hits/aligns
        
steps:
    Remove_off_frame_ORFs: 
        label: "Remove off-frame ORFs"
        run: ../progs/set_operation.cwl  # validated
        in:
            A: 
                source: [prot_ids_A]
                linkMerge: merge_flattened
            B: 
                source: [prot_ids_B1, prot_ids_B2]
                linkMerge: merge_flattened
            operation:
                default: '-' # subracts B from A
        out: [output] # does not go out
        
    Find_Naming_Protein_Hits_I: # 30 minutes
        label: "Find Naming Protein Hits I"
        run: ../task_types/tt_blastp_wnode_naming.cwl
        in:
            scatter_gather_nchunks: scatter_gather_nchunks
            ids: 
                source: [Remove_off_frame_ORFs/output]
                linkMerge: merge_flattened
            lds2: lds2
            proteins: proteins
            blastdb_dir: 
                # source: [blast_rules_db_dir] # test only: for testing InitialWorkDirRequirement for Directory[] case
                source: [blast_rules_db_dir, identification_db_dir] # production
                linkMerge: merge_flattened
            blastdb:
                default: [blastdb, blast_rules_db]
            # cluster_blastp_wnode_output: cluster_blastp_wnode_output # shortcut
            affinity: 
                default: subject
            asn_cache: [sequence_cache, unicoll_cache]
            max_batch_length: 
                default: 10000
            nogenbank: 
                default: true
            align_filter: 
                default: 'score>0 && pct_identity_gapopen_only > 35' 
            allow_intersection: 
                default: false
            comp_based_stats:   
                default: F
            compart: 
                default: false
            dbsize: 
                default: '6000000000'
            evalue: 
                default: 0.1
            extra_coverage: 
                default: 20
            max_jobs: 
                default: 1
            max_target_seqs: 
                default: 50
            no_merge: 
                default: true
            ofmt: 
                default: asn-binary
            seg: 
                default:  '30 2.2 2.5'
            soft_masking:
                default: 'yes'
            threshold: 
                default: 21
            top_by_score: 
                default: 10
            word_size: 
                default: 6
            taxid: taxid
            genus_list: genus_list
            blast_hits_cache: 
              source: blast_hits_cache
            blast_type:
              default: 'orf'
            taxon_db: taxon_db
        out: [blast_align] 
    Map_Naming_Hits: 
        label: "Map Naming Hits"
        run: ../bacterial_annot/bacterial_hit_mapping.cwl
        in:
            hmm_hits: Find_Naming_Protein_Hits_I/blast_align
            sequences: annotation
            align_fmt:
                default: seq-align-set
            asn_cache: [sequence_cache, unicoll_cache]
            expansion_ratio:
                default: 1.1
            nogenbank:
                default: true
            no_compart:
                default: false
            # bogus because requirements from this are imported down
            proteins: proteins
        out: [aligns]