#!/usr/bin/env cwl-runner
label: "Bacterial Annotation, pass 1, genemark training, by HMMs (first pass)"
cwlVersion: v1.0
class: Workflow

requirements:
  - class: SubworkflowFeatureRequirement
  - class: MultipleInputFeatureRequirement

inputs:
  asn_cache: Directory
  inseq: File
  hmm_path: Directory
  hmms_tab: File
  uniColl_cache: Directory
  trna_annots: File
  ncrna_annots: File
  nogenbank: boolean
  Execute_CRISPRs_annots: File
  Generate_16S_rRNA_Annotation_annotation: File
  Generate_23S_rRNA_Annotation_annotation: File
  Post_process_CMsearch_annotations_annots_5S: File
  thresholds: File
  genemark_path: Directory
  # Cached computational steps
  #hmm_hits: File
  scatter_gather_nchunks: string
outputs:
  outseqs:
    type: File
    outputSource: Get_ORFs/outseqs
  aligns: 
    type: File
    outputSource: Map_HMM_Hits/aligns
  hmm_hits: 
    type: File
    outputSource: Search_All_HMMs_I/hmm_hits
  proteins:
    type: File
    outputSource: Extract_ORF_Proteins/proteins
  lds2:
    type: File
    outputSource: Extract_ORF_Proteins/lds2
  seqids:
    type: File
    outputSource: Extract_ORF_Proteins/seqids
  prot_ids:
    type: File
    outputSource: Get_off_frame_ORFs/prot_ids
  protein_aligns: 
    type: File
    outputSource: Resolve_Annotation_Conflicts/protein_aligns
  annotation: 
    type: File
    outputSource: Resolve_Annotation_Conflicts/annotation
  out_hmm_params: 
    type: File
    outputSource: Run_GeneMark_Training/out_hmm_params
steps:
  Get_ORFs:
    run: ../progs/gp_getorf.cwl
    in:
      asn_cache: asn_cache
      input: inseq
    out: [outseqs]

  Extract_ORF_Proteins:
    run: ../progs/protein_extract.cwl
    in:
      input: Get_ORFs/outseqs
      nogenbank: nogenbank
    out: [proteins, lds2, seqids]

  # Skipped due to compute cost, for now
  Search_All_HMMs_I:
    label: "Search All HMMs I"
    run: ../task_types/tt_hmmsearch_wnode.cwl
    in:
      proteins: Extract_ORF_Proteins/proteins
      hmm_path: hmm_path
      seqids: Extract_ORF_Proteins/seqids
      lds2: Extract_ORF_Proteins/lds2
      hmms_tab: hmms_tab
      asn_cache: asn_cache
      scatter_gather_nchunks: scatter_gather_nchunks
    out:
      [hmm_hits]
      #[hmm_hits, jobs, workdir]

  Map_HMM_Hits:
    run: bacterial_hit_mapping.cwl
    in:
      seq_cache: asn_cache
      unicoll_cache: uniColl_cache
      asn_cache: [asn_cache, uniColl_cache]
      # hmm_hits: hmm_hits # Should be from hmmsearch
      hmm_hits: Search_All_HMMs_I/hmm_hits
      sequences: Get_ORFs/outseqs
      ### this guys below not tested yet
      align_fmt: 
         default: seq-align
      expansion_ratio:
         default: 0.0
      no_compart:
         default: true
      nogenbank:
         default: true
    out: [aligns]

  Get_off_frame_ORFs:
    run: get_off_frame_orfs.cwl
    label: "Get_off_frame_ORFs task node"
    in:
      aligns: Map_HMM_Hits/aligns
      seq_entries: Get_ORFs/outseqs
    out: [prot_ids]
  Resolve_Annotation_Conflicts:
    label: "Resolve Annotation Conflicts"
    run: ../progs/bacterial_resolve_conflicts.cwl
    in:
        features: # all external to this node
            - Execute_CRISPRs_annots # Execute CRISPR/annots
            - ncrna_annots # Post-process CMsearch Rfam annotations/annots
            - Generate_16S_rRNA_Annotation_annotation # Generate 16S rRNA Annotation/annotation
            - Generate_23S_rRNA_Annotation_annotation # Generate 23S rRNA Annotation/annotation
            - Post_process_CMsearch_annotations_annots_5S # Post-process CMsearch annotations/annots
            - trna_annots # Run tRNAScan/annot
        # input_annotation: mft not used
        # prot: mft not used
        # mapped-regions: mft not used
        thr: thresholds
        asn_cache: 
            source: [asn_cache]
            linkMerge: merge_flattened
    out: 
        - protein_aligns
        - annotation
 
  Run_GeneMark_Training:
    label: "Run GeneMark Training, genemark"
    run: ../progs/genemark_training.cwl
    in:
        asn_cache: 
            source: [asn_cache, uniColl_cache]
            linkMerge: merge_flattened
        sequences: inseq
        genemark_path: genemark_path # ${GP_HOME}/third-party/GeneMark 
        marked_annotation_name:
                default: marked-annotation.asn
        min_seq_len:
            default: 200
        preliminary_models_name: # -out
            default: preliminary-models.asn
        thr:  thresholds
        tmp_dir_name: 
            default: workdir  
            # type: Directory
        nogenbank: 
            default: true
    out: [out_hmm_params] # export