#!/usr/bin/env cwl-runner
# Document header: a CWL v1.2 Workflow labelled "bacterial_kmer".
cwlVersion: v1.2
class: Workflow
label: bacterial_kmer
requirements:
  # Every step below pulls in another CWL document through `run:`; this
  # feature is needed whenever one of those documents is itself a workflow.
  - class: SubworkflowFeatureRequirement
  # Several step inputs below combine sources with `linkMerge`.
  - class: MultipleInputFeatureRequirement
    
# Workflow inputs, written in the CWL `name: type` shorthand.
# Each comment names the step(s) in this file that consume the input.
inputs:
  # Input genome entry: Extract_Kmers_From_Input (entry).
  Extract_Kmers_From_Input___entry: File
  # GenColl ASN.1 of the target assembly: Extract_Input_GenColl_IDs,
  # Assembly_Assembly_BLASTn, Identify_Top_N_ANI.
  gencoll_asn: File
  # Extract_Kmers_From_Input, Assembly_Assembly_BLASTn, Identify_Top_N_ANI.
  asn_cache: Directory
  # Assembly_Assembly_BLASTn; also merged with asn_cache for Identify_Top_N_ANI.
  gc_seq_cache: Directory
  # Get_Top_Assemblies_GenColl_ASN, Assembly_Assembly_BLASTn.
  gc_cache: File
  # Reference kmer store: Combine_kmer_sqlite.
  kmer_cache_sqlite: File
  # Extract_Top_Assemblies, Identify_Top_N_ANI.
  ref_assembly_taxid: int
  # Extract_Top_Assemblies, Identify_Top_N_ANI.
  ref_assembly_id: int
  # Identify_Top_N_ANI.
  ANI_cutoff: File
  # Extract_Kmers_From_Input (kmer_file_list), Compare_Kmer (ref_kmer_list).
  kmer_reference_assemblies: File
  # Identify_Top_N_ANI.
  tax_synon: File
  # Extract_Top_Assemblies, Identify_Top_N_ANI.
  taxon_db: File
  # Extract_Top_Assemblies, Identify_Top_N_ANI.
  gcextract2_sqlite: File
  # XSLT stylesheet: Identify_Top_N_ANI_transform (xslt).
  ani_report_transform: File
# Workflow outputs; each one forwards a single step output.
outputs:
  # `annot` output of the Identify_Top_N_ANI step.
  Identify_Top_N_ANI_annot:
    outputSource: Identify_Top_N_ANI/annot
    type: File
  # `top` output of the Identify_Top_N_ANI step (XML; it is also the input
  # of the xsltproc and xml_evaluate steps).
  Identify_Top_N_ANI_top:
    outputSource: Identify_Top_N_ANI/top
    type: File
  # Text rendering of the `top` report produced by the xsltproc step.
  Identify_Top_N_ANI_top_txt:
    outputSource: Identify_Top_N_ANI_transform/output
    type: File
  # Tax report from the kmer-based top-assemblies extraction.
  Extract_Top_Assemblies___tax_report:
    outputSource: Extract_Top_Assemblies/tax_report
    type: File
  # Optional (`File?`): the evaluation step may not yield a file.
  errors:
    outputSource: Evaluate_ANI_report/xml_output
    type: File?
steps:
  # 
  # Internal PGAP step "Get Reference Assemblies" is omitted: it is a constant, replaced by
  #     kmer_reference_assemblies
  # Internal PGAP step "Query_Kmer_Cache":
  #     This task node in classic PGAP converts a list of gcids to a list of URIs.
  #     We do not need to do the conversion, since kmer_store is capable of recognising a list of URIs
  # Internal PGAP step "Extract_Kmer_List":
  #     This task node in classic PGAP takes the list of new reference assemblies and computes kmer hashes 
  #     We do not need to do this here, because external PGAP will be always supplied with up to date
  #     reference assemblies 
  # Internal PGAP step "Store_in_Kmer_Cache":
  #     Stores input kmer cache file (local to buildrun directory) in global storage
  #     We do not need to do this here, because external PGAP will be always supplied with up to date
  #     reference assemblies 
  # Kmer extraction for the input genome; yields a directory of kmer files.
  Extract_Kmers_From_Input:
    label: Extract Kmers From Input
    doc: |
      computes kmers for input genome (Extract_Kmers_From_Input___entry)
      produces kmer files (.gz and xml)
      it will have a new output: out_kmer_dir
    run: ../task_types/tt_kmer_seq_entry_extract_wnode.cwl
    in:
      asn_cache: asn_cache
      entry: Extract_Kmers_From_Input___entry
      # A one-element source list combined with merge_flattened, so the
      # single workflow File is handed over as a list.
      kmer_file_list:
        linkMerge: merge_flattened
        source:
          - kmer_reference_assemblies
    out:
      - out_kmer_dir
  # Packs the extracted kmer directory into a SQLite kmer cache.
  Convert_kmer_files_to_sqlite:
    label: Convert Kmer files to SQLITE
    run: ../progs/kmer_files2sqlite.cwl
    doc: |
      This new step will convert input .kmer.gz (kmer_file) and .xml (kmer_metadata_file)
      into new sqlite database
      The program we are calling here takes a directory input
    in:
      # Directory emitted by Extract_Kmers_From_Input.
      kmer_dir: Extract_Kmers_From_Input/out_kmer_dir
    out:
      - out_kmer_cache_sqlite
  # Lists the keys of the SQLite cache built from the input genome only
  # (not the combined cache); the key list feeds Compare_Kmer and
  # Compare_Kmer_Pairwise_prepare_input.
  List_sqlite:
    label: List SQLITE kmer cache contents
    doc: Produces the list of all keys in sqlite database
    run: ../progs/list_kmer_sqlite.cwl
    in:
      kmer_cache_sqlite: Convert_kmer_files_to_sqlite/out_kmer_cache_sqlite
    out:
      - keys
  # Merges the reference kmer cache (workflow input) with the cache just
  # built for the input genome; the combined cache is what the comparison
  # and top-N steps read.
  Combine_kmer_sqlite:
    label: Combine kmer SQLITE
    doc: |
      This new step will combine together reference kmer store and newly created kmer store for a new assembly
    run: ../progs/combine_kmer_sqlite.cwl
    in:
      fast:
        default: true
      # Source order is kept as authored: reference cache first, new cache second.
      kmer_cache_sqlite:
        linkMerge: merge_flattened
        source:
          - kmer_cache_sqlite
          - Convert_kmer_files_to_sqlite/out_kmer_cache_sqlite
    out:
      - combined_cache_sqlite
  # Reference-vs-input kmer comparison over the combined cache.
  Compare_Kmer:
    label: Compare Kmer
    doc: |
      compares kmers from different genomes in the input and produces distance matrix
      filled only for ref vs current elements
    run: ../task_types/tt_kmer_ref_compare_wnode.cwl
    in:
      # Constant settings (step-level defaults, not workflow inputs).
      dist_method:
        default: minhash
      minhash_signature:
        default: minhash
      score_method:
        default: boolean
      # Data bindings.
      kmer_cache_sqlite: Combine_kmer_sqlite/combined_cache_sqlite
      kmer_list: List_sqlite/keys
      ref_kmer_list: kmer_reference_assemblies
    # `outdir` is declared here, but no other step or workflow output in
    # this file reads it.
    out:
      - distances
      - outdir
  # Picks the closest reference genomes for the input genome from the
  # Compare_Kmer distances. `matches` feeds the pairwise comparison input
  # list; `top_distances` feeds Extract_Top_Assemblies.
  Identify_Top_N:
    label: Identify Top N hits by Kmer
    # Trailing whitespace inside a literal block scalar becomes part of the
    # value, so the doc lines are kept free of it.
    doc: |
      Identifies Top N hits to input genome by kmer distances
      produces distances XML and list of matching genomes (storage URIs)
    run: ../task_types/tt_kmer_top_n.cwl
    in:
      kmer_cache_sqlite: Combine_kmer_sqlite/combined_cache_sqlite
      distances: Compare_Kmer/distances
    out:
      - matches
      - top_distances
  # Builds the id list for the pairwise comparison from the input genome's
  # keys plus the top kmer matches. Presumably a plain concatenation (the
  # tool is ../progs/cat.cwl) -- confirm against that tool definition.
  Compare_Kmer_Pairwise_prepare_input:
    label: Glue SQLITE keys
    run: ../progs/cat.cwl
    in:
      output_file_name:
        default: target_and_matches.ids
      # Order as authored: input-genome keys first, matches second.
      input:
        linkMerge: merge_flattened
        source:
          - List_sqlite/keys
          - Identify_Top_N/matches
    out:
      - output
  # All-against-all comparison across the input genome and its top matches;
  # the resulting distances are consumed by Build_Kmer_Tree.
  Compare_Kmer__Pairwise_:
    label: Compare Kmer pairwise
    doc: |
      compares kmers from top hits to query genome in the input and produces distance matrix for subsequent tree: all-against-all. This is the most downstream node requiring sqlite cache
    run: ../task_types/tt_kmer_compare_wnode.cwl
    in:
      # Constant settings; same three values as in the Compare_Kmer step.
      dist_method:
        default: minhash
      minhash_signature:
        default: minhash
      score_method:
        default: boolean
      # Data bindings.
      kmer_cache_sqlite: Combine_kmer_sqlite/combined_cache_sqlite
      kmer_list: Compare_Kmer_Pairwise_prepare_input/output
    out:
      - distances
  # Turns the top kmer distances into a tax report (a workflow output) and
  # a GenColl id list (used by Get_Top_Assemblies_GenColl_ASN and as the
  # BLASTn subject list).
  Extract_Top_Assemblies:
    label: Extract Top Assemblies
    # Trailing whitespace inside a literal block scalar becomes part of the
    # value, so the doc lines are kept free of it.
    doc: |
      Takes XML file with top distances between matches and reference assembly taxid
      produces XML tax report and list of top matched assemblies
    run: ../task_types/tt_kmer_top_n_extract.cwl
    in:
      top_distances: Identify_Top_N/top_distances
      ref_assembly_taxid: ref_assembly_taxid
      ref_assembly_id: ref_assembly_id
      taxon_db: taxon_db
      gcextract2_sqlite: gcextract2_sqlite
    out:
      - tax_report
      - gc_id_list
  # Builds a tree from the pairwise distance matrix.
  # NOTE(review): within this file `tree` is neither a workflow output nor
  # an input of another step -- confirm whether that is intended.
  Build_Kmer_Tree:
    label: Build Kmer Tree
    doc: Output is BioTree ASN.1
    run: ../task_types/tt_kmer_build_tree.cwl
    in:
      distances: Compare_Kmer__Pairwise_/distances
      # Constant settings.
      no_merge:
        default: true
      skip_markup:
        default: true
      sort:
        default: leaf-count-ascending
    out:
      - tree
  # Resolves the GenColl ids of the top-matched assemblies to GenColl ASN,
  # which is then passed to the BLASTn and ANI steps as ref_gencoll_asn.
  Get_Top_Assemblies_GenColl_ASN:
    label: Get Top Assemblies GenColl ASN
    doc: Input is list of reference assemblies, not to be mixed with list of URIs
    run: ../task_types/tt_gcaccess_from_list.cwl
    in:
      gc_cache: gc_cache
      gc_id_list: Extract_Top_Assemblies/gc_id_list
    out:
      - gencoll_asn
  # Extracts ids from the target assembly's GenColl ASN.1 (workflow input
  # `gencoll_asn`); the result is the BLASTn query id list.
  Extract_Input_GenColl_IDs:
    label: Extract Input GenColl IDs
    doc: Input is input ASN.1 file for our target assembly
    run: ../progs/gc_extract_ids.cwl
    in:
      input: gencoll_asn
    out:
      - output
  # BLASTn of the target assembly (queries) against the top kmer-matched
  # assemblies (subjects); the alignments feed Identify_Top_N_ANI.
  Assembly_Assembly_BLASTn:
    label: Assembly Assembly BLASTn
    doc: This is rather standard blast
    run: ../task_types/tt_assm_assm_blastn_wnode.cwl
    in:
      # --- query / subject bindings ---
      queries_gc_id_list: Extract_Input_GenColl_IDs/output
      subjects_gc_id_list: Extract_Top_Assemblies/gc_id_list
      gencoll_asn: gencoll_asn
      # NOTE(review): the original author marked this binding with
      # "this will break here" without recording why -- confirm whether it
      # is still a concern.
      ref_gencoll_asn: Get_Top_Assemblies_GenColl_ASN/gencoll_asn

      # --- caches ---
      asn_cache: asn_cache
      gc_seq_cache: gc_seq_cache
      gc_cache: gc_cache

      # --- constant settings ---
      affinity:
        default: subject
      compart:
        default: true
      evalue:
        default: 0.0001
      gapextend:
        default: 1
      gapopen:
        default: 2
      max_bases_per_call:
        default: 500000000
      max_target_seqs:
        default: 250
      merge_align_filter:
        default: '((reciprocity = 3 AND align_length_ungap >= 5) OR align_length > 1000) AND pct_identity_gap > 25'
      merge_engine:
        default: 'tree-merger'
      # Quoted deliberately: the value is the string "true", not a YAML
      # boolean (unlike `compart` and `use_common_components`). Presumably
      # the task type expects a string here -- verify before unquoting.
      soft_masking:
        default: "true"
      task:
        default: megablast
      use_common_components:
        default: true
      window_size:
        default: 150
      word_size:
        default: 28
      workers_per_cpu:
        default: 0.4
    out:
      - blast_align
  # ANI-based top-N identification from the BLASTn alignments; `top` and
  # `annot` are both exported as workflow outputs.
  Identify_Top_N_ANI:
    label: Identify Top N ANI
    doc: identify top N ANI
    run: ../task_types/tt_ani_top_n.cwl
    in:
      # Both cache directories are supplied together as one flattened list.
      asn_cache:
        linkMerge: merge_flattened
        source:
          - asn_cache
          - gc_seq_cache
      # Alignments plus the GenColl descriptions of both sides.
      blast_align: Assembly_Assembly_BLASTn/blast_align
      gencoll_asn: gencoll_asn
      ref_gencoll_asn: Get_Top_Assemblies_GenColl_ASN/gencoll_asn
      # Straight pass-through of workflow inputs.
      ANI_cutoff: ANI_cutoff
      ref_assembly_taxid: ref_assembly_taxid
      ref_assembly_id: ref_assembly_id
      tax_synon: tax_synon
      gcextract2_sqlite: gcextract2_sqlite
      taxon_db: taxon_db
    out:
      - top
      - annot
  # Applies the `ani_report_transform` stylesheet to the ANI `top` XML via
  # xsltproc; the result is exported as Identify_Top_N_ANI_top_txt.
  Identify_Top_N_ANI_transform:
    # Label added so this step is labelled like the steps before it.
    label: Transform ANI report
    doc: transform ANI taxcheck report XML file to Genbank suitable (SMART equivalent) text report
    run: ../progs/xsltproc.cwl
    in:
      xml: Identify_Top_N_ANI/top
      xslt: ani_report_transform
      output_name:
        default: 'ani-tax-report.txt'
    out:
      - output
  # Runs xml_evaluate on the ANI `top` report; its `xml_output` becomes the
  # optional workflow output `errors`.
  Evaluate_ANI_report:
    # label/doc added: this was the only step carrying neither.
    label: Evaluate ANI report
    doc: Checks the ANI top-N report for MISASSIGNED or CONTAMINATED results (see xpath_fail)
    run: ../progs/xml_evaluate.cwl
    in:
      input: Identify_Top_N_ANI/top
      # Folded scalar (`>`): the value is the single XPath line followed by
      # one trailing newline.
      xpath_fail:
        default: >
          /tax-check/results[@status='MISASSIGNED' or @status='CONTAMINATED']
      # NOTE(review): presumably keeps a matching XPath from failing the
      # whole workflow -- confirm against xml_evaluate.cwl.
      ignore_all_errors:
        default: true
      # NOTE(review): looks like the file name the tool's stdout is
      # redirected to -- confirm against xml_evaluate.cwl.
      stdout_redir:
        default: errors.xml
    out:
      - xml_output