diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4ce74cc0..90a36672 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Added`
 
+- [#576](https://github.com/nf-core/taxprofiler/pull/576) Sort input to krakenuniq to enable retrieval of cached batch tasks (added by @muniheart)
+
 ### `Fixed`
 
 - [573](https://github.com/nf-core/taxprofiler/pull/573) Improved help messages and documentation to state many of the taxpasta related params require taxonomy files to be input (❤️ to @alexhbnr for reporting, fix by @jfy133)
diff --git a/conf/modules.config b/conf/modules.config
index 1b82c9ee..35a912e2 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -603,6 +603,7 @@ process {
     }
 
     withName: KRAKENUNIQ_PRELOADEDKRAKENUNIQ {
+        tag = { "${meta.db_name}|${task.index}" }
         ext.args = { "${meta.db_params}" }
         // one run with multiple samples, so fix ID to just db name to ensure clean log name
         ext.prefix = { "${meta.db_name}.krakenuniq" }
diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf
index e21a745b..8409efe9 100644
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@@ -384,7 +384,8 @@ workflow PROFILING {
     }
 
     if ( params.run_krakenuniq ) {
-        ch_input_for_krakenuniq = ch_input_for_profiling.krakenuniq
+
+        ch_input_for_krakenuniq = ch_input_for_profiling.krakenuniq
             .map {
                 meta, reads, db_meta, db ->
                     def seqtype = (reads[0].name ==~ /.+?\.f\w{0,3}a(\.gz)?$/) ? 'fasta' : 'fastq'
@@ -395,6 +396,10 @@ workflow PROFILING {
             }
             .groupTuple(by: [0,2,3])
             .flatMap { single_meta, reads, db_meta, db ->
-                def batches = reads.collate(params.krakenuniq_batch_size)
+                // Sort the grouped reads by their last element (the prefix) so that batch membership stays
+                // constant across runs, enabling retrieval of cached tasks. Groovy's sort() only sorts a List
+                // in place, so take the list returned by toSorted() rather than relying on mutating `reads`.
+                def sorted_reads = reads.toSorted { a, b -> a[-1] <=> b[-1] }
+                def batches = sorted_reads.collate(params.krakenuniq_batch_size)
                 return batches.collect { batch ->
                     // We split the sample identifier from the reads again after batching.