From bc1e3167d56060f5e0fd1fcbfbf6a1cd4e87524f Mon Sep 17 00:00:00 2001 From: Gisela Gabernet Date: Fri, 16 Feb 2024 12:46:55 -0500 Subject: [PATCH 1/6] avoid creating extra param --- conf/test_10x_sc.config | 1 - nextflow.config | 2 - nextflow_schema.json | 6 -- subworkflows/local/sc_raw_input.nf | 97 ++++++++++++++---------------- workflows/airrflow.nf | 6 +- 5 files changed, 49 insertions(+), 63 deletions(-) diff --git a/conf/test_10x_sc.config b/conf/test_10x_sc.config index 39a7b3d4..76936ef9 100644 --- a/conf/test_10x_sc.config +++ b/conf/test_10x_sc.config @@ -18,7 +18,6 @@ params { // params mode = 'fastq' - sc_raw = true library_generation_method = 'sc_10x_genomics' clonal_threshold = 0 diff --git a/nextflow.config b/nextflow.config index 8a69d532..d84a0c59 100644 --- a/nextflow.config +++ b/nextflow.config @@ -120,7 +120,6 @@ params { // Single cell raw input options // ----------------------- reference_10x = null - sc_raw = false // ----------------------- @@ -296,7 +295,6 @@ profiles { test_assembled_immcantation_devel_mm { includeConfig 'conf/test_assembled_immcantation_devel_mm.config' } test_nocluster { includeConfig 'conf/test_nocluster.config' } test_fetchimgt { includeConfig 'conf/test_fetchimgt.config' } - test_igblast { includeConfig 'conf/test_igblast.config' } test_10x_sc { includeConfig 'conf/test_10x_sc.config' } test_clontech_umi { includeConfig 'conf/test_clontech_umi.config' } test_nebnext_umi { includeConfig 'conf/test_nebnext_umi.config' } diff --git a/nextflow_schema.json b/nextflow_schema.json index d0d56282..4224ebd1 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -459,12 +459,6 @@ "description": "Path to the reference directory required by cellranger. 
Can either be directory or tar.gz.", "help_text": "See for [IMGT](https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/advanced/references#imgt) or [default](https://www.10xgenomics.com/support/software/cell-ranger/downloads).", "fa_icon": "fas fa-database" - }, - "sc_raw": { - "type": "boolean", - "description": "Must be given when raw single cell data should be run.", - "help_text": "Must be given when raw single cell data should be run.", - "fa_icon": "fas fa-database" } }, "help_text": "Options for running raw single cell data.", diff --git a/subworkflows/local/sc_raw_input.nf b/subworkflows/local/sc_raw_input.nf index 12a4b640..25e5b8c9 100644 --- a/subworkflows/local/sc_raw_input.nf +++ b/subworkflows/local/sc_raw_input.nf @@ -26,69 +26,64 @@ workflow SC_RAW_INPUT { ch_reads = FASTQ_INPUT_CHECK.out.reads // validate library generation method parameter - if (params.library_generation_method == 'sc_10x_genomics') { - if (params.vprimers) { - error "The transcript-specific primer, 5'-RACE, UMI library generation method does not require V-region primers, please provide a reference file instead or select another library method option." - } else if (params.race_linker) { - error "The transcript-specific primer, 5'-RACE, UMI library generation method does not require the --race_linker parameter, please provide a reference file instead or select another library method option." - } - if (params.cprimers) { - error "The transcript-specific primer, 5'-RACE, UMI library generation method does not require C-region primers, please provide a reference file instead or select another library method option." - } - if (params.umi_length > 0) { - error "The transcript-specific primer, 5'-RACE, UMI library generation method does not require to set the UMI length, please provide a reference file instead or select another library method option." 
- } - if (params.reference_10x) { - // necessary to allow tar.gz files as input so that tests can run - if (params.reference_10x.endsWith(".tar.gz")){ - UNZIP_CELLRANGERDB( - params.reference_10x - ) - UNZIP_CELLRANGERDB.out.unzipped.set { ch_sc_reference } - } else { - ch_sc_reference = Channel.fromPath(params.reference_10x, checkIfExists: true) - } + if (params.vprimers) { + error "The transcript-specific primer, 5'-RACE, UMI library generation method does not require V-region primers, please provide a reference file instead or select another library method option." + } else if (params.race_linker) { + error "The transcript-specific primer, 5'-RACE, UMI library generation method does not require the --race_linker parameter, please provide a reference file instead or select another library method option." + } + if (params.cprimers) { + error "The transcript-specific primer, 5'-RACE, UMI library generation method does not require C-region primers, please provide a reference file instead or select another library method option." + } + if (params.umi_length > 0) { + error "The transcript-specific primer, 5'-RACE, UMI library generation method does not require to set the UMI length, please provide a reference file instead or select another library method option." + } + if (params.reference_10x) { + // necessary to allow tar.gz files as input so that tests can run + if (params.reference_10x.endsWith(".tar.gz")){ + UNZIP_CELLRANGERDB( + params.reference_10x + ) + UNZIP_CELLRANGERDB.out.unzipped.set { ch_sc_reference } } else { - error "The transcript-specific primer, 5'-RACE, UMI library generation method requires you to provide a reference file." + ch_sc_reference = Channel.fromPath(params.reference_10x, checkIfExists: true) } + } else { + error "The transcript-specific primer, 5'-RACE, UMI library generation method requires you to provide a reference file." 
+ } - // run cellranger vdj - CELLRANGER_VDJ ( - ch_reads, - ch_sc_reference - ) - ch_versions = ch_versions.mix(CELLRANGER_VDJ.out.versions) + // run cellranger vdj + CELLRANGER_VDJ ( + ch_reads, + ch_sc_reference + ) + ch_versions = ch_versions.mix(CELLRANGER_VDJ.out.versions) - ch_cellranger_out = CELLRANGER_VDJ.out.outs + ch_cellranger_out = CELLRANGER_VDJ.out.outs - ch_cellranger_out - .map { meta, out_files -> - [ meta, out_files.find { it.endsWith("airr_rearrangement.tsv") } ] - } - .set { ch_cellranger_airr } + ch_cellranger_out + .map { meta, out_files -> + [ meta, out_files.find { it.endsWith("airr_rearrangement.tsv") } ] + } + .set { ch_cellranger_airr } - // TODO : add VALIDATE_INPUT Module - // this module requires input in csv format... Might need to create this in an extra module + // TODO : add VALIDATE_INPUT Module + // this module requires input in csv format... Might need to create this in an extra module - // rename tsv file to unique name - RENAME_FILE_TSV( - ch_cellranger_airr - ) - .set { ch_renamed_tsv } + // rename tsv file to unique name + RENAME_FILE_TSV( + ch_cellranger_airr + ) + .set { ch_renamed_tsv } - // convert airr tsv to fasta (cellranger does not create any fasta with clonotype information) - CHANGEO_CONVERTDB_FASTA_FROM_AIRR( - RENAME_FILE_TSV.out.file - ) + // convert airr tsv to fasta (cellranger does not create any fasta with clonotype information) + CHANGEO_CONVERTDB_FASTA_FROM_AIRR( + RENAME_FILE_TSV.out.file + ) - ch_fasta = CHANGEO_CONVERTDB_FASTA_FROM_AIRR.out.fasta + ch_fasta = CHANGEO_CONVERTDB_FASTA_FROM_AIRR.out.fasta // TODO: here you can add support for MiXCR sc protocols. - } else { - error "The provided library generation method is not supported. Please check the docs for `--library_generation_method`." 
- } - emit: versions = ch_versions // complete cellranger output diff --git a/workflows/airrflow.nf b/workflows/airrflow.nf index d0878af9..4fcc6f18 100644 --- a/workflows/airrflow.nf +++ b/workflows/airrflow.nf @@ -97,10 +97,12 @@ workflow AIRRFLOW { if ( params.mode == "fastq" ) { // SC:Perform sequence assembly if input type is fastq from single-cell sequencing data (currently only 10XGenomics) - if (params.sc_raw) { + if (params.library_generation_method == "sc_10x_genomics") { + SC_RAW_INPUT( ch_input ) + ch_fasta = SC_RAW_INPUT.out.fasta ch_versions = ch_versions.mix(SC_RAW_INPUT.out.versions) ch_cellranger_airr = SC_RAW_INPUT.out.airr @@ -122,8 +124,6 @@ workflow AIRRFLOW { ch_fastqc_postassembly_mqc = Channel.empty() } else { // Perform sequence assembly if input type is fastq from bulk sequencing data - // TODO make this part run from ch_reads_split.bulk! -> other input, FASTQ_INPUT_CHECK is not needed then anymore - SEQUENCE_ASSEMBLY( ch_input, DATABASES.out.igblast.collect() From 2bb6952c58ebd66cf0a1fe75883365f7767d12ec Mon Sep 17 00:00:00 2001 From: Gisela Gabernet Date: Sun, 18 Feb 2024 22:12:22 -0500 Subject: [PATCH 2/6] merge fastqs with multiple lanes --- CHANGELOG.md | 1 + bin/check_samplesheet.py | 9 ++- modules.json | 5 ++ modules/nf-core/cat/fastq/environment.yml | 7 ++ modules/nf-core/cat/fastq/main.nf | 80 +++++++++++++++++++++++ modules/nf-core/cat/fastq/meta.yml | 42 ++++++++++++ subworkflows/local/fastq_input_check.nf | 47 +++++++++---- 7 files changed, 174 insertions(+), 17 deletions(-) create mode 100644 modules/nf-core/cat/fastq/environment.yml create mode 100644 modules/nf-core/cat/fastq/main.nf create mode 100644 modules/nf-core/cat/fastq/meta.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index 944bc3e1..b88df086 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- [#294](https://github.com/nf-core/airrflow/pull/294) Merge template updates nf-core/tools v2.11.1 - [#299](https://github.com/nf-core/airrflow/pull/299) Add profile for common NEB and TAKARA protocols +- Add possibility to merge multi-lane samples when starting from fastq files ### `Fixed` diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index e27d87e8..9867c446 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -160,9 +160,12 @@ def check_samplesheet(file_in, assembled): ## Check that sample ids are unique if len(tab["sample_id"]) != len(set(tab["sample_id"])): - print_error( - "Sample IDs are not unique! The sample IDs in the input samplesheet should be unique for each sample." - ) + if assembled: + print_error( + "Sample IDs are not unique! The sample IDs in the input samplesheet should be unique for each sample." + ) + else: + print("WARNING: Sample IDs are not unique! FastQs with the same sample ID will be merged.") ## Check that pcr_target_locus is IG or TR for val in tab["pcr_target_locus"]: diff --git a/modules.json b/modules.json index 04cd992c..19799719 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "cat/fastq": { + "branch": "master", + "git_sha": "02fd5bd7275abad27aad32d5c852e0a9b1b98882", + "installed_by": ["modules"] + }, "cellranger/mkvdjref": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", diff --git a/modules/nf-core/cat/fastq/environment.yml b/modules/nf-core/cat/fastq/environment.yml new file mode 100644 index 00000000..bff93add --- /dev/null +++ b/modules/nf-core/cat/fastq/environment.yml @@ -0,0 +1,7 @@ +name: cat_fastq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::sed=4.7 diff --git a/modules/nf-core/cat/fastq/main.nf b/modules/nf-core/cat/fastq/main.nf new file mode 100644 index 00000000..3d963784 --- /dev/null +++ b/modules/nf-core/cat/fastq/main.nf 
@@ -0,0 +1,80 @@
+process CAT_FASTQ { // Concatenates the FastQ files in `reads` into <prefix>[_1|_2].merged.fastq.gz
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
+        'nf-core/ubuntu:20.04' }"
+
+    input:
+    tuple val(meta), path(reads, stageAs: "input*/*")
+
+    output:
+    tuple val(meta), path("*.merged.fastq.gz"), emit: reads
+    path "versions.yml"                       , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()]
+    if (meta.single_end) {
+        if (readList.size >= 1) {
+            """
+            cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz
+
+            cat <<-END_VERSIONS > versions.yml
+            "${task.process}":
+                cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//')
+            END_VERSIONS
+            """
+        }
+    } else {
+        if (readList.size >= 2) {
+            def read1 = []
+            def read2 = []
+            readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } // even index -> read 1, odd index -> read 2
+            """
+            cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz
+            cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz
+
+            cat <<-END_VERSIONS > versions.yml
+            "${task.process}":
+                cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//')
+            END_VERSIONS
+            """
+        }
+    }
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()]
+    if (meta.single_end) {
+        if (readList.size >= 1) { // same guard as script:, so -stub also handles a single file
+            """
+            touch ${prefix}.merged.fastq.gz
+
+            cat <<-END_VERSIONS > versions.yml
+            "${task.process}":
+                cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//')
+            END_VERSIONS
+            """
+        }
+    } else {
+        if (readList.size >= 2) { // same guard as script:, so -stub also handles a single R1/R2 pair
+            """
+            touch ${prefix}_1.merged.fastq.gz
+            touch ${prefix}_2.merged.fastq.gz
+
+            cat <<-END_VERSIONS > versions.yml
+            "${task.process}":
+                cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//')
+            END_VERSIONS
+            """
+        }
+    }
+
+}
diff --git a/modules/nf-core/cat/fastq/meta.yml b/modules/nf-core/cat/fastq/meta.yml
new file mode 100644
index 00000000..db4ac3c7
--- /dev/null
+++ b/modules/nf-core/cat/fastq/meta.yml
@@ -0,0 +1,42 @@
+name: cat_fastq
+description: Concatenates fastq files
+keywords:
+  - cat
+  - fastq
+  - concatenate
+tools:
+  - cat:
+      description: |
+        The cat utility reads files sequentially, writing them to the standard output.
+      documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html
+      licence: ["GPL-3.0-or-later"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FastQ files to be concatenated.
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. 
[ id:'test', single_end:false ] + - reads: + type: file + description: Merged fastq file + pattern: "*.{merged.fastq.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/subworkflows/local/fastq_input_check.nf b/subworkflows/local/fastq_input_check.nf index 52f87190..b5165871 100644 --- a/subworkflows/local/fastq_input_check.nf +++ b/subworkflows/local/fastq_input_check.nf @@ -3,8 +3,7 @@ */ include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' -//TODO: when enchantr supports input samplesheet from raw sequencing, update code here to commented one. -//include { VALIDATE_INPUT } from '../../modules/local/enchantr/validate_input' +include { CAT_FASTQ } from '../../modules/nf-core/cat/fastq/main' workflow FASTQ_INPUT_CHECK { take: @@ -15,22 +14,41 @@ workflow FASTQ_INPUT_CHECK { .tsv .splitCsv ( header:true, sep:'\t' ) .map { create_fastq_channels(it) } + .dump (tag: 'fastq_channel_before_merge_samples') + .groupTuple(by: [0]) + .dump(tag: 'fastq_channel_after_merge_samples_grouped') + .branch { + meta, fastqs -> + single: fastqs.size() == 1 + return [ meta, fastqs.flatten() ] + multiple: fastqs.size() > 1 + return [ meta, fastqs.flatten() ] + } .set { ch_reads } - // VALIDATE_INPUT( - // samplesheet, - // params.miairr, - // params.collapseby, - // params.cloneby - // ) + ch_versions = SAMPLESHEET_CHECK.out.versions + + // Merge multi-lane sample fastq for protocols except for 10x genomics (cellranger handles multi-fastq per sample) + if (params.library_generation_method == 'sc_10x_genomics') { + + ch_merged_reads = ch_reads.single.mix( ch_reads.multiple ) + + } else { + + CAT_FASTQ ( + ch_reads.multiple + ) + .reads + .mix( ch_reads.single ) + .dump (tag: 'fastq_channel_after_merge_samples') + .set { ch_merged_reads } - // VALIDATE_INPUT.out.validated_input - // .splitCsv(header: true, 
sep:'\t') - // .map { get_meta(it) } - // .set{ ch_reads } + ch_versions = ch_versions.mix( CAT_FASTQ.out.versions ) + + } emit: - reads = ch_reads // channel: [ val(meta), [ reads ] ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] + reads = ch_merged_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] samplesheet = SAMPLESHEET_CHECK.out.tsv // tsv metadata file } @@ -47,6 +65,7 @@ def create_fastq_channels(LinkedHashMap col) { meta.filetype = "fastq" meta.single_cell = col.single_cell.toLowerCase() meta.locus = col.pcr_target_locus + meta.single_end = false def array = [] if (!file(col.filename_R1).exists()) { From f1caa7b8c8b828ffb2c42ad03420e002d1554828 Mon Sep 17 00:00:00 2001 From: Gisela Gabernet Date: Sun, 18 Feb 2024 22:27:20 -0500 Subject: [PATCH 3/6] fix text --- subworkflows/local/sc_raw_input.nf | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/sc_raw_input.nf b/subworkflows/local/sc_raw_input.nf index 25e5b8c9..109947b8 100644 --- a/subworkflows/local/sc_raw_input.nf +++ b/subworkflows/local/sc_raw_input.nf @@ -27,15 +27,15 @@ workflow SC_RAW_INPUT { // validate library generation method parameter if (params.vprimers) { - error "The transcript-specific primer, 5'-RACE, UMI library generation method does not require V-region primers, please provide a reference file instead or select another library method option." + error "The single-cell 10X genomics library generation method does not require V-region primers, please provide a reference file instead or select another library method option." } else if (params.race_linker) { - error "The transcript-specific primer, 5'-RACE, UMI library generation method does not require the --race_linker parameter, please provide a reference file instead or select another library method option." 
+ error "The single-cell 10X genomics library generation method does not require the --race_linker parameter, please provide a reference file instead or select another library method option." } if (params.cprimers) { - error "The transcript-specific primer, 5'-RACE, UMI library generation method does not require C-region primers, please provide a reference file instead or select another library method option." + error "The single-cell 10X genomics library generation method does not require C-region primers, please provide a reference file instead or select another library method option." } if (params.umi_length > 0) { - error "The transcript-specific primer, 5'-RACE, UMI library generation method does not require to set the UMI length, please provide a reference file instead or select another library method option." + error "The single-cell 10X genomics library generation method does not require to set the UMI length, please provide a reference file instead or select another library method option." } if (params.reference_10x) { // necessary to allow tar.gz files as input so that tests can run @@ -48,7 +48,7 @@ workflow SC_RAW_INPUT { ch_sc_reference = Channel.fromPath(params.reference_10x, checkIfExists: true) } } else { - error "The transcript-specific primer, 5'-RACE, UMI library generation method requires you to provide a reference file." + error "The single-cell 10X genomics library generation method requires you to provide a reference file." } // run cellranger vdj @@ -84,6 +84,7 @@ workflow SC_RAW_INPUT { // TODO: here you can add support for MiXCR sc protocols. 
+ emit: versions = ch_versions // complete cellranger output From 7b8b8c9a8f15554e032bcbe00c7fa59e8c2e5f82 Mon Sep 17 00:00:00 2001 From: Gisela Gabernet Date: Sun, 18 Feb 2024 22:29:27 -0500 Subject: [PATCH 4/6] fix lint --- workflows/airrflow.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/airrflow.nf b/workflows/airrflow.nf index 4fcc6f18..ede383fa 100644 --- a/workflows/airrflow.nf +++ b/workflows/airrflow.nf @@ -145,7 +145,7 @@ workflow AIRRFLOW { ch_presto_assemblepairs_logs = SEQUENCE_ASSEMBLY.out.presto_assemblepairs_logs ch_presto_collapseseq_logs = SEQUENCE_ASSEMBLY.out.presto_collapseseq_logs ch_presto_splitseq_logs = SEQUENCE_ASSEMBLY.out.presto_splitseq_logs - } + } } else if ( params.mode == "assembled" ) { From 68a73290accfeba97b83ded4ab4e4528e5f9d799 Mon Sep 17 00:00:00 2001 From: Gisela Gabernet Date: Mon, 19 Feb 2024 09:01:04 -0500 Subject: [PATCH 5/6] fix metadata merge --- bin/reveal_add_metadata.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bin/reveal_add_metadata.R b/bin/reveal_add_metadata.R index f2ff5b5f..1745da89 100755 --- a/bin/reveal_add_metadata.R +++ b/bin/reveal_add_metadata.R @@ -61,8 +61,12 @@ if (!("INPUTID" %in% names(opt))) { # Read metadata file metadata <- read.csv(opt$METADATA, sep = "\t", header = TRUE, stringsAsFactors = F) +# Merging samples over multiple lanes introduces multi-rows per sample +# We expect only one row per sample metadata <- metadata %>% - filter(sample_id == opt$INPUTID) + dplyr::filter(sample_id == opt$INPUTID) %>% + dplyr::select(!starts_with("filename_")) %>% + dplyr::distinct() if (nrow(metadata) != 1) { stop("Expecting nrow(metadata) == 1; nrow(metadata) == ", nrow(metadata), " found") @@ -81,10 +85,7 @@ internal_fields <- "id", "filetype", "valid_single_cell", - "valid_pcr_target_locus", - "filename_R1", - "filename_R2", - "filename_I1" + "valid_pcr_target_locus" ) metadata <- metadata[, !colnames(metadata) %in% internal_fields] From 
afa4c1dbbad344ec6156855a5673573e874f9cd0 Mon Sep 17 00:00:00 2001 From: Gisela Gabernet Date: Tue, 20 Feb 2024 10:00:53 -0500 Subject: [PATCH 6/6] add collect --- subworkflows/local/sc_raw_input.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/sc_raw_input.nf b/subworkflows/local/sc_raw_input.nf index 109947b8..8f46cbfd 100644 --- a/subworkflows/local/sc_raw_input.nf +++ b/subworkflows/local/sc_raw_input.nf @@ -53,8 +53,8 @@ workflow SC_RAW_INPUT { // run cellranger vdj CELLRANGER_VDJ ( - ch_reads, - ch_sc_reference + ch_reads, + ch_sc_reference.collect() ) ch_versions = ch_versions.mix(CELLRANGER_VDJ.out.versions)