diff --git a/.gitignore b/.gitignore index 0d3bda2b..bceee492 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,6 @@ .Rhistory .RData .Ruserdata +TESTE docs/_html -teste \ No newline at end of file +teste diff --git a/.zenodo.json b/.zenodo.json index 6e122456..8a37436f 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -1,8 +1,8 @@ { - "description": "

The pipeline

\n\n

bacannot, is a customisable, easy to use, pipeline that uses state-of-the-art software for comprehensively annotating prokaryotic genomes having only Docker and Nextflow as dependencies. It is able to annotate and detect virulence and resistance genes, plasmids, genomic islands, prophages, ICEs, KO, and more.

\n\n

Release notes

\n\n

This is a super small fix:

", + "description": "

The pipeline

\n\n

bacannot is a customisable, easy-to-use pipeline that uses state-of-the-art software for comprehensively annotating prokaryotic genomes, having only Docker and Nextflow as dependencies. It is able to annotate and detect virulence and resistance genes, plasmids, secondary metabolites, genomic islands, prophages, ICEs, KO, and more.

", "license": "other-open", - "title": "fmalmeida/bacannot: fmalmeida/bacannot v2.4.2", - "version": "v2.4.2", + "title": "fmalmeida/bacannot: A generic but comprehensive bacterial annotation pipeline", + "version": "v3.0", "upload_type": "software", "creators": [ { diff --git a/README.md b/README.md index 9bd7db0a..206dc233 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,11 @@ -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3627669.svg)](https://doi.org/10.5281/zenodo.3627669) [![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/fmalmeida/bacannot?include_prereleases&label=Latest%20release)](https://github.com/fmalmeida/bacannot/releases) [![Documentation](https://img.shields.io/badge/Documentation-readthedocs-brightgreen)](https://bacannot.readthedocs.io/en/latest/?badge=latest) [![Dockerhub](https://img.shields.io/badge/Docker-fmalmeida/bacannot-informational)](https://hub.docker.com/r/fmalmeida/bacannot) [![Nextflow version](https://img.shields.io/badge/Nextflow%20>=-v20.07-important)](https://www.nextflow.io/docs/latest/getstarted.html) [![License](https://img.shields.io/badge/License-GPL%203-black)](https://github.com/fmalmeida/bacannot/blob/master/LICENSE) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3627669.svg)](https://doi.org/10.5281/zenodo.3627669) +[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/fmalmeida/bacannot?include_prereleases&label=Latest%20release)](https://github.com/fmalmeida/bacannot/releases) +[![Documentation](https://img.shields.io/badge/Documentation-readthedocs-brightgreen)](https://bacannot.readthedocs.io/en/latest/?badge=latest) +[![Dockerhub](https://img.shields.io/badge/Docker-fmalmeida/bacannot-informational)](https://hub.docker.com/r/fmalmeida/bacannot) +[![Nextflow version](https://img.shields.io/badge/Nextflow%20>=-v20.07-important)](https://www.nextflow.io/docs/latest/getstarted.html) 
+[![License](https://img.shields.io/badge/License-GPL%203-black)](https://github.com/fmalmeida/bacannot/blob/master/LICENSE)

@@ -78,8 +83,8 @@ These images have been kept separate to not create massive Docker image and to a * Required - docker pull fmalmeida/bacannot:v2.4 # Main image for core annotations - docker pull fmalmeida/bacannot:v2.4_renv # R packages for reports + docker pull fmalmeida/bacannot:v3.0 # Main image for core annotations + docker pull fmalmeida/bacannot:v3.0_renv # R packages for reports docker pull fmalmeida/bacannot:jbrowse # JBrowse software * Optional @@ -87,11 +92,10 @@ These images have been kept separate to not create massive Docker image and to a docker pull fmalmeida/bacannot:kofamscan # If user wants KO annotation docker pull fmalmeida/bacannot:antismash # If user wants antismash annotation docker pull fmalmeida/bacannot:server # If user wants to open the shiny parser - docker pull fmalmeida/mpgap:v2.3 # If using raw reads as input 🔥 Nextflow can also automatically handle images download on the fly when executed. However, some servers may hang the download due to the image size (view below). -❗ If the download of `fmalmeida/bacannot:v2.4` image keeps hanging due to its size, download `fmalmeida/bacannot:main_tools` first. It is the core of the versioned tag and it will help on the download by creating some cache. +❗ If the download of `fmalmeida/bacannot:v3.0` image keeps hanging due to its size, download `fmalmeida/bacannot:main_tools` first. It is the core of the versioned tag and it will help on the download by creating some cache. 2. Install Nextflow (version 20.07 or higher): @@ -105,33 +109,19 @@ These images have been kept separate to not create massive Docker image and to a ### Maintaining databases up-to-date -By default, github actions have been set to build the docker image containing the databases (`fmalmeida/bacannot:v2.4`) in the first day of every month. Therefore, to use the most up-to-date databases users must run `docker pull fmalmeida/bacannot:v2.4` before running the pipeline. 
+To use the most up-to-date databases users must run `docker pull fmalmeida/bacannot:v3.0` before running the pipeline. We try to keep this image updated every three months if they pass execution tests after built. -Additionally, a custom script is provided to allow users to update the database image any time. +A custom script is provided to allow users to update the database image any time, if desired. ```bash bash <(wget -O - -o /dev/null https://github.com/fmalmeida/bacannot/raw/master/bin/update_database_image.sh) ``` -> This command line will trigger a custom script that downloads the databases and build the fmalmeida/bacannot:v2.4 docker image. +> This command line will trigger a custom script that downloads the databases and build the main docker image. ## Quickstart -For a rapid and simple quickstart we will use as input the _Escherichia coli_ reference genome. - -```bash - - # Download the ecoli ref genome - wget -O ecoli_ref.fna.gz https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/008/865/GCF_000008865.2_ASM886v2/GCF_000008865.2_ASM886v2_genomic.fna.gz - gzip -d ecoli_ref.fna.gz - - # Run the pipeline using the Escherichia coli resfinder database - nextflow run fmalmeida/bacannot \ - --prefix ecoli \ - --genome ecoli_ref.fna \ - --outdir _ANNOTATION \ - --threads 4 \ - --resfinder_species "Escherichia coli" +Please refer to the quickstart page » ``` ### Overview of outputs diff --git a/configuration_template/bacannot.config b/configuration_template/bacannot.config deleted file mode 100644 index 256cc1b1..00000000 --- a/configuration_template/bacannot.config +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Configuration File to run fmalmeida/bacannot pipeline. - */ - -/* - - Required Parameters. - This parameters must always be set - -*/ -params { - - /* - - SINGLE GENOME ANALYSIS - - */ - -// Prefix for writing genome assembly and annotatin resulting files -// Preferentially the sample name - prefix = 'out' - -// The input file formats are mutually exclusive. 
Users must choose between giving an -// assembled genome or raw reads to the pipeline. -// Input genome -- Always in FASTA format. - genome = '' - -// Input raw reads -- Always in FASTQ format. -// When using raw reads, the fmalmeida/mpgap is also required to be available. - sreads_single = '' // Path to unpaired illumina reads, if available for the sample - sreads_paired = '' // Path to paired end illumina reads, if available for the sample - lreads = '' // Path to longreads (ONT or Pacbio), if available for the sample - lreads_type = '' // Longreads is used? If so, from which tech it is? Options: [ nanopore or pacbio ] - -// Species panel to be used when annotating with Resfinder. If blank, -// it will not be executed. Must be identical (without the *) as written -// in their webservice https://cge.cbs.dtu.dk/services/ResFinder/. -// E.g. 'Escherichia coli'; 'Klebsiella' ... - resfinder_species = '' - -// Configure optional Methylation annotation with nanopolish -// If left blank, it will not be executed. When both parameters are set -// it will automatically execute nanopolish to call methylation - - nanopolish_fast5 = '' // Path to directory containing FAST5 files - nanopolish_fastq = '' // Path to fastq files (file related to FAST5 files above) - - /* - - MULTIPLE GENOME ANALYSIS - - */ - -// When analysing multiple genomes at once, all the parameters described above, must be, whenever -// necessary and applicable to your data, set inside a samplesheet file in YAML format. We provide -// an well-formated example of this YAML file at: https://github.com/fmalmeida/bacannot/blob/master/example_samplesheet.yaml -// -// Please read the example YAML samplesheet so you can understand how to properly fill it. -// -// It is also documented in the main manual: https://bacannot.readthedocs.io/en/latest/samplesheet.html - in_yaml = '' - - /* - - GENERAL PARAMETERS -- FOR BOTH SINGLE AND MULTIPLE GENOME WORKFLOWS - - */ - -// Main output folder name. 
More than one bacannot annotation can be redirected -// to the same output parameter. It is good to keep related annotations together. -// A subdirectory with the filename will be created inside this directory. - outdir = 'output' - -// Number of threads to be used by each software - threads = 2 - -// Number of jobs to run in parallel. Be aware that each job (in parallel) can consume -// N threads (set above). Be sure to carefully check your resources before augmenting -// this parameter. For example: parallel_jobs = 2 + threads = 5 can consume until 10 -// threads at once. -// If not given, let's nextflow automatically handle it. - parallel_jobs = - -// Number of minimum overlapping base pairs required for merging -// Negative values, such as -20, means the number of required overlapping bases for merging. -// Positive values, such as 5, means the maximum distance accepted between features for merging. -// By default (if Blank), this process is not executed. For execution the user needs to provide a value - bedtools_merge_distance = '' - - /* - * Prokka optional parameters - */ - -// Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default 'Bacteria') - prokka_kingdom = '' - -// Translation table code. Must be set if the above is set. -// Example: params.prokka_genetic.code = 11 - prokka_genetic_code = false - -// Use rnammer instead of Barrnap? False or True? - prokka_use_rnammer = false - - /* - * Handling the execution of processes - * - * By default, all processes are executed. These - * parameters tells wheter NOT to run a process. - * - * Which means: false will allow its execution - * while true will create a barrier and skip a process. - -*/ -// (NOT RUN?) Plasmids annotation (controls PlasmidFinder execution) - skip_plasmid_search = false - -// (NOT RUN?) General Virulence annotation (controls VFDB and Victors scan) - skip_virulence_search = false - -// (NOT RUN?) 
Resistance annotation (controls AMRfinder and RGI) - skip_resistance_search = false - -// (NOT RUN?) ICE annotation (controls ICEberg annotation) - skip_iceberg_search = false - -// (NOT RUN?) prophage annotation (controls PHAST and Phigaro) - skip_prophage_search = false - -// (NOT RUN?) KO (KEGG Orthology) annotation - skip_kofamscan = false - -// (NOT RUN?) antiSMASH (secondary metabolite) annotation - skip_antismash = false - - /* - * Custom databases can be used to annotate additional genes in the genome. - * It runs a BLASTn alignment against the genome, therefore, the custom database - * MUST be a nucleotide fasta of genes. More than one custom database can be given - * separated by commas. Gene headers must be properly formated as described in the - * documentation: https://bacannot.readthedocs.io/en/latest/custom-db.html - */ -// Custom nucleotide fastas - custom_db = '' - - /* - * Annotation thresholds to be used when scanning specific databases and features - * Select a combination of thresholds that is meaningful for your data. Some of - * the databases are protein-only, others are nucleotide only. We cannnot control - * that and the databases will be scanned either if blastp or blastn using these - * thresholds described here. 
- */ - -// Identity threshold for plasmid annotation - plasmids_minid = 90 - -// Coverage threshold for plasmid annotation - plasmids_mincov = 60 - -// Virulence genes identity threshold - blast_virulence_minid = 90 - -// Virulence genes coverage threshold - blast_virulence_mincov = 80 - -// AMR genes identity threshold - blast_resistance_minid= 90 - -// AMR genes coverage threshold - blast_resistance_mincov = 80 - -// MGEs (ICEs and Phages) identity threshold - blast_MGEs_minid = 65 - -// MGEs (ICEs and Phages) coverage threshold - blast_MGEs_mincov = 65 - -// User's custom database identity threashold - blast_custom_minid = 0 - -// User's custom database coverage threashold - blast_custom_mincov = 0 - -} - -/* - Configuration of Nextflow Scopes - */ - -//Trace Report -trace { - enabled = false - file = "${params.outdir}" + "/annotation_pipeline_trace.txt" - fields = 'task_id,name,status,exit,realtime,cpus,%cpu,memory,%mem,rss' -} - -//Timeline Report -timeline { - enabled = false - file = "${params.outdir}" + "/annotation_pipeline_timeline.html" -} - -//Complete Report -report { - enabled = false - file = "${params.outdir}" + "/annotation_pipeline_nextflow_report.html" -} - -/* - Setting up NF profiles - To use different profiles and executors - please read more at: https://www.nextflow.io/docs/latest/config.html#config-profiles -*/ -profiles { - standard { - // Executor - process.executor = "local" - // QueueSize limit - if (params.parallel_jobs || params.parallel_jobs != '') { - qs = params.parallel_jobs - } - executor { - name = "local" - if (params.parallel_jobs || params.parallel_jobs != '') { - queueSize = qs - } - } - } - - awsbatch { - process.executor = 'awsbatch' - process.queue = 'my-batch-queue' - // cpu allocation - process.cpus = params.threads - } -} diff --git a/docs/config.rst b/docs/config.rst index 4ceb3d0b..b391ed1f 100644 --- a/docs/config.rst +++ b/docs/config.rst @@ -1,241 +1,14 @@ .. 
_config: Configuration File -"""""""""""""""""" +================== To download a configuration file template users just need to run ``nextflow run fmalmeida/bacannot --get_config`` Using a config file your code is lot more clean and concise: ``nextflow run fmalmeida/bacannot -c [path-to-config]`` -Default configuration: +Default configuration +--------------------- -.. code-block:: groovy - - /* - * Configuration File to run fmalmeida/bacannot pipeline. - */ - - /* - - Required Parameters. - This parameters must always be set - - */ - params { - - /* - - SINGLE GENOME ANALYSIS - - */ - - // Prefix for writing genome assembly and annotatin resulting files - // Preferentially the sample name - prefix = 'out' - - // The input file formats are mutually exclusive. Users must choose between giving an - // assembled genome or raw reads to the pipeline. - // Input genome -- Always in FASTA format. - genome = '' - - // Input raw reads -- Always in FASTQ format. - // When using raw reads, the fmalmeida/mpgap is also required to be available. - sreads_single = '' // Path to unpaired illumina reads, if available for the sample - sreads_paired = '' // Path to paired end illumina reads, if available for the sample - lreads = '' // Path to longreads (ONT or Pacbio), if available for the sample - lreads_type = '' // Longreads is used? If so, from which tech it is? Options: [ nanopore or pacbio ] - - // Species panel to be used when annotating with Resfinder. If blank, - // it will not be executed. Must be identical (without the *) as written - // in their webservice https://cge.cbs.dtu.dk/services/ResFinder/. - // E.g. 'Escherichia coli'; 'Klebsiella' ... - resfinder_species = '' - - // Configure optional Methylation annotation with nanopolish - // If left blank, it will not be executed. 
When both parameters are set - // it will automatically execute nanopolish to call methylation - - nanopolish_fast5 = '' // Path to directory containing FAST5 files - nanopolish_fastq = '' // Path to fastq files (file related to FAST5 files above) - - /* - - MULTIPLE GENOME ANALYSIS - - */ - - // When analysing multiple genomes at once, all the parameters described above, must be, whenever - // necessary and applicable to your data, set inside a samplesheet file in YAML format. We provide - // an well-formated example of this YAML file at: https://github.com/fmalmeida/bacannot/blob/master/example_samplesheet.yaml - // - // Please read the example YAML samplesheet so you can understand how to properly fill it. - // - // It is also documented in the main manual: https://bacannot.readthedocs.io/en/latest/samplesheet.html - in_yaml = '' - - /* - - GENERAL PARAMETERS -- FOR BOTH SINGLE AND MULTIPLE GENOME WORKFLOWS - - */ - - // Main output folder name. More than one bacannot annotation can be redirected - // to the same output parameter. It is good to keep related annotations together. - // A subdirectory with the filename will be created inside this directory. - outdir = 'output' - - // Number of threads to be used by each software - threads = 2 - - // Number of jobs to run in parallel. Be aware that each job (in parallel) can consume - // N threads (set above). Be sure to carefully check your resources before augmenting - // this parameter. For example: parallel_jobs = 2 + threads = 5 can consume until 10 - // threads at once. - // If not given, let's nextflow automatically handle it. - parallel_jobs = - - // Number of minimum overlapping base pairs required for merging - // Negative values, such as -20, means the number of required overlapping bases for merging. - // Positive values, such as 5, means the maximum distance accepted between features for merging. - // By default (if Blank), this process is not executed. 
For execution the user needs to provide a value - bedtools_merge_distance = '' - - /* - * Prokka optional parameters - */ - - // Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default 'Bacteria') - prokka_kingdom = '' - - // Translation table code. Must be set if the above is set. - // Example: params.prokka_genetic.code = 11 - prokka_genetic_code = false - - // Use rnammer instead of Barrnap? False or True? - prokka_use_rnammer = false - - /* - * Handling the execution of processes - * - * By default, all processes are executed. These - * parameters tells wheter NOT to run a process. - * - * Which means: false will allow its execution - * while true will create a barrier and skip a process. - - */ - // (NOT RUN?) Plasmids annotation (controls PlasmidFinder execution) - skip_plasmid_search = false - - // (NOT RUN?) General Virulence annotation (controls VFDB and Victors scan) - skip_virulence_search = false - - // (NOT RUN?) Resistance annotation (controls AMRfinder and RGI) - skip_resistance_search = false - - // (NOT RUN?) ICE annotation (controls ICEberg annotation) - skip_iceberg_search = false - - // (NOT RUN?) prophage annotation (controls PHAST and Phigaro) - skip_prophage_search = false - - // (NOT RUN?) KO (KEGG Orthology) annotation - skip_kofamscan = false - - // (NOT RUN?) antiSMASH (secondary metabolite) annotation - skip_antismash = false - - /* - * Custom databases can be used to annotate additional genes in the genome. - * It runs a BLASTn alignment against the genome, therefore, the custom database - * MUST be a nucleotide fasta of genes. More than one custom database can be given - * separated by commas. 
Gene headers must be properly formated as described in the - * documentation: https://bacannot.readthedocs.io/en/latest/custom-db.html - */ - // Custom nucleotide fastas - custom_db = '' - - /* - * Annotation thresholds to be used when scanning specific databases and features - * Select a combination of thresholds that is meaningful for your data. Some of - * the databases are protein-only, others are nucleotide only. We cannnot control - * that and the databases will be scanned either if blastp or blastn using these - * thresholds described here. - */ - - // Identity threshold for plasmid annotation - plasmids_minid = 90 - - // Coverage threshold for plasmid annotation - plasmids_mincov = 60 - - // Virulence genes identity threshold - blast_virulence_minid = 90 - - // Virulence genes coverage threshold - blast_virulence_mincov = 80 - - // AMR genes identity threshold - blast_resistance_minid= 90 - - // AMR genes coverage threshold - blast_resistance_mincov = 80 - - // MGEs (ICEs and Phages) identity threshold - blast_MGEs_minid = 65 - - // MGEs (ICEs and Phages) coverage threshold - blast_MGEs_mincov = 65 - - // User's custom database identity threashold - blast_custom_minid = 0 - - // User's custom database coverage threashold - blast_custom_mincov = 0 - - } - - /* - Configuration of Nextflow Scopes - */ - - //Trace Report - trace { - enabled = false - file = "${params.outdir}" + "/annotation_pipeline_trace.txt" - fields = 'task_id,name,status,exit,realtime,cpus,%cpu,memory,%mem,rss' - } - - //Timeline Report - timeline { - enabled = false - file = "${params.outdir}" + "/annotation_pipeline_timeline.html" - } - - //Complete Report - report { - enabled = false - file = "${params.outdir}" + "/annotation_pipeline_nextflow_report.html" - } - - /* - Setting up NF profiles - To use different profiles and executors - please read more at: https://www.nextflow.io/docs/latest/config.html#config-profiles - */ - profiles { - standard { - // Executor - process.executor = 
"local" - // QueueSize limit - if (params.parallel_jobs || params.parallel_jobs != '') { - qs = params.parallel_jobs - } - executor { - name = "local" - if (params.parallel_jobs || params.parallel_jobs != '') { - queueSize = qs - } - } - } - } +.. literalinclude:: ../nextflow.config + :language: groovy diff --git a/docs/custom-db.rst b/docs/custom-db.rst index 28080e0c..89b7eac7 100644 --- a/docs/custom-db.rst +++ b/docs/custom-db.rst @@ -13,7 +13,7 @@ Although simple, the custom database must follow some rules about sequence heade of custom reports in HTML format, that shall be available under the ``report_files`` directory. Sequence header format -"""""""""""""""""""""" +---------------------- Sequence headers must follow a 5-field rule separated by "~~~" and spaces. The first 4 fields must be separated by "~~~" and the last one by one space, following the example shown below: @@ -35,7 +35,7 @@ example shown below: It is very important to follow this header format in order to make it possible and easier to render summaries and reports of the BLASTn result, such as below: BLASTn summary example -"""""""""""""""""""""" +---------------------- When the header is followed, the summaries and reports are very well rendered such as in this example: diff --git a/docs/examples.rst b/docs/examples.rst deleted file mode 100644 index 073233a1..00000000 --- a/docs/examples.rst +++ /dev/null @@ -1,115 +0,0 @@ -.. _examples: - -Usage examples -============== - -Launching interactive graphical interface -""""""""""""""""""""""""""""""""""""""""" - -Users can trigger a graphical and interactive pipeline configuration and execution by using `nf-core launch `_ utility. - -.. code-block:: bash - - # Install nf-core - pip install nf-core - - # Lauch pipeline interactive configuration - nf-core launch fmalmeida/bacannot - -Single genome annotation -"""""""""""""""""""""""" - -.. 
code-block:: bash - - ./nextflow run fmalmeida/bacannot \ - --outdir TESTE \ - --threads 3 \ - --genome assembly.fasta \ - --bedtools_merge_distance -20 \ - --skip_kofamscan - -.. note:: - - This command will perform a rapid annotation of ``assembly.fasta`` file using a minimum of 20 overlapping bases - for gene merge and will not execute Kofamscan, nor methylation call with Nanopolish. - -Multiple genome annotation -"""""""""""""""""""""""""" - -.. code-block:: bash - - ./nextflow run fmalmeida/bacannot \ - --outdir TESTE \ - --threads 3 \ - --in_yaml samplesheet.yaml \ - --custom_db db1.fasta - -.. warning:: - - Samplesheet must be properly configured as in :ref:`samplesheet`. - -.. note:: - - The ``--custom_db`` parameter is used to add an annotation process with BLASTn using an user's custom db. - -A little more complex example -""""""""""""""""""""""""""""" - -.. code-block:: bash - - ./nextflow run fmalmeida/bacannot \ - --outdir TESTE \ - --threads 3 \ - --genome assembly.fasta \ - --bedtools_merge_distance -20 \ - --nanopolish_fastq "fastq/input.fastq" \ - --nanopolish_fast5 "fast5_pass_dir" \ - --resfinder_species "Escherichia coli" - -.. note:: - - Differently, this command will run **all** the main analysis because the Resfinder and Nanopolish - parameters have been set and no process have been told to skip (e.g. ``--skip_kofamscan``). - -Annotating from raw reads -""""""""""""""""""""""""" - -Users are able to annotate genomes directly from raw reads. When raw reads are used, Unicycler is used to create -shortreads-only and hybrid assemblies while Flye is used to create longreads-only assemblies the annotation process. - - -.. code-block:: bash - - nextflow run fmalmeida/bacannot \ - --sreads_paired "sample1_{1,2}.fastq" \ - --lreads "sample1_lreads.fastq" \ - --lreads_type nanopore \ - --outdir TESTE \ - --skip_kofamscan \ - --threads 5 \ - --nanopolish_fastq "sample1_lreads.fastq" \ - --nanopolish_fast5 "fast5_pass_dir" - -.. 
note:: - - This command will first perform a hybrid assembly with Unicycler and then annotate the assembled genome. Additionnally, since - nanopolish parameters were given, it will call methylations with nanopolish. - -.. note:: - - Remember to always write input paths inside double quotes. - -.. note:: - - When using paired end reads it is required that input reads are set with the "{1,2}"" pattern. For example: "SRR6307304_{1,2}.fastq". This will properly load reads "SRR6307304_1.fastq" and "SRR6307304_2.fastq" - -.. warning:: - - When running hybrid assemblies or mixing short read types it is advised to **avoid not required REGEX** and write the full file path, using only the required REGEX for paired end reads when applicable. So that the pipeline does not load any different read that also matches the REGEX and avoid confusions with the inputs. - -Running with a configuration file -""""""""""""""""""""""""""""""""" - -.. code-block:: bash - - ./nextflow run fmalmeida/bacannot -c bacannot.config diff --git a/docs/index.rst b/docs/index.rst index 517634d8..ccd3cb18 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,9 +9,7 @@ Bacannot ======== -Designed to provide an easy-to-use framework for performing a comprehensive annotation on -prokaryotic genomes, `bacannot `_ is developed with `Nextflow `_ -and `Docker `_. It can annotate resistance genes, virulence factors, genomic islands, prophages, methylation and more. +Designed to provide an easy-to-use framework for performing a comprehensive annotation on prokaryotic genomes, `bacannot `_ is developed with `Nextflow `_ and `Docker `_. It can annotate resistance genes, virulence factors, genomic islands, prophages, methylation and more. 
Its main steps are: @@ -81,13 +79,11 @@ Its main steps are: installation quickstart - inputs + samplesheet outputs manual config custom-db - samplesheet - examples Support Contact =============== diff --git a/docs/inputs.rst b/docs/inputs.rst deleted file mode 100644 index 338ade9b..00000000 --- a/docs/inputs.rst +++ /dev/null @@ -1,36 +0,0 @@ -.. _inputs: - -Input files -=========== - -Required -^^^^^^^^ - -To execute the annotation pipeline users **must** provide genomic data as either raw reads or assembled genomes as input. When raw reads are used, Unicycler and Flye -assemblers are used to create, respectively, shortreads-only and hybrid assemblies, or longreads-only assemblies for the annotation process. Which means, the minimum -required input files are: - -* An assembled genome in FASTA format, **or**; -* Raw sequencing reads. - -.. note:: - - Users can analyse more than one genome at once by properly configuring a samplesheet - and using it with the ``--in_yaml`` parameter. See :ref:`samplesheet`. - -Optional -^^^^^^^^ - -The pipeline accepts as input two other input files types that are used to perform additional annotation processes, they are: - -* path to a directory of FAST5 and path to ONT fastq - - * This data will be used for the methylation calling process - -* path to custom **nucleotide** databases as described in :ref:`custom-db` - - * These custom databases will be used to perform additional annotation processes using BLASTn - -.. note:: - - Users must must carefully read the documentation in order to better understand the details of the pipeline workflow customization. diff --git a/docs/installation.rst b/docs/installation.rst index cd13292f..bfdeb728 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -36,15 +36,15 @@ This pipeline requires only `Docker `_ (and its Docker .. 
code-block:: bash docker pull fmalmeida/bacannot:main_tools ; # this is the core of the main image - docker pull fmalmeida/bacannot:v2.4 ; + docker pull fmalmeida/bacannot:v3.0 ; docker pull fmalmeida/bacannot:kofamscan ; docker pull fmalmeida/bacannot:antismash ; docker pull fmalmeida/bacannot:jbrowse ; - docker pull fmalmeida/bacannot:v2.4_renv ; + docker pull fmalmeida/bacannot:v3.0_renv ; .. tip:: - If the download of ``fmalmeida/bacannot:v2.4`` image keeps hanging due to its size, download the ``fmalmeida/bacannot:main_tools`` first. It is the core of the versioned tag and it will help on the download by creating some cache. Also, remember to always keep your Docker images up to date (Docker pull will always download the latest) + If the download of ``fmalmeida/bacannot:v3.0`` image keeps hanging due to its size, download the ``fmalmeida/bacannot:main_tools`` first. It is the core of the versioned tag and it will help on the download by creating some cache. Also, remember to always keep your Docker images up to date (Docker pull will always download the latest) 6. (Optional) Install nf-core utility @@ -52,12 +52,6 @@ This pipeline requires only `Docker `_ (and its Docker pip install nf-core>=1.10 -7. (Optional) Docker image for using raw reads as input - - .. code-block:: bash - - docker pull fmalmeida/mpgap - .. note:: The pipeline requires a UNIX system, therefore, Windows users may successfully use this pipeline via the `Linux subsystem for windows `_. diff --git a/docs/manual.rst b/docs/manual.rst index 6df87349..b2826cf6 100644 --- a/docs/manual.rst +++ b/docs/manual.rst @@ -9,69 +9,34 @@ Manual nextflow run fmalmeida/bacannot --help Parameters description -^^^^^^^^^^^^^^^^^^^^^^ +---------------------- -Input files (single genome analysis) -"""""""""""""""""""""""""""""""""""" +Input files +""""""""""" -.. note:: - - These parameters must only be used when annotating a single genome. 
If running the pipeline with more than 1 input - genomes users must set them in the samplesheet YAML file as described in :ref:`samplesheet`. - -.. note:: - - Remember to always write input paths inside double quotes. - -.. note:: - - When using paired end reads it is required that input reads are set with the "{1,2}"" pattern. For example: "SRR6307304_{1,2}.fastq". This will properly load reads "SRR6307304_1.fastq" and "SRR6307304_2.fastq" +Required +^^^^^^^^ -.. warning:: - - When running hybrid assemblies or mixing short read types it is advised to **avoid not required REGEX** and write the full file path, using only the required REGEX for paired end reads when applicable. So that the pipeline does not load any different read that also matches the REGEX and avoid confusions with the inputs. +To execute the annotation pipeline users **must** provide genomic data as either raw reads or assembled genomes as input. When raw reads are used, Unicycler and Flye assemblers are used to create, respectively, shortreads-only and hybrid assemblies, or longreads-only assemblies for the annotation process. Which means, the minimum required input files are: -.. list-table:: - :widths: 20 10 20 30 - :header-rows: 1 +* An assembled genome in FASTA format, **or**; +* Raw sequencing reads. - * - Arguments - - Required - - Default value - - Description +Optional +^^^^^^^^ - * - ``--genome`` - - Y (if raw reads are not used) - - NA - - Genome(s) to be annotated in FASTA file. Mutually exclusively with the use of raw reads. +The pipeline accepts as input two other input files types that are used to perform additional annotation processes, they are: - * - ``--prefix`` - - Y - - out - - This sets the prefix to be used when writing results +* path to a directory of FAST5 - * - ``--sreads_single`` - - N (Y if assembled genome is not used) - - NA - - Path to short unpaired reads. E.g. "SRR*.fastq.gz" + * Then used together with nanopore reads it will call DNA methylation with Nanopolish. 
- * - ``--sreads_paired`` - - N (Y if assembled genome is not used) - - NA - - Path to short paired reads. E.g. "SRR6307304_{1,2}.fastq" +* path to custom **nucleotide** databases as described in :ref:`custom-db` - * - ``--lreads`` - - N (Y if assembled genome is not used) - - NA - - Path to longreads (ONT or Pacbio) - - * - ``--lreads_type`` - - N (Y if longreads are used) - - NA - - Longreads are used? If so, from which technology it is? Options: [ 'nanopore' or 'pacbio' ] + * These custom databases (``--custom_db``) will be used to perform additional annotation processes using BLASTn -Input files (multiple genome analysis) -"""""""""""""""""""""""""""""""""""""" +Input samplesheet +^^^^^^^^^^^^^^^^^ .. list-table:: :widths: 20 10 20 25 @@ -82,10 +47,14 @@ Input files (multiple genome analysis) - Default value - Description - * - ``--in_yaml`` + * - ``--input`` - Y - NA - - Input samplesheet in YAML format. Used when analysis is to be performed with multiple genomes at once. It is incompatible with the parameters for single genome analysis. + - Input samplesheet describing all the samples to be analysed. + +.. note:: + + Please read the :ref:`samplesheet manual page` to better understand the samplesheet format. Output directory """""""""""""""" @@ -99,11 +68,10 @@ Output directory - Default value - Description - * - ``--outdir`` + * - ``--output`` - Y - - output - - Name of directory to store output values. A sub-directory for each - genome will be created inside this main directory. + - outdir + - Name of directory to store output values. A sub-directory for each genome will be created inside this main directory. Max job request """"""""""""""" @@ -159,8 +127,7 @@ Resfinder annotation .. note:: - This parameter must only be used when annotating a single genome. If running the pipeline in multi-sample mode, - users must set it inside the samplesheet YAML file as described in :ref:`samplesheet`. + Sets a default value for input samples. 
If a sample has a different value given inside the samplesheet, the pipeline will use, for that sample, the value found inside the :ref:`samplesheet`. .. warning:: @@ -307,33 +274,6 @@ Annotation thresholds - 0 - Coverage (%) threshold to be used when annotating with user's custom databases -Methylation call -"""""""""""""""" - -.. note:: - - This parameter must only be used when annotating a single genome. If running the pipeline with more than 1 input - genomes users must set it in the samplesheet YAML file as described in :ref:`samplesheet`. - -.. list-table:: - :widths: 20 10 20 30 - :header-rows: 1 - - * - Arguments - - Required - - Default value - - Description - - * - ``--nanopolish_fast5`` - - N - - NA - - Path to directory containing fast5 files to be used to call methylation. If null, the analysis will be skipped - - * - ``--nanopolish_fastq`` - - N - - NA - - Path to fastq reads (related to fast5 files) that will be used to call methylation. If null, the analysis will be skipped - Merge distance """""""""""""" diff --git a/docs/outputs.rst b/docs/outputs.rst index 052251a2..161f06dc 100644 --- a/docs/outputs.rst +++ b/docs/outputs.rst @@ -7,12 +7,7 @@ Here, using the results produced in the :ref:`quickstart` section, we give users .. note:: - Please take note that the pipeline uses the directory set with the ``--outdir`` parameter as a storage place in which it will create a folder named as the - ``--prefix`` parameter. This ``{prefix}`` folder will contain all the results. - - Therefore the the same ``--outdir`` can be used for different annotations - as each one of them will have a different sub-folder. This is useful and required for the genomic comparative pipeline (that is under construction) that will - use this folder as input, and enable the user to rapidly compare the results between the samples under the same ``--outdir`` folder. 
+ Please take note that the pipeline uses the directory set with the ``--outdir`` parameter as a storage place in which it will create a folder for each sample using its ``id``. Therefore the the same ``--outdir`` can be used for different annotations. Directory tree -------------- diff --git a/docs/quickstart.rst b/docs/quickstart.rst index c97c55db..f78a969e 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -14,6 +14,22 @@ Download the data wget -O ecoli_ref.fna.gz https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/008/865/GCF_000008865.2_ASM886v2/GCF_000008865.2_ASM886v2_genomic.fna.gz gzip -d ecoli_ref.fna.gz +Prepare a samplesheet +--------------------- + +After downloading the genome, we must create a samplesheet for the input data as described in the :ref:`samplesheet manual page`. A proper formated file for this data would look like that: + +.. code-block:: yaml + + samplesheet: + - id: ecoli + assembly: ecoli_ref.fna + resfinder: Escherichia coli + +.. note:: + + Download this file and save it as ``bacannot_samplesheet.yaml``. + Run the pipeline ---------------- @@ -23,13 +39,29 @@ For examplification purposes and to get a major overview we will execute the pip # Run the pipeline using the Escherichia coli resfinder database nextflow run fmalmeida/bacannot \ - --prefix ecoli \ - --genome ecoli_ref.fna \ - --outdir _ANNOTATION \ - --threads 4 \ - --resfinder_species "Escherichia coli" + --input bacannot_samplesheet.yaml \ + --output _ANNOTATION \ + --threads 10 + +.. note:: + + The resfinder species could also be selected via the command line with ``--resfinder_species``. Please, read more about it at :ref:`manual` and :ref:`samplesheet`. Outputs ------- A glimpse over the main outputs produced by bacannot is given at :ref:`outputs` section. 
+ +Testing more workflows +---------------------- + +Moreover, we have also made available a few example datasets in the pipeline so users can test all capabilities at once, from assembling raw reads to annotating genomes. To test it users must run: + +.. code-block:: bash + + # Run the pipeline using the provided test datasets + nextflow run fmalmeida/bacannot --profile test --threads 10 + +.. note:: + + Unfortunately, due to file sizes, we could not provide fast5 files for users to check on the methylation step. \ No newline at end of file diff --git a/docs/samplesheet.rst b/docs/samplesheet.rst index 5fcb05e6..4ebfe8a3 100644 --- a/docs/samplesheet.rst +++ b/docs/samplesheet.rst @@ -1,26 +1,16 @@ .. _samplesheet: -Samplesheet configuration (for multi-genome analysis) -===================================================== +Samplesheet (input files) +========================= -The samplesheet is a YAML document that is used to describe the input samples. It is required when the user -wants to annotate more than one genome at once saving them at the same output directory. This execution is -triggered by the ``--in_yaml`` parameter and it is incompatible with all the parameters used for single -genome analysis (shown below): +The samplesheet is a required YAML document that is used to describe the input samples and, if desired, its "sample-specific" configuration. The input samplesheet is given using the ``--input`` parameter. -The use of a samplesheet is **incompatible** with: +.. 
tip:: -+ ``--genome`` -+ ``--sreads_paired`` -+ ``--sreads_single`` -+ ``--lreads`` -+ ``--lreads_type`` -+ ``--resfinder_species`` -+ ``--nanopolish_fast5`` -+ ``--nanopolish_fastq`` + A samplesheet template can be downloaded with: ``nextflow run fmalmeida/bacannot --get_samplesheet`` Samplesheet header -"""""""""""""""""" +------------------ The first line of the file must be the header followed by an indentation: @@ -30,7 +20,7 @@ The first line of the file must be the header followed by an indentation: - ...: Sample identification -""""""""""""""""""""" +--------------------- Each sample must be identified by the tag *id* in the YAML file, followed by the input input tags that shall be used by the pipeline: @@ -45,11 +35,20 @@ be used by the pipeline: ...: ...: -Input tags -"""""""""" +Input tags (keys) +----------------- -Input tags are the tags that are used to represent/set the inputs that shall be used for each sample that -will be analysed. The available tags are: +Input tags are are used to represent/set the inputs that shall be used for each input sample. By default, for resfinder species panel, if it is not set inside the samplesheet, the pipeline will use the configurations set via the "nextflow config file" or via the command line. Otherwise, if set inside the samplesheet, it will overwrite the pipeline’s configuration for that specific sample. + +.. note:: + + Whenever an assembled genome is given with ``assembly:`` the pipeline **will not** perform genome assembly even if reads are given. + + Users may use the ``assembly:`` tag together with ``nanopore:`` and ``fast5:`` tags, which will trigger methylation calling with Nanopolish + +Please, the :ref:`manual reference page` the global/defaults configurations. + +The available tags are: .. list-table:: :widths: 20 50 @@ -74,18 +73,26 @@ will be analysed. 
The available tags are: - Used to set path to nanopore raw FAST5 data (used in conjunction with ``nanopore`` for calling methylation with Nanopolish) * - ``resfinder`` - - Used to set resfinder species database for resistance annotation with resfinder (must be exactly as shown in `their web page `_). If your species is not available in Resfinder panels, you may use it with the "Other" panel + - Used to set resfinder species database for resistance annotation with resfinder (must be exactly as shown in `their web page `_). If your species is not available in Resfinder panels, you may use it with the "Other" panel. Possible to set with ``--resfinder_species``, please read the :ref:`manual page`. .. note:: - The illumina tag is the only one that **must** be set in indented newlines (one line per read) as shown in the complete samplesheet example. The order - of the reads in these newlines must be Pair1; Pair2; Unpaired (Whenever they are used) -- Check samples 1, 4 and 5 to understand. + Note for the illumina tag/key. + + * When using both paired and unpaired reads, the paired reads must be given first, in the order\: pair 1, pair 2, unpaired. + * Otherwise, if using only paired reads, they must be given in the order\: pair 1, pair 2. + * If using only unpaired reads, only one entry is expected. Check samples in the template to 1, 4 and 5 to understand it. + * The illumina tag is the only one that **must** be set in indented newlines + * two white spaces relative to the + * one line per read as shown in the complete samplesheet example. + +.. warning:: - All the other input tags **must** be set in the same line, right after the separator (":"), without quotations. + All the other input tags **must** be set in the same line, right after the separator (":"), without quotations, white spaces or signs. Complete samplesheet example -"""""""""""""""""""""""""""" +---------------------------- .. 
code-block:: yaml @@ -95,12 +102,11 @@ Complete samplesheet example - sample_1/1.fastq - sample_1/2.fastq nanopore: sample_1/ont.fastq - resfinder: Escherichia coli - id: sample_2 assembly: sample_2/assembly.fasta nanopore: sample_2/ont.fastq fast5: sample_2/fast5_pass - resfinder: Klebsiella + resfinder: Klebsiella # this tells the pipeline a differente value for only this sample - id: sample_3 nanopore: sample_3/ont.fastq fast5: sample_3/fast5_pass diff --git a/example_samplesheet.yaml b/example_samplesheet.yaml index f761a9f8..71014d36 100644 --- a/example_samplesheet.yaml +++ b/example_samplesheet.yaml @@ -1,5 +1,4 @@ -# This is an exemplification of the accepted YAML syntax accepted by the -# bacannot pipeline in order to analyse multiple genomes at once. +# This is an exemplification of the accepted YAML syntax accepted by the bacannot pipeline # # The file must contain a header-line with the key "samplesheet:". All the # input samples must be given nested to this key. Obs: The header-line and @@ -22,6 +21,7 @@ # # nanopore: ont_nanopore.fastq (Oxford nanopore long reads) # fast5: path/to/fast5_pass (Path to dir containing ONT FAST5s for methylation calling) +# If using an already assembled genome, users may give the nanopore and fast5 data to call methylation # # pacbio: pacbio.fastq (Pacbio long reads) # @@ -48,15 +48,14 @@ samplesheet: - sample_1/1.fastq - sample_1/2.fastq nanopore: sample_1/ont.fastq - resfinder: Escherichia coli + resfinder: Escherichia coli # this tells the pipeline that this specific sample should be treated as E. 
coli despite any other default set - id: sample_2 assembly: sample_2/assembly.fasta nanopore: sample_2/ont.fastq - fast5: sample_2/fast5_pass - resfinder: Klebsiella + fast5: sample_2/fast5_pass # will call methylation on assembled genome - id: sample_3 nanopore: sample_3/ont.fastq - fast5: sample_3/fast5_pass + fast5: sample_3/fast5_pass # will also call methylation flye genome assembly - id: sample_4 pacbio: sample_4/pacbio.fastq illumina: diff --git a/main.nf b/main.nf index bf3bb705..aa8d44ff 100644 --- a/main.nf +++ b/main.nf @@ -10,10 +10,7 @@ import org.yaml.snakeyaml.Yaml * Include functions */ include { helpMessage } from './nf_functions/help.nf' -include { exampleMessage } from './nf_functions/examples.nf' include { logMessage } from './nf_functions/log.nf' -include { write_csv } from './nf_functions/writeCSV.nf' -include { parse_csv } from './nf_functions/parseCSV.nf' include { paramsCheck } from './nf_functions/paramsCheck.nf' @@ -28,21 +25,13 @@ params.help = false exit 0 } -// CLI examples -params.examples = false - // Show help emssage - if (params.examples){ - exampleMessage() - exit 0 -} - /* * Does the user wants to download the configuration file? */ params.get_config = false if (params.get_config) { - new File("bacannot.config").write(new URL ("https://github.com/fmalmeida/bacannot/raw/master/configuration_template/bacannot.config").getText()) + new File("bacannot.config").write(new URL ("https://github.com/fmalmeida/bacannot/raw/master/nextflow.config").getText()) println "" println "bacannot.config file saved in working directory" println "After configuration, run:" @@ -51,70 +40,64 @@ if (params.get_config) { exit 0 } +/* + * Does the user wants to download the YAML samplesheet file? 
+ */ + +params.get_samplesheet = false +if (params.get_samplesheet) { + new File("bacannot_samplesheet.yaml").write(new URL ("https://github.com/fmalmeida/bacannot/raw/master/example_samplesheet.yaml").getText()) + println "" + println "bacannot_samplesheet.yaml file saved in working directory" + println "After configuration, run:" + println "nextflow run fmalmeida/bacannot --input bacannot_samplesheet.yaml" + println "Nice code!\n" + exit 0 +} + /* * Load general parameters and establish defaults */ // General parameters -params.outdir = 'outdir' -params.threads = 2 +params.output = 'outdir' +params.threads = 2 params.bedtools_merge_distance = '' // Input parameters -params.in_yaml = '' -params.prefix = '' -params.genome = '' -params.sreads_single = '' -params.sreads_paired = '' -params.lreads = '' -params.lreads_type = '' +params.input = '' // Prokka parameters -params.prokka_kingdom = '' +params.prokka_kingdom = '' params.prokka_genetic_code = false -params.prokka_use_rnammer = false +params.prokka_use_rnammer = false // User custom db -params.custom_db = '' -params.blast_custom_minid = 0 +params.custom_db = '' +params.blast_custom_minid = 0 params.blast_custom_mincov = 0 // Resfinder parameters params.resfinder_species = '' // Blast parameters -params.plasmids_minid = 90 -params.plasmids_mincov = 60 -params.blast_virulence_minid = 90 -params.blast_virulence_mincov = 80 -params.blast_resistance_minid = 90 +params.plasmids_minid = 90 +params.plasmids_mincov = 60 +params.blast_virulence_minid = 90 +params.blast_virulence_mincov = 80 +params.blast_resistance_minid = 90 params.blast_resistance_mincov = 80 -params.blast_MGEs_minid = 65 -params.blast_MGEs_mincov = 65 -// Nanopolish -params.nanopolish_fast5 = '' -params.nanopolish_fastq = '' +params.blast_MGEs_minid = 65 +params.blast_MGEs_mincov = 65 // Workflow parameters -params.skip_plasmid_search = false -params.skip_virulence_search = false +params.skip_plasmid_search = false +params.skip_virulence_search = false 
params.skip_resistance_search = false -params.skip_iceberg_search = false -params.skip_prophage_search = false -params.skip_kofamscan = false -params.skip_antismash = false +params.skip_iceberg_search = false +params.skip_prophage_search = false +params.skip_kofamscan = false +params.skip_antismash = false /* * Define log message */ logMessage() -/* - * Include modules (Execution setup) - */ - -// Unicycler assembly -include { unicycler } from './modules/assembly/unicycler.nf' params(outdir: params.outdir, - threads: params.threads, prefix: params.prefix) - -// Flye assembly -include { flye } from './modules/assembly/flye.nf' params(outdir: params.outdir, - threads: params.threads, prefix: params.prefix, lreads_type: params.lreads_type) - /* * Define custom workflows */ @@ -123,10 +106,7 @@ include { flye } from './modules/assembly/flye.nf' params(outdir: params.outdir, include { parse_samplesheet } from './workflows/parse_samples.nf' // Bacannot pipeline for multiple genomes -include { SINGLE_SAMPLE } from './workflows/simple_workflow.nf' - -// Bacannot pipeline for multiple genomes -include { MULTIPLE_SAMPLE } from './workflows/batch_workflow.nf' +include { BACANNOT } from './workflows/bacannot.nf' /* @@ -135,50 +115,37 @@ include { MULTIPLE_SAMPLE } from './workflows/batch_workflow.nf' workflow { - if (params.in_yaml) { + if (params.input) { - parameter_yaml = file(params.in_yaml).readLines().join("\n") + // Load yaml + samplesheet_yaml = file(params.input) + parameter_yaml = samplesheet_yaml.readLines().join("\n") new Yaml().load(parameter_yaml).each { k, v -> params[k] = v } - // Read YAML file - parse_samplesheet(params.samplesheet) + // Copy YAML samplesheet to output directory so user has a copy of it + file(params.output).mkdir() + samplesheet_yaml.copyTo(params.output + "/" + "${samplesheet_yaml.getName()}") - // Convert it to CSV for usability - samples_ch = write_csv(parse_samplesheet.out) + // Parse YAML file + parse_samplesheet(params.samplesheet) // 
Run annotation - MULTIPLE_SAMPLE(samples_ch, (params.custom_db) ? Channel.fromPath( params.custom_db.split(',').collect{ it } ) : Channel.empty()) + BACANNOT( + parse_samplesheet.out, + (params.custom_db) ? Channel.fromPath( params.custom_db.split(',').collect{ it } ) : Channel.empty() + ) } else { - if (params.genome) { - - // User have an assembled genome - SINGLE_SAMPLE(Channel.fromPath(params.genome), - (params.nanopolish_fast5 && params.nanopolish_fastq) ? Channel.fromPath( params.nanopolish_fast5 ) : Channel.empty(), - (params.nanopolish_fast5 && params.nanopolish_fastq) ? Channel.fromPath( params.nanopolish_fastq ) : Channel.empty(), - (params.custom_db) ? Channel.fromPath( params.custom_db.split(',').collect{ it } ) : Channel.empty()) - - } else if (params.sreads_single || params.sreads_paired) { - - // User have illumina reads (so it goes to unicycler) - unicycler((params.sreads_paired) ? Channel.fromFilePairs( params.sreads_paired, flat: true, size: 2 ) : Channel.value(['', '', '']), - (params.sreads_single) ? Channel.fromPath( params.sreads_single ) : Channel.value(''), - (params.lreads) ? Channel.fromPath( params.lreads ) : Channel.value('')) - SINGLE_SAMPLE(unicycler.out[1], - (params.nanopolish_fast5 && params.nanopolish_fastq) ? Channel.fromPath( params.nanopolish_fast5 ) : Channel.empty(), - (params.nanopolish_fast5 && params.nanopolish_fastq) ? Channel.fromPath( params.nanopolish_fastq ) : Channel.empty(), - (params.custom_db) ? Channel.fromPath( params.custom_db.split(',').collect{ it } ) : Channel.empty()) - - } else if ((params.lreads && params.lreads_type) && (!params.sreads_paired && !params.sreads_single)) { - - // User does not have illumina reads (so it goes to flye) - flye(Channel.fromPath( params.lreads )) - SINGLE_SAMPLE(flye.out[1], - (params.nanopolish_fast5 && params.nanopolish_fastq) ? Channel.fromPath( params.nanopolish_fast5 ) : Channel.empty(), - (params.nanopolish_fast5 && params.nanopolish_fastq) ? 
Channel.fromPath( params.nanopolish_fastq ) : Channel.empty(), - (params.custom_db) ? Channel.fromPath( params.custom_db.split(',').collect{ it } ) : Channel.empty()) - } + // Message to user + println(""" + ERROR! + A major error has occurred! + ==> A samplesheet has not been provided. Please, provide a samplesheet to run the analysis. Online documentation is available at: https://bacannot.readthedocs.io/en/latest/ + Please, read the docs. + Cheers. + """) + } } diff --git a/markdown/CHANGELOG.md b/markdown/CHANGELOG.md index b9c0700d..229354b8 100644 --- a/markdown/CHANGELOG.md +++ b/markdown/CHANGELOG.md @@ -2,6 +2,36 @@ The tracking for changes started in v2.1 +## v3.0 + +### input configuration + +* In order to keeps things the least complex possible and to make the pipeline less confusing, the pipeline has been reconfigured in order to properly use it, in all workflow types (for multiple samples at once or just one) through the samplesheet. + + Because of that, we removed the possibility to pass the input reads via the command line and now, the files input data files, must always be set inside the samplesheet, even if analysing only one sample. + + Read more at: https://bacannot.readthedocs.io/en/latest/samplesheet.html + + Check the template samplesheet at: https://github.com/fmalmeida/bacannot/blob/master/example_samplesheet.yaml + + The samplesheet is given with the parameter `--input` +* Due to the implementation above, the folowing parameters are now deprecated, since they are now set inside the YAML file: + + `--genome` + + `--sreads_paired` + + `--sreads_single` + + `--lreads` + + `--lreads_type` + + `--nanopolish_fast5` + + `--nanopolish_fastq` +* The `--resfinder_species` parameter keeps existing. It now sets a default for all samples in the samplesheet. However, when a sample has another value for that set with the key `resfinder`, the pipeline will use, for that specific sample, the value found inside the samplesheet. 
+ +### nomenclature change + +* In order to make it simple and natural, two changes ocurred in input/output parameters + + The `--outdir` parameter is now `--output` + + The `--in_yaml` parameter is now `--input` + +### comments + +* Since this changes are somewhat major changes, the pipeline main version has changed and it is now in v3.0 + + The docker image is now `fmalmeida/bacannot:v3.0` and `fmalmeida/bacannot:v3.0_renv` + ## v2.4.2 Changed how `tag` directives are used inside the pipeline. Now, instead of showing information about the process, it shows which sample is being processed, which is more useful to users. diff --git a/modules/KOs/kegg-decoder.nf b/modules/KOs/kegg-decoder.nf index f750db7e..1fe8c2f2 100644 --- a/modules/KOs/kegg-decoder.nf +++ b/modules/KOs/kegg-decoder.nf @@ -1,5 +1,5 @@ process kegg_decoder { - publishDir "${params.outdir}/${prefix}/KOfamscan", mode: 'copy' + publishDir "${params.output}/${prefix}/KOfamscan", mode: 'copy' tag "${prefix}" label 'kofam' diff --git a/modules/KOs/kofamscan.nf b/modules/KOs/kofamscan.nf index 4f4f9e4d..f01c9772 100644 --- a/modules/KOs/kofamscan.nf +++ b/modules/KOs/kofamscan.nf @@ -1,5 +1,5 @@ process kofamscan { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" else "$filename" } diff --git a/modules/MGEs/digIS.nf b/modules/MGEs/digIS.nf index f3f51aaf..7140c62e 100644 --- a/modules/MGEs/digIS.nf +++ b/modules/MGEs/digIS.nf @@ -1,5 +1,5 @@ process digis { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" else if (filename == "${prefix}.gff") null else "$filename" diff --git a/modules/MGEs/draw_gis.nf b/modules/MGEs/draw_gis.nf index 
b5b71ec7..c518a383 100644 --- a/modules/MGEs/draw_gis.nf +++ b/modules/MGEs/draw_gis.nf @@ -1,5 +1,5 @@ process draw_GIs { - publishDir "${params.outdir}/${prefix}/genomic_islands", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}/genomic_islands", mode: 'copy', saveAs: { filename -> if (filename == "plots") "$filename" else null } diff --git a/modules/MGEs/iceberg.nf b/modules/MGEs/iceberg.nf index 0de139d0..f1ff1697 100644 --- a/modules/MGEs/iceberg.nf +++ b/modules/MGEs/iceberg.nf @@ -1,5 +1,5 @@ process iceberg { - publishDir "${params.outdir}/${prefix}/ICEs", mode: 'copy' + publishDir "${params.output}/${prefix}/ICEs", mode: 'copy' tag "${prefix}" label 'main' diff --git a/modules/MGEs/islandPath_DIMOB.nf b/modules/MGEs/islandPath_DIMOB.nf index 628533c9..7dfacc20 100644 --- a/modules/MGEs/islandPath_DIMOB.nf +++ b/modules/MGEs/islandPath_DIMOB.nf @@ -1,5 +1,5 @@ process find_GIs { - publishDir "${params.outdir}/${prefix}/genomic_islands", mode: 'copy' + publishDir "${params.output}/${prefix}/genomic_islands", mode: 'copy' errorStrategy 'retry' maxRetries 5 tag "${prefix}" diff --git a/modules/MGEs/plasmidfinder.nf b/modules/MGEs/plasmidfinder.nf index 0805f4a3..d81702d8 100644 --- a/modules/MGEs/plasmidfinder.nf +++ b/modules/MGEs/plasmidfinder.nf @@ -1,5 +1,5 @@ process plasmidfinder { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename == "plasmidfinder") "plasmids/$filename" else null } diff --git a/modules/MGEs/platon.nf b/modules/MGEs/platon.nf index cc4812e9..20ef8870 100644 --- a/modules/MGEs/platon.nf +++ b/modules/MGEs/platon.nf @@ -1,5 +1,5 @@ process platon { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" else if (filename == 
"platon") "plasmids/$filename" else null diff --git a/modules/assembly/flye.nf b/modules/assembly/flye.nf index 459087a5..e6e63a9d 100644 --- a/modules/assembly/flye.nf +++ b/modules/assembly/flye.nf @@ -1,26 +1,24 @@ process flye { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" else if (filename == "flye_${prefix}") "assembly" else null } - label 'assembly' + label 'flye' tag "${prefix}" input: - file(lreads) + tuple val(prefix), val(entrypoint), file(sread1), file(sread2), file(sreads), file(lreads), val(lr_type), file(fast5), val(assembly), val(resfinder_species) output: - file "flye_${prefix}" - file("flye_${prefix}.fasta") + file "flye_${prefix}" // Saves all files + // Keep tuple structure to mixing channels + tuple val("${prefix}"), val("${entrypoint}"), val("${sread1}"), val("${sread2}"), val("${sreads}"), file("${lreads}"), val("${lr_type}"), file("${fast5}"), file("flye_${prefix}.fasta"), val("${resfinder_species}") file('flye_version.txt') script: - lr = (params.lreads_type == 'nanopore') ? '--nano-raw' : '--pacbio-raw' - prefix = params.prefix + lr = (lr_type == 'nanopore') ? 
'--nano-raw' : '--pacbio-raw' """ - source activate flye ; - # Save flye version flye -v > flye_version.txt ; diff --git a/modules/assembly/flye_batch.nf b/modules/assembly/flye_batch.nf deleted file mode 100644 index 048be78b..00000000 --- a/modules/assembly/flye_batch.nf +++ /dev/null @@ -1,33 +0,0 @@ -process flye { - publishDir "${params.outdir}/${id}", mode: 'copy', saveAs: { filename -> - if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" - else if (filename == "flye_${id}") "assembly" - else null - } - label 'assembly' - tag "${id}" - - input: - tuple val(id), val(entrypoint), file(sread1), file(sread2), file(sreads), file(lreads), val(lr_type), file(fast5), val(assembly), val(resfinder_species) - - output: - file "flye_${id}" // Saves all files - // Keep tuple structure to mixing channels - tuple val("${id}"), val("${entrypoint}"), val("${sread1}"), val("${sread2}"), val("${sreads}"), file("${lreads}"), val("${lr_type}"), file("${fast5}"), file("flye_${id}.fasta"), val("${resfinder_species}") - file('flye_version.txt') - - script: - lr = (lr_type == 'nanopore') ? 
'--nano-raw' : '--pacbio-raw' - """ - source activate flye ; - - # Save flye version - flye -v > flye_version.txt ; - - # Run flye - flye ${lr} $lreads --plasmids --out-dir flye_${id} --threads ${params.threads} &> flye.log ; - - # Save a copy for annotation - cp flye_${id}/assembly.fasta flye_${id}.fasta - """ -} diff --git a/modules/assembly/unicycler.nf b/modules/assembly/unicycler.nf index 9ffb55f3..1fdca3d2 100644 --- a/modules/assembly/unicycler.nf +++ b/modules/assembly/unicycler.nf @@ -1,27 +1,26 @@ process unicycler { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" else if (filename == "unicycler_${prefix}") "assembly" else null } - label 'assembly' + label 'unicycler' tag "${prefix}" input: - tuple val(id), file(sread1), file(sread2) - file(sreads) - file(lreads) + tuple val(prefix), val(entrypoint), file(sread1), file(sread2), file(sreads), file(lreads), val(lr_type), file(fast5), val(assembly), val(resfinder_species) output: - file "unicycler_${prefix}/*" - file("unicycler_${prefix}.fasta") + file "unicycler_${prefix}" // Save everything + // Keep tuple structure to mixing channels + tuple val("${prefix}"), val("${entrypoint}"), val("${sread1}"), val("${sread2}"), val("${sreads}"), file("${lreads}"), val("${lr_type}"), file("${fast5}"), file("unicycler_${prefix}.fasta"), val("${resfinder_species}") file('unicycler_version.txt') - + script: unpaired_param = (sreads.getName() != "input.3") ? "-s $sreads" : "" paired_param = (sread1.getName() != "input.1" && sread2.getName() != "input.2") ? "-1 $sread1 -2 $sread2" : "" lr_param = (lreads.getName() != "input.4") ? 
"-l $lreads" : "" - prefix = params.prefix + """ # Save unicycler version unicycler --version > unicycler_version.txt diff --git a/modules/assembly/unicycler_batch.nf b/modules/assembly/unicycler_batch.nf deleted file mode 100644 index 1af6c6af..00000000 --- a/modules/assembly/unicycler_batch.nf +++ /dev/null @@ -1,34 +0,0 @@ -process unicycler { - publishDir "${params.outdir}/${id}", mode: 'copy', saveAs: { filename -> - if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" - else if (filename == "unicycler_${id}") "assembly" - else null - } - label 'assembly' - tag "${id}" - - input: - tuple val(id), val(entrypoint), file(sread1), file(sread2), file(sreads), file(lreads), val(lr_type), file(fast5), val(assembly), val(resfinder_species) - - output: - file "unicycler_${id}" // Save everything - // Keep tuple structure to mixing channels - tuple val("${id}"), val("${entrypoint}"), val("${sread1}"), val("${sread2}"), val("${sreads}"), file("${lreads}"), val("${lr_type}"), file("${fast5}"), file("unicycler_${id}.fasta"), val("${resfinder_species}") - file('unicycler_version.txt') - - script: - unpaired_param = (sreads.getName() != "input.3") ? "-s $sreads" : "" - paired_param = (sread1.getName() != "input.1" && sread2.getName() != "input.2") ? "-1 $sread1 -2 $sread2" : "" - lr_param = (lreads.getName() != "input.4") ? 
"-l $lreads" : "" - - """ - # Save unicycler version - unicycler --version > unicycler_version.txt - - # Run unicycler - unicycler $paired_param $unpaired_param $lr_param -o unicycler_${id} -t ${params.threads} &> unicycler.log - - # Save copy for annotation - cp unicycler_${id}/assembly.fasta unicycler_${id}.fasta - """ -} diff --git a/modules/generic/antismash.nf b/modules/generic/antismash.nf index d812a3da..f74dee4d 100644 --- a/modules/generic/antismash.nf +++ b/modules/generic/antismash.nf @@ -1,5 +1,5 @@ process antismash { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" else "$filename" } diff --git a/modules/generic/barrnap.nf b/modules/generic/barrnap.nf index 5403bfbb..a6952028 100644 --- a/modules/generic/barrnap.nf +++ b/modules/generic/barrnap.nf @@ -1,5 +1,5 @@ process barrnap { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" else "rRNA/$filename" } diff --git a/modules/generic/custom_blast.nf b/modules/generic/custom_blast.nf index 99593a22..ca332c24 100644 --- a/modules/generic/custom_blast.nf +++ b/modules/generic/custom_blast.nf @@ -1,5 +1,5 @@ process custom_blast { - publishDir "${params.outdir}/${prefix}/custom_annotations/${customDB.baseName}", mode: 'copy' + publishDir "${params.output}/${prefix}/custom_annotations/${customDB.baseName}", mode: 'copy' tag "${prefix}" label 'main' @@ -9,7 +9,7 @@ process custom_blast { output: // Outputs must be linked to each prefix (tag) - tuple val(prefix), val("${customDB.baseName}"), file("${prefix}_${customDB.baseName}_blastn.txt"), file("${prefix}_${customDB.baseName}_blastn.gff") + tuple val(prefix), val("${customDB.baseName}"), 
file("${prefix}_${customDB.baseName}_blastn.summary.txt"), file("${prefix}_${customDB.baseName}_blastn.gff") file('*.txt') // Grab all script: diff --git a/modules/generic/custom_blast_batch.nf b/modules/generic/custom_blast_batch.nf deleted file mode 100644 index 0c482486..00000000 --- a/modules/generic/custom_blast_batch.nf +++ /dev/null @@ -1,30 +0,0 @@ -process custom_blast { - publishDir "${params.outdir}/${prefix}/custom_annotations/${customDB.baseName}", mode: 'copy' - tag "${prefix}" - label 'main' - - input: - tuple val(prefix), file(gff), file(genome) - each file(customDB) - - output: - // Outputs must be linked to each prefix (tag) - tuple val(prefix), val("${customDB.baseName}"), file("${prefix}_${customDB.baseName}_blastn.summary.txt"), file("${prefix}_${customDB.baseName}_blastn.gff") - file('*.txt') // Grab all - - script: - """ - # Step 1 - Create blast db - makeblastdb -in $customDB -dbtype nucl -out customDB ; - - # Step 2 - Execute blastn - run_blasts.py blastn --query $genome --db customDB --minid ${params.blast_custom_minid} \ - --mincov ${params.blast_custom_mincov} --threads ${params.threads} --out ${prefix}_${customDB.baseName}_blastn.txt > ${prefix}_${customDB.baseName}_blastn.summary.txt ; - - # Step 3 - Get BED from blastn - awk '{print \$1 "\t" \$2 "\t" \$3}' ${prefix}_${customDB.baseName}_blastn.txt | tail -n +2 > ${prefix}_${customDB.baseName}_blastn.bed ; - - # Step 4 - Find intersection with annotation - bedtools intersect -wa -a $gff -b ${prefix}_${customDB.baseName}_blastn.bed > ${prefix}_${customDB.baseName}_blastn.gff ; - """ -} diff --git a/modules/generic/custom_blast_report.nf b/modules/generic/custom_blast_report.nf index b6f946b2..c1f07495 100644 --- a/modules/generic/custom_blast_report.nf +++ b/modules/generic/custom_blast_report.nf @@ -1,5 +1,5 @@ process custom_blast_report { - publishDir "${params.outdir}/${prefix}/report_files/custom_databases", mode: 'copy', saveAs: { filename -> + publishDir 
"${params.output}/${prefix}/report_files/custom_databases", mode: 'copy', saveAs: { filename -> if (filename.indexOf(".html") > 0) "report_${customDB}.html" else "$filename" } diff --git a/modules/generic/genome_mask.nf b/modules/generic/genome_mask.nf index 976dba38..9e7cc4ef 100644 --- a/modules/generic/genome_mask.nf +++ b/modules/generic/genome_mask.nf @@ -1,5 +1,5 @@ process masking_genome { - publishDir "${params.outdir}/${prefix}", mode: 'copy', + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: {filename -> //This line saves the files with specific sufixes in specific folders if (filename.indexOf(".gff") > 0 ) "gffs/$filename" diff --git a/modules/generic/gff2gbk.nf b/modules/generic/gff2gbk.nf index d1ed0c7e..71d90fcc 100644 --- a/modules/generic/gff2gbk.nf +++ b/modules/generic/gff2gbk.nf @@ -1,5 +1,5 @@ process gff2gbk { - publishDir "${params.outdir}/${prefix}/gbk", mode: 'copy' + publishDir "${params.output}/${prefix}/gbk", mode: 'copy' label 'main' tag "${prefix}" diff --git a/modules/generic/gff2sql.nf b/modules/generic/gff2sql.nf index 2acf6f8f..5e2572e5 100644 --- a/modules/generic/gff2sql.nf +++ b/modules/generic/gff2sql.nf @@ -1,5 +1,5 @@ process create_sql { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename.indexOf(".sqlite") > 0) "sqldb/$filename" else "$filename" } diff --git a/modules/generic/jbrowse.nf b/modules/generic/jbrowse.nf index ac91dc4a..ea1e9437 100644 --- a/modules/generic/jbrowse.nf +++ b/modules/generic/jbrowse.nf @@ -1,5 +1,5 @@ process jbrowse { - publishDir "${params.outdir}/${prefix}/jbrowse", mode: 'copy' + publishDir "${params.output}/${prefix}/jbrowse", mode: 'copy' label 'jbrowse' tag "${prefix}" diff --git a/modules/generic/mash.nf b/modules/generic/mash.nf index bad2f4c7..f8afea93 100644 --- a/modules/generic/mash.nf +++ b/modules/generic/mash.nf @@ -1,5 +1,5 @@ process refseq_masher { 
- publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" else "refseq_masher/$filename" } diff --git a/modules/generic/merge_annotations.nf b/modules/generic/merge_annotations.nf index d5e3c3a6..b514fcbc 100644 --- a/modules/generic/merge_annotations.nf +++ b/modules/generic/merge_annotations.nf @@ -1,5 +1,5 @@ process merge_annotations { - publishDir "${params.outdir}/${prefix}/gffs", mode: 'copy' + publishDir "${params.output}/${prefix}/gffs", mode: 'copy' label 'renv' tag "${prefix}" diff --git a/modules/generic/merge_gff.nf b/modules/generic/merge_gff.nf index efcd5ed2..e9f3a482 100644 --- a/modules/generic/merge_gff.nf +++ b/modules/generic/merge_gff.nf @@ -1,5 +1,5 @@ process gff_merge { - publishDir "${params.outdir}/${prefix}/gffs", mode: 'copy' + publishDir "${params.output}/${prefix}/gffs", mode: 'copy' label 'main' tag "${prefix}" diff --git a/modules/generic/methylation.nf b/modules/generic/methylation.nf index 18551a7a..0c50bcb9 100644 --- a/modules/generic/methylation.nf +++ b/modules/generic/methylation.nf @@ -1,5 +1,5 @@ process call_methylation { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" else "methylations/$filename" } @@ -7,9 +7,7 @@ process call_methylation { label 'main' input: - tuple val(prefix), file(draft) - file(fast5) - file(reads) + tuple val(prefix), file(draft), file(reads), file(fast5) output: // Grab all outputs @@ -19,6 +17,10 @@ process call_methylation { tuple val(prefix), file("chr.sizes") optional true file('nanopolish_version.txt') + when: + // When an entry does not exist, it is created as 'input' + if (fast5.getName() != 'input.5' && reads.getName() != 'input.4') // Names 
were set in assembly and prokka process + script: fast5_dir = fast5.getName() """ diff --git a/modules/generic/methylation_batch.nf b/modules/generic/methylation_batch.nf deleted file mode 100644 index 27e2ab5d..00000000 --- a/modules/generic/methylation_batch.nf +++ /dev/null @@ -1,50 +0,0 @@ -process call_methylation { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> - if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" - else "methylations/$filename" - } - tag "${prefix}" - label 'main' - - input: - tuple val(prefix), file(draft), file(reads), file(fast5) - - output: - // Grab all outputs - file "*_calls.tsv" optional true - file "*_frequency.tsv" optional true - tuple val(prefix), file("methylation_frequency.bedGraph") optional true - tuple val(prefix), file("chr.sizes") optional true - file('nanopolish_version.txt') - - when: - // When an entry does not exist, it is created as 'input' - if (fast5.getName() != 'input.5' && reads.getName() != 'input.4') // Names were set in assembly and prokka process - - script: - fast5_dir = fast5.getName() - """ - # Get tool version - nanopolish --version > nanopolish_version.txt ; - - # Index Our Fast5 Data - nanopolish index -d ${fast5_dir} ${reads} ; - - # Map Our Indexed Reads to Our Genome - minimap2 -a -x map-ont ${draft} ${reads} | samtools sort -T tmp -o reads_output.sorted.bam ; - samtools index reads_output.sorted.bam ; - - # Call Methylation - nanopolish call-methylation -r ${reads} -b reads_output.sorted.bam -g ${draft} -t ${params.threads} > methylation_call.tsv ; - - # Calculate Methylation Frequencies - /work/nanopolish/scripts/calculate_methylation_frequency.py methylation_call.tsv > methylation_frequency.tsv ; - - # Transform These TSV files into bedGraph - [ ! 
-s methylation_frequency.tsv ] || grep -v "start" methylation_frequency.tsv | \ - awk '{ print \$1 "\t" \$2 "\t" \$3 "\t" \$7 }' > methylation_frequency.bedGraph ; - - # Create Contig Sizes File - seqtk comp ${draft} | awk '{ print \$1 "\t" \$2 }' > chr.sizes - """ -} diff --git a/modules/generic/mlst.nf b/modules/generic/mlst.nf index 41cf5b70..61c24647 100644 --- a/modules/generic/mlst.nf +++ b/modules/generic/mlst.nf @@ -1,5 +1,5 @@ process mlst { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" else "MLST/$filename" } diff --git a/modules/generic/prokka.nf b/modules/generic/prokka.nf index 7f1fe104..cce5d054 100644 --- a/modules/generic/prokka.nf +++ b/modules/generic/prokka.nf @@ -1,5 +1,5 @@ process prokka { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" else if (filename == "annotation") "$filename" else null @@ -8,7 +8,7 @@ process prokka { label 'main' input: - file(assembly) + tuple val(prefix), val(entrypoint), file(sread1), file(sread2), file(sreads), file(lreads), val(lr_type), file(fast5), file(assembly), val(resfinder_species) output: // Grab all outputs @@ -19,11 +19,12 @@ process prokka { tuple val(prefix), file("annotation/${prefix}.fna") // renamed genome tuple val(prefix), file("annotation/${prefix}.faa") // gene aa sequences tuple val(prefix), file("annotation/${prefix}.ffn") // gene nt sequences + tuple val(prefix), file("annotation/${prefix}.fna"), file("${lreads}"), file("${fast5}") // For methylation calling + tuple val(prefix), file("annotation/${prefix}.fna"), val("${resfinder_species}") // For resfinder tuple val(prefix), file("annotation/${prefix}.txt") // prokka stats 
file('prokka_version.txt') // Save prokka version script: - prefix = "${params.prefix}" kingdom = (params.prokka_kingdom) ? "--kingdom ${params.prokka_kingdom}" : '' gcode = (params.prokka_genetic_code) ? "--gcode ${params.prokka_genetic_code}" : '' rnammer = (params.prokka_use_rnammer) ? "--rnammer" : '' diff --git a/modules/generic/prokka_batch.nf b/modules/generic/prokka_batch.nf deleted file mode 100644 index 7119071b..00000000 --- a/modules/generic/prokka_batch.nf +++ /dev/null @@ -1,43 +0,0 @@ -process prokka { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> - if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" - else if (filename == "annotation") "$filename" - else null - } - tag "${prefix}" - label 'main' - - input: - tuple val(prefix), val(entrypoint), file(sread1), file(sread2), file(sreads), file(lreads), val(lr_type), file(fast5), file(assembly), val(resfinder_species) - - output: - // Grab all outputs - file "annotation" - // Outputs must be linked to each prefix (tag) - tuple val(prefix), file("annotation/${prefix}.gff") // annotation in gff format - tuple val(prefix), file("annotation/${prefix}.gbk") // annotation in gbk format - tuple val(prefix), file("annotation/${prefix}.fna") // renamed genome - tuple val(prefix), file("annotation/${prefix}.faa") // gene aa sequences - tuple val(prefix), file("annotation/${prefix}.ffn") // gene nt sequences - tuple val(prefix), file("annotation/${prefix}.fna"), file("${lreads}"), file("${fast5}") // For methylation calling - tuple val(prefix), file("annotation/${prefix}.fna"), val("${resfinder_species}") // For resfinder - tuple val(prefix), file("annotation/${prefix}.txt") // prokka stats - file('prokka_version.txt') // Save prokka version - - script: - kingdom = (params.prokka_kingdom) ? "--kingdom ${params.prokka_kingdom}" : '' - gcode = (params.prokka_genetic_code) ? "--gcode ${params.prokka_genetic_code}" : '' - rnammer = (params.prokka_use_rnammer) ? 
"--rnammer" : '' - """ - # activate env - source activate PERL_env ; - - # Save Prokka version - prokka -v &> prokka_version.txt ; - - # Run prokka - prokka $kingdom $gcode $rnammer --outdir annotation \ - --cpus ${params.threads} --mincontiglen 200 --prefix ${prefix} \ - --genus '' --species '' --strain \"${prefix}\" $assembly - """ -} diff --git a/modules/generic/reports.nf b/modules/generic/reports.nf index 8ef4e9bb..6057499d 100644 --- a/modules/generic/reports.nf +++ b/modules/generic/reports.nf @@ -1,5 +1,5 @@ process report { - publishDir "${params.outdir}/${prefix}/report_files", mode: 'copy' + publishDir "${params.output}/${prefix}/report_files", mode: 'copy' label 'renv' tag "${prefix}" @@ -64,7 +64,7 @@ process report { rmarkdown::render("report_MGEs.Rmd", \ params = list( blast_id = ${params.blast_MGEs_minid}, \ blast_cov = ${params.blast_MGEs_mincov}, \ - phigaro_dir = "${params.outdir}/prophages/phigaro", \ + phigaro_dir = "${params.output}/prophages/phigaro", \ phigaro_txt = "$phigaro_txt", \ phispy_tsv = "$phispy_tsv", \ ice_prot_blast = "$iceberg_blastp", \ diff --git a/modules/generic/sequenceserver.nf b/modules/generic/sequenceserver.nf index 928041d0..43e9b901 100644 --- a/modules/generic/sequenceserver.nf +++ b/modules/generic/sequenceserver.nf @@ -1,5 +1,5 @@ process sequenceserver { - publishDir "${params.outdir}/${prefix}/SequenceServerDBs", mode: 'copy' + publishDir "${params.output}/${prefix}/SequenceServerDBs", mode: 'copy' tag "${prefix}" label 'main' diff --git a/modules/prophages/phast.nf b/modules/prophages/phast.nf index 6abbc758..2fdf379b 100644 --- a/modules/prophages/phast.nf +++ b/modules/prophages/phast.nf @@ -1,5 +1,5 @@ process phast { - publishDir "${params.outdir}/${prefix}/prophages/phast_db", mode: 'copy' + publishDir "${params.output}/${prefix}/prophages/phast_db", mode: 'copy' tag "${prefix}" label 'main' diff --git a/modules/prophages/phigaro.nf b/modules/prophages/phigaro.nf index 5ea347d2..6c9b7146 100644 --- 
a/modules/prophages/phigaro.nf +++ b/modules/prophages/phigaro.nf @@ -1,5 +1,5 @@ process phigaro { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename == "out.phg") null else if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" else "prophages/phigaro/$filename" diff --git a/modules/prophages/phispy.nf b/modules/prophages/phispy.nf index bdc32372..3a545632 100644 --- a/modules/prophages/phispy.nf +++ b/modules/prophages/phispy.nf @@ -1,5 +1,5 @@ process phispy { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" else if (filename == "PhiSpy") "prophages/$filename" else null diff --git a/modules/resistance/amrfinder.nf b/modules/resistance/amrfinder.nf index 0c32e6ba..25d037e8 100644 --- a/modules/resistance/amrfinder.nf +++ b/modules/resistance/amrfinder.nf @@ -1,5 +1,5 @@ process amrfinder { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" else "resistance/AMRFinderPlus/$filename" } diff --git a/modules/resistance/argminer.nf b/modules/resistance/argminer.nf index 24150214..4db37bbd 100644 --- a/modules/resistance/argminer.nf +++ b/modules/resistance/argminer.nf @@ -1,5 +1,5 @@ process argminer { - publishDir "${params.outdir}/${prefix}/resistance/ARGMiner", mode: 'copy' + publishDir "${params.output}/${prefix}/resistance/ARGMiner", mode: 'copy' tag "${prefix}" label 'main' diff --git a/modules/resistance/resfinder.nf b/modules/resistance/resfinder.nf index 83e11935..50544f8e 100644 --- a/modules/resistance/resfinder.nf +++ b/modules/resistance/resfinder.nf @@ -1,5 +1,5 @@ 
process resfinder { - publishDir "${params.outdir}/${prefix}/resistance", mode: 'copy' + publishDir "${params.output}/${prefix}/resistance", mode: 'copy' tag "${prefix}" label 'main' diff --git a/modules/resistance/rgi_annotation.nf b/modules/resistance/rgi_annotation.nf index 2391c715..1c217d20 100644 --- a/modules/resistance/rgi_annotation.nf +++ b/modules/resistance/rgi_annotation.nf @@ -1,5 +1,5 @@ process card_rgi { - publishDir "${params.outdir}/${prefix}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" else if (filename == "Parsed_RGI_${prefix}_hits.txt") null else "resistance/RGI/$filename" diff --git a/modules/virulence/vfdb.nf b/modules/virulence/vfdb.nf index c3999907..e88e97e6 100644 --- a/modules/virulence/vfdb.nf +++ b/modules/virulence/vfdb.nf @@ -1,5 +1,5 @@ process vfdb { - publishDir "${params.outdir}/${prefix}/virulence/vfdb", mode: 'copy' + publishDir "${params.output}/${prefix}/virulence/vfdb", mode: 'copy' tag "${prefix}" label 'main' @@ -20,5 +20,4 @@ process vfdb { --mincov ${params.blast_virulence_mincov} --threads ${params.threads} --out ${prefix}_vfdb_blastn_onGenes.txt --2way | \ sed -e 's/ACCESSION/VFDB_ID/g' > ${prefix}_vfdb_blastn_onGenes.summary.txt ; """ - } diff --git a/modules/virulence/victors.nf b/modules/virulence/victors.nf index 47948212..9a848399 100644 --- a/modules/virulence/victors.nf +++ b/modules/virulence/victors.nf @@ -1,5 +1,5 @@ process victors { - publishDir "${params.outdir}/${prefix}/virulence/victors", mode: 'copy' + publishDir "${params.output}/${prefix}/virulence/victors", mode: 'copy' tag "${prefix}" label 'main' diff --git a/nextflow.config b/nextflow.config index 98ffdf7f..48151694 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,64 +12,28 @@ params { /* - SINGLE GENOME ANALYSIS + INPUT SAMPLESHEET */ -// Prefix for writing genome assembly and annotatin 
resulting files -// Preferentially the sample name - prefix = 'out' - -// The input file formats are mutually exclusive. Users must choose between giving an -// assembled genome or raw reads to the pipeline. -// Input genome -- Always in FASTA format. - genome = '' - -// Input raw reads -- Always in FASTQ format. -// When using raw reads, the fmalmeida/mpgap is also required to be available. - sreads_single = '' // Path to unpaired illumina reads, if available for the sample - sreads_paired = '' // Path to paired end illumina reads, if available for the sample - lreads = '' // Path to longreads (ONT or Pacbio), if available for the sample - lreads_type = '' // Longreads is used? If so, from which tech it is? Options: [ nanopore or pacbio ] - -// Species panel to be used when annotating with Resfinder. If blank, -// it will not be executed. Must be identical (without the *) as written -// in their webservice https://cge.cbs.dtu.dk/services/ResFinder/. -// E.g. 'Escherichia coli'; 'Klebsiella' ... - resfinder_species = '' - -// Configure optional Methylation annotation with nanopolish -// If left blank, it will not be executed. When both parameters are set -// it will automatically execute nanopolish to call methylation - - nanopolish_fast5 = '' // Path to directory containing FAST5 files - nanopolish_fastq = '' // Path to fastq files (file related to FAST5 files above) - - /* - - MULTIPLE GENOME ANALYSIS - - */ - -// When analysing multiple genomes at once, all the parameters described above, must be, whenever -// necessary and applicable to your data, set inside a samplesheet file in YAML format. We provide -// an well-formated example of this YAML file at: https://github.com/fmalmeida/bacannot/blob/master/example_samplesheet.yaml +// Input data mus be given inside a well-formated samplesheet. 
+// We provide a well-formated example at: https://github.com/fmalmeida/bacannot/blob/master/example_samplesheet.yaml // -// Please read the example YAML samplesheet so you can understand how to properly fill it. +// Please read the example samplesheet so you can understand how to properly fill it. // // It is also documented in the main manual: https://bacannot.readthedocs.io/en/latest/samplesheet.html - in_yaml = '' + input = '' /* - GENERAL PARAMETERS -- FOR BOTH SINGLE AND MULTIPLE GENOME WORKFLOWS + GENERAL PARAMETERS */ // Main output folder name. More than one bacannot annotation can be redirected // to the same output parameter. It is good to keep related annotations together. // A subdirectory with the filename will be created inside this directory. - outdir = 'output' + output = 'outdir' // Number of threads to be used by each software threads = 2 @@ -78,8 +42,9 @@ params { // N threads (set above). Be sure to carefully check your resources before augmenting // this parameter. For example: parallel_jobs = 2 + threads = 5 can consume until 10 // threads at once. -// If not given, let's nextflow automatically handle it. - parallel_jobs = +// +// If not given, let's nextflow automatically decide it, which is the default. + parallel_jobs = // Number of minimum overlapping base pairs required for merging // Negative values, such as -20, means the number of required overlapping bases for merging. @@ -101,6 +66,18 @@ params { // Use rnammer instead of Barrnap? False or True? prokka_use_rnammer = false + /* + * Resfinder species panel + */ + +// Species panel to be used when annotating with Resfinder. +// It sets a default for all samples in the samplesheet. +// If a sample has a different value inside the samplesheet it will overwrite the value for that sample +// If blank it will not be executed. +// It must be identical (without the *) as written in their webservice https://cge.cbs.dtu.dk/services/ResFinder/. +// E.g. 'Escherichia coli'; 'Klebsiella' ... 
+ resfinder_species = '' + /* * Handling the execution of processes * @@ -189,20 +166,20 @@ params { //Trace Report trace { enabled = false - file = "${params.outdir}" + "/annotation_pipeline_trace.txt" + file = "${params.output}" + "/annotation_pipeline_trace.txt" fields = 'task_id,name,status,exit,realtime,cpus,%cpu,memory,%mem,rss' } //Timeline Report timeline { enabled = false - file = "${params.outdir}" + "/annotation_pipeline_timeline.html" + file = "${params.output}" + "/annotation_pipeline_timeline.html" } //Complete Report report { enabled = false - file = "${params.outdir}" + "/annotation_pipeline_nextflow_report.html" + file = "${params.output}" + "/annotation_pipeline_nextflow_report.html" } /* @@ -211,21 +188,21 @@ report { please read more at: https://www.nextflow.io/docs/latest/config.html#config-profiles */ profiles { + + // default profile standard { // Executor process.executor = "local" + // specifying number of threads wanted + process.cpus = params.threads // QueueSize limit - if (params.parallel_jobs || params.parallel_jobs != '') { - qs = params.parallel_jobs - } - executor { - name = "local" - if (params.parallel_jobs || params.parallel_jobs != '') { - queueSize = qs - } - } + if ( params.parallel_jobs) { process.executor.queueSize = params.parallel_jobs } } + // test profile + test { includeConfig 'test_data/test_profile.config' } + + // amazon aws profile awsbatch { process.executor = 'awsbatch' process.queue = 'my-batch-queue' @@ -241,11 +218,11 @@ profiles { // specific images process { withLabel: 'main' { - container = 'fmalmeida/bacannot:v2.4' + container = 'fmalmeida/bacannot:v3.0' } withLabel: 'renv' { - container = 'fmalmeida/bacannot:v2.4_renv' + container = 'fmalmeida/bacannot:v3.0_renv' } withLabel: 'jbrowse' { @@ -260,8 +237,12 @@ process { container = 'fmalmeida/bacannot:antismash' } - withLabel: 'assembly' { - container = 'fmalmeida/mpgap:v2.3' + withLabel: 'unicycler' { + container = 
'quay.io/biocontainers/unicycler:0.4.8--py38h8162308_3' + } + + withLabel: 'flye' { + container = 'quay.io/biocontainers/flye:2.9--py39h39abbe0_0' } } @@ -271,3 +252,16 @@ docker { runOptions = '--platform linux/amd64 -u $(id -u):root' fixOwnership = true } + +/* + Adding manifest +*/ +manifest { + name = "fmalmeida/bacannot" + author = "Felipe Almeida" + description = "Nextflow pipeline for bacterial genome annotation" + homePage = "https://github.com/fmalmeida/bacannot" + mainScript = "main.nf" + nextflowVersion = ">=20.10.0" + version = "3.0" +} \ No newline at end of file diff --git a/nextflow_schema.json b/nextflow_schema.json index bbf1346a..eb9e744e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -5,89 +5,23 @@ "description": "", "type": "object", "definitions": { - "input_parameters_single_genome_analysis": { - "title": "Input parameters (single genome analysis)", + "input_output_options": { + "title": "Input/output options", "type": "object", "fa_icon": "fas fa-terminal", - "description": "Input files for single genome analysis", + "description": "Set input and output parameters", "properties": { - "genome": { + "input": { "type": "string", - "description": "Input genome for annotation (can't be used with raw reads)", - "fa_icon": "fas fa-book", - "help_text": "Path to the input query genome to be annotated. Cannot be used with raw reads at the same time." + "description": "Path to input samplesheet" }, - "prefix": { + "output": { "type": "string", - "description": "Output prefix for results", - "help_text": "This sets the prefix to be used when writing results", - "fa_icon": "fas fa-align-left", - "default": "out" - }, - "sreads_single": { - "type": "string", - "fa_icon": "fas fa-book", - "description": "Path to input short unpaired reads (can't be used with assembled genomes)", - "help_text": "When combining different sequencing library types, users must never set path to reads from more than one sample." 
- }, - "sreads_paired": { - "type": "string", - "fa_icon": "fas fa-book", - "description": "Path to input short paired end reads (can't be used with assembled genomes)", - "help_text": "When combining different sequencing library types, users must never set path to reads from more than one sample." - }, - "lreads": { - "type": "string", - "fa_icon": "fas fa-book", - "description": "Path to input longreads (ONT or Pacbio) (can't be used with assembled genomes)", - "help_text": "When combining different sequencing library types, users must never set path to reads from more than one sample." - }, - "lreads_type": { - "type": "string", - "fa_icon": "fas fa-quote-left", - "description": "Using longreads? From which tech it is? Nanopore or Pacbio?", - "help_text": "Must be used if giving longreads as input.", - "enum": [ - "nanopore", - "pacbio" - ] + "description": "Path for output directory", + "default": "outdir" } } }, - "input_parameters_multiple_genome_analysis": { - "title": "Input parameters (multiple genome analysis)", - "type": "object", - "description": "", - "default": "", - "properties": { - "in_yaml": { - "type": "string", - "description": "Input samplesheet in YAML format", - "help_text": "Used when analysis is to be performed with multiple genomes at once. It Is incompatible with the parameters for single genome analysis. More information can be found at: https://bacannot.readthedocs.io/en/latest/samplesheet.html", - "fa_icon": "fas fa-align-justify" - } - }, - "fa_icon": "fas fa-terminal" - }, - "output_directory": { - "title": "Output directory", - "type": "object", - "description": "Define where to store results", - "default": "", - "properties": { - "outdir": { - "type": "string", - "description": "The output directory where the results will be saved.", - "default": "./results", - "fa_icon": "fas fa-folder-open", - "help_text": "All the results will be stored here." 
- } - }, - "required": [ - "outdir" - ], - "fa_icon": "far fa-folder" - }, "generic_options": { "title": "Generic options", "type": "object", @@ -111,13 +45,13 @@ "properties": { "threads": { "type": "integer", - "default": "2", + "default": 2, "description": "Number of threads to be used by each software" }, "parallel_jobs": { "type": "integer", "description": "Number of jobs to run in parallel", - "help_text": "Number of jobs to run in parallel. Be aware that each job (in parallel) can consume\nN threads (set above). Be sure to carefully check your resources before augmenting\nthis parameter. For example: parallel_jobs = 2 + threads = 5 can consume until 10\nthreads at once.\nIf not given, let's nextflow automatically handle it" + "help_text": "Number of jobs to run in parallel. If not given, let's nextflow automatically handle it.\n\nBe aware that each job (in parallel) can consume\nN threads (set above). Be sure to carefully check your resources before augmenting\nthis parameter. For example: parallel_jobs = 2 + threads = 5 can consume until 10\nthreads at once." } } }, @@ -152,19 +86,19 @@ "fa_icon": "fas fa-cog", "required": [ "prokka_kingdom", - "prokka_use_rnammer" + "prokka_genetic_code" ] }, "resfinder_optional_process": { "title": "Resfinder optional process", "type": "object", - "description": "Select the resfinder species panel to use", + "description": "Set default resfinder species panel to use", "default": "", "properties": { "resfinder_species": { "type": "string", "description": "Select the resfinder species panel to use", - "help_text": "If blank, resfinder will not be executed. Users must check Resfinder website to know the panels. This parameter is to be used with single genome annotation workflows, otherwise it must be set inside the samplesheet YAML file, please check out more at: https://bacannot.readthedocs.io/en/latest/samplesheet.html" + "help_text": "If blank, resfinder will not be executed. 
Sets a default value for all samples in the samplesheet. If a sample has another value set inside the samplesheet, the pipeline will use for that specific sample the value inside the samplesheet. Users must check Resfinder website to know the panels. If your species is not available in resfinder species panels you may set it to \"Other\"." } }, "fa_icon": "fas fa-cog" @@ -287,25 +221,6 @@ }, "fa_icon": "fas fa-cogs" }, - "configure_optional_methylation": { - "title": "Configure optional Methylation", - "type": "object", - "description": "Configure nanopolish methylation annotation process", - "default": "", - "fa_icon": "fas fa-cog", - "properties": { - "nanopolish_fastq": { - "type": "string", - "description": "Path to fastq files (file related to FAST5 files above)", - "help_text": "If blank, the process will not be executed!" - }, - "nanopolish_fast5": { - "type": "string", - "description": "Path to directory containing FAST5 files", - "help_text": "Set full path to the directory containing the ONT sequencing FAST5 files" - } - } - }, "user_custom_database": { "title": "User custom database", "type": "object", @@ -331,13 +246,7 @@ }, "allOf": [ { - "$ref": "#/definitions/input_parameters_single_genome_analysis" - }, - { - "$ref": "#/definitions/input_parameters_multiple_genome_analysis" - }, - { - "$ref": "#/definitions/output_directory" + "$ref": "#/definitions/input_output_options" }, { "$ref": "#/definitions/generic_options" @@ -357,9 +266,6 @@ { "$ref": "#/definitions/configure_thresholds_of_annotation_tasks" }, - { - "$ref": "#/definitions/configure_optional_methylation" - }, { "$ref": "#/definitions/user_custom_database" } diff --git a/nf_functions/examples.nf b/nf_functions/examples.nf deleted file mode 100644 index dc293302..00000000 --- a/nf_functions/examples.nf +++ /dev/null @@ -1,25 +0,0 @@ -def exampleMessage() { - log.info """ - Example Usages: - - ## Launching interactive graphical interface -\$ nf-core launch fmalmeida/bacannot - - ## Single 
genome annotation, with customization -\$ nextflow run fmalmeida/bacannot --outdir TESTE --threads 3 --genome assembly.fasta --bedtools_merge_distance -20 --blast_virulence_minid 90 \ ---blast_virulence_mincov 80 --blast_MGEs_minid 70 --blast_MGEs_mincov 60 --nanopolish_fast5 ./fast5_pass --nanopolish_fastq ./ont.fastq \ ---resfinder_species "Klebsiella" - - ## Multiple genome annotation, with custom database annotation - ## using either raw reads or assembled genomes -\$ nextflow run fmalmeida/bacannot --outdir TESTE --threads 3 --in_yaml samplesheet.yaml --custom_db db1.fasta - - ## Annotating from raw reads -\$ nextflow run fmalmeida/bacannot --sreads_paired "sample1_{1,2}.fastq" --lreads "sample1_lreads.fastq" --lreads_type nanopore \ ---outdir TESTE --skip_kofamscan --threads 5 --nanopolish_fastq "sample1_lreads.fastq" --nanopolish_fast5 "fast5_pass_dir" - - ## Running with a configuration file -\$ nextflow run fmalmeida/bacannot -c bacannot.config - -""".stripIndent() -} diff --git a/nf_functions/help.nf b/nf_functions/help.nf index 468c611a..d480d693 100644 --- a/nf_functions/help.nf +++ b/nf_functions/help.nf @@ -16,56 +16,32 @@ def helpMessage() { nextflow run fmalmeida/bacannot [--get_config] - Show command line examples: - - nextflow run fmalmeida/bacannot --examples - Execution Reports: nextflow run fmalmeida/bacannot [OPTIONS] [-with-report] [-with-trace] [-with-timeline] OPTIONS: - # Input configuration -- Analysis of a single genome + # Input configuration # Users can give either a genome in FASTA file or raw reads in FASTQ - # Please do not use glob. patterns ('*') with these parameters - - --prefix Prefix for writing genome assembly and annotatin resulting files. - Preferentially the sample name. [Default: out] - - --genome Set path to the genome in FASTA file. - - --sreads_paired Illumina paired end reads in fastq for assembly before annotation. - - --sreads_single Illumina unpaired reads in fastq for assembly before annotation. 
- - --lreads Path to longreads in fastq assembly before annotation (ONT or Pacbio). - - --lreads_type Tells the technology of the input longreads: [ nanopore or pacbio ]. - - # Input configuration -- Analysis of multiple genomes - # Users can give either a genome in FASTA file or raw reads in FASTQ - # The analysis of multiple genomes at once is configured via a YAML file - # Check the example YAML at: https://github.com/fmalmeida/bacannot/blob/master/example_samplesheet.yaml + # The analysis is configured via a samplesheet + # Check the example samplesheet at: https://github.com/fmalmeida/bacannot/blob/master/example_samplesheet.yaml # # Also documented at: https://bacannot.readthedocs.io/en/latest/samplesheet.html - --in_yaml Set path to the samplesheet in YAML format to analyse more than one - genome at once. + --input Set path to the input samplesheet. - # Annotation configuration -- Used for either for the single - # genome analysis workflow and the multiple genome analysis + # Annotation configuration # Read it and configure it properly # General Parameters - --outdir Output directory name + --output Output directory name --threads Number of threads to use --parallel_jobs Number of jobs to run in parallel. Each job can consume up - to N threads (--threads). If not given, let's nextflow automatically - handle it. Default: NA. + to N threads (--threads). If not given, let's nextflow automatically handle it. Default: NA. --bedtools_merge_distance By default, this process is not executed. For execution one needs to provide a minimum number of overlapping @@ -117,10 +93,9 @@ def helpMessage() { Multiple FASTAs can be provided separated by comma. E.g. db1.fasta,db2.fasta,... - # Configure resfinder optional parameter - # Only used with analysing a single genome - # When analysing multiple genomes it must be set in the YAML file. 
- # Check the example YAML at: https://github.com/fmalmeida/bacannot/blob/master/example_samplesheet.yaml + # Configure a default resfinder species panel for all samples + # If a sample has another value inside the samplesheet, the pipeline will use + # the one found inside the samplesheet for that specific sample. # # Also documented at: https://bacannot.readthedocs.io/en/latest/samplesheet.html @@ -145,18 +120,5 @@ def helpMessage() { --skip_kofamscan Tells whether you do not want to execute KO annotation with kofamscan - - # Configure optional Methylation annotation with nanopolish - # If left blank, it will not be executed. And, with both parameters are set - # it will automatically execute nanopolish to call methylation. - # Only used with analysing a single genome - # When analysing multiple genomes it must be set in the YAML file. - # Check the example YAML at: https://github.com/fmalmeida/bacannot/blob/master/example_samplesheet.yaml - - --nanopolish_fast5 Path to directory containing FAST5 files - - --nanopolish_fastq Path to fastq files (file related to FAST5 files above) - - """.stripIndent() } diff --git a/nf_functions/log.nf b/nf_functions/log.nf index bed4632d..b43d7416 100644 --- a/nf_functions/log.nf +++ b/nf_functions/log.nf @@ -3,8 +3,8 @@ def logMessage() { log.info " Container-based, fmalmeida/bacannot, Genome Annotation Pipeline " log.info "=================================================================" def summary = [:] - if (params.genome) { summary['Input genomes'] = params.genome } - summary['Output dir'] = "${params.outdir}" + summary['Input genomes'] = params.input + summary['Output dir'] = "${params.output}" summary['Threads'] = params.threads if (params.skip_virulence_search == false) { summary['Blast % ID - Virulence Genes'] = params.blast_virulence_minid diff --git a/nf_functions/paramsCheck.nf b/nf_functions/paramsCheck.nf index 99961da5..36386b97 100644 --- a/nf_functions/paramsCheck.nf +++ b/nf_functions/paramsCheck.nf @@ -1,79 
+1,5 @@ def paramsCheck() { - - /* - - User tried to use both raw reads and a assembled genome as input - - */ - if (params.genome && (params.sreads_paired || params.sreads_single || params.lreads)) { - println """ - ERROR! - - A minor error has occurred - ==> User used raw reads and assembled genomes as input. - - You cannot use both types of inputs together. Used either assembled genomes OR raw reads. - - Cheers. - """.stripIndent() - - exit 1 - } - - /* - - User tried to use both the options for single and multiple genome analysis - - */ - if (params.in_yaml && (params.sreads_paired || params.sreads_single || params.lreads - || params.genome || params.lreads_type || params.resfinder_species - || params.nanopolish_fast5 || params.nanopolish_fastq)) { - println """ - ERROR! - - A major error has occurred - ==> User have set parameters for single and multiple genome analysis. - - This pipeline works either annotating a single genome or multiple genome at once. However, the analysis of multiple - genomes is set via the YAML file and it is incompatible with the parameters from single genome analysis. - - Therefore, when using the --in_yaml parameter you cannot use any of the following parameters as they are specific for single genome analysis: - - * --genome - * --sreads_paired - * --sreads_single - * --lreads - * --lreads_type - * --resfinder_species - * --nanopolish_fast5 - * --nanopolish_fastq - - Cheers. - """.stripIndent() - - exit 1 - } - - /* - - User has given the --lreads parameter but forgot --lreads_type - - */ - if (params.lreads && !params.lreads_type) { - println """ - ERROR! - - A minor error has occurred - ==> User used --lreads but forgot --lreads_type. - - When giving longreads as input, you must tell the pipeline from wich tech it comes from: 'nanopore' or 'pacbio' - - Cheers. 
- """.stripIndent() - - exit 1 - } - + /* Checking the prokka parameters @@ -96,24 +22,4 @@ def paramsCheck() { exit 1 } - /* - - Checking nanopolish parameters - - */ - if ((params.nanopolish_fast5 && !params.nanopolish_fastq) || (!params.nanopolish_fast5 && params.nanopolish_fastq)) { - println """ - ERROR! - - A minor error has occurred - ==> User forgot to set both --nanopolish_fast5 and --nanopolish_fastq. - - These parameters must be used together. They are the necessary files to call methylations from ONT data with Nanopolish. - - Cheers. - """.stripIndent() - - exit 1 - } - } diff --git a/nf_functions/parseCSV.nf b/nf_functions/parseCSV.nf deleted file mode 100644 index d83e6517..00000000 --- a/nf_functions/parseCSV.nf +++ /dev/null @@ -1,29 +0,0 @@ -def parse_csv(in_ch) { - - return in_ch.splitCsv(header: ['name', 'entrypoint', 'fwd', 'rev', 'single', 'lreads', 'lr_type', 'fast5', 'assembly', 'resfinder']).map{ row -> - - if (row.entrypoint == "flye" && row.lreads != "missing_lreads" && row.lr_type == "missing_lr_type") { - println """ - ERROR! - - A minor error has occurred - ==> User used --lreads but forgot --lreads_type. - - When giving longreads as input, you must tell the pipeline from wich tech it comes from: 'nanopore' or 'pacbio' - - Cheers. - """.stripIndent() - - exit 1 - } else { - tuple(row.name, row.entrypoint, (row.fwd == "missing_pairFWD") ? row.fwd : file(row.fwd), (row.rev == "missing_pairREV") ? row.rev : file(row.rev), - (row.single == "missing_single") ? row.single : file(row.single), (row.lreads == "missing_lreads") ? row.lreads : file(row.lreads), row.lr_type, - (row.fast5 == "missing_fast5") ? row.fast5 : file(row.fast5), (row.assembly == "missing_assembly") ? 
row.assembly : file(row.assembly), row.resfinder) - } - } - -} - -def filter_ch(in_ch, entrypoint) { - parse_csv(in_ch.map { it.text }) | filter { it[1] == "${entrypoint}" } -} diff --git a/nf_functions/writeCSV.nf b/nf_functions/writeCSV.nf index 6ac202e6..06c48700 100644 --- a/nf_functions/writeCSV.nf +++ b/nf_functions/writeCSV.nf @@ -1,26 +1,92 @@ def write_csv(in_list) { - return in_list.collectFile( name:"samples.txt", sort: { it[0] }, newLine: true ) { + return in_list.collectFile( name:"samples.csv", sort: { it[0] }, newLine: true ) { - // Check short read pairs - fwd_pair = (it.paired != "missing_paired") ? "${it.paired[0]}" : "missing_pairFWD" - rev_pair = (it.paired != "missing_paired") ? "${it.paired[1]}" : "missing_pairREV" + /* + * checking given/available values + */ + - // Check lreads technology - lr_type = (it.lr_type != "missing_lr_type") ? "${it.lr_type}" : "missing_lr_type" + // Check for Illumina reads + if (it.illumina) { + + if (it.illumina.size() == 1) { // just one read + + fwd_pair = "missing_pairFWD" + rev_pair = "missing_pairREV" + unpaired = it.illumina[0] + + } else if (it.illumina.size() == 2) { // two reads are given + + fwd_pair = it.illumina[0] + rev_pair = it.illumina[1] + unpaired = "missing_single" + + } else if (it.illumina.size() == 3) { // three reads are given + + fwd_pair = it.illumina[0] + rev_pair = it.illumina[1] + unpaired = it.illumina[2] + + } + + } else { // no reads are given + fwd_pair = "missing_pairFWD" + rev_pair = "missing_pairREV" + unpaired = "missing_single" + } + + + + // Check long reads + if (it.nanopore) { // nanopore is given + + lr_type = "nanopore" + lreads = it.nanopore + + } else if (it.pacbio) { // pacbio is given + + lr_type = "pacbio" + lreads = it.pacbio + + } else { // none is given + + lr_type = "missing_lr_type" + lreads = "missing_lreads" + + } + + // Check fast5 + fast5 = (it.fast5) ? it.fast5 : "missing_fast5" + + // Check assembly + assembly = (it.assembly) ? 
it.assembly : "missing_assembly" + + // Check resfinder + // it uses the command line param as default and overwrites with + // sample specific value if found inside samplesheet + resfinder = (params.resfinder_species) ? params.resfinder_species : "missing_resfinder" + if (it.resfinder) { resfinder = it.resfinder } // Check entrypoint - if (it.assembly != "missing_assembly") { + if (assembly != "missing_assembly") { - "${it.id},annotation,${fwd_pair},${rev_pair},${it.single},${it.lreads},${lr_type},${it.fast5},${it.assembly},${it.resfinder}" + // assembled genome is given + "${it.id},annotation,${fwd_pair},${rev_pair},${unpaired},${lreads},${lr_type},${fast5},${assembly},${resfinder}" - } else if (it.assembly == "missing_assembly" && it.paired == "missing_paired" && it.single == "missing_single" && it.lreads != "missing_lreads") { + } else if (assembly == "missing_assembly" && + fwd_pair == "missing_pairFWD" && + rev_pair == "missing_pairREV" && + unpaired == "missing_single" && + lreads != "missing_lreads") { - "${it.id},flye,${fwd_pair},${rev_pair},${it.single},${it.lreads},${lr_type},${it.fast5},${it.assembly},${it.resfinder}" + // short reads are not available but long reads are + "${it.id},flye,${fwd_pair},${rev_pair},${unpaired},${lreads},${lr_type},${fast5},${assembly},${resfinder}" } else { - "${it.id},unicycler,${fwd_pair},${rev_pair},${it.single},${it.lreads},${lr_type},${it.fast5},${it.assembly},${it.resfinder}" + // short reads are available + "${it.id},unicycler,${fwd_pair},${rev_pair},${unpaired},${lreads},${lr_type},${fast5},${assembly},${resfinder}" } } diff --git a/test_data/samplesheet.yaml b/test_data/samplesheet.yaml new file mode 100644 index 00000000..61335f94 --- /dev/null +++ b/test_data/samplesheet.yaml @@ -0,0 +1,18 @@ +# test profile samplesheet configuration +samplesheet: + - id: ecoli_1 + illumina: + - https://github.com/fmalmeida/test_datasets/raw/main/ecoli_illumina_15X_1.fastq.gz + - 
https://github.com/fmalmeida/test_datasets/raw/main/ecoli_illumina_15X_2.fastq.gz + - id: ecoli_2 + nanopore: https://github.com/fmalmeida/test_datasets/raw/main/ecoli_ont_15X.fastq.gz + - id: ecoli_3 + pacbio: https://github.com/fmalmeida/test_datasets/raw/main/ecoli_pacbio_15X.fastq.gz + - id: ecoli_4 + illumina: + - https://github.com/fmalmeida/test_datasets/raw/main/ecoli_illumina_15X_1.fastq.gz + - https://github.com/fmalmeida/test_datasets/raw/main/ecoli_illumina_15X_2.fastq.gz + nanopore: https://github.com/fmalmeida/test_datasets/raw/main/ecoli_ont_15X.fastq.gz + - id: klebsiella_1 + assembly: https://github.com/fmalmeida/test_datasets/raw/main/klebsiella_ref.fna + resfinder: Klebsiella diff --git a/test_data/small_custom_db.fasta b/test_data/small_custom_db.fasta new file mode 100644 index 00000000..bcab6bf0 --- /dev/null +++ b/test_data/small_custom_db.fasta @@ -0,0 +1,55 @@ +>VFDB~~~(entA)~~~VFG048409(gb|YP_002918380.1)~~~[Ent_(VF0562)] VFG048409(gb|YP_002918380.1) (entA) 2,3-dihydroxybenzoate-2,3-dehydrogenase [Ent (VF0562)] +GTGGGCGCTGCTCACCCGCGAGGTGCAATAATGGCGGCGCTGGATTTTCACGGCCAGACC +GTGTGGGTGACCGGCGCAGGTAAAGGGATTGGCTATGCCACCGCGCTGGCTTTCGTCGAG +GCCGGGGCCAACGTCACCGGGTTCGACCTCGCCTTTGACGGCGAAAGCTACCCGTTTGCC +ACCGAAACGCTGGACGTTGCCGATGCCGACCAGGTACGCGAGGCATGCAGCCGGCTGTTG +GCGAACACCGAGCGTTTAGATGTGCTGGTGAACGCCGCCGGGATCCTGCGCATGGGCGCT +ACCGACCAGCTGAGCGCGGAAGACTGGCAGCAGACCTTCGCGGTCAACGTCGGCGGCGCG +TTTAATCTGTTTCAGCAGACGATGGCCCAGTTCCGTCGCCAGCGGGGCGGGGCGATCGTC +ACCGTCGCCTCCGATGCCGCGCACACGCCGCGCATCGGCATGAGCGCCTATGGCGCCTCG +AAGGCGGCGCTGAAAAGCCTGGCGCTGACCGTCGGCCTTGAGCTGGCGGGCTGCGGGGTG +CGCTGTAATCTGGTGTCGCCGGGCTCCACCGATACCGATATGCAGCGCACCCTGTGGGTC +AGCGACGATGCAGAGCAGCAGCGGATCCGCGGCTTCGGCGAGCAGTTTAAGCTCGGCATC +CCGCTGGGTAAAATCGCCCGCCCGCAGGAGATCGCCAATACCATTCTGTTCCTCGCCTCT +TCTCACGCCAGCCATATCACCCTGCAGGATATCGTGGTTGATGGCGGCTCGACGCTGGGG +GCGTAA +>VFDB~~~(entB)~~~VFG048419(gb|YP_002918379.1)~~~[Ent_(VF0562)] VFG048419(gb|YP_002918379.1) (entB) 
2,3-dihydro-2,3-dihydroxybenzoate synthetase, isochroismatase [Ent (VF0562)] +ATGGCAATCCCTAAATTACAGGCATATGCGCTGCCGGAAGCCAGCGATATCCCGGCGAAC +AAGGTCAACTGGGCCTTTGAGCCGTCGCGCGCCGCGCTGCTGATCCACGATATGCAGGAA +TATTTCCTCAACTTCTGGGGCGAAAACAGCGCGATGATGGAGAAAGTGGTGGCTAATATC +GCCGCCCTGCGCGACTTCTGCAAACAGAACGGCATTCCGGTGTACTACACTGCCCAGCCG +AAAGAGCAGAGCGATGAAGACCGCGCCCTGCTGAATGATATGTGGGGGCCGGGTCTGACC +CGCTCGCCGGAGCAGCAGCAGGTGATTGCCGCGCTGGCGCCGGATGAAGACGATACCGTG +CTGGTGAAATGGCGCTACAGCGCGTTTCATCGCTCGCCGCTGGAGGAGATGCTGAAAGAG +ACCGGTCGCGACCAGCTGATCATCACCGGCGTTTACGCCCATATCGGCTGCATGACCACC +GCCACCGATGCTTTCATGCGCGATATCAAACCGTTCTTTGTCGCCGACGCGCTGGCAGAT +TTCAGCCGTGAAGAGCACCTGATGGCGCTGAAATACGTCGCCGGCCGCTCCGGCCGCGTG +GTGATGACCGAAGAGCTGCTACCGCTCCCAGCCTCCAAAGCGGCGCTGCGCGCGCTGATC +CTGCCGCTGCTCGACGAATCCGATGAACCGCTGGATGATGAAAACCTGATCGACTACGGT +CTGGATTCGGTGCGGATGATGGCCCTGGCCGCCCGCTGGCGCAAAGTGCACGGCGATATC +GACTTCGTGATGCTGGCGAAAAACCCGACCATCGACGCCTGGTGGGCGCTGCTCACCCGC +GAGGTGCAATAA +>VFDB~~~(rcsA)~~~VFG049007(gb|YP_002920216.1)~~~[RcsAB_(VF0571)] VFG049007(gb|YP_002920216.1) (rcsA) transcriptional activator for ctr capsule biosynthesis [RcsAB (VF0571)] +ATGTCAACGATGATTATGGATTTGTGCAGCTATACCCGGTTGGGATTGACGGGATATCTG +ACCAGTCGGGGAATTAAAAAACAGGAAATCGTTGAGGTCAACAGTGCTGCGGATCTGCAG +AAACACTGTACGTCGTGTTGCCCGGCGGTGGTGTTTCTGAATGAAGACTGTTTCGTGCAT +GATGATGAAAGTAATGGCATTATTCGCCAGATCATTACGCAAAACCCGGCGACGCTGTTT +GTTATCTTTATGTCGCTGGCGAACATCCATTTTGACCGCTATTTGCGGGTACGGAAGAAT +CTGCTAATCAGTTCAAAATCGATAACCCCAAAAGACCTTGATGTTATTCTGGTTAATTAT +CTTAAATACAAAAACACCAGTGTAGGGCAGTTAACTTTACCGACATTGTCACTGAGTAAA +ACAGAATCAAATATGCTGCAAATGTGGATGGCCGGGCATGGTACTTCGCAAATCTCAACG +CAAATGAACATCAAAGCGAAGACGGTATCGTCGCATAAAGGCAATATTAAAAAGAAAATA +CAAACGCATAATAAGCAGGTGATTTATCATATCGTTCGGCTGACCGAAAACATCACCTCC +GGTATTCAGGTAAATATGCGCTGA +>VFDB~~~(rcsB)~~~VFG049018(gb|YP_002920501.1)~~~[RcsAB_(VF0571)] VFG049018(gb|YP_002920501.1) (rcsB) transcriptional regulator RcsB [RcsAB (VF0571)] +ATGAACACTATGAACGTAATTATTGCCGATGACCATCCGATCGTACTGTTCGGTATTCGT 
+AAGTCACTTGAGCAAATTGAGTGGGTGAACGTCGTTGGCGAGTTTGAAGACTCCACGGCC +CTTATCAACAATCTTCCGAAGCTGGACGCCCATGTGCTGATCACCGATCTGTCCATGCCC +GGTGACAAATACGGCGACGGGATCACGCTGATTAAATACATCAAGCGCCACTTCCCGGAT +CTGTCGATCATTGTTCTGACCATGAACAACAATCCGGCGATCCTCAGCGCGGTGCTGGAT +CTGGATATCGAAGGGATCGTCCTCAAGCAGGGGGCGCCGACCGACCTGCCAAAAGCGCTG +GCCGCGCTGCAGAAAGGGAAGAAATTCACCCCGGAAAGCGTCTCTCGCCTGCTGGAAAAA +ATCAGCGCCAGCGGCTATGGCGATAAACGCCTGTCGCCGAAAGAGAGCGAAGTACTGCGC +CTGTTCGCCGAAGGTTTCCTGGTAACCGAGATCGCCAAAAAGCTTAATCGCAGCATTAAA +ACCATCAGCAGCCAGAAGAAATCGGCGATGATGAAGCTGGGCGTCGAAAACGACATCGCC +TTGCTGAACTACCTCTCTTCCGTCTCGCTGAGCGCGACGGACAAAGAGTAA \ No newline at end of file diff --git a/test_data/test_profile.config b/test_data/test_profile.config new file mode 100644 index 00000000..5def0eae --- /dev/null +++ b/test_data/test_profile.config @@ -0,0 +1,86 @@ +/* + * Configuration File to run fmalmeida/bacannot pipeline. + */ + +/* + + Required Parameters. + This parameters must always be set + +*/ +params { + + /* + + INPUT SAMPLESHEET + + */ + +// Input data mus be given inside a well-formated samplesheet. +// We provide a well-formated example at: https://github.com/fmalmeida/bacannot/blob/master/example_samplesheet.yaml +// +// Please read the example samplesheet so you can understand how to properly fill it. +// +// It is also documented in the main manual: https://bacannot.readthedocs.io/en/latest/samplesheet.html + input = 'https://github.com/fmalmeida/bacannot/raw/develop/test_data/samplesheet.yaml' + + /* + + GENERAL PARAMETERS + + */ + +// Main output folder name. More than one bacannot annotation can be redirected +// to the same output parameter. It is good to keep related annotations together. +// A subdirectory with the filename will be created inside this directory. + output = 'EXAMPLE_OUTPUT' + +// Number of threads to be used by each software + threads = 5 + + /* + * Resfinder species panel + */ + +// Species panel to be used when annotating with Resfinder. 
+// It sets a default for all samples in the samplesheet. +// If a sample has a different value inside the samplesheet it will overwrite the value for that sample +// If blank it will not be executed. +// It must be identical (without the *) as written in their webservice https://cge.cbs.dtu.dk/services/ResFinder/. +// E.g. 'Escherichia coli'; 'Klebsiella' ... + resfinder_species = 'Escherichia coli' + + /* + * Custom databases can be used to annotate additional genes in the genome. + * It runs a BLASTn alignment against the genome, therefore, the custom database + * MUST be a nucleotide fasta of genes. More than one custom database can be given + * separated by commas. Gene headers must be properly formated as described in the + * documentation: https://bacannot.readthedocs.io/en/latest/custom-db.html + */ +// Custom nucleotide fastas + custom_db = 'https://github.com/fmalmeida/bacannot/raw/develop/test_data/small_custom_db.fasta' + +} + +/* + Configuration of Nextflow Scopes + */ + +//Trace Report +trace { + enabled = true + file = "${params.output}" + "/annotation_pipeline_trace.txt" + fields = 'task_id,name,status,exit,realtime,cpus,%cpu,memory,%mem,rss' +} + +//Timeline Report +timeline { + enabled = true + file = "${params.output}" + "/annotation_pipeline_timeline.html" +} + +//Complete Report +report { + enabled = true + file = "${params.output}" + "/annotation_pipeline_nextflow_report.html" +} diff --git a/workflows/batch_workflow.nf b/workflows/bacannot.nf similarity index 94% rename from workflows/batch_workflow.nf rename to workflows/bacannot.nf index 77db848c..dd095f90 100644 --- a/workflows/batch_workflow.nf +++ b/workflows/bacannot.nf @@ -3,19 +3,16 @@ */ // Unicycler assembly -include { unicycler } from '../modules/assembly/unicycler_batch.nf' +include { unicycler } from '../modules/assembly/unicycler.nf' // Flye assembly -include { flye } from '../modules/assembly/flye_batch.nf' - -// filter function -include { filter_ch } from 
'../nf_functions/parseCSV.nf' +include { flye } from '../modules/assembly/flye.nf' // Species identification include { refseq_masher } from '../modules/generic/mash.nf' // Prokka annotation -include { prokka } from '../modules/generic/prokka_batch.nf' +include { prokka } from '../modules/generic/prokka.nf' // MLST annotation include { mlst } from '../modules/generic/mlst.nf' @@ -76,10 +73,10 @@ include { amrfinder } from '../modules/resistance/amrfinder.nf' include { card_rgi } from '../modules/resistance/rgi_annotation.nf' // Methylation calling (Nanopolish) -include { call_methylation } from '../modules/generic/methylation_batch.nf' +include { call_methylation } from '../modules/generic/methylation.nf' // User's custom db annotation -include { custom_blast } from '../modules/generic/custom_blast_batch.nf' +include { custom_blast } from '../modules/generic/custom_blast.nf' include { custom_blast_report } from '../modules/generic/custom_blast_report.nf' // Merging annotation in GFF @@ -110,20 +107,29 @@ include { antismash } from '../modules/generic/antismash.nf' DEF WORKFLOW */ -workflow MULTIPLE_SAMPLE { +workflow BACANNOT { take: input_ch custom_db + main: + // generate channel branches + // now we create the filtered channels + input_ch.branch{ + unicycler: it[1] == 'unicycler' + flye: it[1] == 'flye' + annotation: it[1] == "annotation" + }.set { parsed_inputs } + // Step 0 -- Run unicycler when necessary - unicycler(filter_ch(input_ch, "unicycler")) + unicycler(parsed_inputs.unicycler) // Step 0 -- Run flye when necessary - flye(filter_ch(input_ch, "flye")) + flye(parsed_inputs.flye) // First step -- Prokka annotation - prokka(filter_ch(input_ch, "annotation").mix(flye.out[1], unicycler.out[1])) + prokka(parsed_inputs.annotation.mix(flye.out[1], unicycler.out[1])) // Second step -- MLST analysis mlst(prokka.out[3]) diff --git a/workflows/parse_samples.nf b/workflows/parse_samples.nf index 47380fb8..064060f0 100644 --- a/workflows/parse_samples.nf +++ 
b/workflows/parse_samples.nf @@ -1,59 +1,30 @@ +include { write_csv } from '../nf_functions/writeCSV.nf' workflow parse_samplesheet { take: data main: - // Parse input to check for missing entries - parsed = [] - - data.each { - - // Check for Illumina reads - if (it.illumina) { - - // Only one read - if (it.illumina.size() == 1) { - it['paired'] = "missing_paired" - it['single'] = it.illumina[0] - } else if (it.illumina.size() == 2) { - it['paired'] = [it.illumina[0], it.illumina[1]] - it['single'] = "missing_single" - } else if (it.illumina.size() == 3) { - it['paired'] = [it.illumina[0], it.illumina[1]] - it['single'] = it.illumina[2] - } - } else { - it['paired'] = "missing_paired" - it['single'] = "missing_single" - } - - // Check long reads - if (it.nanopore) { - it['lr_type'] = "nanopore" - it['lreads'] = it.nanopore - } else if (it.pacbio) { - it['lr_type'] = "pacbio" - it['lreads'] = it.pacbio - } else { - it['lr_type'] = "missing_lr_type" - it['lreads'] = "missing_lreads" - } - - // Check fast5 - it['fast5'] = (it.fast5) ? it.fast5 : "missing_fast5" - - // Check assembly - it['assembly'] = (it.assembly) ? it.assembly : "missing_assembly" - - // Check resfinder - it['resfinder'] = (it.resfinder) ? it.resfinder : "missing_resfinder" - - // Save - parsed.add(it) + // iterate over input list + custom_csv = write_csv(Channel.fromList(data)) + + // now we parse the csv created + parsed_csv = custom_csv.splitCsv(header: ['name', 'entrypoint', 'fwd', 'rev', 'single', 'lreads', 'lr_type', 'fast5', 'assembly', 'resfinder']).map{ row -> + tuple( + row.name, + row.entrypoint, + (row.fwd == "missing_pairFWD") ? row.fwd : file(row.fwd), + (row.rev == "missing_pairREV") ? row.rev : file(row.rev), + (row.single == "missing_single") ? row.single : file(row.single), + (row.lreads == "missing_lreads") ? row.lreads : file(row.lreads), + row.lr_type, + (row.fast5 == "missing_fast5") ? row.fast5 : file(row.fast5), + (row.assembly == "missing_assembly") ? 
row.assembly : file(row.assembly), + row.resfinder + ) } emit: - Channel.from(parsed) + parsed_csv } diff --git a/workflows/simple_workflow.nf b/workflows/simple_workflow.nf deleted file mode 100644 index a9b09d39..00000000 --- a/workflows/simple_workflow.nf +++ /dev/null @@ -1,364 +0,0 @@ -/* - * Include modules (Execution setup) - */ - -// Species identification -include { refseq_masher } from '../modules/generic/mash.nf' - -// Prokka annotation -include { prokka } from '../modules/generic/prokka.nf' - -// sequenceserver generation -include { sequenceserver } from '../modules/generic/sequenceserver.nf' - -// MLST annotation -include { mlst } from '../modules/generic/mlst.nf' - -// rRNA annotation -include { barrnap } from '../modules/generic/barrnap.nf' - -// Calculate GC content -include { compute_gc } from '../modules/generic/compute_gc.nf' - -// KOFAM annotation -include { kofamscan } from '../modules/KOs/kofamscan.nf' - -// KEGG decoder -include { kegg_decoder } from '../modules/KOs/kegg-decoder.nf' - -// antiSMASH -include { antismash } from '../modules/generic/antismash.nf' - -// Plasmid annotation with plasmidfinder -include { plasmidfinder } from '../modules/MGEs/plasmidfinder.nf' - -// Plasmid annotation with platon -include { platon } from '../modules/MGEs/platon.nf' - -// Virulence annotation with VFDB -include { vfdb } from '../modules/virulence/vfdb.nf' - -// Virulence annotation with Victors -include { victors } from '../modules/virulence/victors.nf' - -// Prophage annotation with PHAST -include { phast } from '../modules/prophages/phast.nf' - -// Prophage annotation with PHIGARO -include { phigaro } from '../modules/prophages/phigaro.nf' - -// Prophage annotation with phispy -include { phispy } from '../modules/prophages/phispy.nf' - -// ICE annotation with ICEberg db -include { iceberg } from '../modules/MGEs/iceberg.nf' - -// Genomic Islands detection with Islandpath-DIMOB -include { find_GIs } from '../modules/MGEs/islandPath_DIMOB.nf' -include 
{ draw_GIs } from '../modules/MGEs/draw_gis.nf' - -// IS identification -include { digis } from '../modules/MGEs/digIS.nf' - -// AMR annotation with ARGMiner -include { argminer } from '../modules/resistance/argminer.nf' - -// AMR annotation with Resfinder -include { resfinder } from '../modules/resistance/resfinder.nf' - -// AMR annotation with AMRFinderPlus -include { amrfinder } from '../modules/resistance/amrfinder.nf' - -// AMR annotation with CARD-RGI -include { card_rgi } from '../modules/resistance/rgi_annotation.nf' - -// Methylation calling (Nanopolish) -include { call_methylation } from '../modules/generic/methylation.nf' - -// User's custom db annotation -include { custom_blast } from '../modules/generic/custom_blast.nf' -include { custom_blast_report } from '../modules/generic/custom_blast_report.nf' - -// Merging annotation in GFF -include { merge_annotations } from '../modules/generic/merge_annotations.nf' - -// Convert GFF to GBK -include { gff2gbk } from '../modules/generic/gff2gbk.nf' - -// Convert GFF to SQL -include { create_sql } from '../modules/generic/gff2sql.nf' - -// Bedtools gff merge -include { gff_merge } from '../modules/generic/merge_gff.nf' - -// JBrowse -include { jbrowse } from '../modules/generic/jbrowse.nf' - -// Output reports -include { report } from '../modules/generic/reports.nf' - -/* - DEF WORKFLOW -*/ - -workflow SINGLE_SAMPLE { - - take: - input_genome - fast5_dir - fast5_fastqs - custom_db - - main: - - // First step -- Prokka annotation - prokka(input_genome) - - // Second step -- MLST analysis - mlst(prokka.out[3]) - - // Third step -- rRNA annotation - barrnap(prokka.out[3]) - - // Fouth step -- calculate GC content for JBrowse - compute_gc(prokka.out[3]) - - // Fifth step -- run kofamscan - if (params.skip_kofamscan == false) { - kofamscan(prokka.out[4]) - kegg_decoder(kofamscan.out[1]) - kofamscan_output = kofamscan.out[1] - kegg_decoder_svg = kegg_decoder.out[1] - } else { - kofamscan_output = Channel.empty() - 
kegg_decoder_svg = Channel.empty() - } - - /* - Sixth step -- MGE, Virulence and AMR annotations - */ - - // Plasmidfinder - if (params.skip_plasmid_search == false) { - - // plasmidfinder - plasmidfinder(prokka.out[3]) - plasmidfinder_output = plasmidfinder.out[1] - - // platon - platon(prokka.out[3]) - platon_output = platon.out[1] - } else { - plasmidfinder_output = Channel.empty() - platon_output = Channel.empty() - } - - // IslandPath software - find_GIs(prokka.out[2]) - - // Virulence search - if (params.skip_virulence_search == false) { - - // VFDB - vfdb(prokka.out[5]) - vfdb_output = vfdb.out[1] - - // Victors db - victors(prokka.out[4]) - victors_output = victors.out[1] - } else { - vfdb_output = Channel.empty() - victors_output = Channel.empty() - } - - // Prophage search - if (params.skip_prophage_search == false) { - - // PHAST db - phast(prokka.out[4]) - phast_output = phast.out[1] - - // Phigaro software - phigaro(prokka.out[3]) - phigaro_output_1 = phigaro.out[0] - phigaro_output_2 = phigaro.out[1] - - // PhiSpy - phispy(prokka.out[2]) - phispy_output = phispy.out[1] - } else { - phast_output = Channel.empty() - phigaro_output_1 = Channel.empty() - phigaro_output_2 = Channel.empty() - phispy_output = Channel.empty() - } - - // ICEs search - if (params.skip_iceberg_search == false) { - // ICEberg db - iceberg(prokka.out[4], prokka.out[3]) - iceberg_output = iceberg.out[1] - iceberg_output_2 = iceberg.out[2] - } else { - iceberg_output = Channel.empty() - iceberg_output_2 = Channel.empty() - } - - // AMR search - if (params.skip_resistance_search == false) { - - // AMRFinderPlus - amrfinder(prokka.out[4]) - amrfinder_output = amrfinder.out[0] - - // CARD-RGI - card_rgi(prokka.out[4]) - rgi_output = card_rgi.out[2] - rgi_output_parsed = card_rgi.out[1] - rgi_heatmap = card_rgi.out[3] - - // ARGMiner - argminer(prokka.out[4]) - argminer_output = argminer.out[0] - - if (params.resfinder_species) { - // Resfinder - 
resfinder(prokka.out[3].concat(Channel.value(params.resfinder_species)).collect()) - resfinder_output_1 = resfinder.out[0] - resfinder_output_2 = resfinder.out[1] - resfinder_phenotable = resfinder.out[2] - resfinder_gff = resfinder.out[3] - } else { - resfinder_output_1 = Channel.empty() - resfinder_output_2 = Channel.empty() - resfinder_phenotable = Channel.empty() - resfinder_gff = Channel.empty() - } - - } else { - rgi_output = Channel.empty() - rgi_output_parsed = Channel.empty() - rgi_heatmap = Channel.empty() - amrfinder_output = Channel.empty() - argminer_output = Channel.empty() - resfinder_output_1 = Channel.empty() - resfinder_output_2 = Channel.empty() - resfinder_phenotable = Channel.empty() - resfinder_gff = Channel.empty() - } - - /* - Seventh step -- Methylation call - */ - if (params.nanopolish_fast5 && params.nanopolish_fastq) { - call_methylation(prokka.out[3], fast5_dir, fast5_fastqs) - methylation_out_1 = call_methylation.out[2] - methylation_out_2 = call_methylation.out[3] - } else { - methylation_out_1 = Channel.empty() - methylation_out_2 = Channel.empty() - } - - /* - - Additional steps created after main releases - - */ - - // species identification - refseq_masher(prokka.out[3]) - - // IS identification - digis(prokka.out[3].join(prokka.out[2])) - - // antiSMASH - if (params.skip_antismash == false) { - antismash(prokka.out[2]) - antismash_output = antismash.out[0] - } else { - antismash_output = Channel.empty() - } - - // sequenceserver - sequenceserver(prokka.out[3].join(prokka.out[5]).join(prokka.out[4])) - - - /* - Eighth step -- Merge all annotations with the same Prefix value in a single Channel - */ - annotations_files = prokka.out[3].join(prokka.out[1]) - .join(mlst.out[0]) - .join(barrnap.out[0]) - .join(compute_gc.out[0]) - .join(kofamscan_output, remainder: true) - .join(vfdb_output, remainder: true) - .join(victors_output, remainder: true) - .join(amrfinder_output, remainder: true) - .join(resfinder_gff, remainder: true) - 
.join(rgi_output, remainder: true) - .join(iceberg_output, remainder: true) - .join(phast_output, remainder: true) - .join(phigaro_output_2, remainder: true) - .join(find_GIs.out[0], remainder: true) - - // Contatenation of annotations in a single GFF file - merge_annotations(annotations_files.join(digis.out[1], remainder: true)) - - // Plot genomic islands - draw_GIs(merge_annotations.out[0].join(find_GIs.out[0])) - - // Convert GFF file to GBK file - gff2gbk(merge_annotations.out[0].join(prokka.out[3])) - - // Convert GFF file to sqldb - create_sql(merge_annotations.out[0].join(prokka.out[5]) - .join(prokka.out[4]) - .join(prokka.out[3]) - .join(digis.out[2])) - - // User wants to merge the final gff file? - if (params.bedtools_merge_distance) { - gff_merge(merge_annotations.out[0]) - } - - /* - - Nineth step -- Perform users custom annotation - - */ - if (params.custom_db) { - custom_blast(merge_annotations.out[0].join(prokka.out[3]), custom_db) - custom_blast_report(custom_blast.out[0]) - } - - /* - Final step -- Create genome browser and reports - */ - - // Grab inputs needed for JBrowse step - jbrowse_input = merge_annotations.out[0].join(annotations_files, remainder: true) - .join(methylation_out_1, remainder: true) - .join(methylation_out_2, remainder: true) - .join(phispy_output, remainder: true) - .join(merge_annotations.out[8], remainder: true) // parsed and changed digIS - .join(antismash_output, remainder: true) - // Jbrowse Creation - jbrowse(jbrowse_input) - - // Render reports - report(jbrowse_input.join(rgi_output_parsed, remainder: true) - .join(rgi_heatmap, remainder: true) - .join(argminer_output, remainder: true) - .join(iceberg_output_2, remainder: true) - .join(plasmidfinder_output, remainder: true) - .join(resfinder_output_1, remainder: true) - .join(resfinder_output_2, remainder: true) - .join(resfinder_phenotable, remainder: true) - .join(draw_GIs.out[1], remainder: true) - .join(phigaro_output_1, remainder: true) - .join(platon_output, 
remainder: true) - .join(prokka.out[6], remainder: true) - .join(kegg_decoder_svg, remainder: true) - .join(refseq_masher.out[0], remainder: true)) - -}