From 46fcb59d2a144c73aa195e8bac54770422a407f2 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Sun, 1 Oct 2023 08:33:29 -0400 Subject: [PATCH] Wrap-up v3.3 for release (#93) * Add pre-formatted database (#82) * add pre-formatted database info * add information about pre-formatted database * 50 add tool integron finder 20 (#87) * update version * Add pre-formatted database (#83) * add pre-formatted database info * add information about pre-formatted database * update falmeida-py package * change version * change main tools to public containers * use biocontainer * aggregate other non-biocontainer tools and diminish the size of docker images * update module labels * re-arranged docker images * add integron_finder module * update amrfinder version * trying to add integron finder to gff * update docker * fixed image install * fixed integron finder 2 gff * remove unnecessary grouptuple * fix image and emboss module * fix organization * add docker image to module * fix indentation * fix indentation * added integron finder results to final GFF and JBROWSE * integron finder results added to HTML report * fix docker image * properly added to json summary * update changelog * update readme * update list of tools * update default config in docs * backscape tildes * update installation docs * fix indentation * update outputs docs * fix wrong pipeline name * fix typo * update quickstart * fixed mlst execution in singularity * fix indentation * 85 prokka module can get after modules stuck if the header file longer than 20 and not separated by tab or space (#89) * add awk command to clean big fasta headers * add awk statement to clean big fasta headers * update bakta version * fix bakta stats parsing * 81 add tool mob suite (#90) * Add pre-formatted database (#83) * add pre-formatted database info * add information about pre-formatted database * add mob suite module * added results to HTML report * Update Dockerfile * added mob_suite to json summary * add tool to markdown files * add tool information to docs * add example reports * update singularity config * fixed kofamscan download * fix dockerfile * Fix unicycler tag * use only docker images to avoid timeout error * use docker container to avoid singularity timeout * fixed resfinder for singularity * fixed docker image * fix gff2sql in singularity * use proper singularity images * fix singularity image download * fixed docker image * Add option for prebuilt db download (#94) * include module to download pre-built databases * update docs * 69 tools to use own docker image (#91) * moved container configurations of assembly modules * update default flye version * update container configuration for database-setup modules * re-organize container definition of 'generic' modules * reorganize container configuration for KO modules * reorganized container configuration for MGEs modules * finalizing container configuration reorganization of last modules * containers already defined in config files * update params schema * fixed zenodo download * mob_suite singularity image not always suited for low connection servers * add option to download container configs * update unicycler version (0.5.0--py310h6cc9453_3) * 96 error summary for bugfix release (#101) Update falmeida-py version * 98 include ices and prophage annotation in json summary (#106) * Try Dockerfile fix * Update Dockerfile * Update Dockerfile * Update CHANGELOG.md * 100 update pipeline docker images from docker tags to docker shasum (#108) * fix singularity run options * fix misc
dockerfile * update renv docker image environment * update docker images to use shasum * Update CHANGELOG.md * 107 duplicate reads to unique read names (#109) * Add pre-formatted database (#83) * add pre-formatted database info * add information about pre-formatted database * update docs and fix report links * include information of newly known issues (#103) * add parameter to enable deduplication of reads * Update manual.md * update changelog * Update docs for v3.3 (#110) * update cli help * Update installation.md * add indentation * Update README.md * Update README.md * fix tracedir * always show from copy * Update quickstart.md * Update manual.md * update citation information * add citation example * Update CHANGELOG.md --- .zenodo.json | 2 +- README.md | 79 +++------ bin/gff2sql.R | 4 +- bin/mlst-make_blast_db.sh | 23 +++ bin/run_jbrowse.sh | 120 +++++++------ conf/defaults.config | 16 +- conf/docker.config | 102 +++++++---- conf/singularity.config | 111 ++++++++---- docker/misc/Dockerfile | 113 +++++++----- docker/perlenv/Dockerfile | 38 ---- docker/perlenv/build.sh | 1 - docker/pyenv/Dockerfile | 26 --- docker/pyenv/build.sh | 1 - docker/renv/Dockerfile | 4 +- docker/renv/reports/no_integronfinder.Rmd | 1 + docker/renv/reports/report_MGEs.Rmd | 35 +++- docker/renv/reports/yes_digis.Rmd | 2 +- docker/renv/reports/yes_integronfinder.Rmd | 12 ++ docker/renv/reports/yes_plasmids.Rmd | 26 +++ docker/renv/scripts/rscripts/gff2sql.R | 139 --------------- docker/set_version.sh | 2 +- docs/config.md | 188 +------------------- docs/defaults.config | 191 +++++++++++++++++++++ docs/index.md | 6 +- docs/installation.md | 65 +++++-- docs/manual.md | 8 +- docs/outputs.md | 3 +- docs/quickstart.md | 9 +- docs/reports/report_MGEs.html | 83 +++++---- docs/reports/report_general.html | 22 +-- docs/reports/report_resistance.html | 22 +-- docs/reports/report_virulence.html | 12 +- docs/requirements.txt | 3 +- lib/WorkflowBacannot.groovy | 16 +- lib/WorkflowMain.groovy | 26 ++- main.nf | 2 +- markdown/CHANGELOG.md | 14 ++ markdown/list_of_tools.md | 4 +- mkdocs.yml | 2 + modules/KOs/kofamscan.nf | 2 +- modules/MGEs/draw_gis.nf | 1 - modules/MGEs/integron_finder.nf | 42 +++++ modules/MGEs/integron_finder_2gff.nf | 24 +++ modules/MGEs/islandpath.nf | 4 +- modules/MGEs/mob_suite.nf | 36 ++++ modules/MGEs/plasmidfinder.nf | 2 +- modules/MGEs/platon.nf | 2 +- modules/assembly/flye.nf | 11 +- modules/assembly/unicycler.nf | 49 +++++- modules/bacannot_dbs/amrfinder.nf | 4 +- modules/bacannot_dbs/antismash.nf | 2 +- modules/bacannot_dbs/argminer.nf | 2 +- modules/bacannot_dbs/card.nf | 4 +- modules/bacannot_dbs/get_zenodo.nf | 19 ++ modules/bacannot_dbs/iceberg.nf | 2 +- modules/bacannot_dbs/kofamscan.nf | 17 +- modules/bacannot_dbs/mlst.nf | 2 +- modules/bacannot_dbs/phast.nf | 2 +- modules/bacannot_dbs/phigaro.nf | 2 +- modules/bacannot_dbs/plasmidfinder.nf | 2 +- modules/bacannot_dbs/platon.nf | 2 +- modules/bacannot_dbs/prokka.nf | 2 +- modules/bacannot_dbs/resfinder.nf | 2 +- modules/bacannot_dbs/vfdb.nf | 2 +- modules/bacannot_dbs/victors.nf | 2 +- modules/generic/bakta.nf | 13 +- modules/generic/barrnap.nf | 2 +- modules/generic/circos.nf | 3 +- modules/generic/gc_skew.nf | 3 +- modules/generic/gff2gbk.nf | 3 - modules/generic/gff2sql.nf | 3 - modules/generic/jbrowse.nf | 5 +- modules/generic/karyotype.nf | 1 - modules/generic/mash.nf | 2 +- modules/generic/merge_annotations.nf | 8 +- modules/generic/merge_summaries.nf | 1 - modules/generic/mlst.nf | 10 +- modules/generic/prepare_circos.nf | 1 - 
modules/generic/prokka.nf | 23 ++- modules/generic/reports.nf | 98 ++++++----- modules/generic/sequenceserver.nf | 3 +- modules/generic/summary.nf | 40 +++-- modules/prophages/phigaro.nf | 7 +- modules/prophages/phispy.nf | 2 +- modules/resistance/amrfinder.nf | 2 +- modules/resistance/amrfinder2tsv.nf | 1 - modules/resistance/resfinder.nf | 63 +++---- modules/resistance/rgi_annotation.nf | 7 +- modules/virulence/vfdb2tsv.nf | 1 - nextflow.config | 15 +- nextflow_schema.json | 66 ++++--- workflows/bacannot.nf | 63 ++++--- workflows/bacannot_dbs.nf | 35 ++-- 93 files changed, 1293 insertions(+), 962 deletions(-) create mode 100755 bin/mlst-make_blast_db.sh delete mode 100644 docker/perlenv/Dockerfile delete mode 100644 docker/perlenv/build.sh delete mode 100644 docker/pyenv/Dockerfile delete mode 100644 docker/pyenv/build.sh create mode 100644 docker/renv/reports/no_integronfinder.Rmd create mode 100644 docker/renv/reports/yes_integronfinder.Rmd delete mode 100644 docker/renv/scripts/rscripts/gff2sql.R create mode 100644 docs/defaults.config create mode 100644 modules/MGEs/integron_finder.nf create mode 100644 modules/MGEs/integron_finder_2gff.nf create mode 100644 modules/MGEs/mob_suite.nf create mode 100644 modules/bacannot_dbs/get_zenodo.nf diff --git a/.zenodo.json b/.zenodo.json index 64902e93..3865b55a 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -2,7 +2,7 @@ "description": "

The pipeline

\n\n

bacannot is a customisable, easy-to-use pipeline that uses state-of-the-art software for comprehensively annotating prokaryotic genomes, having only Docker and Nextflow as dependencies. It is able to annotate and detect virulence and resistance genes, plasmids, secondary metabolites, genomic islands, prophages, ICEs, KO, and more, while providing nice and beautiful interactive documents for results exploration.

", "license": "other-open", "title": "fmalmeida/bacannot: A generic but comprehensive bacterial annotation pipeline", - "version": "v3.2", + "version": "v3.3", "upload_type": "software", "creators": [ { diff --git a/README.md b/README.md index aa3fe83f..2eda798d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.3627669-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.3627669) +[![F1000 Paper](https://img.shields.io/badge/Citation%20F1000-10.12688/f1000research.139488.1-orange)](https://doi.org/10.12688/f1000research.139488.1) [![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/fmalmeida/bacannot?include_prereleases&label=Latest%20release)](https://github.com/fmalmeida/bacannot/releases) [![Documentation](https://img.shields.io/badge/Documentation-readthedocs-brightgreen)](https://bacannot.readthedocs.io/en/latest/?badge=latest) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.10.3-23aa62.svg?labelColor=000000)](https://www.nextflow.io/) @@ -8,6 +8,7 @@ [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![License](https://img.shields.io/badge/License-GPL%203-black)](https://github.com/fmalmeida/bacannot/blob/master/LICENSE) [![Follow on Twitter](http://img.shields.io/badge/twitter-%40fmarquesalmeida-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/fmarquesalmeida) +[![Zenodo Archive](https://img.shields.io/badge/Zenodo-Archive-blue)](https://doi.org/10.5281/zenodo.3627669) [![Open in Gitpod](https://gitpod.io/button/open-in-gitpod.svg)](https://gitpod.io/github.com/fmalmeida/bacannot) @@ -47,8 +48,9 @@ Its main steps are: | Annotation of virulence genes | [Victors](http://www.phidias.us/victors/) and [VFDB](http://www.mgc.ac.cn/VFs/main.htm) | | Prophage sequences and genes annotation | [PHASTER](http://phast.wishartlab.com/), [Phigaro](https://github.com/bobeobibo/phigaro) and [PhySpy](https://github.com/linsalrob/PhiSpy) | | Annotation of integrative and conjugative elements | [ICEberg](http://db-mml.sjtu.edu.cn/ICEberg/) | +| Annotation of bacterial integrons | [Integron Finder](https://github.com/gem-pasteur/Integron_Finder) | | Focused detection of insertion sequences | [digIS](https://github.com/janka2012/digIS) | -| _In silico_ detection of plasmids | [Plasmidfinder](https://cge.cbs.dtu.dk/services/PlasmidFinder/) and [Platon](https://github.com/oschwengers/platon) | +| _In silico_ detection and typing of plasmids | [Plasmidfinder](https://cge.cbs.dtu.dk/services/PlasmidFinder/), [Platon](https://github.com/oschwengers/platon) and [MOB-typer](https://github.com/phac-nml/mob-suite)| | Prediction and visualization of genomic islands | [IslandPath-DIMOB](https://github.com/brinkmanlab/islandpath) and [gff-toolbox](https://github.com/fmalmeida/gff-toolbox) | | Custom annotation from formatted FASTA or NCBI protein IDs | [BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs) | | Merge of annotation results | [bedtools](https://bedtools.readthedocs.io/en/latest/) | @@ -86,18 +88,7 @@ These images have been kept separate to not create massive Docker image and to a ## Installation -1. If you don't have it already install [Docker](https://docs.docker.com/) in your computer. 
- * After installed, you need to download the required Docker images - - ```bash - docker pull fmalmeida/bacannot:v3.2_misc ; - docker pull fmalmeida/bacannot:v3.2_perlenv ; - docker pull fmalmeida/bacannot:v3.2_pyenv ; - docker pull fmalmeida/bacannot:v3.2_renv ; - docker pull fmalmeida/bacannot:jbrowse ; - ``` - -🔥 Nextflow can also automatically handle images download on the fly when executed. If docker has exceeded its download limit rates, please try again in a few hours. +1. If you don't have it already, install either [Docker](https://docs.docker.com/) or [Singularity](https://docs.sylabs.io/guides/3.5/user-guide/index.html) in your computer. 2. Install Nextflow (version 20.10 or higher): @@ -111,48 +102,7 @@ These images have been kept separate to not create massive Docker image and to a 🔥 Users can keep the pipeline always updated with: `nextflow pull fmalmeida/bacannot` -### Downloading and updating databases - -Bacannot databases are not inside the docker images anymore to avoid huge images and problems with connections and limit rates with dockerhub. - -#### Pre-formatted - -Users can directly download pre-formatted databases from Zenodo: https://doi.org/10.5281/zenodo.7615811 - -Useful for standardization and also overcoming known issues that may arise when formatting databases with `singularity` profile. - -#### I want to generate a new formatted database - -To download and format a copy of required bacannot databases users can execute the following: - -```bash -# Download pipeline databases -nextflow run fmalmeida/bacannot --get_dbs --output bacannot_dbs -profile -``` - -This will produce a directory like this: - -```bash -bacannot_dbs -├── amrfinder_db -├── antismash_db -├── argminer_db -├── card_db -├── iceberg_db -├── kofamscan_db -├── mlst_db -├── phast_db -├── phigaro_db -├── pipeline_info -├── plasmidfinder_db -├── platon_db -├── prokka_db -├── resfinder_db -├── vfdb_db -└── victors_db -``` - -> To update databases you can either download a new one to a new directory. Remove the database you want to get a new one from the root bacannot dir and use the same command above to save in the same directory (the pipeline will only try to download missing databases). Or, you can use the parameter `--force_update` to download everything again. +Please refer to the installation page for a complete guide on required images and databases. » ## Quickstart @@ -185,6 +135,17 @@ Create a configuration file in your working directory: nextflow run fmalmeida/bacannot --get_config +##### Overwrite container versions with config + +The pipeline uses pre-set docker and singularity configuration files to set all the containers and versions of images that should be used by each module in the pipeline. + +Although not recommended, one can use these configuration files to change the version of specific tools if desired. + +To download these configs one can: + + nextflow run fmalmeida/bacannot --get_docker_config + nextflow run fmalmeida/bacannot --get_singularity_config + ### Interactive graphical configuration and execution #### Via NF tower launchpad (good for cloud env execution) @@ -234,7 +195,11 @@ It will result in the following: ## Citation -To cite this tool please refer to our [Zenodo tag](https://doi.org/10.5281/zenodo.3627669). +In order to cite this pipeline, please refer to: + +> Almeida FMd, Campos TAd and Pappas Jr GJ.
Scalable and versatile container-based pipelines for de novo genome assembly and bacterial annotation. [version 1; peer review: awaiting peer review]. F1000Research 2023, 12:1205 (https://doi.org/10.12688/f1000research.139488.1) + +Additionally, archived versions of the pipeline are also found in [Zenodo](https://doi.org/10.5281/zenodo.3627669). This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [GPLv3](https://github.com/fmalmeida/bacannot/blob/master/LICENSE). diff --git a/bin/gff2sql.R b/bin/gff2sql.R index 61eac29d..94920dfc 100755 --- a/bin/gff2sql.R +++ b/bin/gff2sql.R @@ -54,8 +54,8 @@ addTable <- function (con, sql, input) { # Loading SQL database driver drv <- dbDriver("SQLite") -dbname <- file.path("/work", opt$out) -con <- dbConnect(drv, dbname=dbname) +print(opt$out) +con <- dbConnect(drv, dbname=opt$out) ##################################### ### First STEP load GENOME to sql ### diff --git a/bin/mlst-make_blast_db.sh b/bin/mlst-make_blast_db.sh new file mode 100755 index 00000000..4cc370f8 --- /dev/null +++ b/bin/mlst-make_blast_db.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +MLSTDIR="$1" +BLASTDIR="$DIR/../db/blast" +BLASTFILE="$BLASTDIR/mlst.fa" + +mkdir -p "$BLASTDIR" +rm -f "$BLASTFILE" + +#for N in $(find $MLSTDIR -maxdepth 1 | grep -v '_2$'); do +for N in $(find $MLSTDIR -mindepth 1 -maxdepth 1 -type d); do + SCHEME=$(basename $N) + echo "Adding: $SCHEME" + cat "$MLSTDIR"/$SCHEME/*.tfa \ + | grep -v 'not a locus' \ + | sed -e "s/^>/>$SCHEME./" \ + >> "$BLASTFILE" +done + +makeblastdb -hash_index -in "$BLASTFILE" -dbtype nucl -title "PubMLST" -parse_seqids + +echo "Created BLAST database for $BLASTFILE" diff --git a/bin/run_jbrowse.sh b/bin/run_jbrowse.sh index 75eb3283..6753ca15 100755 --- a/bin/run_jbrowse.sh +++ b/bin/run_jbrowse.sh @@ -14,7 +14,7 @@ Help() echo "Simple help message for the utilization of this script" echo "It takes the jbrowse data path and all the files that shall be plotted from bacannot" echo - echo "Syntax: run_jbrowse.sh [-h|p|g|b|s|f|r|B|P|G|m|S|R|d|A]" + echo "Syntax: run_jbrowse.sh [-h|p|g|b|s|f|r|B|P|G|m|S|R|d|A|i]" echo "options:" echo echo "h Print this help" @@ -32,59 +32,63 @@ Help() echo "R Path to Resfinder custom GFF" echo "d Path to digIS custom GFF" echo "A Path to antismash custom GFF" + echo "i Path to Integron Finder custom GFF" echo "" echo } # Get the options -while getopts "hp:g:b:s:f:r:B:P:G:m:S:R:d:A:" option; do - case $option in - h) # display Help - Help - exit;; -p) # get genome prefix - PREFIX="$OPTARG" - ;; -g) # get genome FASTA - GENOME="$OPTARG" - ;; -b) # get GC bedgraph - BEDGRAPH="$OPTARG" - ;; -s) # get chr sizes - CHRSIZES="$OPTARG" - ;; -f) # get prokka gff - PROKKAGFF="$OPTARG" - ;; -r) # get barrnap gff - rRNAGFF="$OPTARG" - ;; -B) # get phigaro bed - PHIGAROBED="$OPTARG" - ;; -P) # get phispy bed - PHISPYBED="$OPTARG" - ;; -G) # get GIs bed - GIBED="$OPTARG" - ;; -m) # get nanopolish methylation - NANOMETHYL="$OPTARG" - ;; -S) # get nanopolish chr sizes - NANOSIZES="$OPTARG" - ;; -R) # get resfinder GFF - RESFINDERGFF="$OPTARG" - ;; -d) # get digIS GFF - DIGISGFF="$OPTARG" - ;; -A) # get antismash GFF - ANTISMASHGFF="$OPTARG" - ;; - esac +while getopts "hp:g:b:s:f:r:B:P:G:m:S:R:d:A:i:" option; do + case $option in + h) # display Help + Help + exit;; + p) # get genome prefix + PREFIX="$OPTARG" + ;; + g) # get genome FASTA + GENOME="$OPTARG" + ;; + b) # get GC bedgraph +
BEDGRAPH="$OPTARG" + ;; + s) # get chr sizes + CHRSIZES="$OPTARG" + ;; + f) # get prokka gff + PROKKAGFF="$OPTARG" + ;; + r) # get barrnap gff + rRNAGFF="$OPTARG" + ;; + B) # get phigaro bed + PHIGAROBED="$OPTARG" + ;; + P) # get phispy bed + PHISPYBED="$OPTARG" + ;; + G) # get GIs bed + GIBED="$OPTARG" + ;; + m) # get nanopolish methylation + NANOMETHYL="$OPTARG" + ;; + S) # get nanopolish chr sizes + NANOSIZES="$OPTARG" + ;; + R) # get resfinder GFF + RESFINDERGFF="$OPTARG" + ;; + d) # get digIS GFF + DIGISGFF="$OPTARG" + ;; + A) # get antismash GFF + ANTISMASHGFF="$OPTARG" + ;; + i) # get integron finder GFF + INTEGRONFINDERGFF="$OPTARG" + ;; + esac done # Main @@ -313,7 +317,7 @@ remove-track.pl --trackLabel "${PREFIX} CARD-RGI resistance features" --dir data --trackLabel "${PREFIX} Resfinder resistance features" --out "data" --nameAttributes "Resfinder_gene,ID,Resfinder_phenotype" ; remove-track.pl --trackLabel "${PREFIX} Resfinder resistance features" --dir data &> /tmp/error [ ! -s $RESFINDERGFF ] || echo -E " { \"compress\" : 0, \ - \"displayMode\" : \"compact\", \ + \"displayMode\" : \"compact\", \ \"key\" : \"${PREFIX} Resfinder resistance features\", \ \"category\" : \"Resistance annotation\", \ \"label\" : \"${PREFIX} Resfinder resistance features\", \ @@ -343,6 +347,22 @@ remove-track.pl --trackLabel "${PREFIX} ICE genes from ICEberg database" --dir d \"urlTemplate\" : \"tracks/${PREFIX} ICE genes from ICEberg database/{refseq}/trackData.json\" } " | add-track-json.pl data/trackList.json [ $(grep "ICEberg" $PROKKAGFF | wc -l) -eq 0 ] || rm -f iceberg ices ; +## Integron Finder +[ $(wc -l $INTEGRONFINDERGFF) -eq 0 ] || flatfile-to-json.pl --gff $INTEGRONFINDERGFF --key "${PREFIX} Annotated Integrons - Integron Finder" --trackType CanvasFeatures \ +--trackLabel "${PREFIX} Annotated Integrons - Integron Finder" --out "data" --nameAttributes "ID,integron_type" ; +remove-track.pl --trackLabel "${PREFIX} Annotated Integrons - Integron Finder" --dir data &> /tmp/error +[ $(wc -l $INTEGRONFINDERGFF) -eq 0 ] || echo -E " { \"compress\" : 0, \ +\"displayMode\" : \"compact\", \ + \"key\" : \"${PREFIX} Annotated Integrons - Integron Finder\", \ + \"category\" : \"MGEs annotation\", \ + \"label\" : \"${PREFIX} Annotated Integrons - Integron Finder\", \ + \"storeClass\" : \"JBrowse/Store/SeqFeature/NCList\", \ + \"style\" : { \"className\" : \"feature\", \"color\": \"#6db6d9\" }, \ + \"trackType\" : \"CanvasFeatures\", \ + \"type\" : \"CanvasFeatures\", \ + \"nameAttributes\" : \"ID,integron_type\", \ + \"urlTemplate\" : \"tracks/${PREFIX} Annotated Integrons - Integron Finder/{refseq}/trackData.json\" } " | add-track-json.pl data/trackList.json + ## PROPHAGES ### PHAST [ $(grep "PHAST" $PROKKAGFF | wc -l) -eq 0 ] || grep "PHAST" $PROKKAGFF > prophage ; diff --git a/conf/defaults.config b/conf/defaults.config index 3a795347..0d43fe48 100644 --- a/conf/defaults.config +++ b/conf/defaults.config @@ -14,8 +14,9 @@ params { // Trigger database download and formatting workflow? --> will not run annotation // Will download and format a database inside {output} parameter - get_dbs = false - force_update = false + get_dbs = false + force_update = false + get_zenodo_db = false // download pre-built database /* @@ -31,6 +32,9 @@ params { // It is also documented in the main manual: https://bacannot.readthedocs.io/en/latest/samplesheet input = null +// Enable reads deduplication for assembly? 
(If input has reads) + enable_deduplication = false + // path to directory containing databases used by bacannot // you can download databases with: // nextflow run fmalmeida/bacannot --get_dbs --output bacannot_dbs -profile @@ -175,13 +179,13 @@ params { // Select versions of bioconda quay.io additional tools // Tools that are not part of the core of the pipeline, // but can eventually be used by users - unicycler_version = '0.4.8--py38h8162308_3' - flye_version = '2.9--py39h39abbe0_0' - bakta_version = '1.6.1--pyhdfd78af_0' + unicycler_version = '0.5.0--py310h6cc9453_3' + flye_version = '2.9--py39h6935b12_1' + bakta_version = '1.7.0--pyhdfd78af_1' // Max resource options max_memory = '20.GB' max_cpus = 16 max_time = '40.h' -} \ No newline at end of file +} diff --git a/conf/docker.config b/conf/docker.config index 84710c67..28ffa1ab 100644 --- a/conf/docker.config +++ b/conf/docker.config @@ -14,63 +14,97 @@ docker { // specific images process { - // container with various tools for general purposes + // + // Custom pipeline's containers with various tools for general purposes + // withLabel: 'db_download|db_tools|misc' { - container = 'fmalmeida/bacannot:v3.2_misc' - } - - // container for perl tools - withLabel: 'perl' { - container = 'fmalmeida/bacannot:v3.2_perlenv' - } - - // container for python tools - withLabel: 'python' { - container = 'fmalmeida/bacannot:v3.2_pyenv' + container = 'fmalmeida/bacannot@sha256:726e085f1bd71b47c2d8a38fd46d812aab7eb8978bab7bf3cde3aa2b7b3e0f2c' } // container for R tools withLabel: 'renv' { - container = 'fmalmeida/bacannot:v3.2_renv' + container = 'fmalmeida/bacannot@sha256:952f58a2c03e50f8a376073346fb1ccda28d6249e3fdfea07a3286a6ff1adf0c' } // container for bacannot server withLabel: 'server' { - container = 'fmalmeida/bacannot:server' + container = 'fmalmeida/bacannot@sha256:0ec3b289d6e0c624556d125b2ed9b63499178e266a315175fd87cf020a402898' } withLabel: 'jbrowse' { - container = 'fmalmeida/bacannot:jbrowse' + container = 'fmalmeida/bacannot@sha256:6afdca17b561bf212c1f976422aee3fe047563c32a15112a6262556d1f75201e' + } + + // + // Public containers used within the pipeline + // + withName: FLYE { + container = "quay.io/biocontainers/flye:${params.flye_version}" } withName: UNICYCLER { - container = "quay.io/biocontainers/unicycler:${params.unicycler_version}" + container = "quay.io/biocontainers/unicycler:${params.unicycler_version}" } - withName: FLYE { - container = "quay.io/biocontainers/flye:${params.flye_version}" + withName: 'AMRFINDER_DB|AMRFINDER' { + container = "ncbi/amr:3.11.2-2022-12-19.1" } withName: BAKTA { - container = "quay.io/biocontainers/bakta:${params.bakta_version}" + container = "quay.io/biocontainers/bakta:${params.bakta_version}" } - /* - * Other (non-image) customization - */ - - // islandPath dimob container - withName: 'ISLANDPATH' { - // it generally fails without any reason on the first time - errorStrategy = 'retry' - maxRetries = 5 + withName: BARRNAP { + container = "quay.io/biocontainers/barrnap:0.9--hdfd78af_4" } - - // kofamscan container - withName: 'KOFAMSCAN' { - // it generally fails without any reason on the first time - errorStrategy = 'retry' - maxRetries = 2 + + withName: REFSEQ_MASHER { + container = "quay.io/biocontainers/refseq_masher:0.1.2--py_0" + } + + withName: MLST { + container = "quay.io/biocontainers/mlst:2.19.0--hdfd78af_1" + } + + withName: PROKKA { + container = "quay.io/biocontainers/prokka:1.14.6--pl5321hdfd78af_4" + } + + withName: KOFAMSCAN { + container = 
"quay.io/biocontainers/kofamscan:1.3.0--hdfd78af_2" } + + withName: INTEGRON_FINDER { + container = "quay.io/biocontainers/integron_finder:2.0.1--pyhdfd78af_0" + } + + withName: ISLANDPATH { + container = "quay.io/biocontainers/islandpath:1.0.6--hdfd78af_0" + } + + withName: MOBSUITE { + container = "quay.io/biocontainers/mob_suite:3.1.4--pyhdfd78af_0" + } + + withName: PLASMIDFINDER { + container = "quay.io/biocontainers/plasmidfinder:2.1.6--py310hdfd78af_1" + } + + withName: PLATON { + container = "quay.io/biocontainers/platon:1.6--pyhdfd78af_1" + } + + withName: PHIGARO { + container = "quay.io/biocontainers/phigaro:2.3.0--pyh7b7c402_0" + } + + withName: PHISPY { + container = "quay.io/biocontainers/phispy:4.2.21--py39h7cff6ad_0" + } + + withName: CARD_RGI { + container = "quay.io/biocontainers/rgi:5.2.1--pyhdfd78af_1" + } + } diff --git a/conf/singularity.config b/conf/singularity.config index a7ca1839..f75a7559 100644 --- a/conf/singularity.config +++ b/conf/singularity.config @@ -1,7 +1,9 @@ // Container usage and permission -docker.enabled = false -singularity.enabled = true -singularity.runOptions = '--writable-tmpfs' +docker.enabled = false +singularity.enabled = true +singularity.runOptions = '--writable-tmpfs -e --no-home -B $PWD' +singularity.autoMounts = true +env.SINGULARITY_DISABLE_CACHE = 1 /* @@ -12,63 +14,98 @@ singularity.runOptions = '--writable-tmpfs' // specific images process { - // container with various tools for general purposes + // + // Custom pipeline's containers with various tools for general purposes + // withLabel: 'db_download|db_tools|misc' { - container = 'docker://fmalmeida/bacannot:v3.2_misc' - } - - // container for perl tools - withLabel: 'perl' { - container = 'docker://fmalmeida/bacannot:v3.2_perlenv' - } - - // container for python tools - withLabel: 'python' { - container = 'docker://fmalmeida/bacannot:v3.2_pyenv' + container = 'docker://fmalmeida/bacannot@sha256:726e085f1bd71b47c2d8a38fd46d812aab7eb8978bab7bf3cde3aa2b7b3e0f2c' } // container for R tools withLabel: 'renv' { - container = 'docker://fmalmeida/bacannot:v3.2_renv' + container = 'docker://fmalmeida/bacannot@sha256:952f58a2c03e50f8a376073346fb1ccda28d6249e3fdfea07a3286a6ff1adf0c' } // container for bacannot server withLabel: 'server' { - container = 'docker://fmalmeida/bacannot:server' + container = 'docker://fmalmeida/bacannot@sha256:0ec3b289d6e0c624556d125b2ed9b63499178e266a315175fd87cf020a402898' } withLabel: 'jbrowse' { - container = 'docker://fmalmeida/bacannot:jbrowse' + container = 'docker://fmalmeida/bacannot@sha256:6afdca17b561bf212c1f976422aee3fe047563c32a15112a6262556d1f75201e' + } + + // + // Public containers used within the pipeline + // + withName: FLYE { + container = "https://depot.galaxyproject.org/singularity/flye:${params.flye_version}" } withName: UNICYCLER { - container = "https://depot.galaxyproject.org/singularity/unicycler:${params.unicycler_version}" + container = "https://depot.galaxyproject.org/singularity/unicycler:${params.unicycler_version}" } - withName: FLYE { - container = "https://depot.galaxyproject.org/singularity/flye:${params.flye_version}" + withName: 'AMRFINDER_DB|AMRFINDER' { + container = "docker://ncbi/amr:3.11.2-2022-12-19.1" } withName: BAKTA { - container = "https://depot.galaxyproject.org/singularity/bakta:${params.bakta_version}" + container = "https://depot.galaxyproject.org/singularity/bakta:${params.bakta_version}" } - /* - * Other (non-image) customization - */ - - // islandPath dimob container - withName: 'ISLANDPATH' { - // it 
generally fails without any reason on the first time - errorStrategy = 'retry' - maxRetries = 5 + withName: BARRNAP { + container = "https://depot.galaxyproject.org/singularity/barrnap:0.9--hdfd78af_4" } - - // kofamscan container - withName: 'KOFAMSCAN' { - // it generally fails without any reason on the first time - errorStrategy = 'retry' - maxRetries = 2 + + withName: REFSEQ_MASHER { + container = "https://depot.galaxyproject.org/singularity/refseq_masher:0.1.2--py_0" + } + + withName: MLST { + container = "https://depot.galaxyproject.org/singularity/mlst:2.19.0--hdfd78af_1" + } + + withName: PROKKA { + container = "https://depot.galaxyproject.org/singularity/prokka:1.14.6--pl5321hdfd78af_4" + } + + withName: KOFAMSCAN { + container = "https://depot.galaxyproject.org/singularity/kofamscan:1.3.0--hdfd78af_2" } + + withName: INTEGRON_FINDER { + container = "https://depot.galaxyproject.org/singularity/integron_finder:2.0.1--pyhdfd78af_0" + } + + withName: ISLANDPATH { + container = "https://depot.galaxyproject.org/singularity/islandpath:1.0.6--hdfd78af_0" + } + + withName: MOBSUITE { + // container = "https://depot.galaxyproject.org/singularity/mob_suite:3.1.4--pyhdfd78af_0" + container = "docker://quay.io/biocontainers/mob_suite:3.1.4--pyhdfd78af_0" + } + + withName: PLASMIDFINDER { + container = "https://depot.galaxyproject.org/singularity/plasmidfinder:2.1.6--py310hdfd78af_1" + } + + withName: PLATON { + container = "https://depot.galaxyproject.org/singularity/platon:1.6--pyhdfd78af_1" + } + + withName: PHIGARO { + container = "https://depot.galaxyproject.org/singularity/phigaro:2.3.0--pyh7b7c402_0" + } + + withName: PHISPY { + container = "https://depot.galaxyproject.org/singularity/phispy:4.2.21--py39h7cff6ad_0" + } + + withName: CARD_RGI { + container = "https://depot.galaxyproject.org/singularity/rgi:5.2.1--pyhdfd78af_1" + } + } diff --git a/docker/misc/Dockerfile b/docker/misc/Dockerfile index 3853a59f..3e7c3422 100644 --- a/docker/misc/Dockerfile +++ b/docker/misc/Dockerfile @@ -2,42 +2,63 @@ FROM nfcore/base LABEL authors="Felipe Almeida" \ description="Docker image containing any-based bacannot tools" -# Install the conda environment -RUN conda install -c bioconda -c defaults -c conda-forge -c anaconda -c falmeida --force-reinstall --update-deps --no-channel-priority \ - curl \ - git \ - 'python=3.7' \ - 'blast=2.12.0' \ - 'diamond=2.0.15' \ - 'bedtools=2.30' \ - 'samtools=1.14' \ - 'kma' \ - 'kofamscan' \ - 'ncbi-amrfinderplus' \ - 'nanopolish' \ - 'biopython==1.78' \ - gff-toolbox \ - seqkit && \ +# install mamba +RUN conda install -n base -c conda-forge 'mamba=1.5' --yes && \ conda clean -afy +RUN pip install --upgrade pip -# Create env for digIS -RUN conda create -y -n digIS -c bioconda -c defaults -c conda-forge -c anaconda -c falmeida --no-channel-priority 'hmmer==3.1b2' 'biopython==1.77' nomkl && \ - conda clean -afy +# Install the conda environment +RUN mamba install -y \ + -c bioconda -c defaults -c conda-forge -c anaconda -c falmeida \ + --no-channel-priority \ + 'python=3.9' \ + 'blast>=2.12' \ + 'diamond>=2.0.15' \ + 'bedtools>=2.30' \ + 'kma' \ + 'nanopolish' \ + 'biopython==1.78' \ + seqkit \ + bioawk \ + 'easy_circos==0.4' \ + 'conda-forge::openssl>=1.1' \ + 'pyproj=3.2' \ + emboss \ + libtiff \ + jq && \ + mamba clean -afy +RUN git clone https://github.com/fmalmeida/pythonScripts.git && \ + cd pythonScripts && \ + pip install . 
&& \ + falmeida-py --help -# Create env for antismash -RUN conda create -y -n antismash \ - -c bioconda -c defaults -c conda-forge -c anaconda -c falmeida --no-channel-priority \ - 'antismash>=6' 'anaconda::jinja2' 'anaconda::markupsafe' emboss nomkl && \ - rm -r /opt/conda/envs/antismash/lib/*/site-packages/antismash/databases && \ - conda clean -afy +# Install samtools +RUN apt-get update -y && apt-get install -y samtools + +# Install gff-toolbox +RUN git clone https://github.com/fmalmeida/gff-toolbox.git +RUN cd gff-toolbox && \ + python3 -m pip install --upgrade pip 'matplotlib==3.7.3' && \ + python3 setup.py install && \ + gff-toolbox -h + +# Create env for digIS +RUN mamba create -y \ + -c bioconda -c defaults -c conda-forge -c anaconda -c falmeida \ + -n digIS \ + --no-channel-priority \ + 'hmmer==3.1b2' 'biopython==1.77' nomkl && \ + mamba clean -afy # Install pip packages -RUN pip install docopt pandas tabulate numpy bcbio-gff cgecore gitpython setuptools python-dateutil 'biopython==1.78' +# RUN pip install docopt pandas tabulate numpy bcbio-gff cgecore gitpython setuptools python-dateutil 'biopython==1.78' # Install KEGGDecoder -RUN conda create -n KEGGDecoder python=3.6 && \ +RUN mamba create \ + -n KEGGDecoder \ + python=3.6 && \ conda run -n KEGGDecoder python3 -m pip install KEGGDecoder && \ - conda clean -afy + mamba clean -afy # set CONDA_PREFIX ENV CONDA_PREFIX=/opt/conda @@ -48,29 +69,35 @@ COPY argminer_bkp/argminer.fasta /work/argminer.fasta COPY victors_bkp/victors_06-2022.fasta /work/victors.fasta # get a copy of resfinder -RUN conda create -y -n resfinder \ - -c bioconda -c anaconda -c conda-forge -c defaults \ +RUN mamba create -y -n resfinder \ + -c bioconda -c defaults -c conda-forge -c anaconda -c falmeida \ 'resfinder>=4.1' docopt pandas && \ - conda clean -afy + mamba clean -afy # get a copy of digis RUN git clone -b master https://github.com/janka2012/digIS.git COPY custom_fix_grange_digis.py /work/digIS/src/common/grange.py ENV PATH=/work/digIS:$PATH -# install jq -RUN apt-get update && apt-get install -y jq +# Create env for antismash +RUN mamba create -y -n antismash -c bioconda -c conda-forge \ + 'bioconda::antismash>=6' 'anaconda::flask' 'anaconda::jinja2' 'anaconda::markupsafe' nomkl && \ + rm -rf /opt/conda/envs/antismash/lib/*/site-packages/antismash/databases && \ + mamba clean -afy + +# fix bioperl +RUN mamba create -n perl -y \ + -c bioconda -c conda-forge -c anaconda -c defaults \ + perl-bioperl perl-app-cpanminus perl-yaml +RUN mamba run -n perl PERL5LIB= PERL_LOCAL_LIB_ROOT= cpanm Bio::Root::RootI -# install bioawk -RUN conda create -y -n bioawk \ - -c bioconda -c anaconda -c conda-forge -c defaults \ - bioawk && \ - conda clean -afy +# fix python +RUN python3 -m pip install cryptography==38.0.4 + +# install get zenodo +RUN pip3 install zenodo_get # fix permissions RUN chmod 777 -R /work -RUN chmod 777 -R /opt/conda/envs/antismash/lib/*/site-packages/antismash -RUN chmod 777 -R /opt/conda/envs/resfinder - -# fix antismash download script -# RUN sed -i 's|ftp://|http://|g' /opt/conda/envs/antismash/lib/*/site-packages/antismash/download_databases.py \ No newline at end of file +RUN chmod 777 -R /opt/conda/envs/antismash/lib/**/site-packages/antismash +RUN chmod 777 -R /opt/conda/envs/resfinder \ No newline at end of file diff --git a/docker/perlenv/Dockerfile b/docker/perlenv/Dockerfile deleted file mode 100644 index 2e8b1d24..00000000 --- a/docker/perlenv/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -FROM nfcore/base -LABEL authors="Felipe Almeida" 
-LABEL description="Docker image containing perl-based bacannot tools" -ENV IMAGE=perl - -# Install the conda environment -RUN conda install \ - -c bioconda -c defaults -c conda-forge -c anaconda -c r -c falmeida \ - --force-reinstall --update-deps --no-channel-priority \ - 'prokka>=1.14' \ - 'hmmer=3.1b2' \ - barrnap \ - mlst \ - 'islandpath>=1.0.6' \ - 'python>3' \ - 'perl>=5.26' \ - 'perl-bioperl>=1.7.8' \ - perl-digest-sha1 \ - perl-app-cpanminus \ - perl-local-lib \ - 'easy_circos>=0.3' -ENV PERL5LIB="/opt/conda/lib/perl5/site_perl" - -# Fix perl -- prokka -RUN apt-get update -y && apt-get install -y build-essential libexpat1-dev -RUN cpanm Test::Needs --force --reinstall -RUN cpanm Test::RequiresInternet Test::NoWarnings --force --reinstall -RUN conda install -c conda-forge -y 'perl-xml-parser>2.44' -RUN cpanm XML::Twig Bio::Perl --force --reinstall || true -RUN apt-get install -y libtiff5 - -# set CONDA_PREFIX -ENV CONDA_PREFIX=/opt/conda - -WORKDIR /work - -# fix permissions for singularity -RUN chmod -R 777 /work /opt/conda/db \ No newline at end of file diff --git a/docker/perlenv/build.sh b/docker/perlenv/build.sh deleted file mode 100644 index 51153a5c..00000000 --- a/docker/perlenv/build.sh +++ /dev/null @@ -1 +0,0 @@ -../../bin/build_image.sh $1 diff --git a/docker/pyenv/Dockerfile b/docker/pyenv/Dockerfile deleted file mode 100644 index 3a4fe8bc..00000000 --- a/docker/pyenv/Dockerfile +++ /dev/null @@ -1,26 +0,0 @@ -FROM nfcore/base -LABEL authors="Felipe Almeida" \ - description="Docker image containing python-based bacannot tools" - -# Install the conda environment -# RUN conda install -c conda-forge -y 'mamba>=0.21' -RUN conda install -y \ - -c bioconda -c defaults -c conda-forge -c anaconda \ - 'openssl=1.1.1' \ - 'platon>=1.6' \ - phispy \ - plasmidfinder \ - 'python>=3.7' \ - refseq_masher \ - 'gsl==2.7' -RUN conda install -c conda-forge -y 'mamba>=0.21' -RUN mamba create -n rgi -c bioconda -c defaults -c conda-forge -c anaconda 'rgi>=5.2.1' -RUN mamba create -n phigaro -c bioconda -c defaults -c conda-forge -c anaconda phigaro -RUN mamba create -n falmeida-py -c falmeida -c bioconda -c defaults -c conda-forge -c anaconda 'falmeida-py>=0.9' - -# set CONDA_PREFIX -ENV CONDA_PREFIX=/opt/conda - -# Fix permissions -WORKDIR /work -RUN chmod -R 777 /work /opt/conda/envs/rgi/lib/python*/site-packages/app \ No newline at end of file diff --git a/docker/pyenv/build.sh b/docker/pyenv/build.sh deleted file mode 100644 index 51153a5c..00000000 --- a/docker/pyenv/build.sh +++ /dev/null @@ -1 +0,0 @@ -../../bin/build_image.sh $1 diff --git a/docker/renv/Dockerfile b/docker/renv/Dockerfile index e28066a8..5641c0b1 100644 --- a/docker/renv/Dockerfile +++ b/docker/renv/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:20.04 +FROM ubuntu:22.04 LABEL MAINTAINER Felipe Marques de Almeida @@ -13,7 +13,7 @@ RUN apt-get update && \ ## Install R RUN DEBIAN_FRONTEND=noninteractive apt-get install -y tzdata && \ - DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y r-base r-base-core r-api-3.5 + DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y r-base r-base-core ## Install R-packages RUN DEBIAN_FRONTEND=noninteractive apt-get update && \ diff --git a/docker/renv/reports/no_integronfinder.Rmd b/docker/renv/reports/no_integronfinder.Rmd new file mode 100644 index 00000000..175d25ce --- /dev/null +++ b/docker/renv/reports/no_integronfinder.Rmd @@ -0,0 +1 @@ +No integrons have been predicted with [Integron Finder](https://github.com/gem-pasteur/Integron_Finder).
This might have happened either because your genome really does not have integron sequences or due to misassemblies. You can always try to run the online version of the tool: https://integronfinder.readthedocs.io/en/latest/user_guide/webserver.html diff --git a/docker/renv/reports/report_MGEs.Rmd b/docker/renv/reports/report_MGEs.Rmd index f17cabfe..80fed6a0 100644 --- a/docker/renv/reports/report_MGEs.Rmd +++ b/docker/renv/reports/report_MGEs.Rmd @@ -13,8 +13,10 @@ params: phispy_tsv: plasmid_finder_tab: platon_tsv: + mobsuite_tsv: gi_image: digis: + integronfinder: query: gff: output: @@ -47,10 +49,12 @@ check_lines <- function(x) { # Read plasmids plasmid_finder_tab <- try(read.csv(params$plasmid_finder_tab, sep = "\t"), silent = TRUE) -platon_tsv <- try(read.csv(params$platon_tsv, sep = "\t"), silent = TRUE) +platon_tsv <- try(read.csv(params$platon_tsv, sep = "\t"), silent = TRUE) +mobsuite_tsv <- try(read.csv(params$mobsuite_tsv, sep = "\t", header=TRUE), silent = TRUE) # always have a line for chr if ( (class(plasmid_finder_tab) != 'try-error' & check_lines(plasmid_finder_tab) > 0) | - (class(platon_tsv) != 'try-error' & check_lines(platon_tsv) > 0) + (class(platon_tsv) != 'try-error' & check_lines(platon_tsv) > 0) | + (class(mobsuite_tsv) != 'try-error' & check_lines(mobsuite_tsv) > 1) ) { plasmids_not_null <- TRUE plasmids_null <- FALSE @@ -72,6 +76,16 @@ if (class(digis_gff) != 'try-error' & check_lines(digis_gff) > 0) { digis_null <- TRUE } +## Read Integron Finder GFF +integronfinder_gff <- try(gffRead(params$integronfinder), silent = TRUE) +if (class(integronfinder_gff) != 'try-error' & check_lines(integronfinder_gff) > 0) { + integronfinder_not_null <- TRUE + integronfinder_null <- FALSE +} else { + integronfinder_not_null <- FALSE + integronfinder_null <- TRUE +} + ## Read PHAST documents phast_prot_blast <- try(read.delim(params$phast_prot_blast, header = TRUE), silent = TRUE) phast_genome_blast <- try(read.delim(params$phast_genome_blast, header = TRUE), silent = TRUE) @@ -173,10 +187,15 @@ In this context, this pipeline is capable of automatically annotating some mobil 6. [Platon](https://github.com/oschwengers/platon); + Platon detects plasmid contigs within bacterial draft genomes from WGS short-read assemblies. + Therefore, Platon analyzes the natural distribution biases of certain protein coding genes between chromosomes and plasmids. -7. [IslandPath](https://github.com/brinkmanlab/islandpath). +7. [MOB Suite](https://github.com/phac-nml/mob-suite); + + Software tools for clustering, reconstruction and typing of plasmids from draft assemblies. + + In the pipeline, only the typer tool is used. +8. [IslandPath](https://github.com/brinkmanlab/islandpath). + IslandPath-DIMOB is a standalone software to predict genomic islands in bacterial and archaeal genomes based on the presence of dinucleotide biases and mobility genes. -8. [digIS](https://github.com/janka2012/digIS). +9. [digIS](https://github.com/janka2012/digIS). + digIS is a command-line tool for detection of insertion sequences (IS) in prokaryotic genomes. +10. [Integron Finder](https://github.com/gem-pasteur/Integron_Finder).
+ + a command-line tool to identify integrons in DNA sequences ### Prediction thresholds @@ -231,4 +250,12 @@ knitr::include_graphics(gi_image) ``` ```{r, digis_conditional_block_2, echo=FALSE, results='asis', eval=digis_null, child='no_digis.Rmd'} +``` + +## Integron detection + +```{r, integronfinder_conditional_block, echo=FALSE, results='asis', eval=integronfinder_not_null, child='yes_integronfinder.Rmd'} +``` + +```{r, integronfinder_conditional_block_2, echo=FALSE, results='asis', eval=integronfinder_null, child='no_integronfinder.Rmd'} ``` \ No newline at end of file diff --git a/docker/renv/reports/yes_digis.Rmd b/docker/renv/reports/yes_digis.Rmd index c3aedaa3..b52eccc0 100644 --- a/docker/renv/reports/yes_digis.Rmd +++ b/docker/renv/reports/yes_digis.Rmd @@ -11,7 +11,7 @@ Insertions sequences have been predicted with [digIS](https://github.com/janka20 > The program is executed **with** the GenBank annotation
-(#tab:write-table-ices-full) Insertions sequences predicted by digIS in GFF format. +(#tab:write-table-digis-full) Insertion sequences predicted by digIS in GFF format. ```{r} datatable(digis_gff, escape = FALSE, diff --git a/docker/renv/reports/yes_integronfinder.Rmd b/docker/renv/reports/yes_integronfinder.Rmd new file mode 100644 index 00000000..164e7137 --- /dev/null +++ b/docker/renv/reports/yes_integronfinder.Rmd @@ -0,0 +1,12 @@ +Integrons have been predicted with [Integron Finder](https://github.com/gem-pasteur/Integron_Finder). More information on how the software operates can be found in its [paper](https://www.mdpi.com/2076-2607/10/4/700). +
+(#tab:write-table-integronfinder-full) Integrons predicted by Integron Finder in GFF format. +```{r} +datatable(integronfinder_gff, + escape = FALSE, + filter = 'top', + options = dt_opt_lst, + extensions = 'Buttons', + rownames = F) +``` \ No newline at end of file diff --git a/docker/renv/reports/yes_plasmids.Rmd b/docker/renv/reports/yes_plasmids.Rmd index b01dd20f..7af439d9 100644 --- a/docker/renv/reports/yes_plasmids.Rmd +++ b/docker/renv/reports/yes_plasmids.Rmd @@ -54,4 +54,30 @@ datatable(results, columnDefs = list(list(visible=FALSE, targets=c(1,2)))), extensions = 'Buttons', rownames = F) +``` + +### MOB suite (typer) + +[MOB-typer](https://github.com/phac-nml/mob-suite) provides _in silico_ predictions of the replicon family, relaxase type, mate-pair formation type and predicted transferability of the plasmid. Using a combination of biomarkers and MOB-cluster codes, it will also provide an observed host-range of your plasmid based on its replicon, relaxase and cluster assignment. This is combined with information mined from the literature to provide a prediction of the taxonomic rank at which the plasmid is likely to be stably maintained but it does not provide source attribution predictions. + +* The complete results can be found in the directory `plasmids/mob_suite` under the main output directory. + +(#tab:mobsuite-results) In silico typing of plasmids with MOB suite +```{r} +results <- mobsuite_tsv + +# Render dt +datatable(results, + escape = FALSE, + filter = 'top', + options = list(pageLength = 5, + lengthMenu = c(5, 10, 15, 20, 50), + dom='flrtBip', + buttons = c('copy', 'csv', 'excel', 'colvis'), + scrollX = TRUE, + fixedColumns = FALSE, + autoWidth = TRUE, + columnDefs = list(list(visible=FALSE, targets=c(1,2)))), + extensions = 'Buttons', + rownames = F) ``` \ No newline at end of file diff --git a/docker/renv/scripts/rscripts/gff2sql.R b/docker/renv/scripts/rscripts/gff2sql.R deleted file mode 100644 index 61eac29d..00000000 --- a/docker/renv/scripts/rscripts/gff2sql.R +++ /dev/null @@ -1,139 +0,0 @@ -#!/usr/bin/Rscript -doc <- 'usage: gff2sql.R [--input= --out= --fasta= --nucleotide= --aminoacid=] - -options: - -i, --input= GFF file to transform in SQL - -o, --out= SQL database name to output [default: out.sql] - -n, --nucleotide= Takes in the nucleotide FASTA. 
- -a, --aminoacid= Takes in the protein FASTA - -f, --fasta= Takes in the genome FASTA' - -# Loading required packages -suppressMessages(library("docopt")) -suppressMessages(library(RSQLite)) -suppressMessages(library(dplyr)) -suppressMessages(library(stringr)) -suppressMessages(library(DataCombine)) -suppressMessages(library(Biostrings)) - -# Parse help -opt <- docopt(doc) - -# Useful functions -## Query the 9th column -getAttributeField <- function (x, field, attrsep = ";") { - s = strsplit(x, split = attrsep, fixed = TRUE) - sapply(s, function(atts) { - a = strsplit(atts, split = "=", fixed = TRUE) - m = match(field, sapply(a, "[", 1)) - if (!is.na(m)) { rv = a[[m]][2] - } - else { - rv = as.character(NA) - } - return(rv) - }) -} - -## Add table to SQL db -addTable <- function (con, sql, input) { - ## Open db - suppressWarnings(dbBegin(con)) - - ## Send rule - res <- suppressWarnings(dbSendQuery(con, sql)) - - ## Insert data based on rule - suppressWarnings(dbBind(res, input)) - suppressWarnings(dbFetch(res)) - suppressWarnings(dbClearResult(res)) - - ## Close db - suppressWarnings(dbCommit(con)) -} - -# Loading SQL database driver -drv <- dbDriver("SQLite") -dbname <- file.path("/work", opt$out) -con <- dbConnect(drv, dbname=dbname) - -##################################### -### First STEP load GENOME to sql ### -##################################### -fastaFile <- readDNAStringSet(opt$fasta) -seq_name = names(fastaFile) -#sequence = paste(fastaFile) -sequence_len = sapply(fastaFile, function(x) { - length(x)[[1]] -}) -genome <- data.frame(seq_name, sequence_len) -names(genome) <- c("Contig", "Length") - -# Create SQL table for the genome sequence -suppressWarnings(dbGetQuery(con, "CREATE Table Genome (Contig TEXT, Length TEXT)")) -# Create sql rule -sql <- "INSERT INTO Genome VALUES ($Contig, $Length)" -# Add to SQL db -addTable(con, sql, genome) - -################################### -### Second STEP load GFF to sql ### -################################### - -# Loading GFF file -gff <- read.delim(opt$input, header = FALSE, stringsAsFactors = FALSE) -# Give data a header -names(gff) <- c("chr", "source", "feature", "start", "end", "score", "strand", "frame", "attributes") -# Get IDs -gff$ID <- getAttributeField(as.character(gff$attributes), "ID", ";") -# Reorder columns -gff <- gff %>% select(chr, source, ID, feature, start, end, score, strand, frame, attributes) -# Create SQL table to store GFF data -suppressWarnings(dbGetQuery(con, "CREATE Table GFF (Contig TEXT, Source TEXT, ID TEXT, Feature TEXT, - Start INTEGER, End INTEGER, Score INTEGER, Strand TEXT, - Frame INTEGER, Attributes TEXT)")) -# Create sql rule -sql <- "INSERT INTO GFF VALUES ($chr, $source, $ID, $feature, -$start, $end, $score, $strand, $frame, $attributes)" -# Add to SQL db -addTable(con, sql, gff) - -############################################## -### Third STEP load gene nucl fasta to sql ### -############################################## - -## Loading Protein fasta -genes <- readAAStringSet(opt$aminoacid) -gene_ids <- sapply(names(genes), function(x) { - unlist(strsplit(as.character(x), " "))[1] -}) -gene_desc <- sapply(names(genes), function(x) { - paste0(unlist(strsplit(as.character(x), " "))[-1], collapse = " ") -}) -sequences = paste(genes) -genes_aa <- data.frame(gene_ids, gene_desc, sequences) -names(genes_aa) <- c("ID", "Description", "Sequence") -## Create SQL table to store Protein FASTA -suppressWarnings(dbGetQuery(con, "CREATE Table ProteinFasta (ID TEXT, Description TEXT, Sequence TEXT)")) -## Create 
sql rule -sql <- "INSERT INTO ProteinFasta VALUES ($ID, $Description, $Sequence)" -# Add to SQL db -addTable(con, sql, genes_aa) - -## Loading Nucleotide fasta -genes <- readDNAStringSet(opt$nucleotide) -gene_ids <- sapply(names(genes), function(x) { - unlist(strsplit(as.character(x), " "))[1] -}) -gene_desc <- sapply(names(genes), function(x) { - paste0(unlist(strsplit(as.character(x), " "))[-1], collapse = " ") -}) -sequences = paste(genes) -genes_ncl <- data.frame(gene_ids, gene_desc, sequences) -names(genes_ncl) <- c("ID", "Description", "Sequence") -## Create SQL table to store Protein FASTA -suppressWarnings(dbGetQuery(con, "CREATE Table NucleotideFasta (ID TEXT, Description TEXT, Sequence TEXT)")) -## Create sql rule -sql <- "INSERT INTO NucleotideFasta VALUES ($ID, $Description, $Sequence)" -# Add to SQL db -addTable(con, sql, genes_ncl) diff --git a/docker/set_version.sh b/docker/set_version.sh index b082ce7f..f02bd27e 100644 --- a/docker/set_version.sh +++ b/docker/set_version.sh @@ -1 +1 @@ -export NEW_VERSION=v3.2 +export NEW_VERSION=v3.3 diff --git a/docs/config.md b/docs/config.md index 00664232..9aebaa19 100644 --- a/docs/config.md +++ b/docs/config.md @@ -13,191 +13,5 @@ Default configuration --------------------- ```groovy -/* - - Required / Default Parameters. - This parameters must always be set - -*/ -params { - - /* - - DB DOWNLOAD WORKFLOW - - */ - -// Trigger database download and formatting workflow? --> will not run annotation -// Will download and format a database inside {output} parameter - get_dbs = false - force_update = false - - /* - - ANNOTATION INPUTS - - */ - -// Input data mus be given inside a well-formated samplesheet. -// We provide a well-formated example at: https://github.com/fmalmeida/test_datasets/raw/main/bacannot_testing_samplesheets/samplesheet.yaml -// -// Please read the example samplesheet so you can understand how to properly fill it. -// -// It is also documented in the main manual: https://bacannot.readthedocs.io/en/latest/samplesheet - input = null - -// path to directory containing databases used by bacannot -// you can download databases with: -// nextflow run fmalmeida/bacannot --get_dbs --output bacannot_dbs -profile - bacannot_db = null - - /* - - GENERAL PARAMETERS - - */ - -// Main output folder name. More than one bacannot annotation can be redirected -// to the same output parameter. It is good to keep related annotations together. -// A subdirectory with the filename will be created inside this directory. - output = 'results' - -// Number of minimum overlapping base pairs required for merging -// Negative values, such as -20, means the number of required overlapping bases for merging. -// Positive values, such as 5, means the maximum distance accepted between features for merging. -// By default (if Blank), this process is not executed. For execution the user needs to provide a value - bedtools_merge_distance = null - - /* - * Bakta optional - */ -// If user set path to an existing bakta database, the pipeline will use bakta instead of prokka - bakta_db = null - - /* - * Prokka optional parameters - */ -// Include comprehensive PGAP hmm database in prokka annotation instead of TIGRFAM. -// PGAP is big and using it may have higher running times but better results - prokka_use_pgap = false - -// Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default 'Bacteria') - prokka_kingdom = null - -// Translation table code. Must be set if the above is set. 
-// Example: params.prokka_genetic.code = 11 - prokka_genetic_code = null - -// Use rnammer instead of Barrnap? False or True? - prokka_use_rnammer = false - - /* - * Resfinder species panel - */ - -// Species panel to be used when annotating with Resfinder. -// It sets a default for all samples in the samplesheet. -// If a sample has a different value inside the samplesheet it will overwrite the value for that sample -// If blank it will not be executed. -// It must be identical (without the *) as written in their webservice https://cge.cbs.dtu.dk/services/ResFinder/. -// E.g. 'Escherichia coli'; 'Klebsiella' ... - resfinder_species = null - - /* - * Handling the execution of processes - * - * By default, all processes are executed. These - * parameters tells wheter NOT to run a process. - * - * Which means: false will allow its execution - * while true will create a barrier and skip a process. - */ -// (NOT RUN?) Plasmids annotation (controls PlasmidFinder execution) - skip_plasmid_search = false - -// (NOT RUN?) General Virulence annotation (controls VFDB and Victors scan) - skip_virulence_search = false - -// (NOT RUN?) Resistance annotation (controls AMRfinder and RGI) - skip_resistance_search = false - -// (NOT RUN?) ICE annotation (controls ICEberg annotation) - skip_iceberg_search = false - -// (NOT RUN?) prophage annotation (controls PHAST and Phigaro) - skip_prophage_search = false - -// (NOT RUN?) KO (KEGG Orthology) annotation - skip_kofamscan = false - -// (NOT RUN?) antiSMASH (secondary metabolite) annotation - skip_antismash = false - - /* - * Custom databases can be used to annotate additional genes in the genome. - * It runs a BLAST alignment against the genome, therefore, the custom database - * More than one custom database can be given separated by commas. - * Gene headers must be properly formated as described in the - * documentation: https://bacannot.readthedocs.io/en/latest/custom-db - */ -// Custom fastas (PROT / NUCL) - custom_db = null -// Custom annotation using list of NCBI protein accs - ncbi_proteins = null - - /* - * Annotation thresholds to be used when scanning specific databases and features - * Select a combination of thresholds that is meaningful for your data. Some of - * the databases are protein-only, others are nucleotide only. We cannnot control - * that and the databases will be scanned either if blastp or blastn using these - * thresholds described here. 
- */
-
-// Identity threshold for plasmid annotation
-    plasmids_minid = 90
-
-// Coverage threshold for plasmid annotation
-    plasmids_mincov = 60
-
-// Virulence genes identity threshold
-    blast_virulence_minid = 90
-
-// Virulence genes coverage threshold
-    blast_virulence_mincov = 90
-
-// AMR genes identity threshold
-    blast_resistance_minid= 90
-
-// AMR genes coverage threshold
-    blast_resistance_mincov = 90
-
-// MGEs (ICEs and Phages) identity threshold
-    blast_MGEs_minid = 85
-
-// MGEs (ICEs and Phages) coverage threshold
-    blast_MGEs_mincov = 85
-
-// User's custom database identity threshold
-    blast_custom_minid = 65
-
-// User's custom database coverage threshold
-    blast_custom_mincov = 65
-
-  /*
-   * Resources allocation configuration
-   * Defaults only, expecting to be overwritten
-   */
-// Select versions of bioconda quay.io additional tools
-// Tools that are not part of the core of the pipeline,
-// but can eventually be used by users
-    unicycler_version = '0.4.8--py38h8162308_3'
-    flye_version = '2.9--py39h39abbe0_0'
-    bakta_version = '1.6.1--pyhdfd78af_0'
-
-// Max resource options
-    max_memory = '20.GB'
-    max_cpus = 16
-    max_time = '40.h'
-
-}
+{% include 'defaults.config' %}
 ```
\ No newline at end of file
diff --git a/docs/defaults.config b/docs/defaults.config
new file mode 100644
index 00000000..0d43fe48
--- /dev/null
+++ b/docs/defaults.config
@@ -0,0 +1,191 @@
+/*
+
+    Required / Default Parameters.
+    These parameters must always be set
+
+*/
+params {
+
+  /*
+
+      DB DOWNLOAD WORKFLOW
+
+  */
+
+// Trigger database download and formatting workflow? --> will not run annotation
+// Will download and format a database inside {output} parameter
+    get_dbs       = false
+    force_update  = false
+    get_zenodo_db = false // download pre-built database
+
+  /*
+
+      ANNOTATION INPUTS
+
+  */
+
+// Input data must be given inside a well-formatted samplesheet.
+// We provide a well-formatted example at: https://github.com/fmalmeida/test_datasets/raw/main/bacannot_testing_samplesheets/samplesheet.yaml
+//
+// Please read the example samplesheet so you can understand how to properly fill it.
+//
+// It is also documented in the main manual: https://bacannot.readthedocs.io/en/latest/samplesheet
+    input = null
+
+// Enable read deduplication for assembly? (If input has reads)
+    enable_deduplication = false
+
+// path to directory containing databases used by bacannot
+// you can download databases with:
+// nextflow run fmalmeida/bacannot --get_dbs --output bacannot_dbs -profile 
+    bacannot_db = null
+
+  /*
+
+      GENERAL PARAMETERS
+
+  */
+
+// Main output folder name. More than one bacannot annotation can be redirected
+// to the same output parameter. It is good to keep related annotations together.
+// A subdirectory with the filename will be created inside this directory.
+    output = 'results'
+
+// Number of minimum overlapping base pairs required for merging
+// Negative values, such as -20, mean the number of required overlapping bases for merging.
+// Positive values, such as 5, mean the maximum distance accepted between features for merging.
+// By default (if blank), this process is not executed. For execution the user needs to provide a value
+    bedtools_merge_distance = null
+
+  /*
+   * Bakta optional
+   */
+// If the user sets a path to an existing bakta database, the pipeline will use bakta instead of prokka
+    bakta_db = null
+
+  /*
+   * Prokka optional parameters
+   */
+// Include the comprehensive PGAP hmm database in prokka annotation instead of TIGRFAM.
+// PGAP is big and using it may mean higher running times but better results
+    prokka_use_pgap = false
+
+// Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default 'Bacteria')
+    prokka_kingdom = null
+
+// Translation table code. Must be set if the above is set.
+// Example: params.prokka_genetic_code = 11
+    prokka_genetic_code = null
+
+// Use rnammer instead of Barrnap? False or True?
+    prokka_use_rnammer = false
+
+  /*
+   * Resfinder species panel
+   */
+
+// Species panel to be used when annotating with Resfinder.
+// It sets a default for all samples in the samplesheet.
+// If a sample has a different value inside the samplesheet it will overwrite the value for that sample.
+// If blank it will not be executed.
+// It must be identical (without the *) to how it is written in their webservice https://cge.cbs.dtu.dk/services/ResFinder/.
+// E.g. 'Escherichia coli'; 'Klebsiella' ...
+    resfinder_species = null
+
+  /*
+   * Handling the execution of processes
+   *
+   * By default, all processes are executed. These
+   * parameters tell whether NOT to run a process.
+   *
+   * Which means: false will allow its execution
+   * while true will create a barrier and skip a process.
+   */
+// (NOT RUN?) Plasmids annotation (controls PlasmidFinder execution)
+    skip_plasmid_search = false
+
+// (NOT RUN?) General Virulence annotation (controls VFDB and Victors scan)
+    skip_virulence_search = false
+
+// (NOT RUN?) Resistance annotation (controls AMRfinder and RGI)
+    skip_resistance_search = false
+
+// (NOT RUN?) ICE annotation (controls ICEberg annotation)
+    skip_iceberg_search = false
+
+// (NOT RUN?) prophage annotation (controls PHAST and Phigaro)
+    skip_prophage_search = false
+
+// (NOT RUN?) KO (KEGG Orthology) annotation
+    skip_kofamscan = false
+
+// (NOT RUN?) antiSMASH (secondary metabolite) annotation
+    skip_antismash = false
+
+  /*
+   * Custom databases can be used to annotate additional genes in the genome.
+   * It runs a BLAST alignment of the custom database against the genome.
+   * More than one custom database can be given, separated by commas.
+   * Gene headers must be properly formatted as described in the
+   * documentation: https://bacannot.readthedocs.io/en/latest/custom-db
+   */
+// Custom fastas (PROT / NUCL)
+    custom_db = null
+// Custom annotation using list of NCBI protein accs
+    ncbi_proteins = null
+
+  /*
+   * Annotation thresholds to be used when scanning specific databases and features
+   * Select a combination of thresholds that is meaningful for your data. Some of
+   * the databases are protein-only, others are nucleotide-only. We cannot control
+   * that and the databases will be scanned with either blastp or blastn using the
+   * thresholds described here.
+   */
+
+// Identity threshold for plasmid annotation
+    plasmids_minid = 90
+
+// Coverage threshold for plasmid annotation
+    plasmids_mincov = 60
+
+// Virulence genes identity threshold
+    blast_virulence_minid = 90
+
+// Virulence genes coverage threshold
+    blast_virulence_mincov = 90
+
+// AMR genes identity threshold
+    blast_resistance_minid = 90
+
+// AMR genes coverage threshold
+    blast_resistance_mincov = 90
+
+// MGEs (ICEs and Phages) identity threshold
+    blast_MGEs_minid = 85
+
+// MGEs (ICEs and Phages) coverage threshold
+    blast_MGEs_mincov = 85
+
+// User's custom database identity threshold
+    blast_custom_minid = 65
+
+// User's custom database coverage threshold
+    blast_custom_mincov = 65
+
+  /*
+   * Resources allocation configuration
+   * Defaults only, expecting to be overwritten
+   */
+// Select versions of bioconda quay.io additional tools
+// Tools that are not part of the core of the pipeline,
+// but can eventually be used by users
+    unicycler_version = '0.5.0--py310h6cc9453_3'
+    flye_version = '2.9--py39h6935b12_1'
+    bakta_version = '1.7.0--pyhdfd78af_1'
+
+// Max resource options
+    max_memory = '20.GB'
+    max_cpus = 16
+    max_time = '40.h'
+
+}
diff --git a/docs/index.md b/docs/index.md
index 2854ee3b..4cb28ba9 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -2,7 +2,7 @@
 
-[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.3627669-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.3627669)
+[![F1000 Paper](https://img.shields.io/badge/Citation%20F1000-10.12688/f1000research.139488.1-orange)](https://doi.org/10.12688/f1000research.139488.1)
 [![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/fmalmeida/bacannot?include_prereleases&label=Latest%20release)](https://github.com/fmalmeida/bacannot/releases)
 [![Documentation](https://img.shields.io/badge/Documentation-readthedocs-brightgreen)](https://bacannot.readthedocs.io/en/latest/?badge=latest)
 [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.10.3-23aa62.svg?labelColor=000000)](https://www.nextflow.io/)
@@ -10,6 +10,7 @@
 [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)
 [![License](https://img.shields.io/badge/License-GPL%203-black)](https://github.com/fmalmeida/bacannot/blob/master/LICENSE)
 [![Follow on Twitter](http://img.shields.io/badge/twitter-%40fmarquesalmeida-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/fmarquesalmeida)
+[![Zenodo Archive](https://img.shields.io/badge/Zenodo-Archive-blue)](https://doi.org/10.5281/zenodo.3627669)
 
 ## About
@@ -33,8 +34,9 @@ The pipeline's main steps are:
 | Annotation of virulence genes | [Victors](http://www.phidias.us/victors/) and [VFDB](http://www.mgc.ac.cn/VFs/main.htm) |
 | Prophage sequences and genes annotation | [PHASTER](http://phast.wishartlab.com/), [Phigaro](https://github.com/bobeobibo/phigaro) and [PhiSpy](https://github.com/linsalrob/PhiSpy) |
 | Annotation of integrative and conjugative elements | [ICEberg](http://db-mml.sjtu.edu.cn/ICEberg/) |
+| Annotation of bacterial integrons | [Integron Finder](https://github.com/gem-pasteur/Integron_Finder) |
 | Focused detection of insertion sequences | [digIS](https://github.com/janka2012/digIS) |
-| _In silico_ detection of plasmids | [Plasmidfinder](https://cge.cbs.dtu.dk/services/PlasmidFinder/) and [Platon](https://github.com/oschwengers/platon) |
+| _In silico_ detection and typing of plasmids | [Plasmidfinder](https://cge.cbs.dtu.dk/services/PlasmidFinder/), [Platon](https://github.com/oschwengers/platon) and [MOB-typer](https://github.com/phac-nml/mob-suite) |
 | Prediction and visualization of genomic islands | [IslandPath-DIMOB](https://github.com/brinkmanlab/islandpath) and [gff-toolbox](https://github.com/fmalmeida/gff-toolbox) |
 | Custom annotation from formatted FASTA or NCBI protein IDs | [BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs) |
 | Merge of annotation results | [bedtools](https://bedtools.readthedocs.io/en/latest/) |
diff --git a/docs/installation.md b/docs/installation.md
index fce8e38f..8035ba43 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -19,19 +19,30 @@ nextflow pull fmalmeida/bacannot
 
 ## Downloading docker images
 
-The custom docker images used by the pipeline are:
+> The pipeline uses both custom and public images.
+> All images can be downloaded on the fly, automatically by nextflow, and this is the recommended way to do it.
+
+If you want to download them yourself, you can find all the images used by the pipeline described in the files [docker.config](https://github.com/fmalmeida/bacannot/blob/master/conf/docker.config) (for docker) and [singularity.config](https://github.com/fmalmeida/bacannot/blob/master/conf/singularity.config) (for singularity).
+
+The images are defined as follows:
 
 ```bash
-docker pull fmalmeida/bacannot:v3.2_misc ;
-docker pull fmalmeida/bacannot:v3.2_perlenv ;
-docker pull fmalmeida/bacannot:v3.2_pyenv ;
-docker pull fmalmeida/bacannot:v3.2_renv ;
-docker pull fmalmeida/bacannot:jbrowse ;
+...
+withLabel: 'db_download|db_tools|misc' {
+    container = 'fmalmeida/bacannot@sha256:726e085f1bd71b47c2d8a38fd46d812aab7eb8978bab7bf3cde3aa2b7b3e0f2c'
+}
+...
 ```
 
-> The pipeline also uses other public images available in biocontainers. All images can be downloaded on the fly, automatically be nextflow.
+Each image can then be downloaded like this:
+
+```bash
+docker pull fmalmeida/bacannot@sha256:726e085f1bd71b47c2d8a38fd46d812aab7eb8978bab7bf3cde3aa2b7b3e0f2c
+```
 
-!!! info "Using singularity"
+> You would need to repeat this for each image.
+
+!!! info "If using singularity"
 
     **Docker and singularity images are downloaded on the fly**. Be sure to properly set the `NXF_SINGULARITY_LIBRARYDIR` env variable to a writable directory if using Singularity. This will make the downloaded images reusable through different executions. Read more at: https://www.nextflow.io/docs/latest/singularity.html#singularity-docker-hub
@@ -40,18 +51,48 @@ docker pull fmalmeida/bacannot:jbrowse ;
 
   ```bash
   # apply this command to each image
   # just change the "/" and ":" for "-".
-  # E.g. Image fmalmeida/bacannot:v3.2_misc becomes fmalmeida-bacannot-v3.2_misc.img
-  singularity pull --dir $NXF_SINGULARITY_LIBRARYDIR fmalmeida-bacannot-v3.2_misc.img docker://fmalmeida/bacannot:v3.2_misc
+  # E.g. Image fmalmeida/bacannot:v3.3_misc becomes fmalmeida-bacannot-v3.3_misc.img
+  singularity pull --dir $NXF_SINGULARITY_LIBRARYDIR fmalmeida-bacannot-v3.3_misc.img docker://fmalmeida/bacannot:v3.3_misc
  ```
 
+## Bacannot databases
+
+Bacannot databases are no longer shipped inside the docker images; this avoids huge images and connection or rate-limit problems with dockerhub.
+
+### Pre-formatted
+
+Users can directly download pre-formatted databases from Zenodo: https://doi.org/10.5281/zenodo.7615811
+
+This is useful for standardization and also for overcoming known issues that may arise when formatting databases with the `singularity` profile.
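For low-connectivity environments it can also help to stage this archive by hand before running anything. A minimal sketch of what the pipeline's download module does under the hood (it assumes the `zenodo_get` CLI from the pip package `zenodo-get`; the directory name is illustrative):

```bash
# fetch and unpack the pre-formatted bacannot databases from Zenodo
pip install zenodo-get                  # provides the zenodo_get CLI
mkdir -p bacannot_dbs && cd bacannot_dbs
zenodo_get https://doi.org/10.5281/zenodo.7615811
tar zxvf *.tar.gz && rm *.tar.gz        # unpacked folder can be passed via --bacannot_db
```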
+
+A module to download the latest pre-formatted database has also been made available:
+
+```bash
+# Download pipeline pre-built databases
+nextflow run fmalmeida/bacannot \
+    --get_zenodo_db \
+    --output ./ \
+    -profile 
+```
+
+### I want to generate a new formatted database
+
+```{bash .annotate hl_lines="5"}
+# Download pipeline databases
+nextflow run fmalmeida/bacannot \
+    --get_dbs \
+    --output bacannot_dbs \
+    -profile 
+```
+
 ## Testing your installation
 
 After that, you can run the pipeline with a testing dataset by selecting one of the available profiles:
 
 1. Docker
-    * `nextflow run fmalmeida/mpgap -profile docker,test`
+    * `nextflow run fmalmeida/bacannot -profile docker,test --bacannot_db ./bacannot_dbs`
 2. Singularity
-    * `nextflow run fmalmeida/mpgap -profile singularity,test`
+    * `nextflow run fmalmeida/bacannot -profile singularity,test --bacannot_db ./bacannot_dbs`
 
 !!! note "About NF profiles"
diff --git a/docs/manual.md b/docs/manual.md
index e9791743..e42c1227 100644
--- a/docs/manual.md
+++ b/docs/manual.md
@@ -29,9 +29,10 @@ The pipeline accepts as input two other input files types that are used to perfo
 
 ## Input/output options
-|                 Parameter                 | Required | Default | Description |
+|                 Parameter                 | Required | Default | Description |
 | :--------------------------------------- | :------- | :------ | :---------- |
 | `--input` | :material-check: | NA | Input samplesheet describing all the samples to be analysed |
+| `--enable_deduplication` | :material-close: | false | Run deduplication command on input reads before assembly. Only useful for samples where reads are given instead of a genome fasta. |
 | `--output` | :material-check: | results | Name of directory to store output values. A sub-directory for each genome will be created inside this main directory. |
 | `--bacannot_db` | :material-check: | NA | Path for root directory containing required bacannot databases |
@@ -45,6 +46,7 @@ The pipeline accepts as input two other input files types that are used to perfo
 
 |                 Parameter                 | Required | Default | Description |
 | :--------------------------------------- | :------- | :------ | :---------- |
 | `--get_dbs` | :material-close: | false | Instead of running the analysis workflow, it will try to download required databases and save them in `--output` |
 | `--force_update` | :material-close: | false | Instead of only downloading missing databases, download everything again and overwrite. |
+| `--get_zenodo_db` | :material-close: | false | Download pre-built databases stored in zenodo. [See quickstart](quickstart.md#) |
 
 !!! tip ""
 
@@ -88,7 +90,7 @@ The use of this parameter sets a default value for input samples. If a sample ha
 |                 Parameter                 | Required | Default | Description |
 | :--------------------------------------- | :------- | :------ | :---------- |
 | `--skip_virulence_search` | :material-close: | false | Tells whether not to run virulence factors annotation. It skips both vfdb and victors annotation |
-| `--skip_plasmid_search` | :material-close: | false | Tells whether not to run plasmid detection modules |
+| `--skip_plasmid_search` | :material-close: | false | Tells whether not to run plasmid detection/typing modules |
 | `--skip_resistance_search` | :material-close: | false | Tells whether not to run resistance genes annotation modules |
 | `--skip_iceberg_search` | :material-close: | false | Tells whether not to run mobile genetic elements annotation with ICEberg |
 | `--skip_prophage_search` | :material-close: | false | Tells whether not to run prophage annotation modules |
@@ -131,7 +133,7 @@ Users can now select the version of the non-core tools Bakta, Unicycler and Flye.
 
 | Parameter | Default | Description |
 | :-------- | :------ | :---------- |
-| `--bakta_version` | 1.6.1--pyhdfd78af_0 | Bakta tool version |
+| `--bakta_version` | 1.7.0--pyhdfd78af_1 | Bakta tool version |
 | `--flye_version` | 2.9--py39h39abbe0_0 | Flye tool version |
 | `--unicycler_version` | 0.4.8--py38h8162308_3 | Unicycler tool version |
diff --git a/docs/outputs.md b/docs/outputs.md
index 92ffc09e..ba350809 100644
--- a/docs/outputs.md
+++ b/docs/outputs.md
@@ -26,11 +26,12 @@ After a successful execution, you will have something like this:
 |   ├── gffs # A copy of the main GFF files produced during the annotation
 |   ├── genomic_islands # Genomic Islands predicted with IslandPath-DIMOB
 |   ├── ICEs # Results from ICEberg database annotation
+|   ├── integron_finder # Results from Integron Finder tool annotation
 |   ├── jbrowse # The files that set up the JBrowse genome browser
 |   ├── KOfamscan # Results from annotation with KEGG database
 |   ├── methylations # Methylated sites predicted with Nanopolish (if fast5 is given)
 |   ├── MLST # MLST results with mlst pipeline
-|   ├── plasmids # Plasmid annotation results from Platon and Plasmidfinder
+|   ├── plasmids # Plasmid annotation results from Platon, Plasmidfinder and MOB Suite
 |   ├── prophages # Prophage annotation results from PhiSpy, Phigaro and PHAST
 |   ├── refseq_masher # Closest NCBI RefSeq genomes identified with refseq_masher
 |   ├── report_files # Annotation reports in HTML format
diff --git a/docs/quickstart.md b/docs/quickstart.md
index 3f68c7f0..42b5ee89 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -41,6 +41,13 @@ Users can directly download pre-formatted databases from Zenodo: https://doi.org
 
 Useful for standardization and also overcoming known issues that may arise when formatting databases with `singularity` profile.
 
+A module to download the latest pre-formatted database has also been made available:
+
+```bash
+# Download pipeline pre-built databases
+nextflow run fmalmeida/bacannot --get_zenodo_db --output ./ -profile 
+```
+
 #### I want to generate a new formatted database
 
 ```{bash .annotate hl_lines="5"}
@@ -95,4 +102,4 @@ nextflow run fmalmeida/bacannot -profile docker,quicktest --bacannot_db ./bacann
 
 ### Annotation with bakta
 
-User can also perform the core generic annotation with bakta instead of prokka. Please read [the manual](manual#bakta-annotation).
+User can also perform the core generic annotation with bakta instead of prokka.
Please read [the manual](manual.md#bakta-annotation).
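A purely illustrative sketch of such a run; the samplesheet and database paths below are placeholders, not shipped defaults:

```bash
# illustrative: use bakta (via --bakta_db) instead of prokka for the core annotation
nextflow run fmalmeida/bacannot \
    --input samplesheet.yml \
    --output results \
    --bacannot_db ./bacannot_dbs \
    --bakta_db ./bakta_db \
    -profile docker
```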
diff --git a/docs/reports/report_MGEs.html b/docs/reports/report_MGEs.html
index c85ef8f1..9f6e4eda 100644
--- a/docs/reports/report_MGEs.html
+++ b/docs/reports/report_MGEs.html
[Rendered-HTML diff; the markup was lost in extraction, so only the recoverable changes are summarized:]
* Report date line updated (05 May 2023 -> 19 March 2023); example report regenerated with the new pipeline version.
* The "About" list now also describes MOB Suite (software tools for clustering, reconstruction and typing of plasmids from draft assemblies; in the pipeline, only the typer tool is used) and Integron Finder (a command line tool to identify integrons in DNA sequences).
* New "MOB suite (typer)" section: MOB-typer provides in silico predictions of the replicon family, relaxase type, mate-pair formation type and predicted transferability of the plasmid, plus an observed host range based on replicon, relaxase and MOB-cluster assignment; complete results are stored in plasmids/mob_suite under the main output directory, summarized in a new Table 3.
* New "Integron detection" section; in this example genome no integron was predicted with Integron Finder (either the genome has none or misassemblies interfered), with a pointer to the tool's web server: https://integronfinder.readthedocs.io/en/latest/user_guide/webserver.html
* Later tables renumbered accordingly: Phigaro 3 -> 4, PhiSpy 4 -> 5, PHAST 5 -> 6, full-length ICEs 6 -> 7, ICE proteins 7 -> 8, digIS (previously mislabeled Table 6) -> 9.

diff --git a/docs/reports/report_general.html b/docs/reports/report_general.html
index 911d15e7..d005400b 100644
diff --git a/docs/reports/report_resistance.html b/docs/reports/report_resistance.html
index 073b7637..653209ae 100644
diff --git a/docs/reports/report_virulence.html b/docs/reports/report_virulence.html
index 05b9a600..1ddc92c4 100644
[Same treatment for these three: the example reports were regenerated, updating only the report date and the embedded table/figure payloads; their section structure is unchanged.]
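Before the library and module diffs that follow, one recurring change deserves a standalone illustration: the `--enable_deduplication` option documented earlier does not discard reads; as the flye and unicycler hunks further below show, it rewrites FASTQ headers so that duplicated read names become unique. A minimal sketch of that renaming step, assuming a gzipped FASTQ (file names are illustrative):

```bash
# rewrite FASTQ headers (every 4th line, starting at line 1) with unique IDs,
# so duplicated read names no longer break downstream tools
gunzip -cf reads.fastq.gz \
    | awk '{if(NR%4==1) $0=sprintf("@1_%d",(1+i++)); print;}' \
    | gzip -c > deduplicated_reads.fastq.gz
```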
diff --git a/docs/requirements.txt b/docs/requirements.txt
index d5ff7eda..42c51dbf 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -14,4 +14,5 @@ mergedeep>=1.3.4
 colorama>=0.4; platform_system == 'Windows'
 mkdocs-pymdownx-material-extras
 mkdocs-git-revision-date-plugin
-mkdocs-material
\ No newline at end of file
+mkdocs-material
+mkdocs-macros-plugin
\ No newline at end of file
diff --git a/lib/WorkflowBacannot.groovy b/lib/WorkflowBacannot.groovy
index 94e05db2..527e14c0 100755
--- a/lib/WorkflowBacannot.groovy
+++ b/lib/WorkflowBacannot.groovy
@@ -10,8 +10,20 @@ class WorkflowBacannot {
     public static void initialise(params, log) {
 
         // input has been given and user does not want to download databases?
-        if (!params.input && !params.get_dbs) {
-            log.error "Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.yml'. Or select the download databases mode with --get_dbs."
+        if (!params.input && !params.get_dbs && !params.get_zenodo_db) {
+            log.error "Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.yml'. Or select the download databases mode with --get_dbs or --get_zenodo_db."
+            System.exit(1)
+        }
+
+        // using incompatible parameters?
+        if (params.input && (params.get_dbs || params.get_zenodo_db)) {
+            log.error "It is not possible to run the pipeline (--input) and download databases (--get_dbs or --get_zenodo_db) at the same time. Please do one or the other."
+            System.exit(1)
+        }
+
+        // trying to use both database download modes at once?
+        if (params.get_dbs && params.get_zenodo_db) {
+            log.error "Please select either --get_dbs or --get_zenodo_db, not both at the same time."
             System.exit(1)
         }
 
diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy
index 7531147f..6ccb1c75 100755
--- a/lib/WorkflowMain.groovy
+++ b/lib/WorkflowMain.groovy
@@ -10,7 +10,7 @@ class WorkflowMain {
     public static String citation(workflow) {
         return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" +
             "* The pipeline\n" +
-            "  https://doi.org/10.5281/zenodo.3627669\n\n" +
+            "  https://doi.org/10.12688/f1000research.139488.1\n\n" +
             "* The nf-core framework\n" +
             "  https://doi.org/10.1038/s41587-020-0439-x\n\n" +
             "* Software dependencies\n" +
@@ -74,6 +74,30 @@ class WorkflowMain {
             System.exit(0)
         }
 
+        // Download docker config
+        if (params.get_docker_config) {
+            new File("docker.config").write(new URL ("https://github.com/fmalmeida/bacannot/raw/master/conf/docker.config").getText())
+            log.info """
+            docker.config file saved in working directory
+            After configuration, run:
+            nextflow run fmalmeida/bacannot -c ./docker.config
+            Nice code
+            """.stripIndent()
+            System.exit(0)
+        }
+
+        // Download singularity config
+        if (params.get_singularity_config) {
+            new File("singularity.config").write(new URL ("https://github.com/fmalmeida/bacannot/raw/master/conf/singularity.config").getText())
+            log.info """
+            singularity.config file saved in working directory
+            After configuration, run:
+            nextflow run fmalmeida/bacannot -c ./singularity.config
+            Nice code
+            """.stripIndent()
+            System.exit(0)
+        }
+
         // Validate workflow parameters via the JSON schema
         if (params.validate_params) {
             NfcoreSchema.validateParameters(workflow, params, log)
diff --git a/main.nf b/main.nf
index f86a7eb3..7c8421ef 100644
--- a/main.nf
+++ b/main.nf
@@ -36,7 +36,7 @@ include { CREATE_DBS } from './workflows/bacannot_dbs.nf'
 workflow {
 
-  if (params.get_dbs) {
+  if (params.get_dbs || params.get_zenodo_db) {
     CREATE_DBS()
   } else {
     if (params.input) {
diff --git a/markdown/CHANGELOG.md b/markdown/CHANGELOG.md
index b47caeed..c5bbbec5 100644
--- a/markdown/CHANGELOG.md
+++ b/markdown/CHANGELOG.md
@@ -2,6 +2,20 @@
 
 The tracking for changes started in v2.1
 
+## v3.3 [01-October-2023]
+
+* [[#50](https://github.com/fmalmeida/bacannot/issues/50)] -- Add `Integron Finder` tool to the pipeline
+* [[#69](https://github.com/fmalmeida/bacannot/issues/69)] -- Change how tools use docker images in order to:
+    * make tools use public bioconda images whenever possible, to allow easy addition of tools and avoid many conflicts in docker images
+    * diminish the size of, and the number of tools inside, the docker images; the docker images are now only built to contain what is required by modules that cannot simply use bioconda docker images.
+* [[#81](https://github.com/fmalmeida/bacannot/issues/81)] -- Add `MOB Suite` tool to the pipeline
+* [[#85](https://github.com/fmalmeida/bacannot/issues/85)] -- Include checkup on header size for Prokka
+* [[#98](https://github.com/fmalmeida/bacannot/issues/98)] -- Add ICEberg and PHAST blastp results to json summary
+* [[#100](https://github.com/fmalmeida/bacannot/issues/100)] -- Update pipeline to use docker shasum instead of tags
+* [[#107](https://github.com/fmalmeida/bacannot/issues/107)] -- Add a parameter, `--enable_deduplication`, for deduplicating input reads before assembly
+* Update unicycler docker image to latest '0.5.0--py310h6cc9453_3' to avoid errors originating from the previous image's buggy installation.
+* Other minor changes / updates highlighted in [[#93](https://github.com/fmalmeida/bacannot/pull/93)]
+
 ## v3.2 [19-December-2022]
 
 * Fixes https://github.com/fmalmeida/bacannot/issues/68 reported by @lam-c
diff --git a/markdown/list_of_tools.md b/markdown/list_of_tools.md
index 68d4bf21..27a1727a 100644
--- a/markdown/list_of_tools.md
+++ b/markdown/list_of_tools.md
@@ -16,10 +16,12 @@ These are the tools that wrapped inside bacannot. **Cite** the tools whenever yo
 | Annotation of virulence genes | [Victors](http://www.phidias.us/victors/) and [VFDB](http://www.mgc.ac.cn/VFs/main.htm) |
 | Prophage sequences and genes annotation | [PHASTER](http://phast.wishartlab.com/), [Phigaro](https://github.com/bobeobibo/phigaro) and [PhiSpy](https://github.com/linsalrob/PhiSpy) |
 | Annotation of integrative and conjugative elements | [ICEberg](http://db-mml.sjtu.edu.cn/ICEberg/) |
+| Annotation of bacterial integrons | [Integron Finder](https://github.com/gem-pasteur/Integron_Finder) |
 | Focused detection of insertion sequences | [digIS](https://github.com/janka2012/digIS) |
-| _In silico_ detection of plasmids | [Plasmidfinder](https://cge.cbs.dtu.dk/services/PlasmidFinder/) and [Platon](https://github.com/oschwengers/platon) |
+| _In silico_ detection and typing of plasmids | [Plasmidfinder](https://cge.cbs.dtu.dk/services/PlasmidFinder/), [Platon](https://github.com/oschwengers/platon) and [MOB-typer](https://github.com/phac-nml/mob-suite) |
 | Prediction and visualization of genomic islands | [IslandPath-DIMOB](https://github.com/brinkmanlab/islandpath) and [gff-toolbox](https://github.com/fmalmeida/gff-toolbox) |
 | Custom annotation from formatted FASTA or NCBI protein IDs | [BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs) |
 | Merge of annotation results | [bedtools](https://bedtools.readthedocs.io/en/latest/) |
 | Genome Browser renderization | [JBrowse](http://jbrowse.org/) |
+| Circos plot generation | [easy_circos](https://easy_circos.readthedocs.io/en/latest/index.html) |
 | Renderization of automatic reports and shiny app for results interrogation | [R Markdown](https://rmarkdown.rstudio.com/), [Shiny](https://shiny.rstudio.com/) and [SequenceServer](https://sequenceserver.com/) |
diff --git a/mkdocs.yml b/mkdocs.yml
index 58482435..1a5157ba 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -22,6 +22,8 @@ theme:
     repo: fontawesome/brands/github-alt
 plugins:
   - git-revision-date
+  - search
+  - macros
 markdown_extensions:
   - pymdownx.emoji:
       emoji_index: !!python/name:materialx.emoji.twemoji
diff --git a/modules/KOs/kofamscan.nf b/modules/KOs/kofamscan.nf
index 87d25450..cd07f66e 100644
--- a/modules/KOs/kofamscan.nf
+++ b/modules/KOs/kofamscan.nf
@@ -4,7 +4,7 @@ process KOFAMSCAN {
     else "$filename"
   }
   tag "${prefix}"
-  label = [ 'misc', 'process_high' ]
+  label = [ 'process_high', 'error_retry' ]
 
   input:
   tuple val(prefix), file('proteins.faa')
diff --git a/modules/MGEs/draw_gis.nf b/modules/MGEs/draw_gis.nf
index ff64cfd2..13277613 100644
--- a/modules/MGEs/draw_gis.nf
+++ b/modules/MGEs/draw_gis.nf
@@ -5,7 +5,6 @@ process DRAW_GIS {
   }
   tag "${prefix}"
   label = [ 'misc', 'process_ultralow' ]
-
   input:
   tuple val(prefix), file(gff), file(gis_bed)
diff --git a/modules/MGEs/integron_finder.nf b/modules/MGEs/integron_finder.nf
new file mode 100644
index 00000000..b06fccdf
--- /dev/null
+++ b/modules/MGEs/integron_finder.nf
@@ -0,0 +1,42 @@
+process INTEGRON_FINDER {
+    publishDir "${params.output}", mode: 'copy', saveAs: { filename ->
+        if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename"
+        else "${prefix}/integron_finder/$filename"
+    }
+    tag "${prefix}"
+    label = [ 'process_medium' ]
+
+    input:
+    tuple val(prefix), file(genome)
+
+    output:
+    tuple val(prefix), path("*")                      , emit: all
+    tuple val(prefix), path("${prefix}_integrons.gbk"), emit: gbk, optional: true
+    path("integronfinder_version.txt")
+
+    script:
+    def args = task.ext.args ?: ''
+    """
+    # Get version
+    integron_finder --version > integronfinder_version.txt ;
+
+    # run tool
+    integron_finder \\
+        --local-max \\
+        --func-annot \\
+        --pdf \\
+        --gbk \\
+        --cpu $task.cpus \\
+        $args \\
+        $genome
+
+    # move results
+    mv Results_Integron_Finder_${prefix}/* . ;
+    rm -rf Results_Integron_Finder_${prefix} ;
+
+    # concatenate the genbank outputs (if any) for later GFF conversion
+    for gbk in \$(ls *.gbk) ; do
+        cat \$gbk >> ${prefix}_integrons.gbk ;
+    done
+    """
+}
diff --git a/modules/MGEs/integron_finder_2gff.nf b/modules/MGEs/integron_finder_2gff.nf
new file mode 100644
index 00000000..6abaeab3
--- /dev/null
+++ b/modules/MGEs/integron_finder_2gff.nf
@@ -0,0 +1,24 @@
+process INTEGRON_FINDER_2GFF {
+    publishDir "${params.output}/${prefix}/integron_finder", mode: 'copy'
+    tag "${prefix}"
+    label = [ 'misc', 'process_low' ]
+
+    input:
+    tuple val(prefix), file(gbk)
+
+    output:
+    tuple val(prefix), path("${prefix}_integrons.gff"), emit: gff
+
+    script:
+    def args = task.ext.args ?: ''
+    """
+    # convert to gff if available
+    touch ${prefix}_integrons.gff ;
+    for gbk in \$(ls *.gbk) ; do
+        conda run -n perl bp_genbank2gff3 \$gbk -o - | \
+            grep 'integron_id' | \
+            sed 's|ID=.*integron_id=|ID=|g' | \
+            sed 's/GenBank/Integron_Finder/g' >> ${prefix}_integrons.gff
+    done
+    """
+}
diff --git a/modules/MGEs/islandpath.nf b/modules/MGEs/islandpath.nf
index d7ded993..d9ef1714 100644
--- a/modules/MGEs/islandpath.nf
+++ b/modules/MGEs/islandpath.nf
@@ -1,7 +1,9 @@
 process ISLANDPATH {
   publishDir "${params.output}/${prefix}/genomic_islands", mode: 'copy'
   tag "${prefix}"
-  label = [ 'perl', 'process_low' ]
+  label = [ 'process_low' ]
+  errorStrategy = 'retry'
+  maxRetries = 5
 
   input:
   tuple val(prefix), file("annotation.gbk")
diff --git a/modules/MGEs/mob_suite.nf b/modules/MGEs/mob_suite.nf
new file mode 100644
index 00000000..14256e92
--- /dev/null
+++ b/modules/MGEs/mob_suite.nf
@@ -0,0 +1,36 @@
+process MOBSUITE {
+    publishDir "${params.output}", mode: 'copy', saveAs: { filename ->
+        if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename"
+        else "${prefix}/plasmids/mob_suite/$filename"
+    }
+    tag "${prefix}"
+    label = [ 'process_medium' ]
+
+    input:
+    tuple val(prefix), file(genome)
+
+    output:
+    tuple val(prefix), path("${prefix}_mobtyper_results.txt"), emit: results
+    path("mobtyper_version.txt")
+
+    script:
+    def args = task.ext.args ?: ''
+    """
+    # Get version
+    mob_typer --version > mobtyper_version.txt ;
+
+    # run tool
+    mob_typer \\
+        --multi \\
+        --num_threads $task.cpus \\
+        --sample_id $prefix \\
+        --infile $genome \\
+        $args \\
+        --out_file ${prefix}_mobtyper_results.txt
+    """
+}
diff --git a/modules/MGEs/plasmidfinder.nf b/modules/MGEs/plasmidfinder.nf
index 318e93ea..91580170 100644
--- a/modules/MGEs/plasmidfinder.nf
+++ b/modules/MGEs/plasmidfinder.nf
@@ -4,7 +4,7 @@ process PLASMIDFINDER {
     else null
   }
   tag "${prefix}"
-  label = [ 'python', 'process_low' ]
+  label = [ 'process_low' ]
 
   input:
   tuple val(prefix), file(genome)
diff --git a/modules/MGEs/platon.nf b/modules/MGEs/platon.nf
index 086ab4d3..e4be53ec 100644
--- a/modules/MGEs/platon.nf
+++ b/modules/MGEs/platon.nf
@@ -5,7 +5,7 @@ process PLATON {
     else null
   }
   tag "${prefix}"
-  label = [ 'python', 'process_medium' ]
+  label = [ 'process_medium' ]
 
   input:
   tuple val(prefix), file(genome)
diff --git a/modules/assembly/flye.nf b/modules/assembly/flye.nf
index 05e73580..d588d01e 100644
--- a/modules/assembly/flye.nf
+++ b/modules/assembly/flye.nf
@@ -4,7 +4,7 @@ process FLYE {
         else if (filename == "flye_${prefix}") "assembly"
         else null
     }
-    label 'process_high'
+    label = [ 'process_high', 'error_retry' ]
     tag "${prefix}"
 
     input:
@@ -18,14 +18,21 @@ process FLYE {
 
     script:
     lr = (lr_type == 'nanopore') ? '--nano-raw' : '--pacbio-raw'
+    dedup_lr = params.enable_deduplication ?
+        "gunzip -cf $lreads | awk '{if(NR%4==1) \$0=sprintf(\"@1_%d\",(1+i++)); print;}' | gzip -c > ${prefix}_deduplicated_reads.fastq.gz" :
+        "ln -s $lreads ${prefix}_deduplicated_reads.fastq.gz"
+
     """
     # Save flye version
    flye -v > flye_version.txt ;
 
+    # make read names unique (reads are renamed, not removed)
+    $dedup_lr
+
    # Run flye
    flye \\
        ${lr} \\
-        $lreads \\
+        ${prefix}_deduplicated_reads.fastq.gz \\
        --out-dir flye_${prefix} \\
        --threads $task.cpus &> flye.log ;
diff --git a/modules/assembly/unicycler.nf b/modules/assembly/unicycler.nf
index 75883bf5..145f1c1f 100644
--- a/modules/assembly/unicycler.nf
+++ b/modules/assembly/unicycler.nf
@@ -4,7 +4,7 @@ process UNICYCLER {
     else if (filename == "unicycler_${prefix}") "assembly"
     else null
   }
-  label 'process_high'
+  label = [ 'process_high', 'error_retry' ]
   tag "${prefix}"
 
   input:
@@ -17,14 +17,55 @@ process UNICYCLER {
     path('unicycler_version.txt'), emit: version
 
   script:
-  unpaired_param = (sreads.getName() != "input.3") ? "-s $sreads" : ""
-  paired_param   = (sread1.getName() != "input.1" && sread2.getName() != "input.2") ? "-1 $sread1 -2 $sread2" : ""
-  lr_param       = (lreads.getName() != "input.4") ? "-l $lreads" : ""
+  unpaired_param = ""
+  dedup_sreads   = ""
+  paired_param   = ""
+  dedup_paired   = ""
+  lr_param       = ""
+  dedup_lr       = ""
+
+  // sreads
+  if (sreads.getName() != "input.3") {
+
+    dedup_sreads = params.enable_deduplication ?
+      "gunzip -cf $sreads | awk '{if(NR%4==1) \$0=sprintf(\"@1_%d\",(1+i++)); print;}' | gzip -c > ${prefix}_deduplicated_sreads.fastq.gz" :
+      "ln -s $sreads ${prefix}_deduplicated_sreads.fastq.gz"
+
+    unpaired_param = "-s ${prefix}_deduplicated_sreads.fastq.gz"
+
+  }
+
+  // paired
+  if (sread1.getName() != "input.1" && sread2.getName() != "input.2") {
+
+    dedup_paired = params.enable_deduplication ?
+      "gunzip -cf $sread1 | awk '{if(NR%4==1) \$0=sprintf(\"@1_%d\",(1+i++)); print;}' | gzip -c > ${prefix}_deduplicated_sread_R1.fastq.gz && gunzip -cf $sread2 | awk '{if(NR%4==1) \$0=sprintf(\"@1_%d\",(1+i++)); print;}' | gzip -c > ${prefix}_deduplicated_sread_R2.fastq.gz" :
+      "ln -s $sread1 ${prefix}_deduplicated_sread_R1.fastq.gz && ln -s $sread2 ${prefix}_deduplicated_sread_R2.fastq.gz"
+
+    paired_param = "-1 ${prefix}_deduplicated_sread_R1.fastq.gz -2 ${prefix}_deduplicated_sread_R2.fastq.gz"
+
+  }
+
+  // lreads
+  if (lreads.getName() != "input.4") {
+
+    dedup_lr = params.enable_deduplication ?
+      "gunzip -cf $lreads | awk '{if(NR%4==1) \$0=sprintf(\"@1_%d\",(1+i++)); print;}' | gzip -c > ${prefix}_deduplicated_lreads.fastq.gz" :
+      "ln -s $lreads ${prefix}_deduplicated_lreads.fastq.gz"
+
+    lr_param = "-l ${prefix}_deduplicated_lreads.fastq.gz"
+
+  }
   """
   # Save unicycler version
   unicycler --version > unicycler_version.txt
 
+  # make read names unique (reads are renamed, not removed)
+  $dedup_sreads
+  $dedup_paired
+  $dedup_lr
+
   # Run unicycler
   unicycler \\
       $paired_param \\
diff --git a/modules/bacannot_dbs/amrfinder.nf b/modules/bacannot_dbs/amrfinder.nf
index 0d3b3955..c5039973 100644
--- a/modules/bacannot_dbs/amrfinder.nf
+++ b/modules/bacannot_dbs/amrfinder.nf
@@ -1,7 +1,7 @@
 process AMRFINDER_DB {
   publishDir "${params.output}/amrfinder_db", mode: 'copy', overwrite: "$params.force_update"
-  label = [ 'db_download', 'process_ultralow' ]
-
+  label 'process_ultralow'
+
   output:
   file("*")
diff --git a/modules/bacannot_dbs/antismash.nf b/modules/bacannot_dbs/antismash.nf
index 9f0fdd2f..5e9b8962 100644
--- a/modules/bacannot_dbs/antismash.nf
+++ b/modules/bacannot_dbs/antismash.nf
@@ -1,7 +1,7 @@
 process ANTISMASH_DB {
   publishDir "${params.output}/antismash_db", mode: 'copy', overwrite: "$params.force_update"
   label = [ 'db_download', 'process_ultralow' ]
-
+
   output:
   file("*")
diff --git a/modules/bacannot_dbs/argminer.nf b/modules/bacannot_dbs/argminer.nf
index bacbb91a..6b2881ea 100644
--- a/modules/bacannot_dbs/argminer.nf
+++ b/modules/bacannot_dbs/argminer.nf
@@ -1,7 +1,7 @@
 process ARGMINER_DB {
   publishDir "${params.output}/argminer_db", mode: 'copy', overwrite: "$params.force_update"
   label = [ 'db_download', 'process_ultralow' ]
-
+
   output:
   file("*")
diff --git a/modules/bacannot_dbs/card.nf b/modules/bacannot_dbs/card.nf
index d28feacb..411802a0 100644
--- a/modules/bacannot_dbs/card.nf
+++ b/modules/bacannot_dbs/card.nf
@@ -1,7 +1,7 @@
 process CARD_DB {
   publishDir "${params.output}/card_db", mode: 'copy', overwrite: "$params.force_update"
   label = [ 'db_download', 'process_ultralow' ]
-
+
   output:
   file("*")
@@ -11,5 +11,5 @@ process CARD_DB {
   wget --tries=10 https://card.mcmaster.ca/latest/data
   tar -xvf data ./card.json
   rm data
-  """
+  """
 }
diff --git a/modules/bacannot_dbs/get_zenodo.nf b/modules/bacannot_dbs/get_zenodo.nf
new file mode 100644
index 00000000..cb9aab4f
--- /dev/null
+++ b/modules/bacannot_dbs/get_zenodo.nf
@@ -0,0 +1,19 @@
+process GET_ZENODO_DB {
+    publishDir "${params.output}", mode: 'copy', overwrite: "$params.force_update"
+    label = [ 'db_download', 'process_low' ]
+
+    tag "Downloading pre-built databases"
+
+    output:
+    file("*")
+
+    script:
+    """
+    # download database from zenodo
+    zenodo_get https://doi.org/10.5281/zenodo.7615811
+
+    # organize data
+    tar zxvf *.tar.gz && rm *.tar.gz
+    rm -rf \$( find .
-name 'pipeline_info' ) + """ +} \ No newline at end of file diff --git a/modules/bacannot_dbs/iceberg.nf b/modules/bacannot_dbs/iceberg.nf index f7f859b8..1cebc233 100644 --- a/modules/bacannot_dbs/iceberg.nf +++ b/modules/bacannot_dbs/iceberg.nf @@ -1,7 +1,7 @@ process ICEBERG_DB { publishDir "${params.output}/iceberg_db", mode: 'copy', overwrite: "$params.force_update" label = [ 'db_download', 'process_ultralow' ] - + output: file("*") diff --git a/modules/bacannot_dbs/kofamscan.nf b/modules/bacannot_dbs/kofamscan.nf index 71baa955..2a5e6303 100644 --- a/modules/bacannot_dbs/kofamscan.nf +++ b/modules/bacannot_dbs/kofamscan.nf @@ -1,19 +1,28 @@ process KOFAMSCAN_DB { publishDir "${params.output}/kofamscan_db", mode: 'copy', overwrite: "$params.force_update" label = [ 'db_download', 'process_low' ] - + output: file("*") script: + if (workflow.containerEngine != 'singularity') { + chmod_cmd = 'chmod a+rw profiles.tar.gz ko_list' + chown_cmd = 'chown -R root:\$(id -g) profiles' + tar_cmd = '--same-owner' + } else { + chmod_cmd = '' + chown_cmd = '' + tar_cmd = '' + } """ # download kofamscan database wget --tries=10 ftp://ftp.genome.jp/pub/db/kofam/ko_list.gz wget --tries=10 ftp://ftp.genome.jp/pub/db/kofam/profiles.tar.gz gunzip ko_list.gz - chmod a+rw profiles.tar.gz ko_list - tar --same-owner -xvzf profiles.tar.gz - chown -R root:\$(id -g) profiles + $chmod_cmd + tar $tar_cmd -xvzf profiles.tar.gz + $chown_cmd rm -rf profiles.tar.gz # for the sake of size and fastness diff --git a/modules/bacannot_dbs/mlst.nf b/modules/bacannot_dbs/mlst.nf index 5c6401de..11de39df 100644 --- a/modules/bacannot_dbs/mlst.nf +++ b/modules/bacannot_dbs/mlst.nf @@ -1,7 +1,7 @@ process MLST_DB { publishDir "${params.output}/mlst_db", mode: 'copy', overwrite: "$params.force_update" label = [ 'db_download', 'process_ultralow' ] - + output: file("*") diff --git a/modules/bacannot_dbs/phast.nf b/modules/bacannot_dbs/phast.nf index 6e1771e7..0c7587ef 100644 --- a/modules/bacannot_dbs/phast.nf +++ b/modules/bacannot_dbs/phast.nf @@ -1,7 +1,7 @@ process PHAST_DB { publishDir "${params.output}/phast_db", mode: 'copy', overwrite: "$params.force_update" label = [ 'db_download', 'process_ultralow' ] - + output: file("*") diff --git a/modules/bacannot_dbs/phigaro.nf b/modules/bacannot_dbs/phigaro.nf index 7c9c2bb5..03f9c5a5 100644 --- a/modules/bacannot_dbs/phigaro.nf +++ b/modules/bacannot_dbs/phigaro.nf @@ -1,7 +1,7 @@ process PHIGARO_DB { publishDir "${params.output}/phigaro_db", mode: 'copy', overwrite: "$params.force_update" label = [ 'db_download', 'process_medium' ] - + output: file("*") diff --git a/modules/bacannot_dbs/plasmidfinder.nf b/modules/bacannot_dbs/plasmidfinder.nf index 18e778b4..05073cad 100644 --- a/modules/bacannot_dbs/plasmidfinder.nf +++ b/modules/bacannot_dbs/plasmidfinder.nf @@ -1,7 +1,7 @@ process PLASMIDFINDER_DB { publishDir "${params.output}", mode: 'copy', overwrite: "$params.force_update" label = [ 'db_download', 'process_ultralow' ] - + output: file("*") diff --git a/modules/bacannot_dbs/platon.nf b/modules/bacannot_dbs/platon.nf index e8e40c77..2d30881d 100644 --- a/modules/bacannot_dbs/platon.nf +++ b/modules/bacannot_dbs/platon.nf @@ -1,7 +1,7 @@ process PLATON_DB { publishDir "${params.output}/platon_db", mode: 'copy', overwrite: "$params.force_update" label = [ 'db_download', 'process_low' ] - + output: file("*") diff --git a/modules/bacannot_dbs/prokka.nf b/modules/bacannot_dbs/prokka.nf index 594e7f59..f82a24e2 100644 --- a/modules/bacannot_dbs/prokka.nf +++ 
+++ b/modules/bacannot_dbs/prokka.nf
@@ -1,7 +1,7 @@
 process PROKKA_DB {
   publishDir "${params.output}/prokka_db", mode: 'copy', overwrite: "$params.force_update"
   label = [ 'db_download', 'process_low' ]
-
+
   output:
   file("*")
diff --git a/modules/bacannot_dbs/resfinder.nf b/modules/bacannot_dbs/resfinder.nf
index 46914a12..2c2d1f43 100644
--- a/modules/bacannot_dbs/resfinder.nf
+++ b/modules/bacannot_dbs/resfinder.nf
@@ -1,7 +1,7 @@
 process RESFINDER_DB {
   publishDir "${params.output}/resfinder_db", mode: 'copy', overwrite: "$params.force_update"
   label = [ 'db_download', 'process_ultralow' ]
-
+
   output:
   file("*")
diff --git a/modules/bacannot_dbs/vfdb.nf b/modules/bacannot_dbs/vfdb.nf
index 6b9112f5..2a1673d9 100644
--- a/modules/bacannot_dbs/vfdb.nf
+++ b/modules/bacannot_dbs/vfdb.nf
@@ -1,7 +1,7 @@
 process VFDB_DB {
   publishDir "${params.output}/vfdb_db", mode: 'copy', overwrite: "$params.force_update"
   label = [ 'db_download', 'process_ultralow' ]
-
+
   output:
   file("*")
diff --git a/modules/bacannot_dbs/victors.nf b/modules/bacannot_dbs/victors.nf
index 6d9aa0f9..b5c28409 100644
--- a/modules/bacannot_dbs/victors.nf
+++ b/modules/bacannot_dbs/victors.nf
@@ -1,7 +1,7 @@
 process VICTORS_DB {
   publishDir "${params.output}/victors_db", mode: 'copy', overwrite: "$params.force_update"
   label = [ 'db_download', 'process_ultralow' ]
-
+
   output:
   file("*")
diff --git a/modules/generic/bakta.nf b/modules/generic/bakta.nf
index 80253324..8d673ca2 100644
--- a/modules/generic/bakta.nf
+++ b/modules/generic/bakta.nf
@@ -1,11 +1,11 @@
 process BAKTA {
   publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename ->
-    if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename"
-    else if (filename == "annotation") "$filename"
-    else null
+      if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename"
+      else if (filename == "annotation") "$filename"
+      else null
   }
   tag "${prefix}"
-  label = [ 'misc', 'process_medium', 'error_retry' ]
+  label = [ 'process_medium', 'error_retry' ]
 
   input:
   tuple val(prefix), val(entrypoint), file(sread1), file(sread2), file(sreads), file(lreads), val(lr_type), file(fast5), file(assembly), val(resfinder_species)
@@ -33,6 +33,9 @@
   # Save bakta version
   bakta --version &> bakta_version.txt ;
 
+  # truncate long fasta headers (keep '>' plus the first 20 characters) to respect the header-size limit
+  awk '{ if (\$0 ~ />/) print substr(\$0,1,21) ; else print \$0 }' $assembly > cleaned_header.fasta
+
   # Run bakta
   bakta \\
       --output annotation \\
@@ -41,7 +44,7 @@
       --prefix ${prefix} \\
       --strain '${prefix}' \\
       --db $bakta_db \\
-      $assembly
+      cleaned_header.fasta
 
   # fix fasta headers
   cut -f 1 -d ' ' annotation/${prefix}.fna > tmp.fa
diff --git a/modules/generic/barrnap.nf b/modules/generic/barrnap.nf
index cea24619..b646ab91 100644
--- a/modules/generic/barrnap.nf
+++ b/modules/generic/barrnap.nf
@@ -4,7 +4,7 @@ process BARRNAP {
     else "rRNA/$filename"
   }
   tag "${prefix}"
-  label = [ 'perl', 'process_low' ]
+  label = [ 'process_low' ]
 
   input:
   tuple val(prefix), file(genome)
diff --git a/modules/generic/circos.nf b/modules/generic/circos.nf
index 4fe3d0bf..c0c7ccfe 100644
--- a/modules/generic/circos.nf
+++ b/modules/generic/circos.nf
@@ -4,8 +4,7 @@ process CIRCOS {
     else "$filename"
   }
   tag "$prefix"
-
-  label = [ 'perl', 'process_low' ]
+  label = [ 'misc', 'process_low' ]
 
   input:
   tuple val(prefix), path(inputs, stageAs: 'results*')
diff --git a/modules/generic/gc_skew.nf b/modules/generic/gc_skew.nf
index e4827c78..68ea6170 100644
--- a/modules/generic/gc_skew.nf
+++ b/modules/generic/gc_skew.nf
@@ -1,7 +1,6 @@ process
GC_SKEW { tag "$prefix" - - label = [ 'python', 'process_low' ] + label = [ 'misc', 'process_low' ] input: tuple val(prefix), path(inputs) diff --git a/modules/generic/gff2gbk.nf b/modules/generic/gff2gbk.nf index df73cb82..c1e0ff88 100644 --- a/modules/generic/gff2gbk.nf +++ b/modules/generic/gff2gbk.nf @@ -10,9 +10,6 @@ process GFF2GBK { path "*.genbank", emit: results """ - # Activate env - export PATH=/opt/conda/envs/antismash/bin:\$PATH - # Run emboss seqret seqret \\ -sequence $input \\ diff --git a/modules/generic/gff2sql.nf b/modules/generic/gff2sql.nf index 536609af..0c34824a 100644 --- a/modules/generic/gff2sql.nf +++ b/modules/generic/gff2sql.nf @@ -33,9 +33,6 @@ process CREATE_SQL { fi - # Save results with better name - mv /work/${prefix}.sqlite . ; - # Save parser cp /work/bscripts/run_server.sh . ; """ diff --git a/modules/generic/jbrowse.nf b/modules/generic/jbrowse.nf index ea2cc183..d0b80a62 100644 --- a/modules/generic/jbrowse.nf +++ b/modules/generic/jbrowse.nf @@ -4,7 +4,7 @@ process JBROWSE { tag "${prefix}" input: - tuple val(prefix), file(merged_gff), file(draft), file("prokka_gff"), file(barrnap), file(gc_bedGraph), file(gc_chrSizes), file(resfinder_gff), file(phigaro), file(genomic_islands), file("methylation"), file("chr.sizes"), file(phispy_tsv), file(digIS_gff), file(antiSMASH), file(custom_annotations) + tuple val(prefix), file(merged_gff), file(draft), file("prokka_gff"), file(barrnap), file(gc_bedGraph), file(gc_chrSizes), file(resfinder_gff), file(phigaro), file(genomic_islands), file("methylation"), file("chr.sizes"), file(phispy_tsv), file(digIS_gff), file(antiSMASH), file(custom_annotations), file(integron_finder) output: path "*", emit: results @@ -29,6 +29,7 @@ process JBROWSE { -S chr.sizes \\ -R $resfinder_gff \\ -d $digIS_gff \\ - -A $antiSMASH + -A $antiSMASH \\ + -i $integron_finder """ } diff --git a/modules/generic/karyotype.nf b/modules/generic/karyotype.nf index eb8eb8d9..cc2b37ad 100644 --- a/modules/generic/karyotype.nf +++ b/modules/generic/karyotype.nf @@ -1,6 +1,5 @@ process MAKE_KARYOTYPE { tag "$prefix" - label = [ 'misc', 'process_low' ] input: diff --git a/modules/generic/mash.nf b/modules/generic/mash.nf index 6264fdb6..b2ca8961 100644 --- a/modules/generic/mash.nf +++ b/modules/generic/mash.nf @@ -4,7 +4,7 @@ process REFSEQ_MASHER { else "refseq_masher/$filename" } tag "${prefix}" - label = [ 'python', 'process_low' ] + label = [ 'process_low' ] input: tuple val(prefix), path(genome) diff --git a/modules/generic/merge_annotations.nf b/modules/generic/merge_annotations.nf index 96d589b7..7343bd8a 100644 --- a/modules/generic/merge_annotations.nf +++ b/modules/generic/merge_annotations.nf @@ -4,7 +4,7 @@ process MERGE_ANNOTATIONS { tag "${prefix}" input: - tuple val(prefix), file('prokka_gff'), file(kofamscan), file(vfdb), file(victors), file(amrfinder), file(resfinder), file(rgi), file(iceberg), file(phast), file('digis_gff'), file(custom_databases) + tuple val(prefix), file('prokka_gff'), file(kofamscan), file(vfdb), file(victors), file(amrfinder), file(resfinder), file(rgi), file(iceberg), file(phast), file('digis_gff'), file(custom_databases), file(integron_finder) output: tuple val(prefix), path("${prefix}.gff") , emit: gff @@ -108,5 +108,11 @@ process MERGE_ANNOTATIONS { cat ${prefix}.gff transposable_elements_digis.gff | bedtools sort > tmp.out.gff ; ( cat tmp.out.gff > ${prefix}.gff && rm tmp.out.gff ); fi + + ### integron_finder results + ### integrons are unique / complete elements and should not be intersected + cat 
${prefix}.gff $integron_finder | bedtools sort > tmp.gff ; + cat tmp.gff > ${prefix}.gff + rm tmp.gff """ } diff --git a/modules/generic/merge_summaries.nf b/modules/generic/merge_summaries.nf index d4437b10..55c50389 100644 --- a/modules/generic/merge_summaries.nf +++ b/modules/generic/merge_summaries.nf @@ -1,7 +1,6 @@ process MERGE_SUMMARIES { publishDir "${params.output}", mode: 'copy' label = [ 'misc', 'process_low' ] - input: path(summaries) diff --git a/modules/generic/mlst.nf b/modules/generic/mlst.nf index 488327c8..6a4ced52 100644 --- a/modules/generic/mlst.nf +++ b/modules/generic/mlst.nf @@ -1,10 +1,10 @@ process MLST { publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> - if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" - else "MLST/$filename" + if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" + else "MLST/$filename" } tag "${prefix}" - label = [ 'perl', 'process_ultralow' ] + label = [ 'process_ultralow' ] input: tuple val(prefix), file(genome) @@ -19,9 +19,7 @@ process MLST { script: """ # update tool database - mlst_dir=\$(which mlst | sed 's/bin\\/mlst//g') - cp ${bacannot_db}/mlst_db/* -r \${mlst_dir}/db/pubmlst/ - ( cd \$mlst_dir/scripts && ./mlst-make_blast_db ) + mlst-make_blast_db.sh ${bacannot_db}/mlst_db # Save mlst tool version mlst --version > mlst_version.txt ; diff --git a/modules/generic/prepare_circos.nf b/modules/generic/prepare_circos.nf index 8afb64df..eec1f513 100644 --- a/modules/generic/prepare_circos.nf +++ b/modules/generic/prepare_circos.nf @@ -1,6 +1,5 @@ process PREPARE_CIRCOS { tag "$prefix" - label = [ 'misc', 'process_low' ] input: diff --git a/modules/generic/prokka.nf b/modules/generic/prokka.nf index 61ae6d7f..bd31e81d 100644 --- a/modules/generic/prokka.nf +++ b/modules/generic/prokka.nf @@ -1,11 +1,11 @@ process PROKKA { publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> - if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" - else if (filename == "annotation") "$filename" - else null + if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" + else if (filename == "annotation") "$filename" + else null } tag "${prefix}" - label = [ 'perl', 'process_medium' ] + label = [ 'process_medium' ] input: tuple val(prefix), val(entrypoint), file(sread1), file(sread2), file(sreads), file(lreads), val(lr_type), file(fast5), file(assembly), val(resfinder_species) @@ -26,11 +26,13 @@ process PROKKA { path('prokka_version.txt'), emit: version script: - kingdom = (params.prokka_kingdom) ? "--kingdom ${params.prokka_kingdom}" : '' - gcode = (params.prokka_genetic_code) ? "--gcode ${params.prokka_genetic_code}" : '' - rnammer = (params.prokka_use_rnammer) ? "--rnammer" : '' - models = (params.prokka_use_pgap) ? "PGAP_NCBI.hmm" : "TIGRFAMs_15.0.hmm" + kingdom = (params.prokka_kingdom) ? "--kingdom ${params.prokka_kingdom}" : '' + gcode = (params.prokka_genetic_code) ? "--gcode ${params.prokka_genetic_code}" : '' + rnammer = (params.prokka_use_rnammer) ? "--rnammer" : '' + models = (params.prokka_use_pgap) ? 
"PGAP_NCBI.hmm" : "TIGRFAMs_15.0.hmm" """ + #!/usr/bin/env bash + # save prokka version prokka -v &> prokka_version.txt ; @@ -45,6 +47,9 @@ process PROKKA { # hmmpress ( cd prokka_db/hmm/ ; for i in *.hmm ; do hmmpress -f \$i ; done ) + # clean headers char limit + awk '{ if (\$0 ~ />/) print substr(\$0,1,21) ; else print \$0 }' $assembly > cleaned_header.fasta + # run prokka prokka \\ --dbdir prokka_db \\ @@ -56,7 +61,7 @@ process PROKKA { --genus '' \\ --species '' \\ --strain \"${prefix}\" \\ - $assembly + cleaned_header.fasta # remove tmp dir to gain space rm -r prokka_db diff --git a/modules/generic/reports.nf b/modules/generic/reports.nf index 29e2d418..772d470a 100644 --- a/modules/generic/reports.nf +++ b/modules/generic/reports.nf @@ -4,7 +4,7 @@ process REPORT { tag "${prefix}" input: - tuple val(prefix), file('annotation_stats.tsv'), file(gff), file(barrnap), file(mlst), file(keggsvg), file(refseq_masher_txt), file(amrfinder), file(rgi), file(rgi_parsed), file(rgi_heatmap), file(argminer_out), file(resfinder_tab), file(resfinder_point), file(resfinder_phenotable), file(vfdb_blastn), file(victors_blastp), file(phigaro_txt), file(phispy_tsv), file(iceberg_blastp), file(iceberg_blastn), file(plasmids_tsv), file(platon_tsv), file(gi_image), file(phast_blastp), file(digIS) + tuple val(prefix), file('annotation_stats.tsv'), file(gff), file(barrnap), file(mlst), file(keggsvg), file(refseq_masher_txt), file(amrfinder), file(rgi), file(rgi_parsed), file(rgi_heatmap), file(argminer_out), file(resfinder_tab), file(resfinder_point), file(resfinder_phenotable), file(vfdb_blastn), file(victors_blastp), file(phigaro_txt), file(phispy_tsv), file(iceberg_blastp), file(iceberg_blastn), file(plasmids_tsv), file(platon_tsv), file(mobsuite_tsv), file(gi_image), file(phast_blastp), file(digIS), file(integronfinder) output: path '*.html', emit: results @@ -23,54 +23,68 @@ process REPORT { ## Generate generic Report rmarkdown::render("report_general.Rmd" , \ - params = list( generic_annotation = "annotation_stats.tsv", \ - generic_annotator = "${generic_annotator}", \ - kegg = "$keggsvg", \ - barrnap = "$barrnap", \ - mlst = "$mlst", \ - refseq_masher = "$refseq_masher_txt", \ - query = "${prefix}")) ; + params = list( + generic_annotation = "annotation_stats.tsv", \ + generic_annotator = "${generic_annotator}", \ + kegg = "$keggsvg", \ + barrnap = "$barrnap", \ + mlst = "$mlst", \ + refseq_masher = "$refseq_masher_txt", \ + query = "${prefix}" + ) + ) ; ## Generate Resistance Report - rmarkdown::render("report_resistance.Rmd", params = list(\ - blast_id = ${params.blast_resistance_minid} , \ - blast_cov = ${params.blast_resistance_mincov}, \ - amrfinder = "$amrfinder", \ - query = "${prefix}", \ - rgitool = "$rgi", \ - rgiparsed = "$rgi_parsed", \ - rgi_heatmap = "$rgi_heatmap", \ - argminer_blastp = "$argminer_out", \ - resfinder_tab = "$resfinder_tab", \ - resfinder_pointfinder = "$resfinder_point", \ - resfinder_phenotype = "$resfinder_phenotable", \ - generic_annotator = "${generic_annotator}", \ - gff = "$gff")) ; + rmarkdown::render("report_resistance.Rmd", \ + params = list(\ + blast_id = ${params.blast_resistance_minid} , \ + blast_cov = ${params.blast_resistance_mincov}, \ + amrfinder = "$amrfinder", \ + query = "${prefix}", \ + rgitool = "$rgi", \ + rgiparsed = "$rgi_parsed", \ + rgi_heatmap = "$rgi_heatmap", \ + argminer_blastp = "$argminer_out", \ + resfinder_tab = "$resfinder_tab", \ + resfinder_pointfinder = "$resfinder_point", \ + resfinder_phenotype = "$resfinder_phenotable", \ + 
generic_annotator = "${generic_annotator}", \ + gff = "$gff" + ) + ) ; ## Generate Virulence Report rmarkdown::render("report_virulence.Rmd" , \ - params = list( blast_id = ${params.blast_virulence_minid} , \ - blast_cov = ${params.blast_virulence_mincov}, \ - vfdb_blast = "$vfdb_blastn", \ - gff = "$gff", \ - victors_blast = "$victors_blastp", \ - query = "${prefix}")) ; + params = list( + blast_id = ${params.blast_virulence_minid} , \ + blast_cov = ${params.blast_virulence_mincov}, \ + vfdb_blast = "$vfdb_blastn", \ + gff = "$gff", \ + victors_blast = "$victors_blastp", \ + query = "${prefix}" + ) + ) ; ## Generate MGEs report rmarkdown::render("report_MGEs.Rmd", \ - params = list( blast_id = ${params.blast_MGEs_minid}, \ - blast_cov = ${params.blast_MGEs_mincov}, \ - phigaro_dir = "${params.output}/prophages/phigaro", \ - phigaro_txt = "$phigaro_txt", \ - phispy_tsv = "$phispy_tsv", \ - ice_prot_blast = "$iceberg_blastp", \ - ice_genome_blast = "$iceberg_blastn", \ - plasmid_finder_tab = "$plasmids_tsv", \ - platon_tsv = "$platon_tsv", \ - query = "${prefix}", \ - gi_image = "$gi_image", \ - digis = "$digIS", \ - gff = "$gff", \ - phast_prot_blast = "$phast_blastp" )) ; + params = list( + blast_id = ${params.blast_MGEs_minid}, \ + blast_cov = ${params.blast_MGEs_mincov}, \ + phigaro_dir = "${params.output}/prophages/phigaro", \ + phigaro_txt = "$phigaro_txt", \ + phispy_tsv = "$phispy_tsv", \ + ice_prot_blast = "$iceberg_blastp", \ + ice_genome_blast = "$iceberg_blastn", \ + plasmid_finder_tab = "$plasmids_tsv", \ + platon_tsv = "$platon_tsv", \ + mobsuite_tsv = "$mobsuite_tsv", \ + query = "${prefix}", \ + gi_image = "$gi_image", \ + digis = "$digIS", \ + integronfinder = "$integronfinder", \ + gff = "$gff", \ + phast_prot_blast = "$phast_blastp" + ) + ) ; """ } diff --git a/modules/generic/sequenceserver.nf b/modules/generic/sequenceserver.nf index 2663ef97..2048dffe 100644 --- a/modules/generic/sequenceserver.nf +++ b/modules/generic/sequenceserver.nf @@ -1,8 +1,7 @@ process SEQUENCESERVER { publishDir "${params.output}/${prefix}/SequenceServerDBs", mode: 'copy' tag "${prefix}" - label = [ 'server', 'process_ultralow' ] - + label = [ 'server', 'process_ultralow' ] input: tuple val(prefix), file(genome), file(genes), file(proteins) diff --git a/modules/generic/summary.nf b/modules/generic/summary.nf index eda601ad..f443ee8c 100644 --- a/modules/generic/summary.nf +++ b/modules/generic/summary.nf @@ -1,22 +1,34 @@ process SUMMARY { publishDir "${params.output}/${prefix}", mode: 'copy' tag "${prefix}" - label = [ 'python', 'process_low' ] - + label = [ 'misc', 'process_low' ] input: tuple val(prefix), - file(annotation), file(stageAs: "results/${prefix}/MLST/*"), - file(stageAs: "results/${prefix}/rRNA/*"), file(stageAs: "results/${prefix}/*"), - file(stageAs: "results/${prefix}/plasmids/*"), file(stageAs: "results/${prefix}/plasmids/*"), - file(stageAs: "results/${prefix}/genomic_islands/*"), file(stageAs: "results/${prefix}/virulence/vfdb/*"), - file(stageAs: "results/${prefix}/virulence/victors/*"), file(stageAs: "results/${prefix}/prophages/phast_db/*"), - file(stageAs: "results/${prefix}/prophages/phigaro/*"), file(stageAs: "results/${prefix}/prophages/*"), - file(stageAs: "results/${prefix}/ICEs/*"), file(stageAs: "results/${prefix}/resistance/AMRFinderPlus/*"), - file(stageAs: "results/${prefix}/resistance/RGI/*"), file(stageAs: "results/${prefix}/resistance/ARGMiner/*"), - file(stageAs: "results/${prefix}/resistance/*"), file(stageAs: "results/${prefix}/methylations/*"), - 
file(stageAs: "results/${prefix}/refseq_masher/*"), file(stageAs: "results/${prefix}/*"), - file(stageAs: "results/${prefix}/*"), file(stageAs: "results/${prefix}/gffs/*") + file(annotation), + file(stageAs: "results/${prefix}/MLST/*"), + file(stageAs: "results/${prefix}/rRNA/*"), + file(stageAs: "results/${prefix}/*"), + file(stageAs: "results/${prefix}/plasmids/*"), + file(stageAs: "results/${prefix}/plasmids/*"), + file(stageAs: "results/${prefix}/genomic_islands/*"), + file(stageAs: "results/${prefix}/virulence/vfdb/*"), + file(stageAs: "results/${prefix}/virulence/victors/*"), + file(stageAs: "results/${prefix}/prophages/phast_db/*"), + file(stageAs: "results/${prefix}/prophages/phigaro/*"), + file(stageAs: "results/${prefix}/prophages/*"), + file(stageAs: "results/${prefix}/ICEs/*"), + file(stageAs: "results/${prefix}/resistance/AMRFinderPlus/*"), + file(stageAs: "results/${prefix}/resistance/RGI/*"), + file(stageAs: "results/${prefix}/resistance/ARGMiner/*"), + file(stageAs: "results/${prefix}/resistance/*"), + file(stageAs: "results/${prefix}/methylations/*"), + file(stageAs: "results/${prefix}/refseq_masher/*"), + file(stageAs: "results/${prefix}/*"), + file(stageAs: "results/${prefix}/*"), + file(stageAs: "results/${prefix}/gffs/*"), + file(stageAs: "results/${prefix}/integron_finder/*"), + file(stageAs: "results/${prefix}/plasmids/mob_suite/*") output: tuple val(prefix), path("${prefix}_summary.json"), emit: summaries @@ -25,7 +37,7 @@ process SUMMARY { """ mkdir -p results/${prefix}/annotation ln -rs annotation/* results/${prefix}/annotation - source activate falmeida-py + sed -i 's/s:/:/g' results/${prefix}/annotation/${prefix}.txt falmeida-py bacannot2json -i results -o ${prefix}_summary.json """ } diff --git a/modules/prophages/phigaro.nf b/modules/prophages/phigaro.nf index f172cb86..915a2ec7 100644 --- a/modules/prophages/phigaro.nf +++ b/modules/prophages/phigaro.nf @@ -4,7 +4,7 @@ process PHIGARO { else "prophages/phigaro/$filename" } tag "${prefix}" - label = [ 'python', 'process_medium' ] + label = [ 'process_medium' ] input: tuple val(prefix), file("assembly.fasta") @@ -18,10 +18,7 @@ process PHIGARO { path('phigaro_version.txt') , emit: version script: - """ - # activate env - source activate phigaro - + """ # get tool version phigaro -V > phigaro_version.txt ; diff --git a/modules/prophages/phispy.nf b/modules/prophages/phispy.nf index 84b8cab4..ef32fc90 100644 --- a/modules/prophages/phispy.nf +++ b/modules/prophages/phispy.nf @@ -5,7 +5,7 @@ process PHISPY { else null } tag "${prefix}" - label = [ 'python', 'process_medium' ] + label = [ 'process_medium' ] input: tuple val(prefix), file(input) diff --git a/modules/resistance/amrfinder.nf b/modules/resistance/amrfinder.nf index 9aa7c3cc..6f36e228 100644 --- a/modules/resistance/amrfinder.nf +++ b/modules/resistance/amrfinder.nf @@ -4,7 +4,7 @@ process AMRFINDER { else "resistance/AMRFinderPlus/$filename" } tag "${prefix}" - label = [ 'misc', 'process_medium' ] + label = [ 'process_medium' ] input: tuple val(prefix), file(proteins) diff --git a/modules/resistance/amrfinder2tsv.nf b/modules/resistance/amrfinder2tsv.nf index 51a09c37..41f52854 100644 --- a/modules/resistance/amrfinder2tsv.nf +++ b/modules/resistance/amrfinder2tsv.nf @@ -1,6 +1,5 @@ process AMRFINDER2TSV { tag "$prefix" - label = [ 'renv', 'process_low' ] input: diff --git a/modules/resistance/resfinder.nf b/modules/resistance/resfinder.nf index 36dbee8d..9b7105e9 100644 --- a/modules/resistance/resfinder.nf +++ b/modules/resistance/resfinder.nf @@ 
-20,14 +20,11 @@ process RESFINDER { script: resistance_minid = params.blast_resistance_minid / 100.00 resistance_mincov = params.blast_resistance_mincov / 100.00 - if (resfinder_species.toLowerCase() != "other") + """ # activate env source activate resfinder - # Make databases available - ln -rs ${bacannot_db}/resfinder_db/db_* \$(dirname \$(which run_resfinder.py)) - # Run resfinder acquired resistance run_resfinder.py \\ --inputfasta $genome \\ @@ -35,53 +32,39 @@ process RESFINDER { --species \"${resfinder_species}\" \\ --min_cov ${resistance_mincov} \\ --threshold ${resistance_minid} \\ + --db_path_point ${bacannot_db}/resfinder_db/db_pointfinder \\ + --db_path_res ${bacannot_db}/resfinder_db/db_resfinder \\ --acquired ; # Fix name of pheno table mv resfinder/pheno_table.txt resfinder/args_pheno_table.txt &> /dev/null ; # Run resfinder pointfinder resistance - run_resfinder.py \\ - --inputfasta $genome \\ - -o resfinder \\ - --species \"${resfinder_species}\" \\ - --min_cov ${resistance_mincov} \\ - --threshold ${resistance_minid} \\ - --point ; - - # Fix name of pheno table - mv resfinder/pheno_table.txt resfinder/mutation_pheno_table.txt &> /dev/null ; - - # Convert to GFF - resfinder2gff.py \\ - -i resfinder/ResFinder_results_tab.txt > resfinder/results_tab.gff ; - """ - - else if (resfinder_species.toLowerCase() == "other") - """ - # activate env - source activate resfinder + if [ \"${resfinder_species.toLowerCase()}\" != "other" ]; then - # Make databases available - ln -rs ${bacannot_db}/resfinder_db/db_* \$(dirname \$(which run_resfinder.py)) + run_resfinder.py \\ + --inputfasta $genome \\ + -o resfinder \\ + --species \"${resfinder_species}\" \\ + --min_cov ${resistance_mincov} \\ + --threshold ${resistance_minid} \\ + --db_path_point ${bacannot_db}/resfinder_db/db_pointfinder \\ + --db_path_res ${bacannot_db}/resfinder_db/db_resfinder \\ + --point ; - # Run resfinder acquired resistance - run_resfinder.py \\ - --inputfasta $genome \\ - -o resfinder \\ - --species \"${resfinder_species}\" \\ - --min_cov ${resistance_mincov} \\ - --threshold ${resistance_minid} \\ - --acquired ; - - # Fix name of pheno table - mv resfinder/pheno_table.txt resfinder/args_pheno_table.txt &> /dev/null ; - - # touch pointfinder - touch resfinder/PointFinder_results.txt ; + # Fix name of pheno table + mv resfinder/pheno_table.txt resfinder/mutation_pheno_table.txt &> /dev/null ; + + else + # touch pointfinder + touch resfinder/PointFinder_results.txt ; + + fi + # Convert to GFF resfinder2gff.py \\ -i resfinder/ResFinder_results_tab.txt > resfinder/results_tab.gff ; """ + } diff --git a/modules/resistance/rgi_annotation.nf b/modules/resistance/rgi_annotation.nf index cd83eed7..9ae28955 100644 --- a/modules/resistance/rgi_annotation.nf +++ b/modules/resistance/rgi_annotation.nf @@ -5,7 +5,7 @@ process CARD_RGI { else "resistance/RGI/$filename" } tag "${prefix}" - label = [ 'python', 'process_medium' ] + label = [ 'process_medium' ] input: tuple val(prefix), path(input) @@ -20,10 +20,7 @@ process CARD_RGI { path("*_version.txt") , emit: version script: - """ - # activate env - source activate rgi - + """ # load database rgi load --card_json ${bacannot_db}/card_db/card.json --local diff --git a/modules/virulence/vfdb2tsv.nf b/modules/virulence/vfdb2tsv.nf index 595e1548..3a27daa6 100644 --- a/modules/virulence/vfdb2tsv.nf +++ b/modules/virulence/vfdb2tsv.nf @@ -1,6 +1,5 @@ process VFDB2TSV { tag "$prefix" - label = [ 'renv', 'process_low' ] input: diff --git a/nextflow.config b/nextflow.config index 
45c8494d..11ead557 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,11 +13,12 @@ includeConfig 'conf/defaults.config' params { // Boilerplate options - tracedir = "${params.output}/pipeline_info" plaintext_email = false monochrome_logs = false help = false get_config = false + get_docker_config = false + get_singularity_config = false get_samplesheet = false validate_params = true show_hidden_params = false @@ -82,19 +83,19 @@ process.shell = ['/bin/bash', '-euo', 'pipefail'] def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true - file = "${params.tracedir}/bacannot_timeline_${trace_timestamp}.html" + file = "${params.output}/pipeline_info/bacannot_timeline_${trace_timestamp}.html" } report { enabled = true - file = "${params.tracedir}/bacannot_report_${trace_timestamp}.html" + file = "${params.output}/pipeline_info/bacannot_report_${trace_timestamp}.html" } trace { enabled = true - file = "${params.tracedir}/bacannot_trace_${trace_timestamp}.txt" + file = "${params.output}/pipeline_info/bacannot_trace_${trace_timestamp}.txt" } dag { enabled = true - file = "${params.tracedir}/bacannot_pipeline_dag_${trace_timestamp}.svg" + file = "${params.output}/pipeline_info/bacannot_pipeline_dag_${trace_timestamp}.svg" } /* @@ -106,8 +107,8 @@ manifest { description = "Nextflow pipeline for bacterial genome annotation" homePage = "https://github.com/fmalmeida/bacannot" mainScript = "main.nf" - nextflowVersion = ">=20.10.0" - version = '3.2' + nextflowVersion = "!>=22.10.1" + version = '3.3' } // Function to ensure that resource requirements don't go beyond diff --git a/nextflow_schema.json b/nextflow_schema.json index 31ad9105..e95d3c34 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -11,10 +11,16 @@ "default": "", "properties": { "get_dbs": { - "type": "boolean" + "type": "boolean", + "description": "Download and build all the required databases on the fly (get today's version)" }, "force_update": { - "type": "boolean" + "type": "boolean", + "description": "Should we overwrite existing databases, if any?" + }, + "get_zenodo_db": { + "type": "boolean", + "description": "Download latest pre-built databases from Zenodo?" } } }, @@ -28,6 +34,10 @@ "properties": { "input": { "type": "string", "description": "Path to input samplesheet" }, + "enable_deduplication": { + "type": "boolean", + "description": "Execute deduplication on reads before assembly." + }, "output": { "type": "string", "description": "Path for output directory", @@ -53,15 +63,18 @@ "properties": { "max_cpus": { "type": "integer", - "default": 16 + "default": 16, + "description": "Maximum number of cpus a single module can use." }, "max_memory": { "type": "string", - "default": "20.GB" + "default": "20.GB", + "description": "Maximum memory a single module can use." }, "max_time": { "type": "string", - "default": "40.h" + "default": "40.h", + "description": "Maximum time a module can run."
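The max_cpus, max_memory and max_time entries above act as hard ceilings on what any single module may request. As a minimal, standalone sketch of how such a ceiling is typically enforced in a Nextflow config: the CPU-only helper below is an assumption modeled on the capping function that nextflow.config refers to ("Function to ensure that resource requirements don't go beyond"), not the pipeline's exact code.

// sketch: cap per-process CPU requests at params.max_cpus
params.max_cpus = 16

def check_max(int cpus) {
    // never hand a process more CPUs than the configured ceiling
    return Math.min(cpus, params.max_cpus as int)
}

process {
    // retries may escalate the request, but the ceiling still applies
    cpus = { check_max(4 * task.attempt) }
}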
} } }, @@ -167,7 +180,7 @@ "plasmids_minid": { "type": "number", "description": "Identity threshold for plasmid annotation", - "default": 90.0, + "default": 90, "minimum": 0, "maximum": 100, "help_text": "Must be between 0 and 100", @@ -176,7 +189,7 @@ "plasmids_mincov": { "type": "number", "description": "Coverage threshold for plasmid annotation", - "default": 60.0, + "default": 60, "minimum": 0, "maximum": 100, "help_text": "Must be between 0 and 100", @@ -185,7 +198,7 @@ "blast_virulence_minid": { "type": "number", "description": "Identity threshold for virulence factors annotation", - "default": 90.0, + "default": 90, "minimum": 0, "maximum": 100, "help_text": "Must be between 0 and 100", @@ -194,7 +207,7 @@ "blast_virulence_mincov": { "type": "number", "description": "Coverage threshold for virulence factors annotation", - "default": 90.0, + "default": 90, "minimum": 0, "maximum": 100, "help_text": "Must be between 0 and 100", @@ -203,7 +216,7 @@ "blast_resistance_minid": { "type": "number", "description": "Identity threshold for resistance genes annotation", - "default": 90.0, + "default": 90, "minimum": 0, "maximum": 100, "help_text": "Must be between 0 and 100", @@ -212,7 +225,7 @@ "blast_resistance_mincov": { "type": "number", "description": "Coverage threshold for resistance genes annotation", - "default": 90.0, + "default": 90, "minimum": 0, "maximum": 100, "help_text": "Must be between 0 and 100", @@ -221,7 +234,7 @@ "blast_MGEs_minid": { "type": "number", "description": "Identity threshold for ICEs and prophages annotation", - "default": 85.0, + "default": 85, "minimum": 0, "maximum": 100, "help_text": "Must be between 0 and 100", @@ -230,7 +243,7 @@ "blast_MGEs_mincov": { "type": "number", "description": "Coverage threshold for ICEs and prophages annotation", - "default": 85.0, + "default": 85, "minimum": 0, "maximum": 100, "help_text": "Must be between 0 and 100", @@ -260,7 +273,7 @@ "blast_custom_minid": { "type": "number", "description": "Min. identity % for the annotation using user's custom database", - "default": 65.0, + "default": 65, "minimum": 0, "maximum": 100, "hidden": true @@ -268,7 +281,7 @@ "blast_custom_mincov": { "type": "number", "description": "Min. gene/subject coverage % for the annotation using user's custom database", - "default": 65.0, + "default": 65, "minimum": 0, "maximum": 100, "hidden": true @@ -292,6 +305,16 @@ "description": "Download template config for parameters", "fa_icon": "fas fa-question-circle" }, + "get_docker_config": { + "type": "boolean", + "description": "Download template docker config for containers.", + "fa_icon": "fas fa-question-circle" + }, + "get_singularity_config": { + "type": "boolean", + "description": "Download template singularity config for containers.", + "fa_icon": "fas fa-question-circle" + }, "get_samplesheet": { "type": "boolean", "fa_icon": "fas fa-question-circle", @@ -302,13 +325,6 @@ "help_text": "Number of minimum overlapping base pairs required for merging\nNegative values, such as -20, means the number of required overlapping bases for merging.\nPositive values, such as 5, means the maximum distance accepted between features for merging.\nBy default (if Blank), this process is not executed. 
For execution the user needs to provide a value", "description": "Minimum overlapping base pairs required for merging" }, - "tracedir": { - "type": "string", - "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.output}/pipeline_info", - "fa_icon": "fas fa-cogs", - "hidden": true - }, "validate_params": { "type": "boolean", "description": "Boolean whether to validate parameters against the schema at runtime", @@ -326,17 +342,17 @@ "unicycler_version": { "type": "string", "description": "Select quay.io image tag for tool", - "default": "0.4.8--py38h8162308_3" + "default": "0.5.0--py310h6cc9453_3" }, "flye_version": { "type": "string", "description": "Select quay.io image tag for tool", - "default": "2.9--py39h39abbe0_0" + "default": "2.9--py39h6935b12_1" }, "bakta_version": { "type": "string", "description": "Select quay.io image tag for tool", - "default": "1.6.1--pyhdfd78af_0" + "default": "1.7.0--pyhdfd78af_1" } } }, diff --git a/workflows/bacannot.nf b/workflows/bacannot.nf index 57abf5d9..93087adf 100644 --- a/workflows/bacannot.nf +++ b/workflows/bacannot.nf @@ -14,12 +14,15 @@ include { KOFAMSCAN } from '../modules/KOs/kofamscan' include { KEGG_DECODER } from '../modules/KOs/kegg-decoder' include { PLASMIDFINDER } from '../modules/MGEs/plasmidfinder' include { PLATON } from '../modules/MGEs/platon' +include { MOBSUITE } from '../modules/MGEs/mob_suite' include { VFDB } from '../modules/virulence/vfdb' include { VICTORS } from '../modules/virulence/victors' include { PHAST } from '../modules/prophages/phast' include { PHIGARO } from '../modules/prophages/phigaro' include { PHISPY } from '../modules/prophages/phispy' include { ICEBERG } from '../modules/MGEs/iceberg' +include { INTEGRON_FINDER } from '../modules/MGEs/integron_finder' +include { INTEGRON_FINDER_2GFF } from '../modules/MGEs/integron_finder_2gff' include { ISLANDPATH } from '../modules/MGEs/islandpath' include { DRAW_GIS } from '../modules/MGEs/draw_gis' include { DIGIS } from '../modules/MGEs/digIS' @@ -120,16 +123,26 @@ workflow BACANNOT { PLATON( annotation_out_ch.genome, dbs_ch ) platon_output_ch = PLATON.out.results platon_all_ch = PLATON.out.all + // mob suite + MOBSUITE( annotation_out_ch.genome ) + mobsuite_output_ch = MOBSUITE.out.results } else { plasmidfinder_all_ch = Channel.empty() plasmidfinder_output_ch = Channel.empty() platon_output_ch = Channel.empty() platon_all_ch = Channel.empty() + mobsuite_output_ch = Channel.empty() } + // TODO: maybe make this MGE step optional? 
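The Channel.empty() fallbacks above, combined with the .join( ..., remainder: true ) calls further down, are what keep optional modules from stalling the workflow: when a tool is skipped or produces no result for a sample, the join emits a null placeholder instead of dropping the sample. A standalone toy example of that behaviour (sample and file names are hypothetical):

// runnable sketch: joining keyed channels with remainder: true
workflow {
    annotations_ch = Channel.of( ['sampleA', 'sampleA.gff'], ['sampleB', 'sampleB.gff'] )
    mobsuite_ch    = Channel.of( ['sampleA', 'sampleA_mobtyper.txt'] ) // no result for sampleB

    annotations_ch
        .join( mobsuite_ch, remainder: true ) // emits [sampleB, sampleB.gff, null] instead of dropping sampleB
        .view()
}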
+ // IslandPath software ISLANDPATH( annotation_out_ch.gbk ) + // Integron_finder software + INTEGRON_FINDER( annotation_out_ch.genome ) + INTEGRON_FINDER_2GFF( INTEGRON_FINDER.out.gbk ) + // Virulence search if (params.skip_virulence_search == false) { // VFDB @@ -286,7 +299,8 @@ workflow BACANNOT { .join(iceberg_output_blastp_ch, remainder: true) .join(phast_output_ch, remainder: true) .join(DIGIS.out.gff, remainder: true) - .join(ch_custom_annotations, remainder: true) + .join(ch_custom_annotations, remainder: true) + .join(INTEGRON_FINDER_2GFF.out.gff, remainder: true) ) /* @@ -326,6 +340,7 @@ workflow BACANNOT { .join( MERGE_ANNOTATIONS.out.digis_gff ) .join( antismash_output_ch, remainder: true ) .join( MERGE_ANNOTATIONS.out.customdb_gff.groupTuple(), remainder: true ) + .join( INTEGRON_FINDER_2GFF.out.gff, remainder: true ) ) // Render reports @@ -357,9 +372,11 @@ workflow BACANNOT { .join( iceberg_output_blastn_ch, remainder: true ) .join( plasmidfinder_output_ch, remainder: true ) .join( platon_output_ch, remainder: true ) + .join( mobsuite_output_ch, remainder: true ) .join( DRAW_GIS.out.example, remainder: true ) .join( phast_output_ch, remainder: true ) .join( MERGE_ANNOTATIONS.out.digis_gff ) + .join( INTEGRON_FINDER_2GFF.out.gff, remainder: true ) ) // @@ -367,27 +384,29 @@ workflow BACANNOT { // SUMMARY( annotation_out_ch.all - .join( MLST.out.all , remainder: true ) - .join( BARRNAP.out.all , remainder: true ) - .join( kofamscan_all_ch , remainder: true ) - .join( plasmidfinder_all_ch , remainder: true ) - .join( platon_all_ch , remainder: true ) - .join( ISLANDPATH.out.results , remainder: true ) - .join( vfdb_all_ch , remainder: true ) - .join( victors_all_ch , remainder: true ) - .join( phast_all_ch , remainder: true ) - .join( phigaro_all_ch , remainder: true ) - .join( phispy_all_ch , remainder: true ) - .join( iceberg_all_ch , remainder: true ) - .join( amrfinder_all_ch , remainder: true ) - .join( rgi_all_ch , remainder: true ) - .join( argminer_all_ch , remainder: true ) - .join( resfinder_all_ch , remainder: true ) - .join( CALL_METHYLATION.out.all , remainder: true ) - .join( REFSEQ_MASHER.out.results, remainder: true ) - .join( DIGIS.out.all , remainder: true ) - .join( antismash_all_ch , remainder: true ) - .join( MERGE_ANNOTATIONS.out.all, remainder: true ) + .join( MLST.out.all , remainder: true ) + .join( BARRNAP.out.all , remainder: true ) + .join( kofamscan_all_ch , remainder: true ) + .join( plasmidfinder_all_ch , remainder: true ) + .join( platon_all_ch , remainder: true ) + .join( ISLANDPATH.out.results , remainder: true ) + .join( vfdb_all_ch , remainder: true ) + .join( victors_all_ch , remainder: true ) + .join( phast_all_ch , remainder: true ) + .join( phigaro_all_ch , remainder: true ) + .join( phispy_all_ch , remainder: true ) + .join( iceberg_all_ch , remainder: true ) + .join( amrfinder_all_ch , remainder: true ) + .join( rgi_all_ch , remainder: true ) + .join( argminer_all_ch , remainder: true ) + .join( resfinder_all_ch , remainder: true ) + .join( CALL_METHYLATION.out.all , remainder: true ) + .join( REFSEQ_MASHER.out.results , remainder: true ) + .join( DIGIS.out.all , remainder: true ) + .join( antismash_all_ch , remainder: true ) + .join( MERGE_ANNOTATIONS.out.all , remainder: true ) + .join( INTEGRON_FINDER_2GFF.out.gff, remainder: true ) + .join( mobsuite_output_ch , remainder: true ) ) MERGE_SUMMARIES( SUMMARY.out.summaries.map{ it[1] }.collect() diff --git a/workflows/bacannot_dbs.nf b/workflows/bacannot_dbs.nf index c44d4d36..ca6c76e3 
100644 --- a/workflows/bacannot_dbs.nf +++ b/workflows/bacannot_dbs.nf @@ -16,6 +16,7 @@ include { ICEBERG_DB } from '../modules/bacannot_dbs/iceberg.nf' include { PHAST_DB } from '../modules/bacannot_dbs/phast.nf' include { KOFAMSCAN_DB } from '../modules/bacannot_dbs/kofamscan.nf' include { ANTISMASH_DB } from '../modules/bacannot_dbs/antismash.nf' +include { GET_ZENODO_DB } from '../modules/bacannot_dbs/get_zenodo.nf' /* DEF WORKFLOW @@ -23,21 +24,25 @@ include { ANTISMASH_DB } from '../modules/bacannot_dbs/antismash.nf' workflow CREATE_DBS { - download_db("prokka", "PROKKA_DB") - download_db("mlst", "MLST_DB") - download_db("kofamscan", "KOFAMSCAN_DB") - download_db("card", "CARD_DB") - download_db("resfinder", "RESFINDER_DB") - download_db("amrfinder", "AMRFINDER_DB") - download_db("argminer", "ARGMINER_DB") - download_db("platon", "PLATON_DB") - download_db("plasmidfinder", "PLASMIDFINDER_DB") - download_db("phigaro", "PHIGARO_DB") - download_db("phast", "PHAST_DB") - download_db("vfdb", "VFDB_DB") - download_db("victors", "VICTORS_DB") - download_db("iceberg", "ICEBERG_DB") - download_db("antismash", "ANTISMASH_DB") + if ( params.get_dbs && !params.get_zenodo_db ) { + download_db("prokka", "PROKKA_DB") + download_db("mlst", "MLST_DB") + download_db("kofamscan", "KOFAMSCAN_DB") + download_db("card", "CARD_DB") + download_db("resfinder", "RESFINDER_DB") + download_db("amrfinder", "AMRFINDER_DB") + download_db("argminer", "ARGMINER_DB") + download_db("platon", "PLATON_DB") + download_db("plasmidfinder", "PLASMIDFINDER_DB") + download_db("phigaro", "PHIGARO_DB") + download_db("phast", "PHAST_DB") + download_db("vfdb", "VFDB_DB") + download_db("victors", "VICTORS_DB") + download_db("iceberg", "ICEBERG_DB") + download_db("antismash", "ANTISMASH_DB") + } else if ( !params.get_dbs && params.get_zenodo_db ) { + GET_ZENODO_DB() + } }
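One behavioural note on the CREATE_DBS branch above: the two conditions are written as mutually exclusive, so a run with both --get_dbs and --get_zenodo_db set, or with neither, matches no branch and the workflow ends without downloading anything. A standalone sketch of a fail-fast guard for that case (the guard itself and the error wording are assumptions, not part of this patch):

// runnable sketch: make the mutually exclusive download flags explicit
params.get_dbs       = false
params.get_zenodo_db = false

workflow {
    // exactly one download mode must be selected
    if ( params.get_dbs == params.get_zenodo_db ) {
        error "Please choose exactly one of --get_dbs or --get_zenodo_db."
    }
    log.info( params.get_dbs
        ? 'building each database from its source'
        : 'downloading the pre-built archive from Zenodo' )
}

Saved as, e.g., a hypothetical guard.nf, this can be exercised with: nextflow run guard.nf --get_zenodo_db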