diff --git a/.github/workflows/build_kofamscan.yml b/.github/workflows/build_kofamscan.yml deleted file mode 100644 index a7cf5947..00000000 --- a/.github/workflows/build_kofamscan.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: build-kofamscan - -on: - workflow_dispatch: - schedule: - - cron: '0 0 1 */6 *' - -jobs: - - build: - runs-on: ubuntu-latest - env: - DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} - DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASS }} - steps: - - id: keydb - uses: pozetroninc/github-action-get-latest-release@master - with: - owner: fmalmeida - repo: bacannot - excludes: prerelease, draft - - - name: Check out pipeline code - uses: actions/checkout@v2 - - - name: download github repo - run: | - git clone https://github.com/fmalmeida/bacannot.git - - - name: Build and push docker image - id: buildx - run: | - # get more space - sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android - sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET - - # enter docker dir - cd bacannot/docker - - # login to docker - docker login -u "$DOCKERHUB_USERNAME" -p "$DOCKERHUB_PASS" - - # create image - docker build -t fmalmeida/bacannot:kofamscan_teste -f Dockerfile_kofamscan . - docker push fmalmeida/bacannot:kofamscan_teste diff --git a/.github/workflows/test_pr_docker.yml b/.github/workflows/test_pr_docker.yml index c6ebe0ae..24f9f90d 100644 --- a/.github/workflows/test_pr_docker.yml +++ b/.github/workflows/test_pr_docker.yml @@ -1,39 +1,46 @@ name: Testing new PR with docker on: pull_request: - branches: master - types: [ ready_for_review, synchronize, reopened ] + branches: [master, dev] + types: [ opened, synchronize, reopened ] jobs: run_nextflow: name: Run pipeline for the upcoming PR runs-on: ubuntu-latest - env: - DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} - DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASS }} - + steps: - - - name: Check out pipeline code - uses: actions/checkout@v2 - - - name: Install Nextflow - env: - CAPSULE_LOG: none - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - - - name: Clean environment - run: | - sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android - sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET - - - name: Build bacannot database - run: | - nextflow run main.nf -profile docker --get_dbs --output bacannot_dbs --max_cpus 2 --max_memory '6.GB' --max_time '6.h' - rm -rf bacannot_dbs/antismash_db bacannot_dbs/kofamscan_db bacannot_dbs/prokka_db/PGAP_NCBI.hmm # remove unused in quicktest to diminish size - - - name: Run the pipeline - run: | - nextflow run main.nf -profile docker,quicktest --bacannot_db bacannot_dbs + + - name: Check out pipeline code + uses: actions/checkout@v2 + + - name: Install Nextflow + env: + CAPSULE_LOG: none + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + + - name: Clean environment + run: | + sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android + sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET + + - name: Get database + run: | + nextflow run main.nf -profile docker --get_zenodo_db --output ./ --max_memory '6.GB' --max_cpus 2 + nextflow run main.nf -profile docker --get_zenodo_db --output ./ --max_memory '6.GB' --max_cpus 2 -resume + nextflow run main.nf -profile docker --get_zenodo_db --output ./ --max_memory '6.GB' --max_cpus 2 -resume + sudo rm -r work .nextflow* + yes | 
docker system prune + + - name: Run quicktest profile + run: | + nextflow run main.nf -profile docker,quicktest --bacannot_db $( realpath ./bac* ) --output ./results --max_memory '6.GB' --max_cpus 2 + sudo rm -r work .nextflow* + yes | docker system prune + + - name: View results + run: | + sudo apt-get install -y tree + tree ./results diff --git a/.github/workflows/test_pr_singularity.yml b/.github/workflows/test_pr_singularity.yml deleted file mode 100644 index b8bd776c..00000000 --- a/.github/workflows/test_pr_singularity.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: Testing new PR with singularity -on: - pull_request: - branches: [ master, dev, develop ] - types: [ ready_for_review, synchronize, reopened ] - -jobs: - run_nextflow: - name: Run pipeline for the upcoming PR - runs-on: ubuntu-latest - - steps: - - - name: Check out pipeline code - uses: actions/checkout@v2 - - - name: Install Nextflow - env: - CAPSULE_LOG: none - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - - - name: Install Singularity - uses: eWaterCycle/setup-singularity@v7 - with: - singularity-version: 3.8.3 - - - name: Clean environment - run: | - sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android - sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET - - - name: Build bacannot database - run: | - nextflow run main.nf -profile singularity --get_dbs --output bacannot_dbs --max_cpus 2 --max_memory '6.GB' --max_time '6.h' - rm -rf bacannot_dbs/antismash_db bacannot_dbs/kofamscan_db bacannot_dbs/prokka_db/PGAP_NCBI.hmm # remove unused in quicktest to diminish size - - - name: Run the pipeline - run: | - nextflow run main.nf -profile singularity,quicktest --bacannot_db bacannot_dbs diff --git a/.zenodo.json b/.zenodo.json index 3865b55a..cade7500 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -2,7 +2,7 @@ "description": "

The pipeline

\n\n

bacannot is a customisable, easy-to-use pipeline that uses state-of-the-art software for comprehensively annotating prokaryotic genomes, having only Docker and Nextflow as dependencies. It is able to annotate and detect virulence and resistance genes, plasmids, secondary metabolites, genomic islands, prophages, ICEs, KO, and more, while providing beautiful interactive documents for results exploration.

", "license": "other-open", "title": "fmalmeida/bacannot: A generic but comprehensive bacterial annotation pipeline", - "version": "v3.3", + "version": "v3.3.3", "upload_type": "software", "creators": [ { diff --git a/README.md b/README.md index 36625099..2eda798d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.3627669-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.3627669) +[![F1000 Paper](https://img.shields.io/badge/Citation%20F1000-10.12688/f1000research.139488.1-orange)](https://doi.org/10.12688/f1000research.139488.1) [![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/fmalmeida/bacannot?include_prereleases&label=Latest%20release)](https://github.com/fmalmeida/bacannot/releases) [![Documentation](https://img.shields.io/badge/Documentation-readthedocs-brightgreen)](https://bacannot.readthedocs.io/en/latest/?badge=latest) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.10.3-23aa62.svg?labelColor=000000)](https://www.nextflow.io/) @@ -8,6 +8,7 @@ [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![License](https://img.shields.io/badge/License-GPL%203-black)](https://github.com/fmalmeida/bacannot/blob/master/LICENSE) [![Follow on Twitter](http://img.shields.io/badge/twitter-%40fmarquesalmeida-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/fmarquesalmeida) +[![Zenodo Archive](https://img.shields.io/badge/Zenodo-Archive-blue)](https://doi.org/10.5281/zenodo.3627669) [![Open in Gitpod](https://gitpod.io/button/open-in-gitpod.svg)](https://gitpod.io/github.com/fmalmeida/bacannot) @@ -87,16 +88,7 @@ These images have been kept separate to not create massive Docker image and to a ## Installation -1. If you don't have it already install [Docker](https://docs.docker.com/) in your computer. - * After installed, you need to download the required Docker images - - ```bash - docker pull fmalmeida/bacannot:v3.3_misc ; - docker pull fmalmeida/bacannot:v3.3_renv ; - docker pull fmalmeida/bacannot:jbrowse ; - ``` - -๐Ÿ”ฅ Nextflow can also automatically handle images download on the fly when executed. All the other docker images from **biocontainers** are downloaded automatically. If docker has exceeded its download limit rates, please try again in a few hours. +1. If you don't have it already install either [Docker](https://docs.docker.com/) or [Singularity](https://docs.sylabs.io/guides/3.5/user-guide/index.html) in your computer. 2. Install Nextflow (version 20.10 or higher): @@ -110,54 +102,7 @@ These images have been kept separate to not create massive Docker image and to a ๐Ÿ”ฅ Users can get let the pipeline always updated with: `nextflow pull fmalmeida/bacannot` -### Downloading and updating databases - -Bacannot databases are not inside the docker images anymore to avoid huge images and problems with connections and limit rates with dockerhub. - -#### Pre-formatted - -Users can directly download pre-formatted databases from Zenodo: https://doi.org/10.5281/zenodo.7615811 - -Useful for standardization and also overcoming known issues that may arise when formatting databases with `singularity` profile. 
- -A module to download the latest pre-formatted database has also been made available: -```bash -# Download pipeline pre-built databases -nextflow run fmalmeida/bacannot --get_zenodo_db --output ./ -profile -``` - -#### I want to generate a new formatted database - -To download and format a copy of required bacannot databases users can execute the following: - -```bash -# Download pipeline databases -nextflow run fmalmeida/bacannot --get_dbs --output bacannot_dbs -profile -``` - -This will produce a directory like this: - -```bash -bacannot_dbs -โ”œโ”€โ”€ amrfinder_db -โ”œโ”€โ”€ antismash_db -โ”œโ”€โ”€ argminer_db -โ”œโ”€โ”€ card_db -โ”œโ”€โ”€ iceberg_db -โ”œโ”€โ”€ kofamscan_db -โ”œโ”€โ”€ mlst_db -โ”œโ”€โ”€ phast_db -โ”œโ”€โ”€ phigaro_db -โ”œโ”€โ”€ pipeline_info -โ”œโ”€โ”€ plasmidfinder_db -โ”œโ”€โ”€ platon_db -โ”œโ”€โ”€ prokka_db -โ”œโ”€โ”€ resfinder_db -โ”œโ”€โ”€ vfdb_db -โ””โ”€โ”€ victors_db -``` - -> To update databases you can either download a new one to a new directory. Remove the database you want to get a new one from the root bacannot dir and use the same command above to save in the same directory (the pipeline will only try to download missing databases). Or, you can use the parameter `--force_update` to download everything again. +Please refer to the installation page, for a complete guide on required images and databases. ยป ## Quickstart @@ -242,11 +187,19 @@ It will result in the following: 2. The JBrowse wrapper in the shiny server is not capable of displaying the GC content and methylation plots when available. It can only display the simpler tracks. If the user wants to visualise and interrogate the GC or methylation tracks it must open the JBrowse outside from the shiny server. For that, two options are available: * You can navigate to the `jbrowse` directory under your sample's output folder and simply execute `http-server`. This command can be found at: https://www.npmjs.com/package/http-server * Or, you can download the [JBrowse Desktop app](https://jbrowse.org/docs/jbrowse_desktop.html) and, from inside the app, select the folder `jbrowse/data` that is available in your sample's output directory. -3. If you face some weird error using v3.1, please, before opening a flag, try updating your docker image, we had some inconsistencies lately and this may be the source of the issue. +3. If you face some weird error using v3.1 or v3.2, please, before opening a ticket, try updating your docker images, we had some inconsistencies lately and this may be the source of the issue. +4. If facing an issue with the `BACANNOT:SUMMARY` module, identical or similar to the one reported in issue [[#96]](https://github.com/fmalmeida/bacannot/issues/96), please, before opening a ticket, try updating the python env docker image: `docker pull fmalmeida/bacannot:v3.2_pyenv`. The image has been recently updated to have the latest version of my python scripts, and that may solve the issue. If not, please open another. +5. Sometimes, the `BACANNOT:UNICYCLER` may fail with different, random issues, that does not seem correct, or seem really very random. For example, saying that a read is not available, even though it is there. After some tracing, we realised that the unicycler 0.4.8 installation from conda, and the biocontainer form quay.io is causing this random problem. To solve this issue, please run with a newer version of the tool. This solves the issue in most cases: `--unicycler_version 0.5.0--py310h6cc9453_3`. 
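For instance, such a run could look like the following hedged sketch (samplesheet and database paths are placeholders):

```bash
# pin a newer unicycler biocontainer tag to avoid the buggy 0.4.8 image
nextflow run fmalmeida/bacannot \
    -profile docker \
    --input samplesheet.yaml \
    --bacannot_db ./bacannot_dbs \
    --unicycler_version 0.5.0--py310h6cc9453_3
```
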
+ * Because `v3.2` is already tagged and frozen with Zenodo, we will not update it, thus, for this version, using the parameter to overwrite the tool version should be used. + * In `v3.3`, unicycler version will be defaulted to `0.5.0--py310h6cc9453_3` ## Citation -To cite this tool please refer to our [Zenodo tag](https://doi.org/10.5281/zenodo.3627669). +In order to cite this pipeline, please refer to: + +> Almeida FMd, Campos TAd and Pappas Jr GJ. Scalable and versatile container-based pipelines for de novo genome assembly and bacterial annotation. [version 1; peer review: awaiting peer review]. F1000Research 2023, 12:1205 (https://doi.org/10.12688/f1000research.139488.1) + +Additionally, archived versions of the pipeline are also found in [Zenodo](https://doi.org/10.5281/zenodo.3627669). This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [GPLv3](https://github.com/fmalmeida/bacannot/blob/master/LICENSE). diff --git a/conf/defaults.config b/conf/defaults.config index 0bdf2051..e8c5f69c 100644 --- a/conf/defaults.config +++ b/conf/defaults.config @@ -32,6 +32,9 @@ params { // It is also documented in the main manual: https://bacannot.readthedocs.io/en/latest/samplesheet input = null +// Enable reads deduplication for assembly? (If input has reads) + enable_deduplication = false + // path to directory containing databases used by bacannot // you can download databases with: // nextflow run fmalmeida/bacannot --get_dbs --output bacannot_dbs -profile @@ -122,6 +125,12 @@ params { // (NOT RUN?) sourmash skip_sourmash = false +// (NOT RUN?) integron finder tool + skip_integron_finder = false + +// (NOT RUN?) CIRCOS tool + skip_circos = false + /* * Custom databases can be used to annotate additional genes in the genome. 
* It runs a BLAST alignment against the genome, therefore, the custom database @@ -199,4 +208,4 @@ params { max_cpus = 16 max_time = '40.h' -} \ No newline at end of file +} diff --git a/conf/docker.config b/conf/docker.config index 7c376d5a..f8150ded 100644 --- a/conf/docker.config +++ b/conf/docker.config @@ -4,6 +4,7 @@ docker { enabled = true runOptions = '--platform linux/amd64 -u root:$(id -g)' } +params.running_engine = 'docker' /* @@ -18,21 +19,27 @@ process { // Custom pipeline's containers with various tools for general purposes // withLabel: 'db_download|db_tools|misc' { - container = 'fmalmeida/bacannot:v3.3_misc' + container = 'fmalmeida/bacannot@sha256:bdb31637cacf99736656ab3b69f1f01ba1b5eb026771d5c266b4c84e96153057' } // container for R tools withLabel: 'renv' { - container = 'fmalmeida/bacannot:v3.3_renv' + container = 'fmalmeida/bacannot@sha256:952f58a2c03e50f8a376073346fb1ccda28d6249e3fdfea07a3286a6ff1adf0c' } // container for bacannot server withLabel: 'server' { - container = 'fmalmeida/bacannot:server' + container = 'fmalmeida/bacannot@sha256:0ec3b289d6e0c624556d125b2ed9b63499178e266a315175fd87cf020a402898' } + // container for jbrowser withLabel: 'jbrowse' { - container = 'fmalmeida/bacannot:jbrowse' + container = 'fmalmeida/bacannot@sha256:6afdca17b561bf212c1f976422aee3fe047563c32a15112a6262556d1f75201e' + } + + // container for antismash + withName: 'ANTISMASH|ANTISMASH_DB' { + container = 'fmalmeida/bacannot@sha256:fe42fbbfb7d4a026dafb146cb533ee7f1d9a97b25ec6df64840796c343707ebb' } // @@ -95,7 +102,7 @@ process { } withName: PHIGARO { - container = "quay.io/biocontainers/phigaro:2.3.0--pyh7b7c402_0" + container = "quay.io/biocontainers/phigaro:2.4.0--pyhdfd78af_0" } withName: PHISPY { diff --git a/conf/singularity.config b/conf/singularity.config index c0dc40ff..383b7776 100644 --- a/conf/singularity.config +++ b/conf/singularity.config @@ -1,9 +1,13 @@ // Container usage and permission -docker.enabled = false -singularity.enabled = true -// singularity.runOptions = '--writable-tmpfs -B $PWD' -singularity.autoMounts = true +docker.enabled = false env.SINGULARITY_DISABLE_CACHE = 1 +singularity { + enabled = true + envWhitelist = ['SINGULARITY_TMPDIR'] + autoMounts = true +} +params.running_engine = 'singularity' +// singularity.runOptions = '--writable-tmpfs -e --no-home -B $PWD' /* @@ -18,21 +22,27 @@ process { // Custom pipeline's containers with various tools for general purposes // withLabel: 'db_download|db_tools|misc' { - container = 'docker://fmalmeida/bacannot:v3.3_misc' + container = 'docker://fmalmeida/bacannot@sha256:bdb31637cacf99736656ab3b69f1f01ba1b5eb026771d5c266b4c84e96153057' } // container for R tools withLabel: 'renv' { - container = 'docker://fmalmeida/bacannot:v3.3_renv' + container = 'docker://fmalmeida/bacannot@sha256:952f58a2c03e50f8a376073346fb1ccda28d6249e3fdfea07a3286a6ff1adf0c' } // container for bacannot server withLabel: 'server' { - container = 'docker://fmalmeida/bacannot:server' + container = 'docker://fmalmeida/bacannot@sha256:0ec3b289d6e0c624556d125b2ed9b63499178e266a315175fd87cf020a402898' } + // container for jbrowser withLabel: 'jbrowse' { - container = 'docker://fmalmeida/bacannot:jbrowse' + container = 'docker://fmalmeida/bacannot@sha256:6afdca17b561bf212c1f976422aee3fe047563c32a15112a6262556d1f75201e' + } + + // container for antismash + withName: 'ANTISMASH|ANTISMASH_DB' { + container = 'docker://fmalmeida/bacannot@sha256:fe42fbbfb7d4a026dafb146cb533ee7f1d9a97b25ec6df64840796c343707ebb' } // @@ -96,7 +106,7 @@ process { } 
withName: PHIGARO { - container = "https://depot.galaxyproject.org/singularity/phigaro:2.3.0--pyh7b7c402_0" + container = "https://depot.galaxyproject.org/singularity/phigaro:2.4.0--pyhdfd78af_0" } withName: PHISPY { diff --git a/docker/antismash/Dockerfile b/docker/antismash/Dockerfile new file mode 100644 index 00000000..3fc621fe --- /dev/null +++ b/docker/antismash/Dockerfile @@ -0,0 +1,29 @@ +FROM nfcore/base +LABEL authors="Felipe Almeida" \ + description="Docker image containing antismash for bacannot" + +# install mamba +RUN conda install \ + -n base -c conda-forge 'mamba=1.5' --yes && \ + conda clean -afy + +# set CONDA_PREFIX +ENV CONDA_PREFIX=/opt/conda + +# install antismash +RUN mamba create -y \ + -n antismash \ + -c bioconda -c conda-forge \ + 'bioconda::antismash-lite==6.1.1' 'anaconda::flask' 'anaconda::jinja2' 'anaconda::markupsafe' emboss nomkl && \ + chmod 777 -R /opt/conda/envs/antismash/lib/**/site-packages/antismash && \ + mamba clean -afy + +# update PATH variable +ENV PATH=/opt/conda/envs/antismash/bin:$PATH + +# install ubuntu packages +RUN apt-get update && apt-get install -y build-essential libtinfo5 libtiff5 libopenjp2-7 + +# fix permissions +WORKDIR /work +RUN chmod 777 -R /work diff --git a/docker/antismash/build.sh b/docker/antismash/build.sh new file mode 100644 index 00000000..51153a5c --- /dev/null +++ b/docker/antismash/build.sh @@ -0,0 +1 @@ +../../bin/build_image.sh $1 diff --git a/docker/misc/Dockerfile b/docker/misc/Dockerfile index 89002fa7..35e1a971 100644 --- a/docker/misc/Dockerfile +++ b/docker/misc/Dockerfile @@ -3,38 +3,44 @@ LABEL authors="Felipe Almeida" \ description="Docker image containing any-based bacannot tools" # install mamba -RUN conda update -n root conda --yes && \ - conda update -n base conda --yes && \ - conda install -c conda-forge -y 'mamba>=1.4' +RUN conda install -n base -c conda-forge 'mamba=1.5' --yes && \ + conda clean -afy +RUN pip install --upgrade pip + +# Install ubuntu packages +RUN apt-get update -y && apt-get install -y samtools libarchive13 build-essential # Install the conda environment RUN mamba install -y \ -c bioconda -c defaults -c conda-forge -c anaconda -c falmeida \ --no-channel-priority \ - 'conda-forge::python>=3.7' \ + 'python>=3.9' \ 'blast>=2.12' \ 'diamond>=2.0.15' \ 'bedtools>=2.30' \ 'kma' \ 'nanopolish' \ - 'biopython==1.78' \ + 'biopython==1.83' \ seqkit \ bioawk \ - 'easy_circos>=0.3' \ - 'falmeida-py>=1.2' \ - 'conda-forge::openssl>=1.1.1' \ + 'easy_circos==0.4' \ + 'conda-forge::openssl>=1.1' \ + 'pyproj=3.2' \ emboss \ libtiff \ jq && \ mamba clean -afy -# Install samtools -RUN apt-get update -y && apt-get install -y samtools +# install my custom scripts +RUN git clone https://github.com/fmalmeida/pythonScripts.git && \ + cd pythonScripts && \ + pip install . 
&& \ + falmeida-py --help # Install gff-toolbox -RUN git clone https://github.com/fmalmeida/gff-toolbox.git && \ - cd gff-toolbox && \ - python setup.py install && \ +RUN git clone https://github.com/fmalmeida/gff-toolbox.git +RUN cd gff-toolbox && \ + python3 setup.py install && \ gff-toolbox -h # Create env for digIS @@ -67,6 +73,7 @@ COPY victors_bkp/victors_06-2022.fasta /work/victors.fasta RUN mamba create -y -n resfinder \ -c bioconda -c defaults -c conda-forge -c anaconda -c falmeida \ 'resfinder>=4.1' docopt pandas && \ + chmod 777 -R /opt/conda/envs/resfinder && \ mamba clean -afy # get a copy of digis @@ -74,18 +81,14 @@ RUN git clone -b master https://github.com/janka2012/digIS.git COPY custom_fix_grange_digis.py /work/digIS/src/common/grange.py ENV PATH=/work/digIS:$PATH -# Create env for antismash -RUN conda create -y -n antismash -c bioconda -c conda-forge \ - 'antismash>=6' 'anaconda::jinja2' 'anaconda::markupsafe' nomkl && \ - rm -rf /opt/conda/envs/antismash/lib/*/site-packages/antismash/databases && \ - mamba clean -afy - # fix bioperl -RUN mamba create -n perl -y -c bioconda -c conda-forge perl-bioperl perl-app-cpanminus perl-yaml -RUN conda run -n perl PERL5LIB= PERL_LOCAL_LIB_ROOT= cpanm Bio::Root::RootI +RUN mamba create -n perl -y \ + -c bioconda -c conda-forge -c anaconda -c defaults \ + perl-bioperl perl-app-cpanminus perl-yaml +RUN mamba run -n perl PERL5LIB= PERL_LOCAL_LIB_ROOT= cpanm Bio::Root::RootI # fix python -RUN python3 -m pip install cryptography==38.0.4 +RUN python3 -m pip install cryptography==38.0.4 'biopython==1.83' 'matplotlib==3.7.3' # install get zenodo RUN pip3 install zenodo_get @@ -95,5 +98,3 @@ RUN apt-get install -y unzip # fix permissions RUN chmod 777 -R /work -RUN chmod 777 -R /opt/conda/envs/antismash/lib/*/site-packages/antismash -RUN chmod 777 -R /opt/conda/envs/resfinder \ No newline at end of file diff --git a/docker/renv/Dockerfile b/docker/renv/Dockerfile index e28066a8..5641c0b1 100644 --- a/docker/renv/Dockerfile +++ b/docker/renv/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:20.04 +FROM ubuntu:22.04 LABEL MAINTAINER Felipe Marques de Almeida @@ -13,7 +13,7 @@ RUN apt-get update && \ ## Install R RUN DEBIAN_FRONTEND=noninteractive apt-get install -y tzdata && \ - DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y r-base r-base-core r-api-3.5 + DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y r-base r-base-core ## Install R-packages RUN DEBIAN_FRONTEND=noninteractive apt-get update && \ diff --git a/docs/config.md b/docs/config.md index 9962abef..9aebaa19 100644 --- a/docs/config.md +++ b/docs/config.md @@ -13,191 +13,5 @@ Default configuration --------------------- ```groovy -/* - - Required / Default Parameters. - This parameters must always be set - -*/ -params { - - /* - - DB DOWNLOAD WORKFLOW - - */ - -// Trigger database download and formatting workflow? --> will not run annotation -// Will download and format a database inside {output} parameter - get_dbs = false - force_update = false - - /* - - ANNOTATION INPUTS - - */ - -// Input data mus be given inside a well-formated samplesheet. -// We provide a well-formated example at: https://github.com/fmalmeida/test_datasets/raw/main/bacannot_testing_samplesheets/samplesheet.yaml -// -// Please read the example samplesheet so you can understand how to properly fill it. 
-// -// It is also documented in the main manual: https://bacannot.readthedocs.io/en/latest/samplesheet - input = null - -// path to directory containing databases used by bacannot -// you can download databases with: -// nextflow run fmalmeida/bacannot --get_dbs --output bacannot_dbs -profile - bacannot_db = null - - /* - - GENERAL PARAMETERS - - */ - -// Main output folder name. More than one bacannot annotation can be redirected -// to the same output parameter. It is good to keep related annotations together. -// A subdirectory with the filename will be created inside this directory. - output = 'results' - -// Number of minimum overlapping base pairs required for merging -// Negative values, such as -20, means the number of required overlapping bases for merging. -// Positive values, such as 5, means the maximum distance accepted between features for merging. -// By default (if Blank), this process is not executed. For execution the user needs to provide a value - bedtools_merge_distance = null - - /* - * Bakta optional - */ -// If user set path to an existing bakta database, the pipeline will use bakta instead of prokka - bakta_db = null - - /* - * Prokka optional parameters - */ -// Include comprehensive PGAP hmm database in prokka annotation instead of TIGRFAM. -// PGAP is big and using it may have higher running times but better results - prokka_use_pgap = false - -// Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default 'Bacteria') - prokka_kingdom = null - -// Translation table code. Must be set if the above is set. -// Example: params.prokka_genetic.code = 11 - prokka_genetic_code = null - -// Use rnammer instead of Barrnap? False or True? - prokka_use_rnammer = false - - /* - * Resfinder species panel - */ - -// Species panel to be used when annotating with Resfinder. -// It sets a default for all samples in the samplesheet. -// If a sample has a different value inside the samplesheet it will overwrite the value for that sample -// If blank it will not be executed. -// It must be identical (without the *) as written in their webservice https://cge.cbs.dtu.dk/services/ResFinder/. -// E.g. 'Escherichia coli'; 'Klebsiella' ... - resfinder_species = null - - /* - * Handling the execution of processes - * - * By default, all processes are executed. These - * parameters tells wheter NOT to run a process. - * - * Which means: false will allow its execution - * while true will create a barrier and skip a process. - */ -// (NOT RUN?) Plasmids annotation (controls PlasmidFinder execution) - skip_plasmid_search = false - -// (NOT RUN?) General Virulence annotation (controls VFDB and Victors scan) - skip_virulence_search = false - -// (NOT RUN?) Resistance annotation (controls AMRfinder and RGI) - skip_resistance_search = false - -// (NOT RUN?) ICE annotation (controls ICEberg annotation) - skip_iceberg_search = false - -// (NOT RUN?) prophage annotation (controls PHAST and Phigaro) - skip_prophage_search = false - -// (NOT RUN?) KO (KEGG Orthology) annotation - skip_kofamscan = false - -// (NOT RUN?) antiSMASH (secondary metabolite) annotation - skip_antismash = false - - /* - * Custom databases can be used to annotate additional genes in the genome. - * It runs a BLAST alignment against the genome, therefore, the custom database - * More than one custom database can be given separated by commas. 
- * Gene headers must be properly formated as described in the - * documentation: https://bacannot.readthedocs.io/en/latest/custom-db - */ -// Custom fastas (PROT / NUCL) - custom_db = null -// Custom annotation using list of NCBI protein accs - ncbi_proteins = null - - /* - * Annotation thresholds to be used when scanning specific databases and features - * Select a combination of thresholds that is meaningful for your data. Some of - * the databases are protein-only, others are nucleotide only. We cannnot control - * that and the databases will be scanned either if blastp or blastn using these - * thresholds described here. - */ - -// Identity threshold for plasmid annotation - plasmids_minid = 90 - -// Coverage threshold for plasmid annotation - plasmids_mincov = 60 - -// Virulence genes identity threshold - blast_virulence_minid = 90 - -// Virulence genes coverage threshold - blast_virulence_mincov = 90 - -// AMR genes identity threshold - blast_resistance_minid= 90 - -// AMR genes coverage threshold - blast_resistance_mincov = 90 - -// MGEs (ICEs and Phages) identity threshold - blast_MGEs_minid = 85 - -// MGEs (ICEs and Phages) coverage threshold - blast_MGEs_mincov = 85 - -// User's custom database identity threshold - blast_custom_minid = 65 - -// User's custom database coverage threshold - blast_custom_mincov = 65 - - /* - * Resources allocation configuration - * Defaults only, expecting to be overwritten - */ -// Select versions of bioconda quay.io additional tools -// Tools that are not part of the core of the pipeline, -// but can eventually be used by users - unicycler_version = '0.4.8--py38h8162308_3' - flye_version = '2.9--py39h39abbe0_0' - bakta_version = '1.7.0--pyhdfd78af_1' - -// Max resource options - max_memory = '20.GB' - max_cpus = 16 - max_time = '40.h' - -} +{% include 'defaults.config' %} ``` \ No newline at end of file diff --git a/docs/defaults.config b/docs/defaults.config new file mode 100644 index 00000000..0d43fe48 --- /dev/null +++ b/docs/defaults.config @@ -0,0 +1,191 @@ +/* + + Required / Default Parameters. + This parameters must always be set + +*/ +params { + + /* + + DB DOWNLOAD WORKFLOW + + */ + +// Trigger database download and formatting workflow? --> will not run annotation +// Will download and format a database inside {output} parameter + get_dbs = false + force_update = false + get_zenodo_db = false // download pre-built database + + /* + + ANNOTATION INPUTS + + */ + +// Input data mus be given inside a well-formated samplesheet. +// We provide a well-formated example at: https://github.com/fmalmeida/test_datasets/raw/main/bacannot_testing_samplesheets/samplesheet.yaml +// +// Please read the example samplesheet so you can understand how to properly fill it. +// +// It is also documented in the main manual: https://bacannot.readthedocs.io/en/latest/samplesheet + input = null + +// Enable reads deduplication for assembly? (If input has reads) + enable_deduplication = false + +// path to directory containing databases used by bacannot +// you can download databases with: +// nextflow run fmalmeida/bacannot --get_dbs --output bacannot_dbs -profile + bacannot_db = null + + /* + + GENERAL PARAMETERS + + */ + +// Main output folder name. More than one bacannot annotation can be redirected +// to the same output parameter. It is good to keep related annotations together. +// A subdirectory with the filename will be created inside this directory. 
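+// e.g. (illustrative names): with output = 'results' and a sample named "ecoli",
+// that sample's annotation is written under results/ecoli/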
+ output = 'results' + +// Number of minimum overlapping base pairs required for merging +// Negative values, such as -20, means the number of required overlapping bases for merging. +// Positive values, such as 5, means the maximum distance accepted between features for merging. +// By default (if Blank), this process is not executed. For execution the user needs to provide a value + bedtools_merge_distance = null + + /* + * Bakta optional + */ +// If user set path to an existing bakta database, the pipeline will use bakta instead of prokka + bakta_db = null + + /* + * Prokka optional parameters + */ +// Include comprehensive PGAP hmm database in prokka annotation instead of TIGRFAM. +// PGAP is big and using it may have higher running times but better results + prokka_use_pgap = false + +// Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default 'Bacteria') + prokka_kingdom = null + +// Translation table code. Must be set if the above is set. +// Example: params.prokka_genetic.code = 11 + prokka_genetic_code = null + +// Use rnammer instead of Barrnap? False or True? + prokka_use_rnammer = false + + /* + * Resfinder species panel + */ + +// Species panel to be used when annotating with Resfinder. +// It sets a default for all samples in the samplesheet. +// If a sample has a different value inside the samplesheet it will overwrite the value for that sample +// If blank it will not be executed. +// It must be identical (without the *) as written in their webservice https://cge.cbs.dtu.dk/services/ResFinder/. +// E.g. 'Escherichia coli'; 'Klebsiella' ... + resfinder_species = null + + /* + * Handling the execution of processes + * + * By default, all processes are executed. These + * parameters tells wheter NOT to run a process. + * + * Which means: false will allow its execution + * while true will create a barrier and skip a process. + */ +// (NOT RUN?) Plasmids annotation (controls PlasmidFinder execution) + skip_plasmid_search = false + +// (NOT RUN?) General Virulence annotation (controls VFDB and Victors scan) + skip_virulence_search = false + +// (NOT RUN?) Resistance annotation (controls AMRfinder and RGI) + skip_resistance_search = false + +// (NOT RUN?) ICE annotation (controls ICEberg annotation) + skip_iceberg_search = false + +// (NOT RUN?) prophage annotation (controls PHAST and Phigaro) + skip_prophage_search = false + +// (NOT RUN?) KO (KEGG Orthology) annotation + skip_kofamscan = false + +// (NOT RUN?) antiSMASH (secondary metabolite) annotation + skip_antismash = false + + /* + * Custom databases can be used to annotate additional genes in the genome. + * It runs a BLAST alignment against the genome, therefore, the custom database + * More than one custom database can be given separated by commas. + * Gene headers must be properly formated as described in the + * documentation: https://bacannot.readthedocs.io/en/latest/custom-db + */ +// Custom fastas (PROT / NUCL) + custom_db = null +// Custom annotation using list of NCBI protein accs + ncbi_proteins = null + + /* + * Annotation thresholds to be used when scanning specific databases and features + * Select a combination of thresholds that is meaningful for your data. Some of + * the databases are protein-only, others are nucleotide only. We cannnot control + * that and the databases will be scanned either if blastp or blastn using these + * thresholds described here. 
+ */ + +// Identity threshold for plasmid annotation + plasmids_minid = 90 + +// Coverage threshold for plasmid annotation + plasmids_mincov = 60 + +// Virulence genes identity threshold + blast_virulence_minid = 90 + +// Virulence genes coverage threshold + blast_virulence_mincov = 90 + +// AMR genes identity threshold + blast_resistance_minid= 90 + +// AMR genes coverage threshold + blast_resistance_mincov = 90 + +// MGEs (ICEs and Phages) identity threshold + blast_MGEs_minid = 85 + +// MGEs (ICEs and Phages) coverage threshold + blast_MGEs_mincov = 85 + +// User's custom database identity threshold + blast_custom_minid = 65 + +// User's custom database coverage threshold + blast_custom_mincov = 65 + + /* + * Resources allocation configuration + * Defaults only, expecting to be overwritten + */ +// Select versions of bioconda quay.io additional tools +// Tools that are not part of the core of the pipeline, +// but can eventually be used by users + unicycler_version = '0.5.0--py310h6cc9453_3' + flye_version = '2.9--py39h6935b12_1' + bakta_version = '1.7.0--pyhdfd78af_1' + +// Max resource options + max_memory = '20.GB' + max_cpus = 16 + max_time = '40.h' + +} diff --git a/docs/index.md b/docs/index.md index 3ee7b0df..4cb28ba9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,7 +2,7 @@ -[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.3627669-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.3627669) +[![F1000 Paper](https://img.shields.io/badge/Citation%20F1000-10.12688/f1000research.139488.1-orange)](https://doi.org/10.12688/f1000research.139488.1) [![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/fmalmeida/bacannot?include_prereleases&label=Latest%20release)](https://github.com/fmalmeida/bacannot/releases) [![Documentation](https://img.shields.io/badge/Documentation-readthedocs-brightgreen)](https://bacannot.readthedocs.io/en/latest/?badge=latest) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.10.3-23aa62.svg?labelColor=000000)](https://www.nextflow.io/) @@ -10,6 +10,7 @@ [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![License](https://img.shields.io/badge/License-GPL%203-black)](https://github.com/fmalmeida/bacannot/blob/master/LICENSE) [![Follow on Twitter](http://img.shields.io/badge/twitter-%40fmarquesalmeida-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/fmarquesalmeida) +[![Zenodo Archive](https://img.shields.io/badge/Zenodo-Archive-blue)](https://doi.org/10.5281/zenodo.3627669) ## About diff --git a/docs/installation.md b/docs/installation.md index 5056467d..2bc4a5fa 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -19,17 +19,30 @@ nextflow pull fmalmeida/bacannot ## Downloading docker images -The custom docker images used by the pipeline are: +> The pipeline uses both custom and public images. +> All images can be downloaded on the fly, automatically by nextflow, and this is the recommended way to do it. + +If you want to download it yourself, you can find all the images used in the pipeline described in the file [docker.config](https://github.com/fmalmeida/bacannot/blob/master/conf/docker.config) (for docker) and [singularity.config](https://github.com/fmalmeida/bacannot/blob/master/conf/singularity.config) (for singularity). 
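To enumerate all pinned images at once, a small sketch is shown below (assuming a local clone of the repository; it only catches the single-quoted custom images, since the biocontainers entries in the same files use double quotes):

```bash
# list the digest-pinned custom images referenced by the docker profile
grep -oE "container = '[^']+'" conf/docker.config | cut -d "'" -f 2 | sort -u
```
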
+ +The images are defined like the following: ```bash -docker pull fmalmeida/bacannot:v3.3_misc ; -docker pull fmalmeida/bacannot:v3.3_renv ; -docker pull fmalmeida/bacannot:jbrowse ; +... +withLabel: 'db_download|db_tools|misc' { + container = 'fmalmeida/bacannot@sha256:bdb31637cacf99736656ab3b69f1f01ba1b5eb026771d5c266b4c84e96153057' +} +... ``` -> The pipeline also uses other public images available in biocontainers. All images can be downloaded on the fly, automatically be nextflow. +And could be downloaded like this: + +```bash +docker pull fmalmeida/bacannot@sha256:bdb31637cacf99736656ab3b69f1f01ba1b5eb026771d5c266b4c84e96153057 +``` -!!! info "Using singularity" +> You would need to do it for each image. + +!!! info "If using singularity" **Docker and singularity images are downloaded on the fly**. Be sure to properly set `NXF_SINGULARITY_LIBRARYDIR` env variable to a writable directory if using Singularity. This will make that the downloaded images are reusable through different executions. Read more at: https://www.nextflow.io/docs/latest/singularity.html#singularity-docker-hub @@ -39,17 +52,56 @@ docker pull fmalmeida/bacannot:jbrowse ; # apply this command to each image # just change the "/" and ":" for "-". # E.g. Image fmalmeida/bacannot:v3.3_misc becomes fmalmeida-bacannot-v3.3_misc.img - singularity pull --dir $NXF_SINGULARITY_LIBRARYDIR fmalmeida-bacannot-v3.3_misc.img docker://fmalmeida/bacannot:v3.3_misc + # for singularity --> prepare env variables + # remember to properly set NXF_SINGULARITY_LIBRARYDIR + # read more at https://www.nextflow.io/docs/latest/singularity.html#singularity-docker-hub + export NXF_SINGULARITY_LIBRARYDIR= # Set a path to your singularity storage dir + export NXF_SINGULARITY_CACHEDIR= # Set a path to your singularity cache dir + export SINGULARITY_CACHEDIR= # Set a path to your singularity cache dir + + singularity pull \ + --dir $NXF_SINGULARITY_LIBRARYDIR \ + fmalmeida-bacannot-v3.3_misc.img docker://fmalmeida/bacannot:v3.3_misc ``` +## Bacannot databases + +Bacannot databases are not inside the docker images anymore to avoid huge images and problems with connections and limit rates with dockerhub. + +### Pre-formatted + +Users can directly download pre-formatted databases from Zenodo: https://doi.org/10.5281/zenodo.7615811 + +Useful for standardization and also overcoming known issues that may arise when formatting databases with `singularity` profile. + +A module to download the latest pre-formatted database has also been made available: + +```bash +# Download pipeline pre-built databases +nextflow run fmalmeida/bacannot \ + --get_zenodo_db \ + --output ./ \ + -profile +``` + +### I want to generate a new formatted database + +```{bash .annotate hl_lines="5"} +# Download pipeline databases +nextflow run fmalmeida/bacannot \ + --get_dbs \ + --output bacannot_dbs \ + -profile +``` + ## Testing your installation After that, you can run the pipeline with a testing dataset by selecting one of the available profiles: 1. Docker - * `nextflow run fmalmeida/mpgap -profile docker,test` + * `nextflow run fmalmeida/mpgap -profile docker,test --bacannot_db ./bacannot_dbs` 2. Singularity - * `nextflow run fmalmeida/mpgap -profile singularity,test` + * `nextflow run fmalmeida/mpgap -profile singularity,test --bacannot_db ./bacannot_dbs` !!! 
note "About NF profiles" diff --git a/docs/manual.md b/docs/manual.md index 95cb1b35..1a2b1df3 100644 --- a/docs/manual.md +++ b/docs/manual.md @@ -29,9 +29,10 @@ The pipeline accepts as input two other input files types that are used to perfo ## Input/output options -|
Parameter | Required | Default | Description | +| Parameter
| Required | Default | Description | | :--------------------------------------- | :------- | :------ | :---------- | | `--input` | :material-check: | NA | Input samplesheet describing all the samples to be analysed | +| `--enable_deduplication` | :material-close: | false | Run deduplication command on input reads before assembly. Only useful for samples where reads are given instead of a genome fasta. | | `--output` | :material-check: | results | Name of directory to store output values. A sub-directory for each genome will be created inside this main directory. | | `--bacannot_db` | :material-check: | NA | Path for root directory containing required bacannot databases | @@ -45,6 +46,7 @@ The pipeline accepts as input two other input files types that are used to perfo | :--------------------------------------- | :------- | :------ | :---------- | | `--get_dbs` | :material-close: | false | Instead of running the analysis workflow, it will try to download required databases and save them in `--output` | | `--force_update` | :material-close: | false | Instead of only downloading missing databases, download everything again and overwrite. | +| `--get_zenodo_db` | :material-close: | false | Download pre-built databases stored in zenodo. [See quickstart](quickstart.md#). !!! tip "" @@ -94,6 +96,8 @@ The use of this parameter sets a default value for input samples. If a sample ha | `--skip_prophage_search` | :material-close: | false | Tells whether not to run prophage annotation modules | | `--skip_kofamscan` | :material-close: | false | Tells whether not to run KEGG orthology (KO) annotation with KofamScan | | `--skip_antismash` | :material-close: | false | Tells whether or not to run antiSMASH (secondary metabolite) annotation. AntiSMASH is executed using only its core annotation modules in order to keep it fast. | +| `--skip_circos` | :material-close: | false | Tells whether or not to run the final `CIRCOS` module. When the input genome has many contigs, its results are not meaningful. | +| `--skip_integron_finder` | :material-close: | false | Tells whether or not to run the integron finder tool. | ## Custom databases diff --git a/docs/quickstart.md b/docs/quickstart.md index 5490726f..42b5ee89 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -102,4 +102,4 @@ nextflow run fmalmeida/bacannot -profile docker,quicktest --bacannot_db ./bacann ### Annotation with bakta -User can also perform the core generic annotation with bakta instead of prokka. Please read [the manual](manual#bakta-annotation). +User can also perform the core generic annotation with bakta instead of prokka. Please read [the manual](manual.md#bakta-annotation). 
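Putting the new options together, a hedged end-to-end example (file names are placeholders; every flag used here is documented in the manual above):

```bash
nextflow run fmalmeida/bacannot \
    -profile docker \
    --input samplesheet.yaml \
    --bacannot_db ./bacannot_dbs \
    --output ./results \
    --enable_deduplication \
    --skip_integron_finder \
    --skip_circos \
    --max_cpus 4 --max_memory '6.GB'
```
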
diff --git a/docs/requirements.txt b/docs/requirements.txt index d5ff7eda..42c51dbf 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -14,4 +14,5 @@ mergedeep>=1.3.4 colorama>=0.4; platform_system == 'Windows' mkdocs-pymdownx-material-extras mkdocs-git-revision-date-plugin -mkdocs-material \ No newline at end of file +mkdocs-material +mkdocs-macros-plugin \ No newline at end of file diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 57d85384..6ccb1c75 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -10,7 +10,7 @@ class WorkflowMain { public static String citation(workflow) { return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + "* The pipeline\n" + - " https://doi.org/10.5281/zenodo.3627669\n\n" + + " https://doi.org/10.12688/f1000research.139488.1\n\n" + "* The nf-core framework\n" + " https://doi.org/10.1038/s41587-020-0439-x\n\n" + "* Software dependencies\n" + diff --git a/markdown/CHANGELOG.md b/markdown/CHANGELOG.md index fcd33376..03654f50 100644 --- a/markdown/CHANGELOG.md +++ b/markdown/CHANGELOG.md @@ -2,12 +2,35 @@ The tracking for changes started in v2.1 -## v3.3 [TBD] +## v3.3.3 [11-March-2024] + +* [[#118](https://github.com/fmalmeida/bacannot/issues/116)] + * Add a parameter to allow user to skip `INTEGRON_FINDER` execution. + * Add a parameter to allow user to skip `CIRCOS` execution. +* Split antismash docker image, and added some snippets to fix some workarounds to allow pipeline to run with singularity + +## v3.3.2 [09-February-2024] + +* [[#116](https://github.com/fmalmeida/bacannot/issues/116)] -- Small update to avoid having `integron_finder` gbks with start position as 0, since it breaks conversion to gff. + +## v3.3.1 [29-October-2023] + +* [[#111](https://github.com/fmalmeida/bacannot/issues/111)] -- Updated `falmeida-py` package version to fix problem with missing key for Summary. + +## v3.3 [01-October-2023] * [[#50](https://github.com/fmalmeida/bacannot/issues/50)] -- Add `Integron Finder` tool to the pipeline -* [#69](https://github.com/fmalmeida/bacannot/issues/69) -- Change how tools use docker images in order to: +* [[#69](https://github.com/fmalmeida/bacannot/issues/69)] -- Change how tools use docker images in order to: * make tools use public bioconda images whenever possible to allow easy addition of tools and avoid much conflicts in docker images * dimish the size and tools inside the docker images, the docker images now are only built to contain tools and all required for modules that cannot just use bioconda docker images. +* [[#81](https://github.com/fmalmeida/bacannot/issues/81)] -- Add `MOB Suite` tool to the pipeline +* [[#85](https://github.com/fmalmeida/bacannot/issues/85)] -- Include checkup on header size for Prokka +* [[#98](https://github.com/fmalmeida/bacannot/issues/98)] -- Add ICEberg and PHAST blastp results to json summary +* [[#100](https://github.com/fmalmeida/bacannot/issues/100)] -- Update pipeline to use docker shasum instead of tags +* [[#107](https://github.com/fmalmeida/bacannot/issues/107)] -- Add a parameter, `--enable_deduplication` for deduplicating input reads before assembly +* Update unicycler docker image to latest '0.5.0--py310h6cc9453_3' to avoid errors originated from previous image containing buggy installation. 
+* Other minor changes / updates highlited in [[#93](https://github.com/fmalmeida/bacannot/pull/93)] + ## v3.2 [19-December-2022] * Fixes https://github.com/fmalmeida/bacannot/issues/68 reported by @lam-c diff --git a/mkdocs.yml b/mkdocs.yml index 58482435..1a5157ba 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -22,6 +22,8 @@ theme: repo: fontawesome/brands/github-alt plugins: - git-revision-date + - search + - macros markdown_extensions: - pymdownx.emoji: emoji_index: !!python/name:materialx.emoji.twemoji diff --git a/modules/MGEs/digIS.nf b/modules/MGEs/digIS.nf index 665abf60..a1ad29aa 100644 --- a/modules/MGEs/digIS.nf +++ b/modules/MGEs/digIS.nf @@ -18,14 +18,8 @@ process DIGIS { script: """ - # activate env - source activate digIS - # run digIS - python3 \$(which digIS_search.py) -i $genome -g $genbank -o digIS - - # deactivate env - conda deactivate + conda run -n digIS python3 \$(which digIS_search.py) -i $genome -g $genbank -o digIS # parse digIS to get nucleotide and aminoacide # also put ids in uppercase diff --git a/modules/MGEs/integron_finder.nf b/modules/MGEs/integron_finder.nf index b06fccdf..1cbfc14e 100644 --- a/modules/MGEs/integron_finder.nf +++ b/modules/MGEs/integron_finder.nf @@ -1,7 +1,7 @@ process INTEGRON_FINDER { - publishDir "${params.output}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" - else "${prefix}/integron_finder/$filename" + else "integron_finder/$filename" } tag "${prefix}" label = [ 'process_medium' ] diff --git a/modules/MGEs/integron_finder_2gff.nf b/modules/MGEs/integron_finder_2gff.nf index 6abaeab3..f314c748 100644 --- a/modules/MGEs/integron_finder_2gff.nf +++ b/modules/MGEs/integron_finder_2gff.nf @@ -12,13 +12,13 @@ process INTEGRON_FINDER_2GFF { script: def args = task.ext.args ?: '' """ + # fix 0-based sequences + sed -e 's/ 0\\.\\./ 1\\.\\./g' -e 's/complement(0\\.\\./complement(1\\.\\./g' $gbk > fixed.gbk + # convert to gff if available - touch ${prefix}_integrons.gff ; - for gbk in \$(ls *.gbk) ; do - conda run -n perl bp_genbank2gff3 \$gbk -o - | \ - grep 'integron_id' | \ - sed 's|ID=.*integron_id=|ID=|g' | \ - sed 's/GenBank/Integron_Finder/g' >> ${prefix}_integrons.gff - done + conda run -n perl bp_genbank2gff3 fixed.gbk -o - | \\ + grep 'integron_id' | \\ + sed 's|ID=.*integron_id=|ID=|g' | \\ + sed 's/GenBank/Integron_Finder/g' >> ${prefix}_integrons.gff """ } diff --git a/modules/MGEs/mob_suite.nf b/modules/MGEs/mob_suite.nf index 14256e92..0ae1c3d3 100644 --- a/modules/MGEs/mob_suite.nf +++ b/modules/MGEs/mob_suite.nf @@ -1,7 +1,7 @@ process MOBSUITE { - publishDir "${params.output}", mode: 'copy', saveAs: { filename -> + publishDir "${params.output}/${prefix}", mode: 'copy', saveAs: { filename -> if (filename.indexOf("_version.txt") > 0) "tools_versioning/$filename" - else "${prefix}/plasmids/mob_suite/$filename" + else "plasmids/mob_suite/$filename" } tag "${prefix}" label = [ 'process_medium' ] diff --git a/modules/assembly/flye.nf b/modules/assembly/flye.nf index 5bbc4668..d588d01e 100644 --- a/modules/assembly/flye.nf +++ b/modules/assembly/flye.nf @@ -18,14 +18,21 @@ process FLYE { script: lr = (lr_type == 'nanopore') ? '--nano-raw' : '--pacbio-raw' + dedup_lr = params.enable_deduplication ? 
+ "gunzip -cf $lreads | awk '{if(NR%4==1) \$0=sprintf(\"@1_%d\",(1+i++)); print;}' | gzip -c > ${prefix}_deduplicated_reads.fastq.gz" : + "ln -s $lreads ${prefix}_deduplicated_reads.fastq.gz" + """ # Save flye version flye -v > flye_version.txt ; + # remove duplicate reads + $dedup_lr + # Run flye flye \\ ${lr} \\ - $lreads \\ + ${prefix}_deduplicated_reads.fastq.gz \\ --out-dir flye_${prefix} \\ --threads $task.cpus &> flye.log ; diff --git a/modules/assembly/unicycler.nf b/modules/assembly/unicycler.nf index 032a3957..145f1c1f 100644 --- a/modules/assembly/unicycler.nf +++ b/modules/assembly/unicycler.nf @@ -17,14 +17,55 @@ process UNICYCLER { path('unicycler_version.txt'), emit: version script: - unpaired_param = (sreads.getName() != "input.3") ? "-s $sreads" : "" - paired_param = (sread1.getName() != "input.1" && sread2.getName() != "input.2") ? "-1 $sread1 -2 $sread2" : "" - lr_param = (lreads.getName() != "input.4") ? "-l $lreads" : "" + unpaired_param = "" + dedup_sreads = "" + paired_param = "" + dedup_paired = "" + lr_param = "" + dedup_lr = "" + + // sreads + if (sreads.getName() != "input.3") { + + dedup_sreads = params.enable_deduplication ? + "gunzip -cf $sreads | awk '{if(NR%4==1) \$0=sprintf(\"@1_%d\",(1+i++)); print;}' | gzip -c > ${prefix}_deduplicated_sreads.fastq.gz" : + "ln -s $sreads ${prefix}_deduplicated_sreads.fastq.gz" + + unpaired_param = "-s ${prefix}_deduplicated_sreads.fastq.gz" + + } + + // paired + if (sread1.getName() != "input.1" && sread2.getName() != "input.2") { + + dedup_paired = params.enable_deduplication ? + "gunzip -cf $sread1 | awk '{if(NR%4==1) \$0=sprintf(\"@1_%d\",(1+i++)); print;}' | gzip -c > ${prefix}_deduplicated_sread_R1.fastq.gz && gunzip -cf $sread2 | awk '{if(NR%4==1) \$0=sprintf(\"@1_%d\",(1+i++)); print;}' | gzip -c > ${prefix}_deduplicated_sread_R2.fastq.gz" : + "ln -s $sread1 ${prefix}_deduplicated_sread_R1.fastq.gz && ln -s $sread2 ${prefix}_deduplicated_sread_R2.fastq.gz" + + paired_param = "-1 ${prefix}_deduplicated_sread_R1.fastq.gz -2 ${prefix}_deduplicated_sread_R2.fastq.gz" + + } + + // lreads + if (lreads.getName() != "input.4") { + + dedup_lr = params.enable_deduplication ? 
+ "gunzip -cf $lreads | awk '{if(NR%4==1) \$0=sprintf(\"@1_%d\",(1+i++)); print;}' | gzip -c > ${prefix}_deduplicated_lreads.fastq.gz" : + "ln -s $lreads ${prefix}_deduplicated_lreads.fastq.gz" + + lr_param = "-l $lreads" + + } """ # Save unicycler version unicycler --version > unicycler_version.txt + # remove duplicate reads + $dedup_sreads + $dedup_paired + $dedup_lr + # Run unicycler unicycler \\ $paired_param \\ diff --git a/modules/bacannot_dbs/antismash.nf b/modules/bacannot_dbs/antismash.nf index 5e9b8962..e554f9ff 100644 --- a/modules/bacannot_dbs/antismash.nf +++ b/modules/bacannot_dbs/antismash.nf @@ -6,9 +6,33 @@ process ANTISMASH_DB { file("*") script: + def antismash_version='6.1.1' + + if (params.running_engine == 'singularity') + """ + mkdir local-install + export PYTHONUSERBASE=./local-install + export PATH=/opt/conda/envs/antismash/bin:\$PATH + + # install locally so it can download dbs + # singularity has many read-write permissions for this tool + wget https://dl.secondarymetabolites.org/releases/${antismash_version}/antismash-${antismash_version}.tar.gz + tar zxvf antismash-${antismash_version}.tar.gz + python -m pip install --user ./antismash-${antismash_version} + export PYTHONPATH=\$(realpath \$( find ./local-install -name 'site-packages' )) + + # now download it + # download antismash database + ./local-install/bin/download-antismash-databases --database-dir ./ + + # delete it + rm -rf ./local-install ./antismash-${antismash_version}* + """ + + else """ # download antismash database export PATH=/opt/conda/envs/antismash/bin:\$PATH - download-antismash-databases --database-dir \$(pwd) + download-antismash-databases --database-dir ./ """ } diff --git a/modules/generic/antismash.nf b/modules/generic/antismash.nf index 207c12d4..b56df0a5 100644 --- a/modules/generic/antismash.nf +++ b/modules/generic/antismash.nf @@ -6,6 +6,8 @@ process ANTISMASH { tag "${prefix}" label = [ 'misc', 'process_medium' ] + // if (params.running_engine = 'singularity') { runOptions = '--writable-tmpfs -e --no-home -B $PWD' } + input: tuple val(prefix), file(genbank) file(bacannot_db) @@ -18,10 +20,69 @@ process ANTISMASH { script: def gbk_suffix = (params.bakta_db) ? 
"gbff" : "gbk" def gbk_prefix = "${genbank.baseName}" - "${gbk_suffix}" - """ - # Activate env + def antismash_version='6.1.1' + + if (params.running_engine == 'singularity') + """ + # Get tool version + antismash --version > antismash_version.txt ; + + # activate env + mkdir local-install + export PYTHONUSERBASE=./local-install export PATH=/opt/conda/envs/antismash/bin:\$PATH + + # singularity has many read-write permissions for this tool + wget https://dl.secondarymetabolites.org/releases/${antismash_version}/antismash-${antismash_version}.tar.gz + tar zxvf antismash-${antismash_version}.tar.gz + python -m pip install --user ./antismash-${antismash_version} + export PYTHONPATH=\$(realpath \$( find ./local-install -name 'site-packages' )) + + # Run tool + ./local-install/bin/antismash \\ + --output-dir antiSMASH \\ + --genefinding-tool none \\ + --databases ${bacannot_db}/antismash_db \\ + -c $task.cpus \\ + $genbank ; + + # enter results dir + cd antiSMASH ; + + # produce gff from main results + seqret \\ + -sequence ${gbk_prefix}.gbk \\ + -feature \\ + -fformat genbank \\ + -fopenfile ${gbk_prefix}.gbk \\ + -osformat gff \\ + -osname_outseq ${gbk_prefix} \\ + -auto ; + + # get the locus tags annotated as list + # only when results exist + if ls *region*gbk 1> /dev/null 2>&1; then + + grep \\ + "locus_tag" \\ + *region*gbk | \\ + cut \\ + -f 2 \\ + -d "=" | \\ + tr -d '"' | \\ + sort -u > gene_ids.lst ; + + # subset regions GFF from main GFF for JBrowse + grep \\ + -w \\ + -f gene_ids.lst \\ + ${gbk_prefix}.gff > regions.gff ; + fi + """ + + else + """ # Get tool version antismash --version > antismash_version.txt ; @@ -29,10 +90,10 @@ process ANTISMASH { antismash \\ --output-dir antiSMASH \\ --genefinding-tool none \\ - -c $task.cpus \\ --databases ${bacannot_db}/antismash_db \\ + -c $task.cpus \\ $genbank ; - + # enter results dir cd antiSMASH ; @@ -67,4 +128,5 @@ process ANTISMASH { fi """ + } diff --git a/modules/resistance/resfinder.nf b/modules/resistance/resfinder.nf index 9b7105e9..017efffa 100644 --- a/modules/resistance/resfinder.nf +++ b/modules/resistance/resfinder.nf @@ -24,6 +24,7 @@ process RESFINDER { """ # activate env source activate resfinder + export PATH=/opt/conda/envs/resfinder/lib/python3.12/site-packages/resfinder/:\$PATH # Run resfinder acquired resistance run_resfinder.py \\ diff --git a/nextflow.config b/nextflow.config index 1afb0454..b0654f31 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,7 +13,6 @@ includeConfig 'conf/defaults.config' params { // Boilerplate options - tracedir = "${params.output}/pipeline_info" plaintext_email = false monochrome_logs = false help = false @@ -23,7 +22,7 @@ params { get_samplesheet = false validate_params = true show_hidden_params = false - schema_ignore_params = 'enable_conda,monochrome_logs,plaintext_email' + schema_ignore_params = 'enable_conda,monochrome_logs,plaintext_email,running_engine' enable_conda = false monochrome_logs = false @@ -84,19 +83,19 @@ process.shell = ['/bin/bash', '-euo', 'pipefail'] def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true - file = "${params.tracedir}/bacannot_timeline_${trace_timestamp}.html" + file = "${params.output}/pipeline_info/bacannot_timeline_${trace_timestamp}.html" } report { enabled = true - file = "${params.tracedir}/bacannot_report_${trace_timestamp}.html" + file = "${params.output}/pipeline_info/bacannot_report_${trace_timestamp}.html" } trace { enabled = true - file = 
"${params.tracedir}/bacannot_trace_${trace_timestamp}.txt" + file = "${params.output}/pipeline_info/bacannot_trace_${trace_timestamp}.txt" } dag { enabled = true - file = "${params.tracedir}/bacannot_pipeline_dag_${trace_timestamp}.svg" + file = "${params.output}/pipeline_info/bacannot_pipeline_dag_${trace_timestamp}.svg" } /* @@ -109,7 +108,7 @@ manifest { homePage = "https://github.com/fmalmeida/bacannot" mainScript = "main.nf" nextflowVersion = "!>=22.10.1" - version = '3.3' + version = '3.3.3' } // Function to ensure that resource requirements don't go beyond diff --git a/nextflow_schema.json b/nextflow_schema.json index 287901ca..416ad42b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -34,6 +34,10 @@ "type": "string", "description": "Path to input samplesheet" }, + "enable_deduplication": { + "type": "boolean", + "description": "Execute deduplication on reads before assembly." + }, "output": { "type": "string", "description": "Path for output directory", @@ -59,15 +63,18 @@ "properties": { "max_cpus": { "type": "integer", - "default": 16 + "default": 16, + "description": "Maximum number of cpus a single module can use." }, "max_memory": { "type": "string", - "default": "20.GB" + "default": "20.GB", + "description": "Maximum memory a single module can use." }, "max_time": { "type": "string", - "default": "40.h" + "default": "40.h", + "description": "Maximum time a module can run." } } }, @@ -146,6 +153,12 @@ "help_text": "If true, the process will be skipped!", "hidden": true }, + "skip_circos": { + "type": "boolean", + "description": "Skip (do not run) circos?", + "help_text": "If true, the process will be skipped!", + "hidden": true + }, "skip_plasmid_search": { "type": "boolean", "description": "Skip (do not run) plasmidfinder?", @@ -170,6 +183,12 @@ "help_text": "If true, the process will be skipped!", "hidden": true }, + "skip_integron_finder": { + "type": "boolean", + "description": "Skip (do not run) integron finder?", + "help_text": "If true, the process will be skipped!", + "hidden": true + }, "skip_prophage_search": { "type": "boolean", "description": "Skip (do not run) prophage annotation?", @@ -181,6 +200,12 @@ "description": "Skip (do not run) KO annotation?", "help_text": "If true, the process will be skipped!", "hidden": true + }, + "skip_sourmash": { + "type": "boolean", + "description": "Skip (do not run) sourmash tool?", + "help_text": "If true, the process will be skipped!", + "hidden": true } }, "fa_icon": "fas fa-tasks" @@ -339,13 +364,6 @@ "help_text": "Number of minimum overlapping base pairs required for merging\nNegative values, such as -20, means the number of required overlapping bases for merging.\nPositive values, such as 5, means the maximum distance accepted between features for merging.\nBy default (if Blank), this process is not executed. For execution the user needs to provide a value", "description": "Minimum overlapping base pairs required for merging" }, - "tracedir": { - "type": "string", - "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "results/pipeline_info", - "fa_icon": "fas fa-cogs", - "hidden": true - }, "validate_params": { "type": "boolean", "description": "Boolean whether to validate parameters against the schema at runtime", @@ -363,9 +381,7 @@ "unicycler_version": { "type": "string", "description": "Select quay.io image tag for tool", - "default": "0.4.8--py38h8162308_3", - "hidden": true, - "help_text": "Select a different version of tool based on the bioconda containers tag." 
+ "default": "0.5.0--py310h6cc9453_3" }, "flye_version": { "type": "string", diff --git a/workflows/bacannot.nf b/workflows/bacannot.nf index ffeac379..039bc6c9 100644 --- a/workflows/bacannot.nf +++ b/workflows/bacannot.nf @@ -143,8 +143,13 @@ workflow BACANNOT { ISLANDPATH( annotation_out_ch.gbk ) // Integron_finder software - INTEGRON_FINDER( annotation_out_ch.genome ) - INTEGRON_FINDER_2GFF( INTEGRON_FINDER.out.gbk ) + if (!params.skip_integron_finder) { + INTEGRON_FINDER( annotation_out_ch.genome ) + INTEGRON_FINDER_2GFF( INTEGRON_FINDER.out.gbk ) + ch_integron_finder_gff = INTEGRON_FINDER_2GFF.out.gff + } else { + ch_integron_finder_gff = Channel.empty() + } // Virulence search if (params.skip_virulence_search == false) { @@ -327,7 +332,7 @@ workflow BACANNOT { .join(phast_output_ch, remainder: true) .join(DIGIS.out.gff, remainder: true) .join(ch_custom_annotations, remainder: true) - .join(INTEGRON_FINDER_2GFF.out.gff, remainder: true) + .join(ch_integron_finder_gff, remainder: true) ) /* @@ -367,7 +372,7 @@ workflow BACANNOT { .join( MERGE_ANNOTATIONS.out.digis_gff ) .join( antismash_output_ch, remainder: true ) .join( MERGE_ANNOTATIONS.out.customdb_gff.groupTuple(), remainder: true ) - .join( INTEGRON_FINDER_2GFF.out.gff, remainder: true ) + .join( ch_integron_finder_gff, remainder: true ) ) // Render reports @@ -403,7 +408,7 @@ workflow BACANNOT { .join( DRAW_GIS.out.example, remainder: true ) .join( phast_output_ch, remainder: true ) .join( MERGE_ANNOTATIONS.out.digis_gff ) - .join( INTEGRON_FINDER_2GFF.out.gff, remainder: true ) + .join( ch_integron_finder_gff, remainder: true ) ) // @@ -432,7 +437,7 @@ workflow BACANNOT { .join( DIGIS.out.all , remainder: true ) .join( antismash_all_ch , remainder: true ) .join( MERGE_ANNOTATIONS.out.all , remainder: true ) - .join( INTEGRON_FINDER_2GFF.out.gff, remainder: true ) + .join( ch_integron_finder_gff , remainder: true ) .join( mobsuite_output_ch , remainder: true ) ) MERGE_SUMMARIES( @@ -440,19 +445,19 @@ workflow BACANNOT { ) // Render circos plots - circos_input_ch = - annotation_out_ch.genome - .join( annotation_out_ch.gff , remainder: true ) - .join( MERGE_ANNOTATIONS.out.gff, remainder: true ) - .join( PHISPY.out.gff , remainder: true ) - .map{ - it -> - sample = it[0] - it.remove(0) - [ sample, it ] - } - CIRCOS( - circos_input_ch - ) + if (!params.skip_circos) { + circos_input_ch = + annotation_out_ch.genome + .join( annotation_out_ch.gff , remainder: true ) + .join( MERGE_ANNOTATIONS.out.gff, remainder: true ) + .join( PHISPY.out.gff , remainder: true ) + .map{ + it -> + sample = it[0] + it.remove(0) + [ sample, it ] + } + CIRCOS( circos_input_ch ) + } }