From c003c6e77a4de766482a6197f7817ad78251fd29 Mon Sep 17 00:00:00 2001 From: ekushele Date: Wed, 25 Mar 2020 11:40:30 +0200 Subject: [PATCH 01/56] Added aligner-biscuit, with all relevant steps and parameters and update multiqc accordingly, added picard CollectInsertSizeMetrics and CollectGcBiasMetrics for all aligners, added Samtools sort for bismark aligner. The fasta is assumed to contain the assembly name, not just genome.fa. DOCKERFILE, enviroment.yml, parameters.settings.json, bin/scrape_software_versions.py, conf/base.config, README.md were changed accordingly. Bin/biscuit_QC.sh, bin/setup.sh were added for biscuit_QC step --- Dockerfile | 12 + README.md | 35 +- bin/biscuit_QC.sh | 531 +++++++++++++++++++ bin/scrape_software_versions.py | 17 +- bin/setup.sh | 39 ++ conf/base.config | 93 +++- environment.yml | 13 +- main.nf | 910 +++++++++++++++++++++++++------- parameters.settings.json | 86 ++- 9 files changed, 1506 insertions(+), 230 deletions(-) create mode 100755 bin/biscuit_QC.sh create mode 100755 bin/setup.sh diff --git a/Dockerfile b/Dockerfile index 0fd6f898..b1acaf45 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,18 @@ FROM nfcore/base:1.7 LABEL authors="Phil Ewels" \ description="Docker image containing all software requirements for the nf-core/methylseq pipeline" +RUN apt-get update -y && \ + apt-get install -y --no-install-recommends apt-utils && \ + apt-get install zlib1g-dev -y && \ + apt-get install libbz2-dev -y && \ + apt-get install liblzma-dev -y && \ + apt-get install libncurses5-dev -y && \ + apt-get install curl -y + +RUN cd / && \ + curl -OL $(curl -s https://api.github.com/repos/zwdzwd/biscuit/releases/latest | grep browser_download_url | grep linux_amd64 | cut -d '"' -f 4) && \ + chmod 755 biscuit*linux_amd64 && \ + mv biscuit*linux_amd64 biscuit COPY environment.yml / RUN conda env create -f /environment.yml && conda clean -a diff --git a/README.md b/README.md index 69a57be8..530063f5 100644 --- a/README.md +++ b/README.md @@ 
-14,23 +14,24 @@ The pipeline uses [Nextflow](https://www.nextflow.io), a bioinformatics workflow The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker containers making installation trivial and results highly reproducible. The pipeline allows you to choose between running either [Bismark](https://github.com/FelixKrueger/Bismark) or [bwa-meth](https://github.com/brentp/bwa-meth) / [MethylDackel](https://github.com/dpryan79/methyldackel). -Choose between workflows by using `--aligner bismark` (default, uses bowtie2 for alignment), `--aligner bismark_hisat` or `--aligner bwameth`. - -| Step | Bismark workflow | bwa-meth workflow | -|----------------------------------------------|------------------|-----------------------| -| Generate Reference Genome Index _(optional)_ | Bismark | bwa-meth | -| Raw data QC | FastQC | FastQC | -| Adapter sequence trimming | Trim Galore! | Trim Galore! | -| Align Reads | Bismark | bwa-meth | -| Deduplicate Alignments | Bismark | Picard MarkDuplicates | -| Extract methylation calls | Bismark | MethylDackel | -| Sample report | Bismark | - | -| Summary Report | Bismark | - | -| Alignment QC | Qualimap | Qualimap | -| Sample complexity | Preseq | Preseq | -| Project Report | MultiQC | MultiQC | - -## Quick Start +Choose between workflows by using `--aligner bismark` (default, uses bowtie2 for alignment), `--aligner bismark_hisat`, `--aligner bwameth` or `--aligner biscuit`. + +| Step | Bismark workflow | bwa-meth workflow | biscuit | +|----------------------------------------------|------------------|-----------------------|-------------------| +| Generate Reference Genome Index _(optional)_ | Bismark | bwa-meth | biscuit | +| Raw data QC | FastQC | FastQC | FastQC | +| Adapter sequence trimming | Trim Galore! | Trim Galore! | Trim Galore!
| +| Align Reads | Bismark | bwa-meth | biscuit | +| Deduplicate Alignments | Bismark | Picard MarkDuplicates | samblaster | +| Extract methylation calls | Bismark | MethylDackel | biscuit | +| Sample report | Bismark | - | biscuit QC | +| Summary Report | Bismark | - | - | +| Picard Metrics | Picard | Picard | Picard | +| Alignment QC | Qualimap | Qualimap | Qualimap | +| Sample complexity | Preseq | Preseq | Preseq | +| Project Report | MultiQC | MultiQC | MultiQC | + +## Quick Start i. Install [`nextflow`](https://nf-co.re/usage/installation) diff --git a/bin/biscuit_QC.sh b/bin/biscuit_QC.sh new file mode 100755 index 00000000..40b55bce --- /dev/null +++ b/bin/biscuit_QC.sh @@ -0,0 +1,531 @@ +#!/usr/bin/env bash +## make sure the following is in PATH +## biscuit samtools, bedtools, awk + +# Use python's argparse module in shell scripts +# +# The function `argparse` parses its arguments using +# argparse.ArgumentParser; the parser is defined in the function's +# stdin. +# +# Executing ``argparse.bash`` (as opposed to sourcing it) prints a +# script template. +# +# https://github.com/nhoffman/argparse-bash +# MIT License - Copyright (c) 2015 Noah Hoffman + +argparse(){ + argparser=$(mktemp 2>/dev/null || mktemp -t argparser) + cat > "$argparser" <> "$argparser" + + cat >> "$argparser" < /dev/null; then + eval $(python "$argparser" "$@") + retval=0 + else + python "$argparser" "$@" + retval=1 + fi + + rm "$argparser" + return $retval +} + +#!/usr/bin/env bash +################################################################################ +## +## Quality Control script for BISCUIT output +## +## Output from this script can be fed into MultiQC to produce a nice HTML output +## showing the different BISCUIT QC metrics +## +## Notes: +## 1.) 
biscuit, samtools, bedtools, and awk all must be in PATH for script to +## work +## +## Created by: +## Wanding Zhou +## +## Creation date: +## May 2019 +## +## Update notes: +## Dec 2019 - +## - Clean up code to make more readable +## - Catch empty files, alert user, and remove files +## +################################################################################ + +# Check for biscuit, samtools, bedtools, awk in PATH +function check_path { + if [[ `which biscuit 2>&1 > /dev/null` ]]; then + echo "biscuit does not exist in PATH" + exit 1 + fi + if [[ `which samtools 2>&1 > /dev/null` ]]; then + echo "samtools does not exist in PATH" + exit 1 + fi + if [[ `which bedtools 2>&1 > /dev/null` ]]; then + echo "bedtools does not exist in PATH" + exit 1 + fi + if [[ `which awk 2>&1 > /dev/null` ]]; then + echo "awk does not exist in PATH" + exit 1 + fi + if [[ `which python 2>&1 > /dev/null` ]]; then + echo "python does not exist in PATH" + exit 1 + fi +} + +# Check that certain variables have been set and files exist +#TODO: Change "" to "NULL"/"NA"/something similar +#TODO: Also change in BISCUIT QC setup files +function check_variables { + VARS=" + BISCUIT_CPGBED + BISCUIT_CGIBED + BISCUIT_RMSK + BISCUIT_EXON + BISCUIT_GENE + BISCUIT_TOPGC_BED + BISCUIT_BOTGC_BED + input_bam + input_vcf + " + + for var in $VARS; do + if [[ ${!var} != "" && ! 
-f ${!var} ]]; then + >&2 echo "$var: ${!var} does not exist" + exit 1 + fi + done +} + +# Check if QC files have at least some information +function basic_check_output_filled { + prepend_path=$QCdir/${sname} + TWO_LINE_FILES=" + ${prepend_path}_all_cv_table.txt + ${prepend_path}_covdist_cpg_q40_botgc_table.txt + ${prepend_path}_covdist_cpg_q40_table.txt + ${prepend_path}_covdist_cpg_q40_topgc_table.txt + ${prepend_path}_covdist_cpg_table.txt + ${prepend_path}_covdist_q40_botgc_table.txt + ${prepend_path}_covdist_q40_table.txt + ${prepend_path}_covdist_q40_topgc_table.txt + ${prepend_path}_covdist_table.txt + ${prepend_path}_cpg_cv_table.txt + ${prepend_path}_cpg_dist_table.txt + ${prepend_path}_CpGRetentionByReadPos.txt + ${prepend_path}_CpGRetentionDist.txt + ${prepend_path}_CpHRetentionByReadPos.txt + ${prepend_path}_freqOfTotalRetentionPerRead.txt + ${prepend_path}_isize_score_table.txt + ${prepend_path}_mapq_table.txt + ${prepend_path}_totalBaseConversionRate.txt + ${prepend_path}_totalReadConversionRate.txt + " + ONE_LINE_FILES=" + ${prepend_path}_dup_report.txt + ${prepend_path}_strand_table.txt + " + + echo "Running basic check on BISCUIT QC output" + echo "Will remove any files that were obviously not filled properly" + echo "This avoids clashes when running MultiQC" + + # All files that have a description line, then a table header line + for FILE in ${TWO_LINE_FILES}; do + if [[ ! -f "${FILE}" ]]; then + >&2 echo "--- ${FILE} --- was not initially created. Skipping!" + continue + fi + if [[ `wc -l ${FILE} | awk '{print $1}'` -lt 3 ]]; then + >&2 echo "--- ${FILE} --- has no entries. Check related files!" + >&2 echo "Deleting --- ${FILE} --- since there are no entries to help with debugging." + rm -f ${FILE} + fi + done + + # Files with only a description line + for FILE in ${ONE_LINE_FILES}; do + if [[ ! -f "${FILE}" ]]; then + >&2 echo "--- ${FILE} --- was not initially created. Skipping!"
+ continue + fi + if [[ `wc -l ${FILE} | awk '{print $1}'` -lt 2 ]]; then + >&2 echo "--- ${FILE} --- has no entries. Check related files!" + >&2 echo "Deleting --- ${FILE} --- since there are no entries to help with debugging." + rm -f ${FILE} + fi + done +} + +function biscuitQC { + + # Simple check for necessary command line tools + check_path + + # Check variables and their associated files exist + check_variables + + # Create $QCdir if it does not exist + if [ ! -d $QCdir ]; then + mkdir -p $QCdir + fi + + echo "Running BISCUIT QC" + set -xe pipefail + ########################## + ## base coverage + ########################## + if [[ "$BISCUIT_QC_BASECOV" == true ]]; then + >&2 echo "`date`---- BISCUIT_QC_BASECOV ----" + # bedtools genomecov -bga -split -ibam $input_bam -g ${BISCUIT_REFERENCE}.fai | bedtools sort >$QCdir/${sname}_bga.bed + # samtools view -q 40 -b $input_bam | bedtools genomecov -ibam stdin -g ${BISCUIT_REFERENCE}.fai -bga -split | bedtools sort >$QCdir/${sname}_bga_q40.bed + bedtools genomecov -bga -split -ibam $input_bam -g ${BISCUIT_REFERENCE}.fai | LC_ALL=C sort --parallel=$processes -k1,1 -k2,2n -T $QCdir >$QCdir/${sname}_bga.bed + samtools view -q 40 -b $input_bam | bedtools genomecov -ibam stdin -g ${BISCUIT_REFERENCE}.fai -bga -split | LC_ALL=C sort --parallel=$processes -k1,1 -k2,2n -T $QCdir >$QCdir/${sname}_bga_q40.bed + + echo -e "BISCUITqc Depth Distribution (All)" >$QCdir/${sname}_covdist_table.txt + echo -e "depth\tcount" >>$QCdir/${sname}_covdist_table.txt + awk '{cnt[$4]+=$3-$2}END{for(cov in cnt) {print int(cov)"\t"int(cnt[cov]);}}' $QCdir/${sname}_bga.bed | sort -k1,1n -T $QCdir >>$QCdir/${sname}_covdist_table.txt + + echo -e "BISCUITqc Depth Distribution (Q40)" >$QCdir/${sname}_covdist_q40_table.txt + echo -e "depth\tcount" >>$QCdir/${sname}_covdist_q40_table.txt + awk '{cnt[$4]+=$3-$2}END{for(cov in cnt) {print int(cov)"\t"int(cnt[cov]);}}' $QCdir/${sname}_bga_q40.bed | sort -k1,1n -T $QCdir 
>>$QCdir/${sname}_covdist_q40_table.txt + fi + + ########################## + ## duplicate_coverage + ########################## + [[ ! -f "$QCdir/${sname}_bga.bed" ]] && BISCUIT_QC_DUPLICATE=false + [[ ! -f "$QCdir/${sname}_bga_q40.bed" ]] && BISCUIT_QC_DUPLICATE=false + if [[ "$BISCUIT_QC_DUPLICATE" == true ]]; then + >&2 echo "`date`---- BISCUIT_QC_DUPLICATE ----" + # duplicate + #samtools view -f 0x400 -b $input_bam | bedtools genomecov -ibam stdin -g $BISCUIT_REFERENCE.fai -bga -split | bedtools sort >$QCdir/${sname}_bga_dup.bed + samtools view -f 0x400 -b $input_bam | bedtools genomecov -ibam stdin -g $BISCUIT_REFERENCE.fai -bga -split | LC_ALL=C sort --parallel=$processes -k1,1 -k2,2n -T $QCdir >$QCdir/${sname}_bga_dup.bed + + # duplication rate + echo -e "BISCUITqc Read Duplication Table" >$QCdir/${sname}_dup_report.txt + echo -ne "#bases covered by all reads: " >>$QCdir/${sname}_dup_report.txt + awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' $QCdir/${sname}_bga.bed >>$QCdir/${sname}_dup_report.txt + echo -ne "#bases covered by duplicate reads: " >>$QCdir/${sname}_dup_report.txt + awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' $QCdir/${sname}_bga_dup.bed >>$QCdir/${sname}_dup_report.txt + + if [[ -f "$BISCUIT_TOPGC_BED" && -f "$BISCUIT_BOTGC_BED" ]]; then + # high GC content + echo -ne "#high-GC bases covered by all reads: " >>$QCdir/${sname}_dup_report.txt + bedtools intersect -a $QCdir/${sname}_bga.bed -b $BISCUIT_TOPGC_BED -sorted | awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt + echo -ne "#high-GC bases covered by duplicate reads: " >>$QCdir/${sname}_dup_report.txt + bedtools intersect -a $QCdir/${sname}_bga_dup.bed -b $BISCUIT_TOPGC_BED -sorted | awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt + + # low GC content + echo -ne "#low-GC bases covered by all reads: " >>$QCdir/${sname}_dup_report.txt + bedtools intersect -a $QCdir/${sname}_bga.bed -b $BISCUIT_BOTGC_BED -sorted | awk
'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt + echo -ne "#low-GC bases covered by duplicate reads: " >>$QCdir/${sname}_dup_report.txt + bedtools intersect -a $QCdir/${sname}_bga_dup.bed -b $BISCUIT_BOTGC_BED -sorted | awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt + fi + + ## Q40 + # duplicate + # samtools view -f 0x400 -q 40 -b $input_bam | bedtools genomecov -ibam stdin -g $BISCUIT_REFERENCE.fai -bga -split | bedtools sort >$QCdir/${sname}_bga_dup_q40.bed + samtools view -f 0x400 -q 40 -b $input_bam | bedtools genomecov -ibam stdin -g $BISCUIT_REFERENCE.fai -bga -split | LC_ALL=C sort --parallel=$processes -k1,1 -k2,2n -T $QCdir >$QCdir/${sname}_bga_dup_q40.bed + + # duplication rate + echo -ne "#bases covered by all q40-reads: " >>$QCdir/${sname}_dup_report.txt + awk '$4>0{a+=$3-$2}END{print a}' $QCdir/${sname}_bga_q40.bed >>$QCdir/${sname}_dup_report.txt + echo -ne "#bases covered by duplicate q40-reads: " >>$QCdir/${sname}_dup_report.txt + awk '$4>0{a+=$3-$2}END{print a}' $QCdir/${sname}_bga_dup_q40.bed >>$QCdir/${sname}_dup_report.txt + + if [[ -f "$BISCUIT_TOPGC_BED" && -f "$BISCUIT_BOTGC_BED" ]]; then + # high GC content + echo -ne "#high-GC bases covered by all q40-reads: " >>$QCdir/${sname}_dup_report.txt + bedtools intersect -a $QCdir/${sname}_bga_q40.bed -b $BISCUIT_TOPGC_BED -sorted | awk '$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt + echo -ne "#high-GC bases covered by duplicate q40-reads: " >>$QCdir/${sname}_dup_report.txt + bedtools intersect -a $QCdir/${sname}_bga_dup_q40.bed -b $BISCUIT_TOPGC_BED -sorted | awk '$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt + + # low GC content + echo -ne "#low-GC bases covered by all q40-reads: " >>$QCdir/${sname}_dup_report.txt + bedtools intersect -a $QCdir/${sname}_bga_q40.bed -b $BISCUIT_BOTGC_BED -sorted | awk '$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt + echo -ne "#low-GC bases covered by duplicate 
q40-reads: " >>$QCdir/${sname}_dup_report.txt + bedtools intersect -a $QCdir/${sname}_bga_dup_q40.bed -b $BISCUIT_BOTGC_BED -sorted | awk '$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt + fi + fi + + ########################## + ## cpg coverage + ########################## + + [[ ! -f "$BISCUIT_CPGBED" ]] && BISCUIT_QC_CPGCOV=false + [[ ! -f "$QCdir/${sname}_bga.bed" ]] && BISCUIT_QC_CPGCOV=false + [[ ! -f "$QCdir/${sname}_bga_q40.bed" ]] && BISCUIT_QC_CPGCOV=false + if [[ "$BISCUIT_QC_CPGCOV" == true ]]; then + >&2 echo "`date`---- BISCUIT_QC_CPGCOV ----" + bedtools intersect -a $BISCUIT_CPGBED -b $QCdir/${sname}_bga.bed -wo -sorted | bedtools groupby -g 1-3 -c 7 -o min >$QCdir/${sname}_cpg.bed + bedtools intersect -a $BISCUIT_CPGBED -b $QCdir/${sname}_bga_q40.bed -wo -sorted | bedtools groupby -g 1-3 -c 7 -o min >$QCdir/${sname}_cpg_q40.bed + + echo -e "BISCUITqc CpG Depth Distribution (All)" >$QCdir/${sname}_covdist_cpg_table.txt + echo -e "depth\tcount" >>$QCdir/${sname}_covdist_cpg_table.txt + awk '{cnt[$4]+=1}END{for(cov in cnt) {print int(cov)"\t"int(cnt[cov]);}}' $QCdir/${sname}_cpg.bed | sort -k1,1n >>$QCdir/${sname}_covdist_cpg_table.txt + + echo -e "BISCUITqc CpG Depth Distribution (Q40)" >$QCdir/${sname}_covdist_cpg_q40_table.txt + echo -e "depth\tcount" >>$QCdir/${sname}_covdist_cpg_q40_table.txt + awk '{cnt[$4]+=1}END{for(cov in cnt) {print int(cov)"\t"int(cnt[cov]);}}' $QCdir/${sname}_cpg_q40.bed | sort -k1,1n >>$QCdir/${sname}_covdist_cpg_q40_table.txt + fi + + ########################## + ## cpg distribution + ########################## + + [[ ! -f "$QCdir/${sname}_cpg_q40.bed" ]] && BISCUIT_QC_CPGDIST=false + [[ ! -f "$QCdir/${sname}_cpg.bed" ]] && BISCUIT_QC_CPGDIST=false + [[ ! -f "$BISCUIT_EXON" ]] && BISCUIT_QC_CPGDIST=false + [[ ! -f "$BISCUIT_RMSK" ]] && BISCUIT_QC_CPGDIST=false + [[ ! -f "$BISCUIT_GENE" ]] && BISCUIT_QC_CPGDIST=false + [[ ! 
-f "$BISCUIT_CGIBED" ]] && BISCUIT_QC_CPGDIST=false + if [[ "$BISCUIT_QC_CPGDIST" == true ]]; then + >&2 echo "`date`---- BISCUIT_QC_CPGDIST ----" + # whole genome + echo -e "BISCUITqc CpG Distribution Table" >$QCdir/${sname}_cpg_dist_table.txt + wc -l $QCdir/${sname}_cpg_q40.bed | awk -F" " '{printf("Territory\tAll\tUniqCov\tAllCov\nTotalCpGs\t%s",$1)}' >>$QCdir/${sname}_cpg_dist_table.txt + awk '$4>0{a+=1}END{printf("\t%d",a)}' $QCdir/${sname}_cpg_q40.bed >>$QCdir/${sname}_cpg_dist_table.txt + awk '$4>0{a+=1}END{printf("\t%d\n",a)}' $QCdir/${sname}_cpg.bed >>$QCdir/${sname}_cpg_dist_table.txt + + # exon + bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_EXON) -sorted | wc -l | awk -F" " '{printf("ExonicCpGs\t%s",$1)}' >>$QCdir/${sname}_cpg_dist_table.txt + bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_EXON) -sorted | awk '$4>0{a+=1}END{printf("\t%d",a)}' >>$QCdir/${sname}_cpg_dist_table.txt + bedtools intersect -a $QCdir/${sname}_cpg.bed -b <(bedtools merge -i $BISCUIT_EXON) -sorted | awk '$4>0{a+=1}END{printf("\t%d\n",a)}' >>$QCdir/${sname}_cpg_dist_table.txt + + # repeat + bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_RMSK) -sorted | wc -l | awk -F" " '{printf("RepeatCpGs\t%s",$1)}' >>$QCdir/${sname}_cpg_dist_table.txt + bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_RMSK) -sorted | awk '$4>0{a+=1}END{printf("\t%d",a)}' >>$QCdir/${sname}_cpg_dist_table.txt + bedtools intersect -a $QCdir/${sname}_cpg.bed -b <(bedtools merge -i $BISCUIT_RMSK) -sorted | awk '$4>0{a+=1}END{printf("\t%d\n",a)}' >>$QCdir/${sname}_cpg_dist_table.txt + + # gene + bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_GENE) -sorted | wc -l | awk -F" " '{printf("GenicCpGs\t%s",$1)}' >>$QCdir/${sname}_cpg_dist_table.txt + bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_GENE) -sorted | awk 
'$4>0{a+=1}END{printf("\t%d",a)}' >>$QCdir/${sname}_cpg_dist_table.txt + bedtools intersect -a $QCdir/${sname}_cpg.bed -b <(bedtools merge -i $BISCUIT_GENE) -sorted | awk '$4>0{a+=1}END{printf("\t%d\n",a)}' >>$QCdir/${sname}_cpg_dist_table.txt + + # CGI + bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_CGIBED) -sorted | wc -l | awk -F" " '{printf("CGICpGs\t%s",$1)}' >>$QCdir/${sname}_cpg_dist_table.txt + bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_CGIBED) -sorted | awk '$4>0{a+=1}END{printf("\t%d",a)}' >>$QCdir/${sname}_cpg_dist_table.txt + bedtools intersect -a $QCdir/${sname}_cpg.bed -b <(bedtools merge -i $BISCUIT_CGIBED) -sorted | awk '$4>0{a+=1}END{printf("\t%d\n",a)}' >>$QCdir/${sname}_cpg_dist_table.txt + + >&2 echo "`date`---- BISCUIT_QC_CGICOV ----" + # how CGI is covered by at least one q40-read in at least one CpG + echo >>$QCdir/${sname}_cpg_dist_table.txt + echo -ne "#CpG Islands\t" >>$QCdir/${sname}_cpg_dist_table.txt + zcat $BISCUIT_CGIBED | wc -l >>$QCdir/${sname}_cpg_dist_table.txt + bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_CGIBED) -sorted -wo | awk '$4>0{print $5":"$6"-"$7}' | uniq -c | awk -F" " '{print $2"\t"$1}' >> $QCdir/${sname}_cpg_dist_table_temp.txt + echo -ne "#CpG Islands covered by at least one q40-read in at least one CpG\t" >>$QCdir/${sname}_cpg_dist_table.txt + less $QCdir/${sname}_cpg_dist_table_temp.txt | wc -l >>$QCdir/${sname}_cpg_dist_table.txt + echo -ne "#CpG Islands covered by at least one q40-read in at least three CpGs\t" >>$QCdir/${sname}_cpg_dist_table.txt + awk -F" " '$2>=3' $QCdir/${sname}_cpg_dist_table_temp.txt | wc -l >>$QCdir/${sname}_cpg_dist_table.txt + echo -ne "#CpG Islands covered by at least one q40-read in at least five CpGs\t" >>$QCdir/${sname}_cpg_dist_table.txt + awk -F" " '$2>=5' $QCdir/${sname}_cpg_dist_table_temp.txt | wc -l >>$QCdir/${sname}_cpg_dist_table.txt + echo -ne "#CpG Islands 
covered by at least one q40-read in at least ten CpGs\t" >>$QCdir/${sname}_cpg_dist_table.txt + awk -F" " '$2>=10' $QCdir/${sname}_cpg_dist_table_temp.txt | wc -l >>$QCdir/${sname}_cpg_dist_table.txt + /bin/rm $QCdir/${sname}_cpg_dist_table_temp.txt + fi + + ########################## + ## uniformity + ########################## + [[ ! -f "$QCdir/${sname}_covdist_q40_table.txt" ]] && BISCUIT_QC_UNIFORMITY=false + [[ ! -f "$QCdir/${sname}_bga_q40.bed" ]] && BISCUIT_QC_UNIFORMITY=false + if [[ "$BISCUIT_QC_UNIFORMITY" == true ]]; then + >&2 echo "`date`---- BISCUIT_QC_UNIFORMITY ----" + + echo -e "BISCUITqc Uniformity Table" >$QCdir/${sname}_all_cv_table.txt + awk -v sname="${sname}" '{cnt[$1]=$2}END{for (cov in cnt) {sum_cov+=cnt[cov]*cov; sum_cnt+=cnt[cov];} mu=sum_cov/sum_cnt; for(cov in cnt) {sum_var+=((cov-mu)^2)*cnt[cov];} sigma=sqrt(sum_var/sum_cnt); print "sample\tmu\tsigma\tcv\n"sname"_all\t"mu"\t"sigma"\t"sigma/mu}' $QCdir/${sname}_covdist_q40_table.txt >>$QCdir/${sname}_all_cv_table.txt + + if [[ -f "$BISCUIT_TOPGC_BED" && -f "$BISCUIT_BOTGC_BED" ]]; then + echo -e "BISCUITqc Depth Distribution (high GC, Q40)" >$QCdir/${sname}_covdist_q40_topgc_table.txt + echo -e "depth\tcount" >>$QCdir/${sname}_covdist_q40_topgc_table.txt + bedtools intersect -a $QCdir/${sname}_bga_q40.bed -b $BISCUIT_TOPGC_BED -sorted | awk -v sname="${sname}" -v output="$QCdir/${sname}_all_cv_table.txt" '{cnt[$4]+=$3-$2}END{for (cov in cnt) {print cov"\t"cnt[cov]; sum_cov+=cnt[cov]*cov; sum_cnt+=cnt[cov];} mu=sum_cov/sum_cnt; for(cov in cnt) {sum_var+=((cov-mu)^2)*cnt[cov];} sigma=sqrt(sum_var/sum_cnt); print sname"_all_topgc\t"mu"\t"sigma"\t"sigma/mu >>output}' | sort -k1,1n >>$QCdir/${sname}_covdist_q40_topgc_table.txt + + echo -e "BISCUITqc Depth Distribution (low GC, Q40)" >$QCdir/${sname}_covdist_q40_botgc_table.txt + echo -e "depth\tcount" >>$QCdir/${sname}_covdist_q40_botgc_table.txt + bedtools intersect -a $QCdir/${sname}_bga_q40.bed -b $BISCUIT_BOTGC_BED -sorted | awk -v
sname="${sname}" -v output="$QCdir/${sname}_all_cv_table.txt" '{cnt[$4]+=$3-$2}END{for (cov in cnt) {print cov"\t"cnt[cov]; sum_cov+=cnt[cov]*cov; sum_cnt+=cnt[cov];} mu=sum_cov/sum_cnt; for(cov in cnt) {sum_var+=((cov-mu)^2)*cnt[cov];} sigma=sqrt(sum_var/sum_cnt); print sname"_all_botgc\t"mu"\t"sigma"\t"sigma/mu >>output}' | sort -k1,1n >>$QCdir/${sname}_covdist_q40_botgc_table.txt + fi + fi + + ########################## + ## cpg uniformity + ########################## + [[ ! -f "$QCdir/${sname}_covdist_cpg_q40_table.txt" ]] && BISCUIT_QC_CPGUNIF=false + [[ ! -f "$QCdir/${sname}_cpg_q40.bed" ]] && BISCUIT_QC_CPGUNIF=false + if [[ "$BISCUIT_QC_CPGUNIF" == true ]]; then + >&2 echo "`date`---- BISCUIT_QC_CPGUNIF ----" + + echo -e "BISCUITqc CpG Uniformity Table" >$QCdir/${sname}_cpg_cv_table.txt + awk -v sname="${sname}" '{cnt[$1]=$2}END{for(cov in cnt) {sum_cov+=cnt[cov]*cov; sum_cnt+=cnt[cov];} mu=sum_cov/sum_cnt; for(cov in cnt) {sum_var+=((cov-mu)^2)*cnt[cov];} sigma=sqrt(sum_var/sum_cnt); print "sample\tmu\tsigma\tcv\n"sname"_cpg\t"mu"\t"sigma"\t"sigma/mu}' $QCdir/${sname}_covdist_cpg_q40_table.txt >>$QCdir/${sname}_cpg_cv_table.txt + + if [[ -f "$BISCUIT_TOPGC_BED" && -f "$BISCUIT_BOTGC_BED" ]]; then + echo -e "BISCUITqc CpG Depth Distribution (high GC, Q40)" >$QCdir/${sname}_covdist_cpg_q40_topgc_table.txt + echo -e "depth\tcount" >>$QCdir/${sname}_covdist_cpg_q40_topgc_table.txt + bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b $BISCUIT_TOPGC_BED -sorted | awk -v sname="${sname}" -v output="$QCdir/${sname}_cpg_cv_table.txt" '{cnt[$4]+=1}END{for (cov in cnt) {print cov"\t"cnt[cov]; sum_cov+=cnt[cov]*cov; sum_cnt+=cnt[cov];} mu=sum_cov/sum_cnt; for(cov in cnt) {sum_var+=((cov-mu)^2)*cnt[cov];} sigma=sqrt(sum_var/sum_cnt); print sname"_cpg_topgc\t"mu"\t"sigma"\t"sigma/mu >>output}' | sort -k1,1n >>$QCdir/${sname}_covdist_cpg_q40_topgc_table.txt + + echo -e "BISCUITqc CpG Depth Distribution (low GC, Q40)" >$QCdir/${sname}_covdist_cpg_q40_botgc_table.txt +
echo -e "depth\tcount" >>$QCdir/${sname}_covdist_cpg_q40_botgc_table.txt + bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b $BISCUIT_BOTGC_BED -sorted | awk -v sname="${sname}" -v output="$QCdir/${sname}_cpg_cv_table.txt" '{cnt[$4]+=1}END{for (cov in cnt) {print cov"\t"cnt[cov]; sum_cov+=cnt[cov]*cov; sum_cnt+=cnt[cov];} mu=sum_cov/sum_cnt; for(cov in cnt) {sum_var+=((cov-mu)^2)*cnt[cov];} sigma=sqrt(sum_var/sum_cnt); print sname"_cpg_botgc\t"mu"\t"sigma"\t"sigma/mu >>output}' | sort -k1,1n >>$QCdir/${sname}_covdist_cpg_q40_botgc_table.txt + fi + fi + + ########################## + ## bisulfite conversion + ########################## + [[ ! -f "$input_vcf" ]] && BISCUIT_QC_BSCONV=false + if [[ "$BISCUIT_QC_BSCONV" == true ]]; then + >&2 echo "`date`---- BISCUIT_QC_BSCONV ----" + + #echo -e "BISCUITqc Frequency of Total Retention per Read Table" >$QCdir/${sname}_freqOfTotalRetentionPerRead.txt + #samtools view -h -q 40 $input_bam | biscuit bsconv $BISCUIT_REFERENCE - | awk 'match($0,/ZN:Z:([^ ]*)/,a){print gensub(/[A-Z,_]+/, "\t", "g", a[1])}' | cut -f2,4,6,8 | awk -v OFS="\t" '{ra[$1]+=1;rc[$2]+=1;rg[$3]+=1;rt[$4]+=1;}END{for(k in ra) {print "CA", k, ra[k]} for(k in rc) {print "CC", k, rc[k]} for(k in rg) {print "CG", k, rg[k]} for(k in rt) {print "CT", k, rt[k]}}' | sort -k1,1 -k2,2n | awk 'BEGIN{print "CTXT\tnumRET\tCnt"}{print}' >>$QCdir/${sname}_freqOfTotalRetentionPerRead.txt + + echo -e "BISCUITqc Frequency of Total Retention per Read Table\nCTXT\tnumRET\tCnt" >$QCdir/${sname}_freqOfTotalRetentionPerRead.txt + samtools view -h -q 40 $input_bam | biscuit bsconv $BISCUIT_REFERENCE - | grep -E -o --color "ZN:Z:[^ ].*" | awk -F '[^0-9]*' -v OFS="\t" '{ra[$2]+=1;rc[$4]+=1;rg[$6]+=1;rt[$8]+=1;}END{for(k in ra) {print "CA", k, ra[k]} for(k in rc) {print "CC", k, rc[k]} for(k in rg) {print "CG", k, rg[k]} for(k in rt) {print "CT", k, rt[k]}}' | sort -k1,1 -k2,2n >>$QCdir/${sname}_freqOfTotalRetentionPerRead.txt + + echo -e "BISCUITqc Conversion Rate by Base
Average Table" >$QCdir/${sname}_totalBaseConversionRate.txt + biscuit vcf2bed -et c $input_vcf | awk '{beta_sum[$6]+=$8; beta_cnt[$6]+=1;} END{print "CA\tCC\tCG\tCT"; print beta_sum["CA"]/beta_cnt["CA"]"\t"beta_sum["CC"]/beta_cnt["CC"]"\t"beta_sum["CG"]/beta_cnt["CG"]"\t"beta_sum["CT"]/beta_cnt["CT"];}' >>$QCdir/${sname}_totalBaseConversionRate.txt + + echo -e "BISCUITqc Conversion Rate by Read Average Table" >$QCdir/${sname}_totalReadConversionRate.txt + samtools view -hq 40 -F 0x900 $input_bam | biscuit bsconv -b $BISCUIT_REFERENCE - | awk '{for(i=1;i<=8;++i) a[i]+=$i;}END{print "CpA\tCpC\tCpG\tCpT"; print a[1]/(a[1]+a[2])"\t"a[3]/(a[3]+a[4])"\t"a[5]/(a[5]+a[6])"\t"a[7]/(a[7]+a[8]);}' >>$QCdir/${sname}_totalReadConversionRate.txt + + echo -e "BISCUITqc CpH Retention by Read Position Table" >$QCdir/${sname}_CpHRetentionByReadPos.txt + echo -e "ReadInPair\tPosition\tConversion/Retention\tCount" >>$QCdir/${sname}_CpHRetentionByReadPos.txt + #samtools view -hq 40 $input_bam | biscuit cinread $BISCUIT_REFERENCE - -t ch -p QPAIR,CQPOS,CRETENTION | sort | uniq -c | awk -F" " '$4!="N"{print $2"\t"$3"\t"$4"\t"$1}' | sort -k1,1 -k2,2n -T $QCdir >>$QCdir/${sname}_CpHRetentionByReadPos.txt + samtools view -hq 40 $input_bam | biscuit cinread $BISCUIT_REFERENCE - -t ch -p QPAIR,CQPOS,CRETENTION | awk -v OFS="\t" '$3!="N"{sorting[$0]++ } END {for (i in sorting) print i,sorting[i]}' | sort -k1,1 -k2,2n >>$QCdir/${sname}_CpHRetentionByReadPos.txt + + echo -e "BISCUITqc CpG Retention by Read Position Table" >$QCdir/${sname}_CpGRetentionByReadPos.txt + echo -e "ReadInPair\tPosition\tConversion/Retention\tCount" >>$QCdir/${sname}_CpGRetentionByReadPos.txt + #samtools view -hq 40 $input_bam | biscuit cinread $BISCUIT_REFERENCE - -t cg -p QPAIR,CQPOS,CRETENTION | sort | uniq -c | awk -F" " '$4!="N"{print $2"\t"$3"\t"$4"\t"$1}' | sort -k1,1 -k2,2n -T $QCdir >>$QCdir/${sname}_CpGRetentionByReadPos.txt + samtools view -hq 40 $input_bam | biscuit cinread $BISCUIT_REFERENCE - -t cg -p 
QPAIR,CQPOS,CRETENTION | awk -v OFS="\t" '$3!="N"{sorting[$0]++ } END {for (i in sorting) print i,sorting[i]}' | sort -k1,1 -k2,2n >>$QCdir/${sname}_CpGRetentionByReadPos.txt + fi + + #################### + ## mapping_summary + #################### + if [[ "$BISCUIT_QC_MAPPING" == true ]]; then + >&2 echo "`date`---- BISCUIT_QC_MAPPING ----" + echo -e "BISCUITqc Strand Table" >$QCdir/${sname}_strand_table.txt + biscuit cinread -p QPAIR,STRAND,BSSTRAND $BISCUIT_REFERENCE $input_bam | awk '{a[$1$2$3]+=1}END{for(strand in a) {print "strand\t"strand"\t"a[strand];}}' >>$QCdir/${sname}_strand_table.txt + + echo -e "BISCUITqc Mapping Quality Table" >$QCdir/${sname}_mapq_table.txt + echo -e "MapQ\tCount" >>$QCdir/${sname}_mapq_table.txt + samtools view -F 0x100 -f 0x4 $input_bam | wc -l | cat <(echo -ne "unmapped\t") - >>$QCdir/${sname}_mapq_table.txt + samtools view -F 0x104 $input_bam | awk '{cnt[$5]+=1}END{for(mapq in cnt) {print mapq"\t"cnt[mapq];}}' | sort -k1,1n >>$QCdir/${sname}_mapq_table.txt + ## insert size + ## this excludes read by AS (40) and mapq (40) + echo -e "BISCUITqc Insert Size, Score Table" >$QCdir/${sname}_isize_score_table.txt + echo -e "InsertSize/Score\tValue\tFraction" >>$QCdir/${sname}_isize_score_table.txt + # samtools view -F 0x104 $input_bam | awk '{match($0,/AS:i:([0-9]*)/,a); score[a[1]]+=1; sumscore+=1; if (and($2,0x2) && a[1]>=40 && $5>=40 && $9>=0 && $9 <=2000) {isize[$9]+=1; sumisize+=1}}END{for(k in isize){print "I\t"k"\t"isize[k] / sumisize} for(k in score){print "S\t"k"\t"score[k] / sumscore}}' | sort -k1,1 -k2,2n >>$QCdir/${sname}_isize_score_table.txt + samtools view -F 0x104 $input_bam | grep -E 'AS:i:([0-9]*)' | awk -F 'AS:i:|\t' '{score[$17]+=1; sumscore+=1; if (and($2,0x2) && $17>=40 && $5>=40 && $9>=0 && $9 <=2000) {isize[$9]+=1; sumisize+=1}}END{for(k in isize){print "I\t"k"\t"isize[k] / sumisize} for(k in score){print "S\t"k"\t"score[k] / sumscore}}' | sort -k1,1 -k2,2n >>$QCdir/${sname}_isize_score_table.txt + fi + + + 
################################### + ## CpG retention distribution + ################################### + [[ ! -f "$input_vcf" ]] && BISCUIT_QC_BETAS=false + if [[ "$BISCUIT_QC_BETAS" == true ]]; then + echo -e "BISCUITqc Retention Distribution Table" >$QCdir/${sname}_CpGRetentionDist.txt + echo -e "RetentionFraction\tCount" >>$QCdir/${sname}_CpGRetentionDist.txt + biscuit vcf2bed -t cg $input_vcf | awk '$5>=3{a[sprintf("%3.0f", $4*100)]+=1}END{for (beta in a) print beta"\t"a[beta];}' | sort -k1,1n >>$QCdir/${sname}_CpGRetentionDist.txt + fi + + ################################### + ## Remove bed files + ################################### + if [[ "$remove_bed_files" ]]; then + rm $QCdir/*.bed + fi + + ######################################## + ## Running check on output files + ######################################## + basic_check_output_filled +} + + + + +ARGPARSE_DESCRIPTION="Run QC on biscuit output" +argparse "$@" <', + help='Path to vcf outupt from BISCUIT') +parser.add_argument('-o', '--outdir', type=str, default='BISCUITqc', + help='output directory [default %(default)s]') +parser.add_argument('--do_not_remove_bed', action='store_false', + default=True, help='Whether remove bed files [default %(default)s]') +parser.add_argument('-p', '--processes', type=int, default=1, + help='Number of processes to use [default %(default)s]') +EOF + +if [[ ! 
-d "$assets_directory" ]]; then + echo "Assets directory missing: $assets_directory."; + exit 1; +fi + +source "$(dirname "${BASH_SOURCE[0]}")/setup.sh" "$genome" "$assets_directory" # setup.sh reads $1 = reference, $2 = assets dir; quoted so whitespace or an empty value cannot shift the positions +input_vcf=$vcf +QCdir=$outdir +sname=$sample_name +remove_bed_files=$do_not_remove_bed + +>&2 echo "## Running BISCUIT QC script with following configuration ##" +>&2 echo "==============" +>&2 echo "sample name: $sname" +>&2 echo "input bam: $input_bam" +>&2 echo "input vcf: $input_vcf" +>&2 echo "output dir: $QCdir" +>&2 echo "REFERENCE: $BISCUIT_REFERENCE" +>&2 echo "CPGBED: $BISCUIT_CPGBED" +>&2 echo "CGIBED: $BISCUIT_CGIBED" +>&2 echo "RMSK: $BISCUIT_RMSK" +>&2 echo "EXON: $BISCUIT_EXON" +>&2 echo "GENE: $BISCUIT_GENE" +>&2 echo "TOPGC_BED: $BISCUIT_TOPGC_BED" +>&2 echo "BOTGC_BED: $BISCUIT_BOTGC_BED" +>&2 echo "==============" +biscuitQC +#>&2 echo $remove_bed_files $QCdir $input_vcf + +>&2 echo -e "\nDone." diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index b8880dc8..62a5749a 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -20,10 +20,17 @@ 'BWA': ['v_bwa.txt', r"Version: (\S+)"], 'bwa-meth': ['v_bwameth.txt', r"bwa-meth\.py (\S+)"], 'Picard MarkDuplicates': ['v_picard_markdups.txt', r"([\d\.]+)"], - 'MethylDackel': ['v_methyldackel.txt', r"(.+)"], + 'Picard CreateSequenceDictionary': ['v_picard_createseqdict.txt', r"([\d\.]+)"], + 'Picard CollectInsertSizeMetrics': ['v_picard_collectinssize.txt', r"([\d\.]+)"], + 'Picard CollectGcBiasMetrics': ['v_picard_collectgcbias.txt', r"([\d\.]+)"], + 'samblaster': ['v_samblaster.txt', r"samblaster: Version (\S+)"], + 'biscuit': ['v_biscuit.txt', r"Version: (\S+)"], + 'fastasort': ['v_fastasort.txt', r"fastasort from exonerate version (\S+)"], + 'MethylDackel': ['v_methyldackel.txt', r"(.+)"], 'Qualimap': ['v_qualimap.txt', r"QualiMap v.(\S+)"], 'Preseq': ['v_preseq.txt', r"Version: (\S+)"], 'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"], + } results = OrderedDict() 
results['nf-core/methylseq'] = 'N/A' @@ -45,6 +52,12 @@ results['Qualimap'] = 'N/A' results['Preseq'] = 'N/A' results['MultiQC'] = 'N/A' +results['samblaster'] = 'N/A' +results['biscuit'] = 'N/A' +results['fastasort'] = 'N/A' +results['Picard CreateSequenceDictionary'] = 'N/A' +results['Picard CollectInsertSizeMetrics'] = 'N/A' +results['Picard CollectGcBiasMetrics'] = 'N/A' # Search each file using its regex for k, v in regexes.items(): @@ -58,7 +71,7 @@ results[k] = False # Remove empty keys (defining them above ensures correct order) -for k in ['Bismark', 'Bismark Deduplication', 'Bismark methXtract', 'Bismark Report', 'Bismark Summary', 'Samtools', 'BWA', 'bwa-meth', 'Picard MarkDuplicates', 'MethylDackel']: +for k in ['Bismark', 'Bismark Deduplication', 'Bismark methXtract', 'Bismark Report', 'Bismark Summary', 'Samtools', 'BWA', 'bwa-meth', 'Picard MarkDuplicates', 'MethylDackel','samblaster','biscuit','fastasort','Picard CreateSequenceDictionary','Picard CollectInsertSizeMetrics','Picard CollectGcBiasMetrics']: if results[k] == 'N/A': del(results[k]) diff --git a/bin/setup.sh b/bin/setup.sh new file mode 100755 index 00000000..fd3901e6 --- /dev/null +++ b/bin/setup.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +## edit this +##### required ##### +## samtools fai-indexed reference +export BISCUIT_REFERENCE="$1" + +assets_dir="$2" +#this_dir=$(dirname ${BASH_SOURCE[0]}) +##### optional ##### +## use if the file is nonexistent, the corresponding +## QC section will be skipped +## CpGs +export BISCUIT_CPGBED="$assets_dir/cpg.bed.gz" +## CpG islands +export BISCUIT_CGIBED="$assets_dir/cgi.bed.gz" +## repeat masker bed file +export BISCUIT_RMSK="$assets_dir/rmsk.bed.gz" +## merged exon bed file +export BISCUIT_EXON="$assets_dir/exon.bed.gz" +## genes +export BISCUIT_GENE="$assets_dir/genes.bed.gz" +## locations for the top 100bp bins in GC content +export BISCUIT_TOPGC_BED="$assets_dir/windows100bp.gc_content.top10p.bed.gz" +## locations for the bottom 100bp bins in 
GC content +export BISCUIT_BOTGC_BED="$assets_dir/windows100bp.gc_content.bot10p.bed.gz" + +### QC operations to perform ### +export BISCUIT_QC_BASECOV=true +export BISCUIT_QC_DUPLICATE=true +export BISCUIT_QC_CPGCOV=true +export BISCUIT_QC_CPGDIST=true +export BISCUIT_QC_CGICOV=true +export BISCUIT_QC_UNIFORMITY=true +export BISCUIT_QC_CPGUNIF=true +export BISCUIT_QC_BSCONV=true +export BISCUIT_QC_CGICOV=true +export BISCUIT_QC_MAPPING=true +export BISCUIT_QC_BETAS=true diff --git a/conf/base.config b/conf/base.config index efbf7202..12c44d30 100644 --- a/conf/base.config +++ b/conf/base.config @@ -26,8 +26,14 @@ process { memory = { check_max( 64.GB * task.attempt, 'memory') } time = { check_max( 36.h * task.attempt, 'time') } } + withName:makeBwaMemIndex { + cpus = { check_max( 8 * task.attempt, 'cpus') } + memory = { check_max( 64.GB * task.attempt, 'memory') } + time = { check_max( 36.h * task.attempt, 'time') } + } + withName:trim_galore { - cpus = { check_max( 15 * task.attempt, 'cpus') } + cpus = { check_max( 8 * task.attempt, 'cpus') } memory = { check_max( 6.GB * task.attempt, 'memory') } time = { check_max( 1.d * task.attempt, 'time') } } @@ -48,11 +54,15 @@ process { } withName:qualimap { cpus = { check_max( 4 * task.attempt, 'cpus') } - memory = { check_max( 32.GB * task.attempt, 'memory') } - time = { check_max( 6.h * task.attempt, 'time') } + memory = { check_max( 40.GB * task.attempt, 'memory') } + time = { check_max( 20.h * task.attempt, 'time') } } withName:preseq { errorStrategy = 'ignore' + cpus = { check_max( 4 * task.attempt, 'cpus') } + memory = { check_max( 8.GB * task.attempt, 'memory') } + time = { check_max( 15.h * task.attempt, 'time') } + } withName:get_software_versions { validExitStatus = [0,1] @@ -77,6 +87,81 @@ process { cpus = { check_max( 6 * task.attempt, 'cpus') } memory = { check_max( 48.GB * task.attempt, 'memory') } time = { check_max( 1.d * task.attempt, 'time') } + } + + +withName:biscuit_align { + cpus = { check_max( 10 * 
task.attempt, 'cpus') } + memory = { check_max( 64.GB * task.attempt, 'memory') } + time = { check_max( 6.d * task.attempt, 'time') } + } + withName:makeBwaBISCUITIndex { + cpus = { check_max( 8 * task.attempt, 'cpus') } + memory = { check_max( 64.GB * task.attempt, 'memory') } + time = { check_max( 36.h * task.attempt, 'time') } + } + withName:samtools_sort_index_flagstat_biscuit { + cpus = { check_max( 6 * task.attempt, 'cpus') } + memory = { check_max( 36.GB * task.attempt, 'memory') } + time = { check_max( 2.d * task.attempt, 'time') } + } + + withName:markDuplicates_samblaster { + cpus = { check_max( 10 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 3.d * task.attempt, 'time') } + } + + withName:createVCF { + cpus = { check_max( 4 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 3.d * task.attempt, 'time') } + } + withName:CreateVCF { + cpus = { check_max( 4 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 2.d * task.attempt, 'time') } + } + withName:biscuit_QC { + cpus = { check_max( 4 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 5.d * task.attempt, 'time') } + } + withName:intersect_soloWCGW_file { + cpus = { check_max( 1 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 12.h * task.attempt, 'time') } + } + withName:createBedgraph { + cpus = { check_max( 1 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 12.h * task.attempt, 'time') } + } + withName:picardMetrics { + cpus = { check_max( 4 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 2.d * task.attempt, 'time') } + } + withName:prepareGenomeToPicard { + cpus = { check_max( 2 * task.attempt, 'cpus') } + memory 
= { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 1.d * task.attempt, 'time') } + } + withName:epiread_convertion { + cpus = { check_max( 4 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 2.d * task.attempt, 'time') } + } + withName:get_SNP_file { + cpus = { check_max( 2 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 2.d * task.attempt, 'time') } + } + + withName:fastqc { + cpus = { check_max( 6 * task.attempt, 'cpus') } + memory = { check_max( 32.GB * task.attempt, 'memory') } + time = { check_max( 2.d * task.attempt, 'time') } } } @@ -87,3 +172,5 @@ params { max_time = 240.h igenomes_base = 's3://ngi-igenomes/igenomes/' } + +// cleanup = true diff --git a/environment.yml b/environment.yml index 8850c341..1ffb82c2 100644 --- a/environment.yml +++ b/environment.yml @@ -1,12 +1,11 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: nf-core-methylseq-1.4.1 +name: nf-core-methylseq-1.5dev channels: - conda-forge - bioconda - defaults dependencies: - - conda-forge::python=3.7.3 - conda-forge::pigz=2.3.4 - conda-forge::r-markdown=1.1 - bioconda::fastqc=0.11.8 @@ -16,10 +15,14 @@ dependencies: - bioconda::bowtie2=2.3.5 - bioconda::hisat2=2.1.0 - bioconda::bismark=0.22.3 - - bioconda::qualimap=2.2.2d + - bioconda::qualimap=2.2.2c - bioconda::preseq=2.0.3 - - bioconda::multiqc=1.8 + - bioconda::multiqc=1.7 # bwa-meth pipeline - - bioconda::picard=2.21.4 + - bioconda::picard=2.21.3 - bioconda::bwameth=0.2.2 - bioconda::methyldackel=0.4.0 +# added + - bioconda::samblaster=0.1.24 + - conda-forge::python=3.6.5 + - bioconda::bedtools=2.29.1 diff --git a/main.nf b/main.nf index 2580d61a..96a1b8c6 100644 --- a/main.nf +++ b/main.nf @@ -21,7 +21,7 @@ def helpMessage() { Mandatory arguments: --aligner [str] Alignment tool to use (default: bismark) - Available: bismark, 
bismark_hisat, bwameth + Available: bismark, bismark_hisat, bwameth, biscuit --reads [path] Path to input data (must be surrounded with quotes) -profile [str] Configuration profile to use. Can use multiple (comma separated) Available: conda, docker, singularity, awsbatch, test and more. @@ -30,28 +30,32 @@ def helpMessage() { --genome [str] Name of iGenomes reference --single_end [bool] Specifies that the input is single end reads --comprehensive [bool] Output information for all cytosine contexts - --cytosine_report [bool] Output stranded cytosine report during Bismark's bismark_methylation_extractor step. --ignore_flags [bool] Run MethylDackel with the flag to ignore SAM flags. --meth_cutoff [int] Specify a minimum read coverage to report a methylation call during Bismark's bismark_methylation_extractor step. - --min_depth [int] Specify a minimum read coverage for MethylDackel to report a methylation call. + --min_depth [int] Specify a minimum read coverage for MethylDackel to report a methylation call or for biscuit pileup. --methyl_kit [bool] Run MethylDackel with the --methyl_kit flag to produce files suitable for use with the methylKit R package. --skip_deduplication [bool] Skip deduplication step after alignment. This is turned on automatically if --rrbs is specified --non_directional [bool] Run alignment against all four possible strands --save_align_intermeds [bool] Save aligned intermediates to results directory --save_trimmed [bool] Save trimmed reads to results directory + --save_pileup_file [bool] Save vcf-pileup and index-vcf files from biscuit aligner to results directory + --save_snp_file Save SNP bed-file from biscuit to results directory. 
Relevant only if '--epiread' is specified --unmapped [bool] Save unmapped reads to fastq files --relax_mismatches [bool] Turn on to relax stringency for alignment (set allowed penalty with --num_mismatches) --num_mismatches [float] 0.6 will allow a penalty of bp * -0.6 - for 100bp reads (bismark default is 0.2) --known_splices [file] Supply a .gtf file containing known splice sites (bismark_hisat only) --slamseq [bool] Run bismark in SLAM-seq mode --local_alignment [bool] Allow soft-clipping of reads (potentially useful for single-cell experiments) - --bismark_align_cpu_per_multicore [int] Specify how many CPUs are required per --multicore for bismark align (default = 3) - --bismark_align_mem_per_multicore [str] Specify how much memory is required per --multicore for bismark align (default = 13.GB) + --soloWCGW_file [path] soloWCGW file, to intersect with methyl_extract bed file. soloWCGW for hg38 can be downlaod from: www.cse.huji.ac.il/~ekushele/solo_WCGW_cpg_hg38.bed. EXPERMINTAL! + --assets_dir [path] Assets directory for biscuit_QC, REQUIRED IF IN BISCUIT ALIGNER. can be found at: https://www.cse.huji.ac.il/~ekushele/assets.html + --epiread [bool] Convert bam to biscuit epiread format + References If not specified in the configuration file or you wish to overwrite any of the references. 
--fasta [file] Path to Fasta reference --fasta_index [path] Path to Fasta Index --bismark_index [path] Path to Bismark index + --bwa_biscuit_index [path] Path to Biscuit index --bwa_meth_index [path] Path to bwameth index --save_reference [bool] Save reference(s) to results directory @@ -70,13 +74,14 @@ def helpMessage() { --accell [bool] --zymo [bool] --cegx [bool] + --swift [bool] Other options: --outdir [path] The output directory where the results will be saved --email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits --email_on_fail [email] Same as --email, except only send mail if the workflow is not successful --max_multiqc_email_size [str] Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) - -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. + --name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. AWSBatch options: --awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch @@ -91,17 +96,21 @@ if (params.help) { } // Validate inputs -assert params.aligner == 'bwameth' || params.aligner == 'bismark' || params.aligner == 'bismark_hisat' : "Invalid aligner option: ${params.aligner}. Valid options: 'bismark', 'bwameth', 'bismark_hisat'" +assert params.aligner == 'bwameth' || params.aligner == 'bismark' || params.aligner == 'bismark_hisat' || params.aligner == 'biscuit' : "Invalid aligner option: ${params.aligner}. Valid options: 'bismark', 'bwameth', 'bismark_hisat', 'biscuit'" /* * SET UP CONFIGURATION VARIABLES */ - + // These params need to be set late, after the iGenomes config is loaded params.bismark_index = params.genome ? 
params.genomes[ params.genome ].bismark ?: false : false params.bwa_meth_index = params.genome ? params.genomes[ params.genome ].bwa_meth ?: false : false params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false params.fasta_index = params.genome ? params.genomes[ params.genome ].fasta_index ?: false : false +params.bwa_biscuit_index = false +params.soloWCGW_file = false +assembly_name = (params.fasta.toString().lastIndexOf('/') == -1) ? params.fasta.toString() : params.fasta.toString().substring( params.fasta.toString().lastIndexOf('/')+1) // file name of the fasta with any directory part stripped; a path without '/' is used as-is + // Check if genome exists in the config file if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { @@ -116,29 +125,29 @@ ch_splicesites_for_bismark_hisat_align = params.known_splices ? Channel.fromPath if( params.aligner =~ /bismark/ ){ assert params.bismark_index || params.fasta : "No reference genome index or fasta file specified" - ch_wherearemyfiles_for_alignment.set { ch_wherearemyfiles_for_bismark_align } - + ch_wherearemyfiles_for_alignment.into { ch_wherearemyfiles_for_bismark_align;ch_wherearemyfiles_for_bismark_dedup_samtools_sort;ch_wherearemyfiles_for_bismark_samtools_sort } + Channel + .fromPath(params.fasta, checkIfExists: true) + .ifEmpty { exit 1, "fasta file not found : ${params.fasta}" } + .into { ch_fasta_for_makeBismarkIndex; ch_fasta_for_picard } + if( params.bismark_index ){ Channel .fromPath(params.bismark_index, checkIfExists: true) .ifEmpty { exit 1, "Bismark index file not found: ${params.bismark_index}" } - .into { ch_bismark_index_for_bismark_align; ch_bismark_index_for_bismark_methXtract } - } - else if( params.fasta ){ - Channel - .fromPath(params.fasta, checkIfExists: true) - .ifEmpty { exit 1, "fasta file not found : ${params.fasta}" } - .set { ch_fasta_for_makeBismarkIndex } + .set { ch_bismark_index_for_bismark_align } + ch_fasta_for_makeBismarkIndex.close() } + } -else if( params.aligner == 'bwameth' ){ - assert params.fasta : "No Fasta reference specified! 
This is required by MethylDackel." - ch_wherearemyfiles_for_alignment.into { ch_wherearemyfiles_for_bwamem_align; ch_wherearemyfiles_for_samtools_sort_index_flagstat } +else if( params.aligner == 'bwameth' || params.aligner == 'biscuit'){ + assert params.fasta : "No Fasta reference specified!" + ch_wherearemyfiles_for_alignment.into { ch_wherearemyfiles_for_bwamem_align; ch_wherearemyfiles_for_biscuit_align; ch_wherearemyfiles_for_samtools_sort_index_flagstat; ch_wherearemyfiles_for_samblaster } Channel .fromPath(params.fasta, checkIfExists: true) .ifEmpty { exit 1, "fasta file not found : ${params.fasta}" } - .into { ch_fasta_for_makeBwaMemIndex; ch_fasta_for_makeFastaIndex; ch_fasta_for_methyldackel } + .into { ch_fasta_for_makeBwaMemIndex; ch_fasta_for_makeFastaIndex; ch_fasta_for_methyldackel; ch_fasta_for_pileup; ch_fasta_for_epiread; ch_fasta_for_biscuitQC; ch_fasta_for_picard} if( params.bwa_meth_index ){ Channel @@ -148,16 +157,33 @@ else if( params.aligner == 'bwameth' ){ ch_fasta_for_makeBwaMemIndex.close() } + if( params.bwa_biscuit_index ){ + Channel + .fromPath("${params.bwa_biscuit_index}*", checkIfExists: true) + .ifEmpty { exit 1, "bwa (biscuit) index file(s) not found: ${params.bwa_biscuit_index}" } + .set { ch_bwa_index_for_biscuit } + ch_fasta_for_makeBwaMemIndex.close() + } + if( params.fasta_index ){ Channel .fromPath(params.fasta_index, checkIfExists: true) .ifEmpty { exit 1, "fasta index file not found: ${params.fasta_index}" } - .set { ch_fasta_index_for_methyldackel } + .into { ch_fasta_index_for_methyldackel; ch_fasta_index_for_biscuitQC; ch_fasta_index_for_createVCF; ch_fasta_index_for_epiread } ch_fasta_for_makeFastaIndex.close() } + } + +if( params.aligner == 'biscuit' ) { + + Channel + .fromPath("${params.assets_dir}", checkIfExists: true) + .ifEmpty { exit 1, "Assets directory for biscuit QC not found: ${params.assets_dir}" } + .set { ch_assets_dir_for_biscuit_qc } } -if( workflow.profile == 'uppmax' ){ + +if( workflow.profile == 
'uppmax' || workflow.profile == 'uppmax_devel' ){ if( !params.project ) exit 1, "No UPPMAX project ID found! Use --project" } @@ -168,40 +194,55 @@ if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { custom_runName = workflow.runName } -// Trimming presets -clip_r1 = params.clip_r1 -clip_r2 = params.clip_r2 -three_prime_clip_r1 = params.three_prime_clip_r1 -three_prime_clip_r2 = params.three_prime_clip_r2 +// Library prep presets +params.rrbs = false +params.pbat = false +params.single_cell = false +params.epignome = false +params.accel = false +params.zymo = false +params.cegx = false +params.swift = false if(params.pbat){ - clip_r1 = 9 - clip_r2 = 9 - three_prime_clip_r1 = 9 - three_prime_clip_r2 = 9 + params.clip_r1 = 9 + params.clip_r2 = 9 + params.three_prime_clip_r1 = 9 + params.three_prime_clip_r2 = 9 } else if( params.single_cell ){ - clip_r1 = 6 - clip_r2 = 6 - three_prime_clip_r1 = 6 - three_prime_clip_r2 = 6 + params.clip_r1 = 6 + params.clip_r2 = 6 + params.three_prime_clip_r1 = 6 + params.three_prime_clip_r2 = 6 } else if( params.epignome ){ - clip_r1 = 8 - clip_r2 = 8 - three_prime_clip_r1 = 8 - three_prime_clip_r2 = 8 + params.clip_r1 = 8 + params.clip_r2 = 8 + params.three_prime_clip_r1 = 8 + params.three_prime_clip_r2 = 8 } else if( params.accel || params.zymo ){ - clip_r1 = 10 - clip_r2 = 15 - three_prime_clip_r1 = 10 - three_prime_clip_r2 = 10 + params.clip_r1 = 10 + params.clip_r2 = 15 + params.three_prime_clip_r1 = 10 + params.three_prime_clip_r2 = 10 } else if( params.cegx ){ - clip_r1 = 6 - clip_r2 = 6 - three_prime_clip_r1 = 2 - three_prime_clip_r2 = 2 + params.clip_r1 = 6 + params.clip_r2 = 6 + params.three_prime_clip_r1 = 2 + params.three_prime_clip_r2 = 2 +} +else if( params.swift ){ + params.clip_r1 = 0 + params.clip_r2 = 14 + params.three_prime_clip_r1 = 0 + params.three_prime_clip_r2 = 0 +} else { + params.clip_r1 = 0 + params.clip_r2 = 0 + params.three_prime_clip_r1 = 0 + params.three_prime_clip_r2 = 0 } if 
(workflow.profile.contains('awsbatch')) { @@ -242,64 +283,86 @@ if (params.readPaths) { .into { ch_read_files_for_fastqc; ch_read_files_for_trim_galore } } +if (params.soloWCGW_file) { + Channel + .fromPath(params.soloWCGW_file) + .into { ch_soloWCGW_for_biscuitVCF; } +} // Header log info log.info nfcoreHeader() def summary = [:] -summary['Run Name'] = custom_runName ?: workflow.runName -summary['Reads'] = params.reads -summary['Aligner'] = params.aligner -summary['Data Type'] = params.single_end ? 'Single-End' : 'Paired-End' -if(params.known_splices) summary['Spliced alignment'] = 'Yes' -if(params.slamseq) summary['SLAM-seq'] = 'Yes' -if(params.local_alignment) summary['Local alignment'] = 'Yes' -if(params.genome) summary['Genome'] = params.genome -if(params.bismark_index) summary['Bismark Index'] = params.bismark_index -if(params.bwa_meth_index) summary['BWA-Meth Index'] = "${params.bwa_meth_index}*" -if(params.fasta) summary['Fasta Ref'] = params.fasta -if(params.fasta_index) summary['Fasta Index'] = params.fasta_index -if(params.rrbs) summary['RRBS Mode'] = 'On' -if(params.relax_mismatches) summary['Mismatch Func'] = "L,0,-${params.num_mismatches} (Bismark default = L,0,-0.2)" -if(params.skip_trimming) summary['Trimming Step'] = 'Skipped' -if(params.pbat) summary['Trim Profile'] = 'PBAT' -if(params.single_cell) summary['Trim Profile'] = 'Single Cell' -if(params.epignome) summary['Trim Profile'] = 'TruSeq (EpiGnome)' -if(params.accel) summary['Trim Profile'] = 'Accel-NGS (Swift)' -if(params.zymo) summary['Trim Profile'] = 'Zymo Pico-Methyl' -if(params.cegx) summary['Trim Profile'] = 'CEGX' -summary['Trimming'] = "5'R1: $clip_r1 / 5'R2: $clip_r2 / 3'R1: $three_prime_clip_r1 / 3'R2: $three_prime_clip_r2" -summary['Deduplication'] = params.skip_deduplication || params.rrbs ? 
'No' : 'Yes' +if (workflow.revision) summary['Pipeline Release'] = workflow.revision +summary['Pipeline Name'] = 'nf-core/methylseq' +summary['Run Name'] = custom_runName ?: workflow.runName +summary['Reads'] = params.reads +summary['Aligner'] = params.aligner +summary['Spliced alignment'] = params.known_splices ? 'Yes' : 'No' +summary['SLAM-seq'] = params.slamseq ? 'Yes' : 'No' +summary['Local alignment'] = params.local_alignment ? 'Yes' : 'No' +summary['Data Type'] = params.single_end ? 'Single-End' : 'Paired-End' +summary['Genome'] = params.genome +if( params.bismark_index ) summary['Bismark Index'] = params.bismark_index +if( params.bwa_meth_index ) summary['BWA Meth Index'] = "${params.bwa_meth_index}*" +if( params.bwa_biscuit_index ) summary['BWA Index'] = "${params.bwa_biscuit_index}*" +if( params.fasta ) summary['Fasta Ref'] = params.fasta +if( params.fasta_index ) summary['Fasta Index'] = params.fasta_index +if( params.rrbs ) summary['RRBS Mode'] = 'On' +if( params.relax_mismatches ) summary['Mismatch Func'] = "L,0,-${params.num_mismatches} (Bismark default = L,0,-0.2)" +if( params.skip_trimming ) summary['Trimming Step'] = 'Skipped' +if( params.pbat ) summary['Trim Profile'] = 'PBAT' +if( params.single_cell ) summary['Trim Profile'] = 'Single Cell' +if( params.epignome ) summary['Trim Profile'] = 'TruSeq (EpiGnome)' +if( params.accel ) summary['Trim Profile'] = 'Accel-NGS (Swift)' +if( params.zymo ) summary['Trim Profile'] = 'Zymo Pico-Methyl' +if( params.cegx ) summary['Trim Profile'] = 'CEGX' +if( params.swift ) summary['Trim Profile'] = 'Swift' + +summary['Trim R1'] = params.clip_r1 +summary['Trim R2'] = params.clip_r2 +summary["Trim 3' R1"] = params.three_prime_clip_r1 +summary["Trim 3' R2"] = params.three_prime_clip_r2 +summary['Deduplication'] = params.skip_deduplication || params.rrbs ? 'No' : 'Yes' summary['Directional Mode'] = params.single_cell || params.zymo || params.non_directional ? 
'No' : 'Yes' -summary['All C Contexts'] = params.comprehensive ? 'Yes' : 'No' -summary['Cytosine report'] = params.cytosine_report ? 'Yes' : 'No' -if(params.min_depth) summary['Minimum Depth'] = params.min_depth -if(params.ignore_flags) summary['MethylDackel'] = 'Ignoring SAM Flags' -if(params.methyl_kit) summary['MethylDackel'] = 'Producing methyl_kit output' -save_intermeds = []; -if(params.save_reference) save_intermeds.add('Reference genome build') -if(params.save_trimmed) save_intermeds.add('Trimmed FastQ files') -if(params.unmapped) save_intermeds.add('Unmapped reads') -if(params.save_align_intermeds) save_intermeds.add('Intermediate BAM files') -if(save_intermeds.size() > 0) summary['Save Intermediates'] = save_intermeds.join(', ') -if(params.bismark_align_cpu_per_multicore) summary['Bismark align CPUs per --multicore'] = params.bismark_align_cpu_per_multicore -if(params.bismark_align_mem_per_multicore) summary['Bismark align memory per --multicore'] = params.bismark_align_mem_per_multicore +summary['All C Contexts'] = params.comprehensive ? 'Yes' : 'No' +if( params.min_depth ) summary['Minimum Depth'] = params.min_depth +if( params.ignore_flags ) summary['MethylDackel'] = 'Ignoring SAM Flags' +if( params.methyl_kit ) summary['MethylDackel'] = 'Producing methyl_kit output' +if( params.assets_dir ) summary['Assets Directory'] = params.assets_dir +if( params.soloWCGW_file ) summary['soloWCGW File'] = params.soloWCGW_file +if( params.epiread ) summary['Epiread'] = params.epiread ? 'Yes' : 'No' + +summary['Save Reference'] = params.save_reference ? 'Yes' : 'No' +summary['Save Trimmed'] = params.save_trimmed ? 'Yes' : 'No' +summary['Save Unmapped'] = params.unmapped ? 'Yes' : 'No' +summary['Save Intermediates'] = params.save_align_intermeds ? 'Yes' : 'No' +summary['Save Pileups'] = params.save_pileup_file ? 'Yes' : 'No' +summary['Save SNP bed-file'] = params.save_snp_file ? 
'Yes' : 'No' + + +summary['Current home'] = "$HOME" +summary['Current path'] = "$PWD" +if( params.project ) summary['UPPMAX Project'] = params.project + +summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" +if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" summary['Output dir'] = params.outdir summary['Launch dir'] = workflow.launchDir summary['Working dir'] = workflow.workDir -summary['Pipeline dir'] = workflow.projectDir +summary['Script dir'] = workflow.projectDir summary['User'] = workflow.userName -summary['Config Profile'] = workflow.profile -if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" if (workflow.profile.contains('awsbatch')) { summary['AWS Region'] = params.awsregion summary['AWS Queue'] = params.awsqueue } -if(params.project) summary['Cluster Project'] = params.project +summary['Config Profile'] = workflow.profile if (params.config_profile_description) summary['Config Description'] = params.config_profile_description if (params.config_profile_contact) summary['Config Contact'] = params.config_profile_contact if (params.config_profile_url) summary['Config URL'] = params.config_profile_url -summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" -if(params.email) summary['E-mail Address'] = params.email -if(params.email_on_fail) summary['E-mail on failure'] = params.email_on_fail +if (params.email || params.email_on_fail) { + summary['E-mail Address'] = params.email + summary['E-mail on failure'] = params.email_on_fail + summary['MultiQC maxsize'] = params.max_multiqc_email_size +} log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n") log.info "-\033[2m--------------------------------------------------\033[0m-" @@ -335,32 +398,41 @@ process get_software_versions { output: file 'software_versions_mqc.yaml' into 
ch_software_versions_yaml_for_multiqc - file "software_versions.csv" + file "software_versions.csv" into ch_try + //fastasort --version &> v_fastasort.txt 2>&1 || true script: """ - echo "$workflow.manifest.version" &> v_ngi_methylseq.txt - echo "$workflow.nextflow.version" &> v_nextflow.txt - bismark_genome_preparation --version &> v_bismark_genome_preparation.txt - fastqc --version &> v_fastqc.txt - cutadapt --version &> v_cutadapt.txt - trim_galore --version &> v_trim_galore.txt - bismark --version &> v_bismark.txt - deduplicate_bismark --version &> v_deduplicate_bismark.txt - bismark_methylation_extractor --version &> v_bismark_methylation_extractor.txt - bismark2report --version &> v_bismark2report.txt - bismark2summary --version &> v_bismark2summary.txt - samtools --version &> v_samtools.txt - hisat2 --version &> v_hisat2.txt - bwa &> v_bwa.txt 2>&1 || true - bwameth.py --version &> v_bwameth.txt - picard MarkDuplicates --version &> v_picard_markdups.txt 2>&1 || true - MethylDackel --version &> v_methyldackel.txt - qualimap --version &> v_qualimap.txt || true - preseq &> v_preseq.txt - multiqc --version &> v_multiqc.txt - scrape_software_versions.py &> software_versions_mqc.yaml - """ + echo "$workflow.manifest.version" &> v_ngi_methylseq.txt + echo "$workflow.nextflow.version" &> v_nextflow.txt + bismark_genome_preparation --version &> v_bismark_genome_preparation.txt + fastqc --version &> v_fastqc.txt + cutadapt --version &> v_cutadapt.txt + trim_galore --version &> v_trim_galore.txt + bismark --version &> v_bismark.txt + deduplicate_bismark --version &> v_deduplicate_bismark.txt + bismark_methylation_extractor --version &> v_bismark_methylation_extractor.txt + bismark2report --version &> v_bismark2report.txt + bismark2summary --version &> v_bismark2summary.txt + samtools --version &> v_samtools.txt + hisat2 --version &> v_hisat2.txt + bwa &> v_bwa.txt 2>&1 || true + bwameth.py --version &> v_bwameth.txt + picard MarkDuplicates --version &> 
v_picard_markdups.txt 2>&1 || true + picard CreateSequenceDictionary --version &> v_picard_createseqdict.txt 2>&1 || true + picard CollectInsertSizeMetrics --version &> v_picard_collectinssize.txt 2>&1 || true + picard CollectGcBiasMetrics --version &> v_picard_collectgcbias.txt 2>&1 || true + MethylDackel --version &> v_methyldackel.txt + qualimap --version &> v_qualimap.txt || true + preseq &> v_preseq.txt + multiqc --version &> v_multiqc.txt + samblaster --version &> v_samblaster.txt + biscuit &>v_biscuit.txt 2>&1 || true + $baseDir/bin/scrape_software_versions.py &> software_versions_mqc.yaml + """ + + + } /* @@ -375,7 +447,7 @@ if( !params.bismark_index && params.aligner =~ /bismark/ ){ file fasta from ch_fasta_for_makeBismarkIndex output: - file "BismarkIndex" into ch_bismark_index_for_bismark_align, ch_bismark_index_for_bismark_methXtract + file "BismarkIndex" into ch_bismark_index_for_bismark_align script: aligner = params.aligner == 'bismark_hisat' ? '--hisat2' : '--bowtie2' @@ -391,7 +463,7 @@ if( !params.bismark_index && params.aligner =~ /bismark/ ){ /* * PREPROCESSING - Build bwa-mem index */ -if( !params.bwa_meth_index && params.aligner == 'bwameth' ){ +if( !params.bwa_meth_index && params.aligner == 'bwameth'){ process makeBwaMemIndex { tag "$fasta" publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? it : null }, mode: 'copy' @@ -409,10 +481,34 @@ if( !params.bwa_meth_index && params.aligner == 'bwameth' ){ } } +/* + * PREPROCESSING - Build bwa index, using biscuit + */ +if(!params.bwa_biscuit_index && params.aligner == 'biscuit' ){ + process makeBwaBISCUITIndex { + tag "$fasta" + publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? 
it : null }, mode: 'copy' + + input: + file fasta from ch_fasta_for_makeBwaMemIndex + + output: + file "${fasta}*" into ch_bwa_index_for_biscuit + + script: + """ + mkdir BiscuitIndex + cp $fasta BiscuitIndex/ + biscuit index $fasta + cp ${fasta}* BiscuitIndex + """ + } +} + /* * PREPROCESSING - Index Fasta file */ -if( !params.fasta_index && params.aligner == 'bwameth' ){ +if( !params.fasta_index && params.aligner == 'bwameth' || !params.fasta_index && params.aligner == 'biscuit' ){ process makeFastaIndex { tag "$fasta" publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? it : null }, mode: 'copy' @@ -421,7 +517,7 @@ if( !params.fasta_index && params.aligner == 'bwameth' ){ file fasta from ch_fasta_for_makeFastaIndex output: - file "${fasta}.fai" into ch_fasta_index_for_methyldackel + file "${fasta}.fai" into ch_fasta_index_for_methyldackel,ch_fasta_index_for_biscuitQC,ch_fasta_index_for_createVCF,ch_fasta_index_for_epiread script: """ @@ -458,7 +554,7 @@ process fastqc { * STEP 2 - Trim Galore! */ if( params.skip_trimming ){ - ch_trimmed_reads_for_alignment = ch_read_files_for_trim_galore + ch_trimmed_reads_for_alignment = ch_read_files_for_trim_galore ch_trim_galore_results_for_multiqc = Channel.from(false) } else { process trim_galore { @@ -483,27 +579,18 @@ if( params.skip_trimming ){ file "where_are_my_files.txt" script: - def c_r1 = clip_r1 > 0 ? "--clip_r1 $clip_r1" : '' - def c_r2 = clip_r2 > 0 ? "--clip_r2 $clip_r2" : '' - def tpc_r1 = three_prime_clip_r1 > 0 ? "--three_prime_clip_r1 $three_prime_clip_r1" : '' - def tpc_r2 = three_prime_clip_r2 > 0 ? "--three_prime_clip_r2 $three_prime_clip_r2" : '' - def rrbs = params.rrbs ? "--rrbs" : '' - def cores = 1 - if(task.cpus){ - cores = (task.cpus as int) - 4 - if (params.single_end) cores = (task.cpus as int) - 3 - if (cores < 1) cores = 1 - if (cores > 4) cores = 4 - } + c_r1 = params.clip_r1 > 0 ? "--clip_r1 ${params.clip_r1}" : '' + c_r2 = params.clip_r2 > 0 ? 
"--clip_r2 ${params.clip_r2}" : '' + tpc_r1 = params.three_prime_clip_r1 > 0 ? "--three_prime_clip_r1 ${params.three_prime_clip_r1}" : '' + tpc_r2 = params.three_prime_clip_r2 > 0 ? "--three_prime_clip_r2 ${params.three_prime_clip_r2}" : '' + rrbs = params.rrbs ? "--rrbs" : '' if( params.single_end ) { """ - trim_galore --fastqc --gzip $reads \ - $rrbs $c_r1 $tpc_r1 --cores $cores + trim_galore --fastqc --gzip $rrbs $c_r1 $tpc_r1 $reads """ } else { """ - trim_galore --fastqc --gzip --paired $reads \ - $rrbs $c_r1 $c_r2 $tpc_r1 $tpc_r2 --cores $cores + trim_galore --paired --fastqc --gzip $rrbs $c_r1 $c_r2 $tpc_r1 $tpc_r2 $reads """ } } @@ -531,7 +618,7 @@ if( params.aligner =~ /bismark/ ){ file knownsplices from ch_splicesites_for_bismark_hisat_align output: - set val(name), file("*.bam") into ch_bam_for_bismark_deduplicate, ch_bam_for_bismark_summary, ch_bam_for_preseq + set val(name), file("*.bam") into ch_bam_for_bismark_deduplicate, ch_bam_for_bismark_summary, ch_bam_for_samtools_sort_index_flagstat set val(name), file("*report.txt") into ch_bismark_align_log_for_bismark_report, ch_bismark_align_log_for_bismark_summary, ch_bismark_align_log_for_multiqc file "*.fq.gz" optional true file "where_are_my_files.txt" @@ -562,13 +649,6 @@ if( params.aligner =~ /bismark/ ){ cpu_per_multicore = 3 mem_per_multicore = (13.GB).toBytes() } - // Check if the user has specified this and overwrite if so - if(params.bismark_align_cpu_per_multicore) { - cpu_per_multicore = (params.bismark_align_cpu_per_multicore as int) - } - if(params.bismark_align_mem_per_multicore) { - mem_per_multicore = (params.bismark_align_mem_per_multicore as nextflow.util.MemoryUnit).toBytes() - } // How many multicore splits can we afford with the cpus we have? 
ccore = ((task.cpus as int) / cpu_per_multicore) as int // Check that we have enough memory, assuming 13GB memory per instance (typical for mouse alignment) @@ -595,12 +675,44 @@ if( params.aligner =~ /bismark/ ){ $splicesites """ } + + /* + * STEP 4 - Samtools sort bismark + */ + process samtools_sort_index_flagstat_bismark { + tag "$name" + publishDir "${params.outdir}/samtools", mode: 'copy', + saveAs: {filename -> + if(filename.indexOf("report.txt") > 0) "logs/$filename" + else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename + else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename + else null + } + + input: + set val(name), file(bam) from ch_bam_for_samtools_sort_index_flagstat + file wherearemyfiles from ch_wherearemyfiles_for_bismark_samtools_sort.collect() + + output: + set val(name), file("*.sorted.bam") into ch_bam_for_preseq,ch_bam_sorted_for_picard + file "where_are_my_files.txt" + script: + def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false + def sort_mem = avail_mem && avail_mem > 2 ? 
"-m ${avail_mem}G" : '' + """ + samtools sort $bam \\ + -@ ${task.cpus} $sort_mem \\ + -o ${bam.baseName}.sorted.bam + """ + } + + /* - * STEP 4 - Bismark deduplicate + * STEP 5 - Bismark deduplicate */ if( params.skip_deduplication || params.rrbs ) { - ch_bam_for_bismark_deduplicate.into { ch_bam_dedup_for_bismark_methXtract; ch_bam_dedup_for_qualimap } + ch_bam_for_bismark_deduplicate.into { ch_bam_dedup_for_bismark_methXtract; ch_dedup_bam_for_samtools_sort_index_flagstat } ch_bismark_dedup_log_for_bismark_report = Channel.from(false) ch_bismark_dedup_log_for_bismark_summary = Channel.from(false) ch_bismark_dedup_log_for_multiqc = Channel.from(false) @@ -614,7 +726,7 @@ if( params.aligner =~ /bismark/ ){ set val(name), file(bam) from ch_bam_for_bismark_deduplicate output: - set val(name), file("*.deduplicated.bam") into ch_bam_dedup_for_bismark_methXtract, ch_bam_dedup_for_qualimap + set val(name), file("*.deduplicated.bam") into ch_bam_dedup_for_bismark_methXtract,ch_dedup_bam_for_samtools_sort_index_flagstat set val(name), file("*.deduplication_report.txt") into ch_bismark_dedup_log_for_bismark_report, ch_bismark_dedup_log_for_bismark_summary, ch_bismark_dedup_log_for_multiqc script: @@ -624,9 +736,41 @@ if( params.aligner =~ /bismark/ ){ """ } } + + /* + * STEP 6 - Samtools sort bismark after dedup + */ + process samtools_sort_index_flagstat_dedup_bismark { + tag "$name" + publishDir "${params.outdir}/samtools", mode: 'copy', + saveAs: {filename -> + if(filename.indexOf("report.txt") > 0) "logs/$filename" + else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename + else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename + else null + } + + input: + set val(name), file(bam) from ch_dedup_bam_for_samtools_sort_index_flagstat + file wherearemyfiles from 
ch_wherearemyfiles_for_bismark_dedup_samtools_sort.collect() + + output: + set val(name), file("*.sorted.bam") into ch_bam_dedup_for_qualimap + file "where_are_my_files.txt" + + script: + def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false + def sort_mem = avail_mem && avail_mem > 2 ? "-m ${avail_mem}G" : '' + """ + samtools sort $bam \\ + -@ ${task.cpus} $sort_mem \\ + -o ${bam.baseName}.sorted.bam + """ + } + /* - * STEP 5 - Bismark methylation extraction + * STEP 7 - Bismark methylation extraction */ process bismark_methXtract { tag "$name" @@ -636,13 +780,11 @@ if( params.aligner =~ /bismark/ ){ else if( filename.indexOf("M-bias" ) > 0) "m-bias/$filename" else if( filename.indexOf(".cov" ) > 0 ) "methylation_coverage/$filename" else if( filename.indexOf("bedGraph" ) > 0 ) "bedGraph/$filename" - else if( filename.indexOf("CpG_report" ) > 0 ) "stranded_CpG_report/$filename" else "methylation_calls/$filename" } input: set val(name), file(bam) from ch_bam_dedup_for_bismark_methXtract - file index from ch_bismark_index_for_bismark_methXtract.collect() output: set val(name), file("*splitting_report.txt") into ch_bismark_splitting_report_for_bismark_report, ch_bismark_splitting_report_for_bismark_summary, ch_bismark_splitting_report_for_multiqc @@ -651,12 +793,11 @@ if( params.aligner =~ /bismark/ ){ script: comprehensive = params.comprehensive ? '--comprehensive --merge_non_CpG' : '' - cytosine_report = params.cytosine_report ? "--cytosine_report --genome_folder ${index} " : '' meth_cutoff = params.meth_cutoff ? 
"--cutoff ${params.meth_cutoff}" : '' multicore = '' if( task.cpus ){ // Numbers based on Bismark docs - ccore = ((task.cpus as int) / 3) as int + ccore = ((task.cpus as int) / 10) as int if( ccore > 1 ){ multicore = "--multicore $ccore" } @@ -672,7 +813,7 @@ if( params.aligner =~ /bismark/ ){ if(params.single_end) { """ bismark_methylation_extractor $comprehensive $meth_cutoff \\ - $multicore $buffer $cytosine_report \\ + $multicore $buffer \\ --bedGraph \\ --counts \\ --gzip \\ @@ -683,7 +824,7 @@ if( params.aligner =~ /bismark/ ){ } else { """ bismark_methylation_extractor $comprehensive $meth_cutoff \\ - $multicore $buffer $cytosine_report \\ + $multicore $buffer \\ --ignore_r2 2 \\ --ignore_3prime_r2 2 \\ --bedGraph \\ @@ -705,7 +846,7 @@ if( params.aligner =~ /bismark/ ){ /* - * STEP 6 - Bismark Sample Report + * STEP 8 - Bismark Sample Report */ process bismark_report { tag "$name" @@ -728,7 +869,7 @@ if( params.aligner =~ /bismark/ ){ } /* - * STEP 7 - Bismark Summary Report + * STEP 8 - Bismark Summary Report */ process bismark_summary { publishDir "${params.outdir}/bismark_summary", mode: 'copy' @@ -756,6 +897,7 @@ else { ch_bismark_mbias_for_multiqc = Channel.from(false) ch_bismark_reports_results_for_multiqc = Channel.from(false) ch_bismark_summary_results_for_multiqc = Channel.from(false) + } @@ -778,12 +920,12 @@ if( params.aligner == 'bwameth' ){ file wherearemyfiles from ch_wherearemyfiles_for_bwamem_align.collect() output: - set val(name), file('*.bam') into ch_bam_for_samtools_sort_index_flagstat, ch_bam_for_preseq + set val(name), file('*.bam') into ch_bam_for_samtools_sort_index_flagstat file "where_are_my_files.txt" script: fasta = bwa_meth_indices[0].toString() - '.bwameth' - '.c2t' - '.amb' - '.ann' - '.bwt' - '.pac' - '.sa' - prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/ + prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?(\.bz2)?$/ """ bwameth.py \\ --threads 
${task.cpus} \\ @@ -798,7 +940,7 @@ if( params.aligner == 'bwameth' ){ */ process samtools_sort_index_flagstat { tag "$name" - publishDir "${params.outdir}/bwa-mem_alignments", mode: 'copy', + publishDir "${params.outdir}/samtools", mode: 'copy', saveAs: {filename -> if(filename.indexOf("report.txt") > 0) "logs/$filename" else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename @@ -811,7 +953,7 @@ if( params.aligner == 'bwameth' ){ file wherearemyfiles from ch_wherearemyfiles_for_samtools_sort_index_flagstat.collect() output: - set val(name), file("${bam.baseName}.sorted.bam") into ch_bam_sorted_for_markDuplicates + set val(name), file("${bam.baseName}.sorted.bam") into ch_bam_sorted_for_markDuplicates,ch_bam_for_preseq,ch_bam_sorted_for_picard file "${bam.baseName}.sorted.bam.bai" into ch_bam_index file "${bam.baseName}_flagstat_report.txt" into ch_flagstat_results_for_multiqc file "${bam.baseName}_stats_report.txt" into ch_samtools_stats_results_for_multiqc @@ -882,8 +1024,8 @@ if( params.aligner == 'bwameth' ){ input: set val(name), file(bam) from ch_bam_dedup_for_methyldackel file bam_index from ch_bam_index_for_methyldackel - file fasta from ch_fasta_for_methyldackel - file fasta_index from ch_fasta_index_for_methyldackel + file fasta from ch_fasta_for_methyldackel.collect() + file fasta_index from ch_fasta_index_for_methyldackel.collect() output: file "${bam.baseName}*" into ch_methyldackel_results_for_multiqc @@ -905,17 +1047,310 @@ else { ch_samtools_stats_results_for_multiqc = Channel.from(false) ch_markDups_results_for_multiqc = Channel.from(false) ch_methyldackel_results_for_multiqc = Channel.from(false) + } + + +////////////////////////////////////////////////////// +/* + * Process with BISCUIT and assorted tools (samblaster) + */ +if( params.aligner == 'biscuit' ){ + process biscuit_align { + tag "$name" + publishDir "${params.outdir}/biscuit_alignments", mode: 
'copy', + saveAs: {filename -> + if( !params.save_align_intermeds && filename == "where_are_my_files.txt" ) filename + else if( params.save_align_intermeds && filename != "where_are_my_files.txt" ) filename + else null + } + + input: + set val(name), file(reads) from ch_trimmed_reads_for_alignment + file bwa_indices from ch_bwa_index_for_biscuit.collect() + file wherearemyfiles from ch_wherearemyfiles_for_biscuit_align.collect() + + output: + set val(name), file('*.bam') into ch_bam_for_markDuplicates, ch_bam_for_samtools_sort_index_flagstat + file "where_are_my_files.txt" + + script: + fasta = bwa_indices[0].toString() - '.bwameth' - '.c2t' - '.amb' - '.ann' - '.bwt' - '.pac' - '.sa' - '.fai' - '.par' - '.dau' -'.bis' + assembly = fasta.replaceAll(/\.\w+/,"") + prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?(\.bz2)?$/ + non_directional = params.non_directional ? 0 : 1 + // Paired-end or single end input files + input = params.pbat ? params.single_end ? 
reads + " -b 3" : "${reads[1]} ${reads[0]}" : reads + + + //pbat or not + + """ + biscuit align -M -b $non_directional -t ${task.cpus} $fasta $input | samtools view -Sb > ${name}.${assembly}.bam + """ + } + +/* +* STEP 4 - Mark duplicates +*/ + if( params.skip_deduplication || params.rrbs ) { + ch_bam_for_markDuplicates.into { ch_bam_dedup_for_methyldackel; ch_bam_dedup_for_qualimap; ch_samblaster_for_samtools_sort_index_flagstat } + ch_markDups_results_for_multiqc = Channel.from(false) + } else { + process markDuplicates_samblaster { + tag "$name" + + publishDir "${params.outdir}", mode: 'copy', + saveAs: {filename -> + if( filename.indexOf("log") > 0 ) "biscuit_markDuplicates/$filename" + else null + } + + input: + set val(name), file(bam) from ch_bam_for_markDuplicates + file wherearemyfiles from ch_wherearemyfiles_for_samblaster.collect() + + output: + set val(name), file("${bam.baseName}.samblaster.bam") into ch_bam_dedup_for_methyldackel, ch_samblaster_for_samtools_sort_index_flagstat + file "*log" into ch_samblaster_for_multiqc + + script: + def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false + def sort_mem = avail_mem && avail_mem > 2 ? "-m ${avail_mem}G" : '' + unmapped = params.single_end ? 
'--ignoreUnmated' : '' + + """ + samtools sort -n $bam -@ ${task.cpus} $sort_mem| samtools view -h | samblaster -M $unmapped -d "${bam.baseName}_discordant.sam" -s "${bam.baseName}_split.sam" -u "${bam.baseName}_.fastq" --excludeDups --addMateTags | samtools view -Sb > ${bam.baseName}.samblaster.bam + cp .command.log ${bam.baseName}.log + """ + } + } + + /* + * STEP 5.- samtools flagstat on samples + */ + process samtools_sort_index_flagstat_biscuit { + tag "$name" + publishDir "${params.outdir}", mode: 'copy', + saveAs: {filename -> + if(filename.indexOf("report.txt") > 0) "samtools/logs/$filename" + else if(filename.indexOf("sorted.bam") > 0) "biscuit_alignments/$filename" + else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename + else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename + else null + } + + input: + set val(name), file(bam) from ch_bam_for_samtools_sort_index_flagstat + set val(name_samblaster), file(samblaster_bam) from ch_samblaster_for_samtools_sort_index_flagstat + file wherearemyfiles from ch_wherearemyfiles_for_samtools_sort_index_flagstat.collect() + + output: + set val(name_samblaster), file("*.sorted.bam") into ch_bam_dedup_for_qualimap,ch_bam_for_preseq,ch_bam_sorted_for_pileup, ch_bam_sorted_for_epiread, ch_bam_noDups_for_QC,ch_bam_sorted_for_picard + file "*.sorted.bam.bai" into ch_bam_index_sorted_for_pileup,ch_bam_index_for_epiread,ch_bam_index_noDups_for_QC + file "${samblaster_bam.baseName}_flagstat_report.txt" into ch_flagstat_results_biscuit_for_multiqc + file "${samblaster_bam.baseName}_stats_report.txt" into ch_samtools_stats_results_biscuit_for_multiqc + file "where_are_my_files.txt" + + + script: + def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false + def sort_mem = avail_mem && avail_mem > 2 ? 
"-m ${avail_mem}G" : '' + """ + samtools sort $samblaster_bam \\ + -@ ${task.cpus} $sort_mem -l 9 \\ + -o ${samblaster_bam.baseName}.sorted.bam + samtools index ${samblaster_bam.baseName}.sorted.bam + + samtools flagstat ${samblaster_bam.baseName}.sorted.bam > ${samblaster_bam.baseName}_flagstat_report.txt + samtools stats ${samblaster_bam.baseName}.sorted.bam > ${samblaster_bam.baseName}_stats_report.txt + """ + } + + + /* + * STEP 6 - Create vcf file with pileup, to extract methylation + */ + process createVCF { + tag "$name" + publishDir "${params.outdir}/methylation_extract", mode: 'copy', + saveAs: {filename -> + if( !params.save_pileup_file && filename == "where_are_my_files.txt") filename + else if( filename.indexOf("vcf.gz") > 0 && params.save_pileup_file && filename != "where_are_my_files.txt") filename + else null + } + + input: + set val(name), file(bam) from ch_bam_sorted_for_pileup + file bam_index from ch_bam_index_sorted_for_pileup + file fasta from ch_fasta_for_pileup.collect() + file fasta_index from ch_fasta_index_for_createVCF.collect() + + output: + file "${bam.baseName}.*" + set val(name), file("*.vcf.gz*") into ch_vcf_biscuit_qc ,ch_vcf_for_bedgraph,ch_vcf_for_epiread + + script: + filter_duplication = params.skip_deduplication || params.rrbs ? '-u' : '' + all_contexts = params.comprehensive ? 'c, cg, ch, hcg, gch' : 'cg' + """ + biscuit pileup -q ${task.cpus} $filter_duplication $fasta ${bam} -o "${bam.baseName}".vcf + bgzip -@ ${task.cpus} -f "${bam.baseName}.vcf" + tabix -f -p vcf "${bam.baseName}.vcf.gz" + """ + } + + + + /* + * STEP 7 - intersect vcf file with soloWCGW file + */ + process createBedgraph { + tag "$name" + publishDir "${params.outdir}/methylation_extract", mode: 'copy' + + + input: + set val(name), file(vcf) from ch_vcf_for_bedgraph + + output: + set val(name), file("*bedgraph" ) into ch_bedgraph_for_intersect_soloWCGW + + script: + min_depth = params.min_depth > 0 ? 
"${params.min_depth}" : '1' + all_contexts = params.comprehensive ? 'c, cg, ch, hcg, gch' : 'cg' + """ + biscuit vcf2bed -k $min_depth -t $all_contexts "${vcf[0]}" > "${vcf[0].baseName}.bedgraph" + + """ + } + + if (params.soloWCGW_file) { + process intersect_soloWCGW { + tag "$name" + publishDir "${params.outdir}/methylation_extract", mode: 'copy' + + + input: + set val(name), file(bedgraph) from ch_bedgraph_for_intersect_soloWCGW + file soloWGCW from ch_soloWCGW_for_biscuitVCF.collect() + + output: + file "*bedgraph" + + script: + """ + bedtools intersect -wa -a "${bedgraph[0].baseName}.bedgraph" -b $soloWGCW > "${bedgraph[0].baseName}_soloWCGW.bedgraph" + """ + } + } + + + if (params.epiread) { + /*************************** + THE PROCESS IS IN PROGRESS! + ****************************/ + process get_SNP_file { + tag "$name" + publishDir "${params.outdir}", mode: 'copy', + saveAs: {filename -> + if(params.save_snp_file && filename != "where_are_my_files.txt") "epireads/snp/$filename" + else null + } + + input: + set val(name), file(vcf) from ch_vcf_for_epiread + + output: + file "${vcf[0].baseName}.snp.bed" into ch_snp_for_epiread + script: + """ + biscuit vcf2bed -t snp "${vcf[0]}" > "${vcf[0].baseName}.snp.bed" + """ + } + + process epiread_convertion { + tag "$name" + publishDir "${params.outdir}/epireads", mode: 'copy' + + input: + set val(name), file(bam) from ch_bam_sorted_for_epiread + file bam_index from ch_bam_index_for_epiread + file fasta from ch_fasta_for_epiread.collect() + file fasta_index from ch_fasta_index_for_epiread.collect() + file snp from ch_snp_for_epiread + + output: + file "*epiread" + + script: + snp_file = (snp.size()>0) ? "-B " + snp.toString() : '' + if (params.single_end) { + """ + biscuit epiread -q ${task.cpus} $fasta $bam $snp_file -o "${bam.baseName}".epiread + """ + } else { + """ + biscuit epiread -q ${task.cpus} $fasta $bam $snp_file | sort --parallel=${task.cpus} -T . 
-k2,2 -k3,3n | awk 'BEGIN{qname="";rec=""} qname==\$2{print rec"\t"\$5"\t"\$6"\t"\$7"\t"\$8;qname=""} qname!=\$2{qname=\$2;rec=\$1"\t"\$4"\t"\$5"\t"\$6"\t"\$7"\t"\$8;pair=\$3}' > "${bam.baseName}".epiread + """ + } + } + } + + + process biscuit_QC { + tag "$bam_name" + publishDir "${params.outdir}/biscuit_QC", mode: 'copy' + + input: + set val(name), file(vcf) from ch_vcf_biscuit_qc + set val(bam_name), file(bam) from ch_bam_noDups_for_QC + file fasta from ch_fasta_for_biscuitQC.collect() + file fasta_index from ch_fasta_index_for_biscuitQC.collect() + file assets from ch_assets_dir_for_biscuit_qc.collect() + output: + file "*_biscuitQC" into ch_QC_results_for_multiqc + + script: + assembly = fasta.toString().replaceAll(/\.\w+/,"") + """ + $baseDir/bin/biscuit_QC.sh -v ${vcf[0]} -o ${bam_name}.${assembly}_biscuitQC $assets $fasta ${bam_name}.${assembly} ${bam} -p ${task.cpus} + """ + } + +} // end of biscuit if block +else { + ch_flagstat_results_biscuit_for_multiqc = Channel.from(false) + ch_samtools_stats_results_biscuit_for_multiqc = Channel.from(false) + ch_markDups_results_for_multiqc = Channel.from(false) + ch_methyldackel_results_for_multiqc = Channel.from(false) + ch_QC_results_for_multiqc = Channel.from(false) + ch_samblaster_for_multiqc = Channel.from(false) + ch_vcf_biscuit_qc = Channel.from(false) + ch_assets_dir_for_biscuit_qc = Channel.from(false) + ch_bam_sorted_for_pileup = Channel.from(false) + ch_bam_index_sorted_for_pileup = Channel.from(false) + ch_bam_noDups_for_QC = Channel.from(false) + ch_bam_sorted_for_epiread = Channel.from(false) + ch_bam_index_for_epiread = Channel.from(false) + } + +//////////////////////////////////////////////////////// + + + + + /* * STEP 8 - Qualimap */ process qualimap { tag "$name" publishDir "${params.outdir}/qualimap", mode: 'copy' - - input: + + input: set val(name), file(bam) from ch_bam_dedup_for_qualimap output: @@ -924,23 +1359,115 @@ process qualimap { script: gcref = 
params.genome.toString().startsWith('GRCh') ? '-gd HUMAN' : '' gcref = params.genome.toString().startsWith('GRCm') ? '-gd MOUSE' : '' - def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false - def sort_mem = avail_mem && avail_mem > 2 ? "-m ${avail_mem}G" : '' - """ - samtools sort $bam \\ - -@ ${task.cpus} $sort_mem \\ - -o ${bam.baseName}.sorted.bam - qualimap bamqc $gcref \\ - -bam ${bam.baseName}.sorted.bam \\ - -outdir ${bam.baseName}_qualimap \\ - --collect-overlap-pairs \\ - --java-mem-size=${task.memory.toGiga()}G \\ - -nt ${task.cpus} - """ + """ + qualimap bamqc $gcref \\ + -bam ${bam.baseName}.bam \\ + -outdir ${bam.baseName}_qualimap \\ + --collect-overlap-pairs \\ + --java-mem-size=${task.memory.toGiga()}G \\ + -nt ${task.cpus} + + """ } + + /* + * STEP 9 - Picard - Preparation step + */ + process prepareGenomeToPicard { + publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir }, + saveAs: { (params.save_reference && it.indexOf("dict") >0) ? it : null }, mode: 'copy' + + input: + file fasta from ch_fasta_for_picard + file file_try from ch_try + output: + file "${fasta.baseName}.picard.fa" into ch_fasta_picard_for_picard + file "${fasta.baseName}.picard.dict" into ch_fasta_picard_dict_for_picard + + + script: + if( !task.memory ){ + log.info "[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this." 
+ avail_mem = 3 + } else { + avail_mem = task.memory.toGiga() + } + """ + mv ${fasta} ${fasta.baseName}.picard.fa + picard -Xmx${avail_mem}g CreateSequenceDictionary \\ + R=${fasta.baseName}.picard.fa \\ + O=${fasta.baseName}.picard.dict + """ + } + + + + /* + * STEP 10 - Picard InsertSizeMetrics and GcBiasMetrics + */ + process picardMetrics { + tag "$name" + publishDir "${params.outdir}/picardMetrics", mode: 'copy', + saveAs: { filename -> + if (filename.indexOf(".txt") > 0) filename + else if (filename.indexOf(".pdf") > 0) "pdf/$filename" + else null + } + input: + set val(name), file(bam) from ch_bam_sorted_for_picard + file fasta from ch_fasta_picard_for_picard.collect() + file dict from ch_fasta_picard_dict_for_picard.collect() + + output: + file "${bam.baseName}.*.pdf" + file "${bam.baseName}.*.txt" into ch_picard_results_for_multiqc + + script: + if( !task.memory ){ + log.info "[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this." + avail_mem = 3 + } else { + avail_mem = task.memory.toGiga() + } + """ + picard -Xmx${avail_mem}g CollectInsertSizeMetrics \\ + INPUT=$bam \\ + OUTPUT=${bam.baseName}.insert_size_metrics.txt \\ + HISTOGRAM_FILE=${bam.baseName}.insert_size_histogram.pdf \\ + ASSUME_SORTED=true \\ + VALIDATION_STRINGENCY=LENIENT + set +e + + + picard -Xmx${avail_mem}g CollectGcBiasMetrics \\ + INPUT=$bam \\ + OUTPUT=${bam.baseName}.gc_bias_metrics.txt \\ + CHART=${bam.baseName}.gc_bias_metrics.pdf \\ + SUMMARY_OUTPUT=${bam.baseName}.summary_metrics.txt \\ + ASSUME_SORTED=true \\ + IS_BISULFITE_SEQUENCED=true \\ + REFERENCE_SEQUENCE=$fasta \\ + VALIDATION_STRINGENCY=LENIENT + + [ ! "\$?" -eq "0" ] && picard -Xmx${avail_mem}g ReorderSam I=$bam O=${bam.baseName}.picard.bam SEQUENCE_DICTIONARY=$fasta VALIDATION_STRINGENCY=LENIENT TMP_DIR=. 
&& picard -Xmx${avail_mem}g CollectGcBiasMetrics \\ + INPUT=${bam.baseName}.picard.bam \\ + OUTPUT=${bam.baseName}.gc_bias_metrics.txt \\ + CHART=${bam.baseName}.gc_bias_metrics.pdf \\ + SUMMARY_OUTPUT=${bam.baseName}.summary_metrics.txt \\ + ASSUME_SORTED=true \\ + IS_BISULFITE_SEQUENCED=true \\ + REFERENCE_SEQUENCE=$fasta \\ + VALIDATION_STRINGENCY=LENIENT + echo "fine" + """ + + } + + + /* - * STEP 9 - preseq + * STEP 11 - preseq */ process preseq { tag "$name" @@ -953,18 +1480,14 @@ process preseq { file "${bam.baseName}.ccurve.txt" into preseq_results script: - def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false - def sort_mem = avail_mem && avail_mem > 2 ? "-m ${avail_mem}G" : '' - """ - samtools sort $bam \\ - -@ ${task.cpus} $sort_mem \\ - -o ${bam.baseName}.sorted.bam - preseq lc_extrap -v -B ${bam.baseName}.sorted.bam -o ${bam.baseName}.ccurve.txt - """ + """ + preseq lc_extrap -v -B ${bam.baseName}.bam -o ${bam.baseName}.ccurve.txt + """ + } -/* - * STEP 10 - MultiQC +/* + * STEP 12 - MultiQC */ process multiqc { publishDir "${params.outdir}/MultiQC", mode: 'copy' @@ -981,10 +1504,16 @@ process multiqc { file ('bismark/*') from ch_bismark_summary_results_for_multiqc.collect().ifEmpty([]) file ('samtools/*') from ch_flagstat_results_for_multiqc.flatten().collect().ifEmpty([]) file ('samtools/*') from ch_samtools_stats_results_for_multiqc.flatten().collect().ifEmpty([]) - file ('picard/*') from ch_markDups_results_for_multiqc.flatten().collect().ifEmpty([]) + file ('samtools/*') from ch_flagstat_results_biscuit_for_multiqc.flatten().collect().ifEmpty([]) + file ('samtools/*') from ch_samtools_stats_results_biscuit_for_multiqc.flatten().collect().ifEmpty([]) + file ('bwa-mem_markDuplicates/*') from ch_markDups_results_for_multiqc.flatten().collect().ifEmpty([]) file ('methyldackel/*') from ch_methyldackel_results_for_multiqc.flatten().collect().ifEmpty([]) file ('qualimap/*') from 
ch_qualimap_results_for_multiqc.collect().ifEmpty([]) file ('preseq/*') from preseq_results.collect().ifEmpty([]) + file ('biscuit_QC/*') from ch_QC_results_for_multiqc.collect().ifEmpty([]) + file ('biscuit_markDuplicates/*') from ch_samblaster_for_multiqc.collect().ifEmpty([]) + file ('picardMetrics/*') from ch_picard_results_for_multiqc.collect().ifEmpty([]) + file ('software_versions/*') from ch_software_versions_yaml_for_multiqc.collect() file workflow_summary from create_workflow_summary(summary) @@ -998,12 +1527,12 @@ process multiqc { rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' """ multiqc -f $rtitle $rfilename --config $multiqc_config . \\ - -m custom_content -m picard -m qualimap -m bismark -m samtools -m preseq -m cutadapt -m fastqc + -m custom_content -m picard -m qualimap -m bismark -m samtools -m preseq -m cutadapt -m fastqc -m biscuit -m samblaster """ } /* - * STEP 11 - Output Description HTML + * STEP 13 - Output Description HTML */ process output_documentation { publishDir "${params.outdir}/pipeline_info", mode: 'copy' @@ -1131,7 +1660,6 @@ workflow.onComplete { checkHostname() log.info "${c_purple}[nf-core/methylseq]${c_red} Pipeline completed with errors${c_reset}" } - } def nfcoreHeader() { diff --git a/parameters.settings.json b/parameters.settings.json index 39003b18..c0446847 100644 --- a/parameters.settings.json +++ b/parameters.settings.json @@ -82,6 +82,15 @@ "render": "check-box", "default_value": false, "type": "boolean" + }, + { + "name": "swift", + "label": "Library type: Swift", + "usage": "Set trimming settings for Swift libraries", + "group": "Trimming presets", + "render": "check-box", + "default_value": false, + "type": "boolean" }, { "name": "single_end", @@ -141,7 +150,8 @@ "choices": [ "bismark", "bismark_hisat", - "bwameth" + "bwameth", + "biscuit" ], "default_value": "bismark", "group": "Alignment" @@ -177,7 +187,19 @@ "EF2", "Sbi1",
"Sscrofa10.2", - "AGPv3" + "AGPv3", + "hg38", + "hg19", + "mm10", + "bosTau8", + "ce10", + "canFam3", + "danRer10", + "dm6", + "panTro4", + "rn6", + "sacCer3", + "susScr3" ], "default_value": "" }, @@ -200,6 +222,16 @@ "type": "string", "pattern": ".*", "default_value": "" + }, + { + "name": "bwa_biscuit_index", + "label": "bwa biscuit index", + "usage": "Path to bwa-biscuit index", + "group": "Alignment", + "render": "file", + "type": "string", + "pattern": ".*", + "default_value": "" }, { "name": "fasta_index", @@ -220,6 +252,26 @@ "type": "string", "pattern": ".*", "default_value": "" + }, + { + "name": "assets_dir", + "label": "assets directory", + "usage": "Path to assets directory for biscuit_QC", + "group": "Advanced: biscuit workflow only", + "render": "file", + "type": "string", + "pattern": ".*", + "default_value": "" + }, + { + "name": "soloWCGW_file", + "label": "soloWCGW file", + "usage": "Path to soloWCGW file to intersect with biscuit bedgraph files", + "group": "Advanced: biscuit workflow only", + "render": "file", + "type": "string", + "pattern": ".*", + "default_value": "" }, { "name": "known_splices", @@ -278,6 +330,16 @@ "pattern": "^$|(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$)", "type": "string", "default_value": "" + }, + { + "name": "email_on_fail", + "label": "Your email address", + "usage": "Your email address, required to receive failure notification.", + "group": "Pipeline defaults", + "render": "textfield", + "pattern": "^$|(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$)", + "type": "string", + "default_value": "" }, { "name": "max_cpus", @@ -311,7 +373,16 @@ { "name": "save_trimmed", "label": "Save Trimmed FastQ files", - "usage": "Save the trimmed FastQ files the the results directory.", + "usage": "Save the trimmed FastQ files to the results directory.", + "group": "Pipeline defaults", + "render": "check-box", + "default_value": false, + "type": "boolean" + }, + { + "name": "save_pileup_file", + "label": "Save vcf-pileup
and index-vcf files", + "usage": "Save the vcf-pileup and index-vcf files from biscuit aligner to the results directory.", "group": "Pipeline defaults", "render": "check-box", "default_value": false, @@ -476,15 +547,6 @@ "type": "boolean", "group": "Advanced: bismark workflow only" }, - { - "name": "cytosine_report", - "label": "Cytosine report", - "usage": "Reports position, strand, trinucleotide context and methylation state of all cytosines", - "render": "none", - "default_value": false, - "type": "boolean", - "group": "Advanced: bismark workflow only" - }, { "name": "meth_cutoff", "label": "Methylation cutoff", From 43235460d9445dd0e76a59d97f28b3c1cb9401b2 Mon Sep 17 00:00:00 2001 From: ekushele Date: Wed, 25 Mar 2020 14:42:00 +0200 Subject: [PATCH 02/56] workflow files --- .github/workflows/branch.yml | 17 ++++++++++++ .github/workflows/ci.yml | 30 +++++++++++++++++++++ .github/workflows/linting.yml | 50 +++++++++++++++++++++++++++++++++++ .travis.yml | 50 ----------------------------------- 4 files changed, 97 insertions(+), 50 deletions(-) create mode 100644 .github/workflows/branch.yml create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/linting.yml delete mode 100644 .travis.yml diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml new file mode 100644 index 00000000..8385e276 --- /dev/null +++ b/.github/workflows/branch.yml @@ -0,0 +1,17 @@ +name: nf-core branch protection +# This workflow is triggered on PRs to master branch on the repository +# It fails when someone tries to make a PR against the nf-core `master` branch instead of `dev` +on: + pull_request: + branches: + - master + +jobs: + test: + runs-on: ubuntu-18.04 + steps: + # PRs are only ok if coming from an nf-core `dev` branch or a fork `patch` branch + - name: Check PRs + run: | + { [[ $(git remote get-url origin) == *nf-core/methylseq ]] && [[ ${GITHUB_HEAD_REF} = "dev" ]]; } || [[ ${GITHUB_HEAD_REF} == "patch" ]] + diff --git 
a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..f9037712 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,30 @@ +name: nf-core CI +# This workflow is triggered on pushes and PRs to the repository. +# It runs the pipeline with the minimal test dataset to check that it completes without any syntax errors +on: [push, pull_request] + +jobs: + test: + env: + NXF_VER: ${{ matrix.nxf_ver }} + NXF_ANSI_LOG: false + runs-on: ubuntu-latest + strategy: + matrix: + # Nextflow versions: check pipeline minimum and current latest + nxf_ver: ['19.10.0', ''] + steps: + - uses: actions/checkout@v2 + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + - name: Pull docker image + run: | + docker pull nfcore/methylseq:dev + docker tag nfcore/methylseq:dev nfcore/methylseq:dev + - name: Run pipeline with test data + run: | + # TODO nf-core: You can customise CI pipeline run tests as required + # (eg. adding multiple test runs with different parameters) + nextflow run ${GITHUB_WORKSPACE} -profile test,docker diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml new file mode 100644 index 00000000..1e0827a8 --- /dev/null +++ b/.github/workflows/linting.yml @@ -0,0 +1,50 @@ +name: nf-core linting +# This workflow is triggered on pushes and PRs to the repository. 
+# It runs the `nf-core lint` and markdown lint tests to ensure that the code meets the nf-core guidelines +on: + push: + pull_request: + release: + types: [published] + +jobs: + Markdown: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-node@v1 + with: + node-version: '10' + - name: Install markdownlint + run: npm install -g markdownlint-cli + - name: Run Markdownlint + run: markdownlint ${GITHUB_WORKSPACE} -c ${GITHUB_WORKSPACE}/.github/markdownlint.yml + YAML: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + - uses: actions/setup-node@v1 + with: + node-version: '10' + - name: Install yaml-lint + run: npm install -g yaml-lint + - name: Run yaml-lint + run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml") + nf-core: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + - uses: actions/setup-python@v1 + with: + python-version: '3.6' + architecture: 'x64' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install nf-core + - name: Run nf-core lint + run: nf-core lint ${GITHUB_WORKSPACE} diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index b797172c..00000000 --- a/.travis.yml +++ /dev/null @@ -1,50 +0,0 @@ -sudo: required -language: python -jdk: openjdk8 -services: docker -python: '3.6' -cache: pip -matrix: - fast_finish: true - -before_install: - # PRs to master are only ok if coming from dev branch - - '[ $TRAVIS_PULL_REQUEST = "false" ] || [ $TRAVIS_BRANCH != "master" ] || ([ $TRAVIS_PULL_REQUEST_SLUG = $TRAVIS_REPO_SLUG ] && ([ $TRAVIS_PULL_REQUEST_BRANCH = "dev" ] || [ $TRAVIS_PULL_REQUEST_BRANCH = "patch" ]))' - # Pull the docker image first so the test doesn't wait for this - - docker pull nfcore/methylseq:dev - # Fake the tag locally so that the pipeline runs properly - # Looks weird when this is :dev to :dev, but makes sense 
when testing code for a release (:dev to :1.0.1) - - docker tag nfcore/methylseq:dev nfcore/methylseq:1.4.1 - -install: - # Install Nextflow - - mkdir /tmp/nextflow && cd /tmp/nextflow - - wget -qO- get.nextflow.io | bash - - sudo ln -s /tmp/nextflow/nextflow /usr/local/bin/nextflow - # Install nf-core/tools - - pip install --upgrade pip - - pip install nf-core - # Reset - - mkdir ${TRAVIS_BUILD_DIR}/tests && cd ${TRAVIS_BUILD_DIR}/tests - # Install markdownlint-cli - - sudo apt-get install npm && npm install -g markdownlint-cli - -env: - - ALIGNER=bismark ALIGNER_REF="--bismark_index ${TRAVIS_BUILD_DIR}/tests/results/reference_genome/BismarkIndex/" NXF_VER='19.04.0' - - ALIGNER=bismark ALIGNER_REF="--bismark_index ${TRAVIS_BUILD_DIR}/tests/results/reference_genome/BismarkIndex/" - - ALIGNER=bismark_hisat ALIGNER_REF="--bismark_index ${TRAVIS_BUILD_DIR}/tests/results/reference_genome/BismarkIndex/" NXF_VER='19.04.0' - - ALIGNER=bismark_hisat ALIGNER_REF="--bismark_index ${TRAVIS_BUILD_DIR}/tests/results/reference_genome/BismarkIndex/" - - ALIGNER=bwameth ALIGNER_REF="--bwa_meth_index ${TRAVIS_BUILD_DIR}/tests/results/reference_genome/genome.fa" NXF_VER='19.04.0' - - ALIGNER=bwameth ALIGNER_REF="--bwa_meth_index ${TRAVIS_BUILD_DIR}/tests/results/reference_genome/genome.fa" - -script: - # Lint the pipeline code - - nf-core lint ${TRAVIS_BUILD_DIR} - # Lint the documentation - - markdownlint ${TRAVIS_BUILD_DIR} -c ${TRAVIS_BUILD_DIR}/.github/markdownlint.yml - # Run, build reference genome - - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --aligner $ALIGNER --save_reference - # Basic run with supplied reference - - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --aligner $ALIGNER $ALIGNER_REF - # Run, RRBS mode with no trimming - - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --aligner $ALIGNER --skip_trimming --rrbs From 67175452deaa1565b2920dc683e48449a88c6ff8 Mon Sep 17 00:00:00 2001 From: ekushele Date: Wed, 25 Mar 2020 20:47:07 +0200 
Subject: [PATCH 03/56] change docs: output.md and usage.md. Update CHANGELOG.md accordingly. Update enviroment.yml to newer version. Change main.nf slightly, on definision of publish output directory --- .github/workflows/ci.yml | 2 +- CHANGELOG.md | 15 +++++++ README.md | 2 +- docs/output.md | 92 ++++++++++++++++++++++++++++++++++++++-- docs/usage.md | 57 ++++++++++++++++++++++--- environment.yml | 11 ++--- main.nf | 67 ++++++++++++++--------------- 7 files changed, 195 insertions(+), 51 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f9037712..6c238607 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,7 +22,7 @@ jobs: - name: Pull docker image run: | docker pull nfcore/methylseq:dev - docker tag nfcore/methylseq:dev nfcore/methylseq:dev + docker tag nfcore/methylseq:dev nfcore/methylseq:1.4.1 - name: Run pipeline with test data run: | # TODO nf-core: You can customise CI pipeline run tests as required diff --git a/CHANGELOG.md b/CHANGELOG.md index b1cb8dcc..6a328c4b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,20 @@ + # nf-core/methylseq +## V new-my-updates + +### New features + +* Added Picard CollectInsertSizeMetrics and Picard CollectGcBiasMetrics +* Improved Qualimap and preseq by adding `samtools sort` and `samtools index` step in the Bismark aligner +* Added biscuit aligner as an optional aligner, with all relevant steps (alignment, mark duplicates, methylation extraction, QC for biscuit, and optional epiread file creation). 
+ +### Software updates + +* _new dependency_: samblaster `0.1.24` +* _new dependency_: bedtools `2.29.1` +* _new_: biscuit tool `0.3.11` + ## [v1.4.1](https://github.com/nf-core/methylseq/releases/tag/1.4.1) - 2019-12-11 ### New features diff --git a/README.md b/README.md index 530063f5..3865c016 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Choose between workflows by using `--aligner bismark` (default, uses bowtie2 for | Align Reads | Bismark | bwa-meth | biscuit | | Deduplicate Alignments | Bismark | Picard MarkDuplicates | samblaster | | Extract methylation calls | Bismark | MethylDackel | biscuit | -| Sample report | Bismark | - | biscuit QC | +| Sample Report | Bismark | - | biscuit QC | | Summary Report | Bismark | - | - | | Picard Metrics | Picard | Picard | Picard | | Alignment QC | Qualimap | Qualimap | Qualimap | diff --git a/docs/output.md b/docs/output.md index 58776b7c..5cbed6ac 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,3 +1,4 @@ + # nf-core/methylseq Output This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. 
@@ -15,8 +16,10 @@ and processes data using the following steps: * [Deduplication](#deduplication) - deduplicating reads * [Methylation Extraction](#methylation-extraction) - calling cytosine methylation steps * [Bismark Reports](#bismark-reports) - single-sample and summary analysis reports +* [Biscuit Reports](#biscuit-reports) - single-sample analysis reports for biscuit aligner * [Qualimap](#qualimap) - tool for genome alignments QC * [Preseq](#preseq) - tool for estimating sample complexity +* [Picard](#picard) - tool for generating metrics of statistics * [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline * [Pipeline Info](#pipeline-info) - reports from nextflow about the pipeline run @@ -57,7 +60,7 @@ Single-end data will have slightly different file names and only one FastQ file ### Alignment -Bismark and bwa-meth convert all Cytosines contained within the sequenced reads to Thymine _in-silico_ and then align against a three-letter reference genome. This method avoids methylation-specific alignment bias. The alignment produces a BAM file of genomic alignments. +Bismark and bwa-meth convert all Cytosines contained within the sequenced reads to Thymine _in-silico_ and then align against a three-letter reference genome. This method avoids methylation-specific alignment bias. The alignment produces a BAM file of genomic alignments. **Bismark output directory: `results/bismark_alignments/`** _Note that bismark can use either use Bowtie2 (default) or HISAT2 as alignment tool and the output file names will not differ between the options._ @@ -87,9 +90,27 @@ _Note that bismark can use either use Bowtie2 (default) or HISAT2 as alignment t * `logs/sample_stats.txt` * Summary file giving lots of metrics about the aligned BAM file. +**biscuit output directory: `results/biscuit_alignments/`** + +* `sample.assembly.bam` + * Aligned reads in BAM format. 
+ * **NB:** Only saved if `--save_align_intermeds` is used +* `sample.assembly.sorted.bam` + * Aligned reads in a sorted BAM file. + * **NB:** Only saved if `--save_align_intermeds`, `--skip_deduplication` or `--rrbs` is specified when running the pipeline. + * **NB:** If `--skip_deduplication` is not specified when running the pipeline, file name would be `sample.assembly.samblaster.sorted.bam` +* `sample.assembly.sorted.bam.bai` + * Index of sorted BAM file + * **NB:** Only saved if `--save_align_intermeds`, `--skip_deduplication` or `--rrbs` is specified when running the pipeline. + * **NB:** If `--skip_deduplication` is not specified when running the pipeline, file name would be `sample.assembly.samblaster.sorted.bam.bai` +* `logs/sample_flagstat.txt` + * Summary file describing the number of reads which aligned in different ways. +* `logs/sample_stats.txt` + * Summary file giving lots of metrics about the aligned BAM file. + ### Deduplication -This step removes alignments with identical mapping position to avoid technical duplication in the results. Note that it is skipped if `--save_align_intermeds`, `--skip_deduplication` or `--rrbs` is specified when running the pipeline. +This step removes alignments with identical mapping position to avoid technical duplication in the results. Note that it is skipped if `--skip_deduplication` or `--rrbs` is specified when running the pipeline. **Bismark output directory: `results/bismark_deduplicated/`** @@ -101,6 +122,7 @@ This step removes alignments with identical mapping position to avoid technical **bwa-meth output directory: `results/bwa-mem_markDuplicates/`** > **NB:** The bwa-meth step doesn't remove duplicate reads from the BAM file, it just labels them. +> * `sample.sorted.markDups.bam` * BAM file with only unique alignments. 
@@ -109,11 +131,21 @@ This step removes alignments with identical mapping position to avoid technical * `logs/sample.sorted.markDups_metrics.txt` * Log file giving summary statistics about deduplication. + +**biscuit output directory: `results/biscuit_markDuplicates/`** + +> **NB:** The biscuit (samblaster) step doesn't remove duplicate reads from the BAM file, it just labels them. +> + + +* `sample.assembly.txt` + * Log file giving summary statistics about deduplication. + ### Methylation Extraction The methylation extractor step takes a BAM file with aligned reads and generates files containing cytosine methylation calls. It produces a few different output formats, described below. -Note that the output may vary a little depending on whether you specify `--comprehensive` or `--non_directional` when running the pipeline. +Note that the output may vary a little depending on whether you specify `--comprehensive` or `--non_directional` or `--skip_deduplication` or `--rrbs`when running the pipeline. Filename abbreviations stand for the following reference alignment strands: @@ -142,6 +174,28 @@ Filename abbreviations stand for the following reference alignment strands: * `sample.bedGraph` * Methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format. +**biscuit workflow output directory: `results/methylation_extract/`** + +* `sample.bedgraph` + * Methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format. +* `sample.bedgraph` + * Methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format, intersected with soloWCGW file. + * **NB:** Created only if `--soloWCGW_file` is specified. **EXPERIMENTAL!** + * `sample.vcf.gz` + * VCF file with the pileup information, used for creating the bedgraph file. + * **NB:** Only saved if `--save_pileup_file` is specified when running the pipeline. 
+ * `sample.vcf.gz.tbi` + * Index file for `sample.vcf.gz` + * **NB:** Only saved if `--save_pileup_file` is specified when running the pipeline. +> **NB** if `--epiread` is specified in the pipeline, then: +> **output directory:** `results/epireads` : + > * `sample.epiread` + Storing CpG retention pattern as well as SNP information on the same read in a compact way +> * `snp/sample.snp.bed` + > Storing the SNP information of the `sample.epiread` file. **EXTRACTING SNP PROCESS IS IN PROGRESS!** + **NB:** Only saved if `--save_snp_file` is specified when running the pipeline. +> + ### Bismark Reports Bismark generates a HTML reports describing results for each sample, as well as a summary report for the whole run. @@ -150,6 +204,14 @@ Bismark generates a HTML reports describing results for each sample, as well as **Output directory: `results/bismark_summary`** +### Biscuit Reports + +Biscuit generates a directory with different statistical reports describing results for each sample. The statistical reports are converted to plots plotted in the MultiQC report. + +**Output directory: `results/biscuit_QC/sample_biscuitQC/`** + + + ## Qualimap [Qualimap BamQC](http://qualimap.bioinfo.cipf.es/doc_html/analysis.html#bam-qc) is a general-use quality-control tool that generates a number of statistics about aligned BAM files. It's not specific to bisulfite data, but it produces several useful stats - for example, insert size and coverage statistics. @@ -172,6 +234,30 @@ Note that these are predictive numbers only, not absolute. The MultiQC plot can * `sample_ccurve.txt` * This file contains plot values for the complexity curve, plotted in the MultiQC report. +## Picard + +[Picard](https://broadinstitute.github.io/picard/picard-metric-definitions.html) is a set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) data and formats such as SAM/BAM/CRAM and VCF. 
+The two metrics created here are: +* [GcBiasMetrics](https://broadinstitute.github.io/picard/picard-metric-definitions.html#GcBiasMetrics) +* [InsertSizeMetrics](https://broadinstitute.github.io/picard/picard-metric-definitions.html#InsertSizeMetrics) - Metrics about the insert size distribution of a paired-end library, created by the CollectInsertSizeMetrics program and usually written to a file with the extension ".insert_size_metrics". + + +**Output directory: `results/picardMetrics`** + +* `sample.insert_size_metrics.txt` + * This file contains plot values for the number of reads at a given insert size, plotted in the MultiQC report. +* `sample.gc_bias_metrics.txt` + * This file contains plot values for the bias in coverage across regions of the genome with varying GC content, plotted in the MultiQC report. +* `sample.summary_metrics.txt` + * This file contains a table summarizing the `sample.gc_bias_metrics.txt` data. + * `pdf/sample.insert_size_histogram.pdf` + * This file contains a plot of insert size histogram, created by Picard. + * `pdf/sample.gc_bias_metrics.pdf` + * This file contains a plot of GC bias of all reads, created by Picard. + + + + ## MultiQC [MultiQC](http://multiqc.info) is a visualisation tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in within the report data directory. 
diff --git a/docs/usage.md b/docs/usage.md index 54201bee..9ca46d3a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,3 +1,5 @@ + + # nf-core/methylseq: Usage ## Table of contents @@ -6,7 +8,7 @@ * [Table of contents](#table-of-contents) * [Introduction](#introduction) - * [Bismark and bwa-meth workflow](#bismark-and-bwa-meth-workflow) + * [Bismark, bwa-meth and biscuit workflow](#bismark-bwa-meth-and-biscuit-workflow) * [Running the pipeline](#running-the-pipeline) * [Updating the pipeline](#updating-the-pipeline) * [Reproducibility](#reproducibility) @@ -33,6 +35,11 @@ * [`--unmapped`](#--unmapped) * [`--save_trimmed`](#--save_trimmed) * [`--save_align_intermeds`](#--save_align_intermeds) + * [`--save_pileup_file`](#--save_pileup_file) + * [`--epiread`](#--epiread) + * [`--save_snp_file`](#--save_snp_file) + * [`--soloWCGW_file`](#--soloWCGW_file) + * [`--assets_dir`](#--assets_dir) * [`--min_depth`](#--min_depth) * [`--meth_cutoff`](#--meth_cutoff) * [`--ignore_flags`](#--ignore_flags) @@ -76,14 +83,16 @@ It is recommended to limit the Nextflow Java virtual machines memory. We recomme NXF_OPTS='-Xms1g -Xmx4g' ``` -### Bismark and bwa-meth workflow +### Bismark, bwa-meth and biscuit workflow -The nf-core/methylseq package is actually two pipelines in one. The default workflow uses [Bismark](http://www.bioinformatics.babraham.ac.uk/projects/bismark/) with [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) as alignment tool: unless specified otherwise, nf-core/methylseq will run this pipeline. +The nf-core/methylseq package is actually three pipelines in one. The default workflow uses [Bismark](http://www.bioinformatics.babraham.ac.uk/projects/bismark/) with [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) as alignment tool: unless specified otherwise, nf-core/methylseq will run this pipeline. Since bismark v0.21.0 it is also possible to use [HISAT2](https://ccb.jhu.edu/software/hisat2/index.shtml) as alignment tool. 
To run this workflow, invoke the pipeline with the command line flag `--aligner bismark_hisat`. HISAT2 also supports splice-aware alignment if analysis of RNA is desired (e.g. [SLAMseq](https://science.sciencemag.org/content/360/6390/800) experiments), a file containing a list of known splicesites can be provided with `--known_splices`. The second workflow uses [BWA-Meth](https://github.com/brentp/bwa-meth) and [MethylDackel](https://github.com/dpryan79/methyldackel) instead of Bismark. To run this workflow, run the pipeline with the command line flag `--aligner bwameth`. +The third workflow uses [biscuit](https://github.com/huishenlab/biscuit). This workflow uses biscuit as an aligner, and biscuit-QC for quality control. To run this workflow, run the pipeline with the command line flag `--aligner biscuit`. + ## Running the pipeline The typical command for running the pipeline is as follows: @@ -216,7 +225,7 @@ params { If you don't want to use the Illumina iGenomes references, you can supply your own reference genome. -The minimum requirement is just a FASTA file - the pipeline will automatically generate the relevant reference index from this. You can use the command line option `--save_reference` to keep the generated references so that they can be added to your config and used again in the future. The bwa-meth workflow always needs a FASTA file, for methylation calling. +The minimum requirement is just a FASTA file - the pipeline will automatically generate the relevant reference index from this. You can use the command line option `--save_reference` to keep the generated references so that they can be added to your config and used again in the future. The bwa-meth and biscuit workflows always need a FASTA file, for methylation calling. The FASTA is also required for generating the Picard metrics. 
### `--fasta` @@ -234,6 +243,11 @@ If you prefer, you can specify the full path to your reference genome when you r # /path/to/ref/genome.fa.bwameth.c2t.bwt --bwa_meth_index /path/to/ref/genome.fa +# biscuit index filename base +# where for example the index files are called: +# /path/to/ref/genome.fa.bis.amb +--bwa_biscuit_index /path/to/ref/genome.fa + # Genome Fasta index file --fasta_index /path/to/genome.fa.fai ``` @@ -271,6 +285,7 @@ The pipeline also accepts a number of presets for common bisulfite library prepa | `--accel` | 10 | 15 | 10 | 10 | | `--zymo` | 10 | 15 | 10 | 10 | | `--cegx` | 6 | 6 | 2 | 2 | +| `--swift` | 0 | 14 | 0 | 0 | ### `--rrbs` @@ -288,11 +303,13 @@ By default, the pipeline includes a deduplication step after alignment. Use `--s ### `--pbat` -Using the `--pbat` parameter will affect the trimming (see above) and also set the `--pbat` flag when aligning with Bismark. It tells Bismark to align complementary strands (the opposite of `--directional`). +Using the `--pbat` parameter will affect the trimming (see above) and also set the `--pbat` flag when aligning with Bismark and biscuit. +For bismark, it tells the aligner to align complementary strands (the opposite of `--directional`). +For biscuit, it tells the aligner to switch between reads in paired-end (or align to synthesized strand on single-end). ### `--non_directional` -By default, Bismark assumes that libraries are directional and does not align against complementary strands. If your library prep was not directional, use `--non_directional` to align against all four possible strands. +By default, Bismark and biscuit assume that libraries are directional and do not align against complementary strands. If your library prep was not directional, use `--non_directional` to align against all four possible strands. Note that the `--single_cell` and `--zymo` parameters both set the `--non_directional` workflow flag automatically. 
@@ -304,6 +321,8 @@ If specified, this flag instructs the Bismark methylation extractor to use the ` If using the bwa-meth workflow, the flag makes MethylDackel report CHG and CHH contexts as well. +if using the biscuit aligner, the flag generate the bedgraph file extracting all possible types from the pileup file (including c, cg, ch, hcg, gch). + ### `--cytosine_report` By default, Bismark does not produce stranded calls. With this option the output considers all Cs on both forward and reverse strands and reports their position, strand, trinucleotide context and methylation state. @@ -326,9 +345,33 @@ By default, trimmed FastQ files will not be saved to the results directory. Spec By default intermediate BAM files will not be saved. The final BAM files created after the deduplication step are always. Set to true to also copy out BAM files from the initial Bismark alignment step. If `--skip_deduplication` or `--rrbs` is specified then BAMs from the initial alignment will always be saved. +### `--save_pileup_file` + +When running with biscuit aligner, the methylation extraction is based on vcf file. By default these vcf files will not be saved. Set to true to also copy out the vcf-file and the index-vcf file. + +### `--epiread` + +[Epiread]([https://github.com/zhou-lab/biscuit/wiki/Convert-to-epiread-format](https://github.com/zhou-lab/biscuit/wiki/Convert-to-epiread-format)) format is a compact way of storing CpG retention pattern as well as SNP information on the same read. This option will tell the biscuit workflow to generate epiread file. The SNP file used for the epiread format can be saved using `--save_snp_file`. + +### `--save_snp_file` + +SNP file created from a sample in order to be used in the epiread file is not saved in final output directory. Set to true to copy out the SNP file. 
+> **NB: The SNP detection is in progress of development within the biscuit tool** + +### `--soloWCGW_file` + +This will generate methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format, intersected with soloWCGW after extracting methylation from vcf, using biscuit workflow. +> **NB: The soloWCGW is experimental, and currently available only for hg38** +> + +### `--assets_dir` + +Path to a directory containing the files needed for the biscuit-QC step. The needed files for hg38, hg19 and mm10 can be found [here](https://www.cse.huji.ac.il/~ekushele/assets.html). +**This parameter is mandatory when running the pipeline using biscuit workflow** + ### `--min_depth` -Specify to specify a minimum read coverage for MethylDackel to report a methylation call. +Specify a minimum read coverage for MethylDackel or biscuit vcf2bed to report a methylation call. ### `--meth_cutoff` diff --git a/environment.yml b/environment.yml index 1ffb82c2..fd5ab82d 100644 --- a/environment.yml +++ b/environment.yml @@ -2,10 +2,11 @@ # conda env create -f environment.yml name: nf-core-methylseq-1.5dev channels: - - conda-forge + - conda-forge - bioconda - defaults dependencies: + - conda-forge::python=3.7.3 - conda-forge::pigz=2.3.4 - conda-forge::r-markdown=1.1 - bioconda::fastqc=0.11.8 @@ -15,14 +16,14 @@ dependencies: - bioconda::bowtie2=2.3.5 - bioconda::hisat2=2.1.0 - bioconda::bismark=0.22.3 - - bioconda::qualimap=2.2.2c + - bioconda::qualimap=2.2.2d - bioconda::preseq=2.0.3 - - bioconda::multiqc=1.7 + - bioconda::multiqc=1.8 # bwa-meth pipeline - - bioconda::picard=2.21.3 + - bioconda::picard=2.21.4 - bioconda::bwameth=0.2.2 - bioconda::methyldackel=0.4.0 + # added - bioconda::samblaster=0.1.24 - - conda-forge::python=3.6.5 - bioconda::bedtools=2.29.1 diff --git a/main.nf b/main.nf index 96a1b8c6..4dc9aaee 100644 --- a/main.nf +++ b/main.nf @@ -1081,11 +1081,10 @@ if( params.aligner == 'biscuit' ){ assembly = fasta.replaceAll(/\.\w+/,"") 
prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?(\.bz2)?$/ non_directional = params.non_directional ? 0 : 1 - // Paired-end or single end input files + // Paired-end or single end input files and pbat or not input = params.pbat ? params.single_end ? reads + " -b 3" : "${reads[1]} ${reads[0]}" : reads - //pbat or not """ biscuit align -M -b $non_directional -t ${task.cpus} $fasta $input | samtools view -Sb > ${name}.${assembly}.bam @@ -1096,7 +1095,7 @@ if( params.aligner == 'biscuit' ){ * STEP 4 - Mark duplicates */ if( params.skip_deduplication || params.rrbs ) { - ch_bam_for_markDuplicates.into { ch_bam_dedup_for_methyldackel; ch_bam_dedup_for_qualimap; ch_samblaster_for_samtools_sort_index_flagstat } + ch_bam_for_markDuplicates.into { ch_bam_dedup_for_qualimap; ch_samblaster_for_samtools_sort_index_flagstat } ch_markDups_results_for_multiqc = Channel.from(false) } else { process markDuplicates_samblaster { @@ -1113,7 +1112,7 @@ if( params.aligner == 'biscuit' ){ file wherearemyfiles from ch_wherearemyfiles_for_samblaster.collect() output: - set val(name), file("${bam.baseName}.samblaster.bam") into ch_bam_dedup_for_methyldackel, ch_samblaster_for_samtools_sort_index_flagstat + set val(name), file("${bam.baseName}.samblaster.bam") into ch_samblaster_for_samtools_sort_index_flagstat file "*log" into ch_samblaster_for_multiqc script: @@ -1135,10 +1134,10 @@ if( params.aligner == 'biscuit' ){ tag "$name" publishDir "${params.outdir}", mode: 'copy', saveAs: {filename -> - if(filename.indexOf("report.txt") > 0) "samtools/logs/$filename" - else if(filename.indexOf("sorted.bam") > 0) "biscuit_alignments/$filename" - else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename - else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename + if(filename.indexOf("report.txt") > 0) 
"biscuit_alignments/logs/$filename" + else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename.indexOf("sorted.bam") > 0) "biscuit_alignments/$filename" + else if( (!params.save_align_intermeds && !params.rrbs).every() && filename == "where_are_my_files.txt") filename + else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename else null } @@ -1189,23 +1188,22 @@ if( params.aligner == 'biscuit' ){ file fasta_index from ch_fasta_index_for_createVCF.collect() output: - file "${bam.baseName}.*" - set val(name), file("*.vcf.gz*") into ch_vcf_biscuit_qc ,ch_vcf_for_bedgraph,ch_vcf_for_epiread - + set val(name), file("${name}.vcf.gz*") into ch_vcf_biscuit_qc ,ch_vcf_for_bedgraph,ch_vcf_for_epiread + script: filter_duplication = params.skip_deduplication || params.rrbs ? '-u' : '' all_contexts = params.comprehensive ? 'c, cg, ch, hcg, gch' : 'cg' """ - biscuit pileup -q ${task.cpus} $filter_duplication $fasta ${bam} -o "${bam.baseName}".vcf - bgzip -@ ${task.cpus} -f "${bam.baseName}.vcf" - tabix -f -p vcf "${bam.baseName}.vcf.gz" + biscuit pileup -q ${task.cpus} $filter_duplication $fasta ${bam} -o ${name}.vcf + bgzip -@ ${task.cpus} -f ${name}.vcf + tabix -f -p vcf ${name}.vcf.gz """ } /* - * STEP 7 - intersect vcf file with soloWCGW file + * STEP 7 - create bedgraph file from vcf */ process createBedgraph { tag "$name" @@ -1222,11 +1220,13 @@ if( params.aligner == 'biscuit' ){ min_depth = params.min_depth > 0 ? "${params.min_depth}" : '1' all_contexts = params.comprehensive ? 
'c, cg, ch, hcg, gch' : 'cg' """ - biscuit vcf2bed -k $min_depth -t $all_contexts "${vcf[0]}" > "${vcf[0].baseName}.bedgraph" + biscuit vcf2bed -k $min_depth -t $all_contexts "${vcf[0]}" > "${name}.bedgraph" """ } - + /*************** + *EXPERIMENTAL!!* + ***************/ if (params.soloWCGW_file) { process intersect_soloWCGW { tag "$name" @@ -1239,10 +1239,9 @@ if( params.aligner == 'biscuit' ){ output: file "*bedgraph" - script: """ - bedtools intersect -wa -a "${bedgraph[0].baseName}.bedgraph" -b $soloWGCW > "${bedgraph[0].baseName}_soloWCGW.bedgraph" + bedtools intersect -wa -a "${bedgraph[0].baseName}.bedgraph" -b $soloWGCW > ${name}_soloWCGW.bedgraph """ } } @@ -1264,10 +1263,10 @@ if( params.aligner == 'biscuit' ){ set val(name), file(vcf) from ch_vcf_for_epiread output: - file "${vcf[0].baseName}.snp.bed" into ch_snp_for_epiread + file "${name}.snp.bed" into ch_snp_for_epiread script: """ - biscuit vcf2bed -t snp "${vcf[0]}" > "${vcf[0].baseName}.snp.bed" + biscuit vcf2bed -t snp "${vcf[0]}" > "${name}.snp.bed" """ } @@ -1289,11 +1288,11 @@ if( params.aligner == 'biscuit' ){ snp_file = (snp.size()>0) ? "-B " + snp.toString() : '' if (params.single_end) { """ - biscuit epiread -q ${task.cpus} $fasta $bam $snp_file -o "${bam.baseName}".epiread + biscuit epiread -q ${task.cpus} $fasta $bam $snp_file -o ${name}.epiread """ } else { """ - biscuit epiread -q ${task.cpus} $fasta $bam $snp_file | sort --parallel=${task.cpus} -T . -k2,2 -k3,3n | awk 'BEGIN{qname="";rec=""} qname==\$2{print rec"\t"\$5"\t"\$6"\t"\$7"\t"\$8;qname=""} qname!=\$2{qname=\$2;rec=\$1"\t"\$4"\t"\$5"\t"\$6"\t"\$7"\t"\$8;pair=\$3}' > "${bam.baseName}".epiread + biscuit epiread -q ${task.cpus} $fasta $bam $snp_file | sort --parallel=${task.cpus} -T . 
-k2,2 -k3,3n | awk 'BEGIN{qname="";rec=""} qname==\$2{print rec"\t"\$5"\t"\$6"\t"\$7"\t"\$8;qname=""} qname!=\$2{qname=\$2;rec=\$1"\t"\$4"\t"\$5"\t"\$6"\t"\$7"\t"\$8;pair=\$3}' > ${name}.epiread """ } } @@ -1376,7 +1375,7 @@ process qualimap { */ process prepareGenomeToPicard { publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir }, - saveAs: { (params.save_reference && it.indexOf("dict") >0) ? it : null }, mode: 'copy' + saveAs: { (params.save_reference && it.indexOf("dict") >0) ? it : null }, mode: 'copy' input: file fasta from ch_fasta_for_picard @@ -1420,8 +1419,8 @@ process qualimap { file dict from ch_fasta_picard_dict_for_picard.collect() output: - file "${bam.baseName}.*.pdf" - file "${bam.baseName}.*.txt" into ch_picard_results_for_multiqc + file "${name}.*.pdf" + file "${name}.*.txt" into ch_picard_results_for_multiqc script: if( !task.memory ){ @@ -1433,8 +1432,8 @@ process qualimap { """ picard -Xmx${avail_mem}g CollectInsertSizeMetrics \\ INPUT=$bam \\ - OUTPUT=${bam.baseName}.insert_size_metrics.txt \\ - HISTOGRAM_FILE=${bam.baseName}.insert_size_histogram.pdf \\ + OUTPUT=${name}.insert_size_metrics.txt \\ + HISTOGRAM_FILE=${name}.insert_size_histogram.pdf \\ ASSUME_SORTED=true \\ VALIDATION_STRINGENCY=LENIENT set +e @@ -1442,9 +1441,9 @@ process qualimap { picard -Xmx${avail_mem}g CollectGcBiasMetrics \\ INPUT=$bam \\ - OUTPUT=${bam.baseName}.gc_bias_metrics.txt \\ - CHART=${bam.baseName}.gc_bias_metrics.pdf \\ - SUMMARY_OUTPUT=${bam.baseName}.summary_metrics.txt \\ + OUTPUT=${name}.gc_bias_metrics.txt \\ + CHART=${name}.gc_bias_metrics.pdf \\ + SUMMARY_OUTPUT=${name}.summary_metrics.txt \\ ASSUME_SORTED=true \\ IS_BISULFITE_SEQUENCED=true \\ REFERENCE_SEQUENCE=$fasta \\ @@ -1452,9 +1451,9 @@ process qualimap { [ ! "\$?" -eq "0" ] && picard -Xmx${avail_mem}g ReorderSam I=$bam O=${bam.baseName}.picard.bam SEQUENCE_DICTIONARY=$fasta VALIDATION_STRINGENCY=LENIENT TMP_DIR=. 
&& picard -Xmx${avail_mem}g CollectGcBiasMetrics \\ INPUT=${bam.baseName}.picard.bam \\ - OUTPUT=${bam.baseName}.gc_bias_metrics.txt \\ - CHART=${bam.baseName}.gc_bias_metrics.pdf \\ - SUMMARY_OUTPUT=${bam.baseName}.summary_metrics.txt \\ + OUTPUT=${name}.gc_bias_metrics.txt \\ + CHART=${name}.gc_bias_metrics.pdf \\ + SUMMARY_OUTPUT=${name}.summary_metrics.txt \\ ASSUME_SORTED=true \\ IS_BISULFITE_SEQUENCED=true \\ REFERENCE_SEQUENCE=$fasta \\ From add54f33c32a4637b7e7788d98939bce24dd4137 Mon Sep 17 00:00:00 2001 From: ekushele Date: Tue, 21 Apr 2020 09:50:25 +0300 Subject: [PATCH 04/56] change environment.yml and Dockerfile to include biscuit as bioconda recipe --- Dockerfile | 13 +------------ environment.yml | 1 + 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/Dockerfile b/Dockerfile index b1acaf45..93666f67 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,20 +1,9 @@ FROM nfcore/base:1.7 LABEL authors="Phil Ewels" \ description="Docker image containing all software requirements for the nf-core/methylseq pipeline" -RUN apt-get update -y && \ - apt-get install -y --no-install-recommends apt-utils && \ - apt-get install zlib1g-dev -y && \ - apt-get install libbz2-dev -y && \ - apt-get install liblzma-dev -y && \ - apt-get install libncurses5-dev -y && \ - apt-get install curl -y - -RUN cd / && \ - curl -OL $(curl -s https://api.github.com/repos/zwdzwd/biscuit/releases/latest | grep browser_download_url | grep linux_amd64 | cut -d '"' -f 4) && \ - chmod 755 biscuit*linux_amd64 && \ - mv biscuit*linux_amd64 biscuit COPY environment.yml / RUN conda env create -f /environment.yml && conda clean -a + RUN conda env export --name nf-core-methylseq-1.4.1 > nf-core-methylseq-1.4.1.yml ENV PATH /opt/conda/envs/nf-core-methylseq-1.4.1/bin:$PATH diff --git a/environment.yml b/environment.yml index fd5ab82d..df341e75 100644 --- a/environment.yml +++ b/environment.yml @@ -27,3 +27,4 @@ dependencies: # added - bioconda::samblaster=0.1.24 - bioconda::bedtools=2.29.1 
+ - bioconda::biscuit=0.3.15 From 42c3f95dce819e365df5f059591c5b03e6bbc178 Mon Sep 17 00:00:00 2001 From: ekushele Date: Tue, 21 Apr 2020 19:42:20 +0300 Subject: [PATCH 05/56] passed lint test --- .github/workflows/ci.yml | 6 +++--- environment.yml | 9 +++------ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4b79fa22..4e81739f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,8 +21,8 @@ jobs: ref_index: --bismark_index results/reference_genome/BismarkIndex/ - aligner: 'bwameth' ref_index: --bwa_meth_index results/reference_genome/genome.fa - - aligner 'biscuit' - ref_index: --bwa_biscuit_index results/reference_genome/genome.fa + - aligner: 'biscuit' + ref_index: --bwa_biscuit_index results/reference_genome/genome.fa steps: - uses: actions/checkout@v2 - name: Install Nextflow @@ -32,7 +32,7 @@ jobs: - name: Pull docker image run: | docker pull nfcore/methylseq:dev - docker tag nfcore/methylseq:dev nfcore/methylseq:1.5 + docker tag nfcore/methylseq:dev nfcore/methylseq:dev - name: Run pipeline with test data run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner ${{matrix.aligner}} --save_reference diff --git a/environment.yml b/environment.yml index 048cd5f9..fc621b09 100644 --- a/environment.yml +++ b/environment.yml @@ -1,12 +1,8 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -<<<<<<< HEAD -name: nf-core-methylseq-1.5dev -======= name: nf-core-methylseq-1.6dev ->>>>>>> db919ce18d759ddea618caef57f825317dad61c1 channels: - - conda-forge + - conda-forge - bioconda - defaults dependencies: @@ -34,4 +30,5 @@ dependencies: # added - bioconda::samblaster=0.1.24 - bioconda::bedtools=2.29.1 - - bioconda::biscuit=0.3.15 +# - bioconda::biscuit=0.3.15 + - bioconda::biscuit=0.3.15.20200318 From ff55790c2628083ea4f0bc36b24653dba68cec68 Mon Sep 17 00:00:00 2001 From: ekushele 
<56130301+ekushele@users.noreply.github.com> Date: Wed, 22 Apr 2020 12:00:11 +0300 Subject: [PATCH 06/56] Update docs/usage.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Patrick Hüther --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 4d063947..37c3d2f2 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -83,7 +83,7 @@ NXF_OPTS='-Xms1g -Xmx4g' ### Bismark, bwa-meth and biscuit workflow -The nf-core/methylseq package is actually threepipelines in one. The default workflow uses [Bismark](http://www.bioinformatics.babraham.ac.uk/projects/bismark/) with [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) as alignment tool: unless specified otherwise, nf-core/methylseq will run this pipeline. +The nf-core/methylseq package is actually three pipelines in one. The default workflow uses [Bismark](http://www.bioinformatics.babraham.ac.uk/projects/bismark/) with [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) as alignment tool: unless specified otherwise, nf-core/methylseq will run this pipeline. Since bismark v0.21.0 it is also possible to use [HISAT2](https://ccb.jhu.edu/software/hisat2/index.shtml) as alignment tool. To run this workflow, invoke the pipeline with the command line flag `--aligner bismark_hisat`. HISAT2 also supports splice-aware alignment if analysis of RNA is desired (e.g. [SLAMseq](https://science.sciencemag.org/content/360/6390/800) experiments), a file containing a list of known splicesites can be provided with `--known_splices`. 
From ed1203f3fe9f278ac443196defa7cd3c8251b62d Mon Sep 17 00:00:00 2001 From: ekushele <56130301+ekushele@users.noreply.github.com> Date: Wed, 22 Apr 2020 12:00:56 +0300 Subject: [PATCH 07/56] Update docs/usage.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Patrick Hüther --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 37c3d2f2..0caf3e2c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -359,7 +359,7 @@ SNP file created from a sample in order to be used in the epiread file is not sa ### `--soloWCGW_file` -This will generatea methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format, intersected with soloWCGW after extracting methylation from vcf, using biscuit workflow. +This will generate methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format, intersected with soloWCGW after extracting methylation from vcf, using biscuit workflow. 
> **NB: The soloWCGW is experimental, and currently available only for hg38** > From 81946ead440595d6b83610059e8b0075a9107193 Mon Sep 17 00:00:00 2001 From: ekushele <56130301+ekushele@users.noreply.github.com> Date: Wed, 22 Apr 2020 12:01:17 +0300 Subject: [PATCH 08/56] Update CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Patrick Hüther --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 351173aa..31964e57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ ### Software updates * _new dependency_: samblaster`0.1.24` -* _new dependect_: bedtools `2.29.1` +* _new dependency_: bedtools `2.29.1` * _new_: biscuit tool `0.3.11` ## [v1.5](https://github.com/nf-core/methylseq/releases/tag/1.5) - 2020-04-09 From 2b5d492d79acab6ee774029d93a9f2f2afce6710 Mon Sep 17 00:00:00 2001 From: ekushele <56130301+ekushele@users.noreply.github.com> Date: Wed, 22 Apr 2020 12:05:11 +0300 Subject: [PATCH 09/56] Update docs/usage.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Patrick Hüther --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 0caf3e2c..f3a63c1a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -366,7 +366,7 @@ This will generate methylation statuses in [bedGraph](http://genome.ucsc.edu/gol ### `--assets_dir` Path to a directory containing needed file for biscuit-QC step. The needed files for hg38,hg19 and mm10 can be found in [here](https://www.cse.huji.ac.il/~ekushele/assets.html). 
-**This paramater is mandatory when running the pipeline using biscuit workflow** +**This parameter is mandatory when running the pipeline using biscuit workflow** ### `--min_depth` From 0c0d93d20af07b3de087f4d4ca87259bfe21fec0 Mon Sep 17 00:00:00 2001 From: ekushele Date: Wed, 22 Apr 2020 12:13:40 +0300 Subject: [PATCH 10/56] change trim_galore process to newer version --- main.nf | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/main.nf b/main.nf index c0c3e5d7..e04ccce3 100644 --- a/main.nf +++ b/main.nf @@ -581,18 +581,27 @@ if( params.skip_trimming ){ file "where_are_my_files.txt" script: - c_r1 = params.clip_r1 > 0 ? "--clip_r1 ${params.clip_r1}" : '' - c_r2 = params.clip_r2 > 0 ? "--clip_r2 ${params.clip_r2}" : '' - tpc_r1 = params.three_prime_clip_r1 > 0 ? "--three_prime_clip_r1 ${params.three_prime_clip_r1}" : '' - tpc_r2 = params.three_prime_clip_r2 > 0 ? "--three_prime_clip_r2 ${params.three_prime_clip_r2}" : '' - rrbs = params.rrbs ? "--rrbs" : '' + def c_r1 = clip_r1 > 0 ? "--clip_r1 $clip_r1" : '' + def c_r2 = clip_r2 > 0 ? "--clip_r2 $clip_r2" : '' + def tpc_r1 = three_prime_clip_r1 > 0 ? "--three_prime_clip_r1 $three_prime_clip_r1" : '' + def tpc_r2 = three_prime_clip_r2 > 0 ? "--three_prime_clip_r2 $three_prime_clip_r2" : '' + def rrbs = params.rrbs ? 
"--rrbs" : '' + def cores = 1 + if(task.cpus){ + cores = (task.cpus as int) - 4 + if (params.single_end) cores = (task.cpus as int) - 3 + if (cores < 1) cores = 1 + if (cores > 4) cores = 4 + } if( params.single_end ) { """ - trim_galore --fastqc --gzip $rrbs $c_r1 $tpc_r1 $reads + trim_galore --fastqc --gzip $reads \ + $rrbs $c_r1 $tpc_r1 --cores $cores """ } else { """ - trim_galore --paired --fastqc --gzip $rrbs $c_r1 $c_r2 $tpc_r1 $tpc_r2 $reads + trim_galore --fastqc --gzip --paired $reads \ + $rrbs $c_r1 $c_r2 $tpc_r1 $tpc_r2 --cores $cores """ } } From 7b71df0d10b27564bb368c5689258976cf1a0bc9 Mon Sep 17 00:00:00 2001 From: ekushele Date: Wed, 22 Apr 2020 12:23:39 +0300 Subject: [PATCH 11/56] changed bismark_methXtract to newer version --- main.nf | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index e04ccce3..a551ce56 100644 --- a/main.nf +++ b/main.nf @@ -783,7 +783,7 @@ if( params.aligner =~ /bismark/ ){ /* * STEP 7 - Bismark methylation extraction */ - process bismark_methXtract { + process bismark_methXtract { tag "$name" publishDir "${params.outdir}/bismark_methylation_calls", mode: 'copy', saveAs: {filename -> @@ -791,11 +791,13 @@ if( params.aligner =~ /bismark/ ){ else if( filename.indexOf("M-bias" ) > 0) "m-bias/$filename" else if( filename.indexOf(".cov" ) > 0 ) "methylation_coverage/$filename" else if( filename.indexOf("bedGraph" ) > 0 ) "bedGraph/$filename" + else if( filename.indexOf("CpG_report" ) > 0 ) "stranded_CpG_report/$filename" else "methylation_calls/$filename" } input: set val(name), file(bam) from ch_bam_dedup_for_bismark_methXtract + file index from ch_bismark_index_for_bismark_methXtract.collect() output: set val(name), file("*splitting_report.txt") into ch_bismark_splitting_report_for_bismark_report, ch_bismark_splitting_report_for_bismark_summary, ch_bismark_splitting_report_for_multiqc @@ -804,11 +806,12 @@ if( params.aligner =~ /bismark/ ){ script: comprehensive = 
params.comprehensive ? '--comprehensive --merge_non_CpG' : '' + cytosine_report = params.cytosine_report ? "--cytosine_report --genome_folder ${index} " : '' meth_cutoff = params.meth_cutoff ? "--cutoff ${params.meth_cutoff}" : '' multicore = '' if( task.cpus ){ // Numbers based on Bismark docs - ccore = ((task.cpus as int) / 10) as int + ccore = ((task.cpus as int) / 3) as int if( ccore > 1 ){ multicore = "--multicore $ccore" } @@ -824,7 +827,7 @@ if( params.aligner =~ /bismark/ ){ if(params.single_end) { """ bismark_methylation_extractor $comprehensive $meth_cutoff \\ - $multicore $buffer \\ + $multicore $buffer $cytosine_report \\ --bedGraph \\ --counts \\ --gzip \\ @@ -835,7 +838,7 @@ if( params.aligner =~ /bismark/ ){ } else { """ bismark_methylation_extractor $comprehensive $meth_cutoff \\ - $multicore $buffer \\ + $multicore $buffer $cytosine_report \\ --ignore_r2 2 \\ --ignore_3prime_r2 2 \\ --bedGraph \\ From 8aee069e039de99991c21cced3995d5e99524d9c Mon Sep 17 00:00:00 2001 From: ekushele Date: Wed, 22 Apr 2020 12:25:23 +0300 Subject: [PATCH 12/56] remove ch_try --- main.nf | 3 --- 1 file changed, 3 deletions(-) diff --git a/main.nf b/main.nf index a551ce56..a45c20b5 100644 --- a/main.nf +++ b/main.nf @@ -400,8 +400,6 @@ process get_software_versions { output: file 'software_versions_mqc.yaml' into ch_software_versions_yaml_for_multiqc - file "software_versions.csv" into ch_try - //fastasort --version &> v_fastasort.txt 2>&1 || true script: """ @@ -1399,7 +1397,6 @@ process qualimap { input: file fasta from ch_fasta_for_picard - file file_try from ch_try output: file "${fasta.baseName}.picard.fa" into ch_fasta_picard_for_picard file "${fasta.baseName}.picard.dict" into ch_fasta_picard_dict_for_picard From e0e3a0971d1de93ba22c03f1741454443e73bcdd Mon Sep 17 00:00:00 2001 From: ekushele <56130301+ekushele@users.noreply.github.com> Date: Wed, 22 Apr 2020 12:31:13 +0300 Subject: [PATCH 13/56] Update CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Patrick Hüther --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 31964e57..62d50c94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ * Added Picard CollectInsertSizeMetrics and Picard CollectGcBiasMetrics * Improved qulimap and preseq by adding `samtools sort` and `samtools index` step in the Bismark aligner -* Added biscuit aligner as an optional aligner, with all relative steps (alignment, mark duplicates, methylation extraction, QC for biscuit, and opptional epiread file creation). +* Added biscuit aligner as an optional aligner, with all relative steps (alignment, mark duplicates, methylation extraction, QC for biscuit, and optional epiread file creation). ### Software updates From c8775cf704470af665fef05d6e001701ef5c8164 Mon Sep 17 00:00:00 2001 From: ekushele <56130301+ekushele@users.noreply.github.com> Date: Wed, 22 Apr 2020 12:31:33 +0300 Subject: [PATCH 14/56] Update docs/usage.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Patrick Hüther --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index f3a63c1a..de76b8f6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -224,7 +224,7 @@ params { If you don't want to use the Illumina iGenomes references, you can supply your own reference genome. -The minimum requirement is just a FASTA file - the pipeline will automatically generate the relevant reference index from this. You can use the command line option `--save_reference` to keep the generated references so that they can be added to your config and used again in the future. The bwa-meth and biscuit workflows always need a FASTA file, for methylation calling. The FASTA is also required for the Picard metrics generating. 
+The minimum requirement is just a FASTA file - the pipeline will automatically generate the relevant reference index from this. You can use the command line option `--save_reference` to keep the generated references so that they can be added to your config and used again in the future. The bwa-meth and biscuit workflows always need a FASTA file, for methylation calling. The FASTA is also required for the generation of Picard metrics. ### `--fasta` From 2d691dcb68c29dabfde3f05246c488652459731f Mon Sep 17 00:00:00 2001 From: ekushele <56130301+ekushele@users.noreply.github.com> Date: Wed, 22 Apr 2020 12:32:10 +0300 Subject: [PATCH 15/56] Update docs/output.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Patrick Hüther --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index 5cbed6ac..2f53890f 100644 --- a/docs/output.md +++ b/docs/output.md @@ -145,7 +145,7 @@ This step removes alignments with identical mapping position to avoid technical The methylation extractor step takes a BAM file with aligned reads and generates files containing cytosine methylation calls. It produces a few different output formats, described below. -Note that the output may vary a little depending on whether you specify `--comprehensive` or `--non_directional` or `--skip_deduplication` or `--rrbs`when running the pipeline. +Note that the output may vary a little depending on whether you specify `--comprehensive` or `--non_directional` or `--skip_deduplication` or `--rrbs` when running the pipeline. 
Filename abbreviations stand for the following reference alignment strands: From 46b8b7c4ddaaa8c63d61d3db6b590a363f2d5862 Mon Sep 17 00:00:00 2001 From: ekushele <56130301+ekushele@users.noreply.github.com> Date: Wed, 22 Apr 2020 12:32:59 +0300 Subject: [PATCH 16/56] Update docs/output.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Patrick Hüther --- docs/output.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index 2f53890f..2af2b5df 100644 --- a/docs/output.md +++ b/docs/output.md @@ -122,7 +122,6 @@ This step removes alignments with identical mapping position to avoid technical **bwa-meth output directory: `results/bwa-mem_markDuplicates/`** > **NB:** The bwa-meth step doesn't remove duplicate reads from the BAM file, it just labels them. -> * `sample.sorted.markDups.bam` * BAM file with only unique alignments. From f9f799ec34838703b2924b27571ae022064c1f65 Mon Sep 17 00:00:00 2001 From: ekushele Date: Wed, 22 Apr 2020 12:38:51 +0300 Subject: [PATCH 17/56] set parameters to false --- main.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/main.nf b/main.nf index a45c20b5..b1e2324a 100644 --- a/main.nf +++ b/main.nf @@ -112,6 +112,10 @@ params.bwa_biscuit_index = false params.soloWCGW_file = false assembly_name = (params.fasta.toString().lastIndexOf('/') == -1) ?: params.fasta.toString().substring( params.fasta.toString().lastIndexOf('/')+1) +params.save_pileup_file = false +params.epiread = false +params.save_snp_file = false +params.assets_dir = false // Check if genome exists in the config file if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { From b7d3e8f7888e959588c4c4450d0a5b835cf877b6 Mon Sep 17 00:00:00 2001 From: ekushele Date: Wed, 22 Apr 2020 14:36:18 +0300 Subject: [PATCH 18/56] remove SNP step, until biscuit finish the progress with it --- docs/output.md | 5 +---- docs/usage.md | 8 +------- main.nf | 31 
++----------------------------- 3 files changed, 4 insertions(+), 40 deletions(-) diff --git a/docs/output.md b/docs/output.md index 2af2b5df..b98276a3 100644 --- a/docs/output.md +++ b/docs/output.md @@ -189,10 +189,7 @@ Filename abbreviations stand for the following reference alignment strands: > **NB** if `--epriread` is specified in the pipeline, then: > **output directory:** `results/epireads` : > * `sample.epiread` - Storing CpG retention pattern as well as SNP information on the same read in a compact way -> * `snp/sample.snp.bed` - > Storing the SNP information of the `sample.epiread` file. **EXTRACTING SNP PROCSS IS IN PROGRESS!** - **NB:** Only saved if `--save_snp_file` is specified when running the pipeline. + Storing CpG retention pattern on the read in a compact way > ### Bismark Reports diff --git a/docs/usage.md b/docs/usage.md index de76b8f6..4d2d19b8 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -35,7 +35,6 @@ * [`--save_align_intermeds`](#--save_align_intermeds) * [`--save_pileup_file`](#--save_pileup_file) * [`--epiread`](#--epiread) - * [`--save_snp_file`](#--save_snp_file) * [`--soloWCGW_file`](#--soloWCGW_file) * [`--assets_dir`](#--assets_dir) * [`--min_depth`](#--min_depth) @@ -350,12 +349,7 @@ When running with biscuit aligner, the methylation extraction is based on vcf fi ### `--epiread` -[Epiread]([https://github.com/zhou-lab/biscuit/wiki/Convert-to-epiread-format](https://github.com/zhou-lab/biscuit/wiki/Convert-to-epiread-format)) format is a compact way of storing CpG retention pattern as well as SNP information on the same read. This option will tell the biscuit workflow to generate epiread file. The SNP file used for the epiread format can be saved using `--save_snp_file`. - -### `--save_snp_file` - -SNP file created from a sample in order to be used in the epiread file is not saved in final output directory. Set to true to copy out the SNP file. 
-> **NB: The SNP detection is in progress of development within the biscuit tool** +[Epiread]([https://github.com/zhou-lab/biscuit/wiki/Convert-to-epiread-format](https://github.com/zhou-lab/biscuit/wiki/Convert-to-epiread-format)) format is a compact way of storing CpG retention pattern on the same read. This option will tell the biscuit workflow to generate epiread file. ### `--soloWCGW_file` diff --git a/main.nf b/main.nf index b1e2324a..3163bbd3 100644 --- a/main.nf +++ b/main.nf @@ -39,7 +39,6 @@ def helpMessage() { --save_align_intermeds [bool] Save aligned intermediates to results directory --save_trimmed [bool] Save trimmed reads to results directory --save_pileup_file [bool] Save vcf-pileup and index-vcf files from biscuit aligner to results directory - --save_snp_file Save SNP bed-file from biscuit to results directory. Relevant only if '--epiread' is specified --unmapped [bool] Save unmapped reads to fastq files --relax_mismatches [bool] Turn on to relax stringency for alignment (set allowed penalty with --num_mismatches) --num_mismatches [float] 0.6 will allow a penalty of bp * -0.6 - for 100bp reads (bismark default is 0.2) @@ -114,7 +113,6 @@ assembly_name = (params.fasta.toString().lastIndexOf('/') == -1) ?: params.fasta params.save_pileup_file = false params.epiread = false -params.save_snp_file = false params.assets_dir = false // Check if genome exists in the config file @@ -342,7 +340,6 @@ summary['Save Trimmed'] = params.save_trimmed ? 'Yes' : 'No' summary['Save Unmapped'] = params.unmapped ? 'Yes' : 'No' summary['Save Intermediates'] = params.save_align_intermeds ? 'Yes' : 'No' summary['Save Pileups'] = params.save_pileup_file ? 'Yes' : 'No' -summary['Save SNP bed-file'] = params.save_snp_file ? 'Yes' : 'No' summary['Current home'] = "$HOME" @@ -1270,28 +1267,6 @@ if( params.aligner == 'biscuit' ){ if (params.epiread) { - /*************************** - THE PROCESS IS IN PROGRESS! 
- ****************************/ - process get_SNP_file { - tag "$name" - publishDir "${params.outdir}", mode: 'copy', - saveAs: {filename -> - if(params.save_snp_file && filename != "where_are_my_files.txt") "epireads/snp/$filename" - else null - } - - input: - set val(name), file(vcf) from ch_vcf_for_epiread - - output: - file "${name}.snp.bed" into ch_snp_for_epiread - script: - """ - biscuit vcf2bed -t snp "${vcf[0]}" > "${name}.snp.bed" - """ - } - process epiread_convertion { tag "$name" publishDir "${params.outdir}/epireads", mode: 'copy' @@ -1301,20 +1276,18 @@ if( params.aligner == 'biscuit' ){ file bam_index from ch_bam_index_for_epiread file fasta from ch_fasta_for_epiread.collect() file fasta_index from ch_fasta_index_for_epiread.collect() - file snp from ch_snp_for_epiread output: file "*epiread" script: - snp_file = (snp.size()>0) ? "-B " + snp.toString() : '' if (params.single_end) { """ - biscuit epiread -q ${task.cpus} $fasta $bam $snp_file -o ${name}.epiread + biscuit epiread -q ${task.cpus} $fasta $bam -o ${name}.epiread """ } else { """ - biscuit epiread -q ${task.cpus} $fasta $bam $snp_file | sort --parallel=${task.cpus} -T . -k2,2 -k3,3n | awk 'BEGIN{qname="";rec=""} qname==\$2{print rec"\t"\$5"\t"\$6"\t"\$7"\t"\$8;qname=""} qname!=\$2{qname=\$2;rec=\$1"\t"\$4"\t"\$5"\t"\$6"\t"\$7"\t"\$8;pair=\$3}' > ${name}.epiread + biscuit epiread -q ${task.cpus} $fasta $bam | sort --parallel=${task.cpus} -T . 
-k2,2 -k3,3n | awk 'BEGIN{qname="";rec=""} qname==\$2{print rec"\t"\$5"\t"\$6"\t"\$7"\t"\$8;qname=""} qname!=\$2{qname=\$2;rec=\$1"\t"\$4"\t"\$5"\t"\$6"\t"\$7"\t"\$8;pair=\$3}' > ${name}.epiread """ } } From cb95f33b82e0c04cce2bada2a4c00a5c5571a46e Mon Sep 17 00:00:00 2001 From: ekushele Date: Wed, 22 Apr 2020 14:58:57 +0300 Subject: [PATCH 19/56] changed web address for soloWCGW file --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 3163bbd3..841fb507 100644 --- a/main.nf +++ b/main.nf @@ -45,7 +45,7 @@ def helpMessage() { --known_splices [file] Supply a .gtf file containing known splice sites (bismark_hisat only) --slamseq [bool] Run bismark in SLAM-seq mode --local_alignment [bool] Allow soft-clipping of reads (potentially useful for single-cell experiments) - --soloWCGW_file [path] soloWCGW file, to intersect with methyl_extract bed file. soloWCGW for hg38 can be downlaod from: www.cse.huji.ac.il/~ekushele/solo_WCGW_cpg_hg38.bed. EXPERMINTAL! + --soloWCGW_file [path] soloWCGW file, to intersect with methyl_extract bed file. soloWCGW for hg38 can be downlaod from: http://zwdzwd.io/pmd/solo_WCGW_hg38.bed.gz EXPERMINTAL! --assets_dir [path] Assets directory for biscuit_QC, REQUIRED IF IN BISCUIT ALIGNER. 
can be found at: https://www.cse.huji.ac.il/~ekushele/assets.html --epiread [bool] Convert bam to biscuit epiread format From 8472d461d2ecded257f5ccb45e3f7e1d3b7a0c57 Mon Sep 17 00:00:00 2001 From: ekushele Date: Wed, 22 Apr 2020 17:25:12 +0300 Subject: [PATCH 20/56] change soloWCGW to solo WCGW-in common PMDs --- docs/output.md | 2 +- docs/usage.md | 2 +- main.nf | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/output.md b/docs/output.md index b98276a3..f0e76fcb 100644 --- a/docs/output.md +++ b/docs/output.md @@ -178,7 +178,7 @@ Filename abbreviations stand for the following reference alignment strands: * `sample.bedgraph` * Methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format. * `sample.bedgraph` - * Methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format, intersected with soloWCGW file. + * Methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format, intersected with soloWCGW-commonPMDs file. * **NB:** Created only if `--soloWCGW_file` is specified. **EXPERIMENTAL!** * `sample.vcf.gz` * VCF file with the pileup information, used for creating the bedgraph file. diff --git a/docs/usage.md b/docs/usage.md index 4d2d19b8..7519e21b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -353,7 +353,7 @@ When running with biscuit aligner, the methylation extraction is based on vcf fi ### `--soloWCGW_file` -This will generate methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format, intersected with soloWCGW after extracting methylation from vcf, using biscuit workflow. +This will generate methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format, intersected with soloWCGW in common-PMDs after extracting methylation from vcf, using biscuit workflow. 
> **NB: The soloWCGW is experimental, and currently available only for hg38** > diff --git a/main.nf b/main.nf index 841fb507..4318aaae 100644 --- a/main.nf +++ b/main.nf @@ -45,7 +45,7 @@ def helpMessage() { --known_splices [file] Supply a .gtf file containing known splice sites (bismark_hisat only) --slamseq [bool] Run bismark in SLAM-seq mode --local_alignment [bool] Allow soft-clipping of reads (potentially useful for single-cell experiments) - --soloWCGW_file [path] soloWCGW file, to intersect with methyl_extract bed file. soloWCGW for hg38 can be downlaod from: http://zwdzwd.io/pmd/solo_WCGW_hg38.bed.gz EXPERMINTAL! + --soloWCGW_file [path] soloWCGW in common-PMDs file, to intersect with methyl_extract bed file. soloWCGW in common PMDs for hg38 can be downlaod from: http://zwdzwd.io/pmd/solo_WCGW_inCommonPMDs_hg38.bed.gz EXPERMINTAL! --assets_dir [path] Assets directory for biscuit_QC, REQUIRED IF IN BISCUIT ALIGNER. can be found at: https://www.cse.huji.ac.il/~ekushele/assets.html --epiread [bool] Convert bam to biscuit epiread format From 532e997da49a2ba868fa22e80238b0a7f4a82c61 Mon Sep 17 00:00:00 2001 From: ekushele Date: Thu, 23 Apr 2020 09:04:20 +0300 Subject: [PATCH 21/56] remove soloWCGW step --- conf/base.config | 5 ---- docs/output.md | 3 --- docs/usage.md | 7 ------ main.nf | 62 +++++++++++++----------------------------------- 4 files changed, 16 insertions(+), 61 deletions(-) diff --git a/conf/base.config b/conf/base.config index 12c44d30..e632da8d 100644 --- a/conf/base.config +++ b/conf/base.config @@ -127,11 +127,6 @@ withName:biscuit_align { memory = { check_max( 32.GB * task.attempt, 'memory') } time = { check_max( 5.d * task.attempt, 'time') } } - withName:intersect_soloWCGW_file { - cpus = { check_max( 1 * task.attempt, 'cpus') } - memory = { check_max( 32.GB * task.attempt, 'memory') } - time = { check_max( 12.h * task.attempt, 'time') } - } withName:createBedgraph { cpus = { check_max( 1 * task.attempt, 'cpus') } memory = { check_max( 
32.GB * task.attempt, 'memory') } diff --git a/docs/output.md b/docs/output.md index f0e76fcb..5a5caa45 100644 --- a/docs/output.md +++ b/docs/output.md @@ -177,9 +177,6 @@ Filename abbreviations stand for the following reference alignment strands: * `sample.bedgraph` * Methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format. -* `sample.bedgraph` - * Methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format, intersected with soloWCGW-commonPMDs file. - * **NB:** Created only if `--soloWCGW_file` is specified. **EXPERIMENTAL!** * `sample.vcf.gz` * VCF file with the pileup information, used for creating the bedgraph file. * **NB:** Only saved if `--save_pileup_file` is specified when running the pipeline. diff --git a/docs/usage.md b/docs/usage.md index 7519e21b..4868e7d5 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -35,7 +35,6 @@ * [`--save_align_intermeds`](#--save_align_intermeds) * [`--save_pileup_file`](#--save_pileup_file) * [`--epiread`](#--epiread) - * [`--soloWCGW_file`](#--soloWCGW_file) * [`--assets_dir`](#--assets_dir) * [`--min_depth`](#--min_depth) * [`--meth_cutoff`](#--meth_cutoff) @@ -351,12 +350,6 @@ When running with biscuit aligner, the methylation extraction is based on vcf fi [Epiread]([https://github.com/zhou-lab/biscuit/wiki/Convert-to-epiread-format](https://github.com/zhou-lab/biscuit/wiki/Convert-to-epiread-format)) format is a compact way of storing CpG retention pattern on the same read. This option will tell the biscuit workflow to generate epiread file. -### `--soloWCGW_file` - -This will generate methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format, intersected with soloWCGW in common-PMDs after extracting methylation from vcf, using biscuit workflow. 
-> **NB: The soloWCGW is experimental, and currently available only for hg38** -> - ### `--assets_dir` Path to a directory containing needed file for biscuit-QC step. The needed files for hg38,hg19 and mm10 can be found in [here](https://www.cse.huji.ac.il/~ekushele/assets.html). diff --git a/main.nf b/main.nf index 4318aaae..6335d31d 100644 --- a/main.nf +++ b/main.nf @@ -45,7 +45,6 @@ def helpMessage() { --known_splices [file] Supply a .gtf file containing known splice sites (bismark_hisat only) --slamseq [bool] Run bismark in SLAM-seq mode --local_alignment [bool] Allow soft-clipping of reads (potentially useful for single-cell experiments) - --soloWCGW_file [path] soloWCGW in common-PMDs file, to intersect with methyl_extract bed file. soloWCGW in common PMDs for hg38 can be downlaod from: http://zwdzwd.io/pmd/solo_WCGW_inCommonPMDs_hg38.bed.gz EXPERMINTAL! --assets_dir [path] Assets directory for biscuit_QC, REQUIRED IF IN BISCUIT ALIGNER. can be found at: https://www.cse.huji.ac.il/~ekushele/assets.html --epiread [bool] Convert bam to biscuit epiread format @@ -108,7 +107,6 @@ params.bwa_meth_index = params.genome ? params.genomes[ params.genome ].bwa_meth params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false params.fasta_index = params.genome ? 
params.genomes[ params.genome ].fasta_index ?: false : false params.bwa_biscuit_index = false -params.soloWCGW_file = false assembly_name = (params.fasta.toString().lastIndexOf('/') == -1) ?: params.fasta.toString().substring( params.fasta.toString().lastIndexOf('/')+1) params.save_pileup_file = false @@ -287,11 +285,6 @@ if (params.readPaths) { .into { ch_read_files_for_fastqc; ch_read_files_for_trim_galore } } -if (params.soloWCGW_file) { - Channel - .fromPath(params.soloWCGW_file) - .into { ch_soloWCGW_for_biscuitVCF; } -} // Header log info log.info nfcoreHeader() def summary = [:] @@ -332,7 +325,6 @@ if( params.min_depth ) summary['Minimum Depth'] = params.min_depth if( params.ignore_flags ) summary['MethylDackel'] = 'Ignoring SAM Flags' if( params.methyl_kit ) summary['MethylDackel'] = 'Producing methyl_kit output' if( params.assets_dir ) summary['Assets Directory'] = params.assets_dir -if( params.soloWCGW_file ) summary['soloWCGW File'] = params.soloWCGW_file if( params.epiread ) summary['Epiread'] = params.epiread ? 'Yes' : 'No' summary['Save Reference'] = params.save_reference ? 'Yes' : 'No' @@ -1224,46 +1216,24 @@ if( params.aligner == 'biscuit' ){ /* * STEP 7 - create bedgraph file from vcf */ - process createBedgraph { - tag "$name" - publishDir "${params.outdir}/methylation_extract", mode: 'copy' - - - input: - set val(name), file(vcf) from ch_vcf_for_bedgraph - - output: - set val(name), file("*bedgraph" ) into ch_bedgraph_for_intersect_soloWCGW - - script: - min_depth = params.min_depth > 0 ? "${params.min_depth}" : '1' - all_contexts = params.comprehensive ? 
'c, cg, ch, hcg, gch' : 'cg' - """ - biscuit vcf2bed -k $min_depth -t $all_contexts "${vcf[0]}" > "${name}.bedgraph" - - """ - } - /*************** - *EXPERIMENTAL!!* - ***************/ - if (params.soloWCGW_file) { - process intersect_soloWCGW { - tag "$name" - publishDir "${params.outdir}/methylation_extract", mode: 'copy' + process createBedgraph { + tag "$name" + publishDir "${params.outdir}/methylation_extract", mode: 'copy' - input: - set val(name), file(bedgraph) from ch_bedgraph_for_intersect_soloWCGW - file soloWGCW from ch_soloWCGW_for_biscuitVCF.collect() - - output: - file "*bedgraph" - script: - """ - bedtools intersect -wa -a "${bedgraph[0].baseName}.bedgraph" -b $soloWGCW > ${name}_soloWCGW.bedgraph - """ - } - } + input: + set val(name), file(vcf) from ch_vcf_for_bedgraph + + output: + set val(name), file("*bedgraph" ) + + script: + min_depth = params.min_depth > 0 ? "${params.min_depth}" : '1' + all_contexts = params.comprehensive ? 'c, cg, ch, hcg, gch' : 'cg' + """ + biscuit vcf2bed -k $min_depth -t $all_contexts "${vcf[0]}" > "${name}.bedgraph" + """ + } if (params.epiread) { From 1f4dfd51e4f0dc4e48360efc4a2d6fb1f8a4c4be Mon Sep 17 00:00:00 2001 From: ekushele Date: Tue, 28 Apr 2020 09:20:15 +0300 Subject: [PATCH 22/56] remove swift --- docs/usage.md | 1 - main.nf | 9 --------- 2 files changed, 10 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 4868e7d5..6e8898d6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -282,7 +282,6 @@ The pipeline also accepts a number of presets for common bisulfite library prepa | `--accel` | 10 | 15 | 10 | 10 | | `--zymo` | 10 | 15 | 10 | 10 | | `--cegx` | 6 | 6 | 2 | 2 | -| `--swift` | 0 | 14 | 0 | 0 | ### `--rrbs` diff --git a/main.nf b/main.nf index 6335d31d..cc1ff0ac 100644 --- a/main.nf +++ b/main.nf @@ -72,7 +72,6 @@ def helpMessage() { --accell [bool] --zymo [bool] --cegx [bool] - --swift [bool] Other options: --outdir [file] The output directory where the results will be saved @@ -203,7 +202,6 
@@ params.epignome = false params.accel = false params.zymo = false params.cegx = false -params.swift = false if(params.pbat){ params.clip_r1 = 9 params.clip_r2 = 9 @@ -233,12 +231,6 @@ else if( params.cegx ){ params.clip_r2 = 6 params.three_prime_clip_r1 = 2 params.three_prime_clip_r2 = 2 -} -else if( params.swift ){ - params.clip_r1 = 0 - params.clip_r2 = 14 - params.three_prime_clip_r1 = 0 - params.three_prime_clip_r2 = 0 } else { params.clip_r1 = 0 params.clip_r2 = 0 @@ -312,7 +304,6 @@ if( params.epignome ) summary['Trim Profile'] = 'TruSeq (EpiGnome)' if( params.accel ) summary['Trim Profile'] = 'Accel-NGS (Swift)' if( params.zymo ) summary['Trim Profile'] = 'Zymo Pico-Methyl' if( params.cegx ) summary['Trim Profile'] = 'CEGX' -if( params.swift ) summary['Trim Profile'] = 'Swift' summary['Trim R1'] = params.clip_r1 summary['Trim R2'] = params.clip_r2 From ac62a93f22b5fea4c21078b7f5bc45e9f194399c Mon Sep 17 00:00:00 2001 From: ekushele Date: Mon, 8 Feb 2021 09:23:31 +0200 Subject: [PATCH 23/56] added epiread convertion with paired-end merging and get SNP files and a process to build biscuitQC_assets, and running biscuit_QC.sh and built_biscuit_QC_assets.pl from within bioconda recipe --- bin/biscuit_QC.sh | 531 ----- bin/epiread_pairedEnd_convertion | Bin 0 -> 178240 bytes bin/epiread_pairedEnd_convertion.cpp | 686 +++++++ bin/processUcscDbsnp.pl | 80 + bin/setup.sh | 39 - cpg.bed | 0 gc_content.bed | 0 main.nf | 2659 +++++++++++++------------- nextflow.config | 9 + 9 files changed, 2134 insertions(+), 1870 deletions(-) delete mode 100755 bin/biscuit_QC.sh create mode 100755 bin/epiread_pairedEnd_convertion create mode 100644 bin/epiread_pairedEnd_convertion.cpp create mode 100755 bin/processUcscDbsnp.pl delete mode 100755 bin/setup.sh create mode 100644 cpg.bed create mode 100644 gc_content.bed diff --git a/bin/biscuit_QC.sh b/bin/biscuit_QC.sh deleted file mode 100755 index 40b55bce..00000000 --- a/bin/biscuit_QC.sh +++ /dev/null @@ -1,531 +0,0 @@ 
-#!/usr/bin/env bash -## make sure the following is in PATH -## biscuit samtools, bedtools, awk - -# Use python's argparse module in shell scripts -# -# The function `argparse` parses its arguments using -# argparse.ArgumentParser; the parser is defined in the function's -# stdin. -# -# Executing ``argparse.bash`` (as opposed to sourcing it) prints a -# script template. -# -# https://github.com/nhoffman/argparse-bash -# MIT License - Copyright (c) 2015 Noah Hoffman - -argparse(){ - argparser=$(mktemp 2>/dev/null || mktemp -t argparser) - cat > "$argparser" <> "$argparser" - - cat >> "$argparser" < /dev/null; then - eval $(python "$argparser" "$@") - retval=0 - else - python "$argparser" "$@" - retval=1 - fi - - rm "$argparser" - return $retval -} - -#!/usr/bin/env bash -################################################################################ -## -## Quality Control script for BISCUIT output -## -## Output from this script can be fed into MultiQC to produce a nice HTML output -## showing the different BISCUIT QC metrics -## -## Notes: -## 1.) 
biscuit, samtools, bedtools, and awk all must be in PATH for script to -## work -## -## Created by: -## Wanding Zhou -## -## Creation date: -## May 2019 -## -## Update notes: -## Dec 2019 - -## - Clean up code to make more readable -## - Catch empty files, alert user, and remove files -## -################################################################################ - -# Check for biscuit, samtools, bedtools, awk in PATH -function check_path { - if [[ `which biscuit 2>&1 > /dev/null` ]]; then - echo "biscuit does not exist in PATH" - exit 1 - fi - if [[ `which samtools 2>&1 > /dev/null` ]]; then - echo "samtools does not exist in PATH" - exit 1 - fi - if [[ `which bedtools 2>&1 > /dev/null` ]]; then - echo "bedtools does not exist in PATH" - exit 1 - fi - if [[ `which awk 2>&1 > /dev/null` ]]; then - echo "awk does not exist in PATH" - exit 1 - fi - if [[ `which python 2>&1 > /dev/null` ]]; then - echo "python does not exist in PATH" - exit 1 - fi -} - -# Check that certain variables have been set and files exist -#TODO: Change "" to "NULL"/"NA"/something similar -#TODO: Also change in BISCUIT QC setup files -function check_variables { - VARS=" - BISCUIT_CPGBED - BISCUIT_CGIBED - BISCUIT_RMSK - BISCUIT_EXON - BISCUIT_GENE - BISCUIT_TOPGC_BED - BISCUIT_BOTGC_BED - input_bam - input_vcf - " - - for var in $VARS; do - if [[ ${!var} != "" && ! 
-f ${!var} ]]; then - >&2 echo "$var: ${!var} does not exist" - exit 1 - fi - done -} - -# Check if QC files have at least some information -function basic_check_output_filled { - prepend_path=$QCdir/${sname} - TWO_LINE_FILES=" - ${prepend_path}_all_cv_table.txt - ${prepend_path}_covdist_cpg_q40_botgc_table.txt - ${prepend_path}_covdist_cpg_q40_table.txt - ${prepend_path}_covdist_cpg_q40_topgc_table.txt - ${prepend_path}_covdist_cpg_table.txt - ${prepend_path}_covdist_q40_botgc_table.txt - ${prepend_path}_covdist_q40_table.txt - ${prepend_path}_covdist_q40_topgc_table.txt - ${prepend_path}_covdist_table.txt - ${prepend_path}_cpg_cv_table.txt - ${prepend_path}_cpg_dist_table.txt - ${prepend_path}_CpGRetentionByReadPos.txt - ${prepend_path}_CpGRetentionDist.txt - ${prepend_path}_CpHRetentionByReadPos.txt - ${prepend_path}_freqOfTotalRetentionPerRead.txt - ${prepend_path}_isize_score_table.txt - ${prepend_path}_mapq_table.txt - ${prepend_path}_totalBaseConversionRate.txt - ${prepend_path}_totalReadConversionRate.txt - " - ONE_LINE_FILES=" - ${prepend_path}_dup_report.txt - ${prepend_path}_strand_table.txt - " - - echo "Running basic check on BISCUIT QC output" - echo "Will remove any files that were obviously not filled properly" - echo "This avoids clashes when running MultiQC" - - # All files that have a description line, then a table header line - for FILE in ${TWO_LINE_FILES}; do - if [[ ! -f "${FILE}" ]]; then - >&2 echo "--- {FILE} --- was not initially created. Skipping!" - continue - fi - if [[ `wc -l ${FILE} | awk '{print $1}'` -lt 3 ]]; then - >&2 echo "--- ${FILE} --- has no entries. Check related files!" - >&2 echo "Deleting --- ${FILE} --- since there are no entries to help with debugging." - rm -f ${FILE} - fi - done - - # Files with only a description line - for FILE in ${ONE_LINE_FILES}; do - if [[ ! -f "${FILE}" ]]; then - >&2 echo "--- {FILE} --- was not initially created. Skipping!" 
- continue - fi - if [[ `wc -l ${FILE} | awk '{print $1}'` -lt 2 ]]; then - >&2 echo "--- ${FILE} --- has no entries. Check related files!" - >&2 echo "Deleting --- ${FILE} --- since there are no entries to help with debugging." - rm -f ${FILE} - fi - done -} - -function biscuitQC { - - # Simple check for necessary command line tools - check_path - - # Check variables and their associated files exist - check_variables - - # Create $QCdir if it does not exist - if [ ! -d $QCdir ]; then - mkdir -p $QCdir - fi - - echo "Running BISCUIT QC" - set -xe pipefail - ########################## - ## base coverage - ########################## - if [[ "$BISCUIT_QC_BASECOV" == true ]]; then - >&2 echo "`date`---- BISCUIT_QC_BASECOV ----" - # bedtools genomecov -bga -split -ibam $input_bam -g ${BISCUIT_REFERENCE}.fai | bedtools sort >$QCdir/${sname}_bga.bed - # samtools view -q 40 -b $input_bam | bedtools genomecov -ibam stdin -g ${BISCUIT_REFERENCE}.fai -bga -split | bedtools sort >$QCdir/${sname}_bga_q40.bed - bedtools genomecov -bga -split -ibam $input_bam -g ${BISCUIT_REFERENCE}.fai | LC_ALL=C sort --parallel=$processes -k1,1 -k2,2n -T $QCdir >$QCdir/${sname}_bga.bed - samtools view -q 40 -b $input_bam | bedtools genomecov -ibam stdin -g ${BISCUIT_REFERENCE}.fai -bga -split | LC_ALL=C sort --parallel=$processes -k1,1 -k2,2n -T $QCdir >$QCdir/${sname}_bga_q40.bed - - echo -e "BISCUITqc Depth Distribution (All)" >$QCdir/${sname}_covdist_table.txt - echo -e "depth\tcount" >>$QCdir/${sname}_covdist_table.txt - awk '{cnt[$4]+=$3-$2}END{for(cov in cnt) {print int(cov)"\t"int(cnt[cov]);}}' $QCdir/${sname}_bga.bed | sort -k1,1n -T $QCdir >>$QCdir/${sname}_covdist_table.txt - - echo -e "BISCUITqc Depth Distribution (Q40)" >$QCdir/${sname}_covdist_q40_table.txt - echo -e "depth\tcount" >>$QCdir/${sname}_covdist_q40_table.txt - awk '{cnt[$4]+=$3-$2}END{for(cov in cnt) {print int(cov)"\t"int(cnt[cov]);}}' $QCdir/${sname}_bga_q40.bed | sort -k1,1n -T $QCdir 
>>$QCdir/${sname}_covdist_q40_table.txt - fi - - ########################## - ## duplicate_coverage - ########################## - [[ ! -f "$QCdir/${sname}_bga.bed" ]] && BISCUIT_QC_DUPLICATE=false - [[ ! -f "$QCdir/${sname}_bga_q40.bed" ]] && BISCUIT_QC_DUPLCIATE=false - if [[ "$BISCUIT_QC_DUPLICATE" == true ]]; then - >&2 echo "`date`---- BISCUIT_QC_DUPLICATE ----" - # duplicate - #samtools view -f 0x400 -b $input_bam | bedtools genomecov -ibam stdin -g $BISCUIT_REFERENCE.fai -bga -split | bedtools sort >$QCdir/${sname}_bga_dup.bed - samtools view -f 0x400 -b $input_bam | bedtools genomecov -ibam stdin -g $BISCUIT_REFERENCE.fai -bga -split | LC_ALL=C sort --parallel=$processes -k1,1 -k2,2n -T $QCdir >$QCdir/${sname}_bga_dup.bed - - # duplication rate - echo -e "BISCUITqc Read Duplication Table" >$QCdir/${sname}_dup_report.txt - echo -ne "#bases covered by all reads: " >>$QCdir/${sname}_dup_report.txt - awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' $QCdir/${sname}_bga.bed >>$QCdir/${sname}_dup_report.txt - echo -ne "#bases covered by duplicate reads: " >>$QCdir/${sname}_dup_report.txt - awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' $QCdir/${sname}_bga_dup.bed >>$QCdir/${sname}_dup_report.txt - - if [[ -f "$BISCUIT_TOPGC_BED" && -f "$BISCUIT_BOTGC_BED" ]]; then - # high GC content - echo -ne "#high-GC bases covered by all reads: " >>$QCdir/${sname}_dup_report.txt - bedtools intersect -a $QCdir/${sname}_bga.bed -b $BISCUIT_TOPGC_BED -sorted | awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt - echo -ne "#high-GC bases covered by duplicate reads: " >>$QCdir/${sname}_dup_report.txt - bedtools intersect -a $QCdir/${sname}_bga_dup.bed -b $BISCUIT_TOPGC_BED -sorted | awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt - - # low GC content - echo -ne "#low-GC bases covered by all reads: " >>$QCdir/${sname}_dup_report.txt - bedtools intersect -a $QCdir/${sname}_bga.bed -b $BISCUIT_BOTGC_BED -sorted | awk 
'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt - echo -ne "#low-GC bases covered by duplicate reads: " >>$QCdir/${sname}_dup_report.txt - bedtools intersect -a $QCdir/${sname}_bga_dup.bed -b $BISCUIT_BOTGC_BED -sorted | awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt - fi - - ## Q40 - # duplicate - # samtools view -f 0x400 -q 40 -b $input_bam | bedtools genomecov -ibam stdin -g $BISCUIT_REFERENCE.fai -bga -split | bedtools sort >$QCdir/${sname}_bga_dup_q40.bed - samtools view -f 0x400 -q 40 -b $input_bam | bedtools genomecov -ibam stdin -g $BISCUIT_REFERENCE.fai -bga -split | LC_ALL=C sort --parallel=$processes -k1,1 -k2,2n -T $QCdir >$QCdir/${sname}_bga_dup_q40.bed - - # duplication rate - echo -ne "#bases covered by all q40-reads: " >>$QCdir/${sname}_dup_report.txt - awk '$4>0{a+=$3-$2}END{print a}' $QCdir/${sname}_bga_q40.bed >>$QCdir/${sname}_dup_report.txt - echo -ne "#bases covered by duplicate q40-reads: " >>$QCdir/${sname}_dup_report.txt - awk '$4>0{a+=$3-$2}END{print a}' $QCdir/${sname}_bga_dup_q40.bed >>$QCdir/${sname}_dup_report.txt - - if [[ -f "$BISCUIT_TOPGC_BED" && -f "$BISCUIT_BOTGC_BED" ]]; then - # high GC content - echo -ne "#high-GC bases covered by all q40-reads: " >>$QCdir/${sname}_dup_report.txt - bedtools intersect -a $QCdir/${sname}_bga_q40.bed -b $BISCUIT_TOPGC_BED -sorted | awk '$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt - echo -ne "#high-GC bases covered by duplicate q40-reads: " >>$QCdir/${sname}_dup_report.txt - bedtools intersect -a $QCdir/${sname}_bga_dup_q40.bed -b $BISCUIT_TOPGC_BED -sorted | awk '$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt - - # low GC content - echo -ne "#low-GC bases covered by all q40-reads: " >>$QCdir/${sname}_dup_report.txt - bedtools intersect -a $QCdir/${sname}_bga_q40.bed -b $BISCUIT_BOTGC_BED -sorted | awk '$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt - echo -ne "#low-GC bases covered by duplicate 
q40-reads: " >>$QCdir/${sname}_dup_report.txt - bedtools intersect -a $QCdir/${sname}_bga_dup_q40.bed -b $BISCUIT_BOTGC_BED -sorted | awk '$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt - fi - fi - - ########################## - ## cpg coverage - ########################## - - [[ ! -f "$BISCUIT_CPGBED" ]] && BISCUIT_QC_CPGCOV=false - [[ ! -f "$QCdir/${sname}_bga.bed" ]] && BISCUIT_QC_CPGCOV=false - [[ ! -f "$QCdir/${sname}_bga_q40.bed" ]] && BISCUIT_QC_CPGCOV=false - if [[ "$BISCUIT_QC_CPGCOV" == true ]]; then - >&2 echo "`date`---- BISCUIT_QC_CPGCOV ----" - bedtools intersect -a $BISCUIT_CPGBED -b $QCdir/${sname}_bga.bed -wo -sorted | bedtools groupby -g 1-3 -c 7 -o min >$QCdir/${sname}_cpg.bed - bedtools intersect -a $BISCUIT_CPGBED -b $QCdir/${sname}_bga_q40.bed -wo -sorted | bedtools groupby -g 1-3 -c 7 -o min >$QCdir/${sname}_cpg_q40.bed - - echo -e "BISCUITqc CpG Depth Distribution (All)" >$QCdir/${sname}_covdist_cpg_table.txt - echo -e "depth\tcount" >>$QCdir/${sname}_covdist_cpg_table.txt - awk '{cnt[$4]+=1}END{for(cov in cnt) {print int(cov)"\t"int(cnt[cov]);}}' $QCdir/${sname}_cpg.bed | sort -k1,1n >>$QCdir/${sname}_covdist_cpg_table.txt - - echo -e "BISCUITqc CpG Depth Distribution (Q40)" >$QCdir/${sname}_covdist_cpg_q40_table.txt - echo -e "depth\tcount" >>$QCdir/${sname}_covdist_cpg_q40_table.txt - awk '{cnt[$4]+=1}END{for(cov in cnt) {print int(cov)"\t"int(cnt[cov]);}}' $QCdir/${sname}_cpg_q40.bed | sort -k1,1n >>$QCdir/${sname}_covdist_cpg_q40_table.txt - fi - - ########################## - ## cpg distribution - ########################## - - [[ ! -f "$QCdir/${sname}_cpg_q40.bed" ]] && BISCUIT_QC_CPGDIST=false - [[ ! -f "$QCdir/${sname}_cpg.bed" ]] && BISCUIT_QC_CPGDIST=false - [[ ! -f "$BISCUIT_EXON" ]] && BISCUIT_QC_CPGDIST=false - [[ ! -f "$BISCUIT_RMSK" ]] && BISCUIT_QC_CPGDIST=false - [[ ! -f "$BISCUIT_GENE" ]] && BISCUIT_QC_CPGDIST=false - [[ ! 
-f "$BISCUIT_CGIBED" ]] && BISCUIT_QC_CPGDIST=false - if [[ "$BISCUIT_QC_CPGDIST" == true ]]; then - >&2 echo "`date`---- BISCUIT_QC_CPGDIST ----" - # whole genome - echo -e "BISCUITqc CpG Distribution Table" >$QCdir/${sname}_cpg_dist_table.txt - wc -l $QCdir/${sname}_cpg_q40.bed | awk -F" " '{printf("Territory\tAll\tUniqCov\tAllCov\nTotalCpGs\t%s",$1)}' >>$QCdir/${sname}_cpg_dist_table.txt - awk '$4>0{a+=1}END{printf("\t%d",a)}' $QCdir/${sname}_cpg_q40.bed >>$QCdir/${sname}_cpg_dist_table.txt - awk '$4>0{a+=1}END{printf("\t%d\n",a)}' $QCdir/${sname}_cpg.bed >>$QCdir/${sname}_cpg_dist_table.txt - - # exon - bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_EXON) -sorted | wc -l | awk -F" " '{printf("ExonicCpGs\t%s",$1)}' >>$QCdir/${sname}_cpg_dist_table.txt - bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_EXON) -sorted | awk '$4>0{a+=1}END{printf("\t%d",a)}' >>$QCdir/${sname}_cpg_dist_table.txt - bedtools intersect -a $QCdir/${sname}_cpg.bed -b <(bedtools merge -i $BISCUIT_EXON) -sorted | awk '$4>0{a+=1}END{printf("\t%d\n",a)}' >>$QCdir/${sname}_cpg_dist_table.txt - - # repeat - bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_RMSK) -sorted | wc -l | awk -F" " '{printf("RepeatCpGs\t%s",$1)}' >>$QCdir/${sname}_cpg_dist_table.txt - bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_RMSK) -sorted | awk '$4>0{a+=1}END{printf("\t%d",a)}' >>$QCdir/${sname}_cpg_dist_table.txt - bedtools intersect -a $QCdir/${sname}_cpg.bed -b <(bedtools merge -i $BISCUIT_RMSK) -sorted | awk '$4>0{a+=1}END{printf("\t%d\n",a)}' >>$QCdir/${sname}_cpg_dist_table.txt - - # gene - bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_GENE) -sorted | wc -l | awk -F" " '{printf("GenicCpGs\t%s",$1)}' >>$QCdir/${sname}_cpg_dist_table.txt - bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_GENE) -sorted | awk 
'$4>0{a+=1}END{printf("\t%d",a)}' >>$QCdir/${sname}_cpg_dist_table.txt - bedtools intersect -a $QCdir/${sname}_cpg.bed -b <(bedtools merge -i $BISCUIT_GENE) -sorted | awk '$4>0{a+=1}END{printf("\t%d\n",a)}' >>$QCdir/${sname}_cpg_dist_table.txt - - # CGI - bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_CGIBED) -sorted | wc -l | awk -F" " '{printf("CGICpGs\t%s",$1)}' >>$QCdir/${sname}_cpg_dist_table.txt - bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_CGIBED) -sorted | awk '$4>0{a+=1}END{printf("\t%d",a)}' >>$QCdir/${sname}_cpg_dist_table.txt - bedtools intersect -a $QCdir/${sname}_cpg.bed -b <(bedtools merge -i $BISCUIT_CGIBED) -sorted | awk '$4>0{a+=1}END{printf("\t%d\n",a)}' >>$QCdir/${sname}_cpg_dist_table.txt - - >&2 echo "`date`---- BISCUIT_QC_CGICOV ----" - # how CGI is covered by at least one q40-read in at least one CpG - echo >>$QCdir/${sname}_cpg_dist_table.txt - echo -ne "#CpG Islands\t" >>$QCdir/${sname}_cpg_dist_table.txt - zcat $BISCUIT_CGIBED | wc -l >>$QCdir/${sname}_cpg_dist_table.txt - bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b <(bedtools merge -i $BISCUIT_CGIBED) -sorted -wo | awk '$4>0{print $5":"$6"-"$7}' | uniq -c | awk -F" " '{print $2"\t"$1}' >> $QCdir/${sname}_cpg_dist_table_temp.txt - echo -ne "#CpG Islands covered by at least one q40-read in at least one CpG\t" >>$QCdir/${sname}_cpg_dist_table.txt - less $QCdir/${sname}_cpg_dist_table_temp.txt | wc -l >>$QCdir/${sname}_cpg_dist_table.txt - echo -ne "#CpG Islands covered by at least one q40-read in at least three CpGs\t" >>$QCdir/${sname}_cpg_dist_table.txt - awk -F" " '$2>=3' $QCdir/${sname}_cpg_dist_table_temp.txt | wc -l >>$QCdir/${sname}_cpg_dist_table.txt - echo -ne "#CpG Islands covered by at least one q40-read in at least five CpGs\t" >>$QCdir/${sname}_cpg_dist_table.txt - awk -F" " '$2>=5' $QCdir/${sname}_cpg_dist_table_temp.txt | wc -l >>$QCdir/${sname}_cpg_dist_table.txt - echo -ne "#CpG Islands 
covered by at least one q40-read in at least ten CpGs\t" >>$QCdir/${sname}_cpg_dist_table.txt - awk -F" " '$2>=10' $QCdir/${sname}_cpg_dist_table_temp.txt | wc -l >>$QCdir/${sname}_cpg_dist_table.txt - /bin/rm $QCdir/${sname}_cpg_dist_table_temp.txt - fi - - ########################## - ## uniformity - ########################## - [[ ! -f "$QCdir/${sname}_covdist_q40_table.txt" ]] && BISCUIT_QC_UNIFORMITY=false - [[ ! -f "$QCdir/${sname}_bga_q40.bed" ]] && BISCUIT_QC_UNIFORMITY=false - if [[ "$BISCUIT_QC_UNIFORMITY" == true ]]; then - >&2 echo "`date`---- BISCUIT_QC_UNIFORMITY ----" - - echo -e "BISCUITqc Uniformity Table" >$QCdir/${sname}_all_cv_table.txt - awk -v sname="${sname}" '{cnt[$1]=$2}END{for (cov in cnt) {sum_cov+=cnt[cov]*cov; sum_cnt+=cnt[cov];} for(cov in cnt) {sum_var+=((cov-mu)^2)*cnt[cov];} mu=sum_cov/sum_cnt; sigma=sqrt(sum_var/sum_cnt); print "sample\tmu\tsigma\tcv\n"sname"_all\t"mu"\t"sigma"\t"sigma/mu}' $QCdir/${sname}_covdist_q40_table.txt >>$QCdir/${sname}_all_cv_table.txt - - if [[ -f "$BISCUIT_TOPGC_BED" && -f "$BISCUIT_BOTGC_BED" ]]; then - echo -e "BISCUITqc Depth Distribution (high GC, Q40)" >$QCdir/${sname}_covdist_q40_topgc_table.txt - echo -e "depth\tcount" >>$QCdir/${sname}_covdist_q40_topgc_table.txt - bedtools intersect -a $QCdir/${sname}_bga_q40.bed -b $BISCUIT_TOPGC_BED -sorted | awk -v sname="${sname}" -v output="$QCdir/${sname}_all_cv_table.txt" '{cnt[$4]+=$3-$2}END{for (cov in cnt) {print cov"\t"cnt[cov]; sum_cov+=cnt[cov]*cov; sum_cnt+=cnt[cov];} for(cov in cnt) {sum_var+=((cov-mu)^2)*cnt[cov];} mu=sum_cov/sum_cnt; sigma=sqrt(sum_var/sum_cnt); print sname"_all_topgc\t"mu"\t"sigma"\t"sigma/mu >>output}' | sort -k1,1n >>$QCdir/${sname}_covdist_q40_topgc_table.txt - - echo -e "BISCUITqc Depth Distribution (low GC, Q40)" >$QCdir/${sname}_covdist_q40_botgc_table.txt - echo -e "depth\tcount" >>$QCdir/${sname}_covdist_q40_botgc_table.txt - bedtools intersect -a $QCdir/${sname}_bga_q40.bed -b $BISCUIT_BOTGC_BED -sorted | awk -v 
sname="${sname}" -v output="$QCdir/${sname}_all_cv_table.txt" '{cnt[$4]+=$3-$2}END{for (cov in cnt) {print cov"\t"cnt[cov]; sum_cov+=cnt[cov]*cov; sum_cnt+=cnt[cov];} for(cov in cnt) {sum_var+=((cov-mu)^2)*cnt[cov];} mu=sum_cov/sum_cnt; sigma=sqrt(sum_var/sum_cnt); print sname"_all_botgc\t"mu"\t"sigma"\t"sigma/mu >>output}' | sort -k1,1n >>$QCdir/${sname}_covdist_q40_botgc_table.txt - fi - fi - - ########################## - ## cpg uniformity - ########################## - [[ ! -f "$QCdir/${sname}_covdist_cpg_q40_table.txt" ]] && BISCUIT_QC_CPGUNIF=false - [[ ! -f "$QCdir/${sname}_cpg_q40.bed" ]] && BISCUIT_QC_CPGUNIF=false - if [[ "$BISCUIT_QC_CPGUNIF" == true ]]; then - >&2 echo "`date`---- BISCUIT_QC_CPGUNIF ----" - - echo -e "BISCUITqc CpG Uniformity Table" >$QCdir/${sname}_cpg_cv_table.txt - awk -v sname="${sname}" '{cnt[$1]=$2}END{for(cov in cnt) {sum_cov+=cnt[cov]*cov; sum_cnt+=cnt[cov];} for(cov in cnt) {sum_var+=((cov-mu)^2)*cnt[cov];} mu=sum_cov/sum_cnt; sigma=sqrt(sum_var/sum_cnt); print "sample\tmu\tsigma\tcv\n"sname"_cpg\t"mu"\t"sigma"\t"sigma/mu}' $QCdir/${sname}_covdist_cpg_q40_table.txt >>$QCdir/${sname}_cpg_cv_table.txt - - if [[ -f "$BISCUIT_TOPGC_BED" && -f "$BISCUIT_BOTGC_BED" ]]; then - echo -e "BISCUITqc CpG Depth Distribution (high GC, Q40)" >$QCdir/${sname}_covdist_cpg_q40_topgc_table.txt - echo -e "depth\tcount" >>$QCdir/${sname}_covdist_cpg_q40_topgc_table.txt - bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b $BISCUIT_TOPGC_BED -sorted | awk -v sname="${sname}" -v output="$QCdir/${sname}_cpg_cv_table.txt" '{cnt[$4]+=1}END{for (cov in cnt) {print cov"\t"cnt[cov]; sum_cov+=cnt[cov]*cov; sum_cnt+=cnt[cov];} for(cov in cnt) {sum_var+=((cov-mu)^2)*cnt[cov];} mu=sum_cov/sum_cnt; sigma=sqrt(sum_var/sum_cnt); print sname"_cpg_topgc\t"mu"\t"sigma"\t"sigma/mu >>output}' | sort -k1,1n >>$QCdir/${sname}_covdist_cpg_q40_topgc_table.txt - - echo -e "BISCUITqc CpG Depth Distribution (low GC, Q40)" >$QCdir/${sname}_covdist_cpg_q40_botgc_table.txt - 
echo -e "depth\tcount" >>$QCdir/${sname}_covdist_cpg_q40_botgc_table.txt - bedtools intersect -a $QCdir/${sname}_cpg_q40.bed -b $BISCUIT_BOTGC_BED -sorted | awk -v sname="${sname}" -v output="$QCdir/${sname}_cpg_cv_table.txt" '{cnt[$4]+=1}END{for (cov in cnt) {print cov"\t"cnt[cov]; sum_cov+=cnt[cov]*cov; sum_cnt+=cnt[cov];} for(cov in cnt) {sum_var+=((cov-mu)^2)*cnt[cov];} mu=sum_cov/sum_cnt; sigma=sqrt(sum_var/sum_cnt); print sname"_cpg_botgc\t"mu"\t"sigma"\t"sigma/mu >>output}' | sort -k1,1n >>$QCdir/${sname}_covdist_cpg_q40_botgc_table.txt - fi - fi - - ########################## - ## bisulfite conversion - ########################## - [[ ! -f "$input_vcf" ]] && BISCUIT_QC_BSCONV=false - if [[ "$BISCUIT_QC_BSCONV" == true ]]; then - >&2 echo "`date`---- BISCUIT_QC_BSCONV ----" - - #echo -e "BISCUITqc Frequency of Total Retention per Read Table" >$QCdir/${sname}_freqOfTotalRetentionPerRead.txt - #samtools view -h -q 40 $input_bam | biscuit bsconv $BISCUIT_REFERENCE - | awk 'match($0,/ZN:Z:([^ ]*)/,a){print gensub(/[A-Z,_]+/, "\t", "g", a[1])}' | cut -f2,4,6,8 | awk -v OFS="\t" '{ra[$1]+=1;rc[$2]+=1;rg[$3]+=1;rt[$4]+=1;}END{for(k in ra) {print "CA", k, ra[k]} for(k in rc) {print "CC", k, rc[k]} for(k in rg) {print "CG", k, rg[k]} for(k in rt) {print "CT", k, rt[k]}}' | sort -k1,1 -k2,2n | awk 'BEGIN{print "CTXT\tnumRET\tCnt"}{print}' >>$QCdir/${sname}_freqOfTotalRetentionPerRead.txt - - echo -e "BISCUITqc Frequency of Total Retention per Read Table\nCTXT\tnumRET\tCnt" >$QCdir/${sname}_freqOfTotalRetentionPerRead.txt - samtools view -h -q 40 $input_bam | biscuit bsconv $BISCUIT_REFERENCE - | grep -E -o --color "ZN:Z:[^ ].*" | awk -F '[^0-9]*' -v OFS="\t" '{ra[$2]+=1;rc[$4]+=1;rg[$6]+=1;rt[$8]+=1;}END{for(k in ra) {print "CA", k, ra[k]} for(k in rc) {print "CC", k, rc[k]} for(k in rg) {print "CG", k, rg[k]} for(k in rt) {print "CT", k, rt[k]}}' | sort -k1,1 -k2,2n >>$QCdir/${sname}_freqOfTotalRetentionPerRead.txt - - echo -e "BISCUITqc Conversion Rate by Base 
Average Table" >$QCdir/${sname}_totalBaseConversionRate.txt - biscuit vcf2bed -et c $input_vcf | awk '{beta_sum[$6]+=$8; beta_cnt[$6]+=1;} END{print "CA\tCC\tCG\tCT"; print beta_sum["CA"]/beta_cnt["CA"]"\t"beta_sum["CC"]/beta_cnt["CC"]"\t"beta_sum["CG"]/beta_cnt["CG"]"\t"beta_sum["CT"]/beta_cnt["CT"];}' >>$QCdir/${sname}_totalBaseConversionRate.txt - - echo -e "BISCUITqc Conversion Rate by Read Average Table" >$QCdir/${sname}_totalReadConversionRate.txt - samtools view -hq 40 -F 0x900 $input_bam | biscuit bsconv -b $BISCUIT_REFERENCE - | awk '{for(i=1;i<=8;++i) a[i]+=$i;}END{print "CpA\tCpC\tCpG\tCpT"; print a[1]/(a[1]+a[2])"\t"a[3]/(a[3]+a[4])"\t"a[5]/(a[5]+a[6])"\t"a[7]/(a[7]+a[8]);}' >>$QCdir/${sname}_totalReadConversionRate.txt - - echo -e "BISCUITqc CpH Retention by Read Position Table" >$QCdir/${sname}_CpHRetentionByReadPos.txt - echo -e "ReadInPair\tPosition\tConversion/Retention\tCount" >>$QCdir/${sname}_CpHRetentionByReadPos.txt - #samtools view -hq 40 $input_bam | biscuit cinread $BISCUIT_REFERENCE - -t ch -p QPAIR,CQPOS,CRETENTION | sort | uniq -c | awk -F" " '$4!="N"{print $2"\t"$3"\t"$4"\t"$1}' | sort -k1,1 -k2,2n -T $QCdir >>$QCdir/${sname}_CpHRetentionByReadPos.txt - samtools view -hq 40 $input_bam | biscuit cinread $BISCUIT_REFERENCE - -t ch -p QPAIR,CQPOS,CRETENTION | awk -v OFS="\t" '$3!="N"{sorting[$0]++ } END {for (i in sorting) print i,sorting[i]}' | sort -k1,1 -k2,2n >>$QCdir/${sname}_CpHRetentionByReadPos.txt - - echo -e "BISCUITqc CpG Retention by Read Position Table" >$QCdir/${sname}_CpGRetentionByReadPos.txt - echo -e "ReadInPair\tPosition\tConversion/Retention\tCount" >>$QCdir/${sname}_CpGRetentionByReadPos.txt - #samtools view -hq 40 $input_bam | biscuit cinread $BISCUIT_REFERENCE - -t cg -p QPAIR,CQPOS,CRETENTION | sort | uniq -c | awk -F" " '$4!="N"{print $2"\t"$3"\t"$4"\t"$1}' | sort -k1,1 -k2,2n -T $QCdir >>$QCdir/${sname}_CpGRetentionByReadPos.txt - samtools view -hq 40 $input_bam | biscuit cinread $BISCUIT_REFERENCE - -t cg -p 
QPAIR,CQPOS,CRETENTION | awk -v OFS="\t" '$3!="N"{sorting[$0]++ } END {for (i in sorting) print i,sorting[i]}' | sort -k1,1 -k2,2n >>$QCdir/${sname}_CpGRetentionByReadPos.txt - fi - - #################### - ## mapping_summary - #################### - if [[ "$BISCUIT_QC_MAPPING" == true ]]; then - >&2 echo "`date`---- BISCUIT_QC_MAPPING ----" - echo -e "BISCUITqc Strand Table" >$QCdir/${sname}_strand_table.txt - biscuit cinread -p QPAIR,STRAND,BSSTRAND $BISCUIT_REFERENCE $input_bam | awk '{a[$1$2$3]+=1}END{for(strand in a) {print "strand\t"strand"\t"a[strand];}}' >>$QCdir/${sname}_strand_table.txt - - echo -e "BISCUITqc Mapping Quality Table" >$QCdir/${sname}_mapq_table.txt - echo -e "MapQ\tCount" >>$QCdir/${sname}_mapq_table.txt - samtools view -F 0x100 -f 0x4 $input_bam | wc -l | cat <(echo -ne "unmapped\t") - >>$QCdir/${sname}_mapq_table.txt - samtools view -F 0x104 $input_bam | awk '{cnt[$5]+=1}END{for(mapq in cnt) {print mapq"\t"cnt[mapq];}}' | sort -k1,1n >>$QCdir/${sname}_mapq_table.txt - ## insert size - ## this excludes read by AS (40) and mapq (40) - echo -e "BISCUITqc Insert Size, Score Table" >$QCdir/${sname}_isize_score_table.txt - echo -e "InsertSize/Score\tValue\tFraction" >>$QCdir/${sname}_isize_score_table.txt - # samtools view -F 0x104 $input_bam | awk '{match($0,/AS:i:([0-9]*)/,a); score[a[1]]+=1; sumscore+=1; if (and($2,0x2) && a[1]>=40 && $5>=40 && $9>=0 && $9 <=2000) {isize[$9]+=1; sumisize+=1}}END{for(k in isize){print "I\t"k"\t"isize[k] / sumisize} for(k in score){print "S\t"k"\t"score[k] / sumscore}}' | sort -k1,1 -k2,2n >>$QCdir/${sname}_isize_score_table.txt - samtools view -F 0x104 $input_bam | grep -E 'AS:i:([0-9]*)' | awk -F 'AS:i:|\t' '{score[$17]+=1; sumscore+=1; if (and($2,0x2) && $17>=40 && $5>=40 && $9>=0 && $9 <=2000) {isize[$9]+=1; sumisize+=1}}END{for(k in isize){print "I\t"k"\t"isize[k] / sumisize} for(k in score){print "S\t"k"\t"score[k] / sumscore}}' | sort -k1,1 -k2,2n >>$QCdir/${sname}_isize_score_table.txt - fi - - - 
################################### - ## CpG retention distribution - ################################### - [[ ! -f "$input_vcf" ]] && BISCUIT_QC_BETAS=false - if [[ "$BISCUIT_QC_BETAS" == true ]]; then - echo -e "BISCUITqc Retention Distribution Table" >$QCdir/${sname}_CpGRetentionDist.txt - echo -e "RetentionFraction\tCount" >>$QCdir/${sname}_CpGRetentionDist.txt - biscuit vcf2bed -t cg $input_vcf | awk '$5>=3{a[sprintf("%3.0f", $4*100)]+=1}END{for (beta in a) print beta"\t"a[beta];}' | sort -k1,1n >>$QCdir/${sname}_CpGRetentionDist.txt - fi - - ################################### - ## Remove bed files - ################################### - if [[ "$remove_bed_files" ]]; then - rm $QCdir/*.bed - fi - - ######################################## - ## Running check on output files - ######################################## - basic_check_output_filled -} - - - - -ARGPARSE_DESCRIPTION="Run QC on biscuit output" -argparse "$@" <', - help='Path to vcf outupt from BISCUIT') -parser.add_argument('-o', '--outdir', type=str, default='BISCUITqc', - help='output directory [default %(default)s]') -parser.add_argument('--do_not_remove_bed', action='store_false', - default=True, help='Whether remove bed files [default %(default)s]') -parser.add_argument('-p', '--processes', type=int, default=1, - help='Number of processes to use [default %(default)s]') -EOF - -if [[ ! 
-d "$assets_directory" ]]; then - echo "Assets directory missing: $assets_directory."; - exit 1; -fi - -source $(dirname ${BASH_SOURCE[0]})/setup.sh $genome $assets_directory -input_vcf=$vcf -QCdir=$outdir -sname=$sample_name -remove_bed_files=$do_not_remove_bed - ->&2 echo "## Running BISCUIT QC script with following configuration ##" ->&2 echo "==============" ->&2 echo "sample name: $sname" ->&2 echo "input bam: $input_bam" ->&2 echo "input vcf: $input_vcf" ->&2 echo "output dir: $QCdir" ->&2 echo "REFERENCE: $BISCUIT_REFERENCE" ->&2 echo "CPGBED: $BISCUIT_CPGBED" ->&2 echo "CGIBED: $BISCUIT_CGIBED" ->&2 echo "RMSK: $BISCUIT_RMSK" ->&2 echo "EXON: $BISCUIT_EXON" ->&2 echo "GENE: $BISCUIT_GENE" ->&2 echo "TOPGC_BED: $BISCUIT_TOPGC_BED" ->&2 echo "BOTGC_BED: $BISCUIT_BOTGC_BED" ->&2 echo "==============" -biscuitQC -#>&2 echo $remove_bed_files $QCdir $input_vcf - ->&2 echo -e "\nDone." diff --git a/bin/epiread_pairedEnd_convertion b/bin/epiread_pairedEnd_convertion new file mode 100755 index 0000000000000000000000000000000000000000..06bd8cd7f1103594ffb159e5383febb32a046b7a GIT binary patch literal 178240 zcmeFa34D~*^*26Y5wzMw1&d2H?jVW*i2{lSab!Ti$fBslfDi#?Nkf8hL4zTdaU4z6 zC|Z|PZA-1yxIhsV!y-fZxx}T46iZxcpE0N@bxGawe!u73`#g7&8C3fFzn{f4uRppJjGWLy3f{=IBEKQA*j(7x=lmoy9crGlYz(2sn6-#;@A zp>bxsOnO;q__w9i`th@8U3}ue<7ZDlZuYFX0{cEs3; z!V>MvkdpcyfQe)MU1#3;*ddRu{AXM1*WXm^ciTH(^}oH1bw4wGx}gm1Ik3wSpx^!pV#rUrn|Lw7G)GuZ>9JH~q_@y~lcK_A4rMGwMbmW{phrclJtd<_Hyma8c z2j||q$FE90?Xh&v;UgjsKY}h9|2~8A_eoRx6o*ZNpATd@{AXyJ4*w$vlMcUZkM#JT zL22pQ_kx1b;h)Grzw_Sd+vmez)6rR*ogQDEp?x#_CLNt;;8*GJ%QEmYCIkPAK_nfW zt}yy^`0W|SeJKNeb_V*(Gt5gF{4m|PH5vGcWuQMU1OBl+)6?IQ!5;R@z)${<)6*%) zz~@D9+;noCkb!Llh{&jnXaYtp)&)5w9dsYU$U7o>@ev?5D4`<+W zQwDurn?c@dGWgfs8SMXn4E)Tdq(ec_Ha@Lx&D#CuP0@&lS_V- zo_=cvxz5jkFU}y>of+uwk%7*=8RFY#8Rlj04D+}m1D%^Q*k@%1`*}Wt-Hyy4@5T)B z4#{A*zsaD7Lon~T_^v8rm;3#pJEMOl=|NeeckLp{DstfN2%~Za{nDL*6_-vJGko^Exy2KvUOc;4>(bX; zFm`;|Nkv7|Dk}Q-??2$;sq<$|1If}^b7zd4HomNX|7kO)mKK$jPMuXYf8@0A_^J3O 
zJav4y@T_T}q6@~C%?378vvH_6K`G7y$>OPVMouW2kXIBQe^Sx7voeo6eqhn~{Gti{ zi^5@yfp+}|8Y-6dce$V+Vno&_vR8m|zf8N}wvuBlEQM4eB+B)lpTj9X@ zv#u7oG^MW)W5S#VeYcGa6XV`uqOHhp&CE=C1-@!1j&5MZYkU5wZ~w`khb zvS~BL;|Amv6_w2_op*WB?BcmI%4QZ7mzK^emD!Q@Q)xm|%ZiJNE2b5fl+BtqSGo;E zoSB`@OT(EX?uqRUd5|0GGlx~37!4Z26UO!>$n5fI0<=;>VB*5DE4Fu15!SbOXtl3@o2Ai+-H_!Lj4SS##$XmzIU+7&ralk@2L` zVo+F692Plu!GBJgHg8VJ)Y4*q5#d^9)<(#QwNcIX;;GY%#7K>Mq%Jk4!!B5fuL7&? z@nt1xEnbZ(^9Yh}NG4Hu7IF%Z8)O`zKgL*q6$k>9&M2Q#JU3VwQz!B}qgRF`NpctH zeozszva+eOX2aVHOJ~g~E-Eda$+fCv-t1Y^uIN7yW<9sKc)CF&w5oN^8dy?Xe3@C_ z{!{5i@bSai2Z8UI?_qTv1E)73mF$f#8#oPSj!$%i6vL-?%e0&^U{^gAD%YsFE6M4u zJKn4cbr=ZreG<<6X|w0e*93&l|9!ex-gdzy^Gh$8h4d0JVOO2{L|tTLQs!Q+vv|U^ zSy)pfu$LB#Ps{ZE;FyK5Zd_Gp{RfngyN@ZFUtDIEM46HCWx+~0ue_{i-X%q)Q|Hbo z)&L3J54Hix3BbD^J?%PEs9T%?11zoSH=J%KAGnSbTTE ze*Gn7&aSU(gDE|>9c9JgaW(_>_h~;Up)o}Wk#;<{xzj~g|2OuxX79fX{*f=oF8{%b zxq#4h zbBgC61_Oshe(t;?*@kfY0sOQ%C7}iL=a%4zsO%COUuJPLeuO3uU2uDCYj*?G+0=EGPi&#}?61AqM?cwrgR+~@vh=iX(3<$&@IS@i zKs(f3J}YnJ5{Icig)X=s&JwP5#4Ob6e>~!2U0dF|25aD%a1=q&@6zw#*PcMZL7;Iclq;AFE) z=${7ux%BT8`UWun?DGR?B!4+4YFTK`J1zdcyA?mF9lp^#&#}~;ciQ$AuA8Ulpw!|; zN<#i!=HT~r@bwPfKO;r`N(X;{)4svM(}L{ZMh8E{B2hLv_=tnw?BGW`_$CK`j)V6c z{CEf7;^5D9@U0I1LI=Oh=#5$$=iuud{0Ikc&%ZJ0p&|!w&!>r>?%?e?H1U@>cza$> z{1gXo&&!FQ>fn2o=se>3LhR92c{`9bMVHcrEZyn=QAqyuinA;v`CaI9sGU{zQMu&#KCWL z@b)f-)r}5*f2aLs2aj7B|F6lx+h?*^={fkH`mIBu76*T@gKu^4yz}$_ejeCuu8%|n zcJPNf_#6j+n1k=-;1753xeopa2jAPlAL-!p9Q;uZevpGd+QAn%_}&ga;^6x@_(BJN zjDw%#;QKoGDGvTv2S3xnALrmp9DF|qU*X`7ckq=CKF`4~aqt5ie2s%Y!ND(e@B18@McL9e7%D|$)U5-!4Goq4Gw;=gWu@jdFN{X8Xf#87Kw7RgCFYPn;iV9 z4&HO{r#bi*2VdaeTOIr`2Ol!)2xH{w4!*mCAMW6D9Q+v$zL$d!JNR4&f2M=)?chf` z_&f(c%E1qE@Mk&r0tY|F!ABhYSO;I|;LmpOlN@}ZgP-EyCph?-4xZ1B*uN47f1X96 ztZ?v?9DJpNKi|PGaqt&7`2T+VuLl0Bf&XgY|8)&SsI8_NGK;atK^0&gMA)I8ZJ@T-KGk|!Gk-b9$Gc(PvLwS<|1C+h@WMVP5~ zvPR&?2{YwRRto$OVW!&25`ph0%oIC0Mc}&#Gqp|@3VaJ;rqsy-fo~wpR63a_@O6Zl zLML+tzLGFg=VXq+rG%L>Cqn{XN|>o~vgK=;f5J?WlT89&K$u4w$wq<45oW5KY!G-9 
zVW!B*dVxYA(WHgw$^?Xo7@ezr_rh02NOGd{9t`$p;h>>$K}NOU>FMyZYb*&iBD|w2Y-We=7x>l)(CdN>V_(ZBg zhZF2!IzKfW>RgOox$u)v*KO83CD{PqCd=Bt9LLYYX>;J-`k^wtVC z9rDiCJ$7`uaH9Nr`O>C@IO=wKL)`)}TBNQr_@a~%@pFt|&-HOl27sY#g3z{JY zZzetAl}K~n0;eh{BZI&=fX%ZV>mNphaR(V9FF^Q1bV(gzFQuuHC%kXxl0n}EJ|U#KQHfQOdME zE`GTNxRsIE#APO2yfHr%N`$x5&|=~3k!hRRhnhNsx*fbO`<4b+$W>B@(BV(WbfLBu zI&O%>^A9>1Q5Kz+fEs+LK?c#8%c3W;6{^4acUxN(25Y`e6(~p#NL3&?L#sD|u=3+$ zjS#C^Kgnod*o5&&w+o>I$i7{qi-jQ$eg8=WbT3LKqTn<>^g+V+DZCPl;Xhx3`+cz&Pts!n!Sxi&oHD6mp7q^)^5eWRnPL@PWDaQtrprq&Ep_+VN9-qaArDV)gqObH8{LII}X*6F`7=rVd&T4Gxm!%#Ez;kF7sJ2Al0Ka2VBr}j^XdEKa0Nv@BJU^+=N-ajds}6BZ=*IJ3Ttj*Z?@SROTt}mZXTb8ed8UB|XJ60?8AtMy=gIq`^>nNhU#r?) z;D=Zflm^0$A`<=>XKp+ZQ~ekf5oP=vdNtvM!W%L5_>US9!`cKi;>;0NNn6=0UI=ex zxRWL?PQ|KpE0m+Q@okG$KO*i##IgLX&vB@PcbqI7OuIz)VH%Oi!hTIzVqFIuZ*{-% zST&4BG*T6zTf(~n?R*^${HN+rRAeO00W)Swena|1SFJDb6(lKb7q)qJ5SBHwcwpM< zxvXM@eP>d@>KX%fAsCCZEwGg+R&l{H{DIOGRAj z{UJupk-8kItL5n6MOx~55z(AR>Lp5YUj7CxTSfrBtf0^axIi1% za43L}No_4PX=E~bdGZoC6L&$5`oj0I%LnwU_{96rUP4Sff7vstg@?tQbg95JV6pNA z?Ruf|=<0IDEOzUl%R)gzmy8KLyAPoMi6VbaCC9^c*s?W5e9GNs3*{D7AqPc$o$1nD z^(&R4zqb(az0+N6!n+nxTNYvEnjLWO zI5<1yRZd?!?@zH^scK)flkhGN+Cj-6c3VJ9HBuEIe6<`I^zwCcVbD%(X2ulQ9|k?Fst%f#qN=?J>Z*4-Cjfh61XMr6U$lJ*PI3`nE13Sr;!2EeAntYpi=nPjh-1vspMq89k#_pV;4(rT) zIyE&iHD;^fuQP|6ghW)s)b=ru^Dc6+32(TlGh9b|87z`&R6R~U`xBEXQ;OkaQD9p{p5KVS9PrRm+&USBkbx_XiR1}xUskrgFD{AS;Kp6V$e6wwCm=y z%Q3ir!8?@~5Xmz^$J?dVzR+n<)=C5m2+augoMO&7HpM;ry zLV3oCVN67bV$ziVlP5SE3GYDRR-4SjOu>VmjBqC+8I&YdAHmqMfmKU<0A!j6$~6GDR**;W|b;|Ja z!vRu2t^BbAO(<`FD$TBJyCHObvKv-DfM&!ZGvYEt&vqk@NF8xR%7`P%+!2TEWW;gF zS}yQblAM6iL2I)YSGaYBS*ev5YNeHOZ1&=jshED*i%$v&MBw zEU=EVwXU9Vjk=9eAJ=9*cRWhs(tm>Xuh)8M=C90)ojsM=zQub!9rW!beQRr^uepE{ zdB1vxqWE#^2(@S#1K!(HK^-gM?WSN0si5`?MhSb#f;>+_5gfH|DOe%`CcI4w<{0p4 zfO<&s);3aG7fh-sJ=~}ok1@~XSuZcV;`tvwqPp%Yx{{lp0+9)q2CAhibt_e4PF9*l zJ~RYJBZLK>8z;Pz^>ZUXAJ5ORt_?q>>O8tx{YjQSuys*@3a z*|AY!NfQN<;CUQc=$cewbheItzVz1%tLTS;w#$;$_EHCDRleB4sp;UJ!@*hQ|IESJ 
z8HhMII}HPgJ5x`UL`R29BSna!_HhuRtt=k_4L$QW_uNTC7Zg=H)?fZK`T_T6%!CzD z<_x&yoE=qFCw^4N!*av8?H^PPo2cjJy}EGnpWQ55pXH5W8P!d6NswAgGm2FcyV_=Z zOPFd#ne8=5E3x>IZ=+$nI-8D;es^)U;Wz-RBgYR3Z*N>tWUr|HIq({+Y9N=oP9cu=QN=D=)#6v>>#2Vubr7#59#((VA9S>r z$DuT*fbqDt={a30B@VrSgp90c=Nc*E^(@o_pFcHQnj`> zcM<`OCod2hIZ8vD_H2F#NF5_0H2l3MX{gYB-=-|p(h-`Qpc5zr0EQJTl!37SSFpr@ zM=Mxwz+MVA8gO?7TMYOmZ6P=uZB~$W=Fk0ez+vH4<>!2B=Vc`i*KuPF1m<6@SuFFu zt;#zq!0b}#-t|zNYGcveA6>-OQ@YGXm#T%Ev$7ZegDbWg2o9yY<6~i!XUFG&Fg{17aiMhl=$*IqINF%7wd@{3mf<|b zgug**%aDw1n)q!kxf{c4vbQMfDCi{JAXj4$UxkIf3d?!)uM(zShb#{BI>gA7S~ZJR zTl!GodY{m}Kmt)imB(v|ha=pyjJ)EJinFD%M6l+3w&%~KQXFBO8fl((F4ShyT8;D> zL_tuM`jlB)!RfG@{Dk)?CJ~_aOn7w)){q{!_@#pN2E0zeLIchR2q%9BN>w}K3Sj&n zN7<;NzNIQ$ulPZ7#Lcx+t|S)!PNm%}+?oT`F{tfXA~hoFX5F;sfjA`M81UB;7Aop1 zcwz`5v>|(f2sCw5gpZv38>FY52ewNF8}%Bx8zRqAE8m(C^Ssm7r!~_EwbYu zKu-CqfHD^JcaDNOP{KQ1L17WKCn#7Vq9nW{1k`k)mwuL2a8GSkXqtV43B*PH^#|?5 zHDB2*Y^b@ncgWZO5>$eV;t|*b!|js4_*yd#bxE9ms-@6bXbaTEK!nZk>axF}O#8OAhzh$uXwAlxW<28Rqj0dXYn+#Ng&QxEg~y-@(-zT-d=C z8eD(il2Jb+Ycc2poTk=xc5`snfOcS-tum|?{Vl+$DZS_*#GalB5dKbnnS&Fn!32&* ze6rN7O;kGVtYOS{aMm<({Ju8$B)l`w&M}vKr<=8zL)k=gvbnGmyF@Ko&-4p9*|5pM zqhCukBb)gZyDXo=0=r{lng#3~uN5#>Ep6iYZTE9N6JF;xQrb?SKg8rDezjeoHc9x+ z@AQE7=!+EuC&ATCy936~+>Elx>DkTXOEv5nWu(uQRuU!7c}<2eeEiPvaao$~=SX+l zTcIu*_S*9`K&rXIQDA?yKTyzesLDa(JsW@G9|R086rMv;5!VT_cYJ@=bji-z{i>LH z(Pgdl(6HSb2{OX@W2i701>8rgF2NZ%s@O-XO8ly`3x} z)F=&Y+A}Mlagord)p|4QGM~dZpIExt5e9WBClX7$0WVar(SV~BY%$n57^s%b)wdyv_+QnO+SV>b$(9R~TpACc}| zPukC4P726qN|)=UKjnYvp(>l^3Jl?DBP!cpe9Swu$ z#ZC@;+3}T3A=_DXSo+ec8!1J+=a)TvtDdHde!W!G3KM3(pi}#a=093r?U?fQuaK1g zoWY|ylQICbQKw*wl6ATr^*44i+ZB~xg z+Pk-2Dv09=UMe4tYiOofDcHES z>3M`yimxqDKPb@7Q?!}1R%OmRgT!m4KE*I*amLL_CC0f}O-p+wyzvUwksfN#P_V&( z`3g=k;86fmLd+E8JF2A4xC3xQOt?Yu%{UW}#KUz+CL18_N%yFrqZN%Hw>4`T*1b*P0vs6CQa!qky&qBpo@B1vj!CDp>3|9#bTSG z8Vm`kp7jBZy@UbtWSkpNT3#YA+7YVSUyW1<0Ng`C9Vp>_WwcK6sNF6gY zm_BaT*^G_Mk2^RUp6_*VdB&-4aBzhNx4^-b7~I7UF2~@;0_P-{ zH3oeW(LsXQXwbPtf8-~G1;#mccL-M++-H~{D^Q)mz2o2-4DLk-H^tzd0xo&IKa91r 
z`<R9h=N!NHXn`#HqHStI(9gR^Gz=#U^e zwYKyg%))BK8q~`U&RW&eG#?ws65hjsHU0Yqc~n@J%SG#%V%jU6^M;*k&jJT^rExvm z`XxkDH?j}?ZUN?UyGU~jGC=1vtos4bd z-Gt^889$H;vuY>2*G>}|Q-pa-qJRG0cTiMZHII~pe=)PH_Ud> z;b)MkV$(o#7Cl?gqPa_@ciH(?b2e`pi6*Jb{*fn0M$pct4(-tHn*n?r=bD}9vL>*X zW5`K5T|Pk_#Wk@?(dAuIQLQu*-Y-ri0X1Mdt?|<-HMc-HK>{>(gZM}4{QC`C0+x={ zhB7;r;>pr5;XSnxIYIatJ0rIGAX9ypRO8KrREPRTOgulomTV=wi$?fwFYs9aH1 z{vFcgW$zO0B9&2nkRegM4Dq3p+Lzp^$7+d*MAt;3uR&T3I+CT_2d*RWd!&2Sx&jO? z9|y0)b!b&H#=^Y`xM=>sVQV}!9!PWsz~qa2fKEF-E-xVM{EuMi)Z>0Bc4fH zyc@+Za<7MGo8PszV;!xD_8hpVMgLjQ;-U$}R6YhawKIpQ6;n+y?cbUP=@hX@=+GA$ z0!5s@PULOspokmvj$fPVQ!0t-je#N>sDW55G3gX>zpNbhNOx6)e54}2=^iNJX-shQ zyNU1%m^d7-zi@JR<-$2A^_V>;;PXl}%wEyUuX4P!NUB1E)@9c=;D(^Lw848nsga6GwZpt;zWhX%#M>DnF^4DjOfWHxll zSmCIAx2o{Qtmb=a()astC>cw5GtC%$c+q$5u_hV&O6;{(KqUUe#`8a^(VnYy4$n4i zUsl@|5U*lLTkG1-NSBwrn0ArMXmukeF;Tram|eldYKciVyXzUNN6c?y)clw0{64 zyZRN%(oU{%Ld-ex=oZLTqC2O05m=w)uUd=CHoYcQ*;_^ST5V$<)RnosS?5wJpud%B z7mcDEJm;ohi;{Nqa6EHk|JXwWR1^4ho+wg!fL`wLc@MJMqONLrB(9yR%8<0D zRn=9fpaXvDsKhSrr?FE!|MEEd;@T4Ko)gXuU)9eg;!6(Yryboap^) zsbch&z*qe;Y&#u^E41B`>YL#jF+BPv7qJeZNw&dF;zBB`o)+W=-Zbu+KtwE}GLQV; z1ZUl)K%Cxl1kJnJMp0{=BSV*@HZr==T-n|sR|)ny&N0h9`hiSv9Tz+^; z4!l{=95|P|+<06iYB%t~IasqT&vV~I#t&fq}2pNrR zLOl0q-5Fj)#NXpeR~jdOwi@XldAy5110CHfd2U1QMK5WO$Z^8C_H z6Y!ITUA}f`E%$?QzkARk>dyazXH{s457)BN@wbHQLE z@PywUuT>fPJIgMrYik&&_9CTtet5ACxebvy-TZAK zYwcuz6=_|nfRE%t_Ua1i%e1u&|F%Qh>!j_~Y#XnYF7axC%xZw;3-4Iz`vZ!)3q63y z(=LTl$t^-34bkEzAzn+BV@NulS=+vbS}Bdaje7YIavQFT=xx+kGs2Br z7vmG_Qb)+s5$ZU?u`+_bSCEU(HOb#|W@Ih0-d+DB$CVq8i7WV#3zd3J!fOVm`69)M zCPDN~;_UM@!gRu0N8E|hPGx(Xh$9stto%CYrTop z6?rWV%zL%6dlcmtI>h*h(@-{dfD za)D{GKbnB%x&NZYXD_~vdNe((bF?r~lL#eaS@c_imK^6z9JIn;hS{DrDPNJ=)1vPS zdfuZwO{Cm&C5GoW?_{A^Qhoft1NN;8O-m(d(W3*Bqm(4gSp>K7q)%#3i{2;bnWH`T zL5j$s(?x(VID@*5-r>iQ6rDWJ>^YwQ&9$okI}pS>QYkx!o+z@_>I91mHSl)V%%{~< zIq5|i@1;GS$i%MZ^T*K8V(|O}CmFdcZ6xLrY!=Ktv>I*)q^K_8xEd2aepEUZ5`%|L zgrT@HBzwFrPX+ZHd7Jk_Jv;N=)MGv-W5Ui)lpP@ho2z+3dg(X`Z;^sBI%-P+CLidD 
z&PGx@fw8{r!|03y&FQ_&qHQ;=L%Y+^4ksh`H9*t6;t>h31Estw=;i2x3h^Qx)>MR$ zsz7vUs@7sE{&6%b3=P~19~Cfc?Pd!TW|^8$pweGGfeS%$A1iiPX58l&>OcWmOrvG9|GwhAw``r3g)YGH?AVULBP;Rj>k z2m6K}j)v<%w9?Gf8{jF{wdEt2X}B)?rW>FWs|Tx|I1MpSkKK4L0GgS&(Hgc&GM`0) z=35G)Oa@C%b41V^EZXY0zn~L9i&5Ai4*(ipq7x#MrpgZ6M(f9`c3%WlcIfxDs>P|Y z!~a>UJg3UC@Q73~jD?uC#`Bx5RF+@o{KM$EM~TZTez1WxQYqG%&EO0d(0XZxYY>j0 z)_XE78$O}++N|fIp#RHaC)Jf&FU_#84*Gwk<4@6gZPs&G(En!bU!e8UELJOe(eaG~`~IHea)FM1 zK&8q8y?{uQAfV?bf52Pc^vvYX;qdOAR^7sbXfUGu{K1!#K*DQX#`7y&L2V7LpiT}y zxp0_V6E$)Qg7xnz&_*2dtAv(@$SG2V|-TpWNGm`s_Hj=dp?F@uhNEjS@re zj)m{m<5P#s{vg9UtA6_J83q0sY?DM=Sp+_N$Z*lI`xdEEE|QBmqCEz7`Y%T#8KVqB zcBFB;RO61%93VSAuhO1_*0taJmmQdn(r7f78@}~f$#w*<`bp0e5>b6$r^{!qkTnPOajh3uJbx>#Lf>1)EkK~X-(h5$i;zlq zn|~^x_QZRE3Kp2!RRqoQVKbwKLxF=4+7(cY@RtvAI2p&^sNW$C{dC1{EGh*p8;c8_ zUeJITjoc&Sv?)Uk-UiSLgziI93td|&$w+K6BE ze~(;o3flSnTjV zO{TtUhI*TI{{43{esbSEG|j-15jbfuDd|R3Ht)vlL(n##e=HT9YlgZDX!#n&EzP)| z{4%QyS9&;CpoUORe}H9pC3a39y{`s+19XAW>dk16Q^B}pB#7scSforzg@!w+mvFZR zbi7lbD_e9x;&BqX7aPXKzI@K5qU#16+^Q1=?p_6(@T%<^*% zuHNAO!0tCl zoOis~f*g3Mzmc;7UYEQ_8sYnV z@8fibmw-r#Bx$g#U+ZM+7D=!vcC!M(h{hnzY5i?56z|G95~AH~ew*Vw zN``}hL{3BwGj4^BTchI&HpOmE)p4a>?B*97x065alwjO3I<7Xw3#DpCLK2hvCxA8H zpD~<)MKGxr4}u$^^_RgBI4{68r&SGouz(0aNcYXPY%WoO(#P0--I{RX&*9a zXdefcYj8asT!F!5Ik+hX_YsD%d{!FVTMn+y;GTDI4F>m^gKILldmLQIq^nT}muqn4 z4z9r9raHJO1~@C$gWKD|H5uGDm{%)r$RsZO{f^`+Y-?53b{%__ z=m&uI7rrT`Lt_t|Oa|*=fk8h7v|0ED`h6-*Ot3Q1rh5={Jl5*qs}(ZdXv)!+5_?uz-UBY}CHF1aaiX zBE?T{M+^fosTUcxZG{XMkd1i$BbTZSR|!}2xhX@pQwLFRjb1$yjK-0Fiful=)D2Xll?yjBZ2lH(C<)Zt;(w4RtIPGa4m5|w68Lk z@Gf)O*`d#KaCZFRzDcCd&AEtm?YfyDHVpX>{6#Bn)k!Kc)9|3QtL7;9PG$V$51!$wl}e;Z z!hs}MlYfQ!c(v{I2>G!M@ZP@EmV9eb@x5LI8w>&B9wAg$H|Va?F#OR{5DH72 z8J_T_Wq8Ny%J94BZNx3_wxR0*f5ROo29k?=*vbRnBB{b%m?Bw@ z@io`{u{(Tu%gE^Kw5>*rFq+@J@@{e!824kdwT>F})F7O^OY28Q|H6)UNXO>p62FU` zhUMTO2gt#~S-=$6D!zDc|u(1kVG0fgefp@Y^~3L70^Yu$Cuh$mqoFN*nKw+zN4`8)AZK z85V`r;1!A;n1v-5Lm{!Q1+Sujet(PO8^b`=_8c-1E&r5MViP}QxA)5iQwie4DZgF2f)qlv6M&q+kwHsEdmS9P0}oiI;? 
zV1hV9z03?n7k(WrZ;ge&{!zGf^12$TB~T$19e<5&* z-2X!G`V^3e7D*IzH>6TWh!u1X`Wl;P#RRuRqJ`Ztu(nP!H_}9o0FQz;s6Nb5v@jP| zVEf1{u!c%GXm<#*3w5|Mqh+O~`CWA~cDyi8c|h=?RTgxoH^^rOB(N4`WYM}_qEmjZ zYRH8KyDpz9hT`25w834Qh$1s(Y)wbN{cB5VnP{>f*l&dx-!*qH zer=j23OdZ)zl(J4-U-@#kGYe$L=*8tVIXf+gUs5o!7Pd@4{~PF*N=&>*(=2EWae73 zSID%r>g=_c*`u+6jyO{?F7mL7Gsr3oDOE3NRS>3>ukRj5>C<8P9K8}-bCb~<>1&l8 z=eMZJG&2L>>uKxYE;Oo*cINPTx9@0suXQQ3-SW#cbiZ4&bQ(PG=8PIVMXTCr(5t=? z7vwTa(HyS=K!P6L!Q#EiY!|9-D9`c6;iDvj2z#MuC#b}a+lv;qqI2cddqnUF>bN%b z#ZI*<`uZgJ`bLD@2K=7WN_c$z8S*ed80jOPv_wx_{72&Dc(!}YN|*wgzRaYFUoiuR z9~@p;j$fwJhrT+)G`!U@vRBo5SeZl#@o`W|^8k?F@2_eFUNrnZmiYQm**-$MFTQa% zibRfIewh=S{5}#KVd`VdSah?^JOuSuPNMGF=2E-C4>s`!Ll|l!QT?&E=<*PaBU;-k zUF0iE@rAlfZbwJsGnLcBv-`+X=GlG5RA%>?Sd-mna$R?@6 zMD;=V#Gn8X;ZEjF0`pq|)xA-PdtkQ4c}0uZ>vCZ~_ACj36Ux+vM>7Tzj8~t)0@GIt z#l{qg7clT!cv5{zr%1e3n#7iy{@ta+D-m9+UdurUo2Ut$QIZ^0$J z{NA09;Kk{Vezs&i}?~=?!@5-IfyL2aY z7r6AD&wT7J(^y+}La#SnAj;qabw#>s35;wr_GhIL>@g*hl7;DtfsT?Dp)AOOrpx-L zQe$!}6Z(vBO46`4pZK=A>78Stci9p z1mQ^Ku)cI6F)$%rD`e|o@@xA=6>4)Zeyj?emeW-o5S?$+CcFu~_z8xg9!3%Zv-=zvT&vw$4QfMKu26L2fBa&=1kk7(hQ#RT0xeEu}4Oc?a>Ot;``K7ogS{dFe;&M9YGpVp%AGktuzS zl~VHq)KEydv=d?&R7c9dAcU@Yw`(Lz$BGu_B(t~>vKn*s3?@|Mx}A@V&a&v^!2!>4 zOsiXjTVH2I5C(7VqE;ln9wW;a1uHlLAfB!o9sQUCOhQ|E=1C0Tc0l@@1HxV}k{BJ! 
zwd3?7H*1=@j*RcgP}CqQ*$u(!a>2{>x13oBfwYyy#Eye4tgGxZ(fTI(5h~M8X zKwZ?#$=HekbXtYtJpe~XlX$Fbbo2w(pH2#(Em}tC87Rv=$HV(<;x|~O@IKJ>TyA`G zcoM#5mfsr=DGPl>29&XDIRe-JV4;_# zQ9`thwD`GYD5x>ArnPHuYLMeEq0*LtSLWwh2$o?_|EW~@*|Z0OAyTsiS|0crMT zbY*;xyx%=zE^^*SRHaFy(Pc%Ih$p`Jid>mHDx+WsG#Xi6Ae#9)p4WJ8NZVyx3Xlc)o|1%L;I&znbHj0D17&P ztighoVLndu6`DGi-*Y|=Of%E|oci`~g38ho(=}L+kus#LR5YHVkcFB400YOpnO&zb zZ&4e)-Xb#9ZfO*~>~f7bKvfHYF5pv6OSXC4`7t=3OOrJ#3pvuoXW{I0EPR>Df}E~o z(MZ{v~gaLXKHQ znz1A%AyH=VG=hWIl@AxOAZVWUTtlK`5c$MoM1k3`2_DAF4$rQQ zcM&No%a|se2MLp>fvXft9QRLl22DI~K+Zd^EKUvpgYn~iUJ6ScM;Z)=Luzi<5B~%k zV*8)1*-0!-cmu_8U6l!jwX!eh@jSR^AORfwu3O1floUAk;L+s0&fY*hO8VNBvb%KC z^M=ejBd*-{7~4%WpfrzQqkpTFEiVJu>x+6>=e=W`q2} z#thsavr0AfK6eKKVz<-g>@8`~!21@G&QCx@R`LI#`(=t@TsrXU!{WBEhLroQ4cgJ3 zj&lq~JUxGplS!e!7pDfyn6;EKVn z;su-0_Vk7t-SHi0k^u0hu;k~=K~6@zPK<&34t7D7r|5?j-2ilh$#s^JN{ej0B#CF1 zu9|L$#P{qZ1pE!&mTzG#_&IrF`NF|X#3A0s!x$X5{UU1~2IVN5&8Dkd2S%MKPV?IgxDFPPRQM3Fsdpoj4H)JESj>#@HsS zn6zsX$WHU3W;|YK;d7HKx9rd`8g^(^^l=P3=yD7o*PVx>^~Q9qMe$9THaRB{W=p@J zZpbV|qRKTpBvyxSLX}1HHGHq+3iep%wMq%zv0l_pULWL7Etv;7MD4tx#OON4-0+aY%Mc$YMyN%lfl-=ZO zG8Au?@Lb@>a6_}0`4J3MtuK;)ICH@P+fp8Ty5&vNaqTJR{ z4cL&R#1hk9{I-3EBEN&bx(>n+o53w~iP?<8JZR=T)zmROY`EQh%8$R|r0^^X1IY;y zdf`n?W?5jg`6NPDl%bKOLu*t${T6QHeX~Gj)vKn7>*BjgtX)n$+r>5v62!H9Y;8f+ zi&QhTrqD#vq8qD0Z7D+Fdxad^OD6)xWNsxs zAkjJ1bdN^&l+5WmoU5A2`=%TpccRMlLiGd1V6T_w8=)SjTUG|sUTOC+*nMgU9;;QD zrZKE;gfVi;&=YT3XdOM)3zhX0oyKV#BHt~jZO*Z6Vcr}9iVrvWTFw=_g5~)K?UP9| zgMQ6b_7pVnDP)ppqOz7wBA#!9le7|5YF!429zpF7mgowS+_^+oqlu9yWfc$`L|4sr z(%6&a=g>EPITuNi+0njUf{%2{_@rEvGEm|l#qL+&32ea3?pfY8`wN-F9 zfJj(?PSB_c0kX;Bc!s(tc9*6=OCUFY=q2?9KP89-nb4sqM|? z7bQF0HJq3=|RpQVpOxUPqxV+ zdx))psb6aPG353kMFdslaLd0)QI{Z<0yN4uN~m)A!n01PF%f2sG(wL~Aymq4_CoYD zi;{H}-ZZ7`K(v3=!lqE!abO0d7)x2=`w--uhhj6f`W|!)(9JJ6eB=sso|Z8O@~@!8 z-R7Y&zN!+Jr5#qSD@iBIr{m{4KY+tE={bDnhj5sMgY66)?ujag!*=_~3O!TD&Kf47 zklg^QQBjC6$(qYWbi-<{bj6WHc^tb17F_f2h zLynSgzcM=d4=?{RoU856HqpNkej2rS)R&zvoeO*8lB|dE8b3J$dgcj_rmq3av8NG? 
zkAqS}h??&6G6#%5U@_rc?}2ikbv#OAwimFir0>%8ECFF} z@~fz{K{26=5K3OsaCs7>sY{0TDlg`Hr~M=pyTU$X~-)R74Sa;lt%9+-T)daEz(K%Sr{$Bk4k4}TnY`V+w5 ziBcoAgM+7f@vR`*CcF|s778-9d7&H?w| zgcCnuc^P29CL!>`8&-dvqU=6VeRhEh$iM)bfe@(@H_kXdiw znY#<2So=7dG6NhLicm}Dxm=)x9Pu#2%Mes$mi8Fbrb@f1@rOzqa!2@H((WOnLIR0H zKUms(&^?2+cl*51BWFSpOA$C~pwAjk?pi+x11&M&Ib{XY7X}JXz45>o!6K8$)FUs5UGJSL<+) zydnH3Z5=0qe8f}GpR-eZ;-d~~D7dO=p0^?VTo#4_4Y4a3W|(|M9)+5ztm6z>UiE4) zGik>f*j~(ApQAa%&?^Mq>r`Pk<9rfG|Tz#jW2Po_9_544~ zahpZY*G%c4=O0neyG>I)-z25zc^y9ndiH%9Yq`HV-D{u z1lD;^NGUWueoW0ud;5~;;tB2(-ZastZzK~0Y`zzI1V@b^(mq+0GW?vOcCCVnnx&cDa|BdAG)xC>}raikoL>0XYhdmi>hEqKBv zdqqP8FYnstk}}TT6*3B4NJsrjf+r6={$cbge8?dZCPvT_?rYgmKs;xL&w3Hnh6!n(ltI(w~y&W;zJXz3|B zo63epMMLm8MQ444is)E3r#$7&$24q zn%n$W^h?c4P1Bd&r@12|MALK`lfA?sa12Kp8b z@hls5ZtQsqM!A~D*8!DOvAGXKoBJ>5NJ3PX@IGRe7HdBXrgG^gB9yPwJa;L~D3H|NUje)9`#skuj|Nz{`Bdr*@VqG&t9 z`VG)K!}>lKvJ`h4!H>ZLZQTibL2i-Lg-+#953fK%sBcjH4Tg?{>b{~se>oq)OF4Z2 zeh=1|RxhdmUT$-J!26l!b>RI0#6Rz)^Og5=MFtrq#*Yr~c7Bal^T|QqR8s9w7JLm8 zEo4HL%O+K|tTYF#clJgpbE^9~E*i8p9JIXK8LvB8SK~IPd=EJjOU5;E;tGi=ZGm0l zzlmsC4jZKpVs%u#@d2~e<|+6WEkank%8roFXz@$AVC1x;gXnrW86F&NYM*PAAE152 z;af<3qV&aEEsJY9oYJNEHIF|#9?AIrV)VHZ7Wm!zJR_|>H?94@qR)50BK&wK`Yi9D z&)vVbJ_rAY`fML7G#VN0-m)eoZl=_|fEaCKYH$JtACNpteBrci&O=PoBv#Ny)y%G8 zj5l|?NTr_RIk30rIU*BYOcnrM5GA}D-%EK&^MH57=Riir_$yA9f-|h%90W^C^KkaO zc|4k$c2A(4Uh|}6Z(D)(x}7T|y_=;Jvg`TL9)_i^K1Mg62e+B+e7xcv$F5Xo&RX<> z#2w_fet!nE-tBYLL*GmhbG7v1YX(ihqoRKzk_#!rYLJ1F-b~P2iMgo9#n3W@hdR6~ zz7%a&A|=hGZuV~=edti)o=nTiNW3Cf+QbTbX^Df|*qEi59&i2xyx58FRcPd%EsD2} zR)l%Ea31QBRY_RKc#?@;qHF=gk;k`Ij5>lZ3W#X7&D$y;xwBg)V`<+W`s97RqQ}%Z zRaXgF{^-IOR6`Q&vDl6lE9iwI^DM2R0B9Oq6AmU}T83?NcS-Z!)u zEJoFk)+|P=__5t?oGjvasCF&C=(jK@qaL{rl)j&^B0W(5xn{xVKTdXq(8LiQGZ z+^NT`$0I_1tWMEdFL2nd#_^bc%6m@unH7}UDy_$S@>+_R?@6zA9`iPlyXZ0h5KVu7 zj~NCV`<@V+o~rOe2oj zy#Jxaz+=YKqcnY6!jGvQ<1FQNV*4T|kT-!kF7m;0g8Txf)Xs&*L3otGxzP9@iV~dT zJp(W>?aMwDKAZ~;d%nUDj-00bQNEvYe7}?YpJ6m(wIs|mOJ1ch_c*!!M!=koJ*#@h 
z3h1AXJqeV}+S{!nj|SAgyGE3X-Gi)663(7ap-f&{q01KYoB()8xi9-uO6wHg>xFMx=3JZMEG1$GORQ*z_jK;O5qb`1zLi)QHxih7;LsN6} zO?W8H<~`Kv_exWLdUnUnrO6}4x)QYGO-DQyE#wr5Z@HFr^_Gr;T{A70$Fmb&uhD+= zw6iP*&ik1kXwrXya8}c#1v>s|Z=P?@_KdVI)+YB(_ZsXFBon}bAVzrVNO~Y2^VTil zoYt)7zkulGAdL6&shYeng2l^u9$>J5ke@$FjHtrzL{e)r{Lzv97qfF5j)VIDM1-p% zI~Bn`CqgkGb#TTw2%6MqM1VHWh&=G5`;5rmP>%bI$Q`IP&xo9f85LPmU)b!RD|u>O zULQ;(?ANH6Wi1oq~u2@5$FjuSw)Ecqooz+1sdXO_`?IK;4 zpE1}KKUfg-xedSxZ$6(r4pxCt0^+$1d(jzgl3y9Mk8eE1H)AzjPK{f-G=U%9_NK(g zzQRaqYvaF@K`k)n-o820NgN&@P37_W@8I$CAI{?p;jv>V?08Ukh1;@y$Ptb9!cmJrEyX;y(>S z%NXQ)pM0SR&oh|oN(>r^(fQ@3cm^g+_h14i?~rb!tMCl}P48gXD839zGDDgza+l^F+kDGMP}Vecj>LO()>fUg4|Ah{W9}~3Vj;{) zIkUI3EMC*m$OZ(n-r)g$CVGCnOj5$Tgdf}A!F05%lK(gCYQ^6vm}^&~_#n-$ z+Lb?<3}tD&n-5^grg$^%R-D7=1)p0spks0C0eqG^?O}BJFw|(S0%{T`Ud7Yg-Zz{| z%vJRnEM=}%@uPE~3-;3QX8bGzo}!Byr}}h%d9YZUzqsaX4rLyGoNQ^<_aVmh^Gw@T zS15^+D`Cw}eOilu31ZrSZAlpND{(gGWO~-p$fW!kenGZ{=hs%z{f2n zMZiiNu3=u=37C>F87Zzs%k=WicWcthw^`oAu=15S@?G$dBVXFSM}WG!(|hC;Or6|v z;#*UiI`K{?tAYPAwnCQ&pK2+l%RI!a#^TfwmImcC@svHnGs6Ry5hs`R8*rMJva0)T zYxGiphyc^&%Y^b4uQlVp%juxRVW9BsGcS`SGd9F$`{hoFa0|wRTkvRP@xS@eXn5@3 zqtct`@$n~X&2j(LdVJeTBU34S2ZMegl6^ZihAzkdigMBJACsRFnVI_1PZcTnI`!aoSxLX6H^Ixl# z$4be&$LuQbwzSrUTmm zT+0HQax6FiPyNfcrs%3{yq$jaz0Ob=W_lqaz+7=IEB2l6D)_S&cKf|SUx!)w0C8eU zR}8|kjFl}ox#^Hs$O37D_pz(2@dg=Fs>A{BD_42<$NH`EIt44d!`@``44oJ?S~=w!cm?CI)278U(G$l zFD07Dg47ugjM+p?GLysf@4NW^X~5Kg417NvH0g%8-d@xH8Q-5lD#LgCsfP(9yleSI z3U+l#cqIgJ`gB`^neR07X0tFvhu{tMo_JJ~If{H|nrJLW440e$b+`;BOO6bFA5uF6 z1ll2*`9ZqLe=qoxeA1ZQq%^SMlu`U|r`<*ghAWOA8)2v>0}~5N%jxAEIDM@nr&n;L z(;z6E$}cDGjMH#CPXFjpv{X*-R2(@ysvW1+QQ*hzb|Df0I&RJpr2Xu_gEw*wOn8S; z24}T7Y#^116w9nJ_k(4VBe}WuA?+9tA3%&iZs)9TYL%vLqt;~~az-dL?7WfDH;0`& zGWt1=Afw+5n=m?lWT+$*niz>6*B#~f(S81o7}VW+U;x-z^G)Zn1N;9q`w-mvd^tM$ z`5DnqhPCzB8mU^B6&d_?`G=Uz?iWtxA?mP;h83+tyLFTb{aM4$|NZ-~2L8X)KvoC~ zF8}72&6^cEa=HSz8kk#FTsn8^?A&l^>Acc@xfADJHh13Tb90B6jL4lmZ(8~MA-U2t zQ@inF3xjq9E}1pEICs|k+~PSUWmkmyhK>vM9#?$HaiL>}%`U^g;|q@q^$DRN=;jRx 
z4GHxN?H(#DopDW6}KdvUQPR-)8GXHJD!(+|xpu9#L_QZ{Se+##U_ z#nZ~>l@1wFG^PkF&W5szX3d>nTv{fC!PBg1Mf1x_XU&}sX1we`Sj4o@H-KLzC>*p;nES%A&|vHONXbvVf>Ls@ZFTie|z5611z zdX!1rlW#|PEgp9q)FTx7Cthutg0i>*u@dEBc;I6t%8j)+F+lm@oo#Kcr-wpc+y#Dy zheCV)3hltx!6@rd9*43K?o>P1u9 zCX{6;e~z5>Zj|1Gm>-m#f790XKFVU0d+rws9sgVC5oHC+LX;1pEJ4|TvIZr8OST?m z4t@*kb(F`V{1jybWsjePLRX_4gmNj$pP}4}vJ&Maq-}L5Yfv_zyc=Z`%Ez%TcJCDm zy^69o$}dqyQ1Z9I%TSK}BltyGit-thOHpn``8dk2QNE7yp#5Q2C{IDT^bzO{ z>@moTax}^w2Zlnoq8y0wX_OOC7T|QV0_E)}??$;AclF z3Y2G{T#s@(${i>dqx=@-gDCfS5`3d9L+PQsA7%HShC+{^?2U32$_UCMmbbOdMmY}U zjVR}$d>G{oC|^Li8D$H~4^ZYDjQLxEc}6)3Wg*HklqD!1U)k2S6lDX-l_;k_g?U0* ziZXOaD0Dr_T$B%^EI|1V$|)$jV?$Gkk{g>kls`k+fbu4kO(>6g8hq!5LT91ujq(nZ zqfxFwIUD6(&%nM=o{F*_Nt_LHRDqUWbN4J)VVqp?nbKB$OLbR-pU<&!Bu6WfRK3p$r|4 z_|O2kQRbp7KzT08DJXldh1`Tu)}fqW-{)gu#c|uqW0nt)i8UYm*F$sH%%90BN14M#K6}^Py0+DP@ z0s_^RR*G0r(fV_#R;df3Ma88^)hb$wsJNqIwJj>*S{GEL)|&Tw=A60D+~+>e#au2} z{x8o5a-Mt6nSJKWnKNhRS&Q(1^8h~tyc)2>;YwBf73~FhBVZZe%YY4lTL6~;4q68} z1$Y+VM!*`t&jHs1b~!?+uL1i3c7GiG6mT?PGvIQ-s{vmITn)Gta1&tWdej?WC%~>p zq8$K-0NxHb8E`e=xqx|3Am4z)0M`JX4Y(O_383nQaSoUdxD{|H;E6EFOaW{FoDaAZ za4FymfNKB`eG1_Livcm2tFr*}0WSd@3b+Pv3SbAA{N@7|0WJkB1zZDo-_sZ;fCoQ| zetxu4eE!hk~ne+D=ia4F!qfKLKm2lz4ILxA}&Azi=| z0lx+u3wZFc;0GK6xDIeK;8wtM0o%Te@PJ1EJ_I-d@O8juz^?(<10LK5?Fl#punk;n zCj(v$xB&20z?%UlZh~9@tONWI@OHos$3cF-iuoMyO2AUUn*nP89|l|u_!Zy^zzu&x zc)+cI9|C?3*r5RO_BDhDJQT1L@P~jk^!s(phk%0s?*p6wxDjv;;OBtr0lO9|b@)Hf z&Vc;@rvR1#&IhakTuQ(H+0wEe@NB?$0T%$~6e0hBy#T)gECoF64fJoo3cxD>e+qaX z;Dvx20p9@J0oe9Uq}vzy1RMld3OEVyXMl46uList@E*X`fSUj}0cO92@PPS%UHd`q z0}cW_18@@Hg@AJaZw9;?@L|B!fLj1J0e0Sk@PI{tUHdC_Jm4U}lK>|HP6M0+I3Mt8 zz{dbr1HKNp2{89wtp*?*yC#_zd72z<&Z>4cKoh!UO&ea0?*qlV=S? 
zeF63aZ2u1CA;4z92EbneE&;p_a3$alz>R=Myo>e%917TF5a@t|0G|My1o#=?9Khc1 zAzi>90~-@Q;x0kD%uO4gw4VP6CVo z&H=0gyc+PgfU5x?0Ne!lcfcKhI{>>5Q7Y?Wga^z6oCMexa1P*Dz^ef#1Fi;K1h@%s zIp7Y!4S-#bhu#P{2=L-hP;S6$0p|nW2DlWk+ozCcfc*eB1C9e!CtyAY%m@4ma42Ax zZ3qw82XH>%*?>y{9|BwhnDrUL19k>fCn7(eBRt?nz@dQTYdQt+Xu$b^H*80Ez&ilf z06qw~8Sn`}H5B>@U_M~yFA*NF2jCRI8G!QvF9Tc(cmv=Xz&imq13m_*hM`@lFjQ>-PXla2ck3IhU z(7C`t{P~BWYz|&>^G4=&9n-P>IXUyyiQRs5bl;xcDG~}_jQ=|hM;Qnq4~3nK|IHV- zv~(uiu-v?b86)<~9*Yd&C(+Nv|7oCi1dP$gxb*9QJ_Yn4HvJrz{t)m#0Uc9|8-BV= ze;w%aKyPo;N4oT{fu8~T7@M9E)(i&&cW_9l%RxsMapRxoh93g@-JoM?a_KW%I-x`J z%1U)hw7E7qRjGrxJy5TpQ5|28@YHl!|Lo_+5k5b2E~244EJYfB22W5Ot^s`|=m&x~ zR);6MdDsm4GSGEnnKn4nrK=2#U(lgexZ$f@dOqm)BuRfL=r@CYkRAS1H~bXPmx7Mz z#ErksrOyZbz9iw7f==ZNihm90E0WMRgHGiO3a>Ib|FiAYI5*#GJdsYi3Wzz;|%m80dI?F+?N2s2Br75#YgqJ!>xa+*PD-QY59!m z|7DAB)Z)zZvohWxPKs|O`09H>pQCgixA?{oU*?&VGWCZ7$ifW)?M?JD&|e3A2Hs9DRzW>+)y8Mt)37%VAWm3$1+U{AM0NHA(TTMm)m<+JoquKtBO= zE$dAkPSeTg@-9U>9eQJa17FrKD;=umawnGG{A}niN#c1I@$@Xjd<4F%Q$6uKl^rWr z9@5!3u%%@g@|Ja_#YgRRotv*g;F~b0rR6Eg*Uv3JZij~`XIMfh%J&z?x3pZ@7JTb0 zzS4^`7iXRCWd5pctPER_w_gXe3Dx-u#C77RmX^kLh--+a??&6{cxaB<2)+wXhVBhv zob|rNS9WRUC0Q3ci?h#flNsr(`6=Ekr2BMbOUs{8M_G#?0Zlt_S?#g`XEaO#q6;OhP>01AQu`_kUJA)Q=`{dikJjLOdsf?@7GJ>W6sNxMlkqJYRyR z#OBE;)(kX;=zGTbOZ>+W(T9NE8}uNVJsI>~0rY%??}_w#f=+W|)~i;#Tl3>m#L+Jx z4$8wC&4~i@b$3MJj>0~BGB&y{cxMk(}EtS%R#>n^lmo&6gT{O3E$nOTYBER zpsz&uLu|T?7b?d})X5#7--`FFUs>{$%JDlloq`VNPyg4_GRlr8W4LCZa+ZN!4th{N z8bCi4^q_Js0ev#)LFHU2;e+CT4)jR~?^jOB-zO3tlEdvo^<2Jupz`*^Iuqf`Y&z9} zwQlGKo-4saV=C4U!BdTB@`0KN`fWRjgZ8Cf4v2%=V<__Z9_UfLXJx{OWMn+$#p=sz z5YG`;`pai3?`F^oKo4p!x_$37(EY|jKEl&?Fe9KZzkJ!;;0zLGN*ue?FDS@Px|85*-7XHpicrFDY)UO;qjBoQ3m=n&@tRxy480YK%Wpm&qt6!$k+*> zSK~eF7^Gs_J)=q|LTS*R-2(6&i}x7aT6fZ3-sJ&lP|hd&?{ zpF9l0Vt-|lJkSR!9|k>WZloKa9|)jR9)=>1t3V%%_pD>=JdBRj1MNL-1`o39=Fw_P z+LL?+^rLOMrQ6V6L$eI}+PsTNFepVo!a|7a`cHIE_*`V)__gI^abHmeK z_gSC^$=(i~vBm@)8l@Y)!42OV^tqrPVbd#IdMW7hK@XbaYCvBAdVw8&rW<}S=vRPV zXwz$$PW40=_$>qdD!gZnI>yZ-jq!`!cs3%Q1F!WjXFkGy4*Ef$m*PEEPZS4FV|pBS 
zMJ0{^k6+)RI0k_}b|-O62G1<;_?3s+>s-(mg6`M9h)%u+OF;JMLGh?M^Dg;uM2M1V$er}eyAP($8P#n zppOJSDEuIauLeCx4r~E^SpYpBdCocj{SNfg?EF(5TDHnw;CVbC z4yvQkpi>!w`t}Ua*9C;9JS+nJzMbS@1$f>Gh=cO50rZ!X2O@47K>F%MW@P=t1(a2J}-w z_v@#mfAvHDib1c%d)B3h#f)dO?ufW)twvl6!Rx2z<|F(j&=-LI1m0uuP@jIpjpJ+Z z9JLgC2p%3UlR+;9eWD$n@?hEc=Y!{5@C4P_ zQqX@HkOt*(4d@M^Pqx#@I87Il=$k=b0eU~1F8V&v)%OuclJxUIUl%~nNAP!1hZ{hj zg7>U-v^J21)v`YI@Ad_yj4dxgPsk#Ut3fBYeCNhJ*XYFfc`1+ z*ks4gI=tR3p*V7|;2MfJg4&@M=p&P)F&gxKpifFnLrWM+qXs-Pz!NkdE(U!r=t1)# z`6~Sq^q@SF57Q#hgLLl?L2m}V#LlC&#^`{F_Il8N97pGgl=9yj^jAO+%6}>7?}8pQ z2h@PR1N0(0{#tiDE(X2bjV&!F*>qWWQu-@E9{_q_`9U87I*RG`Bg;ncp@a|0Uk4;I z65)gL*BkUnN%B_;`hq0+s{#Gj0r?|+hJ4K^{0y=Y-f);(2FQ%ev)D#Re3QYq_NL@~ zNJ_nqKUuc~&o^I#zWrwZ@=`i$kxmZ6UxoLqNPK>bzL^hNF$U&2;7g`Y&H&F1;Q1l= zV=`iv+xCk<{}bqbeT(u^hP*xr`uj~I=Xiut!SSlcr1J1WW;mUt-*OY7xXgFljUV8c&-Oe zP+q8BZw6iGrDJej-bFmw%l&m}6i4O!0DHBd55jw_UM$;WFVMSz9%OeK4f?^L`>hu! z{u!Vj0eaBdYZ2(jfbQ4+6n;7ALqG>`^LM)2f7XNWBhdZk01E#u=tqG*!45A9t}cH* z#uey(^-T0$pifPb{%D~Gt^Hp^b-Jx?Q2sW7zBvhf2k6g$ z4iWFxpXImE6&?A>06MkfX4Kz0(BHy))@z8x^as|fUZIMnemw>8{PGXM^*kT+bCb}Q zf<7B`zj2w5_}57IGQ7v~Kz&kdj+EEMDC=^>ab^g49dG58=8kfA?&tvKzpn_c)83$O z06nPANUA07&~+og?-^Zp~HutrlXWtc^{uYWD^q{#j>qw>t_3NIX{|EUCO1~KN zwn@^j0zE4UeF5lHe}Uzf@ImRX1-)~U@@)}%(7GrK9p}GD->(l+|LF<(2cQSlPci6Q zK@V(y&^LoV+-|?~-1c1ndPb7^xf%4Y5k9E@t_3|8;e+zOMZ%wAr=L-#GeG6bI!dWR zgdbwlG0q#oq00~YIMDNLy0r&aEc8w`-Lk7yfqo*w`=yVpsowb040<)*vpT{EWaOf; z`&enML|k_#$?J2VKa(V{p9tMAuauY7`16R#%cU?FCdtbXD7#;SH>lnxgZ@6~e*GdJ zX`KuD7SPYYdn_HAyDb03rQkVuW$<3t8qkjhJt&RMp!WnlSsHXEVser+=&VFF=!nYA zr?rn%40;pjgKfICk5mQvV$g%?X#wc-lF)AkeFo@$Wu*SO7WAn}!fye+4D^w9{xXK? 
z{*S3ptw6zMgFX}QS+`=dz_dBFg(Zi3V{7UG@cGHne1t6(`iFRri{njqtT%w?mv;x( zmo%i^cmD;TeWw{RY%kZA{15aM8HOUslWyC8iPGSe;f3mIxPnMp8@%#{#gb3GoYjD-1w)tePRLVe*@jGt|CF{`X;p!ADDzc)$xRiNJidaj+mwNJeObZW<- z`nehO<)8=E&sw1eMf6TK-O}TFgZ?1s z#Wo%7Lq|d={!-94gANtTO<(*)h+YHwd!Pr^-(t{TNkU%%`a014#&JI4-vIgppr3~K zSovuTTWh&b!1ED!N^Ksi59DE^tlRy;X%v8-3wp9NCW2=;c!JVs2E7jSxH%sAq-{B> zrz=3e0Cc~4qIqvJ$}k)B+wh)s2Bf+@2j8H(vzBWMz_-Kj4fE%t^s@Rx|5_c~?ma=D z33{OZ2l{N#gVr}ypkEDo7o-)-tH@tUe*x%=K@VDA-3m`}t-298P z!nwT`JEL<87G{sm9dv%1QMp6U&K)!?w_sRqui?4*!*jcWe|T<=%0tkJh-VVwAu(hg z`i1`;MmkEJ=~Sun0P(xy>CWFnYF57|Lh2V8jQU>HOPO08^<6I(Yvq)44ob{W8n>MYg&%t1mE190wm7y1{X7%vNh1;(6DZqhmVw*zsS5 za2(5dKcwyoIj@G)#*hQdrIPI*pX}V;PAv*8%Gl9XEz5Mi$Wi~wbl%8O7iKw6=cwDV z99&!eNS1R;j=ITl!1IdZywg@~bDS63s#V#}BW=}&Z0F9l>b-2|cWqTmwsT=y^?W-n zUsZ3w8S_I6_EYy}oSy+FR#eYx?bWs{=kfOHPsnt8^^)V<&|ZDvI2W~77iBwNwo~6^ z_XXz*Z5%koe$~c#sDpYq$64M%ZOn15>YzT&aTaz^7qoS@wO3E<$LZ`>;7nKL&M!jh zTqql;znPPqPjb`+A?Mom>ei5Raa;9l$hn}sdLYaBtex7F>+dDYz`0EbN2YKpG`*jEZ*PXcrpAQXlzUZj#4LSF9QBQJhd>(c{@)n%W|I2qo)0Ho_asa zxh+p!ntd?%AIWxBcT(H3d*l7FHV*u`ewE`qx4*h2$9cAs`XXmC-rsK9e^qC7Pdn$v z&XoB}I;(B%oUb~mi`qL|JE>3G6aR%BoQL*TOFKA!*k3)+!TIg}>Xi=8h5M^7J2+o- zR@d(5ywO=bxS#WMXZ22Qcko}F_hOcAmnTBEhn$Cds=tH|eeVZU2)y5#dFcFn^(*Ie zynp5#dR4w!n@#wCWjnX`RDa5GKF(Jk<~XnAt3R}L?#x$@w>$L8p6Z77&cegga~=nFm7x;Dtc$8DT13)GGr=dT6oinh+jJ=Nvy=>5)i&cegh z=k31tyTjXT-_Nr=QqH2bf9Xy+|l{q zD7C$#^U6``$~@;UN2z=AoV$)vFXTDb9;H6Yb1pbaUDC<d8*dQ@zyI zPR{+k)WXirO}*5Oot;a2sdZhteRZ_D=72XcknrlR&cBao^L$rl`7v!i?dn{1Oq#3fmBgAhU?mTs*`ttB@_aCWV zJ;M3yNcHg%-Trx``g3pR>EqOky$k+yoLX{hw_A==zv$y!SfH-zQ}D%cYGa>nZycvC zEkq8`4Cmo5{(lVvYRk1}hE5*u+~{P^aK6Y6T_18DbTVHHIh(RG=Vv&tIhogF^u+tS z8P2b=Gr!DmK5{Z2$#m|>&U`A1II`VdQ7 zZ@6|RANR|Ux=K>&K;3t+GgO_zv38)leaf8ERE;;4YYz`~>vcK~4u|I4ml=9F1D*4q znchZhHgQSwHfW6$xyFTCAevPYrZ7;pg2ZWVi6v4Hits5QVu^3wflEST@arZ^b49O zPjO}#{UZMNwFiXu8|s`Es%>|U`ju5q)IrBnoTku)p?1H~ZQ>T7<=R6+UAlFK!~FhN zg+j|hhdmxrFS$X{-wp{KFbRpC9~yR3M(9r&?f#OXo_Djg%i+-mCpcmCVj7{S0nEPBp4RnoIlu}^fRLkL%qj@ 
zSNW-MJWW5(^{xeeGsRE}7oGoynz$*nV6Szv?I)N_>+$wOpz-;W<({rG} zBLof*I8q>8y+zM7fwKiJ6nLe;Wdc_TTqp2lfm;P`7nohd@g6Af2!R6xjud#Rz-aQw2^FI9uRCfmaG#CUBL&bpl@&xK-eGf!QJ_5A?{( zs~dUz_XglM&Zqf1YQ%^i=JzU&POYw}%OBLIZ=Zr=2OPyOh9wYYBF5!LuvlD~Dg!oN z3n8zWkV;#8@mVUz2nH)yrgD^#e-;|CWGf>lA|Sdr-JcT&7dti|?q+{1P939)2daUZ(1#mWTZCoz==*Km7h` zbl4ByMVWalKHURi^H)6nKsCij-eqDI@sop@m?f6h`=yInROYAn_`9hk(r@GO2dOo_ zcr&}Y+>~0D`n1Kx@EgtR8L0m$p86*r62dR>;13hLR}UEod`KOn@@8|nOzFrUG^7&i ztybbO`VD^SXDUa|S;VR&}qaHq4$^|w0-8jfE!r$;b<4dI;ZUDZsyxZVYjn7qQ z3}GzY8%ht|TT1b~`yT&3gMm5?3J~G9{+SWlM+~n|0q+<8_n|lvf8pEAZ}d*OtJ07E z$pHMpP&g@`s?Rx|(G1j6g1_-yMsyPV2~ZG;zw8M{^b!0r!Oyvo5mCWEB>1O4W`xma z+e2HVc)s}^<3|YpDT06VCPrwVD!kq%_|dO0{s`gEfI_O{xq%U;y|5jx@efG-U;|Y@ zham&Rf643oo5nvqR|&qx6Hj+^!A=sy7*Fzxo(wcJ;k&=YzfC=dkzc~E-OBh=7^vrgm!b%U!3Vj!R1I{wn-{M(fGtbE3Imi}P+|Bb-=<^K!efB7?xX9xrJ z_F){)CrcS&;yBIZTtD58qz%zQo@ zDwKGnU5uQ$O7LIFJYwRf@83|mr@p}H(!D42j4xvRyZ7?%QyHiS1pl+g8DaYEaebM8 z>3fVnR`~1DVNBixrNPC(Q@Tq(VQhOLJ{!P)Xn*E^{T*hk7XDd+e^dIEN$YySpZ)+d zntUEGfa4i}aYoNo4AdVqUVd2Mao|my7Fr#Eapi|U3iwVKze`1Kn{m`1cq(u9dXB~9 z=X$~aY!le=9D@2bn^>gI$%-=)W*Wk|)eCgX9-t_bRPhkGdQr<2S|Cxe6>yL~u@mwbO1-CK& zr^5e%;HOAFCkcMci5&m6Qg5hU{d_I>E!XpJ3wQyxJ-FZAItpjJjHyb-+m(ab46aE zy7Y6yIRA9t)A(F98tp?5eeZ&vzT^G*&kMj`0X*e@)uSA~69e^u;Jx*Bd>Qk5$LV^( z=l+Ky5HX>qoy`2vdl(;;`0o{bpN|=V8q`nb1poXW2>f9fUn^z4$PxY`;3@xEGTu#n zo-g<>US>x8)K7Ui&pS6uJTmOmWZ*jsj}7h(z=tMsx~FepZqt9R7yMH@7>{AApU;8! 
zi~rD5IG(Tm&U_}GmB9P)KQH`u%Q!XT`t(ydo(Dv3KpoZ3oq``K4Px?h9_qbwMxq_~ zYvErn^4W~*-luW=WuJ1qMn0?)eD$9hZ{+O(mc%ui>hYYCRM!&@P3mK?Y(-?oxT1H@+)6Z$sCI0pNdkO<}zu?b(kr9o8 zKWzr{|4ce&N5MY}Jjt=;9y#`o@SiR5bQ1nE(J?5WhrhrGQ{D}N|K+=k&lmm^&Sw5P zkq>5mzfbT}S1}`k>*tx7{{89)H5`w3{N6733*O=I5J&pyhmJw{^xB0k6#Ra-GT)QJ z|B~QOmHe1-S5e3DT)&YSO+CCK__OY2yy+)3^~~S>BgUKZ{G&}zvL0dFOhiYQd{DGaZNn`Pn#LvMap%d@E6VU zum4iuDITvKrXhfTi|}7_7e_4Bt@_U9cs_rN@kNsEEWv-WoDrrTE;@(#FL{FThX{XY z4&x8r#E6N4zeez@C7y!>pF5ZNz4os21wZ@~W;E?}$UNph;XcOql6d+Ge#tUMnD~Dx z_@et6KUVmk6Z~qy4;K8WpK|;?MXq)e{3gNQCFQzI@Z-;A{&OB>gy|1Y3I5>s;_)NT z^KZw~fv0x#+QFU;;6LbR%->Dg@o>q{3c-)q7N2g$&zawAUpiLsMfWhjY2U%~nSZs^ ze-DZOLc#ZWkP#-`*95=gpNv0K zR`5s5cx#2pYR4F$^_qhKF9Bk=huS2@eXE`>Qk=^{_9s6Z~DWs z1su=l4U8|6_@Da~=5vK|MD&P%|pfq?r0ROSX)BUUX zcurl!@#j3x_^Twc^@6|eX+|6&_(>2r)DN=-Um*Bnf6e%He`kcLhpz?SZ7JiA6aKu5 zn175%ZeJ((-VZY))Drz{2i~u|qZV^KUiow8#f6Z7bcugFWdZm*fv56T?%;fw z{#kS><2QTC`v<`n-_HE{n_hVRui#h9#8t>-b;c6zpO=+0g1-Al&pp6Xx?XwpmEhOC z&U|JZRYTwqzt`^cx!@m@{!_}q)FHoN{ zea`tU$8+jCjL(;N-V=Q8SB&T)`17u1{(d(w!t{p&u4DYok{{EK(*=LxpPApJ` zD{|P>|IbnHR6oy4{U0moUMG0(zWF_Zzxyp_H2u8u^=#Mj?ynsRJjMTx^ea>U3k3h# zCmfIIR}RWa{4G)shW|vtulSt#4gNmCd-uh52!51|3&THhDaY^KKYc~;V;<-D`!Y}i zf6x3My~qet&+~3%{5?{i)xv+pO^olhlo3WAE(f0GsdtAmeu~3z8}PJ`Ki{*DpLH|G zGh;dPnRcHmc(2@UdkgdL*v$NpxJrjQI~<&HVH&D0+4X-n)-ecdP&SdJuSO zug;HhcvH_`3EsQ!c_zw3{B>*M`L6@sZ~T(n_rrI&P4d5r<1zjA#M>GFro>;sWOd~q z7~d%U@NmKBtzdl1n~Z2D_}>WrZ1hulx-(D@3Eu0UFy;=9XX8?4H0|}K;FtZC@uq%G zy3@a03xTKlDH3_xAnBIg#rTO2GNQlW?-BgT+ZbW^=d9#@W&G3fLQKCDc)$AmJMfgQ zS3jR{H{<_+enHPv2I}{~`^EFI@O#&5KfQ${EZo(7yc75-Uxq+#AEuyT)}(wx6OioK=?aJJcWPac)a$d zQ4cUaALE9eV;HD21^>-o7-91KQ}ieEF02|nuJO5Qzj=)Pa~8wH9`fgJ1fKjWy!-4w z2cF8iehY{H0Ry%FpBbO`Uq&1xc%0ftJn*X&c_?Ugz3_Y27yCWzpP$o!r*ysc%v%Ki zoYaHqS2>UP$8#|76i-jc8+uGW&lbGb4{H7z<{$o%W>+%(tH;s5{pz{RW6bYecl=oJ zUO$$11b?c`YerA_Dau3nocal;+k=5RU@hZ^zRHM>f^QJ~xoa44n&4j){F|>c;w{1d z3i6BMUm*R;%#YWu^Do!B0Q_d)DV|mT;)rKT{Gs*!@f4#!(EOM${UKlY&kW$d26&49 
zkLWM-6f#iRPcVM_Hbx8){C2?)`Ir%x3BLYG=3j|%K#!@!pMQmf`92c#+!U*Z{T>m`rGBe(|GafUDpf0*Kg_{f*&mP zZ_=IoG{>JO{pu*m=N*EdFXPvYmod-ym$x2xs(vS3KXG;8!2gLKeXE~nN zrMC6)!OULTUFS z1pk2GKR`Hoe#k(zfnG@Q+$Zg2`anm)d-qu?fj2x7g$A1g@clM&{9b>vM+D#d4UWa+ z^W>M9zw%zjFOWoD75w7I88K1t6QB=K{?Gb=5iaRzcaqW?-_BB;5+}rKc2q8lN?(ua!jg6 z4F;a#_pakE3W(=E;eX~KPSEuKY?Pnk=^*{q)I+`Cz4nBE2>uzgGd+heP}4SZJPnU9 z;!weF5xm#`dC(iozYY2hJw|RU5`5>+89#x6I`mEE-vGUd9y9JrFws!{U%|S8o)QM? zD&YP4!+XMC2uRN)25JoClpp^>;3@v8Uo>s2Z9zkdrO&XI;Gf7?HvnZQ##UO(qI zG=Hvo@@E{+vaSrz+RFU7fb^Jt_=w=!%6J^AskmM19p?A0_dgK)c$vrV5dPo3%lu=O zGU7tPZxy_E-XHcJ^AEy2Lr;4K>T2Ng<=qDN34imajO`-C{UJB09=vv>n*{&R!_2ru z_#c5B@oSe?0`Q*#f0#NjcCJUNQ+583)4k*=j`+tC{~W=4^|W=s`=#6MUmTBDpDFl= z@srka#G@qsCg=ww- ztHSSZ=^4yGUGy3Ad+iJgR> zHp}=m^V-9LA0_o{#`{o|pXC2$Iw*vvt8}1Kfv0#*m;B^wD(;RK{O6K?`nDiF2SI+B zx@8Riy-MSA75QV*^F85z9e9fW1CjHEOjbkCj|sm@F#a|npPI?>c>PPy7W@Kf7t>y^0q=~TfVk;CzHkaj#;@V^C~#-kEP4`Uy@8+gC`XSL;co)bB2 z`pIR2_xfGGD0r{`dCzwK@y`XG+NJ+C&Y&4DZwTINKbh5@`F{erMURnV&j9b2|E?XF zf4=BxCjR+?_s&xt_G5mpo=_?HC-32OO?|Ee-Y?ykgg?(SpQ_5n=BDOZ)28*QR8`T& z==AEQ=4fN2xi(T+Q(qTtQjthiePnt~{nUz@NL6!vV^gGJ)*Mw?U)xX3;EjYg_w)z;2M5bIY2oXv5p(T3{AXhl_|p#rcf zTvrvTtgo9LZEUWtuj^CU(4Zoxmp7Ld4sNK9Rz}aMZi*7Uskw1hWiyioMIzPp6s>Uf zt*EJ~udF}`kj`{Q4yc}9SKk=LkA8)bNK8dElOuq}N zt5sxF>GB5&;YfH~aU_CYqsE;Q36B=! 
z=;8?~QvBm_!^W1304=WARq+U9rxBT#q{fdNSstDkK{};j#CrO;@@ABR#+6433p7WhuD&W-Qr_H;YPIAf#4#`u z(YY=xnp)9>!lRtkb<;~K%bN=eD`!-oBQ#c2H#e13Qbd*EaJaCbd|X+iFl@~ll%*7VS$Y3Rcw#|hVx)Xnqp?OX(EF94=s#H}`Hsu4*XjWBoBbG;NqLs~sP0eAX9R0~GjLGKu67b@8zZvKQ`kiN2gWHs%d6gl_nCUaG;fcX?1Gf(y0Zeod-w8Ms%W0;R)q~ zBDH0lDe8ODvxY^uOHkzujzp%{&C=bgUnHVOpXp7yQ?|Zx!cmh>SNS!S_ z;iU3`GALTxp)1NMbiasCKSBEy_G^F)(E^dj0g7xj`EDImg$1#+>rvk$NR~t!8|&&L zT8=>$NuDD`eTw=(29r!E?8oD{ei}w{w4&B;B!~OeH$>~gWhc=n?;9zpZ;Ffuq873t zt`}p-%n#jU~GEC%ix$~FF`GgDMywe8p?-9NcPMwn;0o>7!R4D z2c*^M&=VlsYcWKs(SSP7sR^a><^i)Y>QH9%KXhH{h2d~jRLeW6^8xPEA_FJY4i52J z6_wbcX!HRQl8CibG+P>sC<>1+rvf8!l-QIng6S@)(3xc-b7LFm?wq;R;hhGObZG1Y7>FH;MZs3>x(Zi-0i5*7`JNhXXsL}mx*Vk6?d zv(Q;c0ji4BRy3p}y9*24zH91{l8O`6^cy)aQ!wb$gmj6*0i+LV z-4iN!4K(qT`eC?VQ}sNSPJ?Qrjnku0BU~AlLe_PUr7;tmWksLV8fQv*jP)7}fVfUT zgM$P>{Ol5-_vpbTdJo>mVNM88e1->WKAKe4SdEoQW0X{-NJBkD)m)Bl=UNf8@3BzC zjnu}tsGJ#XZVH!S93wvo<5x>_G&awf>8e7j_wIdFID*HS-RQEWVYxTEqGnc3?V;sEf4zJ#bTKkX&;W(%5Co(j#{G4YKk^Rny_qyO@T#P zphVMp3k(LZC15ocsjZ(KE#YM&><`UF1F#CMs;`YyR8~ftnqtixX|9+a#uU$CQZ^V2 z!o0*5gW`x@BxxP0GE7Qi+&oLUv{f`qt4vts?Mwyptj$4*d)UnhD57UtR-3f6z-nSd9vxbq?nxHd`l`#hkVO4Yvn_G+{eBek?9=7Oa*NHIj&0)?JutOcozcA6|m z)9ybNmiRiJD{ZX>^;B7zQp9>$PRq+wLThOX0b1VNP!lYBf@9VqNK2RTFdUYNkpTQ| z4@`YzngxAKiQK1JyEfatwB{5wcVE4@^-@#>8CzY~gdHU#YI(?!RU6B;IE&_B%dDO1 z2r{dU-90r~nrEWYzspk-%}g_518ukJ@YWpb?m<9@FY?pjqg55n6|9H0+AzbtAY2Q} zjF0X^LQ0J4)ya(NWGjjb&be`?(eKEHW!Bc{cWEB~R!^g8Ywxn!VB#4|8`M?W!Vs>V zfcbk5C|Ft&$BBT1MXymbgJ;uZyR&|yrlDD806qD;Ju*(lr7>B(DQx83F33x>nLxJD z){g3d$J9Zb~ zNu<1aaBaoRsCM%p%`}$DL}O4IJE&_B!oC+Ji%n&$(ktPpLhc^q1kpg7AK2|^p4$+G zCAc_JLW^jF!1h}>GQNDY>>jnYb7I%Eob4XuV*yifCG7+C57>lu>%?nIj*T(5-e4Qp zMH6Y91fiikb%O3&zRSRgkx5i9*cuoqb+JpnC05$9_aZA6w+_-Mw5&18CPA~zv3I-@ zx4m22+TrjZvpGOYhA88$jRm&LvdY=8b-D_qk_SL zy_Br2Llih+KR%T?&Rt+*=adu=y&ccS3%v#}Z^j@FFM}Xs2BDox#zQ7mRo6vel!cd=c05Ys%DJ#Fn#^_Wh$#zX3W;^E#EmSo zJxWYs_AIeS&TeRM`2L{52a>>HyJ#_u0!$*&r6fF+iSym z$4{(bv+AmUg57p<6%N-XvxdpKB4z|?tu%tOo7FO8*U`>UtWH?Fv*p$O8luseKIWD5 
z4>3z6+zW>oIp}Je#?kA_YKf616QNZ169(|*#p?65k$sLr14WODT-7qeZdW-Af5np-$^euQ&(_?U;Dd9$Q|J z)-PQp{X)lTtG!U@By3hE-9W*4fQpFTX(4ar${DsB0reiMMLjF;ovlWUF$C??+J4?& zAAIplFfrGMg2)JSv?!%z<(LW7Up=1cRymQ|>ZVBDteP5l9_rHuv4DynU3`f!ll z$|!w;S%CJ3WtFG>-(mA0^OEj6nBVAdmN}|mAAV?Uv$M^>=j=yqw6?OgLCnUI!q{Px zB4xuSMoPou;+iGJ4fXKO8$x`3U@Ua~sv;HItBA%zYfGz7&tMN8_Bt$SP`%vDUgi6{ zH4E=H_1scB_Cguk7sE)Lli^qvt|bcrNs8%>INqWs=!&}L5_W8)X0=^L-0A;2KjdI# zCFsmdtUJc`lk5Z-_iI&cMRgsIYdB2!`ncGxb@lPCbpzrp_adocaxXfk zGTK0gNxhRDsZoF;p6e>&!h)04C`3)8@78Y9u@eYMy&2Y6C+C?vyyxJ-_qV^k~$ z(s_!h??C&2ULDS;*1^3>J9@^nNmNJ7-3-D^v`Ly^-7b(|YT31JV95Y$?TeKrEron0 zMRW|A6?M;mHXP?DmeL7Vvfq)pDN4BLk#hvTp$_BUy^AEs@Xvq{#)9hkIDFi@b;DO;^p=#$VSo6>QD z&h4a7eEC2#+7d&>J?u=sVf{3sfK)}Z14LVEEK}KdUK6dG-aLa25!N?){Q#I1Lpo9X zLdn{xhMG!Exhg!gFDe6XWR%e_yh1U9a2Q3p?gUWDT~cR~57h6;9E~e4jFjkXji7$R z85>P`>eCUdA`mu!<(mo*Lzwrw)qUM^O0F%8Ffe_0j>&y;RgSUT12xZG%sW1u+- zgOfUJ5g~z+G-s`zkK)hJmtk_CjS37rsM=LH6SE7u#56zpAFA(PRZXX%$(F({Lr}8m zU((Z0RCY^PCF`r04-)*l%nXP)T3t!@ydNh_tp#{&YkHSm^W6<|ea^%>c^@|z_BuB_ z?F}tk4pt{dYb2PoQfVYhU$8x0boCkYjlaUVGh1o=7?}WPqB3ZTJWgUuF^KA2l6~$R z6;7u5;~o!N^>1xgMu#1b(`$Y{o;2B}W5Eo?7*d2stABml!+7^f(6B2)=>im^(NpGZ zqYqzeLwSY>!kUNm;p#e^q^^cTN#pccwYZYYM<+%}(y{o`lX2`sR7_tKCnB~nrY=@Y zzeKCpg@$-07uf%qQ_kprgA!L4dwgwSFD$5^Sd!{HQv~c~>Q3(TY5nY$a1?{La>!(A zuGc8WjUG@>Qjy9m2U;tp8cM^7e(1BC>KdRFH&64Z;q2g-rrSX5ql9Ubh=rC!{LUBt zG6P$xRNS@!hd2VEO0d;?9ZRwaKVrLVfXf&OJTbEKAkEE0jj@IeR z538q@OloRG?%)p!M}GM5(Q93OGpu0H-1SCl^DNw4iQ9E%>RnH|pr8=juUb!VjnlYE zk&c;>`@tYXWsd|(N0V?!SKmt#o-O*z1OzBAjVN07q<$O?R)2V=z#o^5SK*+`f?6wp zG?-joHhJNMDATj3uJ02EU2WOiCdj#HKpX-O!Hj-$b?F+;dFW5 znuD&puE*$YY>y@rub=F77SzOp%=%_&J=a=t|6ATpu61X+9T`{E<30#qhfZ7k|8H2_ zdHJdpU2B6P$ql}Vn+-(~m~X$$W`h_f_lXUtva#i7>bD}vVIphBNVKT6cA*B(mDoCz z4!4fMu-6U_o~y3YqX>K5b?6p9x`P)t_0>;H$=x{&2NSxv8d6lB#0}KX^ttk&tt)ID1P)~JY80G*=qRDdZz zMkVF1UKUXDsSdukJxJ&WP8FD~>3lwBLdc>Z-*>o7D`U7meIkVJ;FyJs>MPFGU^hBp zYS*vm9KLf2I$4l-PHk;Vt&e|VuJKCao!I75OVU*|Fb$AE)Y?cxgFZQ#*vw=}-?e5Y zht;4)h8q!g{xr*cYk2Q{GQWRiO|(K>w_Cqzq7JK;;w96m 
zC}7rk32g8T6XEOnG^OsxAVzhuU!>=9RpgciNYW*VJ8`5@4q78f<_FVkyG!zVQ}jsQ z_?yL2ST0yIp3y)_*+fi9?+Qz@T!^29_C9Nk-kZdikmzfBRk&L(!6(sk=UiHB&h*6u zy*(?Y((%cZ0C#>!i(7)*1<(UpcOC)m8+5xY?G?+-7SLkQf4qCuIN#oc>XeTcR_pEM z;>Z{rmWk~!QxtJ>IN@Maaoim=vOuEEXzlsxUDNND@$XiZaVT|Hsdna*8r`ge9thLw zG0Q2Fp}kVyLUxi!LyX#e_HuJaQMe!75rX54a$8z#r`Zy6t*-gFb$l{+Pi!sN?w)Q} z+!Kqq#6HSb-&kuHK%A&;moP_9-oL;mJ)7jG&=z`_?{-IGJYS}ginF_ zenWlyAyc~bFL+GA1s2N!Z5{D(@z{0MLR;aq3z8NogIrJg)*Gt#De&I=myD)lX>AyR z2~Olm+CQ*E!FXn7VSD+;cORIOs8D9*ppWl|OKfe@FJHmk@>^vK=OlQ~5fda9`f}R? z2Gj1luh^?P>hA&a9_;Q*G*SAP(-u~{CvG6PA~KEIdSYH&7cA>+PI+_R+UmNJY8;_O zv*>Lm9=)xdAkf2x(LErM8P)j8qUrAtcIi81;`;RXdn2UMOXD|V`s>XY?FR+NdwiVJ z{`RQ7PWHVk*G#V|jpDfZRj;qmcY^hAZlu=SEmWEH&nMsQF$qVlij=O@UL1 zIc~Ei6@4OD&4ll==<=y#EI`Gqswwlip>y}il0*UKj;Yo*BYe}nCqd+vbW%praje^w zHm2&yhKUig^JCpcSv8lU#iU_y1JgNn{|`axfnzTZ+FIQ7ZWnhvHL83X{80zLCjgzO_cq|QM z8b?cN<9ABVN8jQ?D5I^pW!e*KCU0S(qR~fIgZ6!Pp;shclWxy?u#%dOUdLPZ-0Da$ ziv!{oXOec-_xK>@8!D!yx znJ#@lnzlxTwJSs|)};x$;hwlxW2ft{xNWb!Z?A_wkflF|uD5!~Eth+bJ8E}BnXJK) zc$j;a=MzjIV{vS%xv`>BK8a;d^0Ez^M1k?htkLwbczt+d7e#_Krey0>G)g3xMgs{7 zSgV~$sUgujTf0si&@^i*Y|^+D-p9mDS>3nC65C_3rqaE6zyt_^`WpDTjhs+kUml77 zUNnn{)^@CT;~)vvO7zpdewTXCjOg)DcI$UHF7YtgkXy6%xwXZBSRr_EtIq?oYQ2jR z!IFbv+W($yuBlJQq{^3r$nKx-6&0x&17l9Y<`iLJ;izcyNV>2Y5)M0C>BxLKdYBoU z@hWKA#JSaY#!@TCyS{U4nrkFXv#_?xy!R>iI1FxSnPvxA4k4kU^ydlbyJjV&@N*C2 z9szn!(T;CT_F?jAHIf3JWup4q@+IAxtkq62n+Eg(4ETVrNAh?}=Z=bN=Ak_lUCVp9 zs+^!!rl4C@G&GQRcWwL{&e-8A@JRp~yK#yaHBInhAHArx^RTrGny~Qf8?B#aCZROy zq;6FD18A|caPXS0b?-N^i-|&|ePbM30JtgP&WN$Nvo zYrvAm%+KlDjgIfw_=Q_taK{aMTDz<1U9!bk?Yt9P%y-`Aw$P{h_TxpD-P2)(kx|{U=_yd=Ay1^To1!SO3n4!~VhLoIX;Ht*yP}ZSY7=r`p z+Of-otyya~G`%g&NvM-YQ8C}C&9+s0I`QmW+6^Kqa#~D0)=-t|woE+I;b;cbHZE+( z&#u*DB+fo%_K6_E{q7Y^?Q{=4UtTZCQEp%2e2Z_;3+Ss6@k#bA45$v7&Bz zRPVuIL=hglp~cZ@6|-uZnO3EV?1KbZA zpJ5-raaXCke2UeLN-p`Q;>P9Bs*WU|g7W0BwY8o0tNTY(oR!%sG}2PIiPa_^ej42?gETolwwj zgZYZq>U!Tpj^WDLQ-h>(tnP4yQTQjbaEafHXbmnOPqCYeWfcQ2Zf;Cey{tnBP`_$* 
z&EZH|M4wG)qD~3p(Mf1-Glb@ZZ(8p9Hy7Wf;;Pjz`2I`F_PU2`pNZ1RSefJG zJTJM@`fZq2^yOV!g3(AB7$FZ5&OclXtDBp=$9!qS$w(FWQ`YWGmZW9TL_&mb&d_Ij z#dVf!;<(KQA0U<$ggfImS4>ZB9O_N3*i9BCW%!CcCG-ExNrZ$I++ghG@@Eah`YWh* zclN))8M+BR3>l4LaUZ1>KhzkHGiBi*RKC4gLYC;(NuVb-8l$Lrufx2o57CPI47={)Pdi7^jUhM# z0a+2%+hK4d#<(UG9F-O4kV)Oj)#SvCMo!Xl^bd=S32^OX3A0PP-`2E}>>veApXup! zRWK-UzSG=4tdi_~W^38Uq?-aN=9JVz6@$!=-4w8IFcsRCl!T*1t(ELB2KtMw!hY~7 z!s(d$#wvVKEZX#)y4#Wq_enuM+}Z^O^JYb3vmQ-5KYWeh%}bHp=AsJ=4VDo2=(aw4 z1re4WPhEFl$J9$q%n*<}Klg4_WLuiqSlTS{f(8FHp;qh zOd~}D!@M@Fq6YWsHqUK{DyVVI^)*tCNKv1nKK)tLr=t)Ft{(gLJ2BwgF1K$wb=580 z7d$q?n_IGlE@w|^wTO-MI2k~;WAg75t)w5@-s-2x&0#(kJ-Qwtac3lrT|MsW#H`=N zX@m*(@05+R*5W?SIEzEIe#Y7F!22&QhuRO<i^4!Id=$x5oC2FlG##q;W zM{EheRH)BSr?T>kyKz9Y{sHFrw67t4k9^`Qu~x&a{KOX~qh{8mBgh76Y)=zTl1;R@ z4RJkIQyHB8T43R~nU(xD;gj8+pGx+~Viogs)khSq@0jfNo0%;Uk+`k0>)t>8(X6~J z59OgMItS~oge8;ba%LJ^KA$J2xipMzC83!M7gDM5GJT{5o1?G@j39e64M$i6(xFoi zs;p?JsKhskwM|B>58itUkydNxssZxdUB5NWu3w`%9%g?mH1%By*0yXam78o+7WpU^JPVQ^-@?AXHuUx@!RN`T4dNyR8vnKm2ZmHC z1)DASfcMnONM+M3h`VaGBei;{52O$KHqORYKwKu>ZGZ$jHeH3XcU+wXZ8E_f^gFj{ z1>WU|?I^P<*!985{ZxNlE`rUAanxopUnA}2I59Gbj#|SjYa~rcg@a%wnvJ{qN6e^b zENW_OhP$o!M(b~Zrr(CWS8I@oSC1vPOkn=OiFx+gP2Xpr&~A|J6;-n<>fmjc+S#XC z{Z%P;-ojN`Wwg6vAF1B;kCwATh%Mt<^$BZH#Uq8@51?Xy1x`}hnJ-cTkvkrT$7|8i z{RCMGIuP5R^ld=_ZZ*~(-*@C&Zp?=?#nKm9nk%YnAoa@e;lRq~!lvfRnv!(R_~GJ- zl{NgW!)hqVl{GQfYwQ_JYanSdp09@~Z;t*1$MtQ0bnK8u>niYRk4W{jl1WXCaFc_J zB|iHBfkS%^P*U~XF}Ph4eU1#$nDp&l6W{)5d#_SQ-GTNFsaJ&gs5f+Ce7`zg!`I(Y zv3*H-^QE;)GP$!-YUFv(>OCChpxNU412n}WbLm}iTUyB8w?XQ&;*RUHyWyybRbRVf zvDvE60HHHrs|8A0w2@V(lpIgO#*wJ7FD?|S!UQ@YT8X{Rl+q`v2Qz>eCJxEYzcC|+5@*~Sjw6# zf#Aa;w`o{S55ZkgQ$4*7zFkviO`8V4<@Bkvm=(rGta~cl3C~x*-Nl40>7fgZUWXy# z*?Ka@B(c5X?ixBOP&s3t^);CXXwsXe+*aKaDpg{)rd=ASz>(>5=0xCk-&Bu_+hAae z%r1y))Z*~)Q%3PLpAJKw7EsV9L~+3F_E3eYt0EPR(`V5ap#4n45Ps=My1#X+FD>%Y z=(|mQ?-L)jnxm)VVGx>F9%f>DDjpR%p#7iCeC*$`j3cH(g5NUeg2b*nQ592_F2( zPD1Yb?tu2qr!4}Kg%C)sU6%E4)L-_4Ro~B|NeW{`Z|BJ8G>dRPp+IjGwK^qz^>!G` 
zJ*@sP%H7W_ew)ocnr7k*6;j;vc2XW>>%z-P9bxsflncwz%erLKLtS9UI5HNeEiAx=iP%7Zue;8*z~UIbA?GrAyQs z8sQ`A3iV)1gO3N1SuMycfGs|lShx*>VZI!~ow8c9>AREJJvE!F+v^}SNR_~MH}I5c$hM>sQ;qh)P3SX~m8or1&}!#J^gwoKN^{iQSS1tb>}O+iI)qEK5m$26)YnBPk_}Ny%QUCyjw_k; zWBoVrG<6z#sJWSC=by6MXHaxqM9nrD^}bW;&Y`0@6QWI6EMZa|F#_jk3$)-RtB&7P zJ5hgD#JT|^VJ=SZQi$$Et;Dr0%~9;?*Ed#7$A&A<4C&CHWu23PWS}(y=PNwHdkU|J zZ$;W?$96qy_5H)#ZufNEnsHin1*v_(CS(45;Zk~6eNdtE`@qVh7U$|e3#m)@JvVzUQG}~v* z@mpV8u*cUSxgHeDWIC-3){;c*=e|~k^cmk=ZFkT5N+lLFJUDg$i4G-Ad)caIxnb!h7@@KJJ?kIPS;XlWaN%F9n0|_(@nKkbr7i!s?lF zQMk{ciKIE-Lz2O@TKk!*Y~kgzGT-F|seN(uxbnhC2{|ysa~We*OkJZR<$RL?^?~&I zI9P2Y%i=g)!18QKJV0u$kAy$Qck_wFM4vGCegHW41ohgZ-N)@OKyCW>i^S=c9b3qe83an6+Y>N1L_By%&x)?cBR- z7Y#yw4Wx?#rZ?iEHLayoptboWsZKV6$@tq>?tGJDr)(i5!!RURz*Jt| zj}GJT;P!Pf#E~SVThv@XGg{Y_B%(mkLe^)>TiljMx_3P6L#4Z8MzVZ=tbGmiSkhhG zIXUr~mGobkV9igW#(SLbY8X=d%KPj}#U%b477PuyEtNZLJWtU(W#OI(YE z@9-q-z3$djJ$^iC7HU=#?R`Q`8&c%pJDgU3GRxdFlZk2gEMQ$^ZeMZ-8hShovlS&Z)s9sgLz>-6>!I;F^BZs`L(Pac+5_c|Ju@Zv>W+T7v zw$LOMznV;KVTuVi{zER>5j(+5204tDjMLZ5#n|*&*Qz)QMo-azQSL`PPpfX8fs@!H zW<)Dz;*<;~|8!`1NTsT#Y__b-<4%iE#gssprP7;7QHG}`xl_9R=tu zfqi#?e84ZLZuc!0t*r#idpMo8`xRL9nsk{>aH|N!y^q<1ECCo~)y>s#Tbu_ccX9_w zt-rdyV6NWW52BflHIp|&{3(#Yt{5(>yezjL>CxAEPX+3TvXbFDvK1xcZ* zaJ<)==u(%5xD^uHtLBWpzdu^+2vth!yJS08m}&t2EV;M$*E$?8lT9p9%m1c{DP6VRW zJohl5W5TQOTiQTl*MXx_;!fGq7>#Q`TJqR6wKt)+FGa_nATuRWHV;OI3XqEw%z zxwUYK1H`?18qP3pWRz-bP<`s^o1=YjjnA=kA2t7)UN@`H)LGRvRmWCWsXp|J?od{J zs^-=qK8DSWOu>2XCajj?endcPjMh|80C{VuX;yvo6#{+m+Glz_el|zv;IH1B@6%XM zH>s;W(HW6xjTN={98(n{F~1OdWo1M@%b{KTI5HewXsnpaahP|Chf+tBI;R!2)s+Ze z-%JSuqm#ka7pf0#C??aZO2=~$A_?JthD5epUJN|glL+zJPY?WPga0P{T6r-r-)6>p zmgjdNe%s-Qq1qV#4>@j|m2L*k2e0c9MuWsPd^3m@O$#p0B`$W2!b6xU&0%x6E=YX^W@#YE4<W!zaOF`8VbG zohN*Ogfp;Wy$Q+C8{bWy@K^ko`3#iq7mLJO{@WzHX+N$FVy9YBYB>Y_PJc|k4dQOR z5Q_=FJY>Q#%#!|W_`T^r;t9Vpm%|x&^#Bu+p*Q^}JmE)&IlO^ydMd!1{%Z)Ii~rQO z=^0%s;m>5~t#6aQf$tzpNBlS8CrS8865jAp-1L~=27U${rDVcSk?>O_d;tZr9uqEx zLSQpC3AaSTFERO-hB4#Agg5278e#18m)3K+mqr*)p-8RAgg3A)*zEACfy1dO440UW 
zA#s>+26py@U(zVyrZVhhrA@CUoI!U58{tg-t&#AnCA@dS$TtB69wO;`>)C`4n!h-A z@qOQpdXuET{&>b73P3~KYgU=%*lV3F69(v%Xly}ldk#Q2`_%(54?*j{s>P1 I&rkLL0Xkqai2wiq literal 0 HcmV?d00001 diff --git a/bin/epiread_pairedEnd_convertion.cpp b/bin/epiread_pairedEnd_convertion.cpp new file mode 100644 index 00000000..3a5a2149 --- /dev/null +++ b/bin/epiread_pairedEnd_convertion.cpp @@ -0,0 +1,686 @@ +#include +#include +#include // std::stringstream +#include +#include + +#include +using namespace std; + +// Based on N. Loyfer Pattern algorithm + +string TAB = "\t"; +int MAX_PAT_LEN = 300; +struct CpG_record +{ + + string next_cpg; + int index; + CpG_record(){} + + CpG_record(string _next_cpg, int _index) : + next_cpg(_next_cpg), index(_index) {} +}; +unordered_map dictCpG; +struct SNP_record +{ + char ref; + char alt; + string next_snp; + string sp; + SNP_record(){} + + SNP_record(char _ref, char _alt,string snp,string _sp) : + ref(_ref), alt(_alt), next_snp(snp), sp(_sp) {} +}; +unordered_map dictSNP; + + +bool DEBUG = false; +bool SNP_data = true; //file is probably not empty, if file is empty-skip adding SMP data + +vector line2tokens(string &line); +//void convert_epiread(string genome_cpg); +void convert_epiread(ofstream& merged_epiread); +//int execute(string cmd, string& output); +string vec2string(vector &vec, string coordinates = string()); +//void merge_paired_and_print(vector l1, vector l2, string &genome); +void merge_paired_and_print(vector l1, vector l2,ofstream& merged_epiread); + +vector line2tokens(string &line) { + /** Break string line to words (a vector of string tokens) */ + vector result; + string cell; + stringstream lineStream(line); + while (getline(lineStream, cell, '\t')) { + result.push_back(cell); + } + return result; +} + + +//void convert_epiread(string genome_cpg) { +void convert_epiread(ofstream& merged_epiread) { + /** parse stdin for epiread paired-end file, sorted by name and order of mate + * Translate them to single-end like epiread format, and output 
to a file */ + + vector row1, row2, row_from_line; + + bool first_in_pair = true; + for (string line_str; getline(cin, line_str);) { + row_from_line = line2tokens(line_str); + // row is first out of a couple of rows + if (first_in_pair) { + //cout < &vec, string coordinates) { + /** print a epiread-like vector to stdout, tab separated */ + // vec length must be 8, or 6 if there is no SNP. + int num_of_column=8; + if (!SNP_data) + num_of_column=6; + string str_vec = vec[0] + coordinates; + //for (int i=1; isecond).index; + } else { + // This is an internal error - should never happen. + throw logic_error("Internal Error. Unknown CpG locus: " + locus); + } + return start_site; + +} + +// int CpGLastLoci(int &index) { + // //get the locus of the last CpG according to it's index + // string position = ""; + // for (auto it = dictCpG.begin(); it != dictCpG.end(); ++it) + // if (it->second == index) + // return stoi((it->first).substr((it->first).find("\t")+1)); + + // // This is an internal error - should never happen. + // throw logic_error("Internal Error. Unknown CpG index: " + index); + +// } + +int CpGLastLoci(string &chr,string &pos,int length_cpg) { + //get the locus of the last CpG according to window and length of string + if (length_cpg==0) return stoi(pos); + string locus = chr + TAB + pos; + auto search = dictCpG.find(locus); + if (search != dictCpG.end()) { + for (int i=0;isecond).next_cpg; + search = dictCpG.find(locus); + if (search == dictCpG.end()) + throw logic_error("Internal Error. Unknown CpG locus: " + locus); + } + return stoi((search->second).next_cpg); + + } + + + // This is an internal error - should never happen. + throw logic_error("Internal Error. 
Unknown CpG locus: " + locus); + +} + +SNP_record FindSNPRecord(string &locus) { + /** translate CpG index (in range 1,...,28M~) to dictionary */ + SNP_record variant; + auto search = dictSNP.find(locus); + if (search != dictSNP.end()) { + variant = search->second; + } else { + // This is an internal error - should never happen. + throw logic_error("Internal Error. Unknown SNP locus: " + locus); + } + return variant; +} + +// void initializeDictCpG(string cpg) +// { + + // int cpg_index=1; + // vector record; + // ifstream cpg_file(cpg, ios_base::in); + // string line; + // while (getline(cpg_file, line)) { + // record=line2tokens(line); + // //dictCpG.insert(make_pair(record[0]+TAB+record[1]+TAB+record[2],cpg_index++)); + // dictCpG.insert(make_pair(record[0]+TAB+record[1],cpg_index++)); + // } +// } + +void initializeDictCpG(string cpg) +{ + + int cpg_index=1; + vector record, next_record; + ifstream cpg_file(cpg, ios_base::in); + string line; + string next_cpg; + + //get first CpG + getline(cpg_file, line); + record = line2tokens(line); + + while (getline(cpg_file, line)) { + next_record = line2tokens(line); + next_cpg = next_record[1]; + dictCpG.insert(make_pair(record[0]+TAB+record[1],CpG_record(next_cpg,cpg_index++))); + record = next_record; + } + //last record-no "next cpg" + dictCpG.insert(make_pair(record[0]+TAB+record[1],CpG_record("",cpg_index++))); + +} + + +void initializeDictSNP(string snp) +{ + ifstream snp_file(snp, ios::in); + vector record,next_record; + string line_str; + if ( snp_file.peek() == fstream::traits_type::eof() ) + { + cerr <<"SNP file is empty"< &line) +{ // get the final merged SNP + string debug_data; + string cuttent_snp = line[7]; + string final_snp = "0:"; + int snp_length = line[7].length(); + SNP_record snp_rec = checkLocus(line[0],line[6],line[3],cuttent_snp[0]); + debug_data = "(Ref-" +convertChar2srting(snp_rec.ref)+",Alt-"+convertChar2srting(snp_rec.alt)+",SP-"+snp_rec.sp+")"; + final_snp += 
convertChar2srting(cuttent_snp[0]); + if (DEBUG) { + final_snp += debug_data; + } + if (snp_length ==1) + return final_snp; + + string next_pos = snp_rec.next_snp; + string absolute_pos = line[6]; + for (int i=1; i &line) +{ + string final_snp = ""; + if (line[6]==".") + return final_snp; + string debug_data; + string cuttent_snp = line[7]; + final_snp = "0:"; + int snp_length = line[7].length(); + string locus = line[0] + TAB + line[6]; + SNP_record snp_rec = FindSNPRecord(locus); + debug_data = "(Ref-" +convertChar2srting(snp_rec.ref)+",Alt-"+convertChar2srting(snp_rec.alt)+",SP-"+snp_rec.sp+")"; + final_snp += convertChar2srting(cuttent_snp[0]); + if (DEBUG) { + final_snp += debug_data; + } + if (snp_length ==1) + return final_snp; + + string next_pos = snp_rec.next_snp; + string absolute_pos = line[6]; + for (int i=1; i mergeSNP(vector l1, vector l2) +{//change snp to desired format, with relative index for each variant in format 0:var1:12:var2:13:var3 + + vector returned_snp; + + //if one mate has missing value-make it be the first vector + if (l2[6] == "." && l1[6] != ".") { + vector tmp = l1; + l1 = l2; + l2 = tmp; + } + + //if both mates have SNP in the same variant-make the one with more variants be the first mate + if (l1[6] == l2[6] && l1[7].length() < l2[7].length()) { + vector tmp = l1; + l1 = l2; + l2 = tmp; + } + + //if both mates are "-" strand, but same position + if (l1[6] != "." && l2[6] != "." && l1[4] == l2[4] && stoi(l1[6]) > stoi(l2[6])) { + vector tmp = l1; + l1 = l2; + l2 = tmp; + } + + + //get SNP-length: number of variants for each line: + string snp1 = l1[7]; + string snp2 = l2[7]; + int snp1_length = l1[7].length(); + int snp2_length = l2[7].length(); + //if both snp has missing values + if (l1[6] == "." && l2[6] == ".") returned_snp.assign( {".","."} ); + //get SNP data + //if one read has missing value (".") than use the other value. + else if (l1[6] == "." 
) + returned_snp.assign( {l2[6],GetFinalSNP(l2)} ); + + //both mates have values in the SNP column, in the same position. l1 has more variants + else if (l1[6] == l2[6]) { + //both mates have the same SNP variants + if (snp1==snp2) + returned_snp.assign( {l2[6],GetFinalSNP(l2)} ); + //same position, different variants in both mates, the first mate has longer SNP variant list + else + { + //check all common variants between mates are the same. If not- put "N" in the relevant position + for (int i=0; i &l1) +{//add first and last coordinates in case where there is SNP data + int last_snp = 0; // works even if there's no SNP data + string coordinates = ""; + if (SNP_data && l1[7] != ".") //if there's SNP data + { //get last index of SNP + string str_tmp = l1[7].substr(0,l1[7].rfind(":")); + str_tmp = str_tmp.substr(str_tmp.rfind(":")+1); + int last_snp = stoi(l1[6])+stoi(str_tmp); + } + + //get last index of CpG + //string window = l1[0] + "\t" + l1[4]; + //int index = CpGFirstIndex(window) + l1[5].length() - 1; + int last_CpG = CpGLastLoci(l1[0],l1[4], l1[5].length() - 1); + + //insert values to vector and print + if (SNP_data && l1[6] != ".") + //l1.insert(l1.begin()+1, to_string(min(stoi(l1[4]),stoi(l1[6])))); + coordinates += TAB+ to_string(min(stoi(l1[4]),stoi(l1[6]))); + else // no SNP data, coordinates depends only on first CpG + //l1.insert(l1.begin()+1, l1[4]); + coordinates += TAB+ l1[4] ; + //l1.insert(l1.begin()+2, to_string(max(last_snp,last_CpG))); + coordinates += TAB+ to_string(max(last_snp,last_CpG)); + return coordinates; +} + +//void merge_paired_and_print(vector l1, vector l2, string &genome) { +void merge_paired_and_print(vector l1, vector l2, ofstream& merged_epiread) { + /*Merge two epiread-formated line into one */ + + if (!DEBUG) { + l1[1] = "."; + l2[1] = "."; + } + + + bool flag_SNP_identical = (SNP_data) ? 
l1[6] == l2[6] && l1[7] == l2[7] : true; + //if l2 doesn't add any information to the read-length, sequence and SNP data: + if (l1[4] == l2[4] && l1[5] == l2[5] && ( flag_SNP_identical )) { + //there is an snp value on the identical lines + try { + if ( SNP_data && l1[6] != "." ) + l1[7] = GetFinalSNP(l1); + + //add_coordintes(l1); + string coordinates = add_coordintes(l1); + merged_epiread << vec2string(l1,coordinates) < stoi(l2[4])) { + vector tmp = l1; + l1 = l2; + l2 = tmp; + } + + + string pattern1 = l1[5]; + string pattern2 = l2[5]; + int pattern1_len = pattern1.length(); + int pattern2_len = pattern2.length(); + + //0-based file + //int first_cpg_site1 = stoi(l1[4]); + //int first_cpg_site2 = stoi(l2[4]); + + string window1,window2; + //window1 = l1[0] + "\t" + to_string(first_cpg_site1) + "\t" + to_string(first_cpg_site1+1); + //window2 = l2[0] + "\t" + to_string(first_cpg_site2) + "\t" + to_string(first_cpg_site2+1); + window1 = l1[0] + "\t" + l1[4]; + window2 = l2[0] + "\t" + l2[4]; + int first_cpg1,first_cpg2; + try { + first_cpg1 = CpGFirstIndex(window1); + first_cpg2 = CpGFirstIndex(window2); + } + catch (std::exception &e) { + cout << vec2string(l1) << endl; + cout << vec2string(l2) << endl; + return; + } + + + int last_site = max(first_cpg1 + pattern1_len, first_cpg2 + pattern2_len); + int overall_len = last_site-first_cpg1; + + string merged_pattern; // output pattern + + if (overall_len > MAX_PAT_LEN) // sanity check: make sure the two reads are not too far apart + { + // throw invalid_argument("invalid pairing. 
merged read is too long "); + string output_error = "Problem with:\n" + l1[0] + "\t" + l1[1] + "\t" + l1[2] + "\t" + l1[3] + "\t" + l1[4] + "\n" + l2[0] + "\t" + l2[1] + "\t" + l2[2] + "\t" + l2[3] + "\t" + l2[4] ; + cerr < merged_snp; + try { + if (SNP_data) //SNP file is not empty + { + merged_snp = mergeSNP(l1,l2); + l1[5] = merged_pattern; + l1[6] = merged_snp[0]; + l1[7] = merged_snp[1]; + + } + //add_coordintes(l1); + merged_epiread << vec2string(l1,add_coordintes(l1)) <) +{ + chomp $line; + my @f = split(/\t/,$line); + + # get max MAF + my @allMafs = uniqVals($f[9]); + my $maxMaf = maxval(@allMafs); + + # get common alleles + my @allRef = uniqVals($f[10]); + my @allAlt = uniqVals($f[11]); + if (0) + { + print STDERR sprintf("refAlleles has more than one:\t%s\t%s\n",$f[10],makeList(@allRef)) + if (scalar(@allRef)>1); + print STDERR sprintf("altAlleles has more than one:\t%s\t%s\n",$f[11],makeList(@allAlt)) + if (scalar(@allAlt)>1); + } + + my @dbsnpRef = uniqVals($f[4]); + my @dbsnpAlt = uniqVals($f[6]); + my $usedbsnp = ( (scalar(@dbsnpAlt)$maxv); + } + return $maxv; +} + +sub makeList +{ + my (@flds) = @_; + return join(",",@flds); +} + +sub uniqVals +{ + my ($fld) = @_; + + my @allAlleles = split(/,/,$fld); + my $outAllelesH = {}; + foreach my $allele (@allAlleles) { $outAllelesH->{$allele}++ if ($allele); } + my @outAlleles = keys(%$outAllelesH); + +# print STDERR join("\t",$fld,@outAlleles)."\n"; + + return @outAlleles; +} diff --git a/bin/setup.sh b/bin/setup.sh deleted file mode 100755 index fd3901e6..00000000 --- a/bin/setup.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env bash - -## edit this -##### required ##### -## samtools fai-indexed reference -export BISCUIT_REFERENCE="$1" - -assets_dir="$2" -#this_dir=$(dirname ${BASH_SOURCE[0]}) -##### optional ##### -## use if the file is nonexistent, the corresponding -## QC section will be skipped -## CpGs -export BISCUIT_CPGBED="$assets_dir/cpg.bed.gz" -## CpG islands -export 
BISCUIT_CGIBED="$assets_dir/cgi.bed.gz" -## repeat masker bed file -export BISCUIT_RMSK="$assets_dir/rmsk.bed.gz" -## merged exon bed file -export BISCUIT_EXON="$assets_dir/exon.bed.gz" -## genes -export BISCUIT_GENE="$assets_dir/genes.bed.gz" -## locations for the top 100bp bins in GC content -export BISCUIT_TOPGC_BED="$assets_dir/windows100bp.gc_content.top10p.bed.gz" -## locations for the bottom 100bp bins in GC content -export BISCUIT_BOTGC_BED="$assets_dir/windows100bp.gc_content.bot10p.bed.gz" - -### QC operations to perform ### -export BISCUIT_QC_BASECOV=true -export BISCUIT_QC_DUPLICATE=true -export BISCUIT_QC_CPGCOV=true -export BISCUIT_QC_CPGDIST=true -export BISCUIT_QC_CGICOV=true -export BISCUIT_QC_UNIFORMITY=true -export BISCUIT_QC_CPGUNIF=true -export BISCUIT_QC_BSCONV=true -export BISCUIT_QC_CGICOV=true -export BISCUIT_QC_MAPPING=true -export BISCUIT_QC_BETAS=true diff --git a/cpg.bed b/cpg.bed new file mode 100644 index 00000000..e69de29b diff --git a/gc_content.bed b/gc_content.bed new file mode 100644 index 00000000..e69de29b diff --git a/main.nf b/main.nf index cc1ff0ac..3c1e1718 100644 --- a/main.nf +++ b/main.nf @@ -1,7 +1,7 @@ #!/usr/bin/env nextflow /* ======================================================================================== - nf-core/methylseq + nf-core/methylseq ======================================================================================== nf-core/methylseq Analysis Pipeline. #### Homepage / Documentation @@ -10,87 +10,96 @@ */ def helpMessage() { - log.info nfcoreHeader() - log.info""" - - Usage: - - The typical command for running the pipeline is as follows: - - nextflow run nf-core/methylseq --reads '*_R{1,2}.fastq.gz' -profile docker - - Mandatory arguments: - --aligner [str] Alignment tool to use (default: bismark) - Available: bismark, bismark_hisat, bwameth, biscuit - --reads [file] Path to input data (must be surrounded with quotes) - -profile [str] Configuration profile to use. 
Can use multiple (comma separated) - Available: conda, docker, singularity, test, awsbatch, and more - - Options: - --genome [str] Name of iGenomes reference - --single_end [bool] Specifies that the input is single end reads - --comprehensive [bool] Output information for all cytosine contexts - --ignore_flags [bool] Run MethylDackel with the flag to ignore SAM flags. - --meth_cutoff [int] Specify a minimum read coverage to report a methylation call during Bismark's bismark_methylation_extractor step. - --min_depth [int] Specify a minimum read coverage for MethylDackel to report a methylation call or for biscuit pileup. - --methyl_kit [bool] Run MethylDackel with the --methyl_kit flag to produce files suitable for use with the methylKit R package. - --skip_deduplication [bool] Skip deduplication step after alignment. This is turned on automatically if --rrbs is specified - --non_directional [bool] Run alignment against all four possible strands - --save_align_intermeds [bool] Save aligned intermediates to results directory - --save_trimmed [bool] Save trimmed reads to results directory + log.info nfcoreHeader() + log.info""" + + Usage: + + The typical command for running the pipeline is as follows: + + nextflow run nf-core/methylseq --reads '*_R{1,2}.fastq.gz' -profile docker + + Mandatory arguments: + --aligner [str] Alignment tool to use (default: bismark) + Available: bismark, bismark_hisat, bwameth, biscuit + --reads [file] Path to input data (must be surrounded with quotes) + -profile [str] Configuration profile to use. Can use multiple (comma separated) + Available: conda, docker, singularity, test, awsbatch, and more + + Options: + --genome [str] Name of iGenomes reference + --single_end [bool] Specifies that the input is single end reads + --comprehensive [bool] Output information for all cytosine contexts + --cytosine_report [bool] Output stranded cytosine report during Bismark's bismark_methylation_extractor step. 
+ --ignore_flags [bool] Run MethylDackel with the flag to ignore SAM flags. + --meth_cutoff [int] Specify a minimum read coverage to report a methylation call during Bismark's bismark_methylation_extractor step. + --min_depth [int] Specify a minimum read coverage for MethylDackel to report a methylation call or for biscuit pileup. + --methyl_kit [bool] Run MethylDackel with the --methyl_kit flag to produce files suitable for use with the methylKit R package. + --skip_deduplication [bool] Skip deduplication step after alignment. This is turned on automatically if --rrbs is specified + --non_directional [bool] Run alignment against all four possible strands + --save_align_intermeds [bool] Save aligned intermediates to results directory + --save_trimmed [bool] Save trimmed reads to results directory --save_pileup_file [bool] Save vcf-pileup and index-vcf files from biscuit aligner to results directory - --unmapped [bool] Save unmapped reads to fastq files - --relax_mismatches [bool] Turn on to relax stringency for alignment (set allowed penalty with --num_mismatches) - --num_mismatches [float] 0.6 will allow a penalty of bp * -0.6 - for 100bp reads (bismark default is 0.2) - --known_splices [file] Supply a .gtf file containing known splice sites (bismark_hisat only) - --slamseq [bool] Run bismark in SLAM-seq mode - --local_alignment [bool] Allow soft-clipping of reads (potentially useful for single-cell experiments) - --assets_dir [path] Assets directory for biscuit_QC, REQUIRED IF IN BISCUIT ALIGNER. can be found at: https://www.cse.huji.ac.il/~ekushele/assets.html - --epiread [bool] Convert bam to biscuit epiread format - - - References If not specified in the configuration file or you wish to overwrite any of the references. 
- --fasta [file] Path to fasta reference - --fasta_index [path] Path to Fasta Index - --bismark_index [path] Path to Bismark index - --bwa_biscuit_index [path] Path to Biscuit index - --bwa_meth_index [path] Path to bwameth index - --save_reference [bool] Save reference(s) to results directory - - Trimming options: - --skip_trimming [bool] Skip read trimming - --clip_r1 [int] Trim the specified number of bases from the 5' end of read 1 (or single-end reads). - --clip_r2 [int] Trim the specified number of bases from the 5' end of read 2 (paired-end only). - --three_prime_clip_r1 [int] Trim the specified number of bases from the 3' end of read 1 AFTER adapter/quality trimming - --three_prime_clip_r2 [int] Trim the specified number of bases from the 3' end of read 2 AFTER adapter/quality trimming - --rrbs [bool] Turn on if dealing with MspI digested material. - - Trimming presets: - --pbat [bool] - --single_cell [bool] - --epignome [bool] - --accell [bool] - --zymo [bool] - --cegx [bool] - - Other options: - --outdir [file] The output directory where the results will be saved - --email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits - --email_on_fail [email] Same as --email, except only send mail if the workflow is not successful - --max_multiqc_email_size [str] Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) - -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic - - AWSBatch options: - --awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch - --awsregion [str] The AWS Region for your AWS Batch job to run on - --awscli [str] Path to the AWS CLI tool - """.stripIndent() + --save_snp_file [bool] Save SNP bed-file from biscuit to results directory. 
Relevant only if '--epiread' is specified + --unmapped [bool] Save unmapped reads to fastq files + --relax_mismatches [bool] Turn on to relax stringency for alignment (set allowed penalty with --num_mismatches) + --num_mismatches [float] 0.6 will allow a penalty of bp * -0.6 - for 100bp reads (bismark default is 0.2) + --known_splices [file] Supply a .gtf file containing known splice sites (bismark_hisat only) + --slamseq [bool] Run bismark in SLAM-seq mode + --local_alignment [bool] Allow soft-clipping of reads (potentially useful for single-cell experiments) + --bismark_align_cpu_per_multicore [int] Specify how many CPUs are required per --multicore for bismark align (default = 3) + --bismark_align_mem_per_multicore [str] Specify how much memory is required per --multicore for bismark align (default = 13.GB) + --assets_dir [path] Path to assets directory for biscuit_QC + --epiread [bool] Convert bam to biscuit epiread format + --whitelist [file] The complement of blacklist, needed for SNP extraction For more instuctions: https://www.cse.huji.ac.il/~ekushele/assets.html#whitelist + --common_dbsnp [file] Common dbSNP for the relevant genome, for SNP filteration + --cpg_file [file] Path to CpG file for the relevant genome (0-besed coordinates, not compressed) + --debug_epiread Debug epiread merging for paired end-keep original epiread file and merged epiread file in debug mode + --debug_epiread_merging Debug epiread merging. Output merged epiread in debug mode + + References If not specified in the configuration file or you wish to overwrite any of the references. 
+ --fasta [file] Path to fasta reference + --fasta_index [path] Path to Fasta Index + --bismark_index [path] Path to Bismark index + --bwa_biscuit_index [path] Path to Biscuit index + --bwa_meth_index [path] Path to bwameth index + --save_reference [bool] Save reference(s) to results directory + + Trimming options: + --skip_trimming [bool] Skip read trimming + --clip_r1 [int] Trim the specified number of bases from the 5' end of read 1 (or single-end reads). + --clip_r2 [int] Trim the specified number of bases from the 5' end of read 2 (paired-end only). + --three_prime_clip_r1 [int] Trim the specified number of bases from the 3' end of read 1 AFTER adapter/quality trimming + --three_prime_clip_r2 [int] Trim the specified number of bases from the 3' end of read 2 AFTER adapter/quality trimming + --rrbs [bool] Turn on if dealing with MspI digested material. + + Trimming presets: + --pbat [bool] + --single_cell [bool] + --epignome [bool] + --accel [bool] + --zymo [bool] + --cegx [bool] + + Other options: + --outdir [file] The output directory where the results will be saved + --email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits + --email_on_fail [email] Same as --email, except only send mail if the workflow is not successful + --max_multiqc_email_size [str] Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) + -name [str] Name for the pipeline run.
If not specified, Nextflow will automatically generate a random mnemonic + + AWSBatch options: + --awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch + --awsregion [str] The AWS Region for your AWS Batch job to run on + --awscli [str] Path to the AWS CLI tool + + """.stripIndent() } // Show help message if (params.help) { - helpMessage() - exit 0 + helpMessage() + exit 0 } // Validate inputs @@ -105,147 +114,139 @@ params.bismark_index = params.genome ? params.genomes[ params.genome ].bismark ? params.bwa_meth_index = params.genome ? params.genomes[ params.genome ].bwa_meth ?: false : false params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false params.fasta_index = params.genome ? params.genomes[ params.genome ].fasta_index ?: false : false -params.bwa_biscuit_index = false assembly_name = (params.fasta.toString().lastIndexOf('/') == -1) ?: params.fasta.toString().substring( params.fasta.toString().lastIndexOf('/')+1) -params.save_pileup_file = false -params.epiread = false -params.assets_dir = false - // Check if genome exists in the config file if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { - exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" + exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" } Channel - .fromPath("$baseDir/assets/where_are_my_files.txt", checkIfExists: true) - .into { ch_wherearemyfiles_for_trimgalore; ch_wherearemyfiles_for_alignment } + .fromPath("$baseDir/assets/where_are_my_files.txt", checkIfExists: true) + .into { ch_wherearemyfiles_for_trimgalore; ch_wherearemyfiles_for_alignment } ch_splicesites_for_bismark_hisat_align = params.known_splices ? 
Channel.fromPath("${params.known_splices}", checkIfExists: true).collect() : file('null') if( params.aligner =~ /bismark/ ){ - assert params.bismark_index || params.fasta : "No reference genome index or fasta file specified" - ch_wherearemyfiles_for_alignment.into { ch_wherearemyfiles_for_bismark_align;ch_wherearemyfiles_for_bismark_dedup_samtools_sort;ch_wherearemyfiles_for_bismark_samtools_sort } + assert params.bismark_index || params.fasta : "No reference genome index or fasta file specified" + ch_wherearemyfiles_for_alignment.into { ch_wherearemyfiles_for_bismark_align;ch_wherearemyfiles_for_bismark_dedup_samtools_sort;ch_wherearemyfiles_for_bismark_samtools_sort } Channel .fromPath(params.fasta, checkIfExists: true) .ifEmpty { exit 1, "fasta file not found : ${params.fasta}" } .into { ch_fasta_for_makeBismarkIndex; ch_fasta_for_picard } - - if( params.bismark_index ){ - Channel - .fromPath(params.bismark_index, checkIfExists: true) - .ifEmpty { exit 1, "Bismark index file not found: ${params.bismark_index}" } - .set { ch_bismark_index_for_bismark_align } + + if( params.bismark_index ){ + Channel + .fromPath(params.bismark_index, checkIfExists: true) + .ifEmpty { exit 1, "Bismark index file not found: ${params.bismark_index}" } + .into { ch_bismark_index_for_bismark_align; ch_bismark_index_for_bismark_methXtract } ch_fasta_for_makeBismarkIndex.close() - } - + } + } else if( params.aligner == 'bwameth' || params.aligner == 'biscuit'){ - assert params.fasta : "No Fasta reference specified!" 
- ch_wherearemyfiles_for_alignment.into { ch_wherearemyfiles_for_bwamem_align; ch_wherearemyfiles_for_biscuit_align; ch_wherearemyfiles_for_samtools_sort_index_flagstat; ch_wherearemyfiles_for_samblaster } - - Channel - .fromPath(params.fasta, checkIfExists: true) - .ifEmpty { exit 1, "fasta file not found : ${params.fasta}" } - .into { ch_fasta_for_makeBwaMemIndex; ch_fasta_for_makeFastaIndex; ch_fasta_for_methyldackel; ch_fasta_for_pileup; ch_fasta_for_epiread; ch_fasta_for_biscuitQC; ch_fasta_for_picard} - - if( params.bwa_meth_index ){ - Channel - .fromPath("${params.bwa_meth_index}*", checkIfExists: true) - .ifEmpty { exit 1, "bwa-meth index file(s) not found: ${params.bwa_meth_index}" } - .set { ch_bwa_meth_indices_for_bwamem_align } - ch_fasta_for_makeBwaMemIndex.close() - } - - if( params.bwa_biscuit_index ){ - Channel - .fromPath("${params.bwa_biscuit_index}*", checkIfExists: true) - .ifEmpty { exit 1, "bwa (biscuit) index file(s) not found: ${params.bwa_biscuit_index}" } - .set { ch_bwa_index_for_biscuit } - ch_fasta_for_makeBwaMemIndex.close() - } - - if( params.fasta_index ){ - Channel - .fromPath(params.fasta_index, checkIfExists: true) - .ifEmpty { exit 1, "fasta index file not found: ${params.fasta_index}" } - .into { ch_fasta_index_for_methyldackel; ch_fasta_index_for_biscuitQC; ch_fasta_index_for_createVCF; ch_fasta_index_for_epiread } - ch_fasta_for_makeFastaIndex.close() - } - } + assert params.fasta : "No Fasta reference specified!" 
+ ch_wherearemyfiles_for_alignment.into { ch_wherearemyfiles_for_bwamem_align; ch_wherearemyfiles_for_biscuit_align; ch_wherearemyfiles_for_samtools_sort_index_flagstat; ch_wherearemyfiles_for_samblaster } -if( params.aligner == 'biscuit' ) { + Channel + .fromPath(params.fasta, checkIfExists: true) + .ifEmpty { exit 1, "fasta file not found : ${params.fasta}" } + .into { ch_fasta_for_makeBwaMemIndex; ch_fasta_for_makeFastaIndex; ch_fasta_for_buildBiscuitQCAssets; ch_fasta_for_methyldackel; ch_fasta_for_pileup; ch_fasta_for_epiread; ch_fasta_for_biscuitQC; ch_fasta_for_picard} + + if( params.bwa_meth_index ){ + Channel + .fromPath("${params.bwa_meth_index}*", checkIfExists: true) + .ifEmpty { exit 1, "bwa-meth index file(s) not found: ${params.bwa_meth_index}" } + .set { ch_bwa_meth_indices_for_bwamem_align } + ch_fasta_for_makeBwaMemIndex.close() + } - Channel - .fromPath("${params.assets_dir}", checkIfExists: true) - .ifEmpty { exit 1, "Assets directory for biscuit QC not found: ${params.assets_dir}" } - .set { ch_assets_dir_for_biscuit_qc } -} + if( params.bwa_biscuit_index ){ + Channel + .fromPath("${params.bwa_biscuit_index}*", checkIfExists: true) + .ifEmpty { exit 1, "bwa (biscuit) index file(s) not found: ${params.bwa_biscuit_index}" } + .set { ch_bwa_index_for_biscuit } + ch_fasta_for_makeBwaMemIndex.close() + } + + if( params.fasta_index ){ + Channel + .fromPath(params.fasta_index, checkIfExists: true) + .ifEmpty { exit 1, "fasta index file not found: ${params.fasta_index}" } + .into { ch_fasta_index_for_methyldackel; ch_fasta_index_for_biscuitQC; ch_fasta_index_for_createVCF; ch_fasta_index_for_epiread } + ch_fasta_for_makeFastaIndex.close() + } + } +if( params.aligner == 'biscuit' && params.assets_dir ) { + Channel + .fromPath("${params.assets_dir}", checkIfExists: true) + .ifEmpty { exit 1, "Assets directory for biscuit QC not found: ${params.assets_dir}" } + .into { ch_assets_dir_for_biscuit_qc; ch_assets_dir_with_cpg_for_epiread } + 
ch_fasta_for_buildBiscuitQCAssets.close() + + // Channel + // .fromPath("${params.assets_dir}/cpg.bed.gz", checkIfExists: true) + // .ifEmpty { exit 1, "CpG file not found : ${params.cpg_file}" } + // .set { ch_cpg_for_epiread; ch_cpg_file_for_cpg_index; } + // } +} if( workflow.profile == 'uppmax' || workflow.profile == 'uppmax_devel' ){ - if( !params.project ) exit 1, "No UPPMAX project ID found! Use --project" + if( !params.project ) exit 1, "No UPPMAX project ID found! Use --project" } // Has the run name been specified by the user? // this has the bonus effect of catching both -name and --name custom_runName = params.name if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { - custom_runName = workflow.runName + custom_runName = workflow.runName } -// Library prep presets -params.rrbs = false -params.pbat = false -params.single_cell = false -params.epignome = false -params.accel = false -params.zymo = false -params.cegx = false +// Trimming presets +clip_r1 = params.clip_r1 +clip_r2 = params.clip_r2 +three_prime_clip_r1 = params.three_prime_clip_r1 +three_prime_clip_r2 = params.three_prime_clip_r2 if(params.pbat){ - params.clip_r1 = 9 - params.clip_r2 = 9 - params.three_prime_clip_r1 = 9 - params.three_prime_clip_r2 = 9 + clip_r1 = 9 + clip_r2 = 9 + three_prime_clip_r1 = 9 + three_prime_clip_r2 = 9 } else if( params.single_cell ){ - params.clip_r1 = 6 - params.clip_r2 = 6 - params.three_prime_clip_r1 = 6 - params.three_prime_clip_r2 = 6 + clip_r1 = 6 + clip_r2 = 6 + three_prime_clip_r1 = 6 + three_prime_clip_r2 = 6 } else if( params.epignome ){ - params.clip_r1 = 8 - params.clip_r2 = 8 - params.three_prime_clip_r1 = 8 - params.three_prime_clip_r2 = 8 + clip_r1 = 8 + clip_r2 = 8 + three_prime_clip_r1 = 8 + three_prime_clip_r2 = 8 } else if( params.accel || params.zymo ){ - params.clip_r1 = 10 - params.clip_r2 = 15 - params.three_prime_clip_r1 = 10 - params.three_prime_clip_r2 = 10 + clip_r1 = 10 + clip_r2 = 15 + three_prime_clip_r1 = 10 + three_prime_clip_r2 = 10 } 
else if( params.cegx ){ - params.clip_r1 = 6 - params.clip_r2 = 6 - params.three_prime_clip_r1 = 2 - params.three_prime_clip_r2 = 2 -} else { - params.clip_r1 = 0 - params.clip_r2 = 0 - params.three_prime_clip_r1 = 0 - params.three_prime_clip_r2 = 0 + clip_r1 = 6 + clip_r2 = 6 + three_prime_clip_r1 = 2 + three_prime_clip_r2 = 2 } if (workflow.profile.contains('awsbatch')) { - // AWSBatch sanity checking - if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" - // Check outdir paths to be S3 buckets if running on AWSBatch - // related: https://github.com/nextflow-io/nextflow/issues/813 - if (!params.outdir.startsWith('s3:')) exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" - // Prevent trace files to be stored on S3 since S3 does not support rolling files. - if (params.tracedir.startsWith('s3:')) exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles." + // AWSBatch sanity checking + if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" + // Check outdir paths to be S3 buckets if running on AWSBatch + // related: https://github.com/nextflow-io/nextflow/issues/813 + if (!params.outdir.startsWith('s3:')) exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" + // Prevent trace files to be stored on S3 since S3 does not support rolling files. + if (params.tracedir.startsWith('s3:')) exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles." 
} // Stage config files @@ -257,99 +258,127 @@ ch_output_docs = file("$baseDir/docs/output.md", checkIfExists: true) * Create a channel for input read files */ if (params.readPaths) { - if (params.single_end) { - Channel - .from(params.readPaths) - .map { row -> [ row[0], [ file(row[1][0], checkIfExists: true) ] ] } - .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } - .into { ch_read_files_for_fastqc; ch_read_files_for_trim_galore } - } else { - Channel - .from(params.readPaths) - .map { row -> [ row[0], [ file(row[1][0], checkIfExists: true), file(row[1][1], checkIfExists: true) ] ] } - .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } - .into { ch_read_files_for_fastqc; ch_read_files_for_trim_galore } - } + if (params.single_end) { + Channel + .from(params.readPaths) + .map { row -> [ row[0], [ file(row[1][0], checkIfExists: true) ] ] } + .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } + .into { ch_read_files_for_fastqc; ch_read_files_for_trim_galore } + } else { + Channel + .from(params.readPaths) + .map { row -> [ row[0], [ file(row[1][0], checkIfExists: true), file(row[1][1], checkIfExists: true) ] ] } + .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } + .into { ch_read_files_for_fastqc; ch_read_files_for_trim_galore } + } } else { - Channel - .fromFilePairs( params.reads, size: params.single_end ? 1 : 2 ) - .ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --single_end on the command line." } - .into { ch_read_files_for_fastqc; ch_read_files_for_trim_galore } + Channel + .fromFilePairs( params.reads, size: params.single_end ? 1 : 2 ) + .ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --single_end on the command line." 
} + .into { ch_read_files_for_fastqc; ch_read_files_for_trim_galore } } +if (params.epiread) { + if (params.whitelist) { + Channel + .fromPath(params.whitelist, checkIfExists: true) + .ifEmpty { exit 1, "Cannot find any whitelist file matching: ${params.whitelist}\nWhitelist file is mandatory if epiread file conversion is required" } + .into { ch_whitelist_for_SNP; ch_whitelist_for_epiread} + } + + if (params.common_dbsnp) { + Channel + .fromPath(params.common_dbsnp, checkIfExists: true) + .ifEmpty { exit 1, "Cannot find any dbSNP file matching: ${params.common_dbsnp}\n" } + .set { ch_commonSNP_for_SNP; } + } + // if (!params.single_end) + // assert params.cpg_file: "No CpG file specified" + + // ch_cpg_for_epiread = Channel.empty() + // if (!params.single_end) { + // if (params.cpg_file) { + // Channel + // .fromPath(params.cpg_file, checkIfExists: true) + // .ifEmpty { exit 1, "CpG file not found : ${params.cpg_file}" } + // .set { ch_cpg_for_epiread; ch_cpg_file_for_cpg_index; } + // } + // } + + +} // Header log info log.info nfcoreHeader() def summary = [:] -if (workflow.revision) summary['Pipeline Release'] = workflow.revision -summary['Pipeline Name'] = 'nf-core/methylseq' -summary['Run Name'] = custom_runName ?: workflow.runName -summary['Reads'] = params.reads -summary['Aligner'] = params.aligner -summary['Spliced alignment'] = params.known_splices ? 'Yes' : 'No' -summary['SLAM-seq'] = params.slamseq ? 'Yes' : 'No' -summary['Local alignment'] = params.local_alignment ? 'Yes' : 'No' -summary['Data Type'] = params.single_end ? 
'Single-End' : 'Paired-End' -summary['Genome'] = params.genome -if( params.bismark_index ) summary['Bismark Index'] = params.bismark_index -if( params.bwa_meth_index ) summary['BWA Meth Index'] = "${params.bwa_meth_index}*" -if( params.bwa_biscuit_index ) summary['BWA Index'] = "${params.bwa_biscuit_index}*" -if( params.fasta ) summary['Fasta Ref'] = params.fasta -if( params.fasta_index ) summary['Fasta Index'] = params.fasta_index -if( params.rrbs ) summary['RRBS Mode'] = 'On' -if( params.relax_mismatches ) summary['Mismatch Func'] = "L,0,-${params.num_mismatches} (Bismark default = L,0,-0.2)" -if( params.skip_trimming ) summary['Trimming Step'] = 'Skipped' -if( params.pbat ) summary['Trim Profile'] = 'PBAT' -if( params.single_cell ) summary['Trim Profile'] = 'Single Cell' -if( params.epignome ) summary['Trim Profile'] = 'TruSeq (EpiGnome)' -if( params.accel ) summary['Trim Profile'] = 'Accel-NGS (Swift)' -if( params.zymo ) summary['Trim Profile'] = 'Zymo Pico-Methyl' -if( params.cegx ) summary['Trim Profile'] = 'CEGX' - -summary['Trim R1'] = params.clip_r1 -summary['Trim R2'] = params.clip_r2 -summary["Trim 3' R1"] = params.three_prime_clip_r1 -summary["Trim 3' R2"] = params.three_prime_clip_r2 -summary['Deduplication'] = params.skip_deduplication || params.rrbs ? 'No' : 'Yes' -summary['Directional Mode'] = params.single_cell || params.zymo || params.non_directional ? 'No' : 'Yes' -summary['All C Contexts'] = params.comprehensive ? 'Yes' : 'No' -if( params.min_depth ) summary['Minimum Depth'] = params.min_depth -if( params.ignore_flags ) summary['MethylDackel'] = 'Ignoring SAM Flags' -if( params.methyl_kit ) summary['MethylDackel'] = 'Producing methyl_kit output' -if( params.assets_dir ) summary['Assets Directory'] = params.assets_dir -if( params.epiread ) summary['Epiread'] = params.epiread ? 'Yes' : 'No' - -summary['Save Reference'] = params.save_reference ? 'Yes' : 'No' -summary['Save Trimmed'] = params.save_trimmed ? 
'Yes' : 'No' -summary['Save Unmapped'] = params.unmapped ? 'Yes' : 'No' -summary['Save Intermediates'] = params.save_align_intermeds ? 'Yes' : 'No' -summary['Save Pileups'] = params.save_pileup_file ? 'Yes' : 'No' - - -summary['Current home'] = "$HOME" -summary['Current path'] = "$PWD" -if( params.project ) summary['UPPMAX Project'] = params.project - -summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" +summary['Run Name'] = custom_runName ?: workflow.runName +summary['Reads'] = params.reads +summary['Aligner'] = params.aligner +summary['Data Type'] = params.single_end ? 'Single-End' : 'Paired-End' +if(params.known_splices) summary['Spliced alignment'] = 'Yes' +if(params.slamseq) summary['SLAM-seq'] = 'Yes' +if(params.local_alignment) summary['Local alignment'] = 'Yes' +if(params.genome) summary['Genome'] = params.genome +if(params.bismark_index) summary['Bismark Index'] = params.bismark_index +if(params.bwa_meth_index) summary['BWA-Meth Index'] = "${params.bwa_meth_index}*" +if(params.bwa_biscuit_index) summary['BWA Index'] = "${params.bwa_biscuit_index}*" +if(params.fasta) summary['Fasta Ref'] = params.fasta +if(params.fasta_index) summary['Fasta Index'] = params.fasta_index +if(params.rrbs) summary['RRBS Mode'] = 'On' +if(params.relax_mismatches) summary['Mismatch Func'] = "L,0,-${params.num_mismatches} (Bismark default = L,0,-0.2)" +if(params.skip_trimming) summary['Trimming Step'] = 'Skipped' +if(params.pbat) summary['Trim Profile'] = 'PBAT' +if(params.single_cell) summary['Trim Profile'] = 'Single Cell' +if(params.epignome) summary['Trim Profile'] = 'TruSeq (EpiGnome)' +if(params.accel) summary['Trim Profile'] = 'Accel-NGS (Swift)' +if(params.zymo) summary['Trim Profile'] = 'Zymo Pico-Methyl' +if(params.cegx) summary['Trim Profile'] = 'CEGX' +summary['Trimming'] = "5'R1: $clip_r1 / 5'R2: $clip_r2 / 3'R1: $three_prime_clip_r1 / 3'R2: $three_prime_clip_r2" +summary['Deduplication'] = 
params.skip_deduplication || params.rrbs ? 'No' : 'Yes' +summary['Directional Mode'] = params.single_cell || params.zymo || params.non_directional ? 'No' : 'Yes' +summary['All C Contexts'] = params.comprehensive ? 'Yes' : 'No' +summary['Cytosine report'] = params.cytosine_report ? 'Yes' : 'No' +if(params.min_depth) summary['Minimum Depth'] = params.min_depth +if(params.ignore_flags) summary['MethylDackel'] = 'Ignoring SAM Flags' +if(params.methyl_kit) summary['MethylDackel'] = 'Producing methyl_kit output' +save_intermeds = []; +if(params.save_reference) save_intermeds.add('Reference genome build') +if(params.save_trimmed) save_intermeds.add('Trimmed FastQ files') +if(params.unmapped) save_intermeds.add('Unmapped reads') +if(params.save_align_intermeds) save_intermeds.add('Intermediate BAM files') +if(params.save_pileup_file) save_intermeds.add('Pileup files') +if(params.save_snp_file) save_intermeds.add('SNP bed-files') +if(save_intermeds.size() > 0) summary['Save Intermediates'] = save_intermeds.join(', ') +debug_mode = []; +if(params.debug_epiread) debug_mode.add('Debug epiread step') +if(params.debug_epiread_merging) debug_mode.add('Debug epiread merging') +if(debug_mode.size() > 0) summary['Debug mode'] = debug_mode.join(', ') +if(params.bismark_align_cpu_per_multicore) summary['Bismark align CPUs per --multicore'] = params.bismark_align_cpu_per_multicore +if(params.bismark_align_mem_per_multicore) summary['Bismark align memory per --multicore'] = params.bismark_align_mem_per_multicore +if(params.assets_dir) summary['Assets Directory'] = params.assets_dir +if(params.soloWCGW_file) summary['soloWCGW File'] = params.soloWCGW_file +if(params.whitelist) summary['Whitelist'] = params.whitelist +if(params.common_dbsnp) summary['Common SNP'] = params.common_dbsnp +if(params.cpg_file) summary['CpG File'] = params.cpg_file +if(params.epiread) summary['Epiread'] = 'Yes' +summary['Output dir'] = params.outdir +summary['Launch dir'] = workflow.launchDir +summary['Working 
dir'] = workflow.workDir +summary['Pipeline dir'] = workflow.projectDir +summary['User'] = workflow.userName +summary['Config Profile'] = workflow.profile if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" -summary['Output dir'] = params.outdir -summary['Launch dir'] = workflow.launchDir -summary['Working dir'] = workflow.workDir -summary['Script dir'] = workflow.projectDir -summary['User'] = workflow.userName if (workflow.profile.contains('awsbatch')) { - summary['AWS Region'] = params.awsregion - summary['AWS Queue'] = params.awsqueue - summary['AWS CLI'] = params.awscli + summary['AWS Region'] = params.awsregion + summary['AWS Queue'] = params.awsqueue + summary['AWS CLI'] = params.awscli } -summary['Config Profile'] = workflow.profile +if(params.project) summary['Cluster Project'] = params.project if (params.config_profile_description) summary['Config Description'] = params.config_profile_description if (params.config_profile_contact) summary['Config Contact'] = params.config_profile_contact if (params.config_profile_url) summary['Config URL'] = params.config_profile_url -if (params.email || params.email_on_fail) { - summary['E-mail Address'] = params.email - summary['E-mail on failure'] = params.email_on_fail - summary['MultiQC maxsize'] = params.max_multiqc_email_size -} +summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" +if(params.email) summary['E-mail Address'] = params.email +if(params.email_on_fail) summary['E-mail on failure'] = params.email_on_fail log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n") log.info "-\033[2m--------------------------------------------------\033[0m-" @@ -357,36 +386,37 @@ log.info "-\033[2m--------------------------------------------------\033[0m-" checkHostname() Channel.from(summary.collect{ [it.key, it.value] }) - .map { k,v -> "
$k
${v ?: 'N/A'}
" } - .reduce { a, b -> return [a, b].join("\n ") } - .map { x -> """ - id: 'nf-core-methylseq-summary' - description: " - this information is collected when the pipeline is started." - section_name: 'nf-core/methylseq Workflow Summary' - section_href: 'https://github.com/nf-core/methylseq' - plot_type: 'html' - data: | -
- $x -
- """.stripIndent() } - .set { ch_workflow_summary } + .map { k,v -> "
$k
${v ?: 'N/A'}
" } + .reduce { a, b -> return [a, b].join("\n ") } + .map { x -> """ + id: 'nf-core-methylseq-summary' + description: " - this information is collected when the pipeline is started." + section_name: 'nf-core/methylseq Workflow Summary' + section_href: 'https://github.com/nf-core/methylseq' + plot_type: 'html' + data: | +
+ $x +
+ """.stripIndent() } + .set { ch_workflow_summary } /* * Parse software version numbers */ process get_software_versions { - publishDir "${params.outdir}/pipeline_info", mode: 'copy', - saveAs: { filename -> - if (filename.indexOf(".csv") > 0) filename - else null - } + publishDir "${params.outdir}/pipeline_info", mode: 'copy', + saveAs: { filename -> + if (filename.indexOf(".csv") > 0) filename + else null + } - output: - file 'software_versions_mqc.yaml' into ch_software_versions_yaml_for_multiqc + output: + file 'software_versions_mqc.yaml' into ch_software_versions_yaml_for_multiqc + file "software_versions.csv" - script: - """ + script: + """ echo "$workflow.manifest.version" &> v_ngi_methylseq.txt echo "$workflow.nextflow.version" &> v_nextflow.txt bismark_genome_preparation --version &> v_bismark_genome_preparation.txt @@ -412,102 +442,122 @@ process get_software_versions { multiqc --version &> v_multiqc.txt samblaster --version &> v_samblaster.txt biscuit &>v_biscuit.txt 2>&1 || true - $baseDir/bin/scrape_software_versions.py &> software_versions_mqc.yaml + scrape_software_versions.py &> software_versions_mqc.yaml """ - - - } /* * PREPROCESSING - Build Bismark index */ if( !params.bismark_index && params.aligner =~ /bismark/ ){ - process makeBismarkIndex { - publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir }, - saveAs: { params.save_reference ? it : null }, mode: 'copy' - - input: - file fasta from ch_fasta_for_makeBismarkIndex - - output: - file "BismarkIndex" into ch_bismark_index_for_bismark_align - - script: - aligner = params.aligner == 'bismark_hisat' ? '--hisat2' : '--bowtie2' - slam = params.slamseq ? '--slam' : '' - """ - mkdir BismarkIndex - cp $fasta BismarkIndex/ - bismark_genome_preparation $aligner $slam BismarkIndex - """ - } + process makeBismarkIndex { + publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir }, + saveAs: { params.save_reference ? 
it : null }, mode: 'copy' + + input: + file fasta from ch_fasta_for_makeBismarkIndex + + output: + file "BismarkIndex" into ch_bismark_index_for_bismark_align + + script: + aligner = params.aligner == 'bismark_hisat' ? '--hisat2' : '--bowtie2' + slam = params.slamseq ? '--slam' : '' + """ + mkdir BismarkIndex + cp $fasta BismarkIndex/ + bismark_genome_preparation $aligner $slam BismarkIndex + """ + } } /* * PREPROCESSING - Build bwa-mem index */ if( !params.bwa_meth_index && params.aligner == 'bwameth'){ - process makeBwaMemIndex { - tag "$fasta" - publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? it : null }, mode: 'copy' + process makeBwaMemIndex { + tag "$fasta" + publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? it : null }, mode: 'copy' - input: - file fasta from ch_fasta_for_makeBwaMemIndex + input: + file fasta from ch_fasta_for_makeBwaMemIndex - output: - file "${fasta}*" into ch_bwa_meth_indices_for_bwamem_align + output: + file "${fasta}*" into ch_bwa_meth_indices_for_bwamem_align - script: - """ - bwameth.py index $fasta - """ - } + script: + """ + bwameth.py index $fasta + """ + } } /* * PREPROCESSING - Build bwa index, using biscuit */ if(!params.bwa_biscuit_index && params.aligner == 'biscuit' ){ - process makeBwaBISCUITIndex { - tag "$fasta" - publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? it : null }, mode: 'copy' - - input: - file fasta from ch_fasta_for_makeBwaMemIndex - - output: - file "${fasta}*" into ch_bwa_index_for_biscuit - - script: - """ - mkdir BiscuitIndex - cp $fasta BiscuitIndex/ - biscuit index $fasta - cp ${fasta}* BiscuitIndex - """ - } + process makeBwaBISCUITIndex { + tag "$fasta" + publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? 
it : null }, mode: 'copy' + + input: + file fasta from ch_fasta_for_makeBwaMemIndex + + output: + file "${fasta}*" into ch_bwa_index_for_biscuit + + script: + """ + mkdir BiscuitIndex + cp $fasta BiscuitIndex/ + biscuit index $fasta + cp ${fasta}* BiscuitIndex + """ + } } /* * PREPROCESSING - Index Fasta file */ if( !params.fasta_index && params.aligner == 'bwameth' || !params.fasta_index && params.aligner == 'biscuit' ){ - process makeFastaIndex { - tag "$fasta" - publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? it : null }, mode: 'copy' + process makeFastaIndex { + tag "$fasta" + publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? it : null }, mode: 'copy' + + input: + file fasta from ch_fasta_for_makeFastaIndex + + output: + file "${fasta}.fai" into ch_fasta_index_for_methyldackel,ch_fasta_index_for_biscuitQC,ch_fasta_index_for_createVCF,ch_fasta_index_for_epiread + + script: + """ + samtools faidx $fasta + """ + } +} - input: - file fasta from ch_fasta_for_makeFastaIndex +/* + * PREPROCESSING - Build Biscuit QC assets + */ +if( !params.assets_dir && params.aligner == 'biscuit' ) { + process buildBiscuitQCAssets { + tag "$fasta" + publishDir path: "${params.outdir}/reference_assets", saveAs: { params.save_reference ? 
it : null }, mode: 'copy' + + input: + file fasta from ch_fasta_for_buildBiscuitQCAssets + + output: + file "assets" into ch_assets_dir_for_biscuit_qc, ch_assets_dir_with_cpg_for_epiread + // file "cpg.bed" into ch_cpg_for_epiread - output: - file "${fasta}.fai" into ch_fasta_index_for_methyldackel,ch_fasta_index_for_biscuitQC,ch_fasta_index_for_createVCF,ch_fasta_index_for_epiread - script: - """ - samtools faidx $fasta - """ - } + script: + """ + build_biscuit_QC_assets.pl -r $fasta -o assets + """ + } } @@ -515,23 +565,23 @@ if( !params.fasta_index && params.aligner == 'bwameth' || !params.fasta_index & * STEP 1 - FastQC */ process fastqc { - tag "$name" - label 'process_medium' - publishDir "${params.outdir}/fastqc", mode: 'copy', - saveAs: { filename -> - filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename" - } - - input: - set val(name), file(reads) from ch_read_files_for_fastqc - - output: - file '*_fastqc.{zip,html}' into ch_fastqc_results_for_multiqc - - script: - """ - fastqc --quiet --threads $task.cpus $reads - """ + tag "$name" + label 'process_medium' + publishDir "${params.outdir}/fastqc", mode: 'copy', + saveAs: { filename -> + filename.indexOf(".zip") > 0 ? 
"zips/$filename" : "$filename" + } + + input: + set val(name), file(reads) from ch_read_files_for_fastqc + + output: + file '*_fastqc.{zip,html}' into ch_fastqc_results_for_multiqc + + script: + """ + fastqc --quiet --threads $task.cpus $reads + """ } /* @@ -539,360 +589,304 @@ process fastqc { */ if( params.skip_trimming ){ ch_trimmed_reads_for_alignment = ch_read_files_for_trim_galore - ch_trim_galore_results_for_multiqc = Channel.from(false) + ch_trim_galore_results_for_multiqc = Channel.from(false) } else { - process trim_galore { - tag "$name" - publishDir "${params.outdir}/trim_galore", mode: 'copy', - saveAs: {filename -> - if( filename.indexOf("_fastqc") > 0 ) "FastQC/$filename" - else if( filename.indexOf("trimming_report.txt" ) > 0) "logs/$filename" - else if( !params.save_trimmed && filename == "where_are_my_files.txt" ) filename - else if( params.save_trimmed && filename != "where_are_my_files.txt" ) filename - else null - } - - input: - set val(name), file(reads) from ch_read_files_for_trim_galore - file wherearemyfiles from ch_wherearemyfiles_for_trimgalore.collect() - - output: - set val(name), file('*fq.gz') into ch_trimmed_reads_for_alignment - file "*trimming_report.txt" into ch_trim_galore_results_for_multiqc - file "*_fastqc.{zip,html}" - file "where_are_my_files.txt" - - script: - def c_r1 = clip_r1 > 0 ? "--clip_r1 $clip_r1" : '' - def c_r2 = clip_r2 > 0 ? "--clip_r2 $clip_r2" : '' - def tpc_r1 = three_prime_clip_r1 > 0 ? "--three_prime_clip_r1 $three_prime_clip_r1" : '' - def tpc_r2 = three_prime_clip_r2 > 0 ? "--three_prime_clip_r2 $three_prime_clip_r2" : '' - def rrbs = params.rrbs ? 
"--rrbs" : '' - def cores = 1 - if(task.cpus){ - cores = (task.cpus as int) - 4 - if (params.single_end) cores = (task.cpus as int) - 3 - if (cores < 1) cores = 1 - if (cores > 4) cores = 4 - } - if( params.single_end ) { - """ - trim_galore --fastqc --gzip $reads \ - $rrbs $c_r1 $tpc_r1 --cores $cores - """ - } else { - """ - trim_galore --fastqc --gzip --paired $reads \ - $rrbs $c_r1 $c_r2 $tpc_r1 $tpc_r2 --cores $cores - """ - } - } + process trim_galore { + tag "$name" + publishDir "${params.outdir}/trim_galore", mode: 'copy', + saveAs: {filename -> + if( filename.indexOf("_fastqc") > 0 ) "FastQC/$filename" + else if( filename.indexOf("trimming_report.txt" ) > 0) "logs/$filename" + else if( !params.save_trimmed && filename == "where_are_my_files.txt" ) filename + else if( params.save_trimmed && filename != "where_are_my_files.txt" ) filename + else null + } + + input: + set val(name), file(reads) from ch_read_files_for_trim_galore + file wherearemyfiles from ch_wherearemyfiles_for_trimgalore.collect() + + output: + set val(name), file('*fq.gz') into ch_trimmed_reads_for_alignment + file "*trimming_report.txt" into ch_trim_galore_results_for_multiqc + file "*_fastqc.{zip,html}" + file "where_are_my_files.txt" + + script: + def c_r1 = clip_r1 > 0 ? "--clip_r1 $clip_r1" : '' + def c_r2 = clip_r2 > 0 ? "--clip_r2 $clip_r2" : '' + def tpc_r1 = three_prime_clip_r1 > 0 ? "--three_prime_clip_r1 $three_prime_clip_r1" : '' + def tpc_r2 = three_prime_clip_r2 > 0 ? "--three_prime_clip_r2 $three_prime_clip_r2" : '' + def rrbs = params.rrbs ? 
"--rrbs" : '' + def cores = 1 + if(task.cpus){ + cores = (task.cpus as int) - 4 + if (params.single_end) cores = (task.cpus as int) - 3 + if (cores < 1) cores = 1 + if (cores > 4) cores = 4 + } + if( params.single_end ) { + """ + trim_galore --fastqc --gzip $reads \ + $rrbs $c_r1 $tpc_r1 --cores $cores + """ + } else { + """ + trim_galore --fastqc --gzip --paired $reads \ + $rrbs $c_r1 $c_r2 $tpc_r1 $tpc_r2 --cores $cores + """ + } + } } /* * STEP 3.1 - align with Bismark */ if( params.aligner =~ /bismark/ ){ - process bismark_align { - tag "$name" - publishDir "${params.outdir}/bismark_alignments", mode: 'copy', - saveAs: {filename -> - if( filename.indexOf(".fq.gz") > 0 ) "unmapped/$filename" - else if( filename.indexOf("report.txt") > 0 ) "logs/$filename" - else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt" ) filename - else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt" ) filename - else null - } - - input: - set val(name), file(reads) from ch_trimmed_reads_for_alignment - file index from ch_bismark_index_for_bismark_align.collect() - file wherearemyfiles from ch_wherearemyfiles_for_bismark_align.collect() - file knownsplices from ch_splicesites_for_bismark_hisat_align - - output: - set val(name), file("*.bam") into ch_bam_for_bismark_deduplicate, ch_bam_for_bismark_summary, ch_bam_for_samtools_sort_index_flagstat - set val(name), file("*report.txt") into ch_bismark_align_log_for_bismark_report, ch_bismark_align_log_for_bismark_summary, ch_bismark_align_log_for_multiqc - file "*.fq.gz" optional true - file "where_are_my_files.txt" - - script: - // Paired-end or single end input files - input = params.single_end ? reads : "-1 ${reads[0]} -2 ${reads[1]}" - - // Choice of read aligner - aligner = params.aligner == "bismark_hisat" ? 
"--hisat2" : "--bowtie2" - - // Optional extra bismark parameters - splicesites = params.aligner == "bismark_hisat" && knownsplices.name != 'null' ? "--known-splicesite-infile <(hisat2_extract_splice_sites.py ${knownsplices})" : '' - pbat = params.pbat ? "--pbat" : '' - non_directional = params.single_cell || params.zymo || params.non_directional ? "--non_directional" : '' - unmapped = params.unmapped ? "--unmapped" : '' - mismatches = params.relax_mismatches ? "--score_min L,0,-${params.num_mismatches}" : '' - soft_clipping = params.local_alignment ? "--local" : '' - - // Try to assign sensible bismark memory units according to what the task was given - multicore = '' - if( task.cpus ){ - // Numbers based on recommendation by Felix for a typical mouse genome - if( params.single_cell || params.zymo || params.non_directional ){ - cpu_per_multicore = 5 - mem_per_multicore = (18.GB).toBytes() - } else { - cpu_per_multicore = 3 - mem_per_multicore = (13.GB).toBytes() - } - // How many multicore splits can we afford with the cpus we have? 
- ccore = ((task.cpus as int) / cpu_per_multicore) as int - // Check that we have enough memory, assuming 13GB memory per instance (typical for mouse alignment) - try { - tmem = (task.memory as nextflow.util.MemoryUnit).toBytes() - mcore = (tmem / mem_per_multicore) as int - ccore = Math.min(ccore, mcore) - } catch (all) { - log.debug "Warning: Not able to define bismark align multicore based on available memory" - } - if( ccore > 1 ){ - multicore = "--multicore $ccore" - } - } - - // Main command - """ - bismark $input \\ - $aligner \\ - --bam $pbat $non_directional $unmapped $mismatches $multicore \\ - --genome $index \\ - $reads \\ - $soft_clipping \\ - $splicesites - """ - } + process bismark_align { + tag "$name" + publishDir "${params.outdir}/bismark_alignments", mode: 'copy', + saveAs: {filename -> + if( filename.indexOf(".fq.gz") > 0 ) "unmapped/$filename" + else if( filename.indexOf("report.txt") > 0 ) "logs/$filename" + else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt" ) filename + else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt" ) filename + else null + } + + input: + set val(name), file(reads) from ch_trimmed_reads_for_alignment + file index from ch_bismark_index_for_bismark_align.collect() + file wherearemyfiles from ch_wherearemyfiles_for_bismark_align.collect() + file knownsplices from ch_splicesites_for_bismark_hisat_align + + output: + set val(name), file("*.bam") into ch_bam_for_bismark_deduplicate, ch_bam_for_bismark_summary, ch_bam_for_samtools_sort_index_flagstat + set val(name), file("*report.txt") into ch_bismark_align_log_for_bismark_report, ch_bismark_align_log_for_bismark_summary, ch_bismark_align_log_for_multiqc + file "*.fq.gz" optional true + file "where_are_my_files.txt" + + script: + // Paired-end or single end input files + input = params.single_end ? 
reads : "-1 ${reads[0]} -2 ${reads[1]}" + + // Choice of read aligner + aligner = params.aligner == "bismark_hisat" ? "--hisat2" : "--bowtie2" + + // Optional extra bismark parameters + splicesites = params.aligner == "bismark_hisat" && knownsplices.name != 'null' ? "--known-splicesite-infile <(hisat2_extract_splice_sites.py ${knownsplices})" : '' + pbat = params.pbat ? "--pbat" : '' + non_directional = params.single_cell || params.zymo || params.non_directional ? "--non_directional" : '' + unmapped = params.unmapped ? "--unmapped" : '' + mismatches = params.relax_mismatches ? "--score_min L,0,-${params.num_mismatches}" : '' + soft_clipping = params.local_alignment ? "--local" : '' + + // Try to assign sensible bismark memory units according to what the task was given + multicore = '' + if( task.cpus ){ + // Numbers based on recommendation by Felix for a typical mouse genome + if( params.single_cell || params.zymo || params.non_directional ){ + cpu_per_multicore = 5 + mem_per_multicore = (18.GB).toBytes() + } else { + cpu_per_multicore = 3 + mem_per_multicore = (13.GB).toBytes() + } + // Check if the user has specified this and overwrite if so + if(params.bismark_align_cpu_per_multicore) { + cpu_per_multicore = (params.bismark_align_cpu_per_multicore as int) + } + if(params.bismark_align_mem_per_multicore) { + mem_per_multicore = (params.bismark_align_mem_per_multicore as nextflow.util.MemoryUnit).toBytes() + } + // How many multicore splits can we afford with the cpus we have? 
+ ccore = ((task.cpus as int) / cpu_per_multicore) as int + // Check that we have enough memory, assuming 13GB memory per instance (typical for mouse alignment) + try { + tmem = (task.memory as nextflow.util.MemoryUnit).toBytes() + mcore = (tmem / mem_per_multicore) as int + ccore = Math.min(ccore, mcore) + } catch (all) { + log.debug "Warning: Not able to define bismark align multicore based on available memory" + } + if( ccore > 1 ){ + multicore = "--multicore $ccore" + } + } + + // Main command + """ + bismark $input \\ + $aligner \\ + --bam $pbat $non_directional $unmapped $mismatches $multicore \\ + --genome $index \\ + $reads \\ + $soft_clipping \\ + $splicesites + """ + } /* - * STEP 4 - Samtools sort bismark - */ - process samtools_sort_index_flagstat_bismark { - tag "$name" - publishDir "${params.outdir}/samtools", mode: 'copy', - saveAs: {filename -> - if(filename.indexOf("report.txt") > 0) "logs/$filename" - else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename - else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename - else null - } - - input: - set val(name), file(bam) from ch_bam_for_samtools_sort_index_flagstat - file wherearemyfiles from ch_wherearemyfiles_for_bismark_samtools_sort.collect() - - output: - set val(name), file("*.sorted.bam") into ch_bam_for_preseq,ch_bam_sorted_for_picard - file "where_are_my_files.txt" - - script: - def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false - def sort_mem = avail_mem && avail_mem > 2 ? 
"-m ${avail_mem}G" : '' - """ - samtools sort $bam \\ - -@ ${task.cpus} $sort_mem \\ - -o ${bam.baseName}.sorted.bam - """ - } + * STEP 4 - Bismark deduplicate + */ + if( params.skip_deduplication || params.rrbs ) { + ch_bam_for_bismark_deduplicate.into { ch_bam_dedup_for_bismark_methXtract; ch_bam_dedup_for_qualimap } + ch_bismark_dedup_log_for_bismark_report = Channel.from(false) + ch_bismark_dedup_log_for_bismark_summary = Channel.from(false) + ch_bismark_dedup_log_for_multiqc = Channel.from(false) + } else { + process bismark_deduplicate { + tag "$name" + publishDir "${params.outdir}/bismark_deduplicated", mode: 'copy', + saveAs: {filename -> filename.indexOf(".bam") == -1 ? "logs/$filename" : "$filename"} + + input: + set val(name), file(bam) from ch_bam_for_bismark_deduplicate + + output: + set val(name), file("*.deduplicated.bam") into ch_bam_dedup_for_bismark_methXtract,ch_bam_dedup_for_qualimap + set val(name), file("*.deduplication_report.txt") into ch_bismark_dedup_log_for_bismark_report, ch_bismark_dedup_log_for_bismark_summary, ch_bismark_dedup_log_for_multiqc + + script: + fq_type = params.single_end ? '-s' : '-p' + """ + deduplicate_bismark $fq_type --bam $bam + """ + } + } - - /* - * STEP 5 - Bismark deduplicate - */ - if( params.skip_deduplication || params.rrbs ) { - ch_bam_for_bismark_deduplicate.into { ch_bam_dedup_for_bismark_methXtract; ch_dedup_bam_for_samtools_sort_index_flagstat } - ch_bismark_dedup_log_for_bismark_report = Channel.from(false) - ch_bismark_dedup_log_for_bismark_summary = Channel.from(false) - ch_bismark_dedup_log_for_multiqc = Channel.from(false) - } else { - process bismark_deduplicate { - tag "$name" - publishDir "${params.outdir}/bismark_deduplicated", mode: 'copy', - saveAs: {filename -> filename.indexOf(".bam") == -1 ? 
"logs/$filename" : "$filename"} - - input: - set val(name), file(bam) from ch_bam_for_bismark_deduplicate - - output: - set val(name), file("*.deduplicated.bam") into ch_bam_dedup_for_bismark_methXtract,ch_dedup_bam_for_samtools_sort_index_flagstat - set val(name), file("*.deduplication_report.txt") into ch_bismark_dedup_log_for_bismark_report, ch_bismark_dedup_log_for_bismark_summary, ch_bismark_dedup_log_for_multiqc - - script: - fq_type = params.single_end ? '-s' : '-p' - """ - deduplicate_bismark $fq_type --bam $bam - """ - } - } - /* - * STEP 6 - Samtools sort bismark after dedup - */ - process samtools_sort_index_flagstat_dedup_bismark { - tag "$name" - publishDir "${params.outdir}/samtools", mode: 'copy', - saveAs: {filename -> - if(filename.indexOf("report.txt") > 0) "logs/$filename" - else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename - else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename - else null - } - - input: - set val(name), file(bam) from ch_dedup_bam_for_samtools_sort_index_flagstat - file wherearemyfiles from ch_wherearemyfiles_for_bismark_dedup_samtools_sort.collect() - - output: - set val(name), file("*.sorted.bam") into ch_bam_dedup_for_qualimap - file "where_are_my_files.txt" - - script: - def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false - def sort_mem = avail_mem && avail_mem > 2 ? 
"-m ${avail_mem}G" : '' - """ - samtools sort $bam \\ - -@ ${task.cpus} $sort_mem \\ - -o ${bam.baseName}.sorted.bam - """ - } - + * STEP 5 - Bismark methylation extraction + */ + process bismark_methXtract { + tag "$name" + publishDir "${params.outdir}/bismark_methylation_calls", mode: 'copy', + saveAs: {filename -> + if( filename.indexOf("splitting_report.txt" ) > 0 ) "logs/$filename" + else if( filename.indexOf("M-bias" ) > 0) "m-bias/$filename" + else if( filename.indexOf(".cov" ) > 0 ) "methylation_coverage/$filename" + else if( filename.indexOf("bedGraph" ) > 0 ) "bedGraph/$filename" + else if( filename.indexOf("CpG_report" ) > 0 ) "stranded_CpG_report/$filename" + else "methylation_calls/$filename" + } + + input: + set val(name), file(bam) from ch_bam_dedup_for_bismark_methXtract + file index from ch_bismark_index_for_bismark_methXtract.collect() + + output: + set val(name), file("*splitting_report.txt") into ch_bismark_splitting_report_for_bismark_report, ch_bismark_splitting_report_for_bismark_summary, ch_bismark_splitting_report_for_multiqc + set val(name), file("*.M-bias.txt") into ch_bismark_mbias_for_bismark_report, ch_bismark_mbias_for_bismark_summary, ch_bismark_mbias_for_multiqc + file '*.{png,gz}' + + script: + comprehensive = params.comprehensive ? '--comprehensive --merge_non_CpG' : '' + cytosine_report = params.cytosine_report ? "--cytosine_report --genome_folder ${index} " : '' + meth_cutoff = params.meth_cutoff ? 
"--cutoff ${params.meth_cutoff}" : '' + multicore = '' + if( task.cpus ){ + // Numbers based on Bismark docs + ccore = ((task.cpus as int) / 3) as int + if( ccore > 1 ){ + multicore = "--multicore $ccore" + } + } + buffer = '' + if( task.memory ){ + mbuffer = (task.memory as nextflow.util.MemoryUnit) - 2.GB + // only set if we have more than 6GB available + if( mbuffer.compareTo(4.GB) == 1 ){ + buffer = "--buffer_size ${mbuffer.toGiga()}G" + } + } + if(params.single_end) { + """ + bismark_methylation_extractor $comprehensive $meth_cutoff \\ + $multicore $buffer $cytosine_report \\ + --bedGraph \\ + --counts \\ + --gzip \\ + -s \\ + --report \\ + $bam + """ + } else { + """ + bismark_methylation_extractor $comprehensive $meth_cutoff \\ + $multicore $buffer $cytosine_report \\ + --ignore_r2 2 \\ + --ignore_3prime_r2 2 \\ + --bedGraph \\ + --counts \\ + --gzip \\ + -p \\ + --no_overlap \\ + --report \\ + $bam + """ + } + } + + + ch_bismark_align_log_for_bismark_report + .join(ch_bismark_dedup_log_for_bismark_report) + .join(ch_bismark_splitting_report_for_bismark_report) + .join(ch_bismark_mbias_for_bismark_report) + .set{ ch_bismark_logs_for_bismark_report } + + + /* + * STEP 6 - Bismark Sample Report + */ + process bismark_report { + tag "$name" + publishDir "${params.outdir}/bismark_reports", mode: 'copy' + + input: + set val(name), file(align_log), file(dedup_log), file(splitting_report), file(mbias) from ch_bismark_logs_for_bismark_report + + output: + file '*{html,txt}' into ch_bismark_reports_results_for_multiqc + + script: + """ + bismark2report \\ + --alignment_report $align_log \\ + --dedup_report $dedup_log \\ + --splitting_report $splitting_report \\ + --mbias_report $mbias + """ + } + + /* + * STEP 7 - Bismark Summary Report + */ + process bismark_summary { + publishDir "${params.outdir}/bismark_summary", mode: 'copy' - /* - * STEP 7 - Bismark methylation extraction - */ - process bismark_methXtract { - tag "$name" - publishDir 
"${params.outdir}/bismark_methylation_calls", mode: 'copy', - saveAs: {filename -> - if( filename.indexOf("splitting_report.txt" ) > 0 ) "logs/$filename" - else if( filename.indexOf("M-bias" ) > 0) "m-bias/$filename" - else if( filename.indexOf(".cov" ) > 0 ) "methylation_coverage/$filename" - else if( filename.indexOf("bedGraph" ) > 0 ) "bedGraph/$filename" - else if( filename.indexOf("CpG_report" ) > 0 ) "stranded_CpG_report/$filename" - else "methylation_calls/$filename" - } - - input: - set val(name), file(bam) from ch_bam_dedup_for_bismark_methXtract - file index from ch_bismark_index_for_bismark_methXtract.collect() - - output: - set val(name), file("*splitting_report.txt") into ch_bismark_splitting_report_for_bismark_report, ch_bismark_splitting_report_for_bismark_summary, ch_bismark_splitting_report_for_multiqc - set val(name), file("*.M-bias.txt") into ch_bismark_mbias_for_bismark_report, ch_bismark_mbias_for_bismark_summary, ch_bismark_mbias_for_multiqc - file '*.{png,gz}' - - script: - comprehensive = params.comprehensive ? '--comprehensive --merge_non_CpG' : '' - cytosine_report = params.cytosine_report ? "--cytosine_report --genome_folder ${index} " : '' - meth_cutoff = params.meth_cutoff ? 
"--cutoff ${params.meth_cutoff}" : '' - multicore = '' - if( task.cpus ){ - // Numbers based on Bismark docs - ccore = ((task.cpus as int) / 3) as int - if( ccore > 1 ){ - multicore = "--multicore $ccore" - } - } - buffer = '' - if( task.memory ){ - mbuffer = (task.memory as nextflow.util.MemoryUnit) - 2.GB - // only set if we have more than 6GB available - if( mbuffer.compareTo(4.GB) == 1 ){ - buffer = "--buffer_size ${mbuffer.toGiga()}G" - } - } - if(params.single_end) { - """ - bismark_methylation_extractor $comprehensive $meth_cutoff \\ - $multicore $buffer $cytosine_report \\ - --bedGraph \\ - --counts \\ - --gzip \\ - -s \\ - --report \\ - $bam - """ - } else { - """ - bismark_methylation_extractor $comprehensive $meth_cutoff \\ - $multicore $buffer $cytosine_report \\ - --ignore_r2 2 \\ - --ignore_3prime_r2 2 \\ - --bedGraph \\ - --counts \\ - --gzip \\ - -p \\ - --no_overlap \\ - --report \\ - $bam - """ - } - } - - ch_bismark_align_log_for_bismark_report - .join(ch_bismark_dedup_log_for_bismark_report) - .join(ch_bismark_splitting_report_for_bismark_report) - .join(ch_bismark_mbias_for_bismark_report) - .set{ ch_bismark_logs_for_bismark_report } - - - /* - * STEP 8 - Bismark Sample Report - */ - process bismark_report { - tag "$name" - publishDir "${params.outdir}/bismark_reports", mode: 'copy' - - input: - set val(name), file(align_log), file(dedup_log), file(splitting_report), file(mbias) from ch_bismark_logs_for_bismark_report - - output: - file '*{html,txt}' into ch_bismark_reports_results_for_multiqc - - script: - """ - bismark2report \\ - --alignment_report $align_log \\ - --dedup_report $dedup_log \\ - --splitting_report $splitting_report \\ - --mbias_report $mbias - """ - } - - /* - * STEP 8 - Bismark Summary Report - */ - process bismark_summary { - publishDir "${params.outdir}/bismark_summary", mode: 'copy' - - input: - file ('*') from ch_bam_for_bismark_summary.collect() - file ('*') from ch_bismark_align_log_for_bismark_summary.collect() - file 
('*') from ch_bismark_dedup_log_for_bismark_summary.collect() - file ('*') from ch_bismark_splitting_report_for_bismark_summary.collect() - file ('*') from ch_bismark_mbias_for_bismark_summary.collect() - - output: - file '*{html,txt}' into ch_bismark_summary_results_for_multiqc - - script: - """ - bismark2summary - """ - } + input: + file ('*') from ch_bam_for_bismark_summary.collect() + file ('*') from ch_bismark_align_log_for_bismark_summary.collect() + file ('*') from ch_bismark_dedup_log_for_bismark_summary.collect() + file ('*') from ch_bismark_splitting_report_for_bismark_summary.collect() + file ('*') from ch_bismark_mbias_for_bismark_summary.collect() + + output: + file '*{html,txt}' into ch_bismark_summary_results_for_multiqc + + script: + """ + bismark2summary + """ + } } // End of bismark processing block else { - ch_bismark_align_log_for_multiqc = Channel.from(false) - ch_bismark_dedup_log_for_multiqc = Channel.from(false) - ch_bismark_splitting_report_for_multiqc = Channel.from(false) - ch_bismark_mbias_for_multiqc = Channel.from(false) - ch_bismark_reports_results_for_multiqc = Channel.from(false) - ch_bismark_summary_results_for_multiqc = Channel.from(false) + ch_bismark_align_log_for_multiqc = Channel.from(false) + ch_bismark_dedup_log_for_multiqc = Channel.from(false) + ch_bismark_splitting_report_for_multiqc = Channel.from(false) + ch_bismark_mbias_for_multiqc = Channel.from(false) + ch_bismark_reports_results_for_multiqc = Channel.from(false) + ch_bismark_summary_results_for_multiqc = Channel.from(false) } @@ -901,259 +895,255 @@ else { * Process with bwa-mem and assorted tools */ if( params.aligner == 'bwameth' ){ - process bwamem_align { - tag "$name" - publishDir "${params.outdir}/bwa-mem_alignments", mode: 'copy', - saveAs: {filename -> - if( !params.save_align_intermeds && filename == "where_are_my_files.txt" ) filename - else if( params.save_align_intermeds && filename != "where_are_my_files.txt" ) filename - else null - } - - input: - set 
val(name), file(reads) from ch_trimmed_reads_for_alignment - file bwa_meth_indices from ch_bwa_meth_indices_for_bwamem_align.collect() - file wherearemyfiles from ch_wherearemyfiles_for_bwamem_align.collect() - - output: - set val(name), file('*.bam') into ch_bam_for_samtools_sort_index_flagstat - file "where_are_my_files.txt" - - script: - fasta = bwa_meth_indices[0].toString() - '.bwameth' - '.c2t' - '.amb' - '.ann' - '.bwt' - '.pac' - '.sa' - prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?(\.bz2)?$/ - """ - bwameth.py \\ - --threads ${task.cpus} \\ - --reference $fasta \\ - $reads | samtools view -bS - > ${prefix}.bam - """ - } - - - /* - * STEP 4.- samtools flagstat on samples - */ - process samtools_sort_index_flagstat { - tag "$name" - publishDir "${params.outdir}/samtools", mode: 'copy', - saveAs: {filename -> - if(filename.indexOf("report.txt") > 0) "logs/$filename" - else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename - else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename - else null - } - - input: - set val(name), file(bam) from ch_bam_for_samtools_sort_index_flagstat - file wherearemyfiles from ch_wherearemyfiles_for_samtools_sort_index_flagstat.collect() - - output: - set val(name), file("${bam.baseName}.sorted.bam") into ch_bam_sorted_for_markDuplicates,ch_bam_for_preseq,ch_bam_sorted_for_picard - set val(name), file("${bam.baseName}.sorted.bam.bai") into ch_bam_index - file "${bam.baseName}_flagstat_report.txt" into ch_flagstat_results_for_multiqc - file "${bam.baseName}_stats_report.txt" into ch_samtools_stats_results_for_multiqc - file "where_are_my_files.txt" - - script: - def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false - def sort_mem = avail_mem && avail_mem > 2 ? 
"-m ${avail_mem}G" : '' - """ - samtools sort $bam \\ - -@ ${task.cpus} $sort_mem \\ - -o ${bam.baseName}.sorted.bam - samtools index ${bam.baseName}.sorted.bam - samtools flagstat ${bam.baseName}.sorted.bam > ${bam.baseName}_flagstat_report.txt - samtools stats ${bam.baseName}.sorted.bam > ${bam.baseName}_stats_report.txt - """ - } - - /* - * STEP 5 - Mark duplicates - */ - if( params.skip_deduplication || params.rrbs ) { - ch_bam_sorted_for_markDuplicates.into { ch_bam_dedup_for_methyldackel; ch_bam_dedup_for_qualimap } - ch_bam_index.set { ch_bam_index_for_methyldackel } - ch_markDups_results_for_multiqc = Channel.from(false) - } else { - process markDuplicates { - tag "$name" - publishDir "${params.outdir}/bwa-mem_markDuplicates", mode: 'copy', - saveAs: {filename -> filename.indexOf(".bam") == -1 ? "logs/$filename" : "$filename"} - - input: - set val(name), file(bam) from ch_bam_sorted_for_markDuplicates - - output: - set val(name), file("${bam.baseName}.markDups.bam") into ch_bam_dedup_for_methyldackel, ch_bam_dedup_for_qualimap - set val(name), file("${bam.baseName}.markDups.bam.bai") into ch_bam_index_for_methyldackel //ToDo check if this correctly overrides the original channel - file "${bam.baseName}.markDups_metrics.txt" into ch_markDups_results_for_multiqc - - script: - if( !task.memory ){ - log.info "[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this." 
- avail_mem = 3 - } else { - avail_mem = task.memory.toGiga() - } - """ - picard -Xmx${avail_mem}g MarkDuplicates \\ - INPUT=$bam \\ - OUTPUT=${bam.baseName}.markDups.bam \\ - METRICS_FILE=${bam.baseName}.markDups_metrics.txt \\ - REMOVE_DUPLICATES=false \\ - ASSUME_SORTED=true \\ - PROGRAM_RECORD_ID='null' \\ - VALIDATION_STRINGENCY=LENIENT - samtools index ${bam.baseName}.markDups.bam - """ - } - } - - /* - * STEP 6 - extract methylation with MethylDackel - */ - - process methyldackel { - tag "$name" - publishDir "${params.outdir}/MethylDackel", mode: 'copy' - - input: - set val(name), - file(bam), - file(bam_index), - file(fasta), - file(fasta_index) from ch_bam_dedup_for_methyldackel - .join(ch_bam_index_for_methyldackel) - .combine(ch_fasta_for_methyldackel) - .combine(ch_fasta_index_for_methyldackel) - - - output: - file "${bam.baseName}*" into ch_methyldackel_results_for_multiqc - - script: - all_contexts = params.comprehensive ? '--CHG --CHH' : '' - min_depth = params.min_depth > 0 ? "--minDepth ${params.min_depth}" : '' - ignore_flags = params.ignore_flags ? "--ignoreFlags" : '' - methyl_kit = params.methyl_kit ? 
"--methylKit" : '' - """ - MethylDackel extract $all_contexts $ignore_flags $methyl_kit $min_depth $fasta $bam - MethylDackel mbias $all_contexts $ignore_flags $fasta $bam ${bam.baseName} --txt > ${bam.baseName}_methyldackel.txt - """ - } + process bwamem_align { + tag "$name" + publishDir "${params.outdir}/bwa-mem_alignments", mode: 'copy', + saveAs: {filename -> + if( !params.save_align_intermeds && filename == "where_are_my_files.txt" ) filename + else if( params.save_align_intermeds && filename != "where_are_my_files.txt" ) filename + else null + } + + input: + set val(name), file(reads) from ch_trimmed_reads_for_alignment + file bwa_meth_indices from ch_bwa_meth_indices_for_bwamem_align.collect() + file wherearemyfiles from ch_wherearemyfiles_for_bwamem_align.collect() + + output: + set val(name), file('*.bam') into ch_bam_for_samtools_sort_index_flagstat, ch_bam_for_preseq + file "where_are_my_files.txt" + + script: + fasta = bwa_meth_indices[0].toString() - '.bwameth' - '.c2t' - '.amb' - '.ann' - '.bwt' - '.pac' - '.sa' + prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?(\.bz2)?$/ + """ + bwameth.py \\ + --threads ${task.cpus} \\ + --reference $fasta \\ + $reads | samtools view -bS - > ${prefix}.bam + """ + } + + + /* + * STEP 4.- samtools flagstat on samples + */ + process samtools_sort_index_flagstat { + tag "$name" + publishDir "${params.outdir}/samtools", mode: 'copy', + saveAs: {filename -> + if(filename.indexOf("report.txt") > 0) "logs/$filename" + else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename + else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename + else null + } + + input: + set val(name), file(bam) from ch_bam_for_samtools_sort_index_flagstat + file wherearemyfiles from ch_wherearemyfiles_for_samtools_sort_index_flagstat.collect() + + 
output: + set val(name), file("${bam.baseName}.sorted.bam") into ch_bam_sorted_for_markDuplicates,ch_bam_sorted_for_picard + set val(name), file("${bam.baseName}.sorted.bam.bai") into ch_bam_index + file "${bam.baseName}_flagstat_report.txt" into ch_flagstat_results_for_multiqc + file "${bam.baseName}_stats_report.txt" into ch_samtools_stats_results_for_multiqc + file "where_are_my_files.txt" + + script: + def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false + def sort_mem = avail_mem && avail_mem > 2 ? "-m ${avail_mem}G" : '' + """ + samtools sort $bam \\ + -@ ${task.cpus} $sort_mem \\ + -o ${bam.baseName}.sorted.bam + samtools index ${bam.baseName}.sorted.bam + samtools flagstat ${bam.baseName}.sorted.bam > ${bam.baseName}_flagstat_report.txt + samtools stats ${bam.baseName}.sorted.bam > ${bam.baseName}_stats_report.txt + """ + } + + /* + * STEP 5 - Mark duplicates + */ + if( params.skip_deduplication || params.rrbs ) { + ch_bam_sorted_for_markDuplicates.into { ch_bam_dedup_for_methyldackel; ch_bam_dedup_for_qualimap } + ch_bam_index.set { ch_bam_index_for_methyldackel } + ch_markDups_results_for_multiqc = Channel.from(false) + } else { + process markDuplicates { + tag "$name" + publishDir "${params.outdir}/bwa-mem_markDuplicates", mode: 'copy', + saveAs: {filename -> filename.indexOf(".bam") == -1 ? "logs/$filename" : "$filename"} + + input: + set val(name), file(bam) from ch_bam_sorted_for_markDuplicates + + output: + set val(name), file("${bam.baseName}.markDups.bam") into ch_bam_dedup_for_methyldackel, ch_bam_dedup_for_qualimap + set val(name), file("${bam.baseName}.markDups.bam.bai") into ch_bam_index_for_methyldackel //ToDo check if this correctly overrides the original channel + file "${bam.baseName}.markDups_metrics.txt" into ch_markDups_results_for_multiqc + + script: + if( !task.memory ){ + log.info "[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. 
Specify process memory requirements to change this." + avail_mem = 3 + } else { + avail_mem = task.memory.toGiga() + } + """ + picard -Xmx${avail_mem}g MarkDuplicates \\ + INPUT=$bam \\ + OUTPUT=${bam.baseName}.markDups.bam \\ + METRICS_FILE=${bam.baseName}.markDups_metrics.txt \\ + REMOVE_DUPLICATES=false \\ + ASSUME_SORTED=true \\ + PROGRAM_RECORD_ID='null' \\ + VALIDATION_STRINGENCY=LENIENT + samtools index ${bam.baseName}.markDups.bam + """ + } + } + + /* + * STEP 6 - extract methylation with MethylDackel + */ + + process methyldackel { + tag "$name" + publishDir "${params.outdir}/MethylDackel", mode: 'copy' + + input: + set val(name), + file(bam), + file(bam_index), + file(fasta), + file(fasta_index) from ch_bam_dedup_for_methyldackel + .join(ch_bam_index_for_methyldackel) + .combine(ch_fasta_for_methyldackel) + .combine(ch_fasta_index_for_methyldackel) + + + output: + file "${bam.baseName}*" into ch_methyldackel_results_for_multiqc + + script: + all_contexts = params.comprehensive ? '--CHG --CHH' : '' + min_depth = params.min_depth > 0 ? "--minDepth ${params.min_depth}" : '' + ignore_flags = params.ignore_flags ? "--ignoreFlags" : '' + methyl_kit = params.methyl_kit ? 
"--methylKit" : '' + """ + MethylDackel extract $all_contexts $ignore_flags $methyl_kit $min_depth $fasta $bam + MethylDackel mbias $all_contexts $ignore_flags $fasta $bam ${bam.baseName} --txt > ${bam.baseName}_methyldackel.txt + """ + } } // end of bwa-meth if block else { - ch_flagstat_results_for_multiqc = Channel.from(false) - ch_samtools_stats_results_for_multiqc = Channel.from(false) - ch_markDups_results_for_multiqc = Channel.from(false) - ch_methyldackel_results_for_multiqc = Channel.from(false) + ch_flagstat_results_for_multiqc = Channel.from(false) + ch_samtools_stats_results_for_multiqc = Channel.from(false) + ch_markDups_results_for_multiqc = Channel.from(false) + ch_methyldackel_results_for_multiqc = Channel.from(false) } - -////////////////////////////////////////////////////// /* * Process with BISCUIT and assorted tools (samblaster) */ if( params.aligner == 'biscuit' ){ - process biscuit_align { - tag "$name" - publishDir "${params.outdir}/biscuit_alignments", mode: 'copy', - saveAs: {filename -> - if( !params.save_align_intermeds && filename == "where_are_my_files.txt" ) filename - else if( params.save_align_intermeds && filename != "where_are_my_files.txt" ) filename - else null - } - - input: - set val(name), file(reads) from ch_trimmed_reads_for_alignment - file bwa_indices from ch_bwa_index_for_biscuit.collect() - file wherearemyfiles from ch_wherearemyfiles_for_biscuit_align.collect() - - output: - set val(name), file('*.bam') into ch_bam_for_markDuplicates, ch_bam_for_samtools_sort_index_flagstat - file "where_are_my_files.txt" - - script: - fasta = bwa_indices[0].toString() - '.bwameth' - '.c2t' - '.amb' - '.ann' - '.bwt' - '.pac' - '.sa' - '.fai' - '.par' - '.dau' -'.bis' - assembly = fasta.replaceAll(/\.\w+/,"") - prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?(\.bz2)?$/ - non_directional = params.non_directional ? 0 : 1 - // Paired-end or single end input files and pbat or not - input = params.pbat ? 
params.single_end ? reads + " -b 3" : "${reads[1]} ${reads[0]}" : reads + process biscuit_align { + tag "$name" + publishDir "${params.outdir}/biscuit_alignments", mode: 'copy', + saveAs: {filename -> + if( !params.save_align_intermeds && filename == "where_are_my_files.txt" ) filename + else if( params.save_align_intermeds && filename != "where_are_my_files.txt" ) filename + else null + } - - - """ - biscuit align -M -b $non_directional -t ${task.cpus} $fasta $input | samtools view -Sb > ${name}.${assembly}.bam - """ - } + input: + set val(name), file(reads) from ch_trimmed_reads_for_alignment + file bwa_indices from ch_bwa_index_for_biscuit.collect() + file wherearemyfiles from ch_wherearemyfiles_for_biscuit_align.collect() -/* -* STEP 4 - Mark duplicates -*/ - if( params.skip_deduplication || params.rrbs ) { - ch_bam_for_markDuplicates.into { ch_bam_dedup_for_qualimap; ch_samblaster_for_samtools_sort_index_flagstat } - ch_markDups_results_for_multiqc = Channel.from(false) - } else { - process markDuplicates_samblaster { + output: + set val(name), file('*.bam') into ch_bam_for_markDuplicates, ch_bam_for_samtools_sort_index_flagstat + file "where_are_my_files.txt" + + script: + fasta = bwa_indices[0].toString() - '.bwameth' - '.c2t' - '.amb' - '.ann' - '.bwt' - '.pac' - '.sa' - '.fai' - '.par' - '.dau' -'.bis' + assembly = fasta.replaceAll(/\.\w+/,"") + prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?(\.bz2)?$/ + non_directional = params.non_directional ? 0 : 1 + // Paired-end or single end input files and pbat or not + input = params.pbat ? params.single_end ? 
reads + " -b 3" : "${reads[1]} ${reads[0]}" : reads + + """ + biscuit align -M -b $non_directional -t ${task.cpus} $fasta $input | samtools view -Sb > ${name}.${assembly}.bam + """ + } + + /* + * STEP 4 - Mark duplicates + */ + if( params.skip_deduplication || params.rrbs ) { + ch_bam_for_markDuplicates.into { ch_bam_dedup_for_qualimap; ch_samblaster_for_samtools_sort_index_flagstat } + ch_markDups_results_for_multiqc = Channel.from(false) + } else { + process markDuplicates_samblaster { tag "$name" publishDir "${params.outdir}", mode: 'copy', - saveAs: {filename -> - if( filename.indexOf("log") > 0 ) "biscuit_markDuplicates/$filename" + saveAs: {filename -> + if( filename.indexOf("log") > 0 ) "biscuit_markDuplicates/$filename" else null - } + } - input: - set val(name), file(bam) from ch_bam_for_markDuplicates - file wherearemyfiles from ch_wherearemyfiles_for_samblaster.collect() + input: + set val(name), file(bam) from ch_bam_for_markDuplicates + file wherearemyfiles from ch_wherearemyfiles_for_samblaster.collect() - output: - set val(name), file("${bam.baseName}.samblaster.bam") into ch_samblaster_for_samtools_sort_index_flagstat - file "*log" into ch_samblaster_for_multiqc - + output: + set val(name), file("${bam.baseName}.samblaster.bam") into ch_samblaster_for_samtools_sort_index_flagstat + file "*log" into ch_samblaster_for_multiqc + script: - def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false - def sort_mem = avail_mem && avail_mem > 2 ? "-m ${avail_mem}G" : '' - unmapped = params.single_end ? 
'--ignoreUnmated' : '' - - """ - samtools sort -n $bam -@ ${task.cpus} $sort_mem| samtools view -h | samblaster -M $unmapped -d "${bam.baseName}_discordant.sam" -s "${bam.baseName}_split.sam" -u "${bam.baseName}_.fastq" --excludeDups --addMateTags | samtools view -Sb > ${bam.baseName}.samblaster.bam - cp .command.log ${bam.baseName}.log - """ - } - } - - /* - * STEP 5.- samtools flagstat on samples - */ - process samtools_sort_index_flagstat_biscuit { - tag "$name" - publishDir "${params.outdir}", mode: 'copy', - saveAs: {filename -> - if(filename.indexOf("report.txt") > 0) "biscuit_alignments/logs/$filename" + def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false + def sort_mem = avail_mem && avail_mem > 2 ? "-m ${avail_mem}G" : '' + unmapped = params.single_end ? '--ignoreUnmated' : '' + + """ + samtools sort -n $bam -@ ${task.cpus} $sort_mem| samtools view -h | samblaster -M $unmapped -d "${bam.baseName}_discordant.sam" -s "${bam.baseName}_split.sam" -u "${bam.baseName}_.fastq" --excludeDups --addMateTags | samtools view -Sb > ${bam.baseName}.samblaster.bam + cp .command.log ${bam.baseName}.log + """ + } + } + + /* + * STEP 5.- samtools flagstat on samples + */ + process samtools_sort_index_flagstat_biscuit { + tag "$name" + publishDir "${params.outdir}", mode: 'copy', + saveAs: {filename -> + if(filename.indexOf("report.txt") > 0) "biscuit_alignments/logs/$filename" else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename.indexOf("sorted.bam") > 0) "biscuit_alignments/$filename" - else if( (!params.save_align_intermeds && !params.rrbs).every() && filename == "where_are_my_files.txt") filename - else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename - else null - } + else if( (!params.save_align_intermeds && !params.rrbs).every() && filename == "where_are_my_files.txt") filename + else if( 
(params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename + else null + } - input: - set val(name), file(bam) from ch_bam_for_samtools_sort_index_flagstat + input: + set val(name), file(bam) from ch_bam_for_samtools_sort_index_flagstat set val(name_samblaster), file(samblaster_bam) from ch_samblaster_for_samtools_sort_index_flagstat - file wherearemyfiles from ch_wherearemyfiles_for_samtools_sort_index_flagstat.collect() + file wherearemyfiles from ch_wherearemyfiles_for_samtools_sort_index_flagstat.collect() - output: + output: set val(name_samblaster), file("*.sorted.bam") into ch_bam_dedup_for_qualimap,ch_bam_for_preseq,ch_bam_sorted_for_pileup, ch_bam_sorted_for_epiread, ch_bam_noDups_for_QC,ch_bam_sorted_for_picard file "*.sorted.bam.bai" into ch_bam_index_sorted_for_pileup,ch_bam_index_for_epiread,ch_bam_index_noDups_for_QC file "${samblaster_bam.baseName}_flagstat_report.txt" into ch_flagstat_results_biscuit_for_multiqc - file "${samblaster_bam.baseName}_stats_report.txt" into ch_samtools_stats_results_biscuit_for_multiqc - file "where_are_my_files.txt" + file "${samblaster_bam.baseName}_stats_report.txt" into ch_samtools_stats_results_biscuit_for_multiqc + file "where_are_my_files.txt" script: @@ -1167,29 +1157,28 @@ if( params.aligner == 'biscuit' ){ samtools flagstat ${samblaster_bam.baseName}.sorted.bam > ${samblaster_bam.baseName}_flagstat_report.txt samtools stats ${samblaster_bam.baseName}.sorted.bam > ${samblaster_bam.baseName}_stats_report.txt - """ - } + """ + } - + /* - * STEP 6 - Create vcf file with pileup, to extract methylation - */ - process createVCF { - tag "$name" - publishDir "${params.outdir}/methylation_extract", mode: 'copy', - saveAs: {filename -> + * STEP 6 - Create vcf file with pileup, to extract methylation + */ + process createVCF { + tag "$name" + publishDir "${params.outdir}/methylation_extract", mode: 'copy', + saveAs: {filename -> if( 
!params.save_pileup_file && filename == "where_are_my_files.txt") filename else if( filename.indexOf("vcf.gz") > 0 && params.save_pileup_file && filename != "where_are_my_files.txt") filename else null - } + } - input: - set val(name), file(bam) from ch_bam_sorted_for_pileup - file bam_index from ch_bam_index_sorted_for_pileup - file fasta from ch_fasta_for_pileup.collect() + input: + set val(name), file(bam), file (bam_index) from ch_bam_sorted_for_pileup.join(ch_bam_index_sorted_for_pileup) + file fasta from ch_fasta_for_pileup.collect() file fasta_index from ch_fasta_index_for_createVCF.collect() - output: + output: set val(name), file("${name}.vcf.gz*") into ch_vcf_biscuit_qc ,ch_vcf_for_bedgraph,ch_vcf_for_epiread script: @@ -1200,120 +1189,195 @@ if( params.aligner == 'biscuit' ){ bgzip -@ ${task.cpus} -f ${name}.vcf tabix -f -p vcf ${name}.vcf.gz """ - } - - + } - /* - * STEP 7 - create bedgraph file from vcf - */ + /* + * STEP 7 - create bedgraph file from vcf + */ process createBedgraph { tag "$name" - publishDir "${params.outdir}/methylation_extract", mode: 'copy' - + publishDir "${params.outdir}/methylation_extract", mode: 'copy' - input: - set val(name), file(vcf) from ch_vcf_for_bedgraph - - output: - set val(name), file("*bedgraph" ) - - script: - min_depth = params.min_depth > 0 ? "${params.min_depth}" : '1' - all_contexts = params.comprehensive ? 'c, cg, ch, hcg, gch' : 'cg' - """ - biscuit vcf2bed -k $min_depth -t $all_contexts "${vcf[0]}" > "${name}.bedgraph" - """ + input: + set val(name), file(vcf) from ch_vcf_for_bedgraph + + output: + set val(name), file("*bedgraph" ) into ch_bedgraph_for_intersect_soloWCGW + + script: + min_depth = params.min_depth > 0 ? "${params.min_depth}" : '1' + all_contexts = params.comprehensive ? 
'c, cg, ch, hcg, gch' : 'cg' + """ + biscuit vcf2bed -k $min_depth -t $all_contexts "${vcf[0]}" > "${name}.bedgraph" + """ } - if (params.epiread) { + if (params.common_dbsnp) { + process reformat_SNP { + + input: + file commonSNP_file from ch_commonSNP_for_SNP.collect() + + output: + file("reformattedSNP.snv.txt.gz*" ) into ch_reformattedSNP_for_SNP + + script: + """ + less $commonSNP_file | $baseDir/bin/processUcscDbsnp.pl | grep snv | bgzip > reformattedSNP.snv.txt.gz + tabix -s 1 -b 2 -e 3 reformattedSNP.snv.txt.gz + """ + } + } + else { + ch_reformattedSNP_for_SNP = Channel.empty() + + } + + process get_SNP_file { + tag "$name" + publishDir "${params.outdir}/epireads/snp", mode: 'copy', + saveAs: {filename -> + if( filename.indexOf("bed") > 0 && params.save_snp_file && filename != "where_are_my_files.txt") filename + else null + } + + input: + set val(name), file(vcf) from ch_vcf_for_epiread + file whitelist_file from ch_whitelist_for_SNP.collect() + file reformatted_SNP from ch_reformattedSNP_for_SNP.collect().ifEmpty([]) + + output: + set val(name), file ("${name}.snp.bed") into ch_snp_for_epiread + file "*gz" + + script: + whitelist = params.whitelist ? "-R $whitelist_file" : '' + snp_file = (reformatted_SNP.size()>0) ? "-a ${reformatted_SNP[0]}" : '' + """ + bcftools annotate $whitelist -O z ${snp_file} -h $baseDir/assets/common_dbsnp.hdr -c CHROM,FROM,TO,TYPE,COMMON_SOME,COMMON_ALL,REF_MIN,ALT_MIN,REF_DBSNP,ALT_DBSNP,REF_ALL,ALT_ALL,RSID,MAX_MAF "${vcf[0]}" > "${name}-whitelist-dbSNP.vcf.gz" + tabix -p vcf "${name}-whitelist-dbSNP.vcf.gz" + bcftools view -O z -i'ALT!="N" & ALT!="." 
& ( (COUNT(GT=="0/1")>=1 & COMMON_ALL==1 & MAX_MAF>=0.05) | (COUNT(GT=="0/1" & GQ>=60)>=1) )' "${name}-whitelist-dbSNP.vcf.gz" > "${name}-whitelist-dbSNP-HET60.vcf.gz" + tabix -p vcf "${name}-whitelist-dbSNP-HET60.vcf.gz" + bcftools query -u -i'GT="0/1" & GQ>=10' --format '%CHROM\t%POS\t%POS\t%REF\t%ALT[\t%GT\t%GQ\t%SP\t%AC\t%AF1]\t%RSID\t%COMMON_ALL\t%MAX_MAF\t%REF_MIN\t%ALT_MIN\n' "${name}-whitelist-dbSNP-HET60.vcf.gz" | awk -v OFS="\t" '{\$2 = \$2 - 1; print}' > "${name}.snp.bed" + """ + } + process epiread_convertion { tag "$name" publishDir "${params.outdir}/epireads", mode: 'copy' - input: - set val(name), file(bam) from ch_bam_sorted_for_epiread - file bam_index from ch_bam_index_for_epiread - file fasta from ch_fasta_for_epiread.collect() - file fasta_index from ch_fasta_index_for_epiread.collect() + input: + set val(name), + file(bam), + file(bam_index), + file (snp), + file(fasta), + file(fasta_index), + file(whitelist) from ch_bam_sorted_for_epiread + .join(ch_bam_index_for_epiread) + .join(ch_snp_for_epiread) + .combine(ch_fasta_for_epiread) + .combine(ch_fasta_index_for_epiread) + .combine(ch_whitelist_for_epiread) + set val(cpg), file ("assets/cpg.bed.gz") from ch_assets_dir_with_cpg_for_epiread.collect() + + + output: + file "*${name}.e*.gz*" + file "${name}.original.epiread.*" optional true - output: - file "*epiread" - - script: - if (params.single_end) { - """ - biscuit epiread -q ${task.cpus} $fasta $bam -o ${name}.epiread - """ - } else { + script: + snp_file = (snp.size()>0) ? "-B " + snp.toString() : '' + debug_merging_epiread = (params.debug_epiread_merging || params.debug_epiread) ? "debug" : '' + no_filter_reverse = params.rrbs ? "-p" : '' + if (params.single_end) { """ - biscuit epiread -q ${task.cpus} $fasta $bam | sort --parallel=${task.cpus} -T . 
-k2,2 -k3,3n | awk 'BEGIN{qname="";rec=""} qname==\$2{print rec"\t"\$5"\t"\$6"\t"\$7"\t"\$8;qname=""} qname!=\$2{qname=\$2;rec=\$1"\t"\$4"\t"\$5"\t"\$6"\t"\$7"\t"\$8;pair=\$3}' > ${name}.epiread + bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam + samtools index ${name}.bam + biscuit epiread -q ${task.cpus} $snp_file $no_filter_reverse $fasta ${name}.bam |sort --parallel=${task.cpus} -T . -k1,1Vf -k5,5n | bgzip > ${name}.epiread.gz + tabix -0 -s 1 -b 5 -e 5 ${name}.epiread.gz """ - } + } else if (params.debug_epiread) { + """ + bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam + samtools index ${name}.bam + biscuit epiread -q ${task.cpus} $snp_file $fasta ${name}.bam | sort --parallel=${task.cpus} -T . -k2,2 -k1,1 -k4,4 -k3,3n > ${name}.original.epiread + less ${name}.original.epiread | $baseDir/bin/epiread_pairedEnd_convertion $cpg $snp ${name}.epiread $debug_merging_epiread > ${name}.err + sort -k1,1Vf -k 2,2n -k 3,3n --parallel=${task.cpus} -T . ${name}.epiread | bgzip > ${name}.epiread.gz + sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . ${name}.err | bgzip > ${name}.err.gz + sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . ${name}.original.epiread | bgzip > ${name}.original.epiread.gz + tabix -0 -s 1 -b 5 -e 5 ${name}.original.epiread.gz + tabix -0 -p bed ${name}.epiread.gz + tabix -0 -s 1 -b 5 -e 5 ${name}.err.gz + """ + } + else { + """ + bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam + samtools index ${name}.bam + biscuit epiread -q ${task.cpus} $snp_file $fasta ${name}.bam | sort --parallel=${task.cpus} -T . -k2,2 -k1,1 -k4,4 -k3,3n | $baseDir/bin/epiread_pairedEnd_convertion $cpg $snp ${name}.epiread $debug_merging_epiread > ${name}.err + sort -k1,1Vf -k 2,2n -k 3,3n --parallel=${task.cpus} -T . ${name}.epiread | bgzip > ${name}.epiread.gz + sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . 
${name}.err | bgzip > ${name}.err.gz + tabix -0 -p bed ${name}.epiread.gz + tabix -0 -s 1 -b 5 -e 5 ${name}.err.gz + """ + } } } - - process biscuit_QC { - tag "$bam_name" + process biscuit_QC { + tag "$name" publishDir "${params.outdir}/biscuit_QC", mode: 'copy' input: - set val(name), file(vcf) from ch_vcf_biscuit_qc - set val(bam_name), file(bam) from ch_bam_noDups_for_QC - file fasta from ch_fasta_for_biscuitQC.collect() - file fasta_index from ch_fasta_index_for_biscuitQC.collect() - file assets from ch_assets_dir_for_biscuit_qc.collect() + set val(name), + file(vcf), + file(bam), + file(fasta), + file(fasta_index), + file(assets) from ch_vcf_biscuit_qc + .join(ch_bam_noDups_for_QC) + .combine(ch_fasta_for_biscuitQC) + .combine(ch_fasta_index_for_biscuitQC) + .combine(ch_assets_dir_for_biscuit_qc) + output: file "*_biscuitQC" into ch_QC_results_for_multiqc script: assembly = fasta.toString().replaceAll(/\.\w+/,"") """ - $baseDir/bin/biscuit_QC.sh -v ${vcf[0]} -o ${bam_name}.${assembly}_biscuitQC $assets $fasta ${bam_name}.${assembly} ${bam} -p ${task.cpus} + biscuit_QC.sh -v ${vcf[0]} -o ${name}.${assembly}_biscuitQC $assets $fasta ${name}.${assembly} ${bam} """ - } + } } // end of biscuit if block else { - ch_flagstat_results_biscuit_for_multiqc = Channel.from(false) - ch_samtools_stats_results_biscuit_for_multiqc = Channel.from(false) - ch_markDups_results_for_multiqc = Channel.from(false) - ch_methyldackel_results_for_multiqc = Channel.from(false) - ch_QC_results_for_multiqc = Channel.from(false) - ch_samblaster_for_multiqc = Channel.from(false) - ch_vcf_biscuit_qc = Channel.from(false) - ch_assets_dir_for_biscuit_qc = Channel.from(false) - ch_bam_sorted_for_pileup = Channel.from(false) - ch_bam_index_sorted_for_pileup = Channel.from(false) - ch_bam_noDups_for_QC = Channel.from(false) - ch_bam_sorted_for_epiread = Channel.from(false) - ch_bam_index_for_epiread = Channel.from(false) - } - -//////////////////////////////////////////////////////// - - - + 
ch_flagstat_results_biscuit_for_multiqc = Channel.from(false) + ch_samtools_stats_results_biscuit_for_multiqc = Channel.from(false) + ch_markDups_results_for_multiqc = Channel.from(false) + ch_QC_results_for_multiqc = Channel.from(false) + ch_samblaster_for_multiqc = Channel.from(false) +} /* * STEP 8 - Qualimap */ process qualimap { - tag "$name" - publishDir "${params.outdir}/qualimap", mode: 'copy' - + tag "$name" + publishDir "${params.outdir}/qualimap", mode: 'copy' + input: - set val(name), file(bam) from ch_bam_dedup_for_qualimap + set val(name), file(bam) from ch_bam_dedup_for_qualimap - output: - file "${bam.baseName}_qualimap" into ch_qualimap_results_for_multiqc + output: + file "${bam.baseName}_qualimap" into ch_qualimap_results_for_multiqc - script: - gcref = params.genome.toString().startsWith('GRCh') ? '-gd HUMAN' : '' - gcref = params.genome.toString().startsWith('GRCm') ? '-gd MOUSE' : '' + script: + gcref = params.genome.toString().startsWith('GRCh') ? '-gd HUMAN' : '' + gcref = params.genome.toString().startsWith('GRCm') ? '-gd MOUSE' : '' """ qualimap bamqc $gcref \\ -bam ${bam.baseName}.bam \\ @@ -1329,180 +1393,175 @@ process qualimap { /* * STEP 9 - Picard - Preparation step */ - process prepareGenomeToPicard { +process prepareGenomeToPicard { publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir }, - saveAs: { (params.save_reference && it.indexOf("dict") >0) ? it : null }, mode: 'copy' - - input: - file fasta from ch_fasta_for_picard - output: - file "${fasta.baseName}.picard.fa" into ch_fasta_picard_for_picard - file "${fasta.baseName}.picard.dict" into ch_fasta_picard_dict_for_picard - - - script: - if( !task.memory ){ - log.info "[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this." 
- avail_mem = 3 - } else { - avail_mem = task.memory.toGiga() - } - """ - mv ${fasta} ${fasta.baseName}.picard.fa - picard -Xmx${avail_mem}g CreateSequenceDictionary \\ - R=${fasta.baseName}.picard.fa \\ - O=${fasta.baseName}.picard.dict - """ - } + saveAs: { (params.save_reference && it.indexOf("dict") >0) ? it : null }, mode: 'copy' + + input: + file fasta from ch_fasta_for_picard + output: + file "${fasta.baseName}.picard.fa" into ch_fasta_picard_for_picard + file "${fasta.baseName}.picard.dict" into ch_fasta_picard_dict_for_picard + + script: + if( !task.memory ){ + log.info "[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this." + avail_mem = 3 + } else { + avail_mem = task.memory.toGiga() + } + """ + mv ${fasta} ${fasta.baseName}.picard.fa + picard -Xmx${avail_mem}g CreateSequenceDictionary \\ + R=${fasta.baseName}.picard.fa \\ + O=${fasta.baseName}.picard.dict + """ +} /* * STEP 10 - Picard InsertSizeMetrics and GcBiasMetrics */ - process picardMetrics { - tag "$name" - publishDir "${params.outdir}/picardMetrics", mode: 'copy', - saveAs: { filename -> - if (filename.indexOf(".txt") > 0) filename - else if (filename.indexOf(".pdf") > 0) "pdf/$filename" - else null - } - input: - set val(name), file(bam) from ch_bam_sorted_for_picard - file fasta from ch_fasta_picard_for_picard.collect() - file dict from ch_fasta_picard_dict_for_picard.collect() - - output: - file "${name}.*.pdf" - file "${name}.*.txt" into ch_picard_results_for_multiqc - - script: - if( !task.memory ){ - log.info "[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this." 
- avail_mem = 3 - } else { - avail_mem = task.memory.toGiga() - } - """ - picard -Xmx${avail_mem}g CollectInsertSizeMetrics \\ - INPUT=$bam \\ - OUTPUT=${name}.insert_size_metrics.txt \\ - HISTOGRAM_FILE=${name}.insert_size_histogram.pdf \\ - ASSUME_SORTED=true \\ - VALIDATION_STRINGENCY=LENIENT - set +e - - - picard -Xmx${avail_mem}g CollectGcBiasMetrics \\ - INPUT=$bam \\ - OUTPUT=${name}.gc_bias_metrics.txt \\ - CHART=${name}.gc_bias_metrics.pdf \\ - SUMMARY_OUTPUT=${name}.summary_metrics.txt \\ - ASSUME_SORTED=true \\ - IS_BISULFITE_SEQUENCED=true \\ - REFERENCE_SEQUENCE=$fasta \\ - VALIDATION_STRINGENCY=LENIENT - - [ ! "\$?" -eq "0" ] && picard -Xmx${avail_mem}g ReorderSam I=$bam O=${bam.baseName}.picard.bam SEQUENCE_DICTIONARY=$fasta VALIDATION_STRINGENCY=LENIENT TMP_DIR=. && picard -Xmx${avail_mem}g CollectGcBiasMetrics \\ - INPUT=${bam.baseName}.picard.bam \\ - OUTPUT=${name}.gc_bias_metrics.txt \\ - CHART=${name}.gc_bias_metrics.pdf \\ - SUMMARY_OUTPUT=${name}.summary_metrics.txt \\ - ASSUME_SORTED=true \\ - IS_BISULFITE_SEQUENCED=true \\ - REFERENCE_SEQUENCE=$fasta \\ - VALIDATION_STRINGENCY=LENIENT - echo "fine" - """ - - } - +process picardMetrics { + tag "$name" + publishDir "${params.outdir}/picardMetrics", mode: 'copy', + saveAs: { filename -> + if (filename.indexOf(".txt") > 0) filename + else if (filename.indexOf(".pdf") > 0) "pdf/$filename" + else null + } + input: + set val(name), file(bam) from ch_bam_sorted_for_picard + file fasta from ch_fasta_picard_for_picard.collect() + file dict from ch_fasta_picard_dict_for_picard.collect() + output: + file "${name}.*.pdf" + file "${name}.*.txt" into ch_picard_results_for_multiqc + + script: + if( !task.memory ){ + log.info "[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this." 
+ avail_mem = 3 + } else { + avail_mem = task.memory.toGiga() + } + """ + picard -Xmx${avail_mem}g CollectInsertSizeMetrics \\ + INPUT=$bam \\ + OUTPUT=${name}.insert_size_metrics.txt \\ + HISTOGRAM_FILE=${name}.insert_size_histogram.pdf \\ + ASSUME_SORTED=true \\ + VALIDATION_STRINGENCY=LENIENT + set +e + + picard -Xmx${avail_mem}g CollectGcBiasMetrics \\ + INPUT=$bam \\ + OUTPUT=${name}.gc_bias_metrics.txt \\ + CHART=${name}.gc_bias_metrics.pdf \\ + SUMMARY_OUTPUT=${name}.summary_metrics.txt \\ + ASSUME_SORTED=true \\ + IS_BISULFITE_SEQUENCED=true \\ + REFERENCE_SEQUENCE=$fasta \\ + VALIDATION_STRINGENCY=LENIENT + + [ ! "\$?" -eq "0" ] && picard -Xmx${avail_mem}g ReorderSam I=$bam O=${bam.baseName}.picard.bam SEQUENCE_DICTIONARY=$fasta VALIDATION_STRINGENCY=LENIENT TMP_DIR=. && picard -Xmx${avail_mem}g CollectGcBiasMetrics \\ + INPUT=${bam.baseName}.picard.bam \\ + OUTPUT=${name}.gc_bias_metrics.txt \\ + CHART=${name}.gc_bias_metrics.pdf \\ + SUMMARY_OUTPUT=${name}.summary_metrics.txt \\ + ASSUME_SORTED=true \\ + IS_BISULFITE_SEQUENCED=true \\ + REFERENCE_SEQUENCE=$fasta \\ + VALIDATION_STRINGENCY=LENIENT + echo "fine" + """ +} /* * STEP 11 - preseq */ process preseq { - tag "$name" - publishDir "${params.outdir}/preseq", mode: 'copy' + tag "$name" + publishDir "${params.outdir}/preseq", mode: 'copy' - input: - set val(name), file(bam) from ch_bam_for_preseq + input: + set val(name), file(bam) from ch_bam_for_preseq - output: - file "${bam.baseName}.ccurve.txt" into preseq_results + output: + file "${bam.baseName}.ccurve.txt" into preseq_results - script: + script: """ preseq lc_extrap -v -B ${bam.baseName}.bam -o ${bam.baseName}.ccurve.txt """ } -/* +/* * STEP 12 - MultiQC */ process multiqc { - publishDir "${params.outdir}/MultiQC", mode: 'copy' - - input: - file (multiqc_config) from ch_multiqc_config - file (mqc_custom_config) from ch_multiqc_custom_config.collect().ifEmpty([]) - file ('fastqc/*') from ch_fastqc_results_for_multiqc.collect().ifEmpty([]) - 
file ('trimgalore/*') from ch_trim_galore_results_for_multiqc.collect().ifEmpty([]) - file ('bismark/*') from ch_bismark_align_log_for_multiqc.collect().ifEmpty([]) - file ('bismark/*') from ch_bismark_dedup_log_for_multiqc.collect().ifEmpty([]) - file ('bismark/*') from ch_bismark_splitting_report_for_multiqc.collect().ifEmpty([]) - file ('bismark/*') from ch_bismark_mbias_for_multiqc.collect().ifEmpty([]) - file ('bismark/*') from ch_bismark_reports_results_for_multiqc.collect().ifEmpty([]) - file ('bismark/*') from ch_bismark_summary_results_for_multiqc.collect().ifEmpty([]) - file ('samtools/*') from ch_flagstat_results_for_multiqc.flatten().collect().ifEmpty([]) - file ('samtools/*') from ch_samtools_stats_results_for_multiqc.flatten().collect().ifEmpty([]) - file ('samtools/*') from ch_flagstat_results_biscuit_for_multiqc.flatten().collect().ifEmpty([]) - file ('samtools/*') from ch_samtools_stats_results_biscuit_for_multiqc.flatten().collect().ifEmpty([]) - file ('bwa-mem_markDuplicates/*') from ch_markDups_results_for_multiqc.flatten().collect().ifEmpty([]) - file ('methyldackel/*') from ch_methyldackel_results_for_multiqc.flatten().collect().ifEmpty([]) - file ('qualimap/*') from ch_qualimap_results_for_multiqc.collect().ifEmpty([]) - file ('preseq/*') from preseq_results.collect().ifEmpty([]) - file ('biscuit_QC/*') from ch_QC_results_for_multiqc.collect().ifEmpty([]) - file ('biscuit_markDuplicates/*') from ch_samblaster_for_multiqc.collect().ifEmpty([]) - file ('picardMetrics/*') from ch_picard_results_for_multiqc.collect().ifEmpty([]) - - file ('software_versions/*') from ch_software_versions_yaml_for_multiqc.collect() - file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") - - output: - file "*multiqc_report.html" into ch_multiqc_report - file "*_data" - file "multiqc_plots" - - script: - rtitle = custom_runName ? "--title \"$custom_runName\"" : '' - rfilename = custom_runName ? 
"--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' - custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' - """ - multiqc -f $rtitle $rfilename $custom_config_file . \\ - -m custom_content -m picard -m qualimap -m bismark -m samtools -m preseq -m cutadapt -m fastqc -m biscuit -m samblaster - """ + publishDir "${params.outdir}/MultiQC", mode: 'copy' + + input: + file (multiqc_config) from ch_multiqc_config + file (mqc_custom_config) from ch_multiqc_custom_config.collect().ifEmpty([]) + file ('fastqc/*') from ch_fastqc_results_for_multiqc.collect().ifEmpty([]) + file ('trimgalore/*') from ch_trim_galore_results_for_multiqc.collect().ifEmpty([]) + file ('bismark/*') from ch_bismark_align_log_for_multiqc.collect().ifEmpty([]) + file ('bismark/*') from ch_bismark_dedup_log_for_multiqc.collect().ifEmpty([]) + file ('bismark/*') from ch_bismark_splitting_report_for_multiqc.collect().ifEmpty([]) + file ('bismark/*') from ch_bismark_mbias_for_multiqc.collect().ifEmpty([]) + file ('bismark/*') from ch_bismark_reports_results_for_multiqc.collect().ifEmpty([]) + file ('bismark/*') from ch_bismark_summary_results_for_multiqc.collect().ifEmpty([]) + file ('samtools/*') from ch_flagstat_results_for_multiqc.flatten().collect().ifEmpty([]) + file ('samtools/*') from ch_samtools_stats_results_for_multiqc.flatten().collect().ifEmpty([]) + file ('samtools/*') from ch_flagstat_results_biscuit_for_multiqc.flatten().collect().ifEmpty([]) + file ('samtools/*') from ch_samtools_stats_results_biscuit_for_multiqc.flatten().collect().ifEmpty([]) + file ('bwa-mem_markDuplicates/*') from ch_markDups_results_for_multiqc.flatten().collect().ifEmpty([]) + file ('methyldackel/*') from ch_methyldackel_results_for_multiqc.flatten().collect().ifEmpty([]) + file ('qualimap/*') from ch_qualimap_results_for_multiqc.collect().ifEmpty([]) + file ('preseq/*') from preseq_results.collect().ifEmpty([]) + file ('biscuit_QC/*') from 
ch_QC_results_for_multiqc.collect().ifEmpty([]) + file ('biscuit_markDuplicates/*') from ch_samblaster_for_multiqc.collect().ifEmpty([]) + file ('picardMetrics/*') from ch_picard_results_for_multiqc.collect().ifEmpty([]) + + file ('software_versions/*') from ch_software_versions_yaml_for_multiqc.collect() + file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") + + output: + file "*multiqc_report.html" into ch_multiqc_report + file "*_data" + file "multiqc_plots" + + script: + rtitle = custom_runName ? "--title \"$custom_runName\"" : '' + rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' + custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' + """ + multiqc -f $rtitle $rfilename $custom_config_file . \\ + -m custom_content -m picard -m qualimap -m bismark -m samtools -m preseq -m cutadapt -m fastqc -m biscuit -m samblaster + """ } /* * STEP 13 - Output Description HTML */ process output_documentation { - publishDir "${params.outdir}/pipeline_info", mode: 'copy' + publishDir "${params.outdir}/pipeline_info", mode: 'copy' - input: - file output_docs from ch_output_docs + input: + file output_docs from ch_output_docs - output: - file "results_description.html" + output: + file "results_description.html" - script: - """ - markdown_to_html.py $output_docs -o results_description.html - """ + script: + """ + markdown_to_html.py $output_docs -o results_description.html + """ } /* @@ -1510,154 +1569,154 @@ process output_documentation { */ workflow.onComplete { - // Set up the e-mail variables - def subject = "[nf-core/methylseq] Successful: $workflow.runName" - if (!workflow.success) { - subject = "[nf-core/methylseq] FAILED: $workflow.runName" - } - def email_fields = [:] - email_fields['version'] = workflow.manifest.version - email_fields['runName'] = custom_runName ?: workflow.runName - email_fields['success'] = workflow.success 
- email_fields['dateComplete'] = workflow.complete - email_fields['duration'] = workflow.duration - email_fields['exitStatus'] = workflow.exitStatus - email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') - email_fields['errorReport'] = (workflow.errorReport ?: 'None') - email_fields['commandLine'] = workflow.commandLine - email_fields['projectDir'] = workflow.projectDir - email_fields['summary'] = summary - email_fields['summary']['Date Started'] = workflow.start - email_fields['summary']['Date Completed'] = workflow.complete - email_fields['summary']['Pipeline script file path'] = workflow.scriptFile - email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId - if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository - if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId - if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision - email_fields['summary']['Nextflow Version'] = workflow.nextflow.version - email_fields['summary']['Nextflow Build'] = workflow.nextflow.build - email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp - - // On success try attach the multiqc report - def mqc_report = null - try { - if (workflow.success) { - mqc_report = ch_multiqc_report.getVal() - if (mqc_report.getClass() == ArrayList) { - log.warn "[nf-core/methylseq] Found multiple reports from process 'multiqc', will use only one" - mqc_report = mqc_report[0] - } - } - } catch (all) { - log.warn "[nfcore/methylseq] Could not attach MultiQC report to summary email" - } - - // Check if we are only sending emails on failure - email_address = params.email - if (!params.email && params.email_on_fail && !workflow.success) { - email_address = params.email_on_fail - } - - // Render the TXT template - def engine = new groovy.text.GStringTemplateEngine() - def tf = new File("$baseDir/assets/email_template.txt") - 
def txt_template = engine.createTemplate(tf).make(email_fields) - def email_txt = txt_template.toString() - - // Render the HTML template - def hf = new File("$baseDir/assets/email_template.html") - def html_template = engine.createTemplate(hf).make(email_fields) - def email_html = html_template.toString() - - // Render the sendmail template - def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] - def sf = new File("$baseDir/assets/sendmail_template.txt") - def sendmail_template = engine.createTemplate(sf).make(smail_fields) - def sendmail_html = sendmail_template.toString() - - // Send the HTML e-mail - if (email_address) { - try { - if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } - // Try to send HTML e-mail using sendmail - [ 'sendmail', '-t' ].execute() << sendmail_html - log.info "[nf-core/methylseq] Sent summary e-mail to $email_address (sendmail)" - } catch (all) { - // Catch failures and try with plaintext - [ 'mail', '-s', subject, email_address ].execute() << email_txt - log.info "[nf-core/methylseq] Sent summary e-mail to $email_address (mail)" - } - } - - // Write summary e-mail HTML to a file - def output_d = new File("${params.outdir}/pipeline_info/") - if (!output_d.exists()) { - output_d.mkdirs() - } - def output_hf = new File(output_d, "pipeline_report.html") - output_hf.withWriter { w -> w << email_html } - def output_tf = new File(output_d, "pipeline_report.txt") - output_tf.withWriter { w -> w << email_txt } - - c_green = params.monochrome_logs ? '' : "\033[0;32m"; - c_purple = params.monochrome_logs ? '' : "\033[0;35m"; - c_red = params.monochrome_logs ? '' : "\033[0;31m"; - c_reset = params.monochrome_logs ? 
'' : "\033[0m"; - - if (workflow.stats.ignoredCount > 0 && workflow.success) { - log.info "-${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}-" - log.info "-${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}-" - log.info "-${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}-" - } - - if (workflow.success) { - log.info "-${c_purple}[nf-core/methylseq]${c_green} Pipeline completed successfully${c_reset}-" - } else { - checkHostname() - log.info "-${c_purple}[nf-core/methylseq]${c_red} Pipeline completed with errors${c_reset}-" - } + // Set up the e-mail variables + def subject = "[nf-core/methylseq] Successful: $workflow.runName" + if (!workflow.success) { + subject = "[nf-core/methylseq] FAILED: $workflow.runName" + } + def email_fields = [:] + email_fields['version'] = workflow.manifest.version + email_fields['runName'] = custom_runName ?: workflow.runName + email_fields['success'] = workflow.success + email_fields['dateComplete'] = workflow.complete + email_fields['duration'] = workflow.duration + email_fields['exitStatus'] = workflow.exitStatus + email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') + email_fields['errorReport'] = (workflow.errorReport ?: 'None') + email_fields['commandLine'] = workflow.commandLine + email_fields['projectDir'] = workflow.projectDir + email_fields['summary'] = summary + email_fields['summary']['Date Started'] = workflow.start + email_fields['summary']['Date Completed'] = workflow.complete + email_fields['summary']['Pipeline script file path'] = workflow.scriptFile + email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId + if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository + if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId + if (workflow.revision) email_fields['summary']['Pipeline Git 
branch/tag'] = workflow.revision + email_fields['summary']['Nextflow Version'] = workflow.nextflow.version + email_fields['summary']['Nextflow Build'] = workflow.nextflow.build + email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp + + // On success try attach the multiqc report + def mqc_report = null + try { + if (workflow.success) { + mqc_report = ch_multiqc_report.getVal() + if (mqc_report.getClass() == ArrayList) { + log.warn "[nf-core/methylseq] Found multiple reports from process 'multiqc', will use only one" + mqc_report = mqc_report[0] + } + } + } catch (all) { + log.warn "[nfcore/methylseq] Could not attach MultiQC report to summary email" + } + + // Check if we are only sending emails on failure + email_address = params.email + if (!params.email && params.email_on_fail && !workflow.success) { + email_address = params.email_on_fail + } + + // Render the TXT template + def engine = new groovy.text.GStringTemplateEngine() + def tf = new File("$baseDir/assets/email_template.txt") + def txt_template = engine.createTemplate(tf).make(email_fields) + def email_txt = txt_template.toString() + + // Render the HTML template + def hf = new File("$baseDir/assets/email_template.html") + def html_template = engine.createTemplate(hf).make(email_fields) + def email_html = html_template.toString() + + // Render the sendmail template + def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] + def sf = new File("$baseDir/assets/sendmail_template.txt") + def sendmail_template = engine.createTemplate(sf).make(smail_fields) + def sendmail_html = sendmail_template.toString() + + // Send the HTML e-mail + if (email_address) { + try { + if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } + // Try to send HTML e-mail using sendmail + [ 'sendmail', '-t' ].execute() << 
sendmail_html + log.info "[nf-core/methylseq] Sent summary e-mail to $email_address (sendmail)" + } catch (all) { + // Catch failures and try with plaintext + [ 'mail', '-s', subject, email_address ].execute() << email_txt + log.info "[nf-core/methylseq] Sent summary e-mail to $email_address (mail)" + } + } + + // Write summary e-mail HTML to a file + def output_d = new File("${params.outdir}/pipeline_info/") + if (!output_d.exists()) { + output_d.mkdirs() + } + def output_hf = new File(output_d, "pipeline_report.html") + output_hf.withWriter { w -> w << email_html } + def output_tf = new File(output_d, "pipeline_report.txt") + output_tf.withWriter { w -> w << email_txt } + + c_green = params.monochrome_logs ? '' : "\033[0;32m"; + c_purple = params.monochrome_logs ? '' : "\033[0;35m"; + c_red = params.monochrome_logs ? '' : "\033[0;31m"; + c_reset = params.monochrome_logs ? '' : "\033[0m"; + + if (workflow.stats.ignoredCount > 0 && workflow.success) { + log.info "-${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}-" + log.info "-${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}-" + log.info "-${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}-" + } + + if (workflow.success) { + log.info "-${c_purple}[nf-core/methylseq]${c_green} Pipeline completed successfully${c_reset}-" + } else { + checkHostname() + log.info "-${c_purple}[nf-core/methylseq]${c_red} Pipeline completed with errors${c_reset}-" + } } def nfcoreHeader() { - // Log colors ANSI codes - c_black = params.monochrome_logs ? '' : "\033[0;30m"; - c_blue = params.monochrome_logs ? '' : "\033[0;34m"; - c_cyan = params.monochrome_logs ? '' : "\033[0;36m"; - c_dim = params.monochrome_logs ? '' : "\033[2m"; - c_green = params.monochrome_logs ? '' : "\033[0;32m"; - c_purple = params.monochrome_logs ? '' : "\033[0;35m"; - c_reset = params.monochrome_logs ? 
'' : "\033[0m"; - c_white = params.monochrome_logs ? '' : "\033[0;37m"; - c_yellow = params.monochrome_logs ? '' : "\033[0;33m"; - - return """ -${c_dim}--------------------------------------------------${c_reset}- - ${c_green},--.${c_black}/${c_green},-.${c_reset} - ${c_blue} ___ __ __ __ ___ ${c_green}/,-._.--~\'${c_reset} - ${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset} - ${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset} - ${c_green}`._,._,\'${c_reset} - ${c_purple} nf-core/methylseq v${workflow.manifest.version}${c_reset} - -${c_dim}--------------------------------------------------${c_reset}- - """.stripIndent() + // Log colors ANSI codes + c_black = params.monochrome_logs ? '' : "\033[0;30m"; + c_blue = params.monochrome_logs ? '' : "\033[0;34m"; + c_cyan = params.monochrome_logs ? '' : "\033[0;36m"; + c_dim = params.monochrome_logs ? '' : "\033[2m"; + c_green = params.monochrome_logs ? '' : "\033[0;32m"; + c_purple = params.monochrome_logs ? '' : "\033[0;35m"; + c_reset = params.monochrome_logs ? '' : "\033[0m"; + c_white = params.monochrome_logs ? '' : "\033[0;37m"; + c_yellow = params.monochrome_logs ? '' : "\033[0;33m"; + + return """ -${c_dim}--------------------------------------------------${c_reset}- + ${c_green},--.${c_black}/${c_green},-.${c_reset} + ${c_blue} ___ __ __ __ ___ ${c_green}/,-._.--~\'${c_reset} + ${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset} + ${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset} + ${c_green}`._,._,\'${c_reset} + ${c_purple} nf-core/methylseq v${workflow.manifest.version}${c_reset} + -${c_dim}--------------------------------------------------${c_reset}- + """.stripIndent() } def checkHostname() { - def c_reset = params.monochrome_logs ? '' : "\033[0m" - def c_white = params.monochrome_logs ? '' : "\033[0;37m" - def c_red = params.monochrome_logs ? '' : "\033[1;91m" - def c_yellow_bold = params.monochrome_logs ? 
'' : "\033[1;93m" - if (params.hostnames) { - def hostname = "hostname".execute().text.trim() - params.hostnames.each { prof, hnames -> - hnames.each { hname -> - if (hostname.contains(hname) && !workflow.profile.contains(prof)) { - log.error "====================================================\n" + - " ${c_red}WARNING!${c_reset} You are running with `-profile $workflow.profile`\n" + - " but your machine hostname is ${c_white}'$hostname'${c_reset}\n" + - " ${c_yellow_bold}It's highly recommended that you use `-profile $prof${c_reset}`\n" + - "============================================================" - } - } - } - } + def c_reset = params.monochrome_logs ? '' : "\033[0m" + def c_white = params.monochrome_logs ? '' : "\033[0;37m" + def c_red = params.monochrome_logs ? '' : "\033[1;91m" + def c_yellow_bold = params.monochrome_logs ? '' : "\033[1;93m" + if (params.hostnames) { + def hostname = "hostname".execute().text.trim() + params.hostnames.each { prof, hnames -> + hnames.each { hname -> + if (hostname.contains(hname) && !workflow.profile.contains(prof)) { + log.error "====================================================\n" + + " ${c_red}WARNING!${c_reset} You are running with `-profile $workflow.profile`\n" + + " but your machine hostname is ${c_white}'$hostname'${c_reset}\n" + + " ${c_yellow_bold}It's highly recommended that you use `-profile $prof${c_reset}`\n" + + "============================================================" + } + } + } + } } diff --git a/nextflow.config b/nextflow.config index 6e34d2aa..2f98feeb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -50,6 +50,15 @@ params { bismark_align_cpu_per_multicore = null bismark_align_mem_per_multicore = null + bwa_biscuit_index = false + whitelist = false + common_dbsnp = false + save_pileup_file = false + save_snp_file = false + epiread = false + cpg_file = false + debug_epiread = false + // Boilerplate options name = false multiqc_config = false From a76e7d560fd7c7d66b6757494ce1aa5d6054037b Mon 
Sep 17 00:00:00 2001 From: ekushele Date: Mon, 8 Feb 2021 13:04:04 +0200 Subject: [PATCH 24/56] fix some problems with main.nf, get cpg from assets dir, add common_dbsnp.hdr to assets dir --- assets/common_dbsnp.hdr | 11 ++++++++ cpg.bed | 0 gc_content.bed | 0 main.nf | 58 ++++++++++++++++------------------------- 4 files changed, 34 insertions(+), 35 deletions(-) create mode 100755 assets/common_dbsnp.hdr delete mode 100644 cpg.bed delete mode 100644 gc_content.bed diff --git a/assets/common_dbsnp.hdr b/assets/common_dbsnp.hdr new file mode 100755 index 00000000..96af7747 --- /dev/null +++ b/assets/common_dbsnp.hdr @@ -0,0 +1,11 @@ +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= diff --git a/cpg.bed b/cpg.bed deleted file mode 100644 index e69de29b..00000000 diff --git a/gc_content.bed b/gc_content.bed deleted file mode 100644 index e69de29b..00000000 diff --git a/main.nf b/main.nf index 3c1e1718..59278f39 100644 --- a/main.nf +++ b/main.nf @@ -53,7 +53,6 @@ def helpMessage() { --epiread [bool] Convert bam to biscuit epiread format --whitelist [file] The complement of blacklist, needed for SNP extraction For more instuctions: https://www.cse.huji.ac.il/~ekushele/assets.html#whitelist --common_dbsnp [file] Common dbSNP for the relevant genome, for SNP filteration - --cpg_file [file] Path to CpG file for the relevant genome (0-besed coordinates, not compressed) --debug_epiread Debug epiread merging for paired end-keep original epiread file and merged epiread file in debug mode --debug_epiread_merging Debug epiread merging. 
Output merged epiread in debug mode @@ -184,12 +183,6 @@ if( params.aligner == 'biscuit' && params.assets_dir ) { .ifEmpty { exit 1, "Assets directory for biscuit QC not found: ${params.assets_dir}" } .into { ch_assets_dir_for_biscuit_qc; ch_assets_dir_with_cpg_for_epiread } ch_fasta_for_buildBiscuitQCAssets.close() - - // Channel - // .fromPath("${params.assets_dir}/cpg.bed.gz", checkIfExists: true) - // .ifEmpty { exit 1, "CpG file not found : ${params.cpg_file}" } - // .set { ch_cpg_for_epiread; ch_cpg_file_for_cpg_index; } - // } } if( workflow.profile == 'uppmax' || workflow.profile == 'uppmax_devel' ){ @@ -292,20 +285,6 @@ if (params.epiread) { .ifEmpty { exit 1, "Cannot find any dbSNP file matching: ${params.common_dbsnp}\n" } .set { ch_commonSNP_for_SNP; } } - // if (!params.single_end) - // assert params.cpg_file: "No CpG file specified" - - // ch_cpg_for_epiread = Channel.empty() - // if (!params.single_end) { - // if (params.cpg_file) { - // Channel - // .fromPath(params.cpg_file, checkIfExists: true) - // .ifEmpty { exit 1, "CpG file not found : ${params.cpg_file}" } - // .set { ch_cpg_for_epiread; ch_cpg_file_for_cpg_index; } - // } - // } - - } // Header log info log.info nfcoreHeader() @@ -355,10 +334,8 @@ if(debug_mode.size() > 0) summary['Debug mode'] = debug_mode.join(', ') if(params.bismark_align_cpu_per_multicore) summary['Bismark align CPUs per --multicore'] = params.bismark_align_cpu_per_multicore if(params.bismark_align_mem_per_multicore) summary['Bismark align memory per --multicore'] = params.bismark_align_mem_per_multicore if(params.assets_dir) summary['Assets Directory'] = params.assets_dir -if(params.soloWCGW_file) summary['soloWCGW File'] = params.soloWCGW_file if(params.whitelist) summary['Whitelist'] = params.whitelist if(params.common_dbsnp) summary['Common SNP'] = params.common_dbsnp -if(params.cpg_file) summary['CpG File'] = params.cpg_file if(params.epiread) summary['Epiread'] = 'Yes' summary['Output dir'] = params.outdir 
summary['Launch dir'] = workflow.launchDir @@ -549,13 +526,14 @@ if( !params.assets_dir && params.aligner == 'biscuit' ) { file fasta from ch_fasta_for_buildBiscuitQCAssets output: - file "assets" into ch_assets_dir_for_biscuit_qc, ch_assets_dir_with_cpg_for_epiread - // file "cpg.bed" into ch_cpg_for_epiread + file "*assets" into ch_assets_dir_for_biscuit_qc, ch_assets_dir_with_cpg_for_epiread script: + assembly = fasta.toString().replaceAll(/\.\w+/,"") + """ - build_biscuit_QC_assets.pl -r $fasta -o assets + build_biscuit_QC_assets.pl -r $fasta -o ${assembly}_assets """ } } @@ -1134,13 +1112,12 @@ if( params.aligner == 'biscuit' ){ } input: - set val(name), file(bam) from ch_bam_for_samtools_sort_index_flagstat - set val(name_samblaster), file(samblaster_bam) from ch_samblaster_for_samtools_sort_index_flagstat + set val(name), file(samblaster_bam) from ch_samblaster_for_samtools_sort_index_flagstat file wherearemyfiles from ch_wherearemyfiles_for_samtools_sort_index_flagstat.collect() output: - set val(name_samblaster), file("*.sorted.bam") into ch_bam_dedup_for_qualimap,ch_bam_for_preseq,ch_bam_sorted_for_pileup, ch_bam_sorted_for_epiread, ch_bam_noDups_for_QC,ch_bam_sorted_for_picard - file "*.sorted.bam.bai" into ch_bam_index_sorted_for_pileup,ch_bam_index_for_epiread,ch_bam_index_noDups_for_QC + set val(name), file("*.sorted.bam") into ch_bam_dedup_for_qualimap,ch_bam_for_preseq,ch_bam_sorted_for_pileup, ch_bam_sorted_for_epiread, ch_bam_noDups_for_QC,ch_bam_sorted_for_picard + set val(name), file ("*.sorted.bam.bai") into ch_bam_index_sorted_for_pileup,ch_bam_index_for_epiread,ch_bam_index_noDups_for_QC file "${samblaster_bam.baseName}_flagstat_report.txt" into ch_flagstat_results_biscuit_for_multiqc file "${samblaster_bam.baseName}_stats_report.txt" into ch_samtools_stats_results_biscuit_for_multiqc file "where_are_my_files.txt" @@ -1271,7 +1248,7 @@ if( params.aligner == 'biscuit' ){ set val(name), file(bam), file(bam_index), - file (snp), + file(snp), 
file(fasta), file(fasta_index), file(whitelist) from ch_bam_sorted_for_epiread @@ -1280,7 +1257,7 @@ if( params.aligner == 'biscuit' ){ .combine(ch_fasta_for_epiread) .combine(ch_fasta_index_for_epiread) .combine(ch_whitelist_for_epiread) - set val(cpg), file ("assets/cpg.bed.gz") from ch_assets_dir_with_cpg_for_epiread.collect() + file (assets) from ch_assets_dir_with_cpg_for_epiread.collect() output: @@ -1289,6 +1266,7 @@ if( params.aligner == 'biscuit' ){ script: snp_file = (snp.size()>0) ? "-B " + snp.toString() : '' + cpg_file = assets.toString() + "/cpg.bed.gz" debug_merging_epiread = (params.debug_epiread_merging || params.debug_epiread) ? "debug" : '' no_filter_reverse = params.rrbs ? "-p" : '' if (params.single_end) { @@ -1300,10 +1278,12 @@ if( params.aligner == 'biscuit' ){ """ } else if (params.debug_epiread) { """ + zcat $cpg_file > cpg.bed + bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam samtools index ${name}.bam biscuit epiread -q ${task.cpus} $snp_file $fasta ${name}.bam | sort --parallel=${task.cpus} -T . -k2,2 -k1,1 -k4,4 -k3,3n > ${name}.original.epiread - less ${name}.original.epiread | $baseDir/bin/epiread_pairedEnd_convertion $cpg $snp ${name}.epiread $debug_merging_epiread > ${name}.err + less ${name}.original.epiread | $baseDir/bin/epiread_pairedEnd_convertion "cpg.bed" $snp ${name}.epiread $debug_merging_epiread > ${name}.err sort -k1,1Vf -k 2,2n -k 3,3n --parallel=${task.cpus} -T . ${name}.epiread | bgzip > ${name}.epiread.gz sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . ${name}.err | bgzip > ${name}.err.gz sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . 
${name}.original.epiread | bgzip > ${name}.original.epiread.gz @@ -1314,9 +1294,11 @@ if( params.aligner == 'biscuit' ){ } else { """ + zcat $cpg_file > cpg.bed + bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam samtools index ${name}.bam - biscuit epiread -q ${task.cpus} $snp_file $fasta ${name}.bam | sort --parallel=${task.cpus} -T . -k2,2 -k1,1 -k4,4 -k3,3n | $baseDir/bin/epiread_pairedEnd_convertion $cpg $snp ${name}.epiread $debug_merging_epiread > ${name}.err + biscuit epiread -q ${task.cpus} $snp_file $fasta ${name}.bam | sort --parallel=${task.cpus} -T . -k2,2 -k1,1 -k4,4 -k3,3n | $baseDir/bin/epiread_pairedEnd_convertion "cpg.bed" $snp ${name}.epiread $debug_merging_epiread > ${name}.err sort -k1,1Vf -k 2,2n -k 3,3n --parallel=${task.cpus} -T . ${name}.epiread | bgzip > ${name}.epiread.gz sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . ${name}.err | bgzip > ${name}.err.gz tabix -0 -p bed ${name}.epiread.gz @@ -1348,7 +1330,7 @@ if( params.aligner == 'biscuit' ){ script: assembly = fasta.toString().replaceAll(/\.\w+/,"") """ - biscuit_QC.sh -v ${vcf[0]} -o ${name}.${assembly}_biscuitQC $assets $fasta ${name}.${assembly} ${bam} + QC.sh -v ${vcf[0]} -o ${name}.${assembly}_biscuitQC $assets $fasta ${name}.${assembly} ${bam} """ } @@ -1378,7 +1360,13 @@ process qualimap { script: gcref = params.genome.toString().startsWith('GRCh') ? '-gd HUMAN' : '' gcref = params.genome.toString().startsWith('GRCm') ? '-gd MOUSE' : '' + def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false + def sort_mem = avail_mem && avail_mem > 2 ? 
"-m ${avail_mem}G" : '' + """ + samtools sort $bam \\ + -@ ${task.cpus} $sort_mem \\ + -o ${bam.baseName}.sorted.bam qualimap bamqc $gcref \\ -bam ${bam.baseName}.bam \\ -outdir ${bam.baseName}_qualimap \\ From f1e559732e2a8a264ea3dc508c41c9d35995846c Mon Sep 17 00:00:00 2001 From: ekushele Date: Wed, 10 Feb 2021 11:24:10 +0200 Subject: [PATCH 25/56] change environment file, fix problems in scrape_software_versions.py fix problems in main --- .github/.dockstore.yml | 5 + .github/CONTRIBUTING.md | 4 +- .github/ISSUE_TEMPLATE/bug_report.md | 17 +- .github/ISSUE_TEMPLATE/feature_request.md | 12 +- .github/PULL_REQUEST_TEMPLATE.md | 15 +- .github/workflows/awsfulltest.yml | 42 ++ .github/workflows/awstest.yml | 46 ++ .github/workflows/branch.yml | 31 +- .github/workflows/ci.yml | 44 +- .github/workflows/linting.yml | 22 +- .github/workflows/push_dockerhub.yml | 40 ++ .gitignore | 1 + CHANGELOG.md | 12 + CODE_OF_CONDUCT.md | 6 +- Dockerfile | 5 +- README.md | 44 +- bin/markdown_to_html.py | 43 +- bin/scrape_software_versions.py | 35 +- conf/base.config | 1 - conf/igenomes.config | 1 + conf/test.config | 2 +- conf/test_full.config | 85 +++ docs/README.md | 11 +- docs/output.md | 85 +-- docs/usage.md | 282 ++------- environment.yml | 17 +- main.nf | 477 +++++++++------- nextflow.config | 19 +- nextflow_schema.json | 665 ++++++++++++++++++++++ 29 files changed, 1447 insertions(+), 622 deletions(-) create mode 100644 .github/.dockstore.yml create mode 100644 .github/workflows/awsfulltest.yml create mode 100644 .github/workflows/awstest.yml create mode 100644 .github/workflows/push_dockerhub.yml create mode 100644 conf/test_full.config create mode 100644 nextflow_schema.json diff --git a/.github/.dockstore.yml b/.github/.dockstore.yml new file mode 100644 index 00000000..030138a0 --- /dev/null +++ b/.github/.dockstore.yml @@ -0,0 +1,5 @@ +# Dockstore config version, not pipeline version +version: 1.2 +workflows: + - subclass: nfl + primaryDescriptorPath: /nextflow.config 
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index df9519c4..6e116810 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -46,7 +46,7 @@ These tests are run both with the latest available version of `Nextflow` and als ## Patch -: warning: Only in the unlikely and regretful event of a release happening with a bug. +:warning: Only in the unlikely and regretful event of a release happening with a bug. * On your own fork, make a new branch `patch` based on `upstream/master`. * Fix the bug, and bump version (X.Y.Z+1). @@ -54,4 +54,4 @@ These tests are run both with the latest available version of `Nextflow` and als ## Getting help -For further information/help, please consult the [nf-core/methylseq documentation](https://nf-co.re/nf-core/methylseq/docs) and don't hesitate to get in touch on the nf-core Slack [#methylseq](https://nfcore.slack.com/channels/methylseq) channel ([join our Slack here](https://nf-co.re/join/slack)). +For further information/help, please consult the [nf-core/methylseq documentation](https://nf-co.re/methylseq/usage) and don't hesitate to get in touch on the nf-core Slack [#methylseq](https://nfcore.slack.com/channels/methylseq) channel ([join our Slack here](https://nf-co.re/join/slack)). diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index d619122e..f07e0be8 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,24 +1,27 @@ + -## Describe the bug +## Description of the bug -A clear and concise description of what the bug is. + ## Steps to reproduce Steps to reproduce the behaviour: -1. Command line: `nextflow run ...` -2. See error: _Please provide your error message_ +1. Command line: +2. See error: ## Expected behaviour -A clear and concise description of what you expected to happen. + ## System @@ -33,10 +36,10 @@ A clear and concise description of what you expected to happen. 
## Container engine -- Engine: +- Engine: - version: - Image tag: ## Additional context -Add any other context about the problem here. + diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 9fe7a5e2..f7ef15a4 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,24 +1,26 @@ + ## Is your feature request related to a problem? Please describe -A clear and concise description of what the problem is. + -Ex. I'm always frustrated when [...] + ## Describe the solution you'd like -A clear and concise description of what you want to happen. + ## Describe alternatives you've considered -A clear and concise description of any alternative solutions or features you've considered. + ## Additional context -Add any other context about the feature request here. + diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index d42a0672..cff68160 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,3 +1,4 @@ + + ## PR checklist - [ ] This comment contains a description of changes (with reason) +- [ ] `CHANGELOG.md` is updated - [ ] If you've fixed a bug or added code that should be tested, add tests! -- [ ] If necessary, also make a PR on the [nf-core/methylseq branch on the nf-core/test-datasets repo](https://github.com/nf-core/test-datasets/pull/new/nf-core/methylseq) -- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). -- [ ] Make sure your code lints (`nf-core lint .`). 
- [ ] Documentation in `docs` is updated -- [ ] `CHANGELOG.md` is updated -- [ ] `README.md` is updated - -**Learn more about contributing:** [CONTRIBUTING.md](https://github.com/nf-core/methylseq/tree/master/.github/CONTRIBUTING.md) \ No newline at end of file +- [ ] If necessary, also make a PR on the [nf-core/methylseq branch on the nf-core/test-datasets repo](https://github.com/nf-core/test-datasets/pull/new/nf-core/methylseq) diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml new file mode 100644 index 00000000..50f85b4b --- /dev/null +++ b/.github/workflows/awsfulltest.yml @@ -0,0 +1,42 @@ +name: nf-core AWS full size tests +# This workflow is triggered on published releases. +# It can be additionally triggered manually with GitHub actions workflow dispatch. +# It runs the -profile 'test_full' on AWS batch + +on: + release: + types: [published] + workflow_dispatch: + +jobs: + run-awstest: + name: Run AWS full tests + if: github.repository == 'nf-core/methylseq' + runs-on: ubuntu-latest + steps: + - name: Setup Miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + python-version: 3.7 + - name: Install awscli + run: conda install -c conda-forge awscli + - name: Start AWS batch job + # TODO nf-core: You can customise AWS full pipeline tests as required + # Add full size test data (but still relatively small datasets for few samples) + # on the `test_full.config` test runs with only one set of parameters + # Then specify `-profile test_full` instead of `-profile test` on the AWS batch command + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + TOWER_ACCESS_TOKEN: ${{ secrets.AWS_TOWER_TOKEN }} + AWS_JOB_DEFINITION: ${{ secrets.AWS_JOB_DEFINITION }} + AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} + AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} + run: | + aws batch submit-job \ + --region eu-west-1 \ + --job-name nf-core-methylseq 
\ + --job-queue $AWS_JOB_QUEUE \ + --job-definition $AWS_JOB_DEFINITION \ + --container-overrides '{"command": ["nf-core/methylseq", "-r '"${GITHUB_SHA}"' -profile test --outdir s3://'"${AWS_S3_BUCKET}"'/methylseq/results-'"${GITHUB_SHA}"' -w s3://'"${AWS_S3_BUCKET}"'/methylseq/work-'"${GITHUB_SHA}"' -with-tower"], "environment": [{"name": "TOWER_ACCESS_TOKEN", "value": "'"$TOWER_ACCESS_TOKEN"'"}]}' diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml new file mode 100644 index 00000000..b3ceed17 --- /dev/null +++ b/.github/workflows/awstest.yml @@ -0,0 +1,46 @@ +name: nf-core AWS test +# This workflow is triggered on push to the master branch. +# It can be additionally triggered manually with GitHub actions workflow dispatch. +# It runs the -profile 'test' on AWS batch. + +on: + workflow_dispatch: + +jobs: + run-awstest: + name: Run AWS tests + if: github.repository == 'nf-core/methylseq' + runs-on: ubuntu-latest + strategy: + matrix: + aligner: ["bismark", "bismark_hisat", "bwameth"] + include: + - aligner: "bismark" + ref_index: --bismark_index results/reference_genome/BismarkIndex/ + - aligner: "bismark_hisat" + ref_index: --bismark_index results/reference_genome/BismarkIndex/ + - aligner: "bwameth" + ref_index: --bwa_meth_index results/reference_genome/genome.fa + steps: + - name: Setup Miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + python-version: 3.7 + - name: Install awscli + run: conda install -c conda-forge awscli + - name: Start AWS batch job + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + TOWER_ACCESS_TOKEN: ${{ secrets.AWS_TOWER_TOKEN }} + AWS_JOB_DEFINITION: ${{ secrets.AWS_JOB_DEFINITION }} + AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} + AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} + run: | + aws batch submit-job \ + --region eu-west-1 \ + --job-name nf-core-methylseq \ + --job-queue $AWS_JOB_QUEUE \ + 
--job-definition $AWS_JOB_DEFINITION \ + --container-overrides '{"command": ["nf-core/methylseq", "-r '"${GITHUB_SHA}"' -profile test --aligner ${{matrix.aligner} --outdir s3://'"${AWS_S3_BUCKET}"'/methylseq/results-'"${GITHUB_SHA}"' -w s3://'"${AWS_S3_BUCKET}"'/methylseq/work-'"${GITHUB_SHA}"' -with-tower"], "environment": [{"name": "TOWER_ACCESS_TOKEN", "value": "'"$TOWER_ACCESS_TOKEN"'"}]}' diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index af9fd4d2..e4368a43 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -3,14 +3,35 @@ name: nf-core branch protection # It fails when someone tries to make a PR against the nf-core `master` branch instead of `dev` on: pull_request: - branches: - - master + branches: [master] jobs: test: - runs-on: ubuntu-18.04 + runs-on: ubuntu-latest steps: - # PRs are only ok if coming from an nf-core `dev` branch or a fork `patch` branch + # PRs to the nf-core repo master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches - name: Check PRs + if: github.repository == 'nf-core/methylseq' run: | - { [[ $(git remote get-url origin) == *nf-core/methylseq ]] && [[ ${GITHUB_HEAD_REF} = "dev" ]]; } || [[ ${GITHUB_HEAD_REF} == "patch" ]] + { [[ ${{github.event.pull_request.head.repo.full_name}} == nf-core/methylseq ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] + + + # If the above check failed, post a comment on the PR explaining the failure + # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets + - name: Post PR comment + if: failure() + uses: mshick/add-pr-comment@v1 + with: + message: | + Hi @${{ github.event.pull_request.user.login }}, + + It looks like this pull-request is has been made against the ${{github.event.pull_request.head.repo.full_name}} `master` branch. + The `master` branch on nf-core repositories should always contain code from the latest release. 
+ Because of this, PRs to `master` are only allowed if they come from the ${{github.event.pull_request.head.repo.full_name}} `dev` branch. + + You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page. + + Thanks again for your contribution! + repo-token: ${{ secrets.GITHUB_TOKEN }} + allow-repeats: false + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4e81739f..e620c5e5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,18 +1,26 @@ name: nf-core CI -# This workflow is triggered on pushes and PRs to the repository. -# It runs the pipeline with the minimal test dataset to check that it completes without any syntax errors -on: [push, pull_request] +# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors +on: + push: + branches: + - dev + pull_request: + release: + types: [published] jobs: test: + name: Run workflow tests + # Only run on push if this is the nf-core dev branch (merged PRs) + if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/methylseq') }} + runs-on: ubuntu-latest env: NXF_VER: ${{ matrix.nxf_ver }} NXF_ANSI_LOG: false - runs-on: ubuntu-latest strategy: matrix: # Nextflow versions: check pipeline minimum and current latest - nxf_ver: ['19.10.0', ''] + nxf_ver: ['20.04.0', ''] aligner: ['bismark', 'bismark_hisat', 'bwameth', 'biscuit'] include: - aligner: 'bismark' @@ -24,15 +32,31 @@ jobs: - aligner: 'biscuit' ref_index: --bwa_biscuit_index results/reference_genome/genome.fa steps: - - uses: actions/checkout@v2 - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ + - name: Check out pipeline code + uses: actions/checkout@v2 + + - name: Check if Dockerfile or Conda environment changed + uses: technote-space/get-diff-action@v4 + with: + FILES: | + Dockerfile + 
environment.yml + + - name: Build new docker image + if: env.MATCHED_FILES + run: docker build --no-cache . -t nfcore/methylseq:dev + - name: Pull docker image + if: ${{ !env.MATCHED_FILES }} run: | docker pull nfcore/methylseq:dev docker tag nfcore/methylseq:dev nfcore/methylseq:dev + + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + - name: Run pipeline with test data run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner ${{matrix.aligner}} --save_reference diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 1e0827a8..8e8d5bbc 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -33,18 +33,36 @@ jobs: nf-core: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + + - name: Check out pipeline code + uses: actions/checkout@v2 + - name: Install Nextflow run: | wget -qO- get.nextflow.io | bash sudo mv nextflow /usr/local/bin/ + - uses: actions/setup-python@v1 with: python-version: '3.6' architecture: 'x64' + - name: Install dependencies run: | python -m pip install --upgrade pip pip install nf-core + - name: Run nf-core lint - run: nf-core lint ${GITHUB_WORKSPACE} + env: + GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }} + run: nf-core -l lint_log.txt lint ${GITHUB_WORKSPACE} + + - name: Upload linting log file artifact + if: ${{ always() }} + uses: actions/upload-artifact@v2 + with: + name: linting-log-file + path: lint_log.txt + diff --git a/.github/workflows/push_dockerhub.yml b/.github/workflows/push_dockerhub.yml new file mode 100644 index 00000000..0482a48c --- /dev/null +++ b/.github/workflows/push_dockerhub.yml @@ -0,0 +1,40 @@ +name: nf-core Docker push +# This builds the docker image and pushes it to DockerHub +# Runs on nf-core repo releases and push event to 'dev' branch (PR merges) +on: + push: 
+ branches: + - dev + release: + types: [published] + +jobs: + push_dockerhub: + name: Push new Docker image to Docker Hub + runs-on: ubuntu-latest + # Only run for the nf-core repo, for releases and merged PRs + if: ${{ github.repository == 'nf-core/methylseq' }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASS }} + steps: + - name: Check out pipeline code + uses: actions/checkout@v2 + + - name: Build new docker image + run: docker build --no-cache . -t nfcore/methylseq:latest + + - name: Push Docker image to DockerHub (dev) + if: ${{ github.event_name == 'push' }} + run: | + echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin + docker tag nfcore/methylseq:latest nfcore/methylseq:dev + docker push nfcore/methylseq:dev + + - name: Push Docker image to DockerHub (release) + if: ${{ github.event_name == 'release' }} + run: | + echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin + docker push nfcore/methylseq:latest + docker tag nfcore/methylseq:latest nfcore/methylseq:${{ github.event.release.tag_name }} + docker push nfcore/methylseq:${{ github.event.release.tag_name }} diff --git a/.gitignore b/.gitignore index 6354f370..aa4bb5b3 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ results/ .DS_Store tests/ testing/ +testing* *.pyc diff --git a/CHANGELOG.md b/CHANGELOG.md index 62d50c94..8f394111 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # nf-core/methylseq +<<<<<<< HEAD ## [v1.6dev] ### New features @@ -14,6 +15,15 @@ * _new dependency_: samblaster`0.1.24` * _new dependency_: bedtools `2.29.1` * _new_: biscuit tool `0.3.11` +======= +## v1.6dev - [date] + +### Pipeline Updates + +* Updated template to tools 1.11 +* Moved parameter documentation into new `nextflow_schema.json` file. 
+* Added new `--maxins` and `--minins` parameters to pass on to Bismark +>>>>>>> 9218f1199bca434af49b54963eea91cfed572597 ## [v1.5](https://github.com/nf-core/methylseq/releases/tag/1.5) - 2020-04-09 @@ -23,6 +33,8 @@ * Improved the multicore support for Bismark methXtract for more parallelisation ([#121](https://github.com/nf-core/methylseq/issues/121)) * Added `--cytosine_report` option to tell Bismark to give reports for all cytosines in the genome. * Added options `--bismark_align_cpu_per_multicore` and `--bismark_align_cpu_per_multicore` to customise how Bismark align `--multicore` is decided ([#124](https://github.com/nf-core/methylseq/issues/124)) +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ### Software updates diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index cf930c8a..405fb1bf 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -40,7 +40,7 @@ Project maintainers who do not follow or enforce the Code of Conduct in good fai ## Attribution -This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct/][version] -[homepage]: http://contributor-covenant.org -[version]: http://contributor-covenant.org/version/1/4/ +[homepage]: https://contributor-covenant.org +[version]: https://www.contributor-covenant.org/version/1/4/code-of-conduct/ diff --git a/Dockerfile b/Dockerfile index 90ad002f..04602694 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,14 @@ -FROM nfcore/base:1.9 +FROM nfcore/base:1.11 LABEL authors="Phil Ewels" \ description="Docker image containing all software requirements for the nf-core/methylseq pipeline" # Install the conda environment COPY 
environment.yml / -RUN conda env create -f /environment.yml && conda clean -a +RUN conda env create --quiet -f /environment.yml && conda clean -a # Add conda installation dir to PATH (instead of doing 'conda activate') ENV PATH /opt/conda/envs/nf-core-methylseq-1.6dev/bin:$PATH # Dump the details of the installed packages to a file for posterity RUN conda env export --name nf-core-methylseq-1.6dev > nf-core-methylseq-1.6dev.yml + diff --git a/README.md b/README.md index ae3c873f..b510a259 100644 --- a/README.md +++ b/README.md @@ -3,10 +3,11 @@ [![DOI](https://zenodo.org/badge/124913037.svg)](https://zenodo.org/badge/latestdoi/124913037) [![GitHub Actions CI Status](https://github.com/nf-core/methylseq/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/methylseq/actions) [![GitHub Actions Linting Status](https://github.com/nf-core/methylseq/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/methylseq/actions) -[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A519.10.0-brightgreen.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.04.0-brightgreen.svg)](https://www.nextflow.io/) -[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](http://bioconda.github.io/) -[![Docker Container available](https://img.shields.io/docker/automated/nfcore/methylseq.svg)](https://hub.docker.com/r/nfcore/methylseq/) +[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](https://bioconda.github.io/) +[![Docker](https://img.shields.io/docker/automated/nfcore/methylseq.svg)](https://hub.docker.com/r/nfcore/methylseq) +[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23methylseq-4A154B?logo=slack)](https://nfcore.slack.com/channels/methylseq) nf-core/methylseq is a bioinformatics analysis pipeline used for Methylation (Bisulfite) sequencing data. 
It pre-processes raw data from FastQ inputs, aligns the reads and performs extensive quality-control on the results. @@ -32,38 +33,29 @@ Choose between workflows by using `--aligner bismark` (default, uses bowtie2 for ## Quick Start -i. Install [`nextflow`](https://nf-co.re/usage/installation) +1. Install [`nextflow`](https://nf-co.re/usage/installation) -ii. Install either [`Docker`](https://docs.docker.com/engine/installation/) or [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) for full pipeline reproducibility (please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles)) +2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or [`Podman`](https://podman.io/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_ -iii. Download the pipeline and test it on a minimal dataset with a single command +3. Download the pipeline and test it on a minimal dataset with a single command: -```bash -nextflow run nf-core/methylseq -profile test, -``` + ```bash + nextflow run nf-core/methylseq -profile test, + ``` -> Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. + > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. 
This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. -iv. Start running your own analysis! +4. Start running your own analysis! -```bash -nextflow run nf-core/methylseq -profile --reads '*_R{1,2}.fastq.gz' --genome GRCh37 -``` + ```bash + nextflow run nf-core/methylseq -profile --input '*_R{1,2}.fastq.gz' --genome GRCh37 + ``` -See [usage docs](docs/usage.md) for all of the available options when running the pipeline. +See [usage docs](https://nf-co.re/methylseq/usage) for all of the available options when running the pipeline. ## Documentation -The nf-core/methylseq pipeline comes with documentation about the pipeline, found in the `docs/` directory: - -1. [Installation](https://nf-co.re/usage/installation) -2. Pipeline configuration - * [Local installation](https://nf-co.re/usage/local_installation) - * [Adding your own system config](https://nf-co.re/usage/adding_own_config) - * [Reference genomes](https://nf-co.re/usage/reference_genomes) -3. [Running the pipeline](docs/usage.md) -4. [Output and how to interpret the results](docs/output.md) -5. [Troubleshooting](https://nf-co.re/usage/troubleshooting) +The nf-core/methylseq pipeline comes with documentation about the pipeline: [usage](https://nf-co.re/methylseq/usage) and [output](https://nf-co.re/methylseq/output). ## Credits @@ -80,7 +72,7 @@ These scripts were originally written for use at the [National Genomics Infrastr If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). -For further information or help, don't hesitate to get in touch on [Slack](https://nfcore.slack.com/channels/methylseq) (you can join with [this invite](https://nf-co.re/join/slack)). +For further information or help, don't hesitate to get in touch on the [Slack `#methylseq` channel](https://nfcore.slack.com/channels/methylseq) (you can join with [this invite](https://nf-co.re/join/slack)). 
## Citation diff --git a/bin/markdown_to_html.py b/bin/markdown_to_html.py index 57cc4263..a26d1ff5 100755 --- a/bin/markdown_to_html.py +++ b/bin/markdown_to_html.py @@ -4,33 +4,23 @@ import markdown import os import sys +import io + def convert_markdown(in_fn): - input_md = open(in_fn, mode="r", encoding="utf-8").read() + input_md = io.open(in_fn, mode="r", encoding="utf-8").read() html = markdown.markdown( "[TOC]\n" + input_md, - extensions = [ - 'pymdownx.extra', - 'pymdownx.b64', - 'pymdownx.highlight', - 'pymdownx.emoji', - 'pymdownx.tilde', - 'toc' - ], - extension_configs = { - 'pymdownx.b64': { - 'base_path': os.path.dirname(in_fn) - }, - 'pymdownx.highlight': { - 'noclasses': True - }, - 'toc': { - 'title': 'Table of Contents' - } - } + extensions=["pymdownx.extra", "pymdownx.b64", "pymdownx.highlight", "pymdownx.emoji", "pymdownx.tilde", "toc"], + extension_configs={ + "pymdownx.b64": {"base_path": os.path.dirname(in_fn)}, + "pymdownx.highlight": {"noclasses": True}, + "toc": {"title": "Table of Contents"}, + }, ) return html + def wrap_html(contents): header = """ @@ -83,18 +73,19 @@ def wrap_html(contents): def parse_args(args=None): parser = argparse.ArgumentParser() - parser.add_argument('mdfile', type=argparse.FileType('r'), nargs='?', - help='File to convert. Defaults to stdin.') - parser.add_argument('-o', '--out', type=argparse.FileType('w'), - default=sys.stdout, - help='Output file name. Defaults to stdout.') + parser.add_argument("mdfile", type=argparse.FileType("r"), nargs="?", help="File to convert. Defaults to stdin.") + parser.add_argument( + "-o", "--out", type=argparse.FileType("w"), default=sys.stdout, help="Output file name. Defaults to stdout." 
+ ) return parser.parse_args(args) + def main(args=None): args = parse_args(args) converted_md = convert_markdown(args.mdfile.name) html = wrap_html(converted_md) args.out.write(html) -if __name__ == '__main__': + +if __name__ == "__main__": sys.exit(main()) diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 5f9c8b73..e30f2624 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -4,7 +4,7 @@ import re regexes = { - 'nf-core/methylseq': ['v_ngi_methylseq.txt', r"(\S+)"], + 'nf-core/methylseq': ['v_pipeline.txt', r"(\S+)"], 'Nextflow': ['v_nextflow.txt', r"(\S+)"], 'Bismark genomePrep': ['v_bismark_genome_preparation.txt', r"Bismark Genome Preparation Version: v(\S+)"], 'FastQC': ['v_fastqc.txt', r"FastQC v(\S+)"], @@ -25,12 +25,15 @@ 'Picard CollectGcBiasMetrics': ['v_picard_collectgcbias.txt', r"([\d\.]+)"], 'samblaster': ['v_samblaster.txt', r"samblaster: Version (\S+)"], 'biscuit': ['v_biscuit.txt', r"Version: (\S+)"], - 'fastasort': ['v_fastasort.txt', r"fastasort from exonerate version (\S+)"], - 'MethylDackel': ['v_methyldackel.txt', r"(.+)"], + 'bcftools': ['v_bcftools.txt', r"bcftools (\S+)"], + 'bedtools': ['v_bedtools.txt', r"bedtools (\S+)"], + 'parallel': ['v_parallel.txt', r"GNU parallel (\S+)"], + 'gawk': ['v_gawk.txt', r"GNU Awk (\S+)"], + 'MethylDackel': ['v_methyldackel.txt', r"(.+)"], 'Qualimap': ['v_qualimap.txt', r"QualiMap v.(\S+)"], 'Preseq': ['v_preseq.txt', r"Version: (\S+)"], 'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"], - + } results = OrderedDict() results['nf-core/methylseq'] = 'N/A' @@ -58,6 +61,11 @@ results['Picard CreateSequenceDictionary'] = 'N/A' results['Picard CollectInsertSizeMetrics'] = 'N/A' results['Picard CollectGcBiasMetrics'] = 'N/A' +results['bcftools'] = 'N/A' +results['bedtools'] = 'N/A' +results['gawk'] = 'N/A' +results['parallel'] = 'N/A' + # Search each file using its regex for k, v in regexes.items(): @@ -73,10 +81,10 @@ # Remove software 
set to false in results for k in list(results): if not results[k]: - del(results[k]) + del results[k] # Dump to YAML -print (''' +print(""" id: 'software_versions' section_name: 'nf-core/methylseq Software Versions' section_href: 'https://github.com/nf-core/methylseq' @@ -84,12 +92,13 @@ description: 'are collected at run time from the software output.' data: |
-''') -for k,v in results.items(): - print("
{}
{}
".format(k,v)) -print ("
") +""" +) +for k, v in results.items(): + print("
{}
{}
".format(k,v)) +print(" ") # Write out regexes as csv file: -with open('software_versions.csv', 'w') as f: - for k,v in results.items(): - f.write("{}\t{}\n".format(k,v)) +with open("software_versions.csv", "w") as f: + for k, v in results.items(): + f.write("{}\t{}\n".format(k, v)) diff --git a/conf/base.config b/conf/base.config index e632da8d..8427ec86 100644 --- a/conf/base.config +++ b/conf/base.config @@ -65,7 +65,6 @@ process { } withName:get_software_versions { - validExitStatus = [0,1] cache = false } withName:bwamem_align { diff --git a/conf/igenomes.config b/conf/igenomes.config index 2de92422..caeafceb 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -340,6 +340,7 @@ params { gtf = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.bed" mito_name = "chrM" + macs_gsize = "1.37e9" } 'dm6' { fasta = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa" diff --git a/conf/test.config b/conf/test.config index a59a3702..3ee85e58 100644 --- a/conf/test.config +++ b/conf/test.config @@ -17,7 +17,7 @@ params { // Input data single_end = true - readPaths = [ + input_paths = [ ['SRR389222_sub1', ['https://github.com/nf-core/test-datasets/raw/methylseq/testdata/SRR389222_sub1.fastq.gz']], ['SRR389222_sub2', ['https://github.com/nf-core/test-datasets/raw/methylseq/testdata/SRR389222_sub2.fastq.gz']], ['SRR389222_sub3', ['https://github.com/nf-core/test-datasets/raw/methylseq/testdata/SRR389222_sub3.fastq.gz']] diff --git a/conf/test_full.config b/conf/test_full.config new file mode 100644 index 00000000..03eb4d5f --- /dev/null +++ b/conf/test_full.config @@ -0,0 +1,85 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running full-size tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a 
full size pipeline test. Use as follows: + * nextflow run nf-core/methylseq -profile test_full, + */ + +params { + config_profile_name = 'Full test profile' + config_profile_description = 'Full test dataset to check pipeline function' + + // Input data for full size test + input_paths = [ + ['SRR7961089_GSM3415653_MShef1', + [ + 's3://nf-core-awsmegatests/methylseq/input_data/SRR7961089_GSM3415653_MShef11_bulk_Homo_sapiens_Bisulfite-Seq_1.fastq.gz', + 's3://nf-core-awsmegatests/methylseq/input_data/SRR7961089_GSM3415653_MShef11_bulk_Homo_sapiens_Bisulfite-Seq_2.fastq.gz' + ], + ], + ['SRR7961103_GSM3415667_MShef11_low_oxygen_Q2', + [ + 's3://nf-core-awsmegatests/methylseq/input_data/SRR7961103_GSM3415667_MShef11_low_oxygen_Q2_BSseq_Homo_sapiens_Bisulfite-Seq_1.fastq.gz', + 's3://nf-core-awsmegatests/methylseq/input_data/SRR7961103_GSM3415667_MShef11_low_oxygen_Q2_BSseq_Homo_sapiens_Bisulfite-Seq_2.fastq.gz' + ], + ], + ['SRR7961102_GSM3415666_MShef11_low_oxygen_Q1', + [ + 's3://nf-core-awsmegatests/methylseq/input_data/SRR7961102_GSM3415666_MShef11_low_oxygen_Q1_BSseq_Homo_sapiens_Bisulfite-Seq_1.fastq.gz', + 's3://nf-core-awsmegatests/methylseq/input_data/SRR7961102_GSM3415666_MShef11_low_oxygen_Q1_BSseq_Homo_sapiens_Bisulfite-Seq_2.fastq.gz' + ], + ], + ['SRR7961104_GSM3415668_MShef11_low_oxygen_Q3', + [ + 's3://nf-core-awsmegatests/methylseq/input_data/SRR7961104_GSM3415668_MShef11_low_oxygen_Q3_BSseq_Homo_sapiens_Bisulfite-Seq_1.fastq.gz', + 's3://nf-core-awsmegatests/methylseq/input_data/SRR7961104_GSM3415668_MShef11_low_oxygen_Q3_BSseq_Homo_sapiens_Bisulfite-Seq_2.fastq.gz' + ], + ], + ['SRR7961150_GSM3415714_MShef4_bulk', + [ + 's3://nf-core-awsmegatests/methylseq/input_data/SRR7961150_GSM3415714_MShef4_bulk_BSseq_Homo_sapiens_Bisulfite-Seq_1.fastq.gz', + 's3://nf-core-awsmegatests/methylseq/input_data/SRR7961150_GSM3415714_MShef4_bulk_BSseq_Homo_sapiens_Bisulfite-Seq_2.fastq.gz' + ], + ], + ['SRR7961161_GSM3415725_MShef4_J1', + [ + 
's3://nf-core-awsmegatests/methylseq/input_data/SRR7961161_GSM3415725_MShef4_J1_BSseq_Homo_sapiens_Bisulfite-Seq_1.fastq.gz', + 's3://nf-core-awsmegatests/methylseq/input_data/SRR7961161_GSM3415725_MShef4_J1_BSseq_Homo_sapiens_Bisulfite-Seq_2.fastq.gz' + ], + ], + ['SRR7961164_GSM3415728_MShef4_J3', + [ + 's3://nf-core-awsmegatests/methylseq/input_data/SRR7961164_GSM3415728_MShef4_J3_BSseq_Homo_sapiens_Bisulfite-Seq_1.fastq.gz', + 's3://nf-core-awsmegatests/methylseq/input_data/SRR7961164_GSM3415728_MShef4_J3_BSseq_Homo_sapiens_Bisulfite-Seq_2.fastq.gz' + ], + ], + ['SRR7961163_GSM3415727_MShef4_J2', + [ + 's3://nf-core-awsmegatests/methylseq/input_data/SRR7961163_GSM3415727_MShef4_J2_BSseq_Homo_sapiens_Bisulfite-Seq_1.fastq.gz', + 's3://nf-core-awsmegatests/methylseq/input_data/SRR7961163_GSM3415727_MShef4_J2_BSseq_Homo_sapiens_Bisulfite-Seq_2.fastq.gz' + ] + ] + ] + + genome = 'GRCh38' +} + + +// ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR796/009/SRR7961089/SRR7961089_1.fastq.gz +// ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR796/009/SRR7961089/SRR7961089_2.fastq.gz +// ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR796/003/SRR7961103/SRR7961103_1.fastq.gz +// ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR796/003/SRR7961103/SRR7961103_2.fastq.gz +// ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR796/002/SRR7961102/SRR7961102_1.fastq.gz +// ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR796/002/SRR7961102/SRR7961102_2.fastq.gz +// ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR796/004/SRR7961104/SRR7961104_1.fastq.gz +// ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR796/004/SRR7961104/SRR7961104_2.fastq.gz +// ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR796/000/SRR7961150/SRR7961150_1.fastq.gz +// ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR796/000/SRR7961150/SRR7961150_2.fastq.gz +// ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR796/001/SRR7961161/SRR7961161_1.fastq.gz +// ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR796/001/SRR7961161/SRR7961161_2.fastq.gz +// ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR796/004/SRR7961164/SRR7961164_1.fastq.gz +// 
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR796/004/SRR7961164/SRR7961164_2.fastq.gz +// ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR796/003/SRR7961163/SRR7961163_1.fastq.gz +// ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR796/003/SRR7961163/SRR7961163_2.fastq.gz diff --git a/docs/README.md b/docs/README.md index 859e2837..f1f3d09b 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,12 +1,3 @@ # nf-core/methylseq: Documentation -The nf-core/methylseq documentation is split into the following files: - -1. [Installation](https://nf-co.re/usage/installation) -2. Pipeline configuration - * [Local installation](https://nf-co.re/usage/local_installation) - * [Adding your own system config](https://nf-co.re/usage/adding_own_config) - * [Reference genomes](https://nf-co.re/usage/reference_genomes) -3. [Running the pipeline](usage.md) -4. [Output and how to interpret the results](output.md) -5. [Troubleshooting](https://nf-co.re/usage/troubleshooting) +Please read this documentation on the nf-core website: [https://nf-co.re/methylseq/](https://nf-co.re/methylseq/) diff --git a/docs/output.md b/docs/output.md index 5a5caa45..1bce3d86 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,43 +1,56 @@ # nf-core/methylseq Output +## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/methylseq/output](https://nf-co.re/methylseq/output) + +> _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ + +## Introduction + This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. Note that nf-core/methylseq contains two workflows - one for Bismark, one for bwa-meth. The results files produced will vary depending on which variant is run. +The output directories listed below will be created in the results directory after the pipeline has finished. 
All paths are relative to the top-level results directory. + ## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -* [FastQC](#fastqc) - read quality control -* [TrimGalore](#trimgalore) - adapter trimming -* [Alignment](#alignment) - aligning reads to reference genome -* [Deduplication](#deduplication) - deduplicating reads -* [Methylation Extraction](#methylation-extraction) - calling cytosine methylation steps -* [Bismark Reports](#bismark-reports) - single-sample and summary analysis reports -* [Biscuit Reports](#biscuit reports) - single-sample analysis reports for biscuit aligner -* [Qualimap](#qualimap) - tool for genome alignments QC -* [Preseq](#preseq) - tool for estimating sample complexity -* [Picard](#picard) - tool for generating metrics of statistics -* [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline -* [Pipeline Info](#pipeline-info) - reports from nextflow about the pipeline run +* [FastQC](#fastqc) - Read quality control +* [TrimGalore](#trimgalore) - Adapter trimming +* [Alignment](#alignment) - Aligning reads to reference genome +* [Deduplication](#deduplication) - Deduplicating reads +* [Methylation Extraction](#methylation-extraction) - Calling cytosine methylation steps +* [Bismark Reports](#bismark-reports) - Single-sample and summary analysis reports +* [Biscuit Reports](#biscuit reports) - Single-sample analysis reports for biscuit aligner +* [Qualimap](#qualimap) - Tool for genome alignments QC +* [Preseq](#preseq) - Tool for estimating sample complexity +* [Picard](#picard) - Tool for generating metrics of statistics +* [MultiQC](#multiqc) - Aggregate report describing results of the whole pipeline +* [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution ## FastQC -[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your reads. 
It provides information about the quality score distribution across your reads, the per base sequence content (%T/A/G/C). You get information about adapter contamination and other overrepresented sequences. +[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. -For further reading and documentation see the [FastQC help](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). -> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the `trim_galore` directory. +**Output files:** -**Output directory: `results/fastqc`** +* `fastqc/` + * `*_fastqc.html`: FastQC report containing quality metrics for your untrimmed raw fastq files. +* `fastqc/zips/` + * `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. * `sample_fastqc.html` * FastQC report, containing quality metrics for your untrimmed raw fastq files * `sample_fastqc.zip` * zip file containing the FastQC report, tab-delimited data file and plot images +> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. + ## TrimGalore The nf-core/methylseq pipeline uses [TrimGalore!](http://www.bioinformatics.babraham.ac.uk/projects/trim_galore/) for removal of adapter contamination and trimming of low quality regions. 
TrimGalore is a wrapper around [Cutadapt](https://github.com/marcelm/cutadapt) and runs FastQC after it finishes. @@ -253,36 +266,26 @@ The two metrics created here are: ## MultiQC -[MultiQC](http://multiqc.info) is a visualisation tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in within the report data directory. +[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarizing all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. -The pipeline has special steps which allow the software versions used to be reported in the MultiQC output for future traceability. +The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. -**Output directory: `results/multiqc`** +For more information about how to use MultiQC reports, see [https://multiqc.info](https://multiqc.info). -* `Project_multiqc_report.html` - * MultiQC report - a standalone HTML file that can be viewed in your web browser -* `Project_multiqc_data/` - * Directory containing parsed statistics from the different tools used in the pipeline +**Output files:** -For more information about how to use MultiQC reports, see [MultiQC](http://multiqc.info) +* `multiqc/` + * `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. + * `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. + * `multiqc_plots/`: directory containing static images from the report in various formats. -## Pipeline Info +## Pipeline information -Nextflow has several built-in reporting tools that give information about the pipeline run. 
+[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. -**Output directory: `results/pipeline_info`** +**Output files:** -* `pipeline_dag.svg` - * DAG graph giving a diagrammatic view of the pipeline run. - * NB: If [Graphviz](http://www.graphviz.org/) was not installed when running the pipeline, this file will be in [DOT format](http://www.graphviz.org/content/dot-language) instead of SVG. -* `execution_report.html` - * Nextflow report describing parameters, computational resource usage and task bash commands used. -* `execution_timeline.html` - * A waterfall timeline plot showing the running times of the workflow tasks. -* `execution_trace.txt` - * A text file with machine-readable statistics about every task executed in the pipeline. -* `pipeline_report.html` - * A pipeline-specific HTML report describing the running of the pipeline. - * This is the same as sent in an email if `--email` was specified. -* `pipeline_report.txt` - * A text-only version of the same report. +* `pipeline_info/` + * Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. + * Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.csv`. + * Documentation for interpretation of results in HTML format: `results_description.html`. 
diff --git a/docs/usage.md b/docs/usage.md index 6e8898d6..ee7fe191 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -2,6 +2,10 @@ # nf-core/methylseq: Usage +## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/methylseq/usage](https://nf-co.re/methylseq/usage) + +> _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ + ## Table of contents * [Table of contents](#table-of-contents) @@ -10,6 +14,7 @@ * [Running the pipeline](#running-the-pipeline) * [Updating the pipeline](#updating-the-pipeline) * [Reproducibility](#reproducibility) +<<<<<<< HEAD * [Main arguments](#main-arguments) * [`-profile`](#-profile) * [`--reads`](#--reads) @@ -82,6 +87,12 @@ NXF_OPTS='-Xms1g -Xmx4g' ### Bismark, bwa-meth and biscuit workflow The nf-core/methylseq package is actually three pipelines in one. The default workflow uses [Bismark](http://www.bioinformatics.babraham.ac.uk/projects/bismark/) with [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) as alignment tool: unless specified otherwise, nf-core/methylseq will run this pipeline. +======= + +## Introduction + +The nf-core/methylseq package is actually two pipelines in one. The default workflow uses [Bismark](http://www.bioinformatics.babraham.ac.uk/projects/bismark/) with [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) as alignment tool: unless specified otherwise, nf-core/methylseq will run this pipeline. +>>>>>>> 9218f1199bca434af49b54963eea91cfed572597 Since bismark v0.21.0 it is also possible to use [HISAT2](https://ccb.jhu.edu/software/hisat2/index.shtml) as alignment tool. To run this workflow, invoke the pipeline with the command line flag `--aligner bismark_hisat`. HISAT2 also supports splice-aware alignment if analysis of RNA is desired (e.g. 
[SLAMseq](https://science.sciencemag.org/content/360/6390/800) experiments), a file containing a list of known splicesites can be provided with `--known_splices`. @@ -94,7 +105,7 @@ The third workflow uses [biscuit]([https://github.com/huishenlab/biscuit](https: The typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/methylseq --reads '*_R{1,2}.fastq.gz' -profile docker +nextflow run nf-core/methylseq --input '*_R{1,2}.fastq.gz' -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. @@ -124,13 +135,15 @@ First, go to the [nf-core/methylseq releases page](https://github.com/nf-core/me This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. -## Main arguments +## Core Nextflow arguments + +> **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). ### `-profile` Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. -Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Conda) - see below. +Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Conda) - see below. > We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. @@ -142,82 +155,47 @@ They are loaded in sequence, so later profiles can overwrite earlier profiles. If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended. 
* `docker` - * A generic configuration profile to be used with [Docker](http://docker.com/) - * Pulls software from dockerhub: [`nfcore/methylseq`](http://hub.docker.com/r/nfcore/methylseq/) + * A generic configuration profile to be used with [Docker](https://docker.com/) + * Pulls software from Docker Hub: [`nfcore/methylseq`](https://hub.docker.com/r/nfcore/methylseq/) * `singularity` - * A generic configuration profile to be used with [Singularity](http://singularity.lbl.gov/) - * Pulls software from DockerHub: [`nfcore/methylseq`](http://hub.docker.com/r/nfcore/methylseq/) + * A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) + * Pulls software from Docker Hub: [`nfcore/methylseq`](https://hub.docker.com/r/nfcore/methylseq/) +* `podman` + * A generic configuration profile to be used with [Podman](https://podman.io/) + * Pulls software from Docker Hub: [`nfcore/methylseq`](https://hub.docker.com/r/nfcore/methylseq/) * `conda` - * Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker or Singularity. + * Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity or Podman. * A generic configuration profile to be used with [Conda](https://conda.io/docs/) * Pulls most software from [Bioconda](https://bioconda.github.io/) * `test` * A profile with a complete configuration for automated testing * Includes links to test data so needs no other parameters -### `--reads` - -Use this to specify the location of your input FastQ files. For example: - -```bash ---reads 'path/to/data/sample_*_{1,2}.fastq' -``` - -Please note the following requirements: - -1. The path must be enclosed in quotes -2. The path must have at least one `*` wildcard character -3. When using the pipeline with paired end data, the path must use `{1,2}` notation to specify read pairs. 
- -If left unspecified, a default pattern is used: `data/*{1,2}.fastq.gz` - -### `--single_end` - -By default, the pipeline expects paired-end data. If you have single-end data, you need to specify `--single_end` on the command line when you launch the pipeline. A normal glob pattern, enclosed in quotation marks, can then be used for `--reads`. For example: - -```bash ---single_end --reads '*.fastq' -``` - -It is not possible to run a mixture of single-end and paired-end files in one run. - -## Reference genomes - -The pipeline config files come bundled with paths to the illumina iGenomes reference index files. If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource. +### `-resume` -### `--genome` (using iGenomes) +Specify this when restarting a pipeline. Nextflow will use cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously. -There are 31 different species supported in the iGenomes references. To run the pipeline, you must specify which to use with the `--genome` flag. +You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names. -You can find the keys to specify the genomes in the [iGenomes config file](../conf/igenomes.config). Common genomes that are supported are: +### `-c` -* Human - * `--genome GRCh37` - * `--genome GRCh38` -* Mouse - * `--genome GRCm38` -* _Drosophila_ - * `--genome BDGP6` -* _S. cerevisiae_ - * `--genome 'R64-1-1'` +Specify the path to a specific config file (this is a core Nextflow command). See the [nf-core website documentation](https://nf-co.re/usage/configuration) for more information. -> There are numerous others - check the config file for more.
+#### Custom resource requests -Note that you can use the same configuration setup to save sets of reference files for your own use, even if they are not part of the iGenomes resource. See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for instructions on where to save such a file. +Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with an error code of `143` (exceeded requested resources) it will automatically resubmit with higher requests (2 x original, then 3 x original). If it still fails after three times then the pipeline is stopped. -The syntax for this reference configuration is as follows: +Whilst these default requirements will hopefully work for most people with most data, you may find that you want to customise the compute resources that the pipeline requests. You can do this by creating a custom config file. For example, to give the workflow process `bismark_align` 32GB of memory, you could use the following config: ```nextflow -params { - genomes { - 'GRCh37' { - fasta = '' // Used if no index given - } - // Any number of additional genomes, key is used with --genome +process { + withName: bismark_align { + memory = 32.GB } } ``` +<<<<<<< HEAD ### Supplying reference indices If you don't want to use the Illumina iGenomes references, you can supply your own reference genome. @@ -357,188 +335,28 @@ Path to a directory containing needed file for biscuit-QC step. The needed files ### `--min_depth` Specify to specify a minimum read coverage for MethylDackel or biscuit vcf2bed to report a methylation call. +======= +See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for more information. +>>>>>>> 9218f1199bca434af49b54963eea91cfed572597 -### `--meth_cutoff` - -Use this to specify a minimum read coverage to report a methylation call during Bismark's `bismark_methylation_extractor` step.
- -### `--ignore_flags` - -Specify to run MethylDackel with the `--ignore_flags` flag to ignore SAM flags. - -### `--methyl_kit` - -Specify to run MethylDackel with the `--methyl_kit` flag to produce files suitable for use with the methylKit R package. - -### `--known_splices` - -Specify to run Bismark with the `--known-splicesite-infile` flag to run splice-aware alignment using HISAT2. A `.gtf` file has to be provided from which a list of known splicesites is created by the pipeline. (only works with `--aligner bismark_hisat`) - -### `--slamseq` - -Specify to run Bismark with the `--slam` flag to run bismark in [SLAM-seq mode](https://github.com/FelixKrueger/Bismark/blob/master/CHANGELOG.md#slam-seq-mode) (only works with `--aligner bismark_hisat`) - -### `--local_alignment` - -Specify to run Bismark with the `--local` flag to allow soft-clipping of reads. This should only be used with care in certain single-cell applications or PBAT libraries, which may produce chimeric read pairs. (See [Wu et al.](https://doi.org/10.1093/bioinformatics/btz125) (doesn't work with `--aligner bwameth`) - -### `--bismark_align_cpu_per_multicore` - -The pipeline makes use of the `--multicore` option for Bismark align. When using this option, -Bismark uses a large number of CPUs for every `--multicore` specified. The pipeline -calculates the number of `--multicore` based on the resources available to the task. -It divides the available CPUs by 3, or by 5 if any of `--single_cell`, `--zymo` or `--non_directional` -are specified. This is based on usage for a typical mouse genome. - -You may find when running the pipeline that Bismark is not using this many CPUs. To fine tune the -usage and speed, you can specify an integer with `--bismark_align_cpu_per_multicore` and the pipeline -will divide the available CPUs by this value instead. - -See the [bismark documentation](https://github.com/FelixKrueger/Bismark/tree/master/Docs#alignment) -for more information. 
- -### `--bismark_align_mem_per_multicore` - -Exactly as above, but for memory. By default, the pipeline divides the available memory by `13.GB`, -or `18.GB` if any of `--single_cell`, `--zymo` or `--non_directional` are specified. - -Note that the final `--multicore` value is based on the lowest limiting factor of both CPUs and memory. - -## Job resources - -### Automatic resubmission - -Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with an error code of `143` (exceeded requested resources) it will automatically resubmit with higher requests (2 x original, then 3 x original). If it still fails after three times then the pipeline is stopped. - -### Custom resource requests - -Wherever process-specific requirements are set in the pipeline, the default value can be changed by creating a custom config file. See the files hosted at [`nf-core/configs`](https://github.com/nf-core/configs/tree/master/conf) for examples. - -If you are likely to be running `nf-core` pipelines regularly it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository. Before you do this please can you test that the config file works with your pipeline of choice using the `-c` parameter (see definition below). You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile. - -If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack). 
+If you are likely to be running `nf-core` pipelines regularly it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository. Before you do this please can you test that the config file works with your pipeline of choice using the `-c` parameter (see definition above). You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile. -## AWS Batch specific parameters +If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack) on the [`#configs` channel](https://nfcore.slack.com/channels/configs). -Running the pipeline on AWS Batch requires a couple of specific parameters to be set according to your AWS Batch configuration. Please use [`-profile awsbatch`](https://github.com/nf-core/configs/blob/master/conf/awsbatch.config) and then specify all of the following parameters. +### Running in the background -### `--awsqueue` +Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished. -The JobQueue that you intend to use on AWS Batch. +The Nextflow `-bg` flag launches Nextflow in the background, detached from your terminal so that the workflow does not stop if you log out of your session. The logs are saved to a file. -### `--awsregion` +Alternatively, you can use `screen` / `tmux` or similar tool to create a detached session which you can log back into at a later time. +Some HPC setups also allow you to run nextflow within a cluster job submitted to your job scheduler (from where it submits more jobs). -The AWS region in which to run your job. Default is set to `eu-west-1` but can be adjusted to your needs.
+#### Nextflow memory requirements -### `--awscli` - -The [AWS CLI](https://www.nextflow.io/docs/latest/awscloud.html#aws-cli-installation) path in your custom AMI. Default: `/home/ec2-user/miniconda/bin/aws`. - -Please make sure to also set the `-w/--work-dir` and `--outdir` parameters to a S3 storage bucket of your choice - you'll get an error message notifying you if you didn't. - -## Other command line parameters - -### `--outdir` - -The output directory where the results will be saved. - -### `--email` - -Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run. - -### `--email_on_fail` - -This works exactly as with `--email`, except emails are only sent if the workflow is not successful. - -### `--max_multiqc_email_size` - -Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB). - -### `-name` - -Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. - -This is used in the MultiQC report (if not default) and in the summary HTML / e-mail (always). - -**NB:** Single hyphen (core Nextflow option) - -### `-resume` - -Specify this when restarting a pipeline. Nextflow will used cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously. - -You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names. - -**NB:** Single hyphen (core Nextflow option) - -### `-c` - -Specify the path to a specific config file (this is a core NextFlow command). - -**NB:** Single hyphen (core Nextflow option) - -Note - you can use this to override pipeline defaults. 
- -### `--custom_config_version` - -Provide git commit id for custom Institutional configs hosted at `nf-core/configs`. This was implemented for reproducibility purposes. Default: `master`. +In some cases, the Nextflow Java virtual machines can start to request a large amount of memory. +We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~/.bash_profile`): ```bash -## Download and use config file with following git commid id ---custom_config_version d52db660777c4bf36546ddb188ec530c3ada1b96 -``` - -### `--custom_config_base` - -If you're running offline, nextflow will not be able to fetch the institutional config files -from the internet. If you don't need them, then this is not a problem. If you do need them, -you should download the files from the repo and tell nextflow where to find them with the -`custom_config_base` option. For example: - -```bash -## Download and unzip the config files -cd /path/to/my/configs -wget https://github.com/nf-core/configs/archive/master.zip -unzip master.zip - -## Run the pipeline -cd /path/to/my/data -nextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs-master/ +NXF_OPTS='-Xms1g -Xmx4g' ``` - -> Note that the nf-core/tools helper package has a `download` command to download all required pipeline -> files + singularity containers + institutional configs in one go for you, to make this process easier. - -### `--max_memory` - -Use to set a top-limit for the default memory requirement for each process. -Should be a string in the format integer-unit. eg. `--max_memory '8.GB'` - -### `--max_time` - -Use to set a top-limit for the default time requirement for each process. -Should be a string in the format integer-unit. eg. `--max_time '2.h'` - -### `--max_cpus` - -Use to set a top-limit for the default CPU requirement for each process. -Should be a string in the format integer-unit. eg.
`--max_cpus 1` - -### `--plaintext_email` - -Set to receive plain-text e-mails instead of HTML formatted. - -### `--multiqc_config` - -If you would like to supply a custom config file to MultiQC, you can specify a path with `--multiqc_config`. This is used _instead of_ the [config file](../conf/multiqc_config.yaml) that comes with the pipeline. - -### `--monochrome_logs` - -Set to disable colourful command line output and live life in monochrome. - -### `--project` - -UPPMAX profile only: Cluster project for SLURM job submissions. - -### `--clusterOptions` - -UPPMAX profile only: Submit arbitrary SLURM options. diff --git a/environment.yml b/environment.yml index fc621b09..05508e41 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: nf-core-methylseq-1.6dev +name: nf-core-methylseq-1.5 channels: - conda-forge - bioconda @@ -15,20 +15,21 @@ dependencies: - bioconda::fastqc=0.11.9 # Default bismark pipeline - bioconda::trim-galore=0.6.5 - - bioconda::samtools=1.9 + - bioconda::samtools=1.10 - bioconda::bowtie2=2.3.5 - bioconda::hisat2=2.2.0 - - bioconda::bismark=0.22.3 + - bioconda::bismark=0.23.0 - bioconda::qualimap=2.2.2d - bioconda::preseq=2.0.3 - - bioconda::multiqc=1.8 + - bioconda::multiqc=1.9 # bwa-meth pipeline - bioconda::picard=2.22.2 - bioconda::bwameth=0.2.2 - - bioconda::methyldackel=0.5.0 - + - bioconda::methyldackel=0.5.1 # added - bioconda::samblaster=0.1.24 - bioconda::bedtools=2.29.1 -# - bioconda::biscuit=0.3.15 - - bioconda::biscuit=0.3.15.20200318 + - bioconda::biscuit=0.3.16.20200420=h2b0c03c_3 + - bioconda::bcftools=1.10 + - conda-forge::parallel + - gawk=5.1.0 diff --git a/main.nf b/main.nf index 59278f39..452b193f 100644 --- a/main.nf +++ b/main.nf @@ -20,77 +20,84 @@ def helpMessage() { nextflow run nf-core/methylseq --reads '*_R{1,2}.fastq.gz' -profile docker Mandatory arguments: - --aligner [str] Alignment 
tool to use (default: bismark) - Available: bismark, bismark_hisat, bwameth, biscuit - --reads [file] Path to input data (must be surrounded with quotes) - -profile [str] Configuration profile to use. Can use multiple (comma separated) - Available: conda, docker, singularity, test, awsbatch, and more + --aligner [str] Alignment tool to use (default: bismark) + Available: bismark, bismark_hisat, bwameth, biscuit + --reads [file] Path to input data (must be surrounded with quotes) + -profile [str] Configuration profile to use. Can use multiple (comma separated) + Available: conda, docker, singularity, test, awsbatch, and more Options: - --genome [str] Name of iGenomes reference - --single_end [bool] Specifies that the input is single end reads - --comprehensive [bool] Output information for all cytosine contexts - --cytosine_report [bool] Output stranded cytosine report during Bismark's bismark_methylation_extractor step. - --ignore_flags [bool] Run MethylDackel with the flag to ignore SAM flags. - --meth_cutoff [int] Specify a minimum read coverage to report a methylation call during Bismark's bismark_methylation_extractor step. - --min_depth [int] Specify a minimum read coverage for MethylDackel to report a methylation call or for biscuit pileup. - --methyl_kit [bool] Run MethylDackel with the --methyl_kit flag to produce files suitable for use with the methylKit R package. - --skip_deduplication [bool] Skip deduplication step after alignment. 
This is turned on automatically if --rrbs is specified - --non_directional [bool] Run alignment against all four possible strands - --save_align_intermeds [bool] Save aligned intermediates to results directory - --save_trimmed [bool] Save trimmed reads to results directory - --save_pileup_file [bool] Save vcf-pileup and index-vcf files from biscuit aligner to results directory + --genome [str] Name of iGenomes reference + --single_end [bool] Specifies that the input is single end reads + --comprehensive [bool] Output information for all cytosine contexts + --cytosine_report [bool] Output stranded cytosine report during Bismark's bismark_methylation_extractor step. + --ignore_flags [bool] Run MethylDackel with the flag to ignore SAM flags. + --meth_cutoff [int] Specify a minimum read coverage to report a methylation call during Bismark's bismark_methylation_extractor step. + --min_depth [int] Specify a minimum read coverage for MethylDackel to report a methylation call (used in bwa-meth) + --min_coverage [int] Specify a minimum read covarage for information extraction from the VCF file to bed file (used in BISCUIT) + --methyl_kit [bool] Run MethylDackel with the --methyl_kit flag to produce files suitable for use with the methylKit R package. + --skip_deduplication [bool] Skip deduplication step after alignment. This is turned on automatically if --rrbs is specified + --non_directional [bool] Run alignment against all four possible strands for Bismark aligner + --nondirectional_library [bool] Run alignment against all four possible strands for Biscuit aligner + --save_align_intermeds [bool] Save aligned intermediates to results directory + --save_trimmed [bool] Save trimmed reads to results directory + --save_pileup_file [bool] Save VCF-pileup and VCF-index files from biscuit aligner to results directory --save_snp_file [bool] Save SNP bed-file from biscuit to results directory. 
Relevant only if '--epiread' is specified - --unmapped [bool] Save unmapped reads to fastq files - --relax_mismatches [bool] Turn on to relax stringency for alignment (set allowed penalty with --num_mismatches) - --num_mismatches [float] 0.6 will allow a penalty of bp * -0.6 - for 100bp reads (bismark default is 0.2) - --known_splices [file] Supply a .gtf file containing known splice sites (bismark_hisat only) - --slamseq [bool] Run bismark in SLAM-seq mode - --local_alignment [bool] Allow soft-clipping of reads (potentially useful for single-cell experiments) + --unmapped [bool] Save unmapped reads to fastq files + --relax_mismatches [bool] Turn on to relax stringency for alignment (set allowed penalty with --num_mismatches) + --num_mismatches [float] 0.6 will allow a penalty of bp * -0.6 - for 100bp reads (bismark default is 0.2) + --known_splices [file] Supply a .gtf file containing known splice sites (bismark_hisat only) + --slamseq [bool] Run bismark in SLAM-seq mode + --local_alignment [bool] Allow soft-clipping of reads (potentially useful for single-cell experiments) + --minins [int] Bismark: The minimum insert size for valid paired-end alignments. + --maxins [int] Bismark: The maximum insert size for valid paired-end alignments. 
--bismark_align_cpu_per_multicore [int] Specify how many CPUs are required per --multicore for bismark align (default = 3) --bismark_align_mem_per_multicore [str] Specify how much memory is required per --multicore for bismark align (default = 13.GB) - --assets_dir [path] Path to assets directory for biscuit_QC - --epiread [bool] Convert bam to biscuit epiread format - --whitelist [file] The complement of blacklist, needed for SNP extraction For more instuctions: https://www.cse.huji.ac.il/~ekushele/assets.html#whitelist - --common_dbsnp [file] Common dbSNP for the relevant genome, for SNP filteration - --debug_epiread Debug epiread merging for paired end-keep original epiread file and merged epiread file in debug mode - --debug_epiread_merging Debug epiread merging. Output merged epiread in debug mode - - References If not specified in the configuration file or you wish to overwrite any of the references. - --fasta [file] Path to fasta reference - --fasta_index [path] Path to Fasta Index - --bismark_index [path] Path to Bismark index - --bwa_biscuit_index [path] Path to Biscuit index - --bwa_meth_index [path] Path to bwameth index - --save_reference [bool] Save reference(s) to results directory + --assets_dir [path] Path to assets directory for biscuit_QC + --epiread [bool] Convert bam to biscuit epiread format + --whitelist [file] Path to the file that is the complement of blacklist, needed for SNP extraction For more instuctions: https://www.cse.huji.ac.il/~ekushele/assets.html#whitelist + --common_dbsnp [file] Common dbSNP table of the relevant genome, for SNP filteration + --debug_epiread Debug epiread merging for paired end-keep original epiread file and merged epiread file in debug mode + --debug_epiread_merging Debug epiread merging. Output merged epiread in debug mode + + References If not specified in the configuration file or you wish to overwrite any of the references. 
+ --fasta [file] Path to fasta reference + --fasta_index [path] Path to Fasta Index + --bismark_index [path] Path to Bismark index + --bwa_biscuit_index [path] Path to Biscuit index + --bwa_meth_index [path] Path to bwameth index + --save_reference [bool] Save reference(s) to results directory Trimming options: - --skip_trimming [bool] Skip read trimming - --clip_r1 [int] Trim the specified number of bases from the 5' end of read 1 (or single-end reads). - --clip_r2 [int] Trim the specified number of bases from the 5' end of read 2 (paired-end only). - --three_prime_clip_r1 [int] Trim the specified number of bases from the 3' end of read 1 AFTER adapter/quality trimming - --three_prime_clip_r2 [int] Trim the specified number of bases from the 3' end of read 2 AFTER adapter/quality trimming - --rrbs [bool] Turn on if dealing with MspI digested material. + --skip_trimming [bool] Skip read trimming + --clip_r1 [int] Trim the specified number of bases from the 5' end of read 1 (or single-end reads). + --clip_r2 [int] Trim the specified number of bases from the 5' end of read 2 (paired-end only). + --three_prime_clip_r1 [int] Trim the specified number of bases from the 3' end of read 1 AFTER adapter/quality trimming + --three_prime_clip_r2 [int] Trim the specified number of bases from the 3' end of read 2 AFTER adapter/quality trimming + --rrbs [bool] Turn on if dealing with MspI digested material. 
Trimming presets: --pbat [bool] --single_cell [bool] --epignome [bool] - --accell [bool] + --accel [bool] --zymo [bool] --cegx [bool] + --em_seq [bool] + Other options: - --outdir [file] The output directory where the results will be saved - --email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits - --email_on_fail [email] Same as --email, except only send mail if the workflow is not successful - --max_multiqc_email_size [str] Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) - -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic + --outdir [file] The output directory where the results will be saved + --publish_dir_mode [str] Mode for publishing results in the output directory. Available: symlink, rellink, link, copy, copyNoFollow, move (Default: copy) + --email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits + --email_on_fail [email] Same as --email, except only send mail if the workflow is not successful + --max_multiqc_email_size [str] Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) + -name [str] Name for the pipeline run. 
If not specified, Nextflow will automatically generate a random mnemonic AWSBatch options: - --awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch - --awsregion [str] The AWS Region for your AWS Batch job to run on - --awscli [str] Path to the AWS CLI tool + --awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch + --awsregion [str] The AWS Region for your AWS Batch job to run on + --awscli [str] Path to the AWS CLI tool """.stripIndent() } @@ -121,10 +128,10 @@ if (params.genomes && params.genome && !params.genomes.containsKey(params.genome } Channel - .fromPath("$baseDir/assets/where_are_my_files.txt", checkIfExists: true) + .fromPath("$projectDir/assets/where_are_my_files.txt", checkIfExists: true) .into { ch_wherearemyfiles_for_trimgalore; ch_wherearemyfiles_for_alignment } -ch_splicesites_for_bismark_hisat_align = params.known_splices ? Channel.fromPath("${params.known_splices}", checkIfExists: true).collect() : file('null') +ch_splicesites_for_bismark_hisat_align = params.known_splices ? Channel.fromPath(params.known_splices, checkIfExists: true).collect() : [] if( params.aligner =~ /bismark/ ){ assert params.bismark_index || params.fasta : "No reference genome index or fasta file specified" @@ -164,7 +171,7 @@ else if( params.aligner == 'bwameth' || params.aligner == 'biscuit'){ Channel .fromPath("${params.bwa_biscuit_index}*", checkIfExists: true) .ifEmpty { exit 1, "bwa (biscuit) index file(s) not found: ${params.bwa_biscuit_index}" } - .set { ch_bwa_index_for_biscuit } + .set { ch_bwa_index_for_biscuit } ch_fasta_for_makeBwaMemIndex.close() } @@ -190,17 +197,19 @@ if( workflow.profile == 'uppmax' || workflow.profile == 'uppmax_devel' ){ } // Has the run name been specified by the user? 
-// this has the bonus effect of catching both -name and --name +// this has the bonus effect of catching both -name and --name custom_runName = params.name if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { custom_runName = workflow.runName } -// Trimming presets +// Trimming / kit presets clip_r1 = params.clip_r1 clip_r2 = params.clip_r2 three_prime_clip_r1 = params.three_prime_clip_r1 three_prime_clip_r2 = params.three_prime_clip_r2 +bismark_minins = params.minins +bismark_maxins = params.maxins if(params.pbat){ clip_r1 = 9 clip_r2 = 9 @@ -231,7 +240,15 @@ else if( params.cegx ){ three_prime_clip_r1 = 2 three_prime_clip_r2 = 2 } +else if( params.em_seq ){ + bismark_maxins = 1000 + clip_r1 = 8 + clip_r2 = 8 + three_prime_clip_r1 = 8 + three_prime_clip_r2 = 8 +} +// Check AWS batch settings if (workflow.profile.contains('awsbatch')) { // AWSBatch sanity checking if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" @@ -243,32 +260,33 @@ if (workflow.profile.contains('awsbatch')) { } // Stage config files -ch_multiqc_config = file("$baseDir/assets/multiqc_config.yaml", checkIfExists: true) +ch_multiqc_config = file("$projectDir/assets/multiqc_config.yaml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? 
Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() -ch_output_docs = file("$baseDir/docs/output.md", checkIfExists: true) +ch_output_docs = file("$projectDir/docs/output.md", checkIfExists: true) +ch_output_docs_images = file("$projectDir/docs/images/", checkIfExists: true) /* * Create a channel for input read files */ -if (params.readPaths) { +if (params.input_paths) { if (params.single_end) { Channel - .from(params.readPaths) + .from(params.input_paths) .map { row -> [ row[0], [ file(row[1][0], checkIfExists: true) ] ] } - .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } - .into { ch_read_files_for_fastqc; ch_read_files_for_trim_galore } + .ifEmpty { exit 1, "params.input_paths was empty - no input files supplied" } + .into { ch_read_files_for_fastqc; ch_read_files_trimming } } else { Channel - .from(params.readPaths) + .from(params.input_paths) .map { row -> [ row[0], [ file(row[1][0], checkIfExists: true), file(row[1][1], checkIfExists: true) ] ] } - .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } - .into { ch_read_files_for_fastqc; ch_read_files_for_trim_galore } + .ifEmpty { exit 1, "params.input_paths was empty - no input files supplied" } + .into { ch_read_files_for_fastqc; ch_read_files_trimming } } } else { Channel .fromFilePairs( params.reads, size: params.single_end ? 1 : 2 ) .ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --single_end on the command line." 
} - .into { ch_read_files_for_fastqc; ch_read_files_for_trim_galore } + .into { ch_read_files_for_fastqc; ch_read_files_trimming } } if (params.epiread) { @@ -281,7 +299,7 @@ if (params.epiread) { if (params.common_dbsnp) { Channel - .fromPath(params.common_dbsnp, checkIfExists: true) + .fromPath(params.common_dbsnp, checkIfExists: true) .ifEmpty { exit 1, "Cannot find any dbSNP file matching: ${params.common_dbsnp}\n" } .set { ch_commonSNP_for_SNP; } } @@ -289,73 +307,81 @@ if (params.epiread) { // Header log info log.info nfcoreHeader() def summary = [:] -summary['Run Name'] = custom_runName ?: workflow.runName -summary['Reads'] = params.reads -summary['Aligner'] = params.aligner +if (workflow.revision) summary['Pipeline Release'] = workflow.revision +summary['Run Name'] = custom_runName ?: workflow.runName +summary['Input'] = params.input +summary['Aligner'] = params.aligner summary['Data Type'] = params.single_end ? 'Single-End' : 'Paired-End' -if(params.known_splices) summary['Spliced alignment'] = 'Yes' -if(params.slamseq) summary['SLAM-seq'] = 'Yes' -if(params.local_alignment) summary['Local alignment'] = 'Yes' -if(params.genome) summary['Genome'] = params.genome -if(params.bismark_index) summary['Bismark Index'] = params.bismark_index -if(params.bwa_meth_index) summary['BWA-Meth Index'] = "${params.bwa_meth_index}*" -if(params.bwa_biscuit_index) summary['BWA Index'] = "${params.bwa_biscuit_index}*" -if(params.fasta) summary['Fasta Ref'] = params.fasta -if(params.fasta_index) summary['Fasta Index'] = params.fasta_index -if(params.rrbs) summary['RRBS Mode'] = 'On' -if(params.relax_mismatches) summary['Mismatch Func'] = "L,0,-${params.num_mismatches} (Bismark default = L,0,-0.2)" -if(params.skip_trimming) summary['Trimming Step'] = 'Skipped' -if(params.pbat) summary['Trim Profile'] = 'PBAT' -if(params.single_cell) summary['Trim Profile'] = 'Single Cell' -if(params.epignome) summary['Trim Profile'] = 'TruSeq (EpiGnome)' -if(params.accel) summary['Trim Profile'] 
= 'Accel-NGS (Swift)' -if(params.zymo) summary['Trim Profile'] = 'Zymo Pico-Methyl' -if(params.cegx) summary['Trim Profile'] = 'CEGX' -summary['Trimming'] = "5'R1: $clip_r1 / 5'R2: $clip_r2 / 3'R1: $three_prime_clip_r1 / 3'R2: $three_prime_clip_r2" -summary['Deduplication'] = params.skip_deduplication || params.rrbs ? 'No' : 'Yes' -summary['Directional Mode'] = params.single_cell || params.zymo || params.non_directional ? 'No' : 'Yes' -summary['All C Contexts'] = params.comprehensive ? 'Yes' : 'No' -summary['Cytosine report'] = params.cytosine_report ? 'Yes' : 'No' -if(params.min_depth) summary['Minimum Depth'] = params.min_depth -if(params.ignore_flags) summary['MethylDackel'] = 'Ignoring SAM Flags' -if(params.methyl_kit) summary['MethylDackel'] = 'Producing methyl_kit output' -save_intermeds = []; -if(params.save_reference) save_intermeds.add('Reference genome build') -if(params.save_trimmed) save_intermeds.add('Trimmed FastQ files') -if(params.unmapped) save_intermeds.add('Unmapped reads') +if(params.known_splices) summary['Spliced alignment'] = 'Yes' +if(params.slamseq) summary['SLAM-seq'] = 'Yes' +if(params.local_alignment) summary['Local alignment'] = 'Yes' +if(params.genome) summary['Genome'] = params.genome +if(params.bismark_index) summary['Bismark Index'] = params.bismark_index +if(params.bwa_meth_index) summary['BWA-Meth Index'] = "${params.bwa_meth_index}*" +if(params.bwa_biscuit_index)summary['BWA Index'] = "${params.bwa_biscuit_index}*" +if(params.fasta) summary['Fasta Ref'] = params.fasta +if(params.fasta_index) summary['Fasta Index'] = params.fasta_index +if(params.rrbs) summary['RRBS Mode'] = 'On' +if(params.relax_mismatches) summary['Mismatch Func'] = "L,0,-${params.num_mismatches} (Bismark default = L,0,-0.2)" +if(params.skip_trimming) summary['Trimming Step'] = 'Skipped' +if(params.pbat) summary['Trim Profile'] = 'PBAT' +if(params.single_cell) summary['Trim Profile'] = 'Single Cell' +if(params.epignome) summary['Trim Profile'] = 'TruSeq 
(EpiGnome)' +if(params.accel) summary['Trim Profile'] = 'Accel-NGS (Swift)' +if(params.zymo) summary['Trim Profile'] = 'Zymo Pico-Methyl' +if(params.cegx) summary['Trim Profile'] = 'CEGX' +summary['Trimming'] = "5'R1: $clip_r1 / 5'R2: $clip_r2 / 3'R1: $three_prime_clip_r1 / 3'R2: $three_prime_clip_r2" +summary['Deduplication'] = params.skip_deduplication || params.rrbs ? 'No' : 'Yes' +summary['Directional Mode'] = params.single_cell || params.zymo || params.non_directional || params.nondirectional_library ? 'No' : 'Yes' +summary['All C Contexts'] = params.comprehensive ? 'Yes' : 'No' +summary['Cytosine report'] = params.cytosine_report ? 'Yes' : 'No' +if(params.min_depth) summary['Minimum Depth'] = params.min_depth +if(params.min_coverage) summary['Minimum Coverage'] = params.min_coverage + +if(params.ignore_flags) summary['MethylDackel'] = 'Ignoring SAM Flags' +if(params.methyl_kit) summary['MethylDackel'] = 'Producing methyl_kit output' +save_intermeds = []; +if(params.save_reference) save_intermeds.add('Reference genome build') +if(params.save_trimmed) save_intermeds.add('Trimmed FastQ files') +if(params.unmapped) save_intermeds.add('Unmapped reads') if(params.save_align_intermeds) save_intermeds.add('Intermediate BAM files') -if(params.save_pileup_file) save_intermeds.add('Pileup files') -if(params.save_snp_file) save_intermeds.add('SNP bed-files') +if(params.save_pileup_file) save_intermeds.add('Pileup files') +if(params.save_snp_file) save_intermeds.add('SNP bed-files') if(save_intermeds.size() > 0) summary['Save Intermediates'] = save_intermeds.join(', ') debug_mode = []; if(params.debug_epiread) debug_mode.add('Debug epiread step') if(params.debug_epiread_merging) debug_mode.add('Debug epiread merging') if(debug_mode.size() > 0) summary['Debug mode'] = debug_mode.join(', ') +if(params.minins) summary['Bismark min insert size'] = bismark_minins +if(params.maxins || params.em_seq) summary['Bismark max insert size'] = bismark_maxins 
if(params.bismark_align_cpu_per_multicore) summary['Bismark align CPUs per --multicore'] = params.bismark_align_cpu_per_multicore if(params.bismark_align_mem_per_multicore) summary['Bismark align memory per --multicore'] = params.bismark_align_mem_per_multicore -if(params.assets_dir) summary['Assets Directory'] = params.assets_dir -if(params.whitelist) summary['Whitelist'] = params.whitelist -if(params.common_dbsnp) summary['Common SNP'] = params.common_dbsnp -if(params.epiread) summary['Epiread'] = 'Yes' -summary['Output dir'] = params.outdir -summary['Launch dir'] = workflow.launchDir -summary['Working dir'] = workflow.workDir -summary['Pipeline dir'] = workflow.projectDir -summary['User'] = workflow.userName -summary['Config Profile'] = workflow.profile +if(params.assets_dir) summary['Assets Directory'] = params.assets_dir +if(params.whitelist) summary['Whitelist'] = params.whitelist +if(params.common_dbsnp) summary['Common SNP'] = params.common_dbsnp +if(params.epiread) summary['Epiread'] = 'Yes' +summary['Output dir'] = params.outdir +summary['Launch dir'] = workflow.launchDir +summary['Working dir'] = workflow.workDir +summary['Pipeline dir'] = workflow.projectDir +summary['User'] = workflow.userName if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" if (workflow.profile.contains('awsbatch')) { - summary['AWS Region'] = params.awsregion - summary['AWS Queue'] = params.awsqueue - summary['AWS CLI'] = params.awscli + summary['AWS Region'] = params.awsregion + summary['AWS Queue'] = params.awsqueue + summary['AWS CLI'] = params.awscli } if(params.project) summary['Cluster Project'] = params.project if (params.config_profile_description) summary['Config Description'] = params.config_profile_description -if (params.config_profile_contact) summary['Config Contact'] = params.config_profile_contact -if (params.config_profile_url) summary['Config URL'] = params.config_profile_url -summary['Max Resources'] = 
"$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" -if(params.email) summary['E-mail Address'] = params.email -if(params.email_on_fail) summary['E-mail on failure'] = params.email_on_fail +if (params.config_profile_contact) summary['Config Contact'] = params.config_profile_contact +if (params.config_profile_url) summary['Config URL'] = params.config_profile_url +summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" +summary['Config Files'] = workflow.configFiles.join(', ') +if (params.email || params.email_on_fail) { + summary['E-mail Address'] = params.email + summary['E-mail on failure'] = params.email_on_fail + summary['MultiQC maxsize'] = params.max_multiqc_email_size +} log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n") log.info "-\033[2m--------------------------------------------------\033[0m-" @@ -363,20 +389,20 @@ log.info "-\033[2m--------------------------------------------------\033[0m-" checkHostname() Channel.from(summary.collect{ [it.key, it.value] }) - .map { k,v -> "
$k
${v ?: 'N/A'}
" } - .reduce { a, b -> return [a, b].join("\n ") } - .map { x -> """ - id: 'nf-core-methylseq-summary' - description: " - this information is collected when the pipeline is started." - section_name: 'nf-core/methylseq Workflow Summary' - section_href: 'https://github.com/nf-core/methylseq' - plot_type: 'html' - data: | -
- $x -
- """.stripIndent() } - .set { ch_workflow_summary } + .map { k,v -> "
$k
${v ?: 'N/A'}
" } + .reduce { a, b -> return [a, b].join("\n ") } + .map { x -> """ + id: 'nf-core-methylseq-summary' + description: " - this information is collected when the pipeline is started." + section_name: 'nf-core/methylseq Workflow Summary' + section_href: 'https://github.com/nf-core/methylseq' + plot_type: 'html' + data: | +
+ $x +
+ """.stripIndent() } + .set { ch_workflow_summary } /* * Parse software version numbers @@ -394,7 +420,7 @@ process get_software_versions { script: """ - echo "$workflow.manifest.version" &> v_ngi_methylseq.txt + echo "$workflow.manifest.version" &> v_pipeline.txt echo "$workflow.nextflow.version" &> v_nextflow.txt bismark_genome_preparation --version &> v_bismark_genome_preparation.txt fastqc --version &> v_fastqc.txt @@ -418,8 +444,12 @@ process get_software_versions { preseq &> v_preseq.txt multiqc --version &> v_multiqc.txt samblaster --version &> v_samblaster.txt - biscuit &>v_biscuit.txt 2>&1 || true - scrape_software_versions.py &> software_versions_mqc.yaml + biscuit &>v_biscuit.txt 2>&1 || true + bcftools --version &> v_bcftools.txt + bedtools --version &> v_bedtools.txt + parallel --version &> v_parallel.txt + gawk --version > v_gawk.txt + scrape_software_versions.py &> software_versions_mqc.yaml """ } @@ -429,8 +459,7 @@ process get_software_versions { if( !params.bismark_index && params.aligner =~ /bismark/ ){ process makeBismarkIndex { publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir }, - saveAs: { params.save_reference ? it : null }, mode: 'copy' - + saveAs: { params.save_reference ? it : null }, mode: params.publish_dir_mode input: file fasta from ch_fasta_for_makeBismarkIndex @@ -454,7 +483,7 @@ if( !params.bismark_index && params.aligner =~ /bismark/ ){ if( !params.bwa_meth_index && params.aligner == 'bwameth'){ process makeBwaMemIndex { tag "$fasta" - publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? it : null }, mode: 'copy' + publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? 
it : null }, mode: params.publish_dir_mode input: file fasta from ch_fasta_for_makeBwaMemIndex @@ -470,12 +499,12 @@ if( !params.bwa_meth_index && params.aligner == 'bwameth'){ } /* - * PREPROCESSING - Build bwa index, using biscuit + * PREPROCESSING - Build bwa-biscuit index, using biscuit */ if(!params.bwa_biscuit_index && params.aligner == 'biscuit' ){ process makeBwaBISCUITIndex { tag "$fasta" - publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? it : null }, mode: 'copy' + publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? it : null }, mode: params.publish_dir_mode input: file fasta from ch_fasta_for_makeBwaMemIndex @@ -499,7 +528,7 @@ if(!params.bwa_biscuit_index && params.aligner == 'biscuit' ){ if( !params.fasta_index && params.aligner == 'bwameth' || !params.fasta_index && params.aligner == 'biscuit' ){ process makeFastaIndex { tag "$fasta" - publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? it : null }, mode: 'copy' + publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? it : null }, mode: params.publish_dir_mode input: file fasta from ch_fasta_for_makeFastaIndex @@ -520,7 +549,7 @@ if( !params.fasta_index && params.aligner == 'bwameth' || !params.fasta_index & if( !params.assets_dir && params.aligner == 'biscuit' ) { process buildBiscuitQCAssets { tag "$fasta" - publishDir path: "${params.outdir}/reference_assets", saveAs: { params.save_reference ? it : null }, mode: 'copy' + publishDir path: "${params.outdir}/reference_assets", saveAs: { params.save_reference ? 
it : null }, mode: params.publish_dir_mode input: file fasta from ch_fasta_for_buildBiscuitQCAssets @@ -545,7 +574,7 @@ if( !params.assets_dir && params.aligner == 'biscuit' ) { process fastqc { tag "$name" label 'process_medium' - publishDir "${params.outdir}/fastqc", mode: 'copy', + publishDir "${params.outdir}/fastqc", mode: params.publish_dir_mode, saveAs: { filename -> filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename" } @@ -566,12 +595,12 @@ process fastqc { * STEP 2 - Trim Galore! */ if( params.skip_trimming ){ - ch_trimmed_reads_for_alignment = ch_read_files_for_trim_galore + ch_trimmed_reads_for_alignment = ch_read_files_trimming ch_trim_galore_results_for_multiqc = Channel.from(false) } else { process trim_galore { tag "$name" - publishDir "${params.outdir}/trim_galore", mode: 'copy', + publishDir "${params.outdir}/trim_galore", mode: params.publish_dir_mode, saveAs: {filename -> if( filename.indexOf("_fastqc") > 0 ) "FastQC/$filename" else if( filename.indexOf("trimming_report.txt" ) > 0) "logs/$filename" @@ -581,7 +610,7 @@ if( params.skip_trimming ){ } input: - set val(name), file(reads) from ch_read_files_for_trim_galore + set val(name), file(reads) from ch_read_files_trimming file wherearemyfiles from ch_wherearemyfiles_for_trimgalore.collect() output: @@ -623,7 +652,7 @@ if( params.skip_trimming ){ if( params.aligner =~ /bismark/ ){ process bismark_align { tag "$name" - publishDir "${params.outdir}/bismark_alignments", mode: 'copy', + publishDir "${params.outdir}/bismark_alignments", mode: params.publish_dir_mode, saveAs: {filename -> if( filename.indexOf(".fq.gz") > 0 ) "unmapped/$filename" else if( filename.indexOf("report.txt") > 0 ) "logs/$filename" @@ -639,7 +668,7 @@ if( params.aligner =~ /bismark/ ){ file knownsplices from ch_splicesites_for_bismark_hisat_align output: - set val(name), file("*.bam") into ch_bam_for_bismark_deduplicate, ch_bam_for_bismark_summary, ch_bam_for_samtools_sort_index_flagstat + set val(name), 
file("*.bam") into ch_bam_for_bismark_deduplicate, ch_bam_for_bismark_summary, ch_bam_for_preseq set val(name), file("*report.txt") into ch_bismark_align_log_for_bismark_report, ch_bismark_align_log_for_bismark_summary, ch_bismark_align_log_for_multiqc file "*.fq.gz" optional true file "where_are_my_files.txt" @@ -658,6 +687,8 @@ if( params.aligner =~ /bismark/ ){ unmapped = params.unmapped ? "--unmapped" : '' mismatches = params.relax_mismatches ? "--score_min L,0,-${params.num_mismatches}" : '' soft_clipping = params.local_alignment ? "--local" : '' + minins = bismark_minins ? "--minins $bismark_minins" : '' + maxins = bismark_maxins ? "--maxins $bismark_maxins" : '' // Try to assign sensible bismark memory units according to what the task was given multicore = '' @@ -696,14 +727,14 @@ if( params.aligner =~ /bismark/ ){ """ bismark $input \\ $aligner \\ - --bam $pbat $non_directional $unmapped $mismatches $multicore \\ + --bam $pbat $non_directional $unmapped $mismatches $multicore $minins $maxins \\ --genome $index \\ $reads \\ $soft_clipping \\ $splicesites """ } - + /* * STEP 4 - Bismark deduplicate */ @@ -715,14 +746,14 @@ if( params.aligner =~ /bismark/ ){ } else { process bismark_deduplicate { tag "$name" - publishDir "${params.outdir}/bismark_deduplicated", mode: 'copy', + publishDir "${params.outdir}/bismark_deduplicated", mode: params.publish_dir_mode, saveAs: {filename -> filename.indexOf(".bam") == -1 ? 
"logs/$filename" : "$filename"} input: set val(name), file(bam) from ch_bam_for_bismark_deduplicate output: - set val(name), file("*.deduplicated.bam") into ch_bam_dedup_for_bismark_methXtract,ch_bam_dedup_for_qualimap + set val(name), file("*.deduplicated.bam") into ch_bam_dedup_for_bismark_methXtract, ch_bam_dedup_for_qualimap set val(name), file("*.deduplication_report.txt") into ch_bismark_dedup_log_for_bismark_report, ch_bismark_dedup_log_for_bismark_summary, ch_bismark_dedup_log_for_multiqc script: @@ -738,7 +769,7 @@ if( params.aligner =~ /bismark/ ){ */ process bismark_methXtract { tag "$name" - publishDir "${params.outdir}/bismark_methylation_calls", mode: 'copy', + publishDir "${params.outdir}/bismark_methylation_calls", mode: params.publish_dir_mode, saveAs: {filename -> if( filename.indexOf("splitting_report.txt" ) > 0 ) "logs/$filename" else if( filename.indexOf("M-bias" ) > 0) "m-bias/$filename" @@ -805,7 +836,6 @@ if( params.aligner =~ /bismark/ ){ } } - ch_bismark_align_log_for_bismark_report .join(ch_bismark_dedup_log_for_bismark_report) .join(ch_bismark_splitting_report_for_bismark_report) @@ -818,7 +848,7 @@ if( params.aligner =~ /bismark/ ){ */ process bismark_report { tag "$name" - publishDir "${params.outdir}/bismark_reports", mode: 'copy' + publishDir "${params.outdir}/bismark_reports", mode: params.publish_dir_mode input: set val(name), file(align_log), file(dedup_log), file(splitting_report), file(mbias) from ch_bismark_logs_for_bismark_report @@ -840,7 +870,7 @@ if( params.aligner =~ /bismark/ ){ * STEP 7 - Bismark Summary Report */ process bismark_summary { - publishDir "${params.outdir}/bismark_summary", mode: 'copy' + publishDir "${params.outdir}/bismark_summary", mode: params.publish_dir_mode input: file ('*') from ch_bam_for_bismark_summary.collect() @@ -875,7 +905,7 @@ else { if( params.aligner == 'bwameth' ){ process bwamem_align { tag "$name" - publishDir "${params.outdir}/bwa-mem_alignments", mode: 'copy', + publishDir 
"${params.outdir}/bwa-mem_alignments", mode: params.publish_dir_mode, saveAs: {filename -> if( !params.save_align_intermeds && filename == "where_are_my_files.txt" ) filename else if( params.save_align_intermeds && filename != "where_are_my_files.txt" ) filename @@ -893,7 +923,7 @@ if( params.aligner == 'bwameth' ){ script: fasta = bwa_meth_indices[0].toString() - '.bwameth' - '.c2t' - '.amb' - '.ann' - '.bwt' - '.pac' - '.sa' - prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?(\.bz2)?$/ + prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/ """ bwameth.py \\ --threads ${task.cpus} \\ @@ -908,7 +938,7 @@ if( params.aligner == 'bwameth' ){ */ process samtools_sort_index_flagstat { tag "$name" - publishDir "${params.outdir}/samtools", mode: 'copy', + publishDir "${params.outdir}/bwa-mem_alignments", mode: params.publish_dir_mode, saveAs: {filename -> if(filename.indexOf("report.txt") > 0) "logs/$filename" else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename @@ -921,7 +951,7 @@ if( params.aligner == 'bwameth' ){ file wherearemyfiles from ch_wherearemyfiles_for_samtools_sort_index_flagstat.collect() output: - set val(name), file("${bam.baseName}.sorted.bam") into ch_bam_sorted_for_markDuplicates,ch_bam_sorted_for_picard + set val(name), file("${bam.baseName}.sorted.bam") into ch_bam_sorted_for_markDuplicates set val(name), file("${bam.baseName}.sorted.bam.bai") into ch_bam_index file "${bam.baseName}_flagstat_report.txt" into ch_flagstat_results_for_multiqc file "${bam.baseName}_stats_report.txt" into ch_samtools_stats_results_for_multiqc @@ -950,7 +980,7 @@ if( params.aligner == 'bwameth' ){ } else { process markDuplicates { tag "$name" - publishDir "${params.outdir}/bwa-mem_markDuplicates", mode: 'copy', + publishDir "${params.outdir}/bwa-mem_markDuplicates", mode: params.publish_dir_mode, saveAs: {filename -> 
filename.indexOf(".bam") == -1 ? "logs/$filename" : "$filename"} input: @@ -983,12 +1013,12 @@ if( params.aligner == 'bwameth' ){ } /* - * STEP 6 - extract methylation with MethylDackel + * STEP 6 - Extract methylation with MethylDackel */ process methyldackel { tag "$name" - publishDir "${params.outdir}/MethylDackel", mode: 'copy' + publishDir "${params.outdir}/MethylDackel", mode: params.publish_dir_mode input: set val(name), @@ -1052,12 +1082,13 @@ if( params.aligner == 'biscuit' ){ fasta = bwa_indices[0].toString() - '.bwameth' - '.c2t' - '.amb' - '.ann' - '.bwt' - '.pac' - '.sa' - '.fai' - '.par' - '.dau' -'.bis' assembly = fasta.replaceAll(/\.\w+/,"") prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?(\.bz2)?$/ - non_directional = params.non_directional ? 0 : 1 - // Paired-end or single end input files and pbat or not - input = params.pbat ? params.single_end ? reads + " -b 3" : "${reads[1]} ${reads[0]}" : reads + + non_directional = params.single_cell || params.zymo || params.nondirectional_library ? 0 : 1 + // Paired-end or single-end input files and pbat or not + input = params.pbat ? params.single_end ? 
reads + " -b 3" : "${reads[1]} ${reads[0]} -b " + non_directional : "${reads[0]} ${reads[1]} -b " + non_directional """ - biscuit align -M -b $non_directional -t ${task.cpus} $fasta $input | samtools view -Sb > ${name}.${assembly}.bam + biscuit align -M -t ${task.cpus} $fasta $input | samtools view -Sb > ${name}.${assembly}.bam """ } @@ -1098,16 +1129,16 @@ if( params.aligner == 'biscuit' ){ } /* - * STEP 5.- samtools flagstat on samples + * STEP 5.- Samtools flagstat on samples */ process samtools_sort_index_flagstat_biscuit { tag "$name" publishDir "${params.outdir}", mode: 'copy', saveAs: {filename -> if(filename.indexOf("report.txt") > 0) "biscuit_alignments/logs/$filename" - else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename.indexOf("sorted.bam") > 0) "biscuit_alignments/$filename" + else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename.indexOf("sorted.bam") > 0) "biscuit_alignments/$filename" else if( (!params.save_align_intermeds && !params.rrbs).every() && filename == "where_are_my_files.txt") filename - else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename + else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename else null } @@ -1138,7 +1169,7 @@ if( params.aligner == 'biscuit' ){ } - /* + /* * STEP 6 - Create vcf file with pileup, to extract methylation */ process createVCF { @@ -1158,18 +1189,17 @@ if( params.aligner == 'biscuit' ){ output: set val(name), file("${name}.vcf.gz*") into ch_vcf_biscuit_qc ,ch_vcf_for_bedgraph,ch_vcf_for_epiread - script: + script: filter_duplication = params.skip_deduplication || params.rrbs ? '-u' : '' - all_contexts = params.comprehensive ? 
'c, cg, ch, hcg, gch' : 'cg' """ - biscuit pileup -q ${task.cpus} $filter_duplication $fasta ${bam} -o ${name}.vcf + biscuit pileup -q ${task.cpus} $filter_duplication $fasta ${bam} -o ${name}.vcf bgzip -@ ${task.cpus} -f ${name}.vcf tabix -f -p vcf ${name}.vcf.gz """ } /* - * STEP 7 - create bedgraph file from vcf + * STEP 7 - Create bedgraph file from vcf */ process createBedgraph { tag "$name" @@ -1182,15 +1212,18 @@ if( params.aligner == 'biscuit' ){ set val(name), file("*bedgraph" ) into ch_bedgraph_for_intersect_soloWCGW script: - min_depth = params.min_depth > 0 ? "${params.min_depth}" : '1' + min_depth = params.min_coverage > 1 ? "${params.min_coverage}" : '1' all_contexts = params.comprehensive ? 'c, cg, ch, hcg, gch' : 'cg' """ - biscuit vcf2bed -k $min_depth -t $all_contexts "${vcf[0]}" > "${name}.bedgraph" + biscuit vcf2bed -k $min_depth -t $all_contexts "${vcf[0]}" > "${name}.bedgraph" """ } if (params.epiread) { if (params.common_dbsnp) { + /* + * STEP 7.1 - Reformat SNP table for SNP file generation + */ process reformat_SNP { input: @@ -1208,9 +1241,11 @@ if( params.aligner == 'biscuit' ){ } else { ch_reformattedSNP_for_SNP = Channel.empty() - } + /* + * STEP 7.2 - SNP file generation for the epiread convertion + */ process get_SNP_file { tag "$name" publishDir "${params.outdir}/epireads/snp", mode: 'copy', @@ -1230,16 +1265,19 @@ if( params.aligner == 'biscuit' ){ script: whitelist = params.whitelist ? "-R $whitelist_file" : '' - snp_file = (reformatted_SNP.size()>0) ? "-a ${reformatted_SNP[0]}" : '' + snp_file = (reformatted_SNP.size()>0) ? "-a ${reformatted_SNP[0]}" : '' """ bcftools annotate $whitelist -O z ${snp_file} -h $baseDir/assets/common_dbsnp.hdr -c CHROM,FROM,TO,TYPE,COMMON_SOME,COMMON_ALL,REF_MIN,ALT_MIN,REF_DBSNP,ALT_DBSNP,REF_ALL,ALT_ALL,RSID,MAX_MAF "${vcf[0]}" > "${name}-whitelist-dbSNP.vcf.gz" tabix -p vcf "${name}-whitelist-dbSNP.vcf.gz" bcftools view -O z -i'ALT!="N" & ALT!="." 
& ( (COUNT(GT=="0/1")>=1 & COMMON_ALL==1 & MAX_MAF>=0.05) | (COUNT(GT=="0/1" & GQ>=60)>=1) )' "${name}-whitelist-dbSNP.vcf.gz" > "${name}-whitelist-dbSNP-HET60.vcf.gz" tabix -p vcf "${name}-whitelist-dbSNP-HET60.vcf.gz" - bcftools query -u -i'GT="0/1" & GQ>=10' --format '%CHROM\t%POS\t%POS\t%REF\t%ALT[\t%GT\t%GQ\t%SP\t%AC\t%AF1]\t%RSID\t%COMMON_ALL\t%MAX_MAF\t%REF_MIN\t%ALT_MIN\n' "${name}-whitelist-dbSNP-HET60.vcf.gz" | awk -v OFS="\t" '{\$2 = \$2 - 1; print}' > "${name}.snp.bed" + bcftools query -u -i'GT="0/1" & GQ>=10' --format '%CHROM\t%POS\t%POS\t%REF\t%ALT[\t%GT\t%GQ\t%SP\t%AC\t%AF1]\t%RSID\t%COMMON_ALL\t%MAX_MAF\t%REF_MIN\t%ALT_MIN\n' "${name}-whitelist-dbSNP-HET60.vcf.gz" | awk -v OFS="\t" '{\$2 = \$2 - 1; print}' > "${name}.snp.bed" """ } + /* + * STEP 7.3 - Convert bam to epiread file format + */ process epiread_convertion { tag "$name" publishDir "${params.outdir}/epireads", mode: 'copy' @@ -1251,7 +1289,7 @@ if( params.aligner == 'biscuit' ){ file(snp), file(fasta), file(fasta_index), - file(whitelist) from ch_bam_sorted_for_epiread + file(whitelist) from ch_bam_sorted_for_epiread .join(ch_bam_index_for_epiread) .join(ch_snp_for_epiread) .combine(ch_fasta_for_epiread) @@ -1261,7 +1299,7 @@ if( params.aligner == 'biscuit' ){ output: - file "*${name}.e*.gz*" + file "*${name}.e*.gz*" file "${name}.original.epiread.*" optional true script: @@ -1273,7 +1311,7 @@ if( params.aligner == 'biscuit' ){ """ bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam samtools index ${name}.bam - biscuit epiread -q ${task.cpus} $snp_file $no_filter_reverse $fasta ${name}.bam |sort --parallel=${task.cpus} -T . -k1,1Vf -k5,5n | bgzip > ${name}.epiread.gz + biscuit epiread -q ${task.cpus} $snp_file $no_filter_reverse $fasta ${name}.bam |sort --parallel=${task.cpus} -T . 
-k1,1Vf -k5,5n | bgzip > ${name}.epiread.gz tabix -0 -s 1 -b 5 -e 5 ${name}.epiread.gz """ } else if (params.debug_epiread) { @@ -1298,7 +1336,7 @@ if( params.aligner == 'biscuit' ){ bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam samtools index ${name}.bam - biscuit epiread -q ${task.cpus} $snp_file $fasta ${name}.bam | sort --parallel=${task.cpus} -T . -k2,2 -k1,1 -k4,4 -k3,3n | $baseDir/bin/epiread_pairedEnd_convertion "cpg.bed" $snp ${name}.epiread $debug_merging_epiread > ${name}.err + biscuit epiread -q ${task.cpus} $snp_file $fasta ${name}.bam | sort --parallel=${task.cpus} -T . -k2,2 -k1,1 -k4,4 -k3,3n | $baseDir/bin/epiread_pairedEnd_convertion "cpg.bed" $snp ${name}.epiread $debug_merging_epiread > ${name}.err sort -k1,1Vf -k 2,2n -k 3,3n --parallel=${task.cpus} -T . ${name}.epiread | bgzip > ${name}.epiread.gz sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . ${name}.err | bgzip > ${name}.err.gz tabix -0 -p bed ${name}.epiread.gz @@ -1307,7 +1345,10 @@ if( params.aligner == 'biscuit' ){ } } } - + + /* + * STEP 8 - Running QC of samples + */ process biscuit_QC { tag "$name" publishDir "${params.outdir}/biscuit_QC", mode: 'copy' @@ -1330,7 +1371,7 @@ if( params.aligner == 'biscuit' ){ script: assembly = fasta.toString().replaceAll(/\.\w+/,"") """ - QC.sh -v ${vcf[0]} -o ${name}.${assembly}_biscuitQC $assets $fasta ${name}.${assembly} ${bam} + QC.sh -v ${vcf[0]} -o ${name}.${assembly}_biscuitQC $assets $fasta ${name}.${assembly} ${bam} """ } @@ -1343,13 +1384,12 @@ else { ch_samblaster_for_multiqc = Channel.from(false) } - /* - * STEP 8 - Qualimap + * STEP 9 - Qualimap */ process qualimap { tag "$name" - publishDir "${params.outdir}/qualimap", mode: 'copy' + publishDir "${params.outdir}/qualimap", mode: params.publish_dir_mode input: set val(name), file(bam) from ch_bam_dedup_for_qualimap @@ -1361,12 +1401,12 @@ process qualimap { gcref = params.genome.toString().startsWith('GRCh') ? 
'-gd HUMAN' : '' gcref = params.genome.toString().startsWith('GRCm') ? '-gd MOUSE' : '' def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false - def sort_mem = avail_mem && avail_mem > 2 ? "-m ${avail_mem}G" : '' + def sort_mem = avail_mem && avail_mem > 2 ? "-m ${avail_mem}G" : '' """ samtools sort $bam \\ - -@ ${task.cpus} $sort_mem \\ - -o ${bam.baseName}.sorted.bam + -@ ${task.cpus} $sort_mem \\ + -o ${bam.baseName}.sorted.bam qualimap bamqc $gcref \\ -bam ${bam.baseName}.bam \\ -outdir ${bam.baseName}_qualimap \\ @@ -1379,7 +1419,7 @@ process qualimap { /* - * STEP 9 - Picard - Preparation step + * STEP 10 - Picard - Preparation step */ process prepareGenomeToPicard { publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir }, @@ -1399,7 +1439,7 @@ process prepareGenomeToPicard { avail_mem = task.memory.toGiga() } """ - mv ${fasta} ${fasta.baseName}.picard.fa + mv ${fasta} ${fasta.baseName}.picard.fa picard -Xmx${avail_mem}g CreateSequenceDictionary \\ R=${fasta.baseName}.picard.fa \\ O=${fasta.baseName}.picard.dict @@ -1409,7 +1449,7 @@ process prepareGenomeToPicard { /* - * STEP 10 - Picard InsertSizeMetrics and GcBiasMetrics + * STEP 11 - Picard InsertSizeMetrics and GcBiasMetrics */ process picardMetrics { tag "$name" @@ -1468,11 +1508,11 @@ process picardMetrics { } /* - * STEP 11 - preseq + * STEP 12 - preseq */ process preseq { tag "$name" - publishDir "${params.outdir}/preseq", mode: 'copy' + publishDir "${params.outdir}/preseq", mode: params.publish_dir_mode input: set val(name), file(bam) from ch_bam_for_preseq @@ -1488,10 +1528,10 @@ process preseq { } /* - * STEP 12 - MultiQC + * STEP 13 - MultiQC */ process multiqc { - publishDir "${params.outdir}/MultiQC", mode: 'copy' + publishDir "${params.outdir}/MultiQC", mode: params.publish_dir_mode input: file (multiqc_config) from ch_multiqc_config @@ -1515,7 +1555,6 @@ process multiqc { file ('biscuit_QC/*') from 
ch_QC_results_for_multiqc.collect().ifEmpty([]) file ('biscuit_markDuplicates/*') from ch_samblaster_for_multiqc.collect().ifEmpty([]) file ('picardMetrics/*') from ch_picard_results_for_multiqc.collect().ifEmpty([]) - file ('software_versions/*') from ch_software_versions_yaml_for_multiqc.collect() file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") @@ -1535,13 +1574,14 @@ process multiqc { } /* - * STEP 13 - Output Description HTML + * STEP 14 - Output Description HTML */ process output_documentation { - publishDir "${params.outdir}/pipeline_info", mode: 'copy' + publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode input: file output_docs from ch_output_docs + file images from ch_output_docs_images output: file "results_description.html" @@ -1556,7 +1596,6 @@ process output_documentation { * Completion e-mail notification */ workflow.onComplete { - // Set up the e-mail variables def subject = "[nf-core/methylseq] Successful: $workflow.runName" if (!workflow.success) { @@ -1607,18 +1646,18 @@ workflow.onComplete { // Render the TXT template def engine = new groovy.text.GStringTemplateEngine() - def tf = new File("$baseDir/assets/email_template.txt") + def tf = new File("$projectDir/assets/email_template.txt") def txt_template = engine.createTemplate(tf).make(email_fields) def email_txt = txt_template.toString() // Render the HTML template - def hf = new File("$baseDir/assets/email_template.html") + def hf = new File("$projectDir/assets/email_template.html") def html_template = engine.createTemplate(hf).make(email_fields) def email_html = html_template.toString() // Render the sendmail template - def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] - def sf = new File("$baseDir/assets/sendmail_template.txt") + def smail_fields = [ email: email_address, 
subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] + def sf = new File("$projectDir/assets/sendmail_template.txt") def sendmail_template = engine.createTemplate(sf).make(smail_fields) def sendmail_html = sendmail_template.toString() @@ -1631,7 +1670,11 @@ workflow.onComplete { log.info "[nf-core/methylseq] Sent summary e-mail to $email_address (sendmail)" } catch (all) { // Catch failures and try with plaintext - [ 'mail', '-s', subject, email_address ].execute() << email_txt + def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] + if ( mqc_report.size() <= params.max_multiqc_email_size.toBytes() ) { + mail_cmd += [ '-A', mqc_report ] + } + mail_cmd.execute() << email_html log.info "[nf-core/methylseq] Sent summary e-mail to $email_address (mail)" } } @@ -1677,13 +1720,13 @@ def nfcoreHeader() { c_white = params.monochrome_logs ? '' : "\033[0;37m"; c_yellow = params.monochrome_logs ? 
'' : "\033[0;33m"; - return """ -${c_dim}--------------------------------------------------${c_reset}- + return """ -${c_dim}--------------------------------------------------${c_reset}- ${c_green},--.${c_black}/${c_green},-.${c_reset} - ${c_blue} ___ __ __ __ ___ ${c_green}/,-._.--~\'${c_reset} - ${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset} - ${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset} + ${c_blue} ___ __ __ __ ___ ${c_green}/,-._.--~\'${c_reset} + ${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset} + ${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset} ${c_green}`._,._,\'${c_reset} - ${c_purple} nf-core/methylseq v${workflow.manifest.version}${c_reset} + ${c_purple} nf-core/methylseq v${workflow.manifest.version}${c_reset} -${c_dim}--------------------------------------------------${c_reset}- """.stripIndent() } diff --git a/nextflow.config b/nextflow.config index 2f98feeb..2ad30554 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,7 +10,7 @@ params { // Workflow flags genome = false - reads = "data/*_R{1,2}.fastq.gz" + input = "data/*{1,2}.fastq.gz" single_end = false aligner = 'bismark' clip_r1 = 0 @@ -24,20 +24,25 @@ params { accel = false zymo = false cegx = false + em_seq = false comprehensive = false cytosine_report = false ignore_flags = false meth_cutoff = false methyl_kit = false min_depth = 0 + min_coverage = 1 skip_deduplication = false non_directional = false + nondirectional_library = false skip_trimming = false outdir = './results' save_align_intermeds = false known_splices = false slamseq = false local_alignment = false + minins = false + maxins = false save_reference = false save_trimmed = false unmapped = false @@ -49,6 +54,7 @@ params { // Bismark default is 0.2 (L,0,-0.2), Bowtie2 default is 0.6 (L,0,-0.6) bismark_align_cpu_per_multicore = null bismark_align_mem_per_multicore = null + publish_dir_mode = 'copy' bwa_biscuit_index = false whitelist = false @@ 
-56,7 +62,6 @@ params { save_pileup_file = false save_snp_file = false epiread = false - cpg_file = false debug_epiread = false // Boilerplate options @@ -117,7 +122,11 @@ profiles { singularity.enabled = true singularity.autoMounts = true } + podman { + podman.enabled = true + } test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } } // Load igenomes.config if required @@ -125,9 +134,11 @@ if (!params.igenomes_ignore) { includeConfig 'conf/igenomes.config' } -// Export this variable to prevent local Python libraries from conflicting with those in the container +// Export these variables to prevent local Python/R libraries from conflicting with those in the container env { PYTHONNOUSERSITE = 1 + R_PROFILE_USER = "/.Rprofile" + R_ENVIRON_USER = "/.Renviron" } // Capture exit codes from upstream processes when piping @@ -156,7 +167,7 @@ manifest { homePage = 'https://github.com/nf-core/methylseq' description = 'Methylation (Bisulfite-Sequencing) Best Practice analysis pipeline, part of the nf-core community.' 
mainScript = 'main.nf' - nextflowVersion = '>=19.10.0' + nextflowVersion = '>=20.04.0' version = '1.6dev' } diff --git a/nextflow_schema.json b/nextflow_schema.json new file mode 100644 index 00000000..c51759da --- /dev/null +++ b/nextflow_schema.json @@ -0,0 +1,665 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/methylseq/master/nextflow_schema.json", + "title": "nf-core/methylseq pipeline parameters", + "description": "Methylation (Bisulfite-Sequencing) Best Practice analysis pipeline, part of the nf-core community.", + "type": "object", + "definitions": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string", + "fa_icon": "fas fa-dna", + "description": "Input FastQ files.", + "help_text": "Use this to specify the location of your input FastQ files. For example:\n\n```bash\n--input 'path/to/data/sample_*_{1,2}.fastq'\n```\n\nPlease note the following requirements:\n\n1. The path must be enclosed in quotes\n2. The path must have at least one `*` wildcard character\n3. When using the pipeline with paired end data, the path must use `{1,2}` notation to specify read pairs.\n\nIf left unspecified, a default pattern is used: `data/*{1,2}.fastq.gz`" + }, + "single_end": { + "type": "boolean", + "description": "Specifies that the input is single-end reads.", + "fa_icon": "fas fa-align-center", + "help_text": "By default, the pipeline expects paired-end data. If you have single-end data, you need to specify `--single_end` on the command line when you launch the pipeline. A normal glob pattern, enclosed in quotation marks, can then be used for `--input`. 
For example:\n\n```bash\n--single_end --input '*.fastq'\n```\n\nIt is not possible to run a mixture of single-end and paired-end files in one run." + }, + "outdir": { + "type": "string", + "description": "The output directory where the results will be saved.", + "default": "./results", + "fa_icon": "fas fa-folder-open" + }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + } + } + }, + "alignment_options": { + "title": "Alignment options", + "type": "object", + "description": "", + "default": "", + "properties": { + "aligner": { + "type": "string", + "default": "bismark", + "description": "Alignment tool to use.", + "fa_icon": "fas fa-dot-circle", + "enum": [ + "bismark", + "bismark_hisat", + "bwameth", + "biscuit" + ], + "help_text": "The nf-core/methylseq package is actually three pipelines in one. The default workflow uses [Bismark](http://www.bioinformatics.babraham.ac.uk/projects/bismark/) with [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) as alignment tool: unless specified otherwise, nf-core/methylseq will run this pipeline.\n\nSince bismark v0.21.0 it is also possible to use [HISAT2](https://ccb.jhu.edu/software/hisat2/index.shtml) as alignment tool. To run this workflow, invoke the pipeline with the command line flag `--aligner bismark_hisat`. HISAT2 also supports splice-aware alignment if analysis of RNA is desired (e.g. 
[SLAMseq](https://science.sciencemag.org/content/360/6390/800) experiments), a file containing a list of known splicesites can be provided with `--known_splices`.\n\nThe second workflow uses [BWA-Meth](https://github.com/brentp/bwa-meth) and [MethylDackel](https://github.com/dpryan79/methyldackel) instead of Bismark. To run this workflow, run the pipeline with the command line flag `--aligner bwameth`.\n\nThe third workflow uses [BISCUIT](https://github.com/huishenlab/biscuit) and [samblaster](https://github.com/GregoryFaust/samblaster) for mark duplicates instead of Bismark. To run this workflow, run the pipeline with the command line flag `--aligner biscuit`." + }, + "comprehensive": { + "type": "boolean", + "description": "Output information for all cytosine contexts.", + "fa_icon": "fas fa-arrows-alt", + "help_text": "By default, the pipeline only produces data for cytosine methylation states in CpG context. Specifying `--comprehensive` makes the pipeline give results for all cytosine contexts. Note that for large genomes (e.g. Human), these can be massive files. This is only recommended for small genomes (especially those that don't exhibit strong CpG context methylation specificity).\n\nIf specified, this flag instructs the Bismark methylation extractor to use the `--comprehensive` and `--merge_non_CpG` flags. 
This produces coverage files with information from about all strands and cytosine contexts merged into two files - one for CpG context and one for non-CpG context.\n\nIf using the bwa-meth workflow, the flag makes MethylDackel report CHG and CHH contexts as well.\nIf using the biscuit workflow, the flag extract all cytosine contexts (C, CG, CG, HCG, GCH) from the VCF to the bed file, instead of only CG" + }, + "save_align_intermeds": { + "type": "boolean", + "description": "Save aligned intermediates to results directory", + "fa_icon": "fas fa-save", + "hidden": true + } + }, + "required": [ + "aligner" + ], + "fa_icon": "fas fa-braille" + }, + "special_library_types": { + "title": "Special library types", + "type": "object", + "description": "Presets for working with specific bisulfite library preparation methods.", + "default": "", + "properties": { + "pbat": { + "type": "boolean", + "fa_icon": "fas fa-outdent", + "description": "Preset for working with PBAT libraries.", + "help_text": "Specify this parameter when working with PBAT _(Post Bisulfite Adapter Tagging)_ libraries.\n\nUsing this parameter sets the `--pbat` flag when aligning with Bismark. This tells Bismark to align complementary strands (the opposite of `--directional`). \nWhen using the BISCUIT aligner, it reverses the reads in paired-end, or set the read to be the synthesized strand for single-end.\n\nAdditionally, this is a trimming preset equivalent to `--clip_r1 6` `--clip_r2 9` `--three_prime_clip_r1 6` `--three_prime_clip_r2 9`" + }, + "rrbs": { + "type": "boolean", + "description": "Turn on if dealing with MspI digested material.", + "help_text": "Use this parameter when working with RRBS _(Reduced Representation Bisulfite Sequencing)_ data, that is digested using MspI.\n\nSpecifying `--rrbs` will pass on the `--rrbs` parameter to TrimGalore! See the [TrimGalore! 
documentation](https://github.com/FelixKrueger/TrimGalore/blob/master/Docs/Trim_Galore_User_Guide.md#rrbs-specific-options-mspi-digested-material) to read more about the effects of this option.\n\nThis parameter also makes the pipeline skip the deduplication step.", + "fa_icon": "fas fa-compress" + }, + "slamseq": { + "type": "boolean", + "description": "Run bismark in SLAM-seq mode.", + "fa_icon": "fas fa-wave-square", + "help_text": "Specify to run Bismark with the `--slam` flag to run bismark in [SLAM-seq mode](https://github.com/FelixKrueger/Bismark/blob/master/CHANGELOG.md#slam-seq-mode) \n\n> NB: Only works with when using the `bismark_hisat` aligner (`--aligner bismark_hisat`)" + }, + "em_seq": { + "type": "string", + "fa_icon": "fas fa-cubes", + "description": "Preset for EM-seq libraries.", + "help_text": "Equivalent to `--clip_r1 8` `--clip_r2 8` `--three_prime_clip_r1 8` `--three_prime_clip_r2 8`.\n\nAlso sets the `--maxins` flag to `1000` for Bismark." + }, + "single_cell": { + "type": "boolean", + "fa_icon": "fas fa-cut", + "description": "Trimming preset for single-cell bisulfite libraries.", + "help_text": "Equivalent to `--clip_r1 6` `--clip_r2 6` `--three_prime_clip_r1 6` `--three_prime_clip_r2 6`.\n\nAlso sets the `--non_directional` flag for Bismark and the `--nondirectional_library` flag for BISCUIT." + }, + "accel": { + "type": "boolean", + "fa_icon": "fas fa-cut", + "help_text": "Equivalent to `--clip_r1 10` `--clip_r2 15` `--three_prime_clip_r1 10` `--three_prime_clip_r2 10`", + "description": "Trimming preset for the Accel kit." 
+ }, + "cegx": { + "type": "boolean", + "fa_icon": "fas fa-cut", + "description": "Trimming preset for the CEGX bisulfite kit.", + "help_text": "Equivalent to `--clip_r1 6` `--clip_r2 6` `--three_prime_clip_r1 2` `--three_prime_clip_r2 2`" + }, + "epignome": { + "type": "boolean", + "fa_icon": "fas fa-cut", + "description": "Trimming preset for the Epignome kit.", + "help_text": "Equivalent to `--clip_r1 8` `--clip_r2 8` `--three_prime_clip_r1 8` `--three_prime_clip_r2 8`" + }, + "zymo": { + "type": "boolean", + "fa_icon": "fas fa-cut", + "description": "Trimming preset for the Zymo kit.", + "help_text": "Equivalent to `--clip_r1 10` `--clip_r2 15` `--three_prime_clip_r1 10` `--three_prime_clip_r2 10`.\n\nAlso sets the `--non_directional` flag for Bismark and the `--nondirectional_library` flag for BISCUIT." + } + }, + "fa_icon": "fas fa-prescription-bottle" + }, + "reference_genome_options": { + "title": "Reference genome options", + "type": "object", + "fa_icon": "fas fa-dna", + "description": "Options for the reference genome indices used to align reads.", + "properties": { + "genome": { + "type": "string", + "description": "Name of iGenomes reference.", + "fa_icon": "fas fa-book", + "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`.\n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." + }, + "fasta": { + "type": "string", + "fa_icon": "fas fa-font", + "description": "Path to FASTA genome file.", + "help_text": "If you have no genome reference available, the pipeline can build one using a FASTA file. This requires additional time and resources, so it's better to use a pre-built index if possible. 
You can use the command line option `--save_reference` to keep the generated references so that they can be added to your config and used again in the future.\n\nNote that FASTA file is always needed, for the Picard metrics creation." + }, + "fasta_index": { + "type": "string", + "description": "Path to Fasta index file.", + "help_text": "The FASTA index file (`.fa.fai`) is only needed when using the bwa_meth aligner. It is used by MethylDackel. If using Bismark this parameter is ignored.", + "fa_icon": "fas fa-bookmark" + }, + "bismark_index": { + "type": "string", + "description": "Path to a directory containing a Bismark reference index.", + "fa_icon": "fas fa-dot-circle" + }, + "bwa_meth_index": { + "type": "string", + "description": "bwameth index filename base", + "help_text": "The base filename for a bwa-meth genome reference index. Only used when using the bwa-meth aligner.\n\nNote that this is not a complete path, but rather a common filename _base_. For example, if you have file paths such as `/path/to/ref/genome.fa.bwameth.c2t.bwt`, you should specify `/path/to/ref/genome.fa`.", + "fa_icon": "far fa-dot-circle" + }, + "bwa_biscuit_index": { + "type": "string", + "description": "biscuit-bwa index filename base", + "help_text": "The base filename for a biscuit-bwa genome reference index. Only used when using the biscuit aligner.\n\nNote that this is not a complete path, but rather a common filename _base_. 
For example, if you have file paths such as `/path/to/ref/genome.fa.bwameth.bis.amb`, you should specify `/path/to/ref/genome.fa`.", + "fa_icon": "far fa-dot-circle" + }, + "save_reference": { + "type": "boolean", + "description": "Save reference(s) to results directory", + "fa_icon": "far fa-save" + }, + "igenomes_base": { + "type": "string", + "description": "Directory / URL base for iGenomes references.", + "default": "s3://ngi-igenomes/igenomes/", + "fa_icon": "fas fa-cloud-download-alt", + "hidden": true + }, + "igenomes_ignore": { + "type": "boolean", + "description": "Do not load the iGenomes reference config.", + "fa_icon": "fas fa-ban", + "hidden": true, + "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + } + } + }, + "adapter_trimming": { + "title": "Adapter Trimming", + "type": "object", + "description": "Bisulfite libraries often require additional base pairs to be removed from the ends of the reads before alignment.", + "default": "", + "properties": { + "clip_r1": { + "type": "integer", + "description": "Trim the specified number of bases from the 5' end of read 1 (or single-end reads).", + "default": "0", + "fa_icon": "fas fa-cut" + }, + "clip_r2": { + "type": "integer", + "description": "Trim the specified number of bases from the 5' end of read 2 (paired-end only).", + "default": "0", + "fa_icon": "fas fa-cut" + }, + "three_prime_clip_r1": { + "type": "integer", + "description": "Trim the specified number of bases from the 3' end of read 1 AFTER adapter/quality trimming.", + "default": "0", + "fa_icon": "fas fa-cut" + }, + "three_prime_clip_r2": { + "type": "integer", + "description": "Trim the specified number of bases from the 3' end of read 2 AFTER adapter/quality trimming", + "default": "0", + "fa_icon": "fas fa-cut" + }, + "save_trimmed": { + "type": "boolean", + "description": "Save trimmed reads to 
results directory.", + "fa_icon": "fas fa-save", + "help_text": "By default, trimmed FastQ files will not be saved to the results directory. Specify this flag (or set to true in your config file) to copy these files to the results directory when complete.", + "hidden": true + } + }, + "fa_icon": "fas fa-cut", + "help_text": "In addition to manually specifying bases to be specified, the pipeline has a number of parameter presets:\n\n| Parameter | 5' R1 Trim | 5' R2 Trim | 3' R1 Trim | 3' R2 Trim |\n|-----------------|------------|------------|------------|------------|\n| `--pbat` | 6 | 9 | 6 | 9 |\n| `--single_cell` | 6 | 6 | 6 | 6 |\n| `--epignome` | 8 | 8 | 8 | 8 |\n| `--accel` | 10 | 15 | 10 | 10 |\n| `--zymo` | 10 | 15 | 10 | 10 |\n| `--cegx` | 6 | 6 | 2 | 2 |\n\nNote that you can use the `--skip_trimming` parameter to skip trimming completely." + }, + "bismark_options": { + "title": "Bismark options", + "type": "object", + "description": "Parameters specific to the Bismark workflow", + "default": "", + "fa_icon": "fas fa-circle", + "properties": { + "non_directional": { + "type": "boolean", + "description": "Run alignment against all four possible strands.", + "help_text": "By default, Bismark assumes that libraries are directional and does not align against complementary strands. If your library prep was not directional, use `--non_directional` to align against all four possible strands.\n\nNote that the `--single_cell` and `--zymo` parameters both set the `--non_directional` workflow flag automatically.", + "fa_icon": "fas fa-exchange-alt" + }, + "cytosine_report": { + "type": "boolean", + "description": "Output stranded cytosine report during Bismark's bismark_methylation_extractor step.", + "help_text": "By default, Bismark does not produce stranded calls. 
With this option the output considers all Cs on both forward and reverse strands and reports their position, strand, trinucleotide context and methylation state.", + "fa_icon": "fas fa-clipboard" + }, + "relax_mismatches": { + "type": "boolean", + "description": "Turn on to relax stringency for alignment (set allowed penalty with --num_mismatches).", + "help_text": "By default, Bismark is pretty strict about which alignments it accepts as valid. If you have good reason to believe that your reads will contain more mismatches than normal, this flags can be used to relax the stringency that Bismark uses when accepting alignments. This can greatly improve the number of aligned reads you get back, but may negatively impact the quality of your data.\n\nBismark uses the Bowtie alignment scoring mechanism to filter reads. Mismatches cost `-6`, gap opening `-5` and gap extension `-2`. So, a threshold of`-60` would allow 10 mismatches or ~ 8 x 1-2bp indels. The threshold is dependent on the length of reads, so a penalty value is used where `penalty * bp read length = threshold`.\n\nThe penalty value used by Bismark by default is `0.2`, so for 100bp reads this would be a threshold of `-20`. \n\nIf you specifying the `--relax_mismatches` pipeline flag, Bismark instead uses `0.6`, or a threshold of `-60`. This adds the Bismark flag `--score_min L,0,-0.6` to the alignment command.\n\nThe penalty value can be modified using the `--num_mismatches` pipeline option.", + "fa_icon": "fas fa-bullseye" + }, + "num_mismatches": { + "type": "number", + "default": 0.6, + "description": "0.6 will allow a penalty of bp * -0.6 - for 100bp reads (bismark default is 0.2)", + "help_text": "Customise the penalty in the function used to filter reads based on mismatches. 
The parameter `--relax_mismatches` must also be specified.\n\nSee the parameter documentation for `--relax_mismatches` for an explanation.", + "fa_icon": "fas fa-calculator" + }, + "unmapped": { + "type": "boolean", + "description": "Save unmapped reads to FastQ files", + "help_text": "Use the `--unmapped` flag to set the `--unmapped` flag with Bismark align and save the unmapped reads to FastQ files.", + "fa_icon": "fas fa-recycle" + }, + "meth_cutoff": { + "type": "integer", + "description": "Specify a minimum read coverage to report a methylation call", + "default": "0", + "help_text": "Use to discard any methylation calls with less than a given read coverage depth (in fold coverage) during Bismark's `bismark_methylation_extractor` step.", + "fa_icon": "fas fa-angle-double-down" + }, + "known_splices": { + "type": "string", + "description": "Supply a .gtf file containing known splice sites (bismark_hisat only).", + "help_text": "Specify to run Bismark with the `--known-splicesite-infile` flag to run splice-aware alignment using HISAT2. A `.gtf` file has to be provided from which a list of known splicesites is created by the pipeline\n\n> NB: This only works when using the `bismark_hisat` aligner with `--align`", + "fa_icon": "fas fa-barcode" + }, + "local_alignment": { + "type": "boolean", + "description": "Allow soft-clipping of reads (potentially useful for single-cell experiments).", + "help_text": "Specify to run Bismark with the `--local` flag to allow soft-clipping of reads. This should only be used with care in certain single-cell applications or PBAT libraries, which may produce chimeric read pairs. 
(See [Wu et al.](https://doi.org/10.1093/bioinformatics/btz125)).", + "fa_icon": "fas fa-search" + }, + "minins": { + "type": "integer", + "fa_icon": "fas fa-compress-alt", + "description": "The minimum insert size for valid paired-end alignments.", + "help_text": "For example, if `--minins 60` is specified and a paired-end alignment consists of two 20-bp alignments in the appropriate orientation with a 20-bp gap between them, that alignment is considered valid (as long as `--maxins` is also satisfied). A 19-bp gap would not be valid in that case.\n\nDefault: no flag (Bismark default: `0`)." + }, + "maxins": { + "type": "integer", + "fa_icon": "fas fa-expand-alt", + "description": "The maximum insert size for valid paired-end alignments.", + "help_text": "For example, if `--maxins 100` is specified and a paired-end alignment consists of two 20-bp alignments in the proper orientation with a 60-bp gap between them, that alignment is considered valid (as long as `--minins` is also satisfied). A 61-bp gap would not be valid in that case.\n\nDefault: not specified. Bismark default: `500`." + }, + "bismark_align_cpu_per_multicore": { + "type": "integer", + "default": 3, + "description": "Specify how many CPUs are required per --multicore for bismark align", + "hidden": true, + "help_text": "The pipeline makes use of the `--multicore` option for Bismark align. When using this option, Bismark uses a large number of CPUs for every `--multicore` specified. The pipeline calculates the number of `--multicore` based on the resources available to the task. It divides the available CPUs by 3, or by 5 if any of `--single_cell`, `--zymo` or `--non_directional` are specified. This is based on usage for a typical mouse genome.\n\nYou may find when running the pipeline that Bismark is not using this many CPUs. 
To fine tune the usage and speed, you can specify an integer with `--bismark_align_cpu_per_multicore` and the pipeline will divide the available CPUs by this value instead.\n\nSee the [bismark documentation](https://github.com/FelixKrueger/Bismark/tree/master/Docs#alignment) for more information.", + "fa_icon": "fas fa-cogs" + }, + "bismark_align_mem_per_multicore": { + "type": "string", + "default": "13.GB", + "description": "Specify how much memory is required per --multicore for bismark align", + "hidden": true, + "help_text": "Exactly the same as with `--bismark_align_cpu_per_multicore`, but for memory. By default, the pipeline divides the available memory by `13.GB`, or `18.GB` if any of `--single_cell`, `--zymo` or `--non_directional` are specified.\n\nNote that the final `--multicore` value is based on the lowest limiting factor of both CPUs and memory.", + "fa_icon": "fas fa-cogs" + } + } + }, + "bwa_meth_options": { + "title": "bwa-meth options", + "type": "object", + "description": "Parameters specific to the bwa-meth workflow", + "default": "", + "properties": { + "min_depth": { + "type": "integer", + "description": "Specify a minimum read coverage for MethylDackel to report a methylation call.", + "default": "0", + "fa_icon": "fas fa-angle-double-down" + }, + "ignore_flags": { + "type": "boolean", + "description": "MethylDackel - ignore SAM flags", + "fa_icon": "fas fa-eye-slash", + "help_text": "Run MethylDackel with the `--ignore_flags` option, to ignore SAM flags." 
+ }, + "methyl_kit": { + "type": "boolean", + "description": "Save files for use with methylKit", + "help_text": "Run MethylDackel with the `--methyl_kit` option, to produce files suitable for use with the methylKit R package.", + "fa_icon": "fas fa-ellipsis-h" + } + }, + "fa_icon": "far fa-circle" + }, + "biscuit_options": { + "title": "biscuit options", + "type": "object", + "description": "Parameters specific to the BISCUIT workflow", + "default": "", + "properties": { + "min_coverage": { + "type": "integer", + "description": "Specify a minimum read coverage for information extraction from the VCF file to bed file.", + "default": "1", + "fa_icon": "fas fa-angle-double-down" + }, + "nondirectional_library": { + "type": "boolean", + "description": "Run alignment against all four possible strands.", + "help_text": "By default, the BISCUIT pipeline assumes that libraries are directional and does not align against complementary strands. If your library prep was not directional, use `--nondirectional_library` to align against all four possible strands.\n\nNote that the `--single_cell` and `--zymo` parameters both set the `--nondirectional_library` workflow flag automatically.", + "fa_icon": "fas fa-exchange-alt" + }, + "epiread": { + "type": "boolean", + "description": "Convert the aligned BAM file to the BISCUIT Epi-read format.", + "help_text": "[Epiread](https://huishenlab.github.io/biscuit/epiread_format/) is a compact way of storing CpG retention pattern on the same read. 
This option will tell the biscuit workflow to generate an Epi-read file for the sample and to run all the steps needed to produce it.", + "fa_icon": "fas fa-angle-double-down" + }, + "common_dbsnp": { + "type": "string", + "description": "Common dbSNP table of the relevant genome, for SNP filtering", + "help_text": "Common-dbSNP table that contains at least the following fields: chrom, chromStart, chromEnd, name, ref, altCount, alts, shiftBases, freqSourceCount, minorAlleleFreq, majorAllele, minorAllele, maxFuncImpact, class, ucscNotes [can be downloaded from UCSC]. This table is used for SNP filtering in the Epi-read file. Relevant only if `--epiread` is set.", + "fa_icon": "fas fa-angle-double-down" + }, + "whitelist": { + "type": "string", + "description": "Path to the file that is the complement of the blacklist.", + "help_text": "The whitelist is needed for SNP file generation.\nThe whitelist can be created with the following steps:\n
    \n
  1. Download the blacklist for your genome of interest from [here](https://github.com/Boyle-Lab/Blacklist/tree/master/lists)
  2. \n
  3. Run: `bedtools complement -i your_black_list -g your_genome_chrom_sizes | grep -v _ | bgzip > whitelist.bed.gz`\nFor more instructions, run `bedtools complement`
  4. \n
\nRelevant only if `--epiread` is set.", + "fa_icon": "fas fa-angle-double-down" + }, + "debug_epiread": { + "type": "boolean", + "description": "Debug epiread merging for paired end reads.", + "fa_icon": "fas fa-eye-slash", + "help_text": "By default, merging two adjacent rows of the read mates in Epi-read format when running with paired-end mode will not output the debug data (about the reference allele, the alternative allele and the SP data), and the original Epi-read file will not be saved to the results directory. Specify this flag (or set to true in your config file) to run the merging with debug data and copy the original Epi-read file to the results directory when complete.\nIf you don't want to keep the original files, check `debug_epiread_merging`.\nRelevant only if `--epiread` is set.", + "hidden": true + }, + "debug_epiread_merging": { + "type": "boolean", + "description": "Debug epiread merging. Output merged epiread in debug mode.", + "help_text": "By default, merging two adjacent rows of the read mates in Epi-read format when running with paired-end mode will not output the debug data (about the reference allele, the alternative allele and the SP data). Specify this flag (or set to true in your config file) to run the merging with the debug data.\nIf you want to keep the original Epi-read files, check `debug_epiread`.\nRelevant only if `--epiread` is set.", + "fa_icon": "fas fa-ellipsis-h", + "hidden": true + }, + "assets_dir": { + "type": "string", + "description": "Path to assets directory for biscuit_QC", + "help_text": "Path to a directory containing the files needed for the biscuit-QC step\n> **NB** If none provided, will be generated automatically.", + "fa_icon": "fas fa-ellipsis-h" + }, + "save_pileup_file": { + "type": "boolean", + "description": "Save VCF-pileup and VCF-index files to results directory", + "help_text": "By default, the VCF and VCF-index files generated by `biscuit pileup` will not be saved to the results directory.
Specify this flag (or set to true in your config file) to copy these files to the results directory when complete. Relevant only if `--epiread` is set.", + "fa_icon": "fas fa-save" + }, + "save_snp_file": { + "type": "boolean", + "description": "Save SNP bed-file to results directory", + "help_text": "By default, the bed file with SNP information about the sample will not be saved to the results directory. Specify this flag (or set to true in your config file) to copy this file to the results directory when complete. Relevant only if `--epiread` is set.", + "fa_icon": "fas fa-save" + } + }, + "fa_icon": "far fa-circle" + }, + "skip_pipeline_steps": { + "title": "Skip pipeline steps", + "type": "object", + "description": "", + "default": "", + "properties": { + "skip_trimming": { + "type": "boolean", + "description": "Skip read trimming.", + "fa_icon": "fas fa-fast-forward" + }, + "skip_deduplication": { + "type": "boolean", + "description": "Skip deduplication step after alignment.", + "help_text": "Deduplication removes PCR duplicate reads after alignment.
Specifying this option will skip this step, leaving duplicate reads in your data.\n\nNote that this is turned on automatically if `--rrbs` is specified.", + "fa_icon": "fas fa-fast-forward" + } + }, + "fa_icon": "fas fa-fast-forward" + }, + "generic_options": { + "title": "Generic options", + "type": "object", + "fa_icon": "fas fa-file-import", + "description": "Less common options for the pipeline, typically set in a config file.", + "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", + "properties": { + "help": { + "type": "boolean", + "description": "Display help text.", + "hidden": true, + "fa_icon": "fas fa-question-circle" + }, + "publish_dir_mode": { + "type": "string", + "default": "copy", + "hidden": true, + "description": "Method used to save pipeline results to output directory.", + "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", + "fa_icon": "fas fa-copy", + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ] + }, + "name": { + "type": "string", + "description": "Workflow name.", + "fa_icon": "fas fa-fingerprint", + "hidden": true, + "help_text": "A custom name for the pipeline run. Unlike the core nextflow `-name` option with one hyphen this parameter can be reused multiple times, for example if using `-resume`. Passed through to steps such as MultiQC and used for things like report filenames and titles." 
+ }, + "email_on_fail": { + "type": "string", + "description": "Email address for completion summary, only when pipeline fails.", + "fa_icon": "fas fa-exclamation-triangle", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", + "hidden": true, + "help_text": "This works exactly as with `--email`, except emails are only sent if the workflow is not successful." + }, + "plaintext_email": { + "type": "boolean", + "description": "Send plain-text email instead of HTML.", + "fa_icon": "fas fa-remove-format", + "hidden": true, + "help_text": "Set to receive plain-text e-mails instead of HTML formatted." + }, + "max_multiqc_email_size": { + "type": "string", + "description": "File size limit when attaching MultiQC reports to summary emails.", + "default": "25.MB", + "fa_icon": "fas fa-file-upload", + "hidden": true, + "help_text": "If file generated by pipeline exceeds the threshold, it will not be attached." + }, + "monochrome_logs": { + "type": "boolean", + "description": "Do not use coloured log outputs.", + "fa_icon": "fas fa-palette", + "hidden": true, + "help_text": "Set to disable colourful command line output and live life in monochrome." + }, + "multiqc_config": { + "type": "string", + "description": "Custom config file to supply to MultiQC.", + "fa_icon": "fas fa-cog", + "hidden": true + }, + "tracedir": { + "type": "string", + "description": "Directory to keep pipeline Nextflow logs and reports.", + "default": "${params.outdir}/pipeline_info", + "fa_icon": "fas fa-cogs", + "hidden": true + } + } + }, + "max_job_request_options": { + "title": "Max job request options", + "type": "object", + "fa_icon": "fab fa-acquisitions-incorporated", + "description": "Set the top limit for requested resources for any single job.", + "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. 
These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", + "properties": { + "max_cpus": { + "type": "integer", + "description": "Maximum number of CPUs that can be requested for any single job.", + "default": 16, + "fa_icon": "fas fa-microchip", + "hidden": true, + "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" + }, + "max_memory": { + "type": "string", + "description": "Maximum amount of memory that can be requested for any single job.", + "default": "128.GB", + "fa_icon": "fas fa-memory", + "hidden": true, + "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" + }, + "max_time": { + "type": "string", + "description": "Maximum amount of time that can be requested for any single job.", + "default": "240.h", + "fa_icon": "far fa-clock", + "hidden": true, + "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" + } + } + }, + "institutional_config_options": { + "title": "Institutional config options", + "type": "object", + "fa_icon": "fas fa-university", + "description": "Parameters used to describe centralised config profiles. These should not be edited.", + "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. 
You should not need to change these values when you run a pipeline.", + "properties": { + "custom_config_version": { + "type": "string", + "description": "Git commit id for Institutional configs.", + "default": "master", + "hidden": true, + "fa_icon": "fas fa-users-cog", + "help_text": "Provide git commit id for custom Institutional configs hosted at `nf-core/configs`. This was implemented for reproducibility purposes. Default: `master`.\n\n```bash\n## Download and use config file with following git commit id\n--custom_config_version d52db660777c4bf36546ddb188ec530c3ada1b96\n```" + }, + "custom_config_base": { + "type": "string", + "description": "Base directory for Institutional configs.", + "default": "https://raw.githubusercontent.com/nf-core/configs/master", + "hidden": true, + "help_text": "If you're running offline, nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell nextflow where to find them with the `custom_config_base` option. 
For example:\n\n```bash\n## Download and unzip the config files\ncd /path/to/my/configs\nwget https://github.com/nf-core/configs/archive/master.zip\nunzip master.zip\n\n## Run the pipeline\ncd /path/to/my/data\nnextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs-master/\n```\n\n> Note that the nf-core/tools helper package has a `download` command to download all required pipeline files + singularity containers + institutional configs in one go for you, to make this process easier.", + "fa_icon": "fas fa-users-cog" + }, + "hostnames": { + "type": "string", + "description": "Institutional configs hostname.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_description": { + "type": "string", + "description": "Institutional config description.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_contact": { + "type": "string", + "description": "Institutional config contact information.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_url": { + "type": "string", + "description": "Institutional config URL link.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "project": { + "type": "string", + "description": "Cluster Project", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "clusterOptions": { + "type": "string", + "fa_icon": "fas fa-users-cog", + "hidden": true + } + } + } + }, + "allOf": [ + { + "$ref": "#/definitions/input_output_options" + }, + { + "$ref": "#/definitions/alignment_options" + }, + { + "$ref": "#/definitions/special_library_types" + }, + { + "$ref": "#/definitions/reference_genome_options" + }, + { + "$ref": "#/definitions/adapter_trimming" + }, + { + "$ref": "#/definitions/bismark_options" + }, + { + "$ref": "#/definitions/bwa_meth_options" + }, + { + "$ref": "#/definitions/biscuit_options" + }, + { + "$ref": "#/definitions/skip_pipeline_steps" + }, + { + "$ref": "#/definitions/generic_options" + }, + { + "$ref": 
"#/definitions/max_job_request_options" + }, + { + "$ref": "#/definitions/institutional_config_options" + } + ] +} \ No newline at end of file From f972b356f0e36bb89598f1b918046088bf0afa04 Mon Sep 17 00:00:00 2001 From: ekushele Date: Wed, 10 Feb 2021 15:10:54 +0200 Subject: [PATCH 26/56] make main to be like in 1f4dfd5 commit (with sorted bam for preseq and qualimap), fix skip_deduplication for biscuit, change environment.yml file --- CHANGELOG.md | 30 ++++---- conf/base.config | 5 -- docs/output.md | 47 ++++++++---- environment.yml | 4 +- main.nf | 189 ++++++++++++++++++++++++++++++++++++++++------- nextflow.config | 2 + 6 files changed, 213 insertions(+), 64 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f394111..126554b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,29 +1,29 @@ + # nf-core/methylseq -<<<<<<< HEAD -## [v1.6dev] +## [v1.6dev] - [2020-10-02] ### New features -* Added Picard CollectInsertSizeMetrics and Picard CollectGcBiasMetrics +* Updated template to tools 1.11 +* Moved parameter documentation into new `nextflow_schema.json` file. +* Added new `--maxins` and `--minins` parameters to pass on to Bismark +* Added Picard CollectInsertSizeMetrics and Picard CollectGcBiasMetrics * Improved qulimap and preseq by adding `samtools sort` and `samtools index` step in the Bismark aligner -* Added biscuit aligner as an optional aligner, with all relative steps (alignment, mark duplicates, methylation extraction, QC for biscuit, and optional epiread file creation). +* Added BISCUIT aligner as an optional aligner, with all relevant steps (alignment, mark duplicates with [samblaster](https://github.com/GregoryFaust/samblaster), methylation extraction, QC for biscuit, and optional [Epi-read](https://huishenlab.github.io/biscuit/epiread_format/) file creation with SNP information).
### Software updates -* _new dependency_: samblaster`0.1.24` -* _new dependency_: bedtools `2.29.1` -* _new_: biscuit tool `0.3.11` -======= -## v1.6dev - [date] +* _new_: samblaster `0.1.24` +* _new_: bedtools `2.29.1` +* _new_: biscuit `0.3.16` +* _new_: bcftools`1.10` +* _new_: parallel `20201122` +* _new_: gawk `5.1.0` +* samtools `1.9` > `1.10` +* methyldackel `0.5.0` > `0.5.1` -### Pipeline Updates - -* Updated template to tools 1.11 -* Moved parameter documentation into new `nextflow_schema.json` file. -* Added new `--maxins` and `--minins` parameters to pass on to Bismark ->>>>>>> 9218f1199bca434af49b54963eea91cfed572597 ## [v1.5](https://github.com/nf-core/methylseq/releases/tag/1.5) - 2020-04-09 diff --git a/conf/base.config b/conf/base.config index 8427ec86..2bc9a188 100644 --- a/conf/base.config +++ b/conf/base.config @@ -115,11 +115,6 @@ withName:biscuit_align { cpus = { check_max( 4 * task.attempt, 'cpus') } memory = { check_max( 32.GB * task.attempt, 'memory') } time = { check_max( 3.d * task.attempt, 'time') } - } - withName:CreateVCF { - cpus = { check_max( 4 * task.attempt, 'cpus') } - memory = { check_max( 32.GB * task.attempt, 'memory') } - time = { check_max( 2.d * task.attempt, 'time') } } withName:biscuit_QC { cpus = { check_max( 4 * task.attempt, 'cpus') } diff --git a/docs/output.md b/docs/output.md index 1bce3d86..2121261d 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,4 +1,5 @@ + # nf-core/methylseq Output ## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/methylseq/output](https://nf-co.re/methylseq/output) @@ -24,7 +25,7 @@ and processes data using the following steps: * [Deduplication](#deduplication) - Deduplicating reads * [Methylation Extraction](#methylation-extraction) - Calling cytosine methylation steps * [Bismark Reports](#bismark-reports) - Single-sample and summary analysis reports -* [Biscuit Reports](#biscuit reports) - Single-sample analysis reports for biscuit aligner +* [BISCUIT 
Reports](#biscuit-reports) - Single-sample analysis reports for BISCUIT aligner * [Qualimap](#qualimap) - Tool for genome alignments QC * [Preseq](#preseq) - Tool for estimating sample complexity * [Picard](#picard) - Tool for generating metrics of statistics @@ -73,7 +74,7 @@ Single-end data will have slightly different file names and only one FastQ file ### Alignment -Bismark and bwa-meth convert all Cytosines contained within the sequenced reads to Thymine _in-silico_ and then align against a three-letter reference genome. This method avoids methylation-specific alignment bias. The alignment produces a BAM file of genomic alignments. _+__________ +Bismark, bwa-meth and BISCUIT convert all Cytosines contained within the sequenced reads to Thymine _in-silico_ and then align against a three-letter reference genome. This method avoids methylation-specific alignment bias. The alignment produces a BAM file of genomic alignments. _+__________ **Bismark output directory: `results/bismark_alignments/`** _Note that bismark can use either use Bowtie2 (default) or HISAT2 as alignment tool and the output file names will not differ between the options._ @@ -103,7 +104,7 @@ _Note that bismark can use either use Bowtie2 (default) or HISAT2 as alignment t * `logs/sample_stats.txt` * Summary file giving lots of metrics about the aligned BAM file. -**biscuit output directory: `results/biscuit_alignnts/`** +**BISCUIT output directory: `results/biscuit_alignnts/`** * `sample.assembly.bam` * Aligned reads in BAM format. @@ -144,9 +145,9 @@ This step removes alignments with identical mapping position to avoid technical * Log file giving summary statistics about deduplication. -**biscuit output directory: `results/biscuit_markDuplicates/`** +**BISCUIT output directory: `results/biscuit_markDuplicates/`** -> **NB:** The biscuit (samblaster) step doesn't remove duplicate reads from the BAM file, it just labels them. 
+> **NB:** The BISCUIT (samblaster) step doesn't remove duplicate reads from the BAM file, it just labels them. > @@ -157,7 +158,7 @@ This step removes alignments with identical mapping position to avoid technical The methylation extractor step takes a BAM file with aligned reads and generates files containing cytosine methylation calls. It produces a few different output formats, described below. -Note that the output may vary a little depending on whether you specify `--comprehensive` or `--non_directional` or `--skip_deduplication` or `--rrbs` when running the pipeline. +Note that the output may vary a little depending on whether you specify `--comprehensive` or `--non_directional` (or `nondirectional_library`) or `--skip_deduplication` or `--rrbs` when running the pipeline. Filename abbreviations stand for the following reference alignment strands: @@ -186,20 +187,36 @@ Filename abbreviations stand for the following reference alignment strands: * `sample.bedGraph` * Methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format. -**biscuit workflow output directory: `results/methylation_extract/`** +**BISCUIT workflow output directory: `results/methylation_extract/`** * `sample.bedgraph` * Methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format. * `sample.vcf.gz` - * VCF file with the pileup information, used for creating the bedgraph file. + * VCF file with the pileup information, used for creating the bedGraph file. * **NB:** Only saved if `--save_pileup_file` is specified when running the pipeline. * `sample.vcf.gz.tbi` * Index file for `sample.vcf.gz` * **NB:** Only saved if `--save_pileup_file` is specified when running the pipeline. 
-> **NB** if `--epriread` is specified in the pipeline, then: -> **output directory:** `results/epireads` : - > * `sample.epiread` - Storing CpG retention pattern on the read in a compact way + +**NB** if `--epiread` is specified in the pipeline, then: +**output directory:** `results/epireads` : +* `sample.epiread.gz` + * Storing CpG retention pattern on the read in a compact way. For paired end mode, two adjacent rows of the read mates in Epi-read format are merged. +* `sample.epiread.gz.tbi` + * Index file for `sample.epiread.gz`. + * `sample.err.gz` + * In paired end mode, storing all CpG retention pattern of the reads that failed to be merged together. + * `sample.err.gz.tbi` + * Index file for `sample.err.gz`. + * `sample.original.epiread.gz` + * In paired end mode, storing all CpG retention pattern of the reads before the merging. + * **NB:** Only created if `--debug_epiread` is specified when running the pipeline. +* `sample.original.epiread.gz.tbi` + * Index file for `sample.original.epiread.gz`. + * **NB:** Only created if `--debug_epiread` is specified when running the pipeline. +* `snp/sample.snp.bed` + * bed file with SNP information about the sample + * **NB:** Only saved if `--save_snp_file` is specified when running the pipeline. > ### Bismark Reports @@ -210,11 +227,11 @@ Bismark generates a HTML reports describing results for each sample, as well as **Output directory: `results/bismark_summary`** -### Biscuit Reports +### BISCUIT Reports -Biscuit generates a directory with different statistical reports describing results for each sample. The statistical reports are converted to plots plotted in the MultiQC report. +BISCUIT generates a directory with different statistical reports describing results for each sample. The statistical reports are rendered as plots in the MultiQC report.
-**Output directory: `results/biscuit_QC/sample_biscuitQC/`** +**Output directory: `results/biscuit_QC/sample.assembly_biscuitQC/`** diff --git a/environment.yml b/environment.yml index 05508e41..67f93686 100644 --- a/environment.yml +++ b/environment.yml @@ -29,7 +29,7 @@ dependencies: # added - bioconda::samblaster=0.1.24 - bioconda::bedtools=2.29.1 - - bioconda::biscuit=0.3.16.20200420=h2b0c03c_3 + - bioconda::biscuit=0.3.16 - bioconda::bcftools=1.10 - - conda-forge::parallel + - conda-forge::parallel=20201122 - gawk=5.1.0 diff --git a/main.nf b/main.nf index 452b193f..9fbfe7bc 100644 --- a/main.nf +++ b/main.nf @@ -403,7 +403,7 @@ Channel.from(summary.collect{ [it.key, it.value] }) """.stripIndent() } .set { ch_workflow_summary } - + /* * Parse software version numbers */ @@ -464,7 +464,7 @@ if( !params.bismark_index && params.aligner =~ /bismark/ ){ file fasta from ch_fasta_for_makeBismarkIndex output: - file "BismarkIndex" into ch_bismark_index_for_bismark_align + file "BismarkIndex" into ch_bismark_index_for_bismark_align, ch_bismark_index_for_bismark_methXtract script: aligner = params.aligner == 'bismark_hisat' ? 
'--hisat2' : '--bowtie2' @@ -650,9 +650,93 @@ if( params.skip_trimming ){ * STEP 3.1 - align with Bismark */ if( params.aligner =~ /bismark/ ){ + // process bismark_align { + // tag "$name" + // publishDir "${params.outdir}/bismark_alignments", mode: params.publish_dir_mode, + // saveAs: {filename -> + // if( filename.indexOf(".fq.gz") > 0 ) "unmapped/$filename" + // else if( filename.indexOf("report.txt") > 0 ) "logs/$filename" + // else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt" ) filename + // else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt" ) filename + // else null + // } + + // input: + // set val(name), file(reads) from ch_trimmed_reads_for_alignment + // file index from ch_bismark_index_for_bismark_align.collect() + // file wherearemyfiles from ch_wherearemyfiles_for_bismark_align.collect() + // file knownsplices from ch_splicesites_for_bismark_hisat_align + + // output: + // set val(name), file("*.bam") into ch_bam_for_bismark_deduplicate, ch_bam_for_bismark_summary, ch_bam_for_samtools_sort_index_flagstat + // set val(name), file("*report.txt") into ch_bismark_align_log_for_bismark_report, ch_bismark_align_log_for_bismark_summary, ch_bismark_align_log_for_multiqc + // file "*.fq.gz" optional true + // file "where_are_my_files.txt" + + // script: + // // Paired-end or single end input files + // input = params.single_end ? reads : "-1 ${reads[0]} -2 ${reads[1]}" + + // // Choice of read aligner + // aligner = params.aligner == "bismark_hisat" ? "--hisat2" : "--bowtie2" + + // // Optional extra bismark parameters + // splicesites = params.aligner == "bismark_hisat" && knownsplices.name != 'null' ? "--known-splicesite-infile <(hisat2_extract_splice_sites.py ${knownsplices})" : '' + // pbat = params.pbat ? 
"--pbat" : '' + // non_directional = params.single_cell || params.zymo || params.non_directional ? "--non_directional" : '' + // unmapped = params.unmapped ? "--unmapped" : '' + // mismatches = params.relax_mismatches ? "--score_min L,0,-${params.num_mismatches}" : '' + // soft_clipping = params.local_alignment ? "--local" : '' + // minins = bismark_minins ? "--minins $bismark_minins" : '' + // maxins = bismark_maxins ? "--maxins $bismark_maxins" : '' + + // // Try to assign sensible bismark memory units according to what the task was given + // multicore = '' + // if( task.cpus ){ + // // Numbers based on recommendation by Felix for a typical mouse genome + // if( params.single_cell || params.zymo || params.non_directional ){ + // cpu_per_multicore = 5 + // mem_per_multicore = (18.GB).toBytes() + // } else { + // cpu_per_multicore = 3 + // mem_per_multicore = (13.GB).toBytes() + // } + // // Check if the user has specified this and overwrite if so + // if(params.bismark_align_cpu_per_multicore) { + // cpu_per_multicore = (params.bismark_align_cpu_per_multicore as int) + // } + // if(params.bismark_align_mem_per_multicore) { + // mem_per_multicore = (params.bismark_align_mem_per_multicore as nextflow.util.MemoryUnit).toBytes() + // } + // // How many multicore splits can we afford with the cpus we have? 
+ // ccore = ((task.cpus as int) / cpu_per_multicore) as int + // // Check that we have enough memory, assuming 13GB memory per instance (typical for mouse alignment) + // try { + // tmem = (task.memory as nextflow.util.MemoryUnit).toBytes() + // mcore = (tmem / mem_per_multicore) as int + // ccore = Math.min(ccore, mcore) + // } catch (all) { + // log.debug "Warning: Not able to define bismark align multicore based on available memory" + // } + // if( ccore > 1 ){ + // multicore = "--multicore $ccore" + // } + // } + + // // Main command + // """ + // bismark $input \\ + // $aligner \\ + // --bam $pbat $non_directional $unmapped $mismatches $multicore $minins $maxins \\ + // --genome $index \\ + // $reads \\ + // $soft_clipping \\ + // $splicesites + // """ + // } process bismark_align { tag "$name" - publishDir "${params.outdir}/bismark_alignments", mode: params.publish_dir_mode, + publishDir "${params.outdir}/bismark_alignments", mode: 'copy', saveAs: {filename -> if( filename.indexOf(".fq.gz") > 0 ) "unmapped/$filename" else if( filename.indexOf("report.txt") > 0 ) "logs/$filename" @@ -668,7 +752,7 @@ if( params.aligner =~ /bismark/ ){ file knownsplices from ch_splicesites_for_bismark_hisat_align output: - set val(name), file("*.bam") into ch_bam_for_bismark_deduplicate, ch_bam_for_bismark_summary, ch_bam_for_preseq + set val(name), file("*.bam") into ch_bam_for_bismark_deduplicate, ch_bam_for_bismark_summary, ch_bam_for_samtools_sort_index_flagstat set val(name), file("*report.txt") into ch_bismark_align_log_for_bismark_report, ch_bismark_align_log_for_bismark_summary, ch_bismark_align_log_for_multiqc file "*.fq.gz" optional true file "where_are_my_files.txt" @@ -687,8 +771,6 @@ if( params.aligner =~ /bismark/ ){ unmapped = params.unmapped ? "--unmapped" : '' mismatches = params.relax_mismatches ? "--score_min L,0,-${params.num_mismatches}" : '' soft_clipping = params.local_alignment ? "--local" : '' - minins = bismark_minins ? 
"--minins $bismark_minins" : '' - maxins = bismark_maxins ? "--maxins $bismark_maxins" : '' // Try to assign sensible bismark memory units according to what the task was given multicore = '' @@ -727,19 +809,50 @@ if( params.aligner =~ /bismark/ ){ """ bismark $input \\ $aligner \\ - --bam $pbat $non_directional $unmapped $mismatches $multicore $minins $maxins \\ + --bam $pbat $non_directional $unmapped $mismatches $multicore \\ --genome $index \\ $reads \\ $soft_clipping \\ $splicesites """ } + /* + * STEP 4 - Samtools sort bismark + */ + process samtools_sort_index_flagstat_bismark { + tag "$name" + publishDir "${params.outdir}/samtools", mode: 'copy', + saveAs: {filename -> + if(filename.indexOf("report.txt") > 0) "logs/$filename" + else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename + else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename + else null + } + + input: + set val(name), file(bam) from ch_bam_for_samtools_sort_index_flagstat + file wherearemyfiles from ch_wherearemyfiles_for_bismark_samtools_sort.collect() + + output: + set val(name), file("*.sorted.bam") into ch_bam_for_preseq,ch_bam_sorted_for_picard + file "where_are_my_files.txt" + + script: + def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false + def sort_mem = avail_mem && avail_mem > 2 ? 
"-m ${avail_mem}G" : '' + """ + samtools sort $bam \\ + -@ ${task.cpus} $sort_mem \\ + -o ${bam.baseName}.sorted.bam + """ + } + /* - * STEP 4 - Bismark deduplicate + * STEP 5 - Bismark deduplicate */ if( params.skip_deduplication || params.rrbs ) { - ch_bam_for_bismark_deduplicate.into { ch_bam_dedup_for_bismark_methXtract; ch_bam_dedup_for_qualimap } + ch_bam_for_bismark_deduplicate.into { ch_bam_dedup_for_bismark_methXtract; ch_dedup_bam_for_samtools_sort_index_flagstat } ch_bismark_dedup_log_for_bismark_report = Channel.from(false) ch_bismark_dedup_log_for_bismark_summary = Channel.from(false) ch_bismark_dedup_log_for_multiqc = Channel.from(false) @@ -753,7 +866,7 @@ if( params.aligner =~ /bismark/ ){ set val(name), file(bam) from ch_bam_for_bismark_deduplicate output: - set val(name), file("*.deduplicated.bam") into ch_bam_dedup_for_bismark_methXtract, ch_bam_dedup_for_qualimap + set val(name), file("*.deduplicated.bam") into ch_bam_dedup_for_bismark_methXtract, ch_dedup_bam_for_samtools_sort_index_flagstat set val(name), file("*.deduplication_report.txt") into ch_bismark_dedup_log_for_bismark_report, ch_bismark_dedup_log_for_bismark_summary, ch_bismark_dedup_log_for_multiqc script: @@ -764,6 +877,37 @@ if( params.aligner =~ /bismark/ ){ } } + /* + * STEP 6 - Samtools sort bismark after dedup + */ + process samtools_sort_index_flagstat_dedup_bismark { + tag "$name" + publishDir "${params.outdir}/samtools", mode: 'copy', + saveAs: {filename -> + if(filename.indexOf("report.txt") > 0) "logs/$filename" + else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename + else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename != "where_are_my_files.txt") filename + else null + } + + input: + set val(name), file(bam) from ch_dedup_bam_for_samtools_sort_index_flagstat + file wherearemyfiles from 
ch_wherearemyfiles_for_bismark_dedup_samtools_sort.collect() + + output: + set val(name), file("*.sorted.bam") into ch_bam_sorted_dedup_for_qualimap + file "where_are_my_files.txt" + + script: + def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false + def sort_mem = avail_mem && avail_mem > 2 ? "-m ${avail_mem}G" : '' + """ + samtools sort $bam \\ + -@ ${task.cpus} $sort_mem \\ + -o ${bam.baseName}.sorted.bam + """ + } + /* * STEP 5 - Bismark methylation extraction */ @@ -918,7 +1062,7 @@ if( params.aligner == 'bwameth' ){ file wherearemyfiles from ch_wherearemyfiles_for_bwamem_align.collect() output: - set val(name), file('*.bam') into ch_bam_for_samtools_sort_index_flagstat, ch_bam_for_preseq + set val(name), file('*.bam') into ch_bam_for_samtools_sort_index_flagstat file "where_are_my_files.txt" script: @@ -951,7 +1095,7 @@ if( params.aligner == 'bwameth' ){ file wherearemyfiles from ch_wherearemyfiles_for_samtools_sort_index_flagstat.collect() output: - set val(name), file("${bam.baseName}.sorted.bam") into ch_bam_sorted_for_markDuplicates + set val(name), file("${bam.baseName}.sorted.bam") into ch_bam_sorted_for_markDuplicates,ch_bam_for_preseq, ch_bam_sorted_for_picard set val(name), file("${bam.baseName}.sorted.bam.bai") into ch_bam_index file "${bam.baseName}_flagstat_report.txt" into ch_flagstat_results_for_multiqc file "${bam.baseName}_stats_report.txt" into ch_samtools_stats_results_for_multiqc @@ -974,7 +1118,7 @@ if( params.aligner == 'bwameth' ){ * STEP 5 - Mark duplicates */ if( params.skip_deduplication || params.rrbs ) { - ch_bam_sorted_for_markDuplicates.into { ch_bam_dedup_for_methyldackel; ch_bam_dedup_for_qualimap } + ch_bam_sorted_for_markDuplicates.into { ch_bam_dedup_for_methyldackel; ch_bam_sorted_dedup_for_qualimap } ch_bam_index.set { ch_bam_index_for_methyldackel } ch_markDups_results_for_multiqc = Channel.from(false) } else { @@ -987,7 +1131,7 @@ if( params.aligner == 'bwameth' ){ set val(name), file(bam) 
from ch_bam_sorted_for_markDuplicates output: - set val(name), file("${bam.baseName}.markDups.bam") into ch_bam_dedup_for_methyldackel, ch_bam_dedup_for_qualimap + set val(name), file("${bam.baseName}.markDups.bam") into ch_bam_dedup_for_methyldackel, ch_bam_sorted_dedup_for_qualimap set val(name), file("${bam.baseName}.markDups.bam.bai") into ch_bam_index_for_methyldackel //ToDo check if this correctly overrides the original channel file "${bam.baseName}.markDups_metrics.txt" into ch_markDups_results_for_multiqc @@ -1096,8 +1240,8 @@ if( params.aligner == 'biscuit' ){ * STEP 4 - Mark duplicates */ if( params.skip_deduplication || params.rrbs ) { - ch_bam_for_markDuplicates.into { ch_bam_dedup_for_qualimap; ch_samblaster_for_samtools_sort_index_flagstat } - ch_markDups_results_for_multiqc = Channel.from(false) + ch_bam_for_markDuplicates.into { ch_samblaster_for_samtools_sort_index_flagstat } + ch_samblaster_for_multiqc = Channel.from(false) } else { process markDuplicates_samblaster { tag "$name" @@ -1147,7 +1291,7 @@ if( params.aligner == 'biscuit' ){ file wherearemyfiles from ch_wherearemyfiles_for_samtools_sort_index_flagstat.collect() output: - set val(name), file("*.sorted.bam") into ch_bam_dedup_for_qualimap,ch_bam_for_preseq,ch_bam_sorted_for_pileup, ch_bam_sorted_for_epiread, ch_bam_noDups_for_QC,ch_bam_sorted_for_picard + set val(name), file("*.sorted.bam") into ch_bam_sorted_dedup_for_qualimap,ch_bam_for_preseq,ch_bam_sorted_for_pileup, ch_bam_sorted_for_epiread, ch_bam_noDups_for_QC,ch_bam_sorted_for_picard set val(name), file ("*.sorted.bam.bai") into ch_bam_index_sorted_for_pileup,ch_bam_index_for_epiread,ch_bam_index_noDups_for_QC file "${samblaster_bam.baseName}_flagstat_report.txt" into ch_flagstat_results_biscuit_for_multiqc file "${samblaster_bam.baseName}_stats_report.txt" into ch_samtools_stats_results_biscuit_for_multiqc @@ -1162,7 +1306,6 @@ if( params.aligner == 'biscuit' ){ -@ ${task.cpus} $sort_mem -l 9 \\ -o 
${samblaster_bam.baseName}.sorted.bam samtools index ${samblaster_bam.baseName}.sorted.bam - samtools flagstat ${samblaster_bam.baseName}.sorted.bam > ${samblaster_bam.baseName}_flagstat_report.txt samtools stats ${samblaster_bam.baseName}.sorted.bam > ${samblaster_bam.baseName}_stats_report.txt """ @@ -1379,7 +1522,6 @@ if( params.aligner == 'biscuit' ){ else { ch_flagstat_results_biscuit_for_multiqc = Channel.from(false) ch_samtools_stats_results_biscuit_for_multiqc = Channel.from(false) - ch_markDups_results_for_multiqc = Channel.from(false) ch_QC_results_for_multiqc = Channel.from(false) ch_samblaster_for_multiqc = Channel.from(false) } @@ -1392,7 +1534,7 @@ process qualimap { publishDir "${params.outdir}/qualimap", mode: params.publish_dir_mode input: - set val(name), file(bam) from ch_bam_dedup_for_qualimap + set val(name), file(bam) from ch_bam_sorted_dedup_for_qualimap output: file "${bam.baseName}_qualimap" into ch_qualimap_results_for_multiqc @@ -1400,20 +1542,13 @@ process qualimap { script: gcref = params.genome.toString().startsWith('GRCh') ? '-gd HUMAN' : '' gcref = params.genome.toString().startsWith('GRCm') ? '-gd MOUSE' : '' - def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false - def sort_mem = avail_mem && avail_mem > 2 ? 
"-m ${avail_mem}G" : '' - """ - samtools sort $bam \\ - -@ ${task.cpus} $sort_mem \\ - -o ${bam.baseName}.sorted.bam qualimap bamqc $gcref \\ -bam ${bam.baseName}.bam \\ -outdir ${bam.baseName}_qualimap \\ --collect-overlap-pairs \\ --java-mem-size=${task.memory.toGiga()}G \\ -nt ${task.cpus} - """ } diff --git a/nextflow.config b/nextflow.config index 2ad30554..932c2116 100644 --- a/nextflow.config +++ b/nextflow.config @@ -63,6 +63,8 @@ params { save_snp_file = false epiread = false debug_epiread = false + debug_epiread_merging = false + assets_dir = false // Boilerplate options name = false From 842ae997a6e69a62dbaedbc5521f58a804b8f809 Mon Sep 17 00:00:00 2001 From: ekushele Date: Thu, 11 Feb 2021 10:38:06 +0200 Subject: [PATCH 27/56] improve README.md for lint test --- CHANGELOG.md | 2 -- README.md | 8 ++++---- docs/output.md | 21 ++++++++------------- 3 files changed, 12 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cdb174ba..2525f9c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,6 @@ - # nf-core/methylseq - ## v1.6dev - [date] ### Pipeline Updates diff --git a/README.md b/README.md index b510a259..0b9649de 100644 --- a/README.md +++ b/README.md @@ -18,9 +18,9 @@ Choose between workflows by using `--aligner bismark` (default, uses bowtie2 for | Step | Bismark workflow | bwa-meth workflow | biscuit | |----------------------------------------------|------------------|-----------------------|-------------------| -| Generate Reference Genome Index _(optional)_ | Bismark | bwa-meth | biscuit | +| Generate Reference Genome Index _(optional)_ | Bismark | bwa-meth | biscuit | | Raw data QC | FastQC | FastQC | FastQC | -| Adapter sequence trimming | Trim Galore! | Trim Galore! | Trim Galore! | +| Adapter sequence trimming | Trim Galore! | Trim Galore! | Trim Galore! 
| | Align Reads | Bismark | bwa-meth | biscuit | | Deduplicate Alignments | Bismark | Picard MarkDuplicates | samblaster | | Extract methylation calls | Bismark | MethylDackel | biscuit | @@ -30,8 +30,8 @@ Choose between workflows by using `--aligner bismark` (default, uses bowtie2 for | Alignment QC | Qualimap | Qualimap | Qualimap | | Sample complexity | Preseq | Preseq | Preseq | | Project Report | MultiQC | MultiQC | MultiQC | - -## Quick Start + +## Quick Start 1. Install [`nextflow`](https://nf-co.re/usage/installation) diff --git a/docs/output.md b/docs/output.md index 2121261d..a0ca9348 100644 --- a/docs/output.md +++ b/docs/output.md @@ -116,7 +116,7 @@ _Note that bismark can use either use Bowtie2 (default) or HISAT2 as alignment t * `sample.assembly.sorted.bam.bai` * Index of sorted BAM file * **NB:** Only saved if `--save_align_intermeds`, `--skip_deduplication` or `--rrbs` is specified when running the pipeline. - * **NB:** If `--skip_deduplication` is not specified when running the pipeline, file name would be `sample.assembly.samblaster.sorted.bam.bai` + * **NB:** If `--skip_deduplication` is not specified when running the pipeline, file name would be `sample.assembly.samblaster.sorted.bam.bai` * `logs/sample_flagstat.txt` * Summary file describing the number of reads which aligned in different ways. * `logs/sample_stats.txt` @@ -144,12 +144,10 @@ This step removes alignments with identical mapping position to avoid technical * `logs/sample.sorted.markDups_metrics.txt` * Log file giving summary statistics about deduplication. - **BISCUIT output directory: `results/biscuit_markDuplicates/`** > **NB:** The BISCUIT (samblaster) step doesn't remove duplicate reads from the BAM file, it just labels them. -> - + * `sample.assembly.txt` * Log file giving summary statistics about deduplication. @@ -260,11 +258,11 @@ Note that these are predictive numbers only, not absolute. 
The MultiQC plot can ## Picard [Picard]([https://broadinstitute.github.io/picard/picard-metric-definitions.html](https://broadinstitute.github.io/picard/picard-metric-definitions.html)) is a set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) data and formats such as SAM/BAM/CRAM and VCF. + The two metrics created here are: * [GcBiasMetrics]([https://broadinstitute.github.io/picard/picard-metric-definitions.html#GcBiasMetrics](https://broadinstitute.github.io/picard/picard-metric-definitions.html#GcBiasMetrics)) * [InsertSizeMetrics]([https://broadinstitute.github.io/picard/picard-metric-definitions.html#InsertSizeMetrics](https://broadinstitute.github.io/picard/picard-metric-definitions.html#InsertSizeMetrics)) - Metrics about the insert size distribution of a paired-end library, created by the CollectInsertSizeMetrics program and usually written to a file with the extension ".insert_size_metrics". - **Output directory: `results/picardMetrics`** * `sample.insert_size_metrics.txt` @@ -273,14 +271,11 @@ The two metrics created here are: * This file contains plot values for the bias in coverage across regions of the genome with varying GC content, plotted in the MultiQC report. * `sample.summary_metrics.txt` * This file contains a table summerizing the `sample.gc_bias_metrics.txt` data. - * `pdf/sample.insert_size_histogram.pdf` - * This file contains a plot of insert size histogram, created by Picard. - * `pdf/sample.gc_bias_metrics.pdf` - * This file contains a plot of GC bias of all reads, created by Picard. - - - - +* `pdf/sample.insert_size_histogram.pdf` +* This file contains a plot of insert size histogram, created by Picard. +* `pdf/sample.gc_bias_metrics.pdf` + * This file contains a plot of GC bias of all reads, created by Picard. + ## MultiQC [MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarizing all samples in your project. 
Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. From 9aae95406451332061d0ac8ae3a2ec4b74f84c41 Mon Sep 17 00:00:00 2001 From: ekushele Date: Thu, 11 Feb 2021 10:56:35 +0200 Subject: [PATCH 28/56] improve output.md for lint test --- docs/output.md | 50 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/docs/output.md b/docs/output.md index a0ca9348..bcbe2027 100644 --- a/docs/output.md +++ b/docs/output.md @@ -147,7 +147,6 @@ This step removes alignments with identical mapping position to avoid technical **BISCUIT output directory: `results/biscuit_markDuplicates/`** > **NB:** The BISCUIT (samblaster) step doesn't remove duplicate reads from the BAM file, it just labels them. - * `sample.assembly.txt` * Log file giving summary statistics about deduplication. @@ -189,32 +188,32 @@ Filename abbreviations stand for the following reference alignment strands: * `sample.bedgraph` * Methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format. - * `sample.vcf.gz` - * VCF file with the pileup information, used for creating the bedGraph file. - * **NB:** Only saved if `--save_pileup_file` is specified when running the pipeline. - * `sample.vcf.gz.tbi` - * Index file for `sample.vcf.gz` - * **NB:** Only saved if `--save_pileup_file` is specified when running the pipeline. - +* `sample.vcf.gz` + * VCF file with the pileup information, used for creating the bedGraph file. + * **NB:** Only saved if `--save_pileup_file` is specified when running the pipeline. +* `sample.vcf.gz.tbi` + * Index file for `sample.vcf.gz` + * **NB:** Only saved if `--save_pileup_file` is specified when running the pipeline. + **NB** if `--epriread` is specified in the pipeline, then: **output directory:** `results/epireads` : * `sample.epiread.gz` - * Storing CpG retention pattern on the read in a compact way. 
For paired end mode, two adjacent rows of the read mates in Epi-read format are merged. + * Storing CpG retention pattern on the read in a compact way. For paired end mode, two adjacent rows of the read mates in Epi-read format are merged. * `sample.epiread.gz.tbi` * Index file for `sample.epiread.gz`. - * `sample.err.gz` - * In paired end mode, storing all CpG retention pattern of the reads that failed to be merged together. - * `sample.err.gz.tbi` - Index file for `sample.err.gz`. - * `sample.original.epiread.gz` - In paired end mode, storing all CpG retention pattern of the reads before the merging. - * **NB:** Only created if `--debug_epiread` is specified when running the pipeline. -* `sample.original.epiread.gz.tbi` - * Index file for `sample.original.epiread.gz`. - * **NB:** Only created if `--debug_epiread` is specified when running the pipeline. +* `sample.err.gz` + * In paired end mode, storing all CpG retention pattern of the reads that failed to be merged together. +* `sample.err.gz.tbi` + * Index file for `sample.err.gz`. +* `sample.original.epiread.gz` + * In paired end mode, storing all CpG retention pattern of the reads before the merging. + * **NB:** Only created if `--debug_epiread` is specified when running the pipeline. +* `sample.original.epiread.gz.tbi` + * Index file for `sample.original.epiread.gz`. + * **NB:** Only created if `--debug_epiread` is specified when running the pipeline. * `snp/sample.snp.bed` - * bed file with SNP information about the sample - * **NB:** Only saved if `--save_snp_file` is specified when running the pipeline. + * bed file with SNP information about the sample + * **NB:** Only saved if `--save_snp_file` is specified when running the pipeline. 
> ### Bismark Reports @@ -231,8 +230,6 @@ BISCUIT generates a directory with different statistical reports describing resu **Output directory: `results/biscuit_QC/sample.assembly_biscuitQC/`** - - ## Qualimap [Qualimap BamQC](http://qualimap.bioinfo.cipf.es/doc_html/analysis.html#bam-qc) is a general-use quality-control tool that generates a number of statistics about aligned BAM files. It's not specific to bisulfite data, but it produces several useful stats - for example, insert size and coverage statistics. @@ -260,6 +257,7 @@ Note that these are predictive numbers only, not absolute. The MultiQC plot can [Picard]([https://broadinstitute.github.io/picard/picard-metric-definitions.html](https://broadinstitute.github.io/picard/picard-metric-definitions.html)) is a set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) data and formats such as SAM/BAM/CRAM and VCF. The two metrics created here are: + * [GcBiasMetrics]([https://broadinstitute.github.io/picard/picard-metric-definitions.html#GcBiasMetrics](https://broadinstitute.github.io/picard/picard-metric-definitions.html#GcBiasMetrics)) * [InsertSizeMetrics]([https://broadinstitute.github.io/picard/picard-metric-definitions.html#InsertSizeMetrics](https://broadinstitute.github.io/picard/picard-metric-definitions.html#InsertSizeMetrics)) - Metrics about the insert size distribution of a paired-end library, created by the CollectInsertSizeMetrics program and usually written to a file with the extension ".insert_size_metrics". @@ -273,9 +271,9 @@ The two metrics created here are: * This file contains a table summerizing the `sample.gc_bias_metrics.txt` data. * `pdf/sample.insert_size_histogram.pdf` * This file contains a plot of insert size histogram, created by Picard. -* `pdf/sample.gc_bias_metrics.pdf` - * This file contains a plot of GC bias of all reads, created by Picard. 
- +* `pdf/sample.gc_bias_metrics.pdf` + * This file contains a plot of GC bias of all reads, created by Picard. + ## MultiQC [MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarizing all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. From 50ceb39aaa44041a4e04d12772336fa37c2cc185 Mon Sep 17 00:00:00 2001 From: ekushele Date: Thu, 11 Feb 2021 11:02:22 +0200 Subject: [PATCH 29/56] improve output.md for lint test --- docs/output.md | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/docs/output.md b/docs/output.md index bcbe2027..55262fcb 100644 --- a/docs/output.md +++ b/docs/output.md @@ -112,11 +112,11 @@ _Note that bismark can use either use Bowtie2 (default) or HISAT2 as alignment t * `sample.assembly.sorted.bam` * Aligned reads in a sorted BAM file. * **NB:** Only saved if `--save_align_intermeds`, `--skip_deduplication` or `--rrbs` is specified when running the pipeline. - * **NB:** If `--skip_deduplication` is not specified when running the pipeline, file name would be `sample.assembly.samblaster.sorted.bam` + * **NB:** If `--skip_deduplication` is not specified when running the pipeline, file name would be `sample.assembly.samblaster.sorted.bam` * `sample.assembly.sorted.bam.bai` * Index of sorted BAM file * **NB:** Only saved if `--save_align_intermeds`, `--skip_deduplication` or `--rrbs` is specified when running the pipeline. - * **NB:** If `--skip_deduplication` is not specified when running the pipeline, file name would be `sample.assembly.samblaster.sorted.bam.bai` + * **NB:** If `--skip_deduplication` is not specified when running the pipeline, file name would be `sample.assembly.samblaster.sorted.bam.bai` * `logs/sample_flagstat.txt` * Summary file describing the number of reads which aligned in different ways. 
* `logs/sample_stats.txt` @@ -124,7 +124,7 @@ _Note that bismark can use either use Bowtie2 (default) or HISAT2 as alignment t ### Deduplication -This step removes alignments with identical mapping position to avoid technical duplication in the results. Note that it is skipped if `--skip_deduplication` or `--rrbs` is specified when running the pipeline. +This step removes alignments with identical mapping position to avoid technical duplication in the results. Note that it is skipped if `--skip_deduplication` or `--rrbs` is specified when running the pipeline. **Bismark output directory: `results/bismark_deduplicated/`** @@ -150,7 +150,7 @@ This step removes alignments with identical mapping position to avoid technical * `sample.assembly.txt` * Log file giving summary statistics about deduplication. - + ### Methylation Extraction The methylation extractor step takes a BAM file with aligned reads and generates files containing cytosine methylation calls. It produces a few different output formats, described below. @@ -188,10 +188,10 @@ Filename abbreviations stand for the following reference alignment strands: * `sample.bedgraph` * Methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format. -* `sample.vcf.gz` +* `sample.vcf.gz` * VCF file with the pileup information, used for creating the bedGraph file. * **NB:** Only saved if `--save_pileup_file` is specified when running the pipeline. -* `sample.vcf.gz.tbi` +* `sample.vcf.gz.tbi` * Index file for `sample.vcf.gz` * **NB:** Only saved if `--save_pileup_file` is specified when running the pipeline. @@ -200,21 +200,20 @@ Filename abbreviations stand for the following reference alignment strands: * `sample.epiread.gz` * Storing CpG retention pattern on the read in a compact way. For paired end mode, two adjacent rows of the read mates in Epi-read format are merged. * `sample.epiread.gz.tbi` - * Index file for `sample.epiread.gz`. 
-* `sample.err.gz` + * Index file for `sample.epiread.gz`. +* `sample.err.gz` * In paired end mode, storing all CpG retention pattern of the reads that failed to be merged together. * `sample.err.gz.tbi` * Index file for `sample.err.gz`. -* `sample.original.epiread.gz` +* `sample.original.epiread.gz` * In paired end mode, storing all CpG retention pattern of the reads before the merging. - * **NB:** Only created if `--debug_epiread` is specified when running the pipeline. + * **NB:** Only created if `--debug_epiread` is specified when running the pipeline. * `sample.original.epiread.gz.tbi` - * Index file for `sample.original.epiread.gz`. + * Index file for `sample.original.epiread.gz`. * **NB:** Only created if `--debug_epiread` is specified when running the pipeline. -* `snp/sample.snp.bed` - * bed file with SNP information about the sample +* `snp/sample.snp.bed` + * bed file with SNP information about the sample. * **NB:** Only saved if `--save_snp_file` is specified when running the pipeline. 
-> ### Bismark Reports From db609192084c5f25a2ade478df38086bfba2ac5f Mon Sep 17 00:00:00 2001 From: ekushele Date: Thu, 11 Feb 2021 13:29:48 +0200 Subject: [PATCH 30/56] pushing last commit before PR --- CHANGELOG.md | 2 +- docs/output.md | 2 +- main.nf | 1 + nextflow_schema.json | 12 ++++++------ 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2525f9c8..505d1a6c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ # nf-core/methylseq -## v1.6dev - [date] +## v1.6dev - [2020-02-11] ### Pipeline Updates diff --git a/docs/output.md b/docs/output.md index 55262fcb..ead637da 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,5 +1,4 @@ - # nf-core/methylseq Output ## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/methylseq/output](https://nf-co.re/methylseq/output) @@ -197,6 +196,7 @@ Filename abbreviations stand for the following reference alignment strands: **NB** if `--epriread` is specified in the pipeline, then: **output directory:** `results/epireads` : + * `sample.epiread.gz` * Storing CpG retention pattern on the read in a compact way. For paired end mode, two adjacent rows of the read mates in Epi-read format are merged. * `sample.epiread.gz.tbi` diff --git a/main.nf b/main.nf index c433daec..bee98bbb 100644 --- a/main.nf +++ b/main.nf @@ -489,6 +489,7 @@ if( !params.bwa_meth_index && params.aligner == 'bwameth' ){ output: file "${fasta}*" into ch_bwa_meth_indices_for_bwamem_align + file fasta script: """ diff --git a/nextflow_schema.json b/nextflow_schema.json index c51759da..9aa2b781 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -380,39 +380,39 @@ "type": "boolean", "description": "Specify a minimum read coverage for MethylDackel to report a methylation call.", "help_text": "[Epiread](https://huishenlab.github.io/biscuit/epiread_format/) is a compact way of storing CpG retention pattern on the same read. 
This option will tell the biscuit workflow to generate Epi-read file for the sample, as well as all needed steps.", - "fa_icon": "fas fa-angle-double-down" + "fa_icon": "fas fa-clipboard-list" }, "common_dbsnp": { "type": "string", "description": "Common dbSNP table of the relevant genome, for SNP filteration", "help_text": "Common-dbSNP table that contains at least the following fields: chrom, chromStart, chromEnd, name, ref, altCount, alts, shiftBases, freqSourceCount, minorAlleleFreq, majorAllele, minorAllele, maxFuncImpact, class, ucscNotes [can be downloaded from UCSC]. This table is used for SNP filteration in the Epi-read file. Relevant only if `--epiread` is set.", - "fa_icon": "fas fa-angle-double-down" + "fa_icon": "fas fa-chess-board" }, "whitelist": { "type": "string", "description": "Path to the file that is the complement of blacklist.", "help_text": "The whitelist is needed for SNP file generation.\nThe whitelist can be created with the following steps:\n
    \n
  1. Download the blacklist for your wanted genome from [here]( https://github.com/Boyle-Lab/Blacklist/tree/master/lists)
  2. \n
  3. Run: `bedtools complement -i your_black_list -g your_genome_chrome_sizes | grep -v _ | bgzip > whitelist.bed.gz`\nFor more instruction, run `bedtools complement`
  4. \n
\nRelevant only if `--epiread` is set.", - "fa_icon": "fas fa-angle-double-down" + "fa_icon": "fab fa-buromobelexperte" }, "debug_epiread": { "type": "boolean", "description": "Debug epiread merging for paired end reads.", - "fa_icon": "fas fa-eye-slash", + "fa_icon": "fas fa-check-square", "help_text": "By default, merging two adjacent rows of the read mates in Epi-read format when running with paired-end mode will not output the debug data (about the reference allele, the alternative allele and the SP data), and the original Epi-read file will not be saved to the results directory. Specify this flag (or set to true in your config file) to run the merging with debug data and copy the original Epi-read file to the results directory when complete. \n If you don't want to keep the original files, check `debug_epiread_merging`.\nRelevant only if `--epiread` is set.", "hidden": true }, "debug_epiread_merging": { "type": "boolean", "description": "Debug epiread merging. Output merged epiread in debug mode.", + "fa_icon": "far fa-check-square", "help_text": "By default, merging two adjacent rows of the read mates in Epi-read format when running with paired-end mode will not output the debug data (about the reference allele, the alternative allele and the SP data). 
Specify this flag (or set to true in your config file) to run the merging with the debug data.\nIf you want to keep the original Epi-read files, check `debug_epiread`\nRelevant only if `--epiread` is set.", - "fa_icon": "fas fa-ellipsis-h", "hidden": true }, "assets_dir": { "type": "string", "description": "Path to assets directory for biscuit_QC", "help_text": "Path to a directory containing needed file for biscuit-QC step\n> **NB** If none provided, will be generated automatically.", - "fa_icon": "fas fa-ellipsis-h" + "fa_icon": "fab fa-buffer" }, "save_pileup_file": { "type": "boolean", From 41d48d54b89ae9adce8f85b0cb2ee331b72b51c4 Mon Sep 17 00:00:00 2001 From: ekushele Date: Thu, 11 Feb 2021 22:07:54 +0200 Subject: [PATCH 31/56] change knwwn splice main --- assets/sendmail_template.txt | 2 +- conf/igenomes.config | 12 ++++++------ main.nf | 13 +++++++------ nextflow.config | 2 +- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/assets/sendmail_template.txt b/assets/sendmail_template.txt index 5530cfbe..9c779cc4 100644 --- a/assets/sendmail_template.txt +++ b/assets/sendmail_template.txt @@ -14,7 +14,7 @@ Content-Transfer-Encoding: base64 Content-ID: Content-Disposition: inline; filename="nf-core-methylseq_logo.png" -<% out << new File("$baseDir/assets/nf-core-methylseq_logo.png"). +<% out << new File("$projectDir/assets/nf-core-methylseq_logo.png"). bytes. encodeBase64(). toString(). 
diff --git a/conf/igenomes.config b/conf/igenomes.config index caeafceb..31b7ee61 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -21,7 +21,7 @@ params { readme = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/README.txt" mito_name = "MT" macs_gsize = "2.7e9" - blacklist = "${baseDir}/assets/blacklists/GRCh37-blacklist.bed" + blacklist = "${projectDir}/assets/blacklists/GRCh37-blacklist.bed" } 'GRCh38' { fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" @@ -33,7 +33,7 @@ params { bed12 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.bed" mito_name = "chrM" macs_gsize = "2.7e9" - blacklist = "${baseDir}/assets/blacklists/hg38-blacklist.bed" + blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" } 'GRCm38' { fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" @@ -46,7 +46,7 @@ params { readme = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/README.txt" mito_name = "MT" macs_gsize = "1.87e9" - blacklist = "${baseDir}/assets/blacklists/GRCm38-blacklist.bed" + blacklist = "${projectDir}/assets/blacklists/GRCm38-blacklist.bed" } 'TAIR10' { fasta = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa" @@ -270,7 +270,7 @@ params { bed12 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed" mito_name = "chrM" macs_gsize = "2.7e9" - blacklist = "${baseDir}/assets/blacklists/hg38-blacklist.bed" + blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" } 'hg19' { fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa" @@ -283,7 +283,7 @@ params { readme = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/README.txt" mito_name = "chrM" macs_gsize = "2.7e9" - blacklist = "${baseDir}/assets/blacklists/hg19-blacklist.bed" + blacklist = 
"${projectDir}/assets/blacklists/hg19-blacklist.bed" } 'mm10' { fasta = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa" @@ -296,7 +296,7 @@ params { readme = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/README.txt" mito_name = "chrM" macs_gsize = "1.87e9" - blacklist = "${baseDir}/assets/blacklists/mm10-blacklist.bed" + blacklist = "${projectDir}/assets/blacklists/mm10-blacklist.bed" } 'bosTau8' { fasta = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/WholeGenomeFasta/genome.fa" diff --git a/main.nf b/main.nf index bee98bbb..4f0d25fc 100644 --- a/main.nf +++ b/main.nf @@ -681,7 +681,8 @@ if( params.aligner =~ /bismark/ ){ aligner = params.aligner == "bismark_hisat" ? "--hisat2" : "--bowtie2" // Optional extra bismark parameters - splicesites = params.aligner == "bismark_hisat" && knownsplices.name != 'null' ? "--known-splicesite-infile <(hisat2_extract_splice_sites.py ${knownsplices})" : '' + splicesites = params.aligner == "bismark_hisat" && params.known_splices ? "--known-splicesite-infile <(hisat2_extract_splice_sites.py ${knownsplices})" : '' + pbat = params.pbat ? "--pbat" : '' non_directional = params.single_cell || params.zymo || params.non_directional ? "--non_directional" : '' unmapped = params.unmapped ? "--unmapped" : '' @@ -1295,7 +1296,7 @@ if( params.aligner == 'biscuit' ){ script: """ - less $commonSNP_file | $baseDir/bin/processUcscDbsnp.pl | grep snv | bgzip > reformattedSNP.snv.txt.gz + less $commonSNP_file | $projectDir/bin/processUcscDbsnp.pl | grep snv | bgzip > reformattedSNP.snv.txt.gz tabix -s 1 -b 2 -e 3 reformattedSNP.snv.txt.gz """ } @@ -1328,7 +1329,7 @@ if( params.aligner == 'biscuit' ){ whitelist = params.whitelist ? "-R $whitelist_file" : '' snp_file = (reformatted_SNP.size()>0) ? 
"-a ${reformatted_SNP[0]}" : '' """ - bcftools annotate $whitelist -O z ${snp_file} -h $baseDir/assets/common_dbsnp.hdr -c CHROM,FROM,TO,TYPE,COMMON_SOME,COMMON_ALL,REF_MIN,ALT_MIN,REF_DBSNP,ALT_DBSNP,REF_ALL,ALT_ALL,RSID,MAX_MAF "${vcf[0]}" > "${name}-whitelist-dbSNP.vcf.gz" + bcftools annotate $whitelist -O z ${snp_file} -h $projectDir/assets/common_dbsnp.hdr -c CHROM,FROM,TO,TYPE,COMMON_SOME,COMMON_ALL,REF_MIN,ALT_MIN,REF_DBSNP,ALT_DBSNP,REF_ALL,ALT_ALL,RSID,MAX_MAF "${vcf[0]}" > "${name}-whitelist-dbSNP.vcf.gz" tabix -p vcf "${name}-whitelist-dbSNP.vcf.gz" bcftools view -O z -i'ALT!="N" & ALT!="." & ( (COUNT(GT=="0/1")>=1 & COMMON_ALL==1 & MAX_MAF>=0.05) | (COUNT(GT=="0/1" & GQ>=60)>=1) )' "${name}-whitelist-dbSNP.vcf.gz" > "${name}-whitelist-dbSNP-HET60.vcf.gz" tabix -p vcf "${name}-whitelist-dbSNP-HET60.vcf.gz" @@ -1382,7 +1383,7 @@ if( params.aligner == 'biscuit' ){ bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam samtools index ${name}.bam biscuit epiread -q ${task.cpus} $snp_file $fasta ${name}.bam | sort --parallel=${task.cpus} -T . -k2,2 -k1,1 -k4,4 -k3,3n > ${name}.original.epiread - less ${name}.original.epiread | $baseDir/bin/epiread_pairedEnd_convertion "cpg.bed" $snp ${name}.epiread $debug_merging_epiread > ${name}.err + less ${name}.original.epiread | $projectDir/bin/epiread_pairedEnd_convertion "cpg.bed" $snp ${name}.epiread $debug_merging_epiread > ${name}.err sort -k1,1Vf -k 2,2n -k 3,3n --parallel=${task.cpus} -T . ${name}.epiread | bgzip > ${name}.epiread.gz sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . ${name}.err | bgzip > ${name}.err.gz sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . 
${name}.original.epiread | bgzip > ${name}.original.epiread.gz @@ -1396,7 +1397,7 @@ if( params.aligner == 'biscuit' ){ zcat $cpg_file > cpg.bed bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam samtools index ${name}.bam - biscuit epiread -q ${task.cpus} $snp_file $fasta ${name}.bam | sort --parallel=${task.cpus} -T . -k2,2 -k1,1 -k4,4 -k3,3n | $baseDir/bin/epiread_pairedEnd_convertion "cpg.bed" $snp ${name}.epiread $debug_merging_epiread > ${name}.err + biscuit epiread -q ${task.cpus} $snp_file $fasta ${name}.bam | sort --parallel=${task.cpus} -T . -k2,2 -k1,1 -k4,4 -k3,3n | $projectDir/bin/epiread_pairedEnd_convertion "cpg.bed" $snp ${name}.epiread $debug_merging_epiread > ${name}.err sort -k1,1Vf -k 2,2n -k 3,3n --parallel=${task.cpus} -T . ${name}.epiread | bgzip > ${name}.epiread.gz sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . ${name}.err | bgzip > ${name}.err.gz tabix -0 -p bed ${name}.epiread.gz @@ -1706,7 +1707,7 @@ workflow.onComplete { def email_html = html_template.toString() // Render the sendmail template - def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] + def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] def sf = new File("$projectDir/assets/sendmail_template.txt") def sendmail_template = engine.createTemplate(sf).make(smail_fields) def sendmail_html = sendmail_template.toString() diff --git a/nextflow.config b/nextflow.config index 932c2116..0157458e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -110,7 +110,7 @@ try { profiles { aws { includeConfig 'conf/aws.config' } - conda { process.conda = "$baseDir/environment.yml" } + conda { process.conda = "$projectDir/environment.yml" } debug 
{ process.beforeScript = 'echo $HOSTNAME' } docker { docker.enabled = true From 48d27166c571f52ee26d634ca66f76518877a7d5 Mon Sep 17 00:00:00 2001 From: ekushele Date: Sun, 14 Feb 2021 09:11:19 +0200 Subject: [PATCH 32/56] added option blacklist --- main.nf | 59 ++++++++++++++++++++++++++++++++++++++------ nextflow.config | 1 + nextflow_schema.json | 10 ++++++-- 3 files changed, 60 insertions(+), 10 deletions(-) diff --git a/main.nf b/main.nf index 4f0d25fc..824dcfe3 100644 --- a/main.nf +++ b/main.nf @@ -55,7 +55,8 @@ def helpMessage() { --bismark_align_mem_per_multicore [str] Specify how much memory is required per --multicore for bismark align (default = 13.GB) --assets_dir [path] Path to assets directory for biscuit_QC --epiread [bool] Convert bam to biscuit epiread format - --whitelist [file] Path to the file that is the complement of blacklist, needed for SNP extraction For more instuctions: https://www.cse.huji.ac.il/~ekushele/assets.html#whitelist + --whitelist [file] Path to the file that is the complement of blacklist, needed for SNP extraction. This file can be generated if a blacklist is provided. + --blacklist [file] Path to the file with problematic regions of the genome --common_dbsnp [file] Common dbSNP table of the relevant genome, for SNP filteration --debug_epiread Debug epiread merging for paired end-keep original epiread file and merged epiread file in debug mode --debug_epiread_merging Debug epiread merging. Output merged epiread in debug mode @@ -87,7 +88,7 @@ def helpMessage() { Other options: --outdir [file] The output directory where the results will be saved - --publish_dir_mode [str] Mode for publishing results in the output directory. Available: symlink, rellink, link, copy, copyNoFollow, move (Default: copy) + --publish_dir_mode [str] Mode for publishing results in the output directory. 
Available: symlink, rellink, link, copy, copyNoFollow, move (Default: copy) --email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits --email_on_fail [email] Same as --email, except only send mail if the workflow is not successful --max_multiqc_email_size [str] Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) @@ -176,7 +177,7 @@ else if( params.aligner == 'bwameth' || params.aligner == 'biscuit'){ Channel .fromPath(params.fasta_index, checkIfExists: true) .ifEmpty { exit 1, "fasta index file not found: ${params.fasta_index}" } - .into { ch_fasta_index_for_methyldackel; ch_fasta_index_for_biscuitQC; ch_fasta_index_for_createVCF; ch_fasta_index_for_epiread } + .into { ch_fasta_index_for_methyldackel; ch_fasta_index_for_biscuitQC; ch_fasta_index_for_createVCF; ch_fasta_for_create_whitelist; ch_fasta_index_for_epiread } ch_fasta_for_makeFastaIndex.close() } } @@ -288,13 +289,21 @@ if (params.input_paths) { } if (params.epiread) { + assert params.blacklist || params.whitelist : "Cannot find any blacklist/whitelist file matching: ${params.whitelist}\nEither whitelist or blacklist are needed if '\--epiread\' is specified" + if (params.whitelist) { Channel .fromPath(params.whitelist, checkIfExists: true) - .ifEmpty { exit 1, "Cannot find any whitelist file matching: ${params.whitelist}\nWhitelist file is mandatory if epiread file conversion is required" } + .ifEmpty { exit 1, "Cannot find any whitelist file matching: ${params.whitelist}" } .into { ch_whitelist_for_SNP; ch_whitelist_for_epiread} } - + else { + Channel + .fromPath(params.blacklist, checkIfExists: true) + .ifEmpty { exit 1, "Cannot find any blacklist file matching: ${params.blacklist}" } + .set { ch_blacklist_for_create_whitelist;} + } + if (params.common_dbsnp) { Channel .fromPath(params.common_dbsnp, 
checkIfExists: true) @@ -302,6 +311,10 @@ if (params.epiread) { .set { ch_commonSNP_for_SNP; } } } +else +{ + ch_fasta_for_create_whitelist.close() +} // Header log info log.info nfcoreHeader() def summary = [:] @@ -356,6 +369,7 @@ if(params.bismark_align_cpu_per_multicore) summary['Bismark align CPUs per --mul if(params.bismark_align_mem_per_multicore) summary['Bismark align memory per --multicore'] = params.bismark_align_mem_per_multicore if(params.assets_dir) summary['Assets Directory'] = params.assets_dir if(params.whitelist) summary['Whitelist'] = params.whitelist +if(params.blacklist) summary['Blacklist'] = params.whitelist if(params.common_dbsnp) summary['Common SNP'] = params.common_dbsnp if(params.epiread) summary['Epiread'] = 'Yes' summary['Output dir'] = params.outdir @@ -534,7 +548,7 @@ if( !params.fasta_index && params.aligner == 'bwameth' || !params.fasta_index & file fasta from ch_fasta_for_makeFastaIndex output: - file "${fasta}.fai" into ch_fasta_index_for_methyldackel,ch_fasta_index_for_biscuitQC,ch_fasta_index_for_createVCF,ch_fasta_index_for_epiread + file "${fasta}.fai" into ch_fasta_index_for_methyldackel,ch_fasta_index_for_biscuitQC,ch_fasta_index_for_createVCF,ch_fasta_for_create_whitelist,ch_fasta_index_for_epiread script: """ @@ -1304,9 +1318,38 @@ if( params.aligner == 'biscuit' ){ else { ch_reformattedSNP_for_SNP = Channel.empty() } + + + /* + * STEP 7.2 - Create whitelist for SNP calling + */ + if ( !params.whitelist) { + process create_whitelist { + tag "$blacklist" + publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? 
it : null }, mode: params.publish_dir_mode + input: + file blacklist from ch_blacklist_for_create_whitelist + file fasta_index from ch_fasta_for_create_whitelist + + output: + file("whitelist.${name}.bed.gz" ) into ch_whitelist_for_SNP, ch_whitelist_for_epiread + file "sizes.${name}" + script: + name = assembly_name - '.fa' + + """ + cut -f1,2 $fasta_index > sizes.${name} + bedtools sort -g sizes.${name} -i $blacklist > ${blacklist.baseName}.sorted.bed + bedtools complement -i ${blacklist.baseName}.sorted.bed -g sizes.${name} | grep -v _ | bgzip > whitelist.${name}.bed.gz + """ + } + } + else { + ch_fasta_for_create_whitelist.close() + } /* - * STEP 7.2 - SNP file generation for the epiread convertion + * STEP 7.3 - SNP file generation for the epiread convertion */ process get_SNP_file { tag "$name" @@ -1338,7 +1381,7 @@ if( params.aligner == 'biscuit' ){ } /* - * STEP 7.3 - Convert bam to epiread file format + * STEP 7.4 - Convert bam to epiread file format */ process epiread_convertion { tag "$name" diff --git a/nextflow.config b/nextflow.config index 0157458e..3080b821 100644 --- a/nextflow.config +++ b/nextflow.config @@ -58,6 +58,7 @@ params { bwa_biscuit_index = false whitelist = false + blacklist = false common_dbsnp = false save_pileup_file = false save_snp_file = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 9aa2b781..e9a40cb6 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -391,8 +391,14 @@ "whitelist": { "type": "string", "description": "Path to the file that is the complement of blacklist.", - "help_text": "The whitelist is needed for SNP file generation.\nThe whitelist can be created with the following steps:\n
    \n
  1. Download the blacklist for your wanted genome from [here]( https://github.com/Boyle-Lab/Blacklist/tree/master/lists)
  2. \n
  3. Run: `bedtools complement -i your_black_list -g your_genome_chrome_sizes | grep -v _ | bgzip > whitelist.bed.gz`\nFor more instruction, run `bedtools complement`
  4. \n
\nRelevant only if `--epiread` is set.", - "fa_icon": "fab fa-buromobelexperte" + "help_text": "The whitelist is needed for SNP file generation.\nThe whitelist will be created automatically if a blacklist is probided with `--blacklist`\nRelevant only if `--epiread` is set.", + "fa_icon": "far fa-list-alt" + }, + "blacklist": { + "type": "string", + "description": "Path to the file with problematic regions of the genome, known as blacklist.", + "help_text": "The whitelist is needed for white list creation, which is needed for SNP file generation.\nRelevant only if `--epiread` is set.", + "fa_icon": "fas fa-list-alt" }, "debug_epiread": { "type": "boolean", From 6efc3d66324e8f566cb98ec9e92112263bb6b4fa Mon Sep 17 00:00:00 2001 From: ekushele Date: Mon, 15 Feb 2021 11:10:41 +0200 Subject: [PATCH 33/56] update CHANGELOG.md --- CHANGELOG.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 505d1a6c..af4bd63f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ # nf-core/methylseq -## v1.6dev - [2020-02-11] +## v1.6dev - [2020-02-15] ### Pipeline Updates @@ -16,11 +16,21 @@ * _new_: samblaster `0.1.24` * _new_: bedtools `2.29.1` -* _new_: biscuit `0.3.16` +* _new_: biscuit `0.3.16.20200420` * _new_: bcftools`1.10` * _new_: parallel `20201122` * _new_: gawk `5.1.0` +* python `3.7.3` > `3.8.5` +* markdown `3.1.1` > `3.3.3` +* pymdown-extensions `6.0` > `8.1.1` +* pygments `2.6.1` > `2.7.4` +* pigz=2.3.4 > `2.5` +* trim-galore=0.6.5 > `0.6.6` * samtools `1.9` > `1.10` +* bowtie2=2.3.5 > `2.4.2` +* hisat2=2.2.0 > `2.2.1` +* bismark=0.22.3 > `0.23.0` +* preseq=2.0.3 > `3.1.2` * methyldackel `0.5.0` > `0.5.1` ## [v1.5](https://github.com/nf-core/methylseq/releases/tag/1.5) - 2020-04-09 From 37f01fcd4d24118de3e9568f636543ec67a495b0 Mon Sep 17 00:00:00 2001 From: ekushele Date: Mon, 15 Feb 2021 11:11:41 +0200 Subject: [PATCH 34/56] update CHANGELOG.md --- CHANGELOG.md | 12 ++++++------ 1 file changed, 6 
insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index af4bd63f..1c735fe9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,13 +24,13 @@ * markdown `3.1.1` > `3.3.3` * pymdown-extensions `6.0` > `8.1.1` * pygments `2.6.1` > `2.7.4` -* pigz=2.3.4 > `2.5` -* trim-galore=0.6.5 > `0.6.6` +* pigz `2.3.4` > `2.5` +* trim-galore `0.6.5` > `0.6.6` * samtools `1.9` > `1.10` -* bowtie2=2.3.5 > `2.4.2` -* hisat2=2.2.0 > `2.2.1` -* bismark=0.22.3 > `0.23.0` -* preseq=2.0.3 > `3.1.2` +* bowtie2 `2.3.5` > `2.4.2` +* hisat2 `2.2.0` > `2.2.1` +* bismark `0.22.3` > `0.23.0` +* preseq `2.0.3` > `3.1.2` * methyldackel `0.5.0` > `0.5.1` ## [v1.5](https://github.com/nf-core/methylseq/releases/tag/1.5) - 2020-04-09 From 2f9aa2baf3da0e17fff9650701ebe6d69fea1197 Mon Sep 17 00:00:00 2001 From: ekushele Date: Mon, 15 Feb 2021 11:20:20 +0200 Subject: [PATCH 35/56] clear markdown lint problems --- README.md | 2 +- docs/usage.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e51d5a73..383a7b73 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ Choose between workflows by using `--aligner bismark` (default, uses bowtie2 for | Sample complexity | Preseq | Preseq | Preseq | | Project Report | MultiQC | MultiQC | MultiQC | -## Quick Start +## Quick Start 1. 
Install [`nextflow`](https://nf-co.re/usage/installation) diff --git a/docs/usage.md b/docs/usage.md index 5f8e49f7..0ecb4708 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -140,4 +140,4 @@ We recommend adding the following line to your environment to limit this (typica ```bash NXF_OPTS='-Xms1g -Xmx4g' -``` \ No newline at end of file +``` From 022eafff1566b63226df645ca3916b5b49bc836a Mon Sep 17 00:00:00 2001 From: ekushele Date: Mon, 15 Feb 2021 11:55:41 +0200 Subject: [PATCH 36/56] change environment.yml and CHANELOG.md to pass nf-core lint --- CHANGELOG.md | 5 +++-- environment.yml | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c735fe9..e240f6e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,8 +14,8 @@ ### Software updates -* _new_: samblaster `0.1.24` -* _new_: bedtools `2.29.1` +* _new_: samblaster `0.1.26` +* _new_: bedtools `2.30.0` * _new_: biscuit `0.3.16.20200420` * _new_: bcftools`1.10` * _new_: parallel `20201122` @@ -31,6 +31,7 @@ * hisat2 `2.2.0` > `2.2.1` * bismark `0.22.3` > `0.23.0` * preseq `2.0.3` > `3.1.2` +* picard `2.22.2` > `2.25.0` * methyldackel `0.5.0` > `0.5.1` ## [v1.5](https://github.com/nf-core/methylseq/releases/tag/1.5) - 2020-04-09 diff --git a/environment.yml b/environment.yml index 8005170b..7290da82 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: nf-core-methylseq-1.5 +name: nf-core-methylseq-1.6dev channels: - conda-forge - bioconda @@ -23,12 +23,12 @@ dependencies: - bioconda::preseq=3.1.2 - bioconda::multiqc=1.9 # bwa-meth pipeline - - bioconda::picard=2.22.2 + - bioconda::picard=2.25.0 - bioconda::bwameth=0.2.2 - bioconda::methyldackel=0.5.1 # added - - bioconda::samblaster=0.1.24 - - bioconda::bedtools=2.29.1 + - bioconda::samblaster=0.1.26 + - bioconda::bedtools=2.30.0 - bioconda::biscuit=0.3.16.20200420 - 
bioconda::bcftools=1.10 - conda-forge::parallel=20201122 From 68bbb5b7befdf613dcf7b229cefe0b38247006b6 Mon Sep 17 00:00:00 2001 From: ekushele Date: Mon, 15 Feb 2021 12:17:24 +0200 Subject: [PATCH 37/56] fix nf-core CI: '\--epiread to \'--epiread --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 8e5e2a88..c724c9dd 100644 --- a/main.nf +++ b/main.nf @@ -291,7 +291,7 @@ if (params.input_paths) { } if (params.epiread) { - assert params.blacklist || params.whitelist : "Cannot find any blacklist/whitelist file matching: ${params.whitelist}\nEither whitelist or blacklist are needed if '\--epiread\' is specified" + assert params.blacklist || params.whitelist : "Cannot find any blacklist/whitelist file matching: ${params.whitelist}\nEither whitelist or blacklist are needed if \'--epiread\' is specified" if (params.whitelist) { Channel From 3a4fc8111aff07b0c299e7875d436a55f3abfd8f Mon Sep 17 00:00:00 2001 From: ekushele Date: Mon, 15 Feb 2021 12:28:44 +0200 Subject: [PATCH 38/56] add biscuit check to epiread if --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index c724c9dd..958530a9 100644 --- a/main.nf +++ b/main.nf @@ -290,7 +290,7 @@ if (params.input_paths) { .into { ch_read_files_fastqc; ch_read_files_trimming } } -if (params.epiread) { +if (params.aligner == 'biscuit' && params.epiread) { assert params.blacklist || params.whitelist : "Cannot find any blacklist/whitelist file matching: ${params.whitelist}\nEither whitelist or blacklist are needed if \'--epiread\' is specified" if (params.whitelist) { From bd2615a691e66b2acebe7b24f57fcfb0274821ee Mon Sep 17 00:00:00 2001 From: ekushele Date: Mon, 15 Feb 2021 12:45:24 +0200 Subject: [PATCH 39/56] add biscuit before check to epiread if --- main.nf | 54 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/main.nf b/main.nf index 958530a9..6d18b745 100644 --- 
a/main.nf +++ b/main.nf @@ -290,32 +290,34 @@ if (params.input_paths) { .into { ch_read_files_fastqc; ch_read_files_trimming } } -if (params.aligner == 'biscuit' && params.epiread) { - assert params.blacklist || params.whitelist : "Cannot find any blacklist/whitelist file matching: ${params.whitelist}\nEither whitelist or blacklist are needed if \'--epiread\' is specified" - - if (params.whitelist) { - Channel - .fromPath(params.whitelist, checkIfExists: true) - .ifEmpty { exit 1, "Cannot find any whitelist file matching: ${params.whitelist}" } - .into { ch_whitelist_for_SNP; ch_whitelist_for_epiread} - } - else { - Channel - .fromPath(params.blacklist, checkIfExists: true) - .ifEmpty { exit 1, "Cannot find any blacklist file matching: ${params.blacklist}" } - .set { ch_blacklist_for_create_whitelist;} - } - - if (params.common_dbsnp) { - Channel - .fromPath(params.common_dbsnp, checkIfExists: true) - .ifEmpty { exit 1, "Cannot find any dbSNP file matching: ${params.common_dbsnp}\n" } - .set { ch_commonSNP_for_SNP; } - } -} -else -{ - ch_fasta_for_create_whitelist.close() +if (params.aligner == 'biscuit') { + if (params.epiread) { + assert params.blacklist || params.whitelist : "Cannot find any blacklist/whitelist file matching: ${params.whitelist}\nEither whitelist or blacklist are needed if \'--epiread\' is specified" + + if (params.whitelist) { + Channel + .fromPath(params.whitelist, checkIfExists: true) + .ifEmpty { exit 1, "Cannot find any whitelist file matching: ${params.whitelist}" } + .into { ch_whitelist_for_SNP; ch_whitelist_for_epiread} + } + else { + Channel + .fromPath(params.blacklist, checkIfExists: true) + .ifEmpty { exit 1, "Cannot find any blacklist file matching: ${params.blacklist}" } + .set { ch_blacklist_for_create_whitelist;} + } + + if (params.common_dbsnp) { + Channel + .fromPath(params.common_dbsnp, checkIfExists: true) + .ifEmpty { exit 1, "Cannot find any dbSNP file matching: ${params.common_dbsnp}\n" } + .set { ch_commonSNP_for_SNP; } 
+ } + } + else + { + ch_fasta_for_create_whitelist.close() + } } // Header log info log.info nfcoreHeader() From f2f0cd1f15e403af6183c10f93a66b663209a1ee Mon Sep 17 00:00:00 2001 From: ekushele Date: Fri, 19 Feb 2021 07:53:03 +0200 Subject: [PATCH 40/56] add try and catch in epiread_pairedEnd_convertion --- bin/epiread_pairedEnd_convertion | Bin 178240 -> 178240 bytes bin/epiread_pairedEnd_convertion.cpp | 1329 +++++++++++++------------- 2 files changed, 643 insertions(+), 686 deletions(-) diff --git a/bin/epiread_pairedEnd_convertion b/bin/epiread_pairedEnd_convertion index 06bd8cd7f1103594ffb159e5383febb32a046b7a..eebec5dbe5523b25fd25c1ccb1e9468f29dc9817 100755 GIT binary patch delta 32374 zcmZ{N30zOv`~SVSWlTa*NohqwQiv=i6fq(Do~(ljF_^5M3>nFXI5;6>8QWN*?8amn z*@hU~7)+MolNw?$7~}JQpXFBi{{Hv%n%DcB_w$_RJm)#jIp^N{xtg3}o19{s%nGVa z$R&keA(u#1)&KL*ZlW5R)k*z9b?AC%+l;*K*_K*}=0el58+@1%TaZ=3qQd7jX6_e? z2?CT$l@!HWsjR#w^=y}DA(`Zet-m&NKY0oW-jn50>41aQer?uOulQfvl`l>(WGy);o8`*$+2e9>NQfSc`ZZRUOgk{6e%hiCI9&LquCs3 zOo}xFl5Hbl#NB>3giF zb0j+0#qhsfj6xR$zASR0=i_X)ScBo8n$+p2QjsNZbq;MOflEgHlCMZE7uf+a=|bEmC2A?(@2ppJho7&%GFB} z&O%gq&jyHJO@M{Z5%5mQrz4Yp)jPQhy3o3WHPTrgh- z?B*MIksOg9X$-pjGxIQHe@i5)Cri}ilaVP_U4v95GWkqo@=a4Kp1l!<{Kyno%CTdR z(i@glW{A`Z!k8_XP))6*XY=0+5$h0D1|V$Q-#T;z>%O-)!gqQ}IyWI!u_dpkDbWM(@# zA`=l%$f{OhhRd&*9##_Ur1ai~;I25uRB}xlxf{$pLi&fvsf-JgczCTWnlOh?qRpxR zR{F!V0ZMF6rK_ExS7^Z}`45xTYqOcpZmww|vo-g0^uV^HU+W>-iC}WLb+FG8-hT~H5FJ4w#1Y;ptLB=VSQjZ)Tlw%= zAVT~_AR@@T&^DES#E=UI{$z&?HvRU3TnVjaNY4>QxASFE5?wc8G=~|ThE_0YQ(`ou zp)k4)O3AkhQu%_=H$T>*BWochdpR@2cSb!7^_de9VpcMGGxZvb(Wm9w3pOne3u~8| zx3@4-qL3vO$}NCvyd$2{UgS`l;HoApA=6$qtYi$!7;-C!Xkk&+1{6z&BJz-6NW7p( zGBnJo{RS3tBS<*(-`^;wSS{wkV_~fACr0cCf$)p#T}QRVuhe34B+N}!$*r(qJ-Tvh zNwI8eg25&5nJq_IP2|-TTxv{8^7sZP-e5ShVY=b|5|5d8rO$O1nPVuEyjSz?8Bf)d-%j|2=1BSlVGzdQO&x*KGN#pw=^L!e2~W61PKKR1($gqAKE-YidlMghx7d z2dQt4`hY@6ctl;*k;FyRu2zf?O1bFvk7s04girXvGCc3QYf*_}i37{T{P2l`dvirq zcfgiZ>d;VI_ZMm1p@mwFOzq&;wHb_H0E(RWtp!*yr%ZGDC`yY{RhH-A_-=vpvO3Hn 
z5|WH{WqS1#Et1*!;f>n@d45zWG@H*@*^=T8wOp>7xJf%P%^7Up7dg_FH0;={`xK~R z&)~e?G0&|y)~T;9>^CuBwBFl~+>c^BD7OZFq(J9&jx=58g$%N#JXQ5utSSbqp)R6m{} z!bq;^zL45U<*QFgW~T^W@|d-W;FI$Nf&4W64)2EvnA&9`Zk>l{AL^6Yox7-&$i>bc zl~-_WG2O`pkSQke&lf(9!^?>DA$22-qv}?zTPhLGIgziT>UMqxfAkdP z4krEiiG!C)YxXE%>z@8^8w8H48T6QTQt zcN6iPh4I28GNg-lb2Bsxn~DRO^D7^f7;D9>$fFu9uJNkA0e9)#?lCvFVgj2*tYHp8 z`}c~zwlh1%BrZhb_smBel}stc$fIO*3?kuOTc{Rfc2^&@75TpFMD0RmmUFk7s>hx^ z!V2xnrzL>RoYBZ9(CC=9hxCjNRb$Aq=vHc7aw)o&q3?ZeK2;`EqUbGCRws&To~#ZN z)nZxgDXP2=cp5IMda?wGYP`%HA5pcF)#|9?xV6N28$>2|_fz|j4c!9Tc2V6R zs?pM|Wuh7{t8+y)Q&uO7YM!i)6xCu`?IWtGvf5EpD}UmxE9u&&k@LU5^YPhz1i*(^ zcg^N12drj!kbnWM+S%X9zCLxdy}yy)`qb4H{6@6C_0$mJ*LQ?=>ke7g_iK;$TSb`8 zbmL1go|!QkQ#m-^-AZcrYuBafZQezI?4qIw0#`+kAA|$E395gIYNk}aC#uo1dii&f z)i11O@fOkRt*)Y1w%&|guq+(!Y#~+phiV(QkZ%3ksl&;#{Jff4P1FEYHNz{OvY5*BC zpss!KZ`_|~qz()D{olxj0Uq`_g3XsKVt!?SqoLhREY{cT){d{?06C#^p{y^=Xvs|| z!^KN(U>Pn`avjTXd6H{VhAWnw2RMV?Nvb3(f-GoX;*XsS;wyx>0C&l{o07f@TDK4{ zxpQT>RLSix!(~ct1LF+xHa}XDMvw)`C3{z~q^Fi?uTSK#GF*V1m#$?veM(!Fa$;WV zl_6aC>^he+B~N#jWY(a%PIoXp`T+I$JbRNI8RQd?0g>Dw*eowCqp8pFET$s1>GL;; z-Qd=S^c$?viU}+UQj$jh%W6SiZ4}u)bUgJvaZuGb{4@s`+JYR-E<7JBm zz1cgWDoOdD^K}v?bIXSrO5r!s$3K%TQ&~%LVX$43U01nOc3w@I$}Q8I-ljWi15X-| z$}yp;AL$YkqTouFHBdWX!su`l1BbUZfQH_<=Z?7yH9#&&hy67||oXt)|A~w$qY@W-~ zN&n%k)PZEh@b+p=^4suQhNw%tuYB2jiWr%AiIHm>C90lsdG!_5Fjh?!PFjCptfPtNt z$&C>nh7`fFNo4Fy88KNMhiX9_d&uFtsJ?nmQ*#>$rsrKFyd?_p6_43Ja*xC|F9xvOQkc*z|v!`Vr02e>jjZ>ppTqy5O=Bkc_{ zS@Exu%`r^Q*~l`+5+&EY43{CfkTP73JT1fN zX>p^J6S?to8G~j3^g$CGs=*9hJ5)8asAS#eVZ~OJxv;v z;q=Y1Iyimrg~bZ2DsrSZnfK)gwFW8p(notqiT&vF+Tio#?r1kPl>9q7S(~+z%pX%v z?U9u^#$VNT{7mx4)*U$KXKt?eJ$4s;-)z-REI-Gy>^AZkdNY+3*OcNnxlDWrV%}Qc zdo+eA92~P*@*0x@I0Q9YF8XTUP1;TV&6KZs7o&tzeoyV{>@j|hgP%{3k@0@oKR*!?AE(VZMbwF{ z@faF9F$ixJCQsa;4YwfGCw1^B{E0Wqlzlmzv3v7;x)OE>P)&%cG3R(iHw`#ayg;C_T5qPZ-R^8 zS7=#)+{BpJrlZHV7cT3r=)Y18~V-7MprKD<+9IDL$JO1PAy zx9>508tWSPMm=2Sd)1n6LfUoRvksVohTx9cS44RfCrg+>TR!!nUv_5{2pJyrm 
z@ZakEp`Qo+l8xY(#%N^>nkXBu4<<{$aw3`2TZB7F>L9b1j28e&`8-js96=29iM@S* zcg7{TNIdSzoq58OS%*l&86H~t0n&9wb-RZ`)sE@FFIzHYMlbE)A#!1ci`tVsp7BM! z3$H)*QQvMV1#%zv&J||=XvTevpSen#@Rn4Z<>AqFKldU)n(l}p!`*0EMcU1(rP`8l zv)n2_%Vr|6=|4R{(q=W&cKk!~X4Nz-5|Rw&+XC*Ycq29w<~s4zxwx>rpYr$0V%)hh)!gr@D}TW_vX)NE8uE%YuU`Np^2B@wlD9H$eJ@&GoB?Y`=!^8=fMLwI;T4UX7R4yP_(7r_427+DEeI+|~ZsMaZ{x!!PaPx{B1D087ac*(~|sJnMZh zXYjL0QjTN__wXhevU+C^`S@*1?ZQmba&D++-fk|;lWXan@Qj^i%-GXRk~X)c8c2Sf z+s31rP_^SHL#}CvPz{jPXc9PY%4dzClVTN>wQHWOsz#9`Nj0&T-b`wv{WXm^&kxmh zO(XpP`?=lR!P=cli8w0``9!YE5BXy|$nN9_ zZORVvI@!BscU>|4e-xXMCJSq;_N3oJcej_@-7~#%7*|2gU4;P-*{@O%#r8LzJZXyo>b2gD0i=u-XeXkGG_@HE%(;121 zE{GV`r51XDXj)*;p>q%EIYX zRyD2wOMW+cI=MmFl(va+K_B?9%(LY zwG&ks=T51jYA086XR>5*fK&g6=YEpnS zGEfYoSauO5s(Ke8s1_!`qj;%4ZxiXWq){Zn#@00hO?7;XBpR3^c^j^>@s@slFMM6kI+k% z+#;dp!H>~QNhW|arSQn|vn^*MaarnGe-mezhs9E{1r!TTTImDV_iS%6WT}h8^K~qm z5ewMN$;~{97?%cg8i>?0-B_>3y6N4LSbJsgSVv3s)nXp?*4!f`zm#6fAlAzo86INZ zO)(|bj@>D-ww;lfIjQe0tm)0Gp_*!{gV{Zb$SVas`Iy+~qGf{@km>P{m)u>(=`my+ z`?Tm>8AaXvesHF<7*9b9OFcgM(#WbZ`ieXnlFD$NlABnD3zOX7GF&V;1KTM&l_3+Q zVo(__LvmhaxE#qk-0OzFn;Ott9Cd zaMhA`BdW;39O>dqwkAsZcqpo2W@aeYnmu7bf-AgEQ?*C#IENJ-;a<&ydIWaB%P`M^<}@YQBEBmuaEj zs|GR^@kB5?oeZLlwdt!!Ds^x5K244=D0Vc9)*q&RY;sbphTddV_S>rYnh-n}F&rYD zR7e5!F&tjaUC~J!Q4RQyS;gm+yh;2=ILj$xr55Ogr613cLC{Xh5ubVli7&4@mH1O# z`m>eUNU_Siq5IQS`V$I)@7y2#(3jPe&(2s)#U9DNm5r92SK*yUEMbl`AqUcGY0p=Y z-_vTkB$pA?S;hR1_=zd~wwlkyYyFysTnZsm_;*^0G?IiI-LE zBhG@rZKE+H{x-6{6$^1P%kR(YTxT)iA5p%6A-+fBo3lx{0qT_ebEo8MY(D*n+iDN3 zh?3CN)eV2c;i%-o!g%)kG5f%?WJ%b=Z0Xt;QTrZ73;lIf5rqPbqKqDo#w?~+@_lH@ zfJb32%g`8jcX*y_+6oipKON0A74VtQHBDT>Lz08c8#4mqGlq1-&S5&jLd}P5`n6=# z*H@A{Ydk_6n1Z21VOOcbYLNW<6jK1q7p&%s^OUI;?~9+?{m7g(Ee(BGWEZf>Fgn&_ znPv=X4+=q^!Hwmbeqe4H#rGd(uE@%4eb@>wr!yyhW==@08ocayn@tklzOn5u#p=;j z0AetRn+qAq8lAKl$W$En3e6r&(-^WT7NTZC&A`<9v3m%kV~Ws<6I!d7mL0STUHD!j z99++uiRB(nR-{kS787E-wr?#*!uT^Oy^D}=#ch-Exxk6Ywm}d z$)ojMt8PI9$lv0NUSFoOav2HF=o0+T5>A^UjnTdk!vUVY`JfBk*$7f1OlX3SiE~Y- zxVLO&|EdljZviBD!lCrTTX|vIxW`vPKJaI)`eee! 
zD14-KY@>^YsN~#n(H1Wyo}1h&<6W+Bm<@3#|x z!|@Sw#M@*$(=}9b5l2NyJ$EV`myD$5mT=FrXi?}Uj+7>H7tRJJXSzR0+)~@BX)^Qq z#jzUXhuxI|52F^D()$(n9rjXRF?%L`_By+2?ib>nr1TVM#+gd|Axquil z>tuy&^S9Qv3GEH@xn`HdxU7^Z7Fd-s9OanoZ`F_Q>;X00`ORR?*chd;=XkaPTQU zhvd%Is-$9;t?Qpmh#1}H_phw;x3cqrWOn8SZHXc4i@j~s#vfzJ@iu_yF-?9lvcW+sW2h$T=61aiJ0!kX1Wu2YL|`$Cu?A zR&8-=N_*BT^Ku~wwwudk`}=HNol7syCdadW(f*oEjM>gwlbM(>XYx~aX!WmVGQAUM zhUI#aNAVBSo0-I6f1o9S4HAFCkrY3*B2n)?2YyB_Jg!0>pRy-p{}}D!bnrv+U98Wfwfl-`)UF7S{~ z-WKfOrv6h!v-=!*GpDRm$BwCL|E$F0z8Z--;X!h9lC(t=vbvtwZt1pkEFxV4XCmJD zMuDMzm1F`RTi;z;uHRy4O5!tDOfXs#7tG~cm4@!@Ng(w5<@ zhgynp$DUfx}=C3!) zd9meQHn|S|pGGE7g?_AwKNpJTRngq^Be!wN6cJmI8RK@R$7aR;+E~>(3?Z?<29a~u z{;N=xdBPme%Wx)-ufMmKWpdB7`0(c-;%D+N`HqGp_9k`ne9`i z@dHZw*ffAN|7}UrjeY*7#il~Es4rXWxXQ37LAcnj6*0i@U zU&xUwd-fqk`JUwV-RIinp5(v#^~kGxn;Kq;=DlSgT8E^6m;Od?{d)=n_c?L{PQ5V4 z)yagvDiF6~8}ip5C$+=f$liP($G{#!`7lSW{H#Y-QGOLwTN{n8E;d}><$t=#dx381 zvhERvJsf$?G0EQAZ4v#J!#(zH9_WO1RJ!(eDK zC;sHfop8zesj!Zw&1(k*XO}a02W_&75L!l#TWMb+Qef2sE zWP8!9(CwpyPUihFC67{8%;|tb1hA^!yL*C2owpo5Jt7@tZY4gfMn;w!C`|5+L6}2ESZ=AJZ ztw`)EudG#X#%ea#o5M8xaUMwSud7Bp-%Y{~f#lsmZE>@#sP}=Y`URQ&uRnfA+x72s zZ9`L1??XfF+vZpYuH@?veXGuHf-c46;AfRw)5oUd&WCPlDhc?{!6|lvqV$e5HXIG? 
z8C8$n5>zl*WDWSwT3fDGtJ-1cn7EN6ygWxH3?CUkdCY|IEmwr7uKNSjDk|-vs_t~0 zs!lKrYhgAQ*(=J3mS(dbI$QpWmn;Co;%pwoI70CoT_ z2j&5f0gHflfE6;#<`2Nyz?z6`7_c?)0B{fxFLKPAfp>r-3<$C92K@T~;AG%t;B8m?Gd-;3MGYc%5wP zq$s}v1AsO0qTCRTLL>_Dz!AVyU?MORm<_xJ%mcmx76BcqD~cz6rU(YM1`Yxy0uzBB zfEmE1+s)=2U>GnT*k`BNT)|mUrUHF|>wuBK)LmwSc`^zwcfn(z6MlL+25b$y1MCC* z089YZtbybLwg$Gw&tW5gbAj`KTY#H^7lEgNkAaVX6|lEhx+sbdur_c2FdVo8I2w32 z(_l6)K*1kB|Lp`01zrT60loqj107ryrTIRD8aM>l3-}0_0BnhUFC91xcntUh@DA`A z@B=Ukzo6ExiH?Ecz|p`Fz?Hyxz>NkJHlwf`cpB)uAIk*T0a(FJQF;J%ne!V%0B z&lUrBY|ON>1GQW%PoN#}6)+Cy;G-xRz^1@6z+S*2;ACLb zSws#v9e4=15NNoC!d4XC0`~)*>tZU-VVMED0tW#H0cQfUff>L*fjPj+=P`0%Q=sCD z^$zp|E(C@F9|B{64i_+TU^p-XI2D)!JbA%vFz2IS0-@Byh%UkaurV+UI06_8%mgL^ zZvZoZ#lRe3ze`vxz?{p-_WFu)6&L`t%tPdWYk&#BC%|-|*A>h;um|uCaMV=;Hi!mD zmaAB&z^A}Spxv+7Ab_iZslc7UOrZNUgd7+SdC_ zDX=y02jC!J>s#1hfL{PN1K$EK0;~RpTmyCm+BU-a_q>hs9fg*_C}1Qo4mb?B2*|$F z-wBNW9R`34fknVHpltxQUZ5ZFBrpni4;aVlcW^oa?*OxbZ-Cc;HSZ#Fz&N0DAlCo6 zdzkA$MY#ox1-=3%0`305T(dgxEUd5lE#MB|Ti|J+b5ooF zz+j*m*bmt75!wSIfoZ^Dz(c?Vz+1orz_-8~K<6Muxep9Bpzs`pen6W)VE|Ygm9LUNx%f)TwogT7Vr?T?Gv1az}Y~{=7<=uHgFHn z5RQVg2^$R12bc;B17-r#f!BcNfv!1h4PmWnbMSQ{7z3<3T`Gw31+5Zp@wgMooBknF%V z1{4xe=mN|D9tY+CZvpdxFM&!Z&i9wt8G-A7VZcMcSm0G)BCzt`Xb+49<^Z<=^MMC| zN*g%z8ts7zz%XDAFcxUIghC<;?f-!RV0T~+a0oCTI1Z?U;f@992|NG{1Lgu_fwpha z9vB470JaC_0DA!Qfun#*TMP_X8))+mQwMYhj%ce4SEiz{0K`e)HsHU&vrO?l);lm8 zXx9!?@-IRMoB-?xTn$VB-T|foi-CuLwjU6Bpf512J(k}Gx}~zZT611ZpEgxptY<2) zy_k;mRb8yoQGQ3~R8c*=9T#ICLse;8*>1kpzLNEqwx3l%wI=?%ga5o1({Lsw7b>lxk;5vT7t`G?sN}Os)MEPBPIcG%NOezp)l!Y5Km7_&Dhn1s5IiHmy>0hw1vMP$}1nr5kwt6x3sS0yDXhc=Du4cQ0jz?9i zu!JsVXt{*$W~eNomzdfIX2nD6v4mExrgqTkETMg>sowtgEO03Qjon+1Zs$c7b1f|{ zTOcsF^HE|UA79sO)({=bf9aNLs=H03ui5-JWR^wr6029Mhm(cbxl7-muEj5=P7c7t z#k3hvJxhB#s2(=|G%%aJn3?r-HlrWYwV>4p^hYMm^h4TOLE4u-X3{f^cBP( z(QKY%1HK228-2i$b1f`J+VDJJX?6{E8uq}6RTX+l6YA=u*0Gs5!fgK062v;%*$Ew3 zkAVZusGp*foKzp1nRCo$ZziurH#(ui)kb;-dfG-K_4ij@R#wM+?KIMlUaAwVSzWDQ zbHHdeKV&2Lhx%8C!HG!BZ&+;Z(mtqbIeeO2XewIT99?2I4`a&P={gA2*YwBgYAtO| 
zDjn~JwNeu!Jxx_-#4?Y1F}O(E0&LEvn$0*C70bWqSZCEmy~U)iG{YI0bd9OF(j1g+ z@^n2Ty#&2Z`KY_HcH_`m4^tnpQar)uVfo#HPB_!)Qbxy6x|*U8=C?Gk22!rqQu?f+ z>f*BqRNPXtxgwgCnC&2%uvFU#dB3G}Sq;@k>#&?2sew2EySgE0#WfJLuFK8lm#oi` zRC7_iZ5&p}xOAufE>P~ef_7!tV+9@0u-^*0Sk`wl?6rbk0@`#(yRFR5D*6U>M9t9^ ze*UvUtmbG}%+`Bu!-dAPin5X!!fZ$hthvxkR{4M~8=;biiu#^jXLc&Dq%CWz?$%8! z;pS~6jj0L$18S-jv@TrWtxe~oyNz+B**uc@QJv051MU1uo-H(|CZC|6@lU-%uJ*8TdFSp>^;s(*uk?#sf`h(N%aU$Wdn8tdl73yzZZ8mRWJ{Yk0bsr+;T4<|Eb9pEF@La=#=yMU}%4^K# zYGwQDTB^(5*lBB6{<-+rVhgDNUECR#+Kw;P^@H4P4Sj|wu#Q5x_8OMIF4l1<`>vt> zm^7o7S`OYv&QMMdNS9>9xM|gKmG|x+|p#DVNyj2e?XZZJo{^x}Z zdcl}lT8(r%9$S&mOi1dd|K9;)L1(WJo2Iiv!lm*EbRGcR7}ndDbcZ*38zt;mTRPyL zC7nKHoGG0 z>G;}MH@+x;#dX|SdxXJV8_V^uq`8i_mLhFjgzj{%<6>Qe^1`*WelTtZ*xR(3Yt3fd zT$hF((;A29vr+m%z6|m!*mFy!-QuFkCWpzRAP-;9SBG^R%01TedlaihjHfT^sIFE! zQHx(EZml{PV&leiD)$z`O6+P4&)ZPd1iUu-xPrCzbOzX|&ap=r)6P@mZ^;|!L zuJgfQeKY8B)YV_;eWv}4s&x@cZ37RL)gYWZ#k6f*wU+8hm)657O~gO+(0n$CRi42r z4jbr}y08?zft~@XBe>#9U##r0tXQ4qHBl|8CkUH(yv&ScUi?e_k%{U~+Q}Ea9-(6Z z+79%J4bGaqyxS@~ux2mtR&1c>ec|IHZrRG$1&`>oQav=hMH|*bzS(Z%`BrBUByJo3 z|M|vV?)lKA_0^hIDEV#Vn~-%8%FeKW^UcQ=XHh#?=np4L=1S|u*Absf$Y+S&tfEj} zM6cAx{KTP{PG7L%A{2MiiS^;#ZPt~CnoswN+7$**nVJV~k?b2_$iADzl8l3K|4nS? 
zcj9Z0y=L^?gx3%3oX(=t8>sH}MsL>ZUZuVv&n#kiDv6s^*x5!;z;yl9-i%MN3l z*bUgyJ`M9bA?Hi`Yii?%`E%Vu-Lb%|N4w$veGBc%xWQZKcwoixTd)f;S1i0J^;6x{ z0kmyHWJ8`GW;u!uZK!%!JGkR^d@CD{6SZroR;ZY<75}6GomsB;q}{5jHR=q4hPKUY z#=cTIfGFXWC#ac_zhcH}Y3;U^7_y}JD_E8`k!Z-R)uOu_BF%QtU(l=iJ^hQpaazeA zkV6|XI8VDUxJ<_}$frgIf6!eFexbhrG-W$|=8q9tZ>NT z&|y2>%`kF1y~MB?+5;=8F>ml65gllL_02^*m-1Dn8~u*Ph`?i4o_v8>`LD~YV)vQq?_uh9%CzTHH1 z_h00Vt!by(JebYO8Hl24%M z1-6+~w#~#cVmS3;M1Cz0i6FG@d#HasEZV%gb z-L=#`)Fud%vQbzK3&Kg4F3PbeYb*CK|6Hs$qfPoAx{&Ga5b_)*-@J$7&za$v!8=iu zc`Rz9ydHL&L3fw5+YENMNxLX(8--n~&CuOOlbd1E4+v9Mk3b!u=Rm0&*w9?9YvX921bqDw8tV9W?GtW#vXN?tF*q?KaVm!8ma; zGDVIIs*j)VGWk~bIbZdtFQ7lSP(5g33mD7E6wk#Ota3>x(qkU-I3n!6UTS9CNFoVzS2ON*f)t+A;`@#5gt@DYFM2EcqIRNlS|jyxSn6dm zY4d$z>E$tLU>RvKlX~u>=W%OE?OJ1t>#$FHu$8*E#y}SCqoERGTC3h_4{F~Aws9|M zS$PCJ3d@pvPZPQWdg^w14jyVGi@i2*SCi?)Ht;;|1J_OE;hE;~Wq*^#$|MnF%$SFIkOngNo_1&OhIQtl z7Sp8+p3=PlZ|iKlw8W&9Y);x&C1|G%XlFag~+dd(;>eUo4eVN!pt9pnIXpih_aEJ}viKez! zD>QaFWHxU{HrKxyBkqUkQ-)Oz%grGOKNKFO^~1rn zK1@3^3^*)y=FO1#9pe@P?r5iDyxI;uWFI*Zs_9JOh;KL<^t7 zLD@FLhXwRSxY|*DOIt*!%^E4@HMpx8b<|?PXUYLftADI4E?QdMwX%3@X?4!ZV!d_m zkJidBpW$r~K02Re+wkCm2z909LC?&o z5Il}}es#s;EBo`+?XS|8U#kA9nR<;@tGg_ED#Qb&T3<_{w(vOZF&b*u>73DOM?;0z zLe5taiWl%MjalnZAb5M(JoGcczd9!fY&HDaBKex^Nf{S=X~C{zlAl;C>X;sWRjeTF zY+T2y^_BKg&|3=TZx#fO8Gdb-d=Z^EMy=j?V}_vh(HlP(_F^6j-a~-mg$~&`8s8N} zWyw>?+hx$JW7H0Y(N~0^s+8YB?p8&+kkNu*@A}vkXe;ck`&raG3Q#6j61-oAAf`%w zAv$FJC*})6_gASR_yhMjPw80IzVoQRg%X}@_{F|dVT~Q0iFyagYnTnjpSdmw+|uwX z0!}i1^b`7Wtm@zRTR9ner)DYd@u$$&`-!Y3+AY07dwivKs8K$+zhc{9c8}1#U#ZnQ znJ$RtodhTjc%EfqXBlJN?Dvu{x+xTOH$O`L4YiL`tJg?4Cd4>F_{F|LVdkFEwh(v9 zJ}zi|5Po>LW#Y6f!Rs?HUGhf`eyV?2^2KxmGq<=%h$Sfgu*=L%z42+Y@scn8UGVy# zg0amq{bN)ehxLsIpK%z2?Iof0g#e{8@{egfrc0UFGgI(*M&nn36c^JI5aVsrZ{yTP zfzw|JX=|zVrfB)fzq$>R|cRsGexH z?jY6TRewWyXBBKC%$xcrg}$C^eGwJLyZ<71eg4?1KE`*IDXqKlTJjEm2z~aH#jX&4 zq3?NuSLvR3wO*$VcX&mSTZ>YF8z{D?*vbi(t6d3zH>|Vsa=Gd=HAnJCsMka!(SY^B zoW4a_2Fhl%I}@)f*P8v_&aA$tb0BWKC|k7dBS1OQSctvP3Ih8SzZx|WeE3s(exmBH zg}u-`@uvw?zm@auyV 
zERflS>E~B?m}n5Qkv^EL`VaO0N$~o_tZysi^hd!*2~YyCZ?L$mdLamX{CSdpB=bcd zcwBp-|A4QfW>H(YWy~LDZ7`W6AG&C9UWP;qK1ME3eWK4wejnYCfZgnKdAg63=CZnrW>@JI^yaed*r!g-?IFagZqmln z)EM<6T{;b+n(&V9n1)Sk-9366Xy`2`&`nw$fZdRdJ!Z2Ya5(X6x8xsf7j+~Rzwm!T zQL1WHl%^jgA>o76kCTf?ACzBTVgHp#DEi%W)xJihbq5zWVjG7mCoo(;ke{m-4uF(Qa^Nr&|h9C2wgu%@-eRkuOCkAr(8DEMb|k` z`_95mf1jU)67~#!_5D&BJtyk10+fT2_q-;ENs{l2t&bV?K2P_}LXKCKa(gKsjHzJq zb5{kSJAYB~J_UmJl=`7#g??4JRy-u1EqUvMf2YC>J-T@-!2H<)s%0f7cz8iQeRCF zd=8zHhyV|g&Gh}K@-!hXrhA$Aaj|Ho2h44{5I>;znYgL+PH#O!it$V39A^BGXszE# zbethtpQUXf4($1x5HFRkb(<-~uJWMJ1A0O7IW&2W+DJ>-PLIq{{c9ZGBihxG&3Dcg z&7V;9TXbIj`Jkaeh;Pt9CLZ)cG}E_@iHE7? zSGRA4xZQuE-cIssB!7wi_AMTP-rN^_v=mRCE1F%RUUQMo9d--;r4-+h;*8sZ(07k5 z^F*^EdGPAFke4L*@+U*p`GOCZ8%b4ZKj1sTSANe4dT_4Vw#FGzQE)KvtIq z7%iBmb`1Qi+)8V_M2P2=>+}c7o9NIagmusb(P^Lnh5d<3_LLRWOAzcUFLvz%&z4yE ztbQ-~v3G@@o)cq`xO_>%Y}Z? z4Z+tJpv;wg>?_)1KHkR6mWP-gRfm=uJ|2bF$^BUOk4SO1^Yp=d+$M&}ap~?n zqoR3$H0v+BO-U2{W7_sR+_WA#D|p-p@N32@A(q)xl0Xk6Km9MEr;j3Gwa|Z0H^5wt zeKPMHq`6sZg!mS{!o-p11h2>D%XA@rNYw>++ZS3cu?%a4c*;7_tf91;F8T6rA@@nX z_ai!N0Ul;jZi)taOy9tZ3*Dt5h;-mmRGOoixfPyls}Gp>=oi}TWOC)u!2`Li`esGfoMKZT48TGnU#$v1#v|c zSUp?tb~0M}N**QoOY}a>Y0cKr_pn~0v78`%g+ARcn*T`K8le{Yjt($F?agi3i|oA( znz7#B(G5`RRQ>?Tm*N-Mr);9=3M{F6n;BZ9~0sEg6-#?#bgF^0aEc3P}D#i2f z3VoT9N)pxp>&c0iiTJmospHBmqVoCKn zCV1VepU;bCJ@?aLOA!)$g$1?7mU0E@J6}FdPv*rm?2zozty^CdeCPps8_jE!e?5Lr z%I{LIW#~Oy&VD^%McMU>XkPnI+G81-mw&r-@RB&B^XRH&YNJkxa%}ou^W?H<-W3ak zUHZCC$P@fx$?NM^yCTe;ruIawKFst~G}E__p;tw-hB8JzLaZE-{1PC$S_)7;`&H=o zlR>E}`6`l+*-RG_wF6%MUqN?pd<#;&H{!hqV1t^EG6)^e4d_m~_wEkW2voVY8(x+{&fzXdXCI$?=3J2SS? 
zBP+4=@R`m^gu_}!*IqWi{Ya?&NCPQ$qr`l{yGrqWY=vwx#~TE}{xJc&*e{&S`aZgp zVmI0)?-|0l5PwYIi4fnSCz$weks}JuD}Jpu3Gp-ff{7nVaTO_UTOh=Ds8^cWsIl`) z!8=HCH7qRFclle{O_GoOiGG`=#?)wx-GN>G1t|SjC(F65ogk`8{OKu6ffdZ7)SRKs$0mxr=>8pO9C=0{??wW^X^el8zjlwo!;H3Q(d|A>KySwOAPSkW=i^t^Om$w>JwO zI{?3$KjF zEk(PrYeijOze^-P{v2(&4llmH-9*O%YbZE4*~LCgX4f5S(f%~uyAGp`J1OX90+i*S z32_m<&&0uE%9UYK>{CIAGpYT0Jh)HZD|kH^OYwh?W`howLDFORtK`qq9&k!U!8^XAPuHvdoogTZv|XaDkas>Hc)dGYd^yA1xO3rCez@evXVGC9 t*q*b$TbiLBT$-VEtEQ>v=z}cH-$sAIkFRlDvNdZ}eRI%jzvf}_{{Y=baJ2vc delta 32056 zcmaLA34D#m_dotTk0eMfNk}9?5aW8}^i9!;G%_PQB{t&a3wN?#X;=FMRbn-ecMAe?wN-Gs=}I zSKjOTerXslLAy{<)BLqcT0hpH!eV399+p)h$hf|rJpe4*o#9>aU{zzo+KfxJ;;K~c zc3RV-trhR)5VWnqdWGg2_b0LN7G8$!L)O2=d&a3r>~M=%J(yK&S^K?rqGvxUL^ILJ z-QBQPOxF|}0CF)D@6dI2W*EQR}g=}KS zDwYrawc8W3D<^6Od5ubqwl0^3SE;S;Z>VZtut~B3!TL{9wy@}j_7Y}3(I)$&rm!(1 z6{c-qig2|2HM*u%J{a|6PJy17J-LJG&k+k~^fAZQU<%<0)zA@>b`$ikWKj=AR4toH zx`&RLGzXA<{%5-<5mtVESegxyHF-HU`9JMgOn!&pMMFnX{68I!BgeVqO!o6g8RdA) ze;u22?_$3nO25@h25g}@6(dy@jvr1h*2Rk16RLmW%sE{ySZh?es??UeTbIX?tu-oF zvGs^eSuXRYd%DzIpx%c@kv$pa*fvZ+*&=-9*cgQT^RG)JN{;PAq=uRgvX3MC;=CIn zMbR|0y&*mvCuUH(??6NuNg&)DTW~Q^l$s8m{@X@H=~VVT$M$BtXwq!s35m(q$yuG3 zh;8;sS|Nj2xi+S3%=o2dh9w9&;h&;NvK_RWq?iON)?q8Exm%BJeC|FPTWL_N|U zE|UQY(gspSEGp&Lu7NPy`-i=06Xd$THC9eC`A)_hSle!4ZNnp~e)1RDA$OzDUf*re z*3bqg*^+Sg9$9}EH|asnQXIc*Mgbm&P1_JE-Lgd?uCB=^MQw!k7~;b+L_oJ4o@0{p zDCD3*_B{MU`IAc{9YtXJ; zWCIGac=~#a(26#dPEsuP*DVbGe36sI{#u5yo$VU+_WslCZ5mQf)Ubx0KSq{0w$4<; z8kH8Ee2U^9DjFp*$CmX%48Yr>+cW^TgX93@v-<7ZTSh?raL`c~%&2YVGqZ{8;wFMs z6Wg&Gl9MC)@GTMMy((iwIvPuD+9R-)MVw@;zc_mjlZMdXP@-b zHTHf)6SsveFz9=!f63^Owe5@+Y->c!xHkpnaO5JtkawXskCIJ}t+7}ywBW>8MA6i! 
z<-L(yr#LokVvfb>KsnsGFp?3AZD1*C zV=X+QT6n%nM>z7RhAX&Wi)f1ur8egA_tY= zMcdCF6+1EmXd%be44|GA%>dHb-p-AUj*nSh=T`b5=G`T*#|=0^-!v8C+YgZSCv3C( zX(;brMCCfeBshX z&TkrRyYjsHja}^0qPmx8ak=2lhQ;)|0ax{fsJHfHO(_A?qcW`A|9)ovffhM7{sx><_Z6bU9PW`LN- zV5LoxS{EgL`jGYM7FECH@8lv%jBpA@or{cDoX*2Dz3mo1?(d-Er|8dJ~hds|T@8CMpo+`V46r6p()TZ8*z++tP2 zPb$_YnE?(h!I=Sev$2HkjVd0vOXDkcr?J1Y%%&VI&DLRBj~*Fl0WnYT9eYnUTc*{s{5#&>}cP1M#G)V-nXOvkag@=+tNq|B{pTm z-bc0@tYVCkVyqHdNHI={^`w}tL{BN^DzUs23zYbhx?oL8RpLV_R?5aF$|LBJxPu&J z&-yj-+;CG^x~Rli3Cqd{9c%fqR{g6R<8HEk{p%WiZ?KvD>luMJ*w+3H^xG`A|0rX| zFRaCYPmD#|*vZ+PAG{%)1uJKJr0+N-rb#hXiA$uIuEfu!7^}ojZnFOG zM$}0BQU-gyhZtg!W7h6iF79z(vW$1z87;nKf4gH zmlVyM>?p-xJB%$=fLKR}a!Gs2vdGg4(kxchdW{}qyyn)>@zImD?O#p5$O;D5 z&@Z!!gX+01lj9NlN;Z*ngGCMUwTzN%o?`n#%x_H})J`(QPd^=DZ#}l&W@ol?b6LO5S&G}ldH&UB@7D`ccKKBs<=Ji@;ep9_DJS}T2|~>F;bNlb&ajQu8dA?Ap`=F5=#YTmNT{&*Xt7aZg_f{1=e!n~ z$X#pvHMW09cu)kU8ynOFs+5v6`d^Yl?VDuwm$^z}l2FpVafSH|^)igjtkuw(`dQY0 zX#Hlrek@t>OI7isD}tq)Xwg>ItEY7O1KT&Wou12{4h=OnU1jx$jWm{QWDAE?H>Pc5 z+lRF?A~v%8VI2*9BWpa|-_n5^z?PzVHI_VM&r{+cDds9MMv4VWY%9fJl}bTUj8&qq z6yucWB1IP^>Qb!a2VX^d*A;eic$3frmt;uFa`$Di7F&AITAb{K(Tt$>)o9Aw@8_3T z#E81aN4f075%r9Yxh!==3;iX_8PUma?nT)n|#4eq;?t_BWdR$mWdv#J9tG zIfhfkdzqvJ%0{a-O@Moy_00Xfj@`%P3V-H7HBtsEOfiS=NKsUQ7%9bUmDZtB9InKG zOKi(~5ne6V$yQS$f!3&mXlwU0?1Ta9*z5P&=ucRyQJwU3Hgi;MOTh)vcAl#5CA+4n zP0Lw|eoFjT1**hHLWFmOl_$$-RK6_B*;GC)s|;6FzJ`chu%nwAdC&hf^j5^gVn%p1 zO#NQI#K{lG^w7;Gjm#w(- zCAi^=`x;#ST#b2r=wjJaLT{$cq7qz$N`z^}oJ^oGB?ys414thydtW_`2Cd6yQR3kvhk($t%j7a%l_OY-%E}$WHc5zh zB`<8d`N?;VTvYkN_CXHYHEy{7VVFL=507TucTNdcq8$uXPv5)=->;7?=@%BkJ z@8g;JN#-~{+30wJrH-G8`)j2M^=nQ;t;z>IQ=-mAzn2Sd1!Cce*8sy#v3DjkF@n(1 z3Bmd~c3?t~exBW%(A*e(l6l8f@BQyd(X$?ut9XGRAoK}d)X`KOIU(4dDv^H>wn;+7 zn@3@5_Uy2z@`Ekq1e+H(-2XOAMbFH-@yCTL(GEP1InK&Y3^dkcvz8Ob8Ff#xFD8cT zx7eMDq4c_G(nh1K1N&xDXWzxyqFTC&AX9qDRpNH$_sJ&b&#W@q7Dw5+PioV}?UOEk z?;aHuBh(`J{Fo5UHSx(Y)_?NH`YZO`{KSao*lw~vK5c0%Ji-D# z^VbVm*Uy5CceB{k&)&xe%v+!N`knn&?v!$d@J<#os=KJv*~xt3Cm3zMWsBphxu4A{ 
zUJPbJ9Aw|bH`MR4`|(Zm4CXPlNvk=BX`tyfsCvt34!B#1>gK4w`5Ifn&)9MBfsG|h zbgwYEQuLgM$jw`}Mu*vksotKI;VvIZis`4F)*R?Uc5!MI_j})1e{Np%v%KIdw(E zh(jomHX}2|0oA0CRhwSZaGc9pOz$7LBT3p0BHLuV*-pw6vwkrP{Xj>=mrAn^2}eSc zgDeuQ#HpIKzGfGv`!ehyrKQ4BZncXJvG7?f zEIshu3N^N{1<=|z%^Vwf{|S~A(#RrE_?54u{7_EK-(EC<=D?-zr_jYY+V>z8X?Eo> z;UzqnheIW225Y%Sq?$}KeWO{NrJ;U=t+CWUwoXP?pu|OcS$JY={X6zq zVtd~kvS}B!BbAY+!AgA2t|U$=dlfb*js+RLk}BwW9vg~}y~o(}x$TYF)7as;?Til7 z*eifOjfKx^r(a^z=e04Crn1a=t&Ofz*>ga}sVsPYYwWcj%@4KI|4Q~&?P6Y1G!JlR zDVk^Zzq^F-aAo{RiU~^mMT%)kydcGFB_5Sxo)XigXb~q7=3t|g{nY8OREiNwoFm2I zN}Mdk1SP&N#WW@MlVY|KyGSuliD6RIpHO)`hjpTsxtv-|4T*%5V2&_4MC)GQV5_MW0=Ly1AY(&F%vfc}#46mJR)dGM0 zFLro=w|;|NTTn+oz+NrzF;?tgHB!8c={s1CfklD6?ZO4+!;srWgWXM-Cgmz8C*;_t zDsjIQ%_hE(qKjJhk6E+D!5*(MbGCT$Z4EsqzA(G~7!Q4%GOxddyCdeG zp%l}VHl0mO4Q_w*4xF_va<=kTQLIVQS*kLBBL~Z@{RE=T*uwHs-5dOfvUyyYqj$6f zXAU8*SW7%C%Svd>o+p&xTvUVOi0dIw##qG-C}9_;xJcpxq+P1wf=k$?E3O7OTa(Ri z(YT4jc+_TgWJ&b_UBpDunOvYOe*u$!+>u6tXmkHQMf6Uh=e*P>mxD5YoiM^rNH&ny zsQENmYPT<7Axne1+A%q{X=&zMUH_$Mc)ueObEVAN$;>coH<4nx(#CIOJC-)F%p-3` zk+EoQ5a=9nn$hTdb zbxUyBigPW&>nIIFCw)W{mt$+cj{Ueiu=j0IiFTLvH;OVgP?#3| zl;aD^dEB9t(-l%Q+n6WC2s^c&W6Nc|Rs?(8x``GCl5cD1?y<7P>1!EZ5ooDd)Z+A? 
zidq~kOp9J=++HtQ^iw`BNzrWL1Vr0a#F!uQf*Pgt)|Ba&(3weN2FyxudEMT~I4wlB zmX(LA${&?5G~4c9f-?i?1kToP9b3q2y;Hq}K24o!Wx>&aDX~xvr+KdCLo`D)&(bSo zA~!4lbxT(^9ptl`h^Xsxz72eBw>wO3Cku{~L*zgd+bbHv6i3JF zB+@UBHeZnnCdYP|>f{z?T^V2*vIf@HB$M1t&0zyXTQrG#gf{MtNm(nNM=M!)aVT;} zgE}c&{?2uAD=h?ud@Up0+>$r-B}&k&7A zEv7)Ov4N{<8*DYRtg7MNSad`x&QSlYQAf#W<{EZjm2W_}=!5)jqMW!6%koX>PngE9 zVlP+K?s#{F=&spf8jXH&0DlSJMezjucWfrWo$VPLw%W7XnGZGXS>>U}o>g{@I{mD& z3+|HEwpH=b-*!t(A1$k|#4L!9I;$~8Vq33|KuF02a?sD0&)xMTg=*92g zZsf1VNaOOi z%GsGUzHKwfpmvc#uVRCCAjK8bb{P@nuYN;LPS|oFs3q}=)m_`#@)8@AEs$8N`}wQX zYUuB@XQep$3THXCo?=?%k2N+6Ln_l+NE;=h^dT2xCx`c~9%pC#SdX+uhJoc|*L zd8G+u+wxTDc$q=u!M(4vd_k7hwws4Tbz?dCYg?Pl2GBLZ>fS_Jxlnl4wyVf$Kdc_& z)hw2XY`T*eLvcUB?}FA%F`6!8>FfH}{+dz0Po?yI6rB=P&>%34O_it#7MuVe8iWxPC|BpqZ4YJnDqga&~V0uSUR9wrj)B z_y99Mt%qATY9b}7FnUgQeX?oIm@Tb)*qT&9+oG+p^`SSH+ws5T?DFNgQsKDrMSj0*K7|DKk|8+wl0bnWhYa`e%b7bk7Z9bcQg7g zCXX@Ui>28ayxBOjgf;u3ss1zj;ENbz{NjxLUpVW=q(v-yYoy;e)Oc7^%xO|HwY?4i zrv~OVtMg?YhwI7clzp!^>$#_5_>ct@V&ChC#-@H^_7aWA#1LGLEd-;WJj{Yln9ey_ ze-L~6lQ+w~>=?RY;&Kp}{0?Ylin|8@G8GA0%fS-u8}@;X0SO?(xLC zxrTe1hD`n(RsY+wT60VFEE0zP%XTj$i{E0!}erUsF5sKqy_56i3k;u$pN5Ka(W7EAt<@o6#7rix=U4zhj3pyd2(!3#4{k z_9mlFL=Olh)?<4TTb-EFiTqQ@K_F zpk1}fGs*54s$rcTk50xf-*4Gg&t#bgLL6Gc2OIIXH!Jw3JaajiS|M&ajH%Z-5uVKT zR26pg;1WGMqt~Gn-SFs|@oVO4-3a@PP0RYgJ#MOK-&QD43~J+PY}iHjr3aldT)wGi z1W%nToFRw%;+MMJ5vKUn{nj^FEU!P6wUa(06t%N5KK%BuUhef2Fi2x|JKdATUh~L! 
z_voKGe)-Y;*k%@U%r~R*@r4dX=tpebDKBH}c$Rr8UeC=4$yrsd@%<6hUSCS?m1^4B zV+)$Cj2;6pzSLSct{Zd{|L)CZ&@Lk*GPgE2O5WGS?cm z)gNp}hx|X477j(01yWL65=zQ*ELK#vji4UkRLmb<{IS+&1bg+WdWP@iYP#`Y7?qjt zWpDQJmDl>gjE}B*=?;-_$XtK&WNFuS8=D4Z#9kk&8^!>Z{*#CMf`QVb+aN;GQkOyO z=bsjbcJ5yk8TJm*L;6{&?ZUgVMn6$g)JUP?oReXLhOmuk?iu@Ud|al|;&-UJj6K7N zO}y;xebWiUSGJ$5zF45BezG4+z3s){PFm z(9}7jQx7)bu7~^gvC{AzYE^oj-z#JFUDq;3&7SP5`)|9g>;?_a;^<>}jx-^jG0ge* zl}5X+>_b~kwspM=yZHMuV|p~i0fuW=cFg8wbdF&+ZCUkOM`K34aM8mTBr(aq+p;t0 zCMQ<*gty^!&Ph+aitjLl<^5bI<4%4Lqj^G3IqTkyE1`*v~ytB}rw!s>(08HXQR^k!2!kbkk7%*nlUg$B2i ztdbO2R3(dwm|T{2w-#GmKtE}}RnW=d0z9w{Y1LTUr)?bKO@TdaWxVRd@(McQUAODA zrHw1L|9=`9`y(1!P5~gcnjMQ8>fJFT_nDhfXLq=$QV@+x&#^osTF;Hvv*~}2<>5P$ z9Gg!&yM5_C-;BC{{jB?>wk~Px5Z$22%n5GDtnJ@p4P$B;+y0N$c+ip+y!2o_UZxpM zTV@oz4AJou5dT*}_(@N%SJRBBPC=?&(Ip8}n4d|{>&bB$@pWu2xc5iZ z@_Rr2z4zlLkC`yOHSeM8zC2FX$61bq+3k;9H7&c9-CosA(=yPhaG)=GdJ-6cK@6;- zX=#{>$-sCd&03&O4D`T!B<&-h1&cVWs-``C-);WTncT#2dnJ% z!j78u_iDS{uajocDy)GaPJ16k!; z57->oZ#^6UCjln|HvyLc12@?1hk?<+o4`@PSHJ{d4UFtsU^vh%&2AqIYz*823^^0)_)u;)|_Oz+1pXpn>nrwgBq`PXapv?*T^v9lSN| zb6_3dL0}}%Dcx=#4Q!Qe!8iLTEC8_!_$}}p(0?y_42%I*t&TMZY!3Ve*asM~&u)(g zt^lqD9swQ(mc6R#gRf2w0waMZfun%WfQi6278JIi;JhD^11AG*z)iq% zH8pKFus$#g7!9m*z-}K0>~GPuA5h2x@hecP zjg2B3JqP*$BY^#Y!-4gWqvs?8(}1Ia*}!yQ9PD9p((U2^b5!3XB8Z1EvDQPGQ3W4h3EYjsrdio(5J8 z(6m>;=0N`(Br7l)I2pJNxDt33mSMhF{ebs@5y19m(Q{xNFaekbOaop5W&?e`w_EIaC`5h_2My38 zpdWA%Fanqj91g5-4i11#foZ@PU^a07d8`&-jUSNh4K*zY7z`W^j0OG;j0bkg#he3Y z0uKXs18)M4U$kI@XoO_Bh;<6=dZbTVZi;s z-oVSic;Iv3X`p8a)_>h!G1noQ)*Lt-7z<1QjsvEVe9LY>4IB%!0TY4cnxY55`oNdK z7-0X~$a3Ha;4NCE7yx1xFdj&MVY3pr510wO z4!jO@x(f$DKcHuGoB_Zv;2_`tU@|ZsxCOWpcm$XUya&7vbi0T8z@|XYP)%zK46~r{ zHVOlP9{}Tl3BZ-WKE& zA@Cf~vn}qDz%Zb-0LczqV?iMSh3&vJp!ZYkY{2HgJYWx?)(+=8&=2?$7y+#M3=V)n zzy#n}U>fi+FdOLbC+Y*;fLeP5^c?kpmw^$$8h@ca(9#fv1Qa&BfCJ!Hz--_*z&zj? 
zpca7}3(ybf_BR{=YXgS^M*|apOMq#>4Zv*RZeSkpI8f_=jum3+fFA%OfzyDaI%w}{ zmr$4w!sj2meH*Yp@H80$Z9tco*xNf|N`PU&?|}n=FM#pDuzxZ1z!+dAkp33qI&eNP zq7!bx|MIPs^wq{>D}T^j_ja10!MBxopIuq^^`B_PJ_b?iP|0Pk(W#=- zhZV|}gIELq-Nb)0tvt&|_wM`(<7x4^-^%!Snv3pWq(caYMK%drt$epPggg?qS^0ez-N)Fc%>7(J zZn5(5Zm@_TAA`^&&Lt&9d!Y(CblK_W8lScv^J%*zdZYaN03O^EFSb2b( z?xQ>LC^x;HF?tbI@-ap&;)@7}FXFoi2QT6m$ZP=r!%g=!b}!Yt+(UflD0x1 zGT0H?4*Wi8{fVy5{oHj2=Q<%sA}8qj@gR3Jc)6wBzLf%4!TXWiJIrqXp6XioERz3f zYqzf`5BU_|>W;}=VC84QH*4R;ZlB}~J{%`c$qv8oEILdXcNu8 z*ZKT=cKf#uAm;OK9%x|XXawL1c@LlDp$9l$Pq5qlNne?7@<4+x7V=ZDGu|xZK|z?n zYM8TfR{o}z?!jwR)5|%#At&!r5AO1yYH)ZS@@F(QXLvuzdJSH&3VNT4iq6#*+wCLC zb~#^9x?cR-YI<$s@5Ow4O{|(4=%^ppJu#GlycR)y-T~lTC)IAp0jW9s#K(H--g*IWYemIjiq>9 z#bA{C@DML#U3w~i+*tPxSP1Ggs=bA3MQ%IGDwJuvpx>X$mwM>|#wNzIyfA*i9yKv$ z1zs4l9n5ZjMs4=uhPUqT9Jfr3OIse~4dWflcn`we%lLT0{mb|wCGRFoU&b#0oxei8 zFUe05e*qbz=3X76_74Bb8$%LX9mCU)e@x{#Di7vXDyK?b*rcOu3|P*4G{Gv*m5M9y zVT@hQTi4KioSIj}6>d2nS_82M*U-xu+l9j4xGG2==ltb%`}-6_8IGTU`WOK#M56HQ z8e($3s{v>ISIEx4A_v1(aHpEctn3xMA>lKj^f6wppe*pwYjbN&%y}TkHcWo+@`Ip_ z8Jt(DgCGlPBI2=1aPgt=Iq!x|mj{#Z0Tm_!vhxZ|94MvN#_K3^)km?!qzd z&1RMv(IOvc=cac!c`{%pdXIB~ixG07cRANNZ`fM*4x;xt&)^52If{LZDo&|;2=W#5 z?F@7vg*Ws=`);dgL_GL#KfPQ-ZMEILnS%HTOWzD4YEBtrRdFisLL)OoCGUW9D33** zDkU52QEW@E?uM?W^z#m=fDL36Y;i>__8nJj8wmZ>)%-Cg!6^pigw>S3-cI9CPF&4{ zFl|l?QCz^89s&fEm!vUGu+3j7FW_C8n;SW5T=AC^WthwnM4GclZu}GOa?6^hKUgn%mlSBF#=+oAV_2D!Q<=yMWWu@~%!Q1U!U{h4j z=qTot>g|I5^!oo>e-8Qr>8tJ|l<)li*jA{9cNhF=U5sW^?iv7y-8`bL({}aI}XO76~jB!!}&2IP2?U= zsE0NGd79iO(nyF)<6G;&)vh#tiZEMRuBZ>)DN**|xy^J3?gz&CTpA8;3gmYlgk09k z@vimJum>LlFq}4uUDA=>1D4^n>gm;;=vCm;G=8=|qKx2wka1_Bag6rH$-)~qKy;%w z^6@x1>n;Q}b>sh?u=JYnb8=GMiQW{>-zYXRr$;DHh6@~!0TpnVt%r+u5oGbK^(mUQ zKDM38AOJlv85+t9qfax z?n}EJyHauFF)|!KP&1&9CK}F%MGT+@8+lmuE-x%Yj^FIi@ zdBq?=4UT`T$5#*F-3c1;j|sxLm7op(il6~M4=@IA9tI2*4=Z!Doe_%CjAO zJz?GseuVJC4*o0Q*&X~1@U2rj)GhJn>U>Z$#1qk!*6L2W*Ld^cB+O7k0tuh)lmjgm zXPQ#qXr}uFE%e8y-<@{*5Ss9w$dnR?U72@^bFBh0>6cw{hYm#f+%EBP#3=^l)8Oe` 
zs9GDzwo66!J0A%DM#5JjCa3+do2JT_QJ(x&QAFhoC=iUl^VY9aAcMG5b5#C$H$O`s zuOgbG58v%py(8i1Zn16g1QNd5&4U^snNQZm>mw+zww>;woWEOa6;2L#XZmtC?bJTT zqusP_aRHa!oJK)csLF{bKif@lc{^>9`~tFd+9UgL51fWFwjU=4yiY|LpRT-}>Y)5D z{I-B?d1X5a96kild^jbdjP}T{;}+t1atlnhn{?%P50o1}L)7cta)j&PW$QykPu?R@ zzHdg0a^7yO37~!0ly*my9nrqOSZu+G7Vy`?9(@9`^WlrXj zU{#s|5@zh>XIo>;T*9%dZQ84vX50rY==Zs68-yQ^3+{a$(gvMgi{kEm4o+t*iOOjS=36IxT5>wIOzZ8 zt{o8aLKJ_K#a$@=D2h(!P&{@}Zds2|K6_9sHKz*AM42wFQGqCbb5MS)E@}i<0eV|S zU5SA{5BmT0VTbhNpwCx9pmdkXIVi7#%7bCC; z@UsmE zJLMMW=4XntsQWmC^Pr#IUXjkV5Ag<}K9$FuzuW&fjw41(2*I)5t zZ|lSKwTEiO==EG#tWnj~1+M7XuvTI&4;iIb!OuYI@YEst0AtzfLl1`NL!GG5ZInLO z@Lsw`T2d$I2?ZMX2bMRiws99I1KCsM{_i07?x($AZ9LTspYQi{Gxafd#E2D%a-B54UhmxQd zfAoY@kp5ntlyVmd+T@CopReNjRPhVYAhmz383g_GUZLRb;(C zuJGO;>7Bhw5AKiHC&;e{-}{kXt!w1>vU*nu8h+^{)RWa1n{K~QeDpPGXomSl@x8h0 zIK7(J6Jbc*LHI5$6Jlpn|SqE zFYPg^;yI3+IH?En+2i$UBRmQu-AJj|*OBVW+azIjxC$=KRIUDf$#+!tlNDe3AACQo z__F*Tio*Ytv^TqW5wlJ1ym^NSy0063qd-r;dXoNNKc6%~4+@DpEqUzN;_;|2^){8F zW^^4CAIL9Gz^;~jSDJ$rPm>0++RKC7h|_~CrGuqygrb>yOxl~d)*qun{IqkDH|NiS z8yN9BR7#s+{H6G~U!^^La-yd#_Cd0rku5~NCr)qBb(2b+i-N;Bj9U@yDWls+LQS_8 zjA*EGfZFoRnVPM54_<2`lIWnCXDnjzIN%l|s_|Z=eyJ9_S)0CxCv`tQ8|tRE3|Xt6 zs-4wTs%Pa$!rT~}G?RQHmU_)B2do4-rDF{=KQP-%BZ&4d}%dc_aS`mbO2 z7Sg^2pEybX%;Kcx-E4M!OBK#_Sq;I9=au3wr%BnIh-G2Y{?ZG{i%S@aKjNEc>UG{m z$z#aH^Sk1ozY=1jqGNMXYgu9918Imv5l=Zx5=~Yd=Sx4)J9nM`RPuC{rN;rQlp4xE zD`h00c;+a6gxa&s4%TZc?Q8Q|ll5wr7|7&D78)+C1@w`%sd+Vi)=@(CAgshOYk>8oD2aQ;8MDpgu zZ0I2Me;$^6j07zh`v#56UuxSi1I|@^M5(i9TqkKC!6#0^J>*X{F6KOsjFi*x0j7x_ z`o@u-Z@^PCz15l-Eotqd;xEg7!a_bZqoloqnnrV#Oi+AI?bOk6EpASIL*&O21Kj8U7UR zj5`mw^JmyP$KL;M^+wpdsMQbD5C*6yPb=PqkNHe*(oNeTc`P9D48zP(wM}X}GNT#T zUGl$wFYU~UFHro97yJ&wz?&`Cc)d&4gU6(qs|4*~Pg%Wmlx<@rpQRS4Ink#TUx{yw z$8Pq%I^Ew_?lOAGYC-%WTJv^LS!V_{r?*u9b(J@rst?r%@+DI-RNoi!9aFK1z5IpW z09y8{2{aF>f!GbH?^!B~a5#x)x8ftz>OxY92mj|DnwwEsTlQKJ3SKGu(`pf!oeJzP z{l`8PhJ4;M-PLQ%x3ZSGV%)JtDTw-f?=-!MUX|aTh5^{6*3Vn2iF)&x>`q~_>7nn@>2V(- z{gggzK2UsyO_){D#t6g@td!z|pZSC7I7U1-NQF5<^M*->*D$j5cu3G5DBdSq66TUK 
zMo4=XK4J!jVgGkh{jsWcP^km>5>o%BmV&tp3>qnGx8cX2w%pz;t0DEo)Al{7{zC<0 zE`=kC=U1hjnRV?(N&5n|*_ig(iqBFXs?5WQzTBlbjJ_iL@cuJ#>ECx!I>nwLp8g*y zr|N_puJ}WWpYfw4CMmu@wmxz?>kQvF6FELs>0On62&RJcbuUT64E~(rKbKRc`6>H$ zW2OCgwN}hcaKGY59+ZaW3}5(&Ql;qO!)EEO__4vnSs4F$ze~QaQa8suQ7j&OBdIHX zDfzQXeMPDJ@rzJf)?b#q*^_JIW$mGRC2vmKxCxT?e=K=3;&6X!cJe|6}%l?Wa$cd{utTg4NxJ zKLw0fwMSa10nnOZ;gQ$pcO~Ccf;L6*1=}QHhMMx3isFpqYbtv^Uh*~g>;w#OrmAMH zwn|f_I)?8hb;kl(%^a|r)1r%Z$g4T0}RDX6%66Sz@ulO1~dA8ofxV(*L&DMjwywwQRRn>RRlGQtL{c|*4`tw0! zi&Qt|A*9Y!J*%y1CnrdC5FbJ6!dsFzd)95XRLAlqP=}< z+KG1{^{HQ^+6?v80;%@q6G`1oZ4KtBvo=Ml1Np{8j8Ew#9%7YhPoAHscL{k&J z3=&r?Nj2#v|6K9M<**LY%yYaZlNChX=Vc5QT)|M($MTh{Ay|6k8gxKuSzQKsw#Ig*GP49 zev#B$RBPtge7IJs+jD(B-uA6lpMK0Cu&k5ni$HoBOVHLTzVutjeTv_Emyeu}4>P~3 zAveb)5!WJ`>z@mxBQw7pH%Q*DqBrf^D}L8=X>aoT74OEMB5X?+nIc6Q-8oIVEB&JE zn&P|VNO$Hg-*Th0fBd85&C!{*N%H&Du$V{jvdxnBhCe)fUb60L*dqAqWZVlzmpTik zZjr{NmwfpzBwv^(oM`4)Tvz-!^pPGj-`?9Q?Q3L8o_mt32*Xet* zy07w5sYk;aHZsP+}_0Gn@Klr9peQ3x5beWzI37XRxY4<|i8^RU;Lh<1|WC@m3x=KAW zs_)LqYJ2wckxMWn5qt^EnhwvF)y*9%?|W%>1sg6srgx`vl3#s*-#~S*(yzyVRr*%E z)>5?YqIS9l(u?-hd09Q-9`C&r)l0u!I&?uE(!qSyQoTvno2qZ-Ui148vigp6>Car( z@wt+Jpm=lr8c0?;^ZmFh)2oe)d?2ft+sBAYvRbklqW~3Umf|0(TyL%Tve<8FYWAx^ zaZ`L{#b<5i3mASfAImSIIZL3r_nB^cU6Iw_!@8v>QG)i9;uorIzK`O2WA~!whN`^# zNb#Rsll)o5n~P%jb;-v8@pM8}`OuY{h9RfDl<9Z%^t@0N64bV92HN2#>1fdzzGE3a zoV&c1_SGe5nKz{V{4GhC?S%g<`5TyJddz9ttN1#+)^gnUroEKv7zx_EUu3lke9Usa zi?QW2-?tnG(}lxQZ%(okTBG5ppk~W#b%NqcU(|k8`~?gbJvAk0ALL0taYrTLqxgG@ zSKlxWD*B2s4D(10KT{i}nauMQ@1{=s-pY~B9ch2A)M?cfOPu^%`;up^z|zAHa8_VA zM!u5u%pLLu_Bt~2;vpQnQR#08Z~rDI^Xx2HjsCKLo}u?;wMu*m$8J4RZO#*^ z#!oGwHntw%3sxapD{Pe2&1@ZoU@5qHr8bAhiVuDv?ag#dd?xLia(y*cvG#{lw^Bi+ z{3+F??@b!AfSS3Ay@;Ns60}bhe_Gu|d=>w*;_3h2gdTG<`AhMq*w@5hf(yrn;EEkpMxuCH+}(DR7|E%a~6w@{1KjCWX}_&+@q!{eiIfo zJ?1xf$248E@5(Q(MGjt3$@P}B(qeU~cHsItER3WA$(vsPR_Yd8B##|HJT38k1@-FF z-}%UO_(neSU&(J*`hI1kzVz2sV-=rxpC4bRcWzp5h^#<=r$oCp3Qm*6{LRa0qM_dt^$5JLt`JNHY1aNPw{8>@sVlRo?YfGNz)H4Ni%v@ gF-Dx`cQTA1XY&O=evRY3-*D3PJBMl=Fnr7We{zMop8x;= diff 
--git a/bin/epiread_pairedEnd_convertion.cpp b/bin/epiread_pairedEnd_convertion.cpp index 3a5a2149..f974f953 100644 --- a/bin/epiread_pairedEnd_convertion.cpp +++ b/bin/epiread_pairedEnd_convertion.cpp @@ -1,686 +1,643 @@ -#include -#include -#include // std::stringstream -#include -#include - -#include -using namespace std; - -// Based on N. Loyfer Pattern algorithm - -string TAB = "\t"; -int MAX_PAT_LEN = 300; -struct CpG_record -{ - - string next_cpg; - int index; - CpG_record(){} - - CpG_record(string _next_cpg, int _index) : - next_cpg(_next_cpg), index(_index) {} -}; -unordered_map dictCpG; -struct SNP_record -{ - char ref; - char alt; - string next_snp; - string sp; - SNP_record(){} - - SNP_record(char _ref, char _alt,string snp,string _sp) : - ref(_ref), alt(_alt), next_snp(snp), sp(_sp) {} -}; -unordered_map dictSNP; - - -bool DEBUG = false; -bool SNP_data = true; //file is probably not empty, if file is empty-skip adding SMP data - -vector line2tokens(string &line); -//void convert_epiread(string genome_cpg); -void convert_epiread(ofstream& merged_epiread); -//int execute(string cmd, string& output); -string vec2string(vector &vec, string coordinates = string()); -//void merge_paired_and_print(vector l1, vector l2, string &genome); -void merge_paired_and_print(vector l1, vector l2,ofstream& merged_epiread); - -vector line2tokens(string &line) { - /** Break string line to words (a vector of string tokens) */ - vector result; - string cell; - stringstream lineStream(line); - while (getline(lineStream, cell, '\t')) { - result.push_back(cell); - } - return result; -} - - -//void convert_epiread(string genome_cpg) { -void convert_epiread(ofstream& merged_epiread) { - /** parse stdin for epiread paired-end file, sorted by name and order of mate - * Translate them to single-end like epiread format, and output to a file */ - - vector row1, row2, row_from_line; - - bool first_in_pair = true; - for (string line_str; getline(cin, line_str);) { - row_from_line = 
line2tokens(line_str); - // row is first out of a couple of rows - if (first_in_pair) { - //cout < &vec, string coordinates) { - /** print a epiread-like vector to stdout, tab separated */ - // vec length must be 8, or 6 if there is no SNP. - int num_of_column=8; - if (!SNP_data) - num_of_column=6; - string str_vec = vec[0] + coordinates; - //for (int i=1; isecond).index; - } else { - // This is an internal error - should never happen. - throw logic_error("Internal Error. Unknown CpG locus: " + locus); - } - return start_site; - -} - -// int CpGLastLoci(int &index) { - // //get the locus of the last CpG according to it's index - // string position = ""; - // for (auto it = dictCpG.begin(); it != dictCpG.end(); ++it) - // if (it->second == index) - // return stoi((it->first).substr((it->first).find("\t")+1)); - - // // This is an internal error - should never happen. - // throw logic_error("Internal Error. Unknown CpG index: " + index); - -// } - -int CpGLastLoci(string &chr,string &pos,int length_cpg) { - //get the locus of the last CpG according to window and length of string - if (length_cpg==0) return stoi(pos); - string locus = chr + TAB + pos; - auto search = dictCpG.find(locus); - if (search != dictCpG.end()) { - for (int i=0;isecond).next_cpg; - search = dictCpG.find(locus); - if (search == dictCpG.end()) - throw logic_error("Internal Error. Unknown CpG locus: " + locus); - } - return stoi((search->second).next_cpg); - - } - - - // This is an internal error - should never happen. - throw logic_error("Internal Error. Unknown CpG locus: " + locus); - -} - -SNP_record FindSNPRecord(string &locus) { - /** translate CpG index (in range 1,...,28M~) to dictionary */ - SNP_record variant; - auto search = dictSNP.find(locus); - if (search != dictSNP.end()) { - variant = search->second; - } else { - // This is an internal error - should never happen. - throw logic_error("Internal Error. 
Unknown SNP locus: " + locus); - } - return variant; -} - -// void initializeDictCpG(string cpg) -// { - - // int cpg_index=1; - // vector record; - // ifstream cpg_file(cpg, ios_base::in); - // string line; - // while (getline(cpg_file, line)) { - // record=line2tokens(line); - // //dictCpG.insert(make_pair(record[0]+TAB+record[1]+TAB+record[2],cpg_index++)); - // dictCpG.insert(make_pair(record[0]+TAB+record[1],cpg_index++)); - // } -// } - -void initializeDictCpG(string cpg) -{ - - int cpg_index=1; - vector record, next_record; - ifstream cpg_file(cpg, ios_base::in); - string line; - string next_cpg; - - //get first CpG - getline(cpg_file, line); - record = line2tokens(line); - - while (getline(cpg_file, line)) { - next_record = line2tokens(line); - next_cpg = next_record[1]; - dictCpG.insert(make_pair(record[0]+TAB+record[1],CpG_record(next_cpg,cpg_index++))); - record = next_record; - } - //last record-no "next cpg" - dictCpG.insert(make_pair(record[0]+TAB+record[1],CpG_record("",cpg_index++))); - -} - - -void initializeDictSNP(string snp) -{ - ifstream snp_file(snp, ios::in); - vector record,next_record; - string line_str; - if ( snp_file.peek() == fstream::traits_type::eof() ) - { - cerr <<"SNP file is empty"< &line) -{ // get the final merged SNP - string debug_data; - string cuttent_snp = line[7]; - string final_snp = "0:"; - int snp_length = line[7].length(); - SNP_record snp_rec = checkLocus(line[0],line[6],line[3],cuttent_snp[0]); - debug_data = "(Ref-" +convertChar2srting(snp_rec.ref)+",Alt-"+convertChar2srting(snp_rec.alt)+",SP-"+snp_rec.sp+")"; - final_snp += convertChar2srting(cuttent_snp[0]); - if (DEBUG) { - final_snp += debug_data; - } - if (snp_length ==1) - return final_snp; - - string next_pos = snp_rec.next_snp; - string absolute_pos = line[6]; - for (int i=1; i &line) -{ - string final_snp = ""; - if (line[6]==".") - return final_snp; - string debug_data; - string cuttent_snp = line[7]; - final_snp = "0:"; - int snp_length = 
line[7].length(); - string locus = line[0] + TAB + line[6]; - SNP_record snp_rec = FindSNPRecord(locus); - debug_data = "(Ref-" +convertChar2srting(snp_rec.ref)+",Alt-"+convertChar2srting(snp_rec.alt)+",SP-"+snp_rec.sp+")"; - final_snp += convertChar2srting(cuttent_snp[0]); - if (DEBUG) { - final_snp += debug_data; - } - if (snp_length ==1) - return final_snp; - - string next_pos = snp_rec.next_snp; - string absolute_pos = line[6]; - for (int i=1; i mergeSNP(vector l1, vector l2) -{//change snp to desired format, with relative index for each variant in format 0:var1:12:var2:13:var3 - - vector returned_snp; - - //if one mate has missing value-make it be the first vector - if (l2[6] == "." && l1[6] != ".") { - vector tmp = l1; - l1 = l2; - l2 = tmp; - } - - //if both mates have SNP in the same variant-make the one with more variants be the first mate - if (l1[6] == l2[6] && l1[7].length() < l2[7].length()) { - vector tmp = l1; - l1 = l2; - l2 = tmp; - } - - //if both mates are "-" strand, but same position - if (l1[6] != "." && l2[6] != "." && l1[4] == l2[4] && stoi(l1[6]) > stoi(l2[6])) { - vector tmp = l1; - l1 = l2; - l2 = tmp; - } - - - //get SNP-length: number of variants for each line: - string snp1 = l1[7]; - string snp2 = l2[7]; - int snp1_length = l1[7].length(); - int snp2_length = l2[7].length(); - //if both snp has missing values - if (l1[6] == "." && l2[6] == ".") returned_snp.assign( {".","."} ); - //get SNP data - //if one read has missing value (".") than use the other value. - else if (l1[6] == "." ) - returned_snp.assign( {l2[6],GetFinalSNP(l2)} ); - - //both mates have values in the SNP column, in the same position. l1 has more variants - else if (l1[6] == l2[6]) { - //both mates have the same SNP variants - if (snp1==snp2) - returned_snp.assign( {l2[6],GetFinalSNP(l2)} ); - //same position, different variants in both mates, the first mate has longer SNP variant list - else - { - //check all common variants between mates are the same. 
If not- put "N" in the relevant position - for (int i=0; i &l1) -{//add first and last coordinates in case where there is SNP data - int last_snp = 0; // works even if there's no SNP data - string coordinates = ""; - if (SNP_data && l1[7] != ".") //if there's SNP data - { //get last index of SNP - string str_tmp = l1[7].substr(0,l1[7].rfind(":")); - str_tmp = str_tmp.substr(str_tmp.rfind(":")+1); - int last_snp = stoi(l1[6])+stoi(str_tmp); - } - - //get last index of CpG - //string window = l1[0] + "\t" + l1[4]; - //int index = CpGFirstIndex(window) + l1[5].length() - 1; - int last_CpG = CpGLastLoci(l1[0],l1[4], l1[5].length() - 1); - - //insert values to vector and print - if (SNP_data && l1[6] != ".") - //l1.insert(l1.begin()+1, to_string(min(stoi(l1[4]),stoi(l1[6])))); - coordinates += TAB+ to_string(min(stoi(l1[4]),stoi(l1[6]))); - else // no SNP data, coordinates depends only on first CpG - //l1.insert(l1.begin()+1, l1[4]); - coordinates += TAB+ l1[4] ; - //l1.insert(l1.begin()+2, to_string(max(last_snp,last_CpG))); - coordinates += TAB+ to_string(max(last_snp,last_CpG)); - return coordinates; -} - -//void merge_paired_and_print(vector l1, vector l2, string &genome) { -void merge_paired_and_print(vector l1, vector l2, ofstream& merged_epiread) { - /*Merge two epiread-formated line into one */ - - if (!DEBUG) { - l1[1] = "."; - l2[1] = "."; - } - - - bool flag_SNP_identical = (SNP_data) ? l1[6] == l2[6] && l1[7] == l2[7] : true; - //if l2 doesn't add any information to the read-length, sequence and SNP data: - if (l1[4] == l2[4] && l1[5] == l2[5] && ( flag_SNP_identical )) { - //there is an snp value on the identical lines - try { - if ( SNP_data && l1[6] != "." 
) - l1[7] = GetFinalSNP(l1); - - //add_coordintes(l1); - string coordinates = add_coordintes(l1); - merged_epiread << vec2string(l1,coordinates) < stoi(l2[4])) { - vector tmp = l1; - l1 = l2; - l2 = tmp; - } - - - string pattern1 = l1[5]; - string pattern2 = l2[5]; - int pattern1_len = pattern1.length(); - int pattern2_len = pattern2.length(); - - //0-based file - //int first_cpg_site1 = stoi(l1[4]); - //int first_cpg_site2 = stoi(l2[4]); - - string window1,window2; - //window1 = l1[0] + "\t" + to_string(first_cpg_site1) + "\t" + to_string(first_cpg_site1+1); - //window2 = l2[0] + "\t" + to_string(first_cpg_site2) + "\t" + to_string(first_cpg_site2+1); - window1 = l1[0] + "\t" + l1[4]; - window2 = l2[0] + "\t" + l2[4]; - int first_cpg1,first_cpg2; - try { - first_cpg1 = CpGFirstIndex(window1); - first_cpg2 = CpGFirstIndex(window2); - } - catch (std::exception &e) { - cout << vec2string(l1) << endl; - cout << vec2string(l2) << endl; - return; - } - - - int last_site = max(first_cpg1 + pattern1_len, first_cpg2 + pattern2_len); - int overall_len = last_site-first_cpg1; - - string merged_pattern; // output pattern - - if (overall_len > MAX_PAT_LEN) // sanity check: make sure the two reads are not too far apart - { - // throw invalid_argument("invalid pairing. merged read is too long "); - string output_error = "Problem with:\n" + l1[0] + "\t" + l1[1] + "\t" + l1[2] + "\t" + l1[3] + "\t" + l1[4] + "\n" + l2[0] + "\t" + l2[1] + "\t" + l2[2] + "\t" + l2[3] + "\t" + l2[4] ; - cerr < merged_snp; - try { - if (SNP_data) //SNP file is not empty - { - merged_snp = mergeSNP(l1,l2); - l1[5] = merged_pattern; - l1[6] = merged_snp[0]; - l1[7] = merged_snp[1]; - - } - //add_coordintes(l1); - merged_epiread << vec2string(l1,add_coordintes(l1)) < +#include +#include // std::stringstream +#include +#include + +#include +using namespace std; + +// Based on N. 
Loyfer Pattern algorithm + +string TAB = "\t"; +int MAX_PAT_LEN = 300; +struct CpG_record +{ + + string next_cpg; + int index; + CpG_record(){} + + CpG_record(string _next_cpg, int _index) : + next_cpg(_next_cpg), index(_index) {} +}; +unordered_map dictCpG; +struct SNP_record +{ + char ref; + char alt; + string next_snp; + string sp; + SNP_record(){} + + SNP_record(char _ref, char _alt,string snp,string _sp) : + ref(_ref), alt(_alt), next_snp(snp), sp(_sp) {} +}; +unordered_map dictSNP; + + +bool DEBUG = false; +bool SNP_data = true; //file is probably not empty, if file is empty-skip adding SMP data + +vector line2tokens(string &line); +//void convert_epiread(string genome_cpg); +void convert_epiread(ofstream& merged_epiread); +//int execute(string cmd, string& output); +string vec2string(vector &vec, string coordinates = string()); +//void merge_paired_and_print(vector l1, vector l2, string &genome); +void merge_paired_and_print(vector l1, vector l2,ofstream& merged_epiread); + +vector line2tokens(string &line) { + /** Break string line to words (a vector of string tokens) */ + vector result; + string cell; + stringstream lineStream(line); + while (getline(lineStream, cell, '\t')) { + result.push_back(cell); + } + return result; +} + +void convert_epiread(ofstream& merged_epiread) { + /** parse stdin for epiread paired-end file, sorted by name and order of mate + * Translate them to single-end like epiread format, and output to a file */ + + vector row1, row2, row_from_line; + + bool first_in_pair = true; + for (string line_str; getline(cin, line_str);) { + row_from_line = line2tokens(line_str); + // row is first out of a couple of rows + if (first_in_pair) { + //cout < &vec, string coordinates) { + /** print a epiread-like vector to stdout, tab separated */ + // vec length must be 8, or 6 if there is no SNP. 
+ int num_of_column=8; + if (!SNP_data) + num_of_column=6; + string str_vec = vec[0] + coordinates; + //for (int i=1; isecond).index; + } else { + // This is an internal error - should never happen. + throw logic_error("Internal Error. Unknown CpG locus: " + locus); + } + return start_site; + +} + +int CpGLastLoci(string &chr,string &pos,int length_cpg) { + //get the locus of the last CpG according to window and length of string + if (length_cpg==0) return stoi(pos); + string locus = chr + TAB + pos; + auto search = dictCpG.find(locus); + if (search != dictCpG.end()) { + for (int i=0;isecond).next_cpg; + search = dictCpG.find(locus); + if (search == dictCpG.end()) + throw logic_error("Internal Error. Unknown CpG locus: " + locus); + } + return stoi((search->second).next_cpg); + + } + + + // This is an internal error - should never happen. + throw logic_error("Internal Error. Unknown CpG locus: " + locus); + +} + +SNP_record FindSNPRecord(string &locus) { + /** translate CpG index (in range 1,...,28M~) to dictionary */ + SNP_record variant; + auto search = dictSNP.find(locus); + if (search != dictSNP.end()) { + variant = search->second; + } else { + // This is an internal error - should never happen. + throw logic_error("Internal Error. 
Unknown SNP locus: " + locus); + } + return variant; +} + + +void initializeDictCpG(string cpg) +{ + + int cpg_index=1; + vector record, next_record; + ifstream cpg_file(cpg, ios_base::in); + string line; + string next_cpg; + + //get first CpG + getline(cpg_file, line); + record = line2tokens(line); + + while (getline(cpg_file, line)) { + next_record = line2tokens(line); + next_cpg = next_record[1]; + dictCpG.insert(make_pair(record[0]+TAB+record[1],CpG_record(next_cpg,cpg_index++))); + record = next_record; + } + //last record-no "next cpg" + dictCpG.insert(make_pair(record[0]+TAB+record[1],CpG_record("",cpg_index++))); + +} + + +void initializeDictSNP(string snp) +{ + ifstream snp_file(snp, ios::in); + vector record,next_record; + string line_str; + if ( snp_file.peek() == fstream::traits_type::eof() ) + { + cerr <<"SNP file is empty"< &line) +{ // get the final merged SNP + string debug_data; + string cuttent_snp = line[7]; + string final_snp = "0:"; + int snp_length = line[7].length(); + SNP_record snp_rec = checkLocus(line[0],line[6],line[3],cuttent_snp[0]); + debug_data = "(Ref-" +convertChar2srting(snp_rec.ref)+",Alt-"+convertChar2srting(snp_rec.alt)+",SP-"+snp_rec.sp+")"; + final_snp += convertChar2srting(cuttent_snp[0]); + if (DEBUG) { + final_snp += debug_data; + } + if (snp_length ==1) + return final_snp; + + string next_pos = snp_rec.next_snp; + string absolute_pos = line[6]; + for (int i=1; i &line) +{ + string final_snp = ""; + if (line[6]==".") + return final_snp; + string debug_data; + string cuttent_snp = line[7]; + final_snp = "0:"; + int snp_length = line[7].length(); + string locus = line[0] + TAB + line[6]; + SNP_record snp_rec = FindSNPRecord(locus); + debug_data = "(Ref-" +convertChar2srting(snp_rec.ref)+",Alt-"+convertChar2srting(snp_rec.alt)+",SP-"+snp_rec.sp+")"; + final_snp += convertChar2srting(cuttent_snp[0]); + if (DEBUG) { + final_snp += debug_data; + } + if (snp_length ==1) + return final_snp; + + string next_pos = snp_rec.next_snp; 
+ string absolute_pos = line[6]; + for (int i=1; i mergeSNP(vector l1, vector l2) +{//change snp to desired format, with relative index for each variant in format 0:var1:12:var2:13:var3 + + vector returned_snp; + + //if one mate has missing value-make it be the first vector + if (l2[6] == "." && l1[6] != ".") { + vector tmp = l1; + l1 = l2; + l2 = tmp; + } + + //if both mates have SNP in the same variant-make the one with more variants be the first mate + if (l1[6] == l2[6] && l1[7].length() < l2[7].length()) { + vector tmp = l1; + l1 = l2; + l2 = tmp; + } + + //if both mates are "-" strand, but same position + if (l1[6] != "." && l2[6] != "." && l1[4] == l2[4] && stoi(l1[6]) > stoi(l2[6])) { + vector tmp = l1; + l1 = l2; + l2 = tmp; + } + + + //get SNP-length: number of variants for each line: + string snp1 = l1[7]; + string snp2 = l2[7]; + int snp1_length = l1[7].length(); + int snp2_length = l2[7].length(); + //if both snp has missing values + if (l1[6] == "." && l2[6] == ".") returned_snp.assign( {".","."} ); + //get SNP data + //if one read has missing value (".") than use the other value. + else if (l1[6] == "." ) + returned_snp.assign( {l2[6],GetFinalSNP(l2)} ); + + //both mates have values in the SNP column, in the same position. l1 has more variants + else if (l1[6] == l2[6]) { + //both mates have the same SNP variants + if (snp1==snp2) + returned_snp.assign( {l2[6],GetFinalSNP(l2)} ); + //same position, different variants in both mates, the first mate has longer SNP variant list + else + { + //check all common variants between mates are the same. 
If not- put "N" in the relevant position + for (int i=0; i &l1) +{//add first and last coordinates in case where there is SNP data + int last_snp = 0; // works even if there's no SNP data + string coordinates = ""; + if (SNP_data && l1[7] != ".") //if there's SNP data + { //get last index of SNP + string str_tmp = l1[7].substr(0,l1[7].rfind(":")); + str_tmp = str_tmp.substr(str_tmp.rfind(":")+1); + int last_snp = stoi(l1[6])+stoi(str_tmp); + } + + //get last index of CpG + //string window = l1[0] + "\t" + l1[4]; + //int index = CpGFirstIndex(window) + l1[5].length() - 1; + int last_CpG = CpGLastLoci(l1[0],l1[4], l1[5].length() - 1); + + //insert values to vector and print + if (SNP_data && l1[6] != ".") + //l1.insert(l1.begin()+1, to_string(min(stoi(l1[4]),stoi(l1[6])))); + coordinates += TAB+ to_string(min(stoi(l1[4]),stoi(l1[6]))); + else // no SNP data, coordinates depends only on first CpG + //l1.insert(l1.begin()+1, l1[4]); + coordinates += TAB+ l1[4] ; + //l1.insert(l1.begin()+2, to_string(max(last_snp,last_CpG))); + coordinates += TAB+ to_string(max(last_snp,last_CpG)); + return coordinates; +} + +//void merge_paired_and_print(vector l1, vector l2, string &genome) { +void merge_paired_and_print(vector l1, vector l2, ofstream& merged_epiread) { + /*Merge two epiread-formated line into one */ + + if (!DEBUG) { + l1[1] = "."; + l2[1] = "."; + } + + + bool flag_SNP_identical = (SNP_data) ? l1[6] == l2[6] && l1[7] == l2[7] : true; + //if l2 doesn't add any information to the read-length, sequence and SNP data: + if (l1[4] == l2[4] && l1[5] == l2[5] && ( flag_SNP_identical )) { + //there is an snp value on the identical lines + try { + if ( SNP_data && l1[6] != "." 
) + l1[7] = GetFinalSNP(l1); + + //add_coordintes(l1); + string coordinates = add_coordintes(l1); + merged_epiread << vec2string(l1,coordinates) < stoi(l2[4])) { + vector tmp = l1; + l1 = l2; + l2 = tmp; + } + + + string pattern1 = l1[5]; + string pattern2 = l2[5]; + int pattern1_len = pattern1.length(); + int pattern2_len = pattern2.length(); + + //0-based file + //int first_cpg_site1 = stoi(l1[4]); + //int first_cpg_site2 = stoi(l2[4]); + + string window1,window2; + //window1 = l1[0] + "\t" + to_string(first_cpg_site1) + "\t" + to_string(first_cpg_site1+1); + //window2 = l2[0] + "\t" + to_string(first_cpg_site2) + "\t" + to_string(first_cpg_site2+1); + window1 = l1[0] + "\t" + l1[4]; + window2 = l2[0] + "\t" + l2[4]; + int first_cpg1,first_cpg2; + try { + first_cpg1 = CpGFirstIndex(window1); + first_cpg2 = CpGFirstIndex(window2); + } + catch (std::exception &e) { + cout << vec2string(l1) << endl; + cout << vec2string(l2) << endl; + return; + } + + + int last_site = max(first_cpg1 + pattern1_len, first_cpg2 + pattern2_len); + int overall_len = last_site-first_cpg1; + + string merged_pattern; // output pattern + + if (overall_len > MAX_PAT_LEN) // sanity check: make sure the two reads are not too far apart + { + // throw invalid_argument("invalid pairing. 
merged read is too long "); + string output_error = "Problem with:\n" + l1[0] + "\t" + l1[1] + "\t" + l1[2] + "\t" + l1[3] + "\t" + l1[4] + "\n" + l2[0] + "\t" + l2[1] + "\t" + l2[2] + "\t" + l2[3] + "\t" + l2[4] ; + cerr < merged_snp; + try { + if (SNP_data) //SNP file is not empty + { + merged_snp = mergeSNP(l1,l2); + l1[5] = merged_pattern; + l1[6] = merged_snp[0]; + l1[7] = merged_snp[1]; + + } + //add_coordintes(l1); + merged_epiread << vec2string(l1,add_coordintes(l1)) < Date: Sat, 20 Feb 2021 21:17:35 +0200 Subject: [PATCH 41/56] remove tailing spaces --- main.nf | 166 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 83 insertions(+), 83 deletions(-) diff --git a/main.nf b/main.nf index 6d18b745..a9aeb10d 100644 --- a/main.nf +++ b/main.nf @@ -12,13 +12,13 @@ def helpMessage() { log.info nfcoreHeader() log.info""" - + Usage: - + The typical command for running the pipeline is as follows: - + nextflow run nf-core/methylseq --input '*_R{1,2}.fastq.gz' -profile docker - + Mandatory arguments: --aligner [str] Alignment tool to use (default: bismark) Available: bismark, bismark_hisat, bwameth, biscuit @@ -85,16 +85,16 @@ def helpMessage() { --zymo [bool] --cegx [bool] --em_seq [bool] - + Other options: --outdir [file] The output directory where the results will be saved - + --publish_dir_mode [str] Mode for publishing results in the output directory. Available: symlink, rellink, link, copy, copyNoFollow, move (Default: copy) --email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits --email_on_fail [email] Same as --email, except only send mail if the workflow is not successful --max_multiqc_email_size [str] Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) -name [str] Name for the pipeline run. 
If not specified, Nextflow will automatically generate a random mnemonic - + AWSBatch options: --awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch --awsregion [str] The AWS Region for your AWS Batch job to run on @@ -114,7 +114,7 @@ assert params.aligner == 'bwameth' || params.aligner == 'bismark' || params.alig /* * SET UP CONFIGURATION VARIABLES */ - + // These params need to be set late, after the iGenomes config is loaded params.bismark_index = params.genome ? params.genomes[ params.genome ].bismark ?: false : false params.bwa_meth_index = params.genome ? params.genomes[ params.genome ].bwa_meth ?: false : false @@ -139,8 +139,8 @@ if( params.aligner =~ /bismark/ ){ Channel .fromPath(params.fasta, checkIfExists: true) .ifEmpty { exit 1, "fasta file not found : ${params.fasta}" } - .into { ch_fasta_for_makeBismarkIndex; ch_fasta_for_picard } - + .into { ch_fasta_for_makeBismarkIndex; ch_fasta_for_picard } + if( params.bismark_index ){ Channel .fromPath(params.bismark_index, checkIfExists: true) @@ -148,9 +148,9 @@ if( params.aligner =~ /bismark/ ){ .into { ch_bismark_index_for_bismark_align; ch_bismark_index_for_bismark_methXtract } ch_fasta_for_makeBismarkIndex.close() } - + } -else if( params.aligner == 'bwameth' || params.aligner == 'biscuit'){ +else if( params.aligner == 'bwameth' || params.aligner == 'biscuit'){ assert params.fasta : "No Fasta reference specified!" 
ch_wherearemyfiles_for_alignment.into { ch_wherearemyfiles_for_bwamem_align; ch_wherearemyfiles_for_biscuit_align; ch_wherearemyfiles_for_samtools_sort_index_flagstat; ch_wherearemyfiles_for_samblaster } @@ -291,7 +291,7 @@ if (params.input_paths) { } if (params.aligner == 'biscuit') { - if (params.epiread) { + if (params.epiread) { assert params.blacklist || params.whitelist : "Cannot find any blacklist/whitelist file matching: ${params.whitelist}\nEither whitelist or blacklist are needed if \'--epiread\' is specified" if (params.whitelist) { @@ -306,15 +306,15 @@ if (params.aligner == 'biscuit') { .ifEmpty { exit 1, "Cannot find any blacklist file matching: ${params.blacklist}" } .set { ch_blacklist_for_create_whitelist;} } - + if (params.common_dbsnp) { Channel .fromPath(params.common_dbsnp, checkIfExists: true) .ifEmpty { exit 1, "Cannot find any dbSNP file matching: ${params.common_dbsnp}\n" } .set { ch_commonSNP_for_SNP; } } - } - else + } + else { ch_fasta_for_create_whitelist.close() } @@ -360,8 +360,8 @@ if(params.save_reference) save_intermeds.add('Reference genome build') if(params.save_trimmed) save_intermeds.add('Trimmed FastQ files') if(params.unmapped) save_intermeds.add('Unmapped reads') if(params.save_align_intermeds) save_intermeds.add('Intermediate BAM files') -if(params.save_pileup_file) save_intermeds.add('Pileup files') -if(params.save_snp_file) save_intermeds.add('SNP bed-files') +if(params.save_pileup_file) save_intermeds.add('Pileup files') +if(params.save_snp_file) save_intermeds.add('SNP bed-files') if(save_intermeds.size() > 0) summary['Save Intermediates'] = save_intermeds.join(', ') debug_mode = []; if(params.debug_epiread) debug_mode.add('Debug epiread step') @@ -371,7 +371,7 @@ if(params.minins) summary['Bismark min insert size'] = bismark_minins if(params.maxins || params.em_seq) summary['Bismark max insert size'] = bismark_maxins if(params.bismark_align_cpu_per_multicore) summary['Bismark align CPUs per --multicore'] = 
params.bismark_align_cpu_per_multicore if(params.bismark_align_mem_per_multicore) summary['Bismark align memory per --multicore'] = params.bismark_align_mem_per_multicore -if(params.assets_dir) summary['Assets Directory'] = params.assets_dir +if(params.assets_dir) summary['Assets Directory'] = params.assets_dir if(params.whitelist) summary['Whitelist'] = params.whitelist if(params.blacklist) summary['Blacklist'] = params.whitelist if(params.common_dbsnp) summary['Common SNP'] = params.common_dbsnp @@ -420,7 +420,7 @@ Channel.from(summary.collect{ [it.key, it.value] }) """.stripIndent() } .set { ch_workflow_summary } - + /* * Parse software version numbers */ @@ -462,9 +462,9 @@ process get_software_versions { multiqc --version &> v_multiqc.txt samblaster --version &> v_samblaster.txt biscuit &>v_biscuit.txt 2>&1 || true - bcftools --version &> v_bcftools.txt - bedtools --version &> v_bedtools.txt - parallel --version &> v_parallel.txt + bcftools --version &> v_bcftools.txt + bedtools --version &> v_bedtools.txt + parallel --version &> v_parallel.txt gawk --version > v_gawk.txt scrape_software_versions.py &> software_versions_mqc.yaml """ @@ -753,13 +753,13 @@ if( params.aligner =~ /bismark/ ){ $splicesites """ } - + /* * STEP 4 - Samtools sort bismark */ process samtools_sort_index_flagstat_bismark { tag "$name" - publishDir "${params.outdir}/samtools", mode: 'copy', + publishDir "${params.outdir}/samtools", mode: 'copy', saveAs: {filename -> if(filename.indexOf("report.txt") > 0) "logs/$filename" else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename @@ -804,7 +804,7 @@ if( params.aligner =~ /bismark/ ){ set val(name), file(bam) from ch_bam_for_bismark_deduplicate output: - set val(name), file("*.deduplicated.bam") into ch_bam_dedup_for_bismark_methXtract, ch_dedup_bam_for_samtools_sort_index_flagstat + set val(name), file("*.deduplicated.bam") into 
ch_bam_dedup_for_bismark_methXtract, ch_dedup_bam_for_samtools_sort_index_flagstat set val(name), file("*.deduplication_report.txt") into ch_bismark_dedup_log_for_bismark_report, ch_bismark_dedup_log_for_bismark_summary, ch_bismark_dedup_log_for_multiqc script: @@ -820,7 +820,7 @@ if( params.aligner =~ /bismark/ ){ */ process samtools_sort_index_flagstat_dedup_bismark { tag "$name" - publishDir "${params.outdir}/samtools", mode: 'copy', + publishDir "${params.outdir}/samtools", mode: 'copy', saveAs: {filename -> if(filename.indexOf("report.txt") > 0) "logs/$filename" else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename @@ -833,7 +833,7 @@ if( params.aligner =~ /bismark/ ){ file wherearemyfiles from ch_wherearemyfiles_for_bismark_dedup_samtools_sort.collect() output: - set val(name), file("*.sorted.bam") into ch_bam_sorted_dedup_for_qualimap + set val(name), file("*.sorted.bam") into ch_bam_sorted_dedup_for_qualimap file "where_are_my_files.txt" script: @@ -845,7 +845,7 @@ if( params.aligner =~ /bismark/ ){ -o ${bam.baseName}.sorted.bam """ } - + /* * STEP 5 - Bismark methylation extraction */ @@ -977,7 +977,7 @@ else { ch_bismark_mbias_for_multiqc = Channel.from(false) ch_bismark_reports_results_for_multiqc = Channel.from(false) ch_bismark_summary_results_for_multiqc = Channel.from(false) - + } @@ -1133,7 +1133,7 @@ else { ch_samtools_stats_results_for_multiqc = Channel.from(false) ch_markDups_results_for_multiqc = Channel.from(false) ch_methyldackel_results_for_multiqc = Channel.from(false) - + } @@ -1165,7 +1165,7 @@ if( params.aligner == 'biscuit' ){ prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?(\.bz2)?$/ non_directional = params.single_cell || params.zymo || params.nondirectional_library ? 0 : 1 - // Paired-end or single-end input files and pbat or not + // Paired-end or single-end input files and pbat or not input = params.pbat ? 
params.single_end ? reads + " -b 3" : "${reads[1]} ${reads[0]} -b " + non_directional : reads.toString() +" -b " + non_directional """ @@ -1196,14 +1196,14 @@ if( params.aligner == 'biscuit' ){ output: set val(name), file("${bam.baseName}.samblaster.bam") into ch_samblaster_for_samtools_sort_index_flagstat file "*log" into ch_samblaster_for_multiqc - + script: def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false def sort_mem = avail_mem && avail_mem > 2 ? "-m ${avail_mem}G" : '' unmapped = params.single_end ? '--ignoreUnmated' : '' """ - samtools sort -n $bam -@ ${task.cpus} $sort_mem| samtools view -h | samblaster -M $unmapped -d "${bam.baseName}_discordant.sam" -s "${bam.baseName}_split.sam" -u "${bam.baseName}_.fastq" --excludeDups --addMateTags | samtools view -Sb > ${bam.baseName}.samblaster.bam + samtools sort -n $bam -@ ${task.cpus} $sort_mem| samtools view -h | samblaster -M $unmapped -d "${bam.baseName}_discordant.sam" -s "${bam.baseName}_split.sam" -u "${bam.baseName}_.fastq" --excludeDups --addMateTags | samtools view -Sb > ${bam.baseName}.samblaster.bam cp .command.log ${bam.baseName}.log """ } @@ -1233,7 +1233,7 @@ if( params.aligner == 'biscuit' ){ file "${samblaster_bam.baseName}_flagstat_report.txt" into ch_flagstat_results_biscuit_for_multiqc file "${samblaster_bam.baseName}_stats_report.txt" into ch_samtools_stats_results_biscuit_for_multiqc file "where_are_my_files.txt" - + script: def avail_mem = task.memory ? ((task.memory.toGiga() - 6) / task.cpus).trunc() : false @@ -1248,7 +1248,7 @@ if( params.aligner == 'biscuit' ){ """ } - + /* * STEP 6 - Create vcf file with pileup, to extract methylation */ @@ -1268,15 +1268,15 @@ if( params.aligner == 'biscuit' ){ output: set val(name), file("${name}.vcf.gz*") into ch_vcf_biscuit_qc ,ch_vcf_for_bedgraph,ch_vcf_for_epiread - + script: filter_duplication = params.skip_deduplication || params.rrbs ? 
'-u' : '' """ - biscuit pileup -q ${task.cpus} $filter_duplication $fasta ${bam} -o ${name}.vcf + biscuit pileup -q ${task.cpus} $filter_duplication $fasta ${bam} -o ${name}.vcf bgzip -@ ${task.cpus} -f ${name}.vcf tabix -f -p vcf ${name}.vcf.gz """ - } + } /* * STEP 7 - Create bedgraph file from vcf @@ -1284,7 +1284,7 @@ if( params.aligner == 'biscuit' ){ process createBedgraph { tag "$name" publishDir "${params.outdir}/methylation_extract", mode: 'copy' - + input: set val(name), file(vcf) from ch_vcf_for_bedgraph @@ -1295,10 +1295,10 @@ if( params.aligner == 'biscuit' ){ min_depth = params.min_coverage > 1 ? "${params.min_coverage}" : '1' all_contexts = params.comprehensive ? 'c, cg, ch, hcg, gch' : 'cg' """ - biscuit vcf2bed -k $min_depth -t $all_contexts "${vcf[0]}" > "${name}.bedgraph" + biscuit vcf2bed -k $min_depth -t $all_contexts "${vcf[0]}" > "${name}.bedgraph" """ } - + if (params.epiread) { if (params.common_dbsnp) { /* @@ -1308,10 +1308,10 @@ if( params.aligner == 'biscuit' ){ input: file commonSNP_file from ch_commonSNP_for_SNP.collect() - + output: file("reformattedSNP.snv.txt.gz*" ) into ch_reformattedSNP_for_SNP - + script: """ less $commonSNP_file | $projectDir/bin/processUcscDbsnp.pl | grep snv | bgzip > reformattedSNP.snv.txt.gz @@ -1320,10 +1320,10 @@ if( params.aligner == 'biscuit' ){ } } else { - ch_reformattedSNP_for_SNP = Channel.empty() + ch_reformattedSNP_for_SNP = Channel.empty() } - - + + /* * STEP 7.2 - Create whitelist for SNP calling */ @@ -1335,13 +1335,13 @@ if( params.aligner == 'biscuit' ){ input: file blacklist from ch_blacklist_for_create_whitelist file fasta_index from ch_fasta_for_create_whitelist - + output: file("whitelist.${name}.bed.gz" ) into ch_whitelist_for_SNP, ch_whitelist_for_epiread - file "sizes.${name}" + file "sizes.${name}" script: name = assembly_name - '.fa' - + """ cut -f1,2 $fasta_index > sizes.${name} bedtools sort -g sizes.${name} -i $blacklist > ${blacklist.baseName}.sorted.bed @@ -1349,20 +1349,20 @@ 
if( params.aligner == 'biscuit' ){ """ } } - else { + else { ch_fasta_for_create_whitelist.close() } /* - * STEP 7.3 - SNP file generation for the epiread convertion + * STEP 7.3 - SNP file generation for the epiread convertion */ - process get_SNP_file { + process get_SNP_file { tag "$name" publishDir "${params.outdir}/epireads/snp", mode: 'copy', saveAs: {filename -> if( filename.indexOf("bed") > 0 && params.save_snp_file && filename != "where_are_my_files.txt") filename else null } - + input: set val(name), file(vcf) from ch_vcf_for_epiread file whitelist_file from ch_whitelist_for_SNP.collect() @@ -1371,16 +1371,16 @@ if( params.aligner == 'biscuit' ){ output: set val(name), file ("${name}.snp.bed") into ch_snp_for_epiread file "*gz" - + script: whitelist = params.whitelist ? "-R $whitelist_file" : '' - snp_file = (reformatted_SNP.size()>0) ? "-a ${reformatted_SNP[0]}" : '' + snp_file = (reformatted_SNP.size()>0) ? "-a ${reformatted_SNP[0]}" : '' """ bcftools annotate $whitelist -O z ${snp_file} -h $projectDir/assets/common_dbsnp.hdr -c CHROM,FROM,TO,TYPE,COMMON_SOME,COMMON_ALL,REF_MIN,ALT_MIN,REF_DBSNP,ALT_DBSNP,REF_ALL,ALT_ALL,RSID,MAX_MAF "${vcf[0]}" > "${name}-whitelist-dbSNP.vcf.gz" tabix -p vcf "${name}-whitelist-dbSNP.vcf.gz" bcftools view -O z -i'ALT!="N" & ALT!="." 
& ( (COUNT(GT=="0/1")>=1 & COMMON_ALL==1 & MAX_MAF>=0.05) | (COUNT(GT=="0/1" & GQ>=60)>=1) )' "${name}-whitelist-dbSNP.vcf.gz" > "${name}-whitelist-dbSNP-HET60.vcf.gz" - tabix -p vcf "${name}-whitelist-dbSNP-HET60.vcf.gz" - bcftools query -u -i'GT="0/1" & GQ>=10' --format '%CHROM\t%POS\t%POS\t%REF\t%ALT[\t%GT\t%GQ\t%SP\t%AC\t%AF1]\t%RSID\t%COMMON_ALL\t%MAX_MAF\t%REF_MIN\t%ALT_MIN\n' "${name}-whitelist-dbSNP-HET60.vcf.gz" | awk -v OFS="\t" '{\$2 = \$2 - 1; print}' > "${name}.snp.bed" + tabix -p vcf "${name}-whitelist-dbSNP-HET60.vcf.gz" + bcftools query -u -i'GT="0/1" & GQ>=10' --format '%CHROM\t%POS\t%POS\t%REF\t%ALT[\t%GT\t%GQ\t%SP\t%AC\t%AF1]\t%RSID\t%COMMON_ALL\t%MAX_MAF\t%REF_MIN\t%ALT_MIN\n' "${name}-whitelist-dbSNP-HET60.vcf.gz" | awk -v OFS="\t" '{\$2 = \$2 - 1; print}' > "${name}.snp.bed" """ } @@ -1390,7 +1390,7 @@ if( params.aligner == 'biscuit' ){ process epiread_convertion { tag "$name" publishDir "${params.outdir}/epireads", mode: 'copy' - + input: set val(name), file(bam), @@ -1398,7 +1398,7 @@ if( params.aligner == 'biscuit' ){ file(snp), file(fasta), file(fasta_index), - file(whitelist) from ch_bam_sorted_for_epiread + file(whitelist) from ch_bam_sorted_for_epiread .join(ch_bam_index_for_epiread) .join(ch_snp_for_epiread) .combine(ch_fasta_for_epiread) @@ -1408,9 +1408,9 @@ if( params.aligner == 'biscuit' ){ output: - file "*${name}.e*.gz*" + file "*${name}.e*.gz*" file "${name}.original.epiread.*" optional true - + script: snp_file = (snp.size()>0) ? "-B " + snp.toString() : '' cpg_file = assets.toString() + "/cpg.bed.gz" @@ -1418,7 +1418,7 @@ if( params.aligner == 'biscuit' ){ no_filter_reverse = params.rrbs ? 
"-p" : '' if (params.single_end) { """ - bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam + bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam samtools index ${name}.bam biscuit epiread -q ${task.cpus} $snp_file $no_filter_reverse $fasta ${name}.bam |sort --parallel=${task.cpus} -T . -k1,1Vf -k5,5n | bgzip > ${name}.epiread.gz tabix -0 -s 1 -b 5 -e 5 ${name}.epiread.gz @@ -1426,13 +1426,13 @@ if( params.aligner == 'biscuit' ){ } else if (params.debug_epiread) { """ zcat $cpg_file > cpg.bed - - bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam + + bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam samtools index ${name}.bam biscuit epiread -q ${task.cpus} $snp_file $fasta ${name}.bam | sort --parallel=${task.cpus} -T . -k2,2 -k1,1 -k4,4 -k3,3n > ${name}.original.epiread less ${name}.original.epiread | $projectDir/bin/epiread_pairedEnd_convertion "cpg.bed" $snp ${name}.epiread $debug_merging_epiread > ${name}.err - sort -k1,1Vf -k 2,2n -k 3,3n --parallel=${task.cpus} -T . ${name}.epiread | bgzip > ${name}.epiread.gz - sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . ${name}.err | bgzip > ${name}.err.gz + sort -k1,1Vf -k 2,2n -k 3,3n --parallel=${task.cpus} -T . ${name}.epiread | bgzip > ${name}.epiread.gz + sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . ${name}.err | bgzip > ${name}.err.gz sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . 
${name}.original.epiread | bgzip > ${name}.original.epiread.gz tabix -0 -s 1 -b 5 -e 5 ${name}.original.epiread.gz tabix -0 -p bed ${name}.epiread.gz @@ -1442,18 +1442,18 @@ if( params.aligner == 'biscuit' ){ else { """ zcat $cpg_file > cpg.bed - bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam + bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam samtools index ${name}.bam biscuit epiread -q ${task.cpus} $snp_file $fasta ${name}.bam | sort --parallel=${task.cpus} -T . -k2,2 -k1,1 -k4,4 -k3,3n | $projectDir/bin/epiread_pairedEnd_convertion "cpg.bed" $snp ${name}.epiread $debug_merging_epiread > ${name}.err - sort -k1,1Vf -k 2,2n -k 3,3n --parallel=${task.cpus} -T . ${name}.epiread | bgzip > ${name}.epiread.gz - sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . ${name}.err | bgzip > ${name}.err.gz - tabix -0 -p bed ${name}.epiread.gz + sort -k1,1Vf -k 2,2n -k 3,3n --parallel=${task.cpus} -T . ${name}.epiread | bgzip > ${name}.epiread.gz + sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . 
${name}.err | bgzip > ${name}.err.gz + tabix -0 -p bed ${name}.epiread.gz tabix -0 -s 1 -b 5 -e 5 ${name}.err.gz """ } } } - + /* * STEP 8 - Running QC of samples */ @@ -1472,14 +1472,14 @@ if( params.aligner == 'biscuit' ){ .combine(ch_fasta_for_biscuitQC) .combine(ch_fasta_index_for_biscuitQC) .combine(ch_assets_dir_for_biscuit_qc) - + output: file "*_biscuitQC" into ch_QC_results_for_multiqc script: assembly = fasta.toString().replaceAll(/\.\w+/,"") """ - QC.sh -v ${vcf[0]} -o ${name}.${assembly}_biscuitQC $assets $fasta ${name}.${assembly} ${bam} + QC.sh -v ${vcf[0]} -o ${name}.${assembly}_biscuitQC $assets $fasta ${name}.${assembly} ${bam} """ } @@ -1497,7 +1497,7 @@ else { process qualimap { tag "$name" publishDir "${params.outdir}/qualimap", mode: params.publish_dir_mode - + input: set val(name), file(bam) from ch_bam_sorted_dedup_for_qualimap @@ -1523,7 +1523,7 @@ process qualimap { */ process prepareGenomeToPicard { publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir }, - saveAs: { (params.save_reference && it.indexOf("dict") >0) ? it : null }, mode: 'copy' + saveAs: { (params.save_reference && it.indexOf("dict") >0) ? it : null }, mode: 'copy' input: file fasta from ch_fasta_for_picard @@ -1537,7 +1537,7 @@ process prepareGenomeToPicard { avail_mem = 3 } else { avail_mem = task.memory.toGiga() - } + } """ mv ${fasta} ${fasta.baseName}.picard.fa picard -Xmx${avail_mem}g CreateSequenceDictionary \\ @@ -1547,7 +1547,7 @@ process prepareGenomeToPicard { } - + /* * STEP 11 - Picard InsertSizeMetrics and GcBiasMetrics */ @@ -1565,9 +1565,9 @@ process picardMetrics { file dict from ch_fasta_picard_dict_for_picard.collect() output: - file "${name}.*.pdf" + file "${name}.*.pdf" file "${name}.*.txt" into ch_picard_results_for_multiqc - + script: if( !task.memory ){ log.info "[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this." 
@@ -1582,7 +1582,7 @@ process picardMetrics { HISTOGRAM_FILE=${name}.insert_size_histogram.pdf \\ ASSUME_SORTED=true \\ VALIDATION_STRINGENCY=LENIENT - set +e + set +e picard -Xmx${avail_mem}g CollectGcBiasMetrics \\ INPUT=$bam \\ OUTPUT=${name}.gc_bias_metrics.txt \\ @@ -1591,7 +1591,7 @@ process picardMetrics { ASSUME_SORTED=true \\ IS_BISULFITE_SEQUENCED=true \\ REFERENCE_SEQUENCE=$fasta \\ - VALIDATION_STRINGENCY=LENIENT + VALIDATION_STRINGENCY=LENIENT [ ! "\$?" -eq "0" ] && picard -Xmx${avail_mem}g ReorderSam I=$bam O=${bam.baseName}.picard.bam SEQUENCE_DICTIONARY=$fasta VALIDATION_STRINGENCY=LENIENT TMP_DIR=. && picard -Xmx${avail_mem}g CollectGcBiasMetrics \\ INPUT=${bam.baseName}.picard.bam \\ OUTPUT=${name}.gc_bias_metrics.txt \\ @@ -1622,7 +1622,7 @@ process preseq { """ preseq lc_extrap -v -B ${bam.baseName}.bam -o ${bam.baseName}.ccurve.txt """ - + } /* From 020b0e17a46e91918bd4c9dbaa0935b54dc6a5f1 Mon Sep 17 00:00:00 2001 From: ekushele Date: Sat, 20 Feb 2021 21:27:53 +0200 Subject: [PATCH 42/56] remove software updates from CHANGELOG.md --- CHANGELOG.md | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e240f6e3..b63adf34 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,19 +20,6 @@ * _new_: bcftools`1.10` * _new_: parallel `20201122` * _new_: gawk `5.1.0` -* python `3.7.3` > `3.8.5` -* markdown `3.1.1` > `3.3.3` -* pymdown-extensions `6.0` > `8.1.1` -* pygments `2.6.1` > `2.7.4` -* pigz `2.3.4` > `2.5` -* trim-galore `0.6.5` > `0.6.6` -* samtools `1.9` > `1.10` -* bowtie2 `2.3.5` > `2.4.2` -* hisat2 `2.2.0` > `2.2.1` -* bismark `0.22.3` > `0.23.0` -* preseq `2.0.3` > `3.1.2` -* picard `2.22.2` > `2.25.0` -* methyldackel `0.5.0` > `0.5.1` ## [v1.5](https://github.com/nf-core/methylseq/releases/tag/1.5) - 2020-04-09 From 63f9202454b3c8b5b291c2e831aaef5ce1e659b3 Mon Sep 17 00:00:00 2001 From: Phil Ewels Date: Mon, 22 Feb 2021 07:58:20 +0100 Subject: [PATCH 43/56] Update CHANGELOG.md --- CHANGELOG.md | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 40fcaddd..875d8014 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ # nf-core/methylseq -## v1.6dev - [2020-02-15] +## v1.6dev - [date] **:warning: Breaking change!** From 310e45ab53bc69a3dad9936bca0d2846ee79fc3a Mon Sep 17 00:00:00 2001 From: Phil Ewels Date: Mon, 22 Feb 2021 07:58:42 +0100 Subject: [PATCH 44/56] Update CHANGELOG.md --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 875d8014..9a05f3af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,3 @@ - # nf-core/methylseq ## v1.6dev - [date] From b9cd416cbd416f236d3aa384e8a49ce84f535f98 Mon Sep 17 00:00:00 2001 From: ekushele Date: Mon, 22 Feb 2021 12:31:43 +0200 Subject: [PATCH 45/56] change bin/epiread_pairedEnd_convertion to bin/epiread_pairedEnd_conversion --- ...dEnd_convertion => epiread_pairedEnd_conversion} | Bin ...vertion.cpp => epiread_pairedEnd_conversion.cpp} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename bin/{epiread_pairedEnd_convertion => epiread_pairedEnd_conversion} (100%) rename bin/{epiread_pairedEnd_convertion.cpp => epiread_pairedEnd_conversion.cpp} (100%) diff --git a/bin/epiread_pairedEnd_convertion b/bin/epiread_pairedEnd_conversion similarity index 100% rename from bin/epiread_pairedEnd_convertion rename to bin/epiread_pairedEnd_conversion diff --git a/bin/epiread_pairedEnd_convertion.cpp b/bin/epiread_pairedEnd_conversion.cpp similarity index 100% rename from bin/epiread_pairedEnd_convertion.cpp rename to bin/epiread_pairedEnd_conversion.cpp From 4a7415f067ed0a8e805421bc202c85abd0536257 Mon Sep 17 00:00:00 2001 From: ekushele Date: Mon, 22 Feb 2021 12:41:12 +0200 Subject: [PATCH 46/56] remove cleanup=true from base.config --- conf/base.config | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/conf/base.config b/conf/base.config index cb81e08a..86f48b6f 100644 --- 
a/conf/base.config +++ b/conf/base.config @@ -42,9 +42,8 @@ process { memory = { check_max( 64.GB * task.attempt, 'memory') } time = { check_max( 8.d * task.attempt, 'time') } } - - withName:samtools_sort_index_flagstat_bismark { - cpus = { check_max( 4 * task.attempt, 'cpus') } + withName:samtools_sort_index_flagstat_bismark { + cpus = { check_max( 4 * task.attempt, 'cpus') } memory = { check_max( 32.GB * task.attempt, 'memory') } time = { check_max( 1.d * task.attempt, 'time') } } @@ -65,10 +64,9 @@ process { } withName:preseq { errorStrategy = 'ignore' - cpus = { check_max( 4 * task.attempt, 'cpus') } - memory = { check_max( 8.GB * task.attempt, 'memory') } - time = { check_max( 15.h * task.attempt, 'time') } - + cpus = { check_max( 4 * task.attempt, 'cpus') } + memory = { check_max( 8.GB * task.attempt, 'memory') } + time = { check_max( 15.h * task.attempt, 'time') } } withName:get_software_versions { cache = false @@ -94,8 +92,7 @@ process { time = { check_max( 1.d * task.attempt, 'time') } } - -withName:biscuit_align { + withName:biscuit_align { cpus = { check_max( 10 * task.attempt, 'cpus') } memory = { check_max( 64.GB * task.attempt, 'memory') } time = { check_max( 6.d * task.attempt, 'time') } @@ -111,12 +108,11 @@ withName:biscuit_align { time = { check_max( 2.d * task.attempt, 'time') } } - withName:markDuplicates_samblaster { + withName:markDuplicates_samblaster { cpus = { check_max( 10 * task.attempt, 'cpus') } memory = { check_max( 32.GB * task.attempt, 'memory') } time = { check_max( 3.d * task.attempt, 'time') } - } - + } withName:createVCF { cpus = { check_max( 4 * task.attempt, 'cpus') } memory = { check_max( 32.GB * task.attempt, 'memory') } @@ -142,17 +138,16 @@ withName:biscuit_align { memory = { check_max( 32.GB * task.attempt, 'memory') } time = { check_max( 1.d * task.attempt, 'time') } } - withName:epiread_convertion { + withName:epiread_convertion { cpus = { check_max( 4 * task.attempt, 'cpus') } memory = { check_max( 32.GB * 
task.attempt, 'memory') } time = { check_max( 2.d * task.attempt, 'time') } } - withName:get_SNP_file { + withName:get_SNP_file { cpus = { check_max( 2 * task.attempt, 'cpus') } memory = { check_max( 32.GB * task.attempt, 'memory') } time = { check_max( 2.d * task.attempt, 'time') } } - withName:fastqc { cpus = { check_max( 6 * task.attempt, 'cpus') } memory = { check_max( 32.GB * task.attempt, 'memory') } @@ -167,5 +162,3 @@ params { max_time = 240.h igenomes_base = 's3://ngi-igenomes/igenomes/' } - -// cleanup = true From 72a0dae9e2f0f82497557404190c772202b23bf4 Mon Sep 17 00:00:00 2001 From: ekushele Date: Mon, 22 Feb 2021 12:54:03 +0200 Subject: [PATCH 47/56] processes names as snake_case --- conf/base.config | 4 ++-- main.nf | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/conf/base.config b/conf/base.config index 86f48b6f..89c71f58 100644 --- a/conf/base.config +++ b/conf/base.config @@ -128,12 +128,12 @@ process { memory = { check_max( 32.GB * task.attempt, 'memory') } time = { check_max( 12.h * task.attempt, 'time') } } - withName:picardMetrics { + withName:picard_metrics { cpus = { check_max( 4 * task.attempt, 'cpus') } memory = { check_max( 32.GB * task.attempt, 'memory') } time = { check_max( 2.d * task.attempt, 'time') } } - withName:prepareGenomeToPicard { + withName:prepare_genome_to_picard { cpus = { check_max( 2 * task.attempt, 'cpus') } memory = { check_max( 32.GB * task.attempt, 'memory') } time = { check_max( 1.d * task.attempt, 'time') } diff --git a/main.nf b/main.nf index a9aeb10d..09bd380d 100644 --- a/main.nf +++ b/main.nf @@ -157,7 +157,7 @@ else if( params.aligner == 'bwameth' || params.aligner == 'biscuit'){ Channel .fromPath(params.fasta, checkIfExists: true) .ifEmpty { exit 1, "fasta file not found : ${params.fasta}" } - .into { ch_fasta_for_makeBwaMemIndex; ch_fasta_for_makeFastaIndex; ch_fasta_for_buildBiscuitQCAssets; ch_fasta_for_methyldackel; ch_fasta_for_pileup; ch_fasta_for_epiread; 
ch_fasta_for_biscuitQC; ch_fasta_for_picard} + .into { ch_fasta_for_makeBwaMemIndex; ch_fasta_for_makeFastaIndex; ch_fasta_for_build_biscuit_QC_assets; ch_fasta_for_methyldackel; ch_fasta_for_pileup; ch_fasta_for_epiread; ch_fasta_for_biscuitQC; ch_fasta_for_picard} if( params.bwa_meth_index ){ Channel @@ -189,7 +189,7 @@ if( params.aligner == 'biscuit' && params.assets_dir ) { .fromPath("${params.assets_dir}", checkIfExists: true) .ifEmpty { exit 1, "Assets directory for biscuit QC not found: ${params.assets_dir}" } .into { ch_assets_dir_for_biscuit_qc; ch_assets_dir_with_cpg_for_epiread } - ch_fasta_for_buildBiscuitQCAssets.close() + ch_fasta_for_build_biscuit_QC_assets.close() } if( workflow.profile == 'uppmax' ){ @@ -565,12 +565,12 @@ if( !params.fasta_index && params.aligner == 'bwameth' || !params.fasta_index & * PREPROCESSING - Build Biscuit QC assets */ if( !params.assets_dir && params.aligner == 'biscuit' ) { - process buildBiscuitQCAssets { + process build_biscuit_QC_assets { tag "$fasta" publishDir path: "${params.outdir}/reference_assets", saveAs: { params.save_reference ? it : null }, mode: params.publish_dir_mode input: - file fasta from ch_fasta_for_buildBiscuitQCAssets + file fasta from ch_fasta_for_build_biscuit_QC_assets output: file "*assets" into ch_assets_dir_for_biscuit_qc, ch_assets_dir_with_cpg_for_epiread @@ -1521,7 +1521,7 @@ process qualimap { /* * STEP 10 - Picard - Preparation step */ -process prepareGenomeToPicard { +process prepare_genome_to_picard { publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir }, saveAs: { (params.save_reference && it.indexOf("dict") >0) ? 
it : null }, mode: 'copy' @@ -1551,7 +1551,7 @@ process prepareGenomeToPicard { /* * STEP 11 - Picard InsertSizeMetrics and GcBiasMetrics */ -process picardMetrics { +process picard_metrics { tag "$name" publishDir "${params.outdir}/picardMetrics", mode: 'copy', saveAs: { filename -> From 9329e4242ce20015f3e7b238148971a56c3edeec Mon Sep 17 00:00:00 2001 From: ekushele Date: Mon, 22 Feb 2021 14:16:04 +0200 Subject: [PATCH 48/56] change processes name to snake_case --- conf/base.config | 4 ++-- environment.yml | 2 +- main.nf | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/conf/base.config b/conf/base.config index 89c71f58..056cc3e6 100644 --- a/conf/base.config +++ b/conf/base.config @@ -113,7 +113,7 @@ process { memory = { check_max( 32.GB * task.attempt, 'memory') } time = { check_max( 3.d * task.attempt, 'time') } } - withName:createVCF { + withName:create_VCF { cpus = { check_max( 4 * task.attempt, 'cpus') } memory = { check_max( 32.GB * task.attempt, 'memory') } time = { check_max( 3.d * task.attempt, 'time') } @@ -123,7 +123,7 @@ process { memory = { check_max( 32.GB * task.attempt, 'memory') } time = { check_max( 5.d * task.attempt, 'time') } } - withName:createBedgraph { + withName:create_Bedgraph { cpus = { check_max( 1 * task.attempt, 'cpus') } memory = { check_max( 32.GB * task.attempt, 'memory') } time = { check_max( 12.h * task.attempt, 'time') } diff --git a/environment.yml b/environment.yml index abc80748..9fdb3481 100644 --- a/environment.yml +++ b/environment.yml @@ -34,6 +34,6 @@ dependencies: - bioconda::samblaster=0.1.26 - bioconda::bedtools=2.30.0 - bioconda::biscuit=0.3.16.20200420 - - bioconda::bcftools=1.10 + - bioconda::bcftools=1.9 - conda-forge::parallel=20201122 - gawk=5.1.0 diff --git a/main.nf b/main.nf index 09bd380d..48a63e84 100644 --- a/main.nf +++ b/main.nf @@ -179,7 +179,7 @@ else if( params.aligner == 'bwameth' || params.aligner == 'biscuit'){ Channel .fromPath(params.fasta_index, checkIfExists: true) 
.ifEmpty { exit 1, "fasta index file not found: ${params.fasta_index}" } - .into { ch_fasta_index_for_methyldackel; ch_fasta_index_for_biscuitQC; ch_fasta_index_for_createVCF; ch_fasta_for_create_whitelist; ch_fasta_index_for_epiread } + .into { ch_fasta_index_for_methyldackel; ch_fasta_index_for_biscuitQC; ch_fasta_index_for_create_VCF; ch_fasta_for_create_whitelist; ch_fasta_index_for_epiread } ch_fasta_for_makeFastaIndex.close() } } @@ -552,7 +552,7 @@ if( !params.fasta_index && params.aligner == 'bwameth' || !params.fasta_index & file fasta from ch_fasta_for_makeFastaIndex output: - file "${fasta}.fai" into ch_fasta_index_for_methyldackel,ch_fasta_index_for_biscuitQC,ch_fasta_index_for_createVCF,ch_fasta_for_create_whitelist,ch_fasta_index_for_epiread + file "${fasta}.fai" into ch_fasta_index_for_methyldackel,ch_fasta_index_for_biscuitQC,ch_fasta_index_for_create_VCF,ch_fasta_for_create_whitelist,ch_fasta_index_for_epiread script: """ @@ -1252,7 +1252,7 @@ if( params.aligner == 'biscuit' ){ /* * STEP 6 - Create vcf file with pileup, to extract methylation */ - process createVCF { + process create_VCF { tag "$name" publishDir "${params.outdir}/methylation_extract", mode: 'copy', saveAs: {filename -> @@ -1264,7 +1264,7 @@ if( params.aligner == 'biscuit' ){ input: set val(name), file(bam), file (bam_index) from ch_bam_sorted_for_pileup.join(ch_bam_index_sorted_for_pileup) file fasta from ch_fasta_for_pileup.collect() - file fasta_index from ch_fasta_index_for_createVCF.collect() + file fasta_index from ch_fasta_index_for_create_VCF.collect() output: set val(name), file("${name}.vcf.gz*") into ch_vcf_biscuit_qc ,ch_vcf_for_bedgraph,ch_vcf_for_epiread @@ -1281,7 +1281,7 @@ if( params.aligner == 'biscuit' ){ /* * STEP 7 - Create bedgraph file from vcf */ - process createBedgraph { + process create_Bedgraph { tag "$name" publishDir "${params.outdir}/methylation_extract", mode: 'copy' From 16b78d9aabfe250785f423b75fc84cd65d2aff05 Mon Sep 17 00:00:00 2001 From: ekushele 
Date: Mon, 22 Feb 2021 14:21:10 +0200 Subject: [PATCH 49/56] change epiread_convertion to epiread_conversion in process name --- conf/base.config | 2 +- main.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/base.config b/conf/base.config index 056cc3e6..e1b1a112 100644 --- a/conf/base.config +++ b/conf/base.config @@ -138,7 +138,7 @@ process { memory = { check_max( 32.GB * task.attempt, 'memory') } time = { check_max( 1.d * task.attempt, 'time') } } - withName:epiread_convertion { + withName:epiread_conversion { cpus = { check_max( 4 * task.attempt, 'cpus') } memory = { check_max( 32.GB * task.attempt, 'memory') } time = { check_max( 2.d * task.attempt, 'time') } diff --git a/main.nf b/main.nf index 48a63e84..8cc54bba 100644 --- a/main.nf +++ b/main.nf @@ -1387,7 +1387,7 @@ if( params.aligner == 'biscuit' ){ /* * STEP 7.4 - Convert bam to epiread file format */ - process epiread_convertion { + process epiread_conversion { tag "$name" publishDir "${params.outdir}/epireads", mode: 'copy' From e980520aef4471cd35032e8d6984ae0eae977d76 Mon Sep 17 00:00:00 2001 From: ekushele Date: Mon, 22 Feb 2021 14:30:01 +0200 Subject: [PATCH 50/56] add linebreaks and indent in markDuplicates_samblaster --- main.nf | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 8cc54bba..1ece6486 100644 --- a/main.nf +++ b/main.nf @@ -1203,7 +1203,15 @@ if( params.aligner == 'biscuit' ){ unmapped = params.single_end ? 
'--ignoreUnmated' : '' """ - samtools sort -n $bam -@ ${task.cpus} $sort_mem| samtools view -h | samblaster -M $unmapped -d "${bam.baseName}_discordant.sam" -s "${bam.baseName}_split.sam" -u "${bam.baseName}_.fastq" --excludeDups --addMateTags | samtools view -Sb > ${bam.baseName}.samblaster.bam + samtools sort -n $bam \\ + -@ ${task.cpus} $sort_mem | \\ + samtools view -h | \\ + samblaster -M $unmapped \\ + -d "${bam.baseName}_discordant.sam" \\ + -s "${bam.baseName}_split.sam" \\ + -u "${bam.baseName}_.fastq" \\ + --excludeDups --addMateTags | \\ + samtools view -Sb > ${bam.baseName}.samblaster.bam cp .command.log ${bam.baseName}.log """ } From 439ccd8afa043dceeaf134f1f5b7ec9e6864d40b Mon Sep 17 00:00:00 2001 From: ekushele Date: Mon, 22 Feb 2021 15:31:34 +0200 Subject: [PATCH 51/56] change debug_epiread to be in saveAs option --- main.nf | 52 ++++++++++++++++++++++------------------------------ 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/main.nf b/main.nf index 1ece6486..b749a6d1 100644 --- a/main.nf +++ b/main.nf @@ -759,7 +759,7 @@ if( params.aligner =~ /bismark/ ){ */ process samtools_sort_index_flagstat_bismark { tag "$name" - publishDir "${params.outdir}/samtools", mode: 'copy', + publishDir "${params.outdir}/samtools", mode: params.publish_dir_mode, saveAs: {filename -> if(filename.indexOf("report.txt") > 0) "logs/$filename" else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename @@ -820,7 +820,7 @@ if( params.aligner =~ /bismark/ ){ */ process samtools_sort_index_flagstat_dedup_bismark { tag "$name" - publishDir "${params.outdir}/samtools", mode: 'copy', + publishDir "${params.outdir}/samtools", mode: params.publish_dir_mode, saveAs: {filename -> if(filename.indexOf("report.txt") > 0) "logs/$filename" else if( (!params.save_align_intermeds && !params.skip_deduplication && !params.rrbs).every() && filename == "where_are_my_files.txt") filename @@ 
-1143,7 +1143,7 @@ else { if( params.aligner == 'biscuit' ){ process biscuit_align { tag "$name" - publishDir "${params.outdir}/biscuit_alignments", mode: 'copy', + publishDir "${params.outdir}/biscuit_alignments", mode: params.publish_dir_mode, saveAs: {filename -> if( !params.save_align_intermeds && filename == "where_are_my_files.txt" ) filename else if( params.save_align_intermeds && filename != "where_are_my_files.txt" ) filename @@ -1183,7 +1183,7 @@ if( params.aligner == 'biscuit' ){ process markDuplicates_samblaster { tag "$name" - publishDir "${params.outdir}", mode: 'copy', + publishDir "${params.outdir}", mode: params.publish_dir_mode, saveAs: {filename -> if( filename.indexOf("log") > 0 ) "biscuit_markDuplicates/$filename" else null @@ -1205,7 +1205,7 @@ if( params.aligner == 'biscuit' ){ """ samtools sort -n $bam \\ -@ ${task.cpus} $sort_mem | \\ - samtools view -h | \\ + samtools view -h | \\ samblaster -M $unmapped \\ -d "${bam.baseName}_discordant.sam" \\ -s "${bam.baseName}_split.sam" \\ @@ -1222,7 +1222,7 @@ if( params.aligner == 'biscuit' ){ */ process samtools_sort_index_flagstat_biscuit { tag "$name" - publishDir "${params.outdir}", mode: 'copy', + publishDir "${params.outdir}", mode: params.publish_dir_mode, saveAs: {filename -> if(filename.indexOf("report.txt") > 0) "biscuit_alignments/logs/$filename" else if( (params.save_align_intermeds || params.skip_deduplication || params.rrbs).any() && filename.indexOf("sorted.bam") > 0) "biscuit_alignments/$filename" @@ -1262,7 +1262,7 @@ if( params.aligner == 'biscuit' ){ */ process create_VCF { tag "$name" - publishDir "${params.outdir}/methylation_extract", mode: 'copy', + publishDir "${params.outdir}/methylation_extract", mode: params.publish_dir_mode, saveAs: {filename -> if( !params.save_pileup_file && filename == "where_are_my_files.txt") filename else if( filename.indexOf("vcf.gz") > 0 && params.save_pileup_file && filename != "where_are_my_files.txt") filename @@ -1291,7 +1291,7 @@ if( 
params.aligner == 'biscuit' ){ */ process create_Bedgraph { tag "$name" - publishDir "${params.outdir}/methylation_extract", mode: 'copy' + publishDir "${params.outdir}/methylation_extract", mode: params.publish_dir_mode input: set val(name), file(vcf) from ch_vcf_for_bedgraph @@ -1361,11 +1361,11 @@ if( params.aligner == 'biscuit' ){ ch_fasta_for_create_whitelist.close() } /* - * STEP 7.3 - SNP file generation for the epiread convertion + * STEP 7.3 - SNP file generation for the epiread conversion */ process get_SNP_file { tag "$name" - publishDir "${params.outdir}/epireads/snp", mode: 'copy', + publishDir "${params.outdir}/epireads/snp", mode: params.publish_dir_mode, saveAs: {filename -> if( filename.indexOf("bed") > 0 && params.save_snp_file && filename != "where_are_my_files.txt") filename else null @@ -1397,7 +1397,12 @@ if( params.aligner == 'biscuit' ){ */ process epiread_conversion { tag "$name" - publishDir "${params.outdir}/epireads", mode: 'copy' + publishDir "${params.outdir}/epireads", mode: params.publish_dir_mode, + saveAs: {filename -> + if( params.debug_epiread && filename != "where_are_my_files.txt") filename + else if( filename.indexOf("original") < 0 ) filename + else null + } input: set val(name), @@ -1414,10 +1419,9 @@ if( params.aligner == 'biscuit' ){ .combine(ch_whitelist_for_epiread) file (assets) from ch_assets_dir_with_cpg_for_epiread.collect() - output: file "*${name}.e*.gz*" - file "${name}.original.epiread.*" optional true + file "${name}.original.epiread.*" script: snp_file = (snp.size()>0) ? "-B " + snp.toString() : '' @@ -1431,14 +1435,14 @@ if( params.aligner == 'biscuit' ){ biscuit epiread -q ${task.cpus} $snp_file $no_filter_reverse $fasta ${name}.bam |sort --parallel=${task.cpus} -T . 
-k1,1Vf -k5,5n | bgzip > ${name}.epiread.gz tabix -0 -s 1 -b 5 -e 5 ${name}.epiread.gz """ - } else if (params.debug_epiread) { + } else { """ zcat $cpg_file > cpg.bed bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam samtools index ${name}.bam biscuit epiread -q ${task.cpus} $snp_file $fasta ${name}.bam | sort --parallel=${task.cpus} -T . -k2,2 -k1,1 -k4,4 -k3,3n > ${name}.original.epiread - less ${name}.original.epiread | $projectDir/bin/epiread_pairedEnd_convertion "cpg.bed" $snp ${name}.epiread $debug_merging_epiread > ${name}.err + less ${name}.original.epiread | $projectDir/bin/epiread_pairedEnd_conversion "cpg.bed" $snp ${name}.epiread $debug_merging_epiread > ${name}.err sort -k1,1Vf -k 2,2n -k 3,3n --parallel=${task.cpus} -T . ${name}.epiread | bgzip > ${name}.epiread.gz sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . ${name}.err | bgzip > ${name}.err.gz sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . ${name}.original.epiread | bgzip > ${name}.original.epiread.gz @@ -1447,18 +1451,6 @@ if( params.aligner == 'biscuit' ){ tabix -0 -s 1 -b 5 -e 5 ${name}.err.gz """ } - else { - """ - zcat $cpg_file > cpg.bed - bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam - samtools index ${name}.bam - biscuit epiread -q ${task.cpus} $snp_file $fasta ${name}.bam | sort --parallel=${task.cpus} -T . -k2,2 -k1,1 -k4,4 -k3,3n | $projectDir/bin/epiread_pairedEnd_convertion "cpg.bed" $snp ${name}.epiread $debug_merging_epiread > ${name}.err - sort -k1,1Vf -k 2,2n -k 3,3n --parallel=${task.cpus} -T . ${name}.epiread | bgzip > ${name}.epiread.gz - sort -k1,1Vf -k5,5n --parallel=${task.cpus} -T . 
${name}.err | bgzip > ${name}.err.gz - tabix -0 -p bed ${name}.epiread.gz - tabix -0 -s 1 -b 5 -e 5 ${name}.err.gz - """ - } } } @@ -1467,7 +1459,7 @@ if( params.aligner == 'biscuit' ){ */ process biscuit_QC { tag "$name" - publishDir "${params.outdir}/biscuit_QC", mode: 'copy' + publishDir "${params.outdir}/biscuit_QC", mode: params.publish_dir_mode input: set val(name), @@ -1531,7 +1523,7 @@ process qualimap { */ process prepare_genome_to_picard { publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir }, - saveAs: { (params.save_reference && it.indexOf("dict") >0) ? it : null }, mode: 'copy' + saveAs: { (params.save_reference && it.indexOf("dict") >0) ? it : null }, mode: params.publish_dir_mode input: file fasta from ch_fasta_for_picard @@ -1561,7 +1553,7 @@ process prepare_genome_to_picard { */ process picard_metrics { tag "$name" - publishDir "${params.outdir}/picardMetrics", mode: 'copy', + publishDir "${params.outdir}/picardMetrics", mode: params.publish_dir_mode, saveAs: { filename -> if (filename.indexOf(".txt") > 0) filename else if (filename.indexOf(".pdf") > 0) "pdf/$filename" From 7ae0ffafa2d496f8a2e942a23e4633f4fbd53a43 Mon Sep 17 00:00:00 2001 From: ekushele Date: Mon, 22 Feb 2021 15:53:35 +0200 Subject: [PATCH 52/56] remove save_pileup_file, replaced with save_align_intermidiates --- docs/output.md | 4 ++-- main.nf | 6 ++---- nextflow.config | 1 - nextflow_schema.json | 6 ------ 4 files changed, 4 insertions(+), 13 deletions(-) diff --git a/docs/output.md b/docs/output.md index ead637da..e4a81106 100644 --- a/docs/output.md +++ b/docs/output.md @@ -189,10 +189,10 @@ Filename abbreviations stand for the following reference alignment strands: * Methylation statuses in [bedGraph](http://genome.ucsc.edu/goldenPath/help/bedgraph.html) format. * `sample.vcf.gz` * VCF file with the pileup information, used for creating the bedGraph file. 
- * **NB:** Only saved if `--save_pileup_file` is specified when running the pipeline. + * **NB:** Only saved if `--save_align_intermeds` is specified when running the pipeline. * `sample.vcf.gz.tbi` * Index file for `sample.vcf.gz` - * **NB:** Only saved if `--save_pileup_file` is specified when running the pipeline. + * **NB:** Only saved if `--save_align_intermeds` is specified when running the pipeline. **NB** if `--epriread` is specified in the pipeline, then: **output directory:** `results/epireads` : diff --git a/main.nf b/main.nf index b749a6d1..1cf8b433 100644 --- a/main.nf +++ b/main.nf @@ -41,7 +41,6 @@ def helpMessage() { --nondirectional_library [bool] Run alignment against all four possible strands for Biscuit aligner --save_align_intermeds [bool] Save aligned intermediates to results directory --save_trimmed [bool] Save trimmed reads to results directory - --save_pileup_file [bool] Save VCF-pileup and VCF-index files from biscuit aligner to results directory --save_snp_file [bool] Save SNP bed-file from biscuit to results directory. 
Relevant only if '--epiread' is specified --unmapped [bool] Save unmapped reads to fastq files --relax_mismatches [bool] Turn on to relax stringency for alignment (set allowed penalty with --num_mismatches) @@ -360,7 +359,6 @@ if(params.save_reference) save_intermeds.add('Reference genome build') if(params.save_trimmed) save_intermeds.add('Trimmed FastQ files') if(params.unmapped) save_intermeds.add('Unmapped reads') if(params.save_align_intermeds) save_intermeds.add('Intermediate BAM files') -if(params.save_pileup_file) save_intermeds.add('Pileup files') if(params.save_snp_file) save_intermeds.add('SNP bed-files') if(save_intermeds.size() > 0) summary['Save Intermediates'] = save_intermeds.join(', ') debug_mode = []; @@ -1264,8 +1262,8 @@ if( params.aligner == 'biscuit' ){ tag "$name" publishDir "${params.outdir}/methylation_extract", mode: params.publish_dir_mode, saveAs: {filename -> - if( !params.save_pileup_file && filename == "where_are_my_files.txt") filename - else if( filename.indexOf("vcf.gz") > 0 && params.save_pileup_file && filename != "where_are_my_files.txt") filename + if( !params.save_align_intermeds && filename == "where_are_my_files.txt") filename + else if( filename.indexOf("vcf.gz") > 0 && params.save_align_intermeds && filename != "where_are_my_files.txt") filename else null } diff --git a/nextflow.config b/nextflow.config index f146ff69..34250d1a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -60,7 +60,6 @@ params { whitelist = false blacklist = false common_dbsnp = false - save_pileup_file = false save_snp_file = false epiread = false debug_epiread = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 85491434..4293fd0a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -420,12 +420,6 @@ "help_text": "Path to a directory containing needed file for biscuit-QC step\n> **NB** If none provided, will be generated automatically.", "fa_icon": "fab fa-buffer" }, - "save_pileup_file": { - "type": "boolean", - 
"description": "Save VCF-pileup and VCF-index files to results directory", - "help_text": "By default, the VCF and VCF-index files generated by `biscuit pileup` will not be save to the results directory. Specify this flag (or set to true in your config file) to copy these files to the results directory when complete. Relevant only if `--epiread` is set.", - "fa_icon": "fas fa-save" - }, "save_snp_file": { "type": "boolean", "description": "Save SNP bed-file to results directory", From 6c377785f77da40ab823bf14c825dccffb6e4c8b Mon Sep 17 00:00:00 2001 From: Phil Ewels Date: Tue, 30 Mar 2021 22:43:44 +0200 Subject: [PATCH 53/56] Minor whitespace formatting & cleanup --- docs/output.md | 2 +- main.nf | 109 +++++++++++++++++++++++++------------------------ 2 files changed, 57 insertions(+), 54 deletions(-) diff --git a/docs/output.md b/docs/output.md index 75ff2528..26f9c55d 100644 --- a/docs/output.md +++ b/docs/output.md @@ -69,7 +69,7 @@ Single-end data will have slightly different file names and only one FastQ file ### Alignment -Bismark, bwa-meth and BISCUIT convert all Cytosines contained within the sequenced reads to Thymine _in-silico_ and then align against a three-letter reference genome. This method avoids methylation-specific alignment bias. The alignment produces a BAM file of genomic alignments. _+__________ +Bismark, bwa-meth and BISCUIT convert all Cytosines contained within the sequenced reads to Thymine _in-silico_ and then align against a three-letter reference genome. This method avoids methylation-specific alignment bias. The alignment produces a BAM file of genomic alignments. 
**Bismark output directory: `results/bismark_alignments/`** _Note that bismark can use either use Bowtie2 (default) or HISAT2 as alignment tool and the output file names will not differ between the options._ diff --git a/main.nf b/main.nf index 680c8631..33e18585 100644 --- a/main.nf +++ b/main.nf @@ -35,7 +35,6 @@ if (params.validate_params) { // These params need to be set late, after the iGenomes config is loaded params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false -params.fasta_index = params.genome ? params.genomes[ params.genome ].fasta_index ?: false : false assembly_name = (params.fasta.toString().lastIndexOf('/') == -1) ?: params.fasta.toString().substring( params.fasta.toString().lastIndexOf('/')+1) // Check if genome exists in the config file @@ -204,26 +203,25 @@ if (params.aligner == 'biscuit') { if (params.whitelist) { Channel - .fromPath(params.whitelist, checkIfExists: true) - .ifEmpty { exit 1, "Cannot find any whitelist file matching: ${params.whitelist}" } - .into { ch_whitelist_for_SNP; ch_whitelist_for_epiread} + .fromPath(params.whitelist, checkIfExists: true) + .ifEmpty { exit 1, "Cannot find any whitelist file matching: ${params.whitelist}" } + .into { ch_whitelist_for_SNP; ch_whitelist_for_epiread } } else { Channel - .fromPath(params.blacklist, checkIfExists: true) - .ifEmpty { exit 1, "Cannot find any blacklist file matching: ${params.blacklist}" } - .set { ch_blacklist_for_create_whitelist;} + .fromPath(params.blacklist, checkIfExists: true) + .ifEmpty { exit 1, "Cannot find any blacklist file matching: ${params.blacklist}" } + .set { ch_blacklist_for_create_whitelist } } if (params.common_dbsnp) { Channel - .fromPath(params.common_dbsnp, checkIfExists: true) - .ifEmpty { exit 1, "Cannot find any dbSNP file matching: ${params.common_dbsnp}\n" } - .set { ch_commonSNP_for_SNP; } + .fromPath(params.common_dbsnp, checkIfExists: true) + .ifEmpty { exit 1, "Cannot find any dbSNP file matching: 
${params.common_dbsnp}\n" } + .set { ch_commonSNP_for_SNP } } - } - else - { + + } else { ch_fasta_for_create_whitelist.close() } } @@ -272,12 +270,12 @@ if(params.save_reference) save_intermeds.add('Reference genome build') if(params.save_trimmed) save_intermeds.add('Trimmed FastQ files') if(params.unmapped) save_intermeds.add('Unmapped reads') if(params.save_align_intermeds) save_intermeds.add('Intermediate BAM files') -if(params.save_snp_file) save_intermeds.add('SNP bed-files') +if(params.save_snp_file) save_intermeds.add('SNP bed-files') if(save_intermeds.size() > 0) summary['Save Intermediates'] = save_intermeds.join(', ') debug_mode = []; if(params.debug_epiread) debug_mode.add('Debug epiread step') if(params.debug_epiread_merging) debug_mode.add('Debug epiread merging') -if(debug_mode.size() > 0) summary['Debug mode'] = debug_mode.join(', ') +if(debug_mode.size() > 0) summary['Debug mode'] = debug_mode.join(', ') if(params.minins) summary['Bismark min insert size'] = bismark_minins if(params.maxins || params.em_seq) summary['Bismark max insert size'] = bismark_maxins if(params.bismark_align_cpu_per_multicore) summary['Bismark align CPUs per --multicore'] = params.bismark_align_cpu_per_multicore @@ -1159,8 +1157,8 @@ if( params.aligner == 'biscuit' ){ def sort_mem = avail_mem && avail_mem > 2 ? 
"-m ${avail_mem}G" : '' """ samtools sort $samblaster_bam \\ - -@ ${task.cpus} $sort_mem -l 9 \\ - -o ${samblaster_bam.baseName}.sorted.bam + -@ ${task.cpus} $sort_mem -l 9 \\ + -o ${samblaster_bam.baseName}.sorted.bam samtools index ${samblaster_bam.baseName}.sorted.bam samtools flagstat ${samblaster_bam.baseName}.sorted.bam > ${samblaster_bam.baseName}_flagstat_report.txt samtools stats ${samblaster_bam.baseName}.sorted.bam > ${samblaster_bam.baseName}_stats_report.txt @@ -1322,12 +1320,12 @@ if( params.aligner == 'biscuit' ){ file(snp), file(fasta), file(fasta_index), - file(whitelist) from ch_bam_sorted_for_epiread - .join(ch_bam_index_for_epiread) - .join(ch_snp_for_epiread) - .combine(ch_fasta_for_epiread) - .combine(ch_fasta_index_for_epiread) - .combine(ch_whitelist_for_epiread) + file(whitelist) from ch_bam_sorted_for_epiread + .join(ch_bam_index_for_epiread) + .join(ch_snp_for_epiread) + .combine(ch_fasta_for_epiread) + .combine(ch_fasta_index_for_epiread) + .combine(ch_whitelist_for_epiread) file (assets) from ch_assets_dir_with_cpg_for_epiread.collect() output: @@ -1340,14 +1338,14 @@ if( params.aligner == 'biscuit' ){ debug_merging_epiread = (params.debug_epiread_merging || params.debug_epiread) ? "debug" : '' no_filter_reverse = params.rrbs ? "-p" : '' if (params.single_end) { - """ + """ bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam samtools index ${name}.bam biscuit epiread -q ${task.cpus} $snp_file $no_filter_reverse $fasta ${name}.bam |sort --parallel=${task.cpus} -T . 
-k1,1Vf -k5,5n | bgzip > ${name}.epiread.gz tabix -0 -s 1 -b 5 -e 5 ${name}.epiread.gz - """ + """ } else { - """ + """ zcat $cpg_file > cpg.bed bedtools intersect -abam $bam -b $whitelist -ubam -f 1.0 | samtools view -Sb - > ${name}.bam @@ -1360,7 +1358,7 @@ if( params.aligner == 'biscuit' ){ tabix -0 -s 1 -b 5 -e 5 ${name}.original.epiread.gz tabix -0 -p bed ${name}.epiread.gz tabix -0 -s 1 -b 5 -e 5 ${name}.err.gz - """ + """ } } } @@ -1450,10 +1448,10 @@ process prepare_genome_to_picard { avail_mem = task.memory.toGiga() } """ - mv ${fasta} ${fasta.baseName}.picard.fa - picard -Xmx${avail_mem}g CreateSequenceDictionary \\ - R=${fasta.baseName}.picard.fa \\ - O=${fasta.baseName}.picard.dict + mv ${fasta} ${fasta.baseName}.picard.fa + picard -Xmx${avail_mem}g CreateSequenceDictionary \\ + R=${fasta.baseName}.picard.fa \\ + O=${fasta.baseName}.picard.dict """ } @@ -1488,30 +1486,35 @@ process picard_metrics { } """ picard -Xmx${avail_mem}g CollectInsertSizeMetrics \\ - INPUT=$bam \\ - OUTPUT=${name}.insert_size_metrics.txt \\ - HISTOGRAM_FILE=${name}.insert_size_histogram.pdf \\ - ASSUME_SORTED=true \\ - VALIDATION_STRINGENCY=LENIENT + INPUT=$bam \\ + OUTPUT=${name}.insert_size_metrics.txt \\ + HISTOGRAM_FILE=${name}.insert_size_histogram.pdf \\ + ASSUME_SORTED=true \\ + VALIDATION_STRINGENCY=LENIENT set +e picard -Xmx${avail_mem}g CollectGcBiasMetrics \\ - INPUT=$bam \\ - OUTPUT=${name}.gc_bias_metrics.txt \\ - CHART=${name}.gc_bias_metrics.pdf \\ - SUMMARY_OUTPUT=${name}.summary_metrics.txt \\ - ASSUME_SORTED=true \\ - IS_BISULFITE_SEQUENCED=true \\ - REFERENCE_SEQUENCE=$fasta \\ - VALIDATION_STRINGENCY=LENIENT - [ ! "\$?" -eq "0" ] && picard -Xmx${avail_mem}g ReorderSam I=$bam O=${bam.baseName}.picard.bam SEQUENCE_DICTIONARY=$fasta VALIDATION_STRINGENCY=LENIENT TMP_DIR=. 
&& picard -Xmx${avail_mem}g CollectGcBiasMetrics \\ - INPUT=${bam.baseName}.picard.bam \\ - OUTPUT=${name}.gc_bias_metrics.txt \\ - CHART=${name}.gc_bias_metrics.pdf \\ - SUMMARY_OUTPUT=${name}.summary_metrics.txt \\ - ASSUME_SORTED=true \\ - IS_BISULFITE_SEQUENCED=true \\ - REFERENCE_SEQUENCE=$fasta \\ - VALIDATION_STRINGENCY=LENIENT + INPUT=$bam \\ + OUTPUT=${name}.gc_bias_metrics.txt \\ + CHART=${name}.gc_bias_metrics.pdf \\ + SUMMARY_OUTPUT=${name}.summary_metrics.txt \\ + ASSUME_SORTED=true \\ + IS_BISULFITE_SEQUENCED=true \\ + REFERENCE_SEQUENCE=$fasta \\ + VALIDATION_STRINGENCY=LENIENT + [ ! "\$?" -eq "0" ] && picard -Xmx${avail_mem}g ReorderSam \\ + I=$bam O=${bam.baseName}.picard.bam \\ + SEQUENCE_DICTIONARY=$fasta \\ + VALIDATION_STRINGENCY=LENIENT \\ + TMP_DIR=. && \\ + picard -Xmx${avail_mem}g CollectGcBiasMetrics \\ + INPUT=${bam.baseName}.picard.bam \\ + OUTPUT=${name}.gc_bias_metrics.txt \\ + CHART=${name}.gc_bias_metrics.pdf \\ + SUMMARY_OUTPUT=${name}.summary_metrics.txt \\ + ASSUME_SORTED=true \\ + IS_BISULFITE_SEQUENCED=true \\ + REFERENCE_SEQUENCE=$fasta \\ + VALIDATION_STRINGENCY=LENIENT echo "fine" """ } From 4ab3f8a463a151012b088ad62e970ef4021f2417 Mon Sep 17 00:00:00 2001 From: ekushele Date: Sun, 4 Apr 2021 10:32:29 +0300 Subject: [PATCH 54/56] make 'common' group in nextflow_schema.json, remove nondirectional_library and min_coverage --- docs/output.md | 2 +- main.nf | 7 +++--- nextflow.config | 2 -- nextflow_schema.json | 52 ++++++++++++++++++++++---------------------- 4 files changed, 30 insertions(+), 33 deletions(-) diff --git a/docs/output.md b/docs/output.md index 26f9c55d..2dae70a6 100644 --- a/docs/output.md +++ b/docs/output.md @@ -150,7 +150,7 @@ This step removes alignments with identical mapping position to avoid technical The methylation extractor step takes a BAM file with aligned reads and generates files containing cytosine methylation calls. It produces a few different output formats, described below. 
-Note that the output may vary a little depending on whether you specify `--comprehensive` or `--non_directional` (or `nondirectional_library`) or `--skip_deduplication` or `--rrbs` when running the pipeline. +Note that the output may vary a little depending on whether you specify `--comprehensive` or `--non_directional` or `--skip_deduplication` or `--rrbs` when running the pipeline. Filename abbreviations stand for the following reference alignment strands: diff --git a/main.nf b/main.nf index 33e18585..ba8e7d46 100644 --- a/main.nf +++ b/main.nf @@ -258,11 +258,10 @@ if(params.cegx) summary['Trim Profile'] = 'CEGX' if(params.em_seq) summary['Trim Profile'] = 'EM Seq' summary['Trimming'] = "5'R1: $clip_r1 / 5'R2: $clip_r2 / 3'R1: $three_prime_clip_r1 / 3'R2: $three_prime_clip_r2" summary['Deduplication'] = params.skip_deduplication || params.rrbs ? 'No' : 'Yes' -summary['Directional Mode'] = params.single_cell || params.zymo || params.non_directional || params.nondirectional_library ? 'No' : 'Yes' +summary['Directional Mode'] = params.single_cell || params.zymo || params.non_directional ? 'No' : 'Yes' summary['All C Contexts'] = params.comprehensive ? 'Yes' : 'No' summary['Cytosine report'] = params.cytosine_report ? 'Yes' : 'No' if(params.min_depth) summary['Minimum Depth'] = params.min_depth -if(params.min_coverage) summary['Minimum Coverage'] = params.min_coverage if(params.ignore_flags) summary['MethylDackel'] = 'Ignoring SAM Flags' if(params.methyl_kit) summary['MethylDackel'] = 'Producing methyl_kit output' save_intermeds = []; @@ -1073,7 +1072,7 @@ if( params.aligner == 'biscuit' ){ assembly = fasta.replaceAll(/\.\w+/,"") prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?(\.bz2)?$/ - non_directional = params.single_cell || params.zymo || params.nondirectional_library ? 0 : 1 + non_directional = params.single_cell || params.zymo || params.non_directional ? 
0 : 1 // Paired-end or single-end input files and pbat or not input = params.pbat ? params.single_end ? reads + " -b 3" : "${reads[1]} ${reads[0]} -b " + non_directional : reads.toString() +" -b " + non_directional @@ -1209,7 +1208,7 @@ if( params.aligner == 'biscuit' ){ set val(name), file("*bedgraph" ) into ch_bedgraph_for_intersect_soloWCGW script: - min_depth = params.min_coverage > 1 ? "${params.min_coverage}" : '1' + min_depth = params.min_depth > 1 ? "${params.min_depth}" : '1' all_contexts = params.comprehensive ? 'c, cg, ch, hcg, gch' : 'cg' """ biscuit vcf2bed -k $min_depth -t $all_contexts "${vcf[0]}" > "${name}.bedgraph" diff --git a/nextflow.config b/nextflow.config index 11585764..56b24b26 100644 --- a/nextflow.config +++ b/nextflow.config @@ -32,10 +32,8 @@ params { meth_cutoff = false methyl_kit = false min_depth = 0 - min_coverage = 1 skip_deduplication = false non_directional = false - nondirectional_library = false skip_trimming = false outdir = './results' save_align_intermeds = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 7e69a8a2..b3b24b8f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -112,7 +112,7 @@ "type": "boolean", "fa_icon": "fas fa-cut", "description": "Trimming preset for single-cell bisulfite libraries.", - "help_text": "Equivalent to `--clip_r1 6` `--clip_r2 6` `--three_prime_clip_r1 6` `--three_prime_clip_r2 6`.\n\nAlso sets the `--non_directional` flag for Bismark and the `--nondirectional_library` flag for BISCUIT." + "help_text": "Equivalent to `--clip_r1 6` `--clip_r2 6` `--three_prime_clip_r1 6` `--three_prime_clip_r2 6`.\n\nAlso sets the `--non_directional` flag for Bismark or BISCUIT." 
}, "accel": { "type": "boolean", @@ -136,7 +136,7 @@ "type": "boolean", "fa_icon": "fas fa-cut", "description": "Trimming preset for the Zymo kit.", - "help_text": "Equivalent to `--clip_r1 10` `--clip_r2 15` `--three_prime_clip_r1 10` `--three_prime_clip_r2 10`.\n\nAlso sets the `--non_directional` flag for Bismark and the `--nondirectional_library` flag for BISCUIT.." + "help_text": "Equivalent to `--clip_r1 10` `--clip_r2 15` `--three_prime_clip_r1 10` `--three_prime_clip_r2 10`.\n\nAlso sets the `--non_directional` flag for Bismark or for BISCUIT." } }, "fa_icon": "fas fa-prescription-bottle" @@ -251,12 +251,6 @@ "default": "", "fa_icon": "fas fa-circle", "properties": { - "non_directional": { - "type": "boolean", - "description": "Run alignment against all four possible strands.", - "help_text": "By default, Bismark assumes that libraries are directional and does not align against complementary strands. If your library prep was not directional, use `--non_directional` to align against all four possible strands.\n\nNote that the `--single_cell` and `--zymo` parameters both set the `--non_directional` workflow flag automatically.", - "fa_icon": "fas fa-exchange-alt" - }, "cytosine_report": { "type": "boolean", "description": "Output stranded cytosine report during Bismark's bismark_methylation_extractor step.", @@ -334,12 +328,6 @@ "description": "Parameters specific to the bwa-meth workflow", "default": "", "properties": { - "min_depth": { - "type": "integer", - "description": "Specify a minimum read coverage for MethylDackel to report a methylation call.", - "default": 0, - "fa_icon": "fas fa-angle-double-down" - }, "ignore_flags": { "type": "boolean", "description": "MethylDackel - ignore SAM flags", @@ -361,18 +349,6 @@ "description": "Parameters specific to the BISCUIT workflow", "default": "", "properties": { - "min_coverage": { - "type": "integer", - "description": "Specify a minimum read coverage for information extraction from the VCF file to bed
file.", - "default": "1", - "fa_icon": "fas fa-angle-double-down" - }, - "nondirectional_library": { - "type": "boolean", - "description": "Run alignment against all four possible strands.", - "help_text": "By default, the BISCUIT pipeline assumes that libraries are directional and does not align against complementary strands. If your library prep was not directional, use `--nondirectional_library` to align against all four possible strands.\n\nNote that the `--single_cell` and `--zymo` parameters both set the `--nondirectional_library` workflow flag automatically.", - "fa_icon": "fas fa-exchange-alt" - }, "epiread": { "type": "boolean", "description": "Specify a minimum read coverage for MethylDackel to report a methylation call.", @@ -425,6 +401,27 @@ } }, "fa_icon": "far fa-circle" + }, + "common_options": { + "title": "common options", + "type": "object", + "description": "Parameters that are common between pipelines", + "default": "", + "properties": { + "min_depth": { + "type": "integer", + "description": "Specify a minimum read coverage for MethylDackel to report a methylation call in bwa-meth workflow, or a minimum read coverage for information extraction from the VCF file to bed file in BISCUIT workflow.", + "default": 0, + "fa_icon": "fas fa-angle-double-down" + }, + "non_directional": { + "type": "boolean", + "description": "Run alignment against all four possible strands.", + "help_text": "By default, Bismark and BISCUIT assume that libraries are directional and do not align against complementary strands.
If your library prep was not directional, use `--non_directional` to align against all four possible strands.\n\nNote that the `--single_cell` and `--zymo` parameters both set the `--non_directional` workflow flag automatically.", + "fa_icon": "fas fa-exchange-alt" + } + }, + "fa_icon": "far fa-circle" }, "skip_pipeline_steps": { "title": "Skip pipeline steps", @@ -649,6 +646,9 @@ }, { "$ref": "#/definitions/biscuit_options" + }, + { + "$ref": "#/definitions/common_options" }, { "$ref": "#/definitions/skip_pipeline_steps" From 26de2f1859be0f08acebd7ec5466cf7f814352f8 Mon Sep 17 00:00:00 2001 From: ekushele Date: Sun, 4 Apr 2021 11:05:20 +0300 Subject: [PATCH 55/56] use getSimpleName instead of assembly_name --- main.nf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/main.nf b/main.nf index ba8e7d46..9b811400 100644 --- a/main.nf +++ b/main.nf @@ -35,7 +35,6 @@ if (params.validate_params) { // These params need to be set late, after the iGenomes config is loaded params.fasta = params.genome ? 
params.genomes[ params.genome ].fasta ?: false : false -assembly_name = (params.fasta.toString().lastIndexOf('/') == -1) ?: params.fasta.toString().substring( params.fasta.toString().lastIndexOf('/')+1) // Check if genome exists in the config file if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { @@ -1256,7 +1255,7 @@ if( params.aligner == 'biscuit' ){ file("whitelist.${name}.bed.gz" ) into ch_whitelist_for_SNP, ch_whitelist_for_epiread file "sizes.${name}" script: - name = assembly_name - '.fa' + name = fasta_index.getSimpleName() // - '.fa' - '.fai' """ cut -f1,2 $fasta_index > sizes.${name} From d3bff3af387f5533d6a8c4fab7da7f9b82a0eb03 Mon Sep 17 00:00:00 2001 From: ekushele Date: Sun, 13 Jun 2021 08:57:32 +0300 Subject: [PATCH 56/56] add optional to output section for epiread original file --- main.nf | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 9b811400..775e970f 100644 --- a/main.nf +++ b/main.nf @@ -431,11 +431,16 @@ if(!params.bwa_biscuit_index && params.aligner == 'biscuit' ){ tag "$fasta" publishDir path: "${params.outdir}/reference_genome", saveAs: { params.save_reference ? it : null }, mode: params.publish_dir_mode - input: - file fasta from ch_fasta_for_makeBwaMemIndex + // input: + // file fasta from ch_fasta_for_makeBwaMemIndex + + // output: + // file "${fasta}*" into ch_bwa_index_for_biscuit + input: + path fasta, stageAs 'fasta*' from ch_fasta_for_makeBwaMemIndex output: - file "${fasta}*" into ch_bwa_index_for_biscuit + file fasta into ch_bwa_index_for_biscuit script: """ @@ -1328,7 +1333,7 @@ if( params.aligner == 'biscuit' ){ output: file "*${name}.e*.gz*" - file "${name}.original.epiread.*" + file "${name}.original.epiread.*" optional true script: snp_file = (snp.size()>0) ? "-B " + snp.toString() : ''