-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.yaml
121 lines (95 loc) · 4.66 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Configuration file for SCVUSv3
# Author: Teresita M. Porter
# Date: February 24, 2020
############################################################################
# How to run:
############################################################################
# Create a new conda environment (only need to do this once)
# conda env create -f environment.yml
# Activate conda environment
# conda activate myenv.3
# Edit filenames, paths, and parameters in this file (config.yaml)
# Run snakemake (edit number of jobs according to number of cores you have)
# snakemake --jobs 24 --snakefile snakefile --configfile config.yaml
############################################################################
# Identify raw read files
############################################################################
# Directory holding the raw reads (every run, or the reads from a single run).
# The pipeline only accepts compressed FASTQ input, e.g. fastq.gz.
# The standard pipeline expects this directory to be named "data".
raw: 'data'
# Filename pattern for the raw reads, marking where the 'sample' and 'read'
# wildcards sit. The files should live in a "data" or "run1", etc. folder.
# Example filename:
#   SITE-CONDITION-REPLICATE_S1_L001_R1_001.fastq.gz
# Matching pattern:
#   {sample}_L001_R{read}_001.fastq.gz
raw_sample_read_wildcards: 'data/{sample}_L001_R{read}_001.fastq.gz'
# SEQPREP inputs: per-sample filename patterns for the R1 (forward) and
# R2 (reverse) read files. The files should live in a "data" or "run1", etc.
# folder.
# Example pattern:
#   {sample}_L001_R1_001.fastq.gz
raw_sample_forward_wildcard: 'data/{sample}_L001_R1_001.fastq.gz'
raw_sample_reverse_wildcard: 'data/{sample}_L001_R2_001.fastq.gz'
############################################################################
# Raw read pairing
############################################################################
# Read-pairing parameters. The options are nested under SEQPREP so that the
# section is a mapping (not null) and its short option names stay scoped to
# this section instead of becoming top-level keys.
SEQPREP:
  # Phred score quality cutoff
  q: 20
  # Minimum overlap length between forward and reverse reads
  o: 25
############################################################################
# Primer trimming
############################################################################
# CUTADAPT parameters
# TAReuk454FWD1 Eukaryote 18Sv4 forward primer (Stoeck et al., 2010)
# TAReukREV3 Eukaryote 18Sv4 reverse primer (Stoeck et al., 2010)
# Forward-primer trimming parameters. The options are nested under
# CUTADAPT_FWD so that the section is a mapping (not null) and m/q/mn stay
# scoped to this section instead of becoming duplicate top-level keys.
CUTADAPT_FWD:
  # Forward primer sequence
  g: "CCAGCASCYGCGGTAATTCC"
  # NOTE(review): presumably the minimum read length to keep after trimming
  # (cutadapt -m) -- confirm against the snakefile
  m: 150
  # NOTE(review): presumably the quality-trimming cutoffs for the two read
  # ends (cutadapt -q) -- confirm against the snakefile
  q: "20,20"
  # NOTE(review): presumably the maximum number of N bases allowed
  # (cutadapt --max-n) -- confirm against the snakefile
  mn: 3
# Reverse-primer trimming parameters. The options are nested under
# CUTADAPT_REV so that the section is a mapping (not null) and m/q/mn stay
# scoped to this section instead of becoming duplicate top-level keys.
CUTADAPT_REV:
  # Reverse primer sequence as supplied to the trimming step
  # NOTE(review): orientation (as-is vs. reverse-complemented) is not stated
  # here -- confirm against the primer reference and the snakefile
  a: "TYRATCAAGAACGAAAGT"
  # NOTE(review): presumably the minimum read length to keep after trimming
  # (cutadapt -m) -- confirm against the snakefile
  m: 150
  # NOTE(review): presumably the quality-trimming cutoffs for the two read
  # ends (cutadapt -q) -- confirm against the snakefile
  q: "20,20"
  # NOTE(review): presumably the maximum number of N bases allowed
  # (cutadapt --max-n) -- confirm against the snakefile
  mn: 3
############################################################################
# Dereplication
############################################################################
# Directory name to use; keep it short and simple, with no spaces or unusual
# punctuation.
# For the standard pipeline the amplicon name works well, e.g. "18Sv4".
dir: '18Sv4'
############################################################################
# Denoising
############################################################################
# Denoising parameters, nested under VSEARCH_DENOISE so that the section is a
# mapping rather than null.
VSEARCH_DENOISE:
  # Minimum number of reads per cluster to retain.
  # Here, remove all singletons and doubletons; retain clusters with 3+ reads.
  minsize: 3
############################################################################
# ESV x sample table
############################################################################
# VSEARCH params for building the ESV x sample table. The option is nested
# under VSEARCH_TABLE so that the section is a mapping (not null) and `t` does
# not collide with a same-named key elsewhere in the file.
VSEARCH_TABLE:
  # Number of threads to use.
  # Do not exceed the number of jobs allotted to run the whole pipeline
  # (the number of jobs entered at the snakemake command line takes
  # precedence).
  t: 24
############################################################################
# Taxonomic assignment
############################################################################
# Uses the RDP classifier
# Do not use old RDP classifier v2.2 from conda, install the newer v2.12 from SourceForge https://sourceforge.net/projects/rdp-classifier/
# 18S Classifier v3.2 based on SILVA but compatible with RDP classifier available from GitHub https://github.com/terrimporter/18SClassifier
# nuclear-encoded 18S diatom Classifier v1 based on curated barcode sequences from the INRA (Rimet et al., 2016) is available from GitHub https://github.com/terrimporter/SSUdiatomClassifier
# RDP classifier settings. The options are nested under RDP so that the
# section is a mapping (not null) and `t` does not collide with a same-named
# key elsewhere in the file.
RDP:
  # Path to the RDP classifier jar file
  jar: "/path/to/rdp_classifier_2.12/dist/classifier.jar"
  # Path to the trained classifier's rRNAClassifier.properties file
  t: "/path/to/18S_Eukaryota/v4.0/mydata/mydata_trained/rRNAClassifier.properties"
############################################################################
# Reformat CSV
############################################################################
# Prefix each Zotu id with the amplicon name so that ids remain unique when
# data from many amplicons are combined.
# Example command:
#   sed -e 's/^/amplicon_/g' infile > outfile
# Enter only the sed substitution expression below (the part the example
# command wraps in single quotes).
SED: "s/^/18Sv4_/g"