-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig_rbcL.yaml
166 lines (130 loc) · 5.87 KB
/
config_rbcL.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# Configuration file for SCVURLv1
# Author: Teresita M. Porter
# Date: January 24, 2020
############################################################################
# General pipeline settings
############################################################################
# Specify the number of cores available to the pipeline (--jobs), the snakefile, and this configuration file on the command line, e.g.:
# snakemake --jobs 24 --snakefile snakefile --configfile config.yaml
############################################################################
# Identify raw read files
############################################################################
# Directory holding the raw reads (every run, or just a single run).
# The pipeline only accepts compressed fastq input, e.g. fastq.gz.
# The standard pipeline expects this directory to be called "data".
raw: 'data'
# Filename template marking where the 'sample' and 'read' wildcards sit.
# The files are expected in the raw-read folder ("data", "run1", etc.).
# Example filename:
#   SITE-CONDITION-REPLICATE_S1_L001_R1_001.fastq.gz
# which maps onto the template:
#   {sample}_L001_R{read}_001.fastq.gz
raw_sample_read_wildcards: 'data/{sample}_L001_R{read}_001.fastq.gz'
# Forward (R1) and reverse (R2) filename templates used for SEQPREP.
# The files are expected in the raw-read folder ("data", "run1", etc.).
# Example template:
#   {sample}_L001_R1_001.fastq.gz
raw_sample_forward_wildcard: 'data/{sample}_L001_R1_001.fastq.gz'
raw_sample_reverse_wildcard: 'data/{sample}_L001_R2_001.fastq.gz'
############################################################################
# Raw read pairing
############################################################################
# SEQPREP read-pairing parameters.
# The options are nested under SEQPREP so that they form one mapping and do
# not collide with same-named keys (e.g. 'q') belonging to other tools.
SEQPREP:
  # Phred score quality cutoff
  q: 20
  # Minimum overlap length between forward and reverse reads
  o: 25
############################################################################
# Primer trimming
############################################################################
# CUTADAPT parameters for the rbcL-xxx amplicon
# All primers are from Rivera et al., 2008
# FWD primers (5'-3'):
# AGGTGAAGTAAAAGGTTCWTACTTAAA Diat_rbcL_708F_1
# AGGTGAAGTTAAAGGTTCWTAYTTAAA Diat_rbcL_708F_2
# AGGTGAAACTAAAGGTTCWTACTTAAA Diat_rbcL_708F_3
# AGGTGAARYWAAAGGTTCWTAYTTAAA <- use this consensus sequence with cutadapt
# REV primers (5'-3'):
# CCTTCTAATTTACCWACWACTG Diat_rbcL_R3_1
# CCTTCTAATTTACCWACAACAG Diat_rbcL_R3_2
# REV primers (reverse complemented for cutadapt):
# CAGTWGTWGGTAAATTAGAAGG Diat_rbcL_R3_1_rc
# CTGTTGTWGGTAAATTAGAAGG Diat_rbcL_R3_2_rc
# CWGTWGTWGGTAAATTAGAAGG <- use this consensus sequence with cutadapt
# Forward-primer trimming settings.
# The options are nested under CUTADAPT_FWD so that they form one mapping and
# do not collide with the same-named keys of CUTADAPT_REV.
# NOTE(review): the key names look like cutadapt's -g / -m / -q / --max-n
# options - confirm against the snakefile rule that consumes them.
CUTADAPT_FWD:
  # Forward primer consensus sequence (5'-3'), see the FWD primer list above
  g: "AGGTGAARYWAAAGGTTCWTAYTTAAA"
  # Presumably the minimum read length to keep after trimming - confirm
  m: 150
  # Presumably the quality cutoffs for the two read ends - confirm
  q: "20,20"
  # Presumably the maximum number of N bases allowed per read - confirm
  mn: 3
# Reverse-primer trimming settings.
# The options are nested under CUTADAPT_REV so that they form one mapping and
# do not collide with the same-named keys of CUTADAPT_FWD.
# NOTE(review): the key names look like cutadapt's -a / -m / -q / --max-n
# options - confirm against the snakefile rule that consumes them.
CUTADAPT_REV:
  # Reverse-complemented reverse primer consensus sequence, see the REV
  # primer list above
  a: "CWGTWGTWGGTAAATTAGAAGG"
  # Presumably the minimum read length to keep after trimming - confirm
  m: 250
  # Presumably the quality cutoffs for the two read ends - confirm
  q: "20,20"
  # Presumably the maximum number of N bases allowed per read - confirm
  mn: 3
############################################################################
# Dereplication
############################################################################
# Directory name to use here: keep it short and simple, with no spaces or
# weird punctuation.
# For the standard pipeline the amplicon makes a good name, ex. "Diat_rbcL", "rbcL"
dir: 'rbcL'
############################################################################
# Denoising
############################################################################
# Minimum number of reads per cluster required for the cluster to be retained.
# A value of 3 removes all singletons and doubletons and keeps clusters with
# 3+ reads.
# The option is nested under VSEARCH_DENOISE so that the section is a mapping
# rather than an empty key followed by a stray top-level 'minsize'.
VSEARCH_DENOISE:
  minsize: 3
############################################################################
# Get CDS
############################################################################
# Translate ESVs into all open reading frames
# ORFfinder params
# The options are nested under ORFFINDER so that they form one mapping and do
# not collide with same-named keys (e.g. 'g') belonging to other tools.
ORFFINDER:
  # genetic code
  # 5 = invertebrate mitochondrial, see NCBI for additional genetic codes
  # 1 = standard code, use this for green plant chloroplast sequences
  # 11 = bacterial, archaeal, plant plastid code, use for diatom chloroplast sequences
  # NOTE(review): the primers configured in this file are diatom rbcL primers,
  # for which the list above suggests 11 - confirm that 1 is intended.
  g: 1
  # ORF start codon to use
  # 0 = ATG only
  # 1 = ATG and alternative initiation codon (default)
  # 2 = any sense codon
  s: 2
  # minimum length (default 75, min 30)
  ml: 30
  # ignore nested ORFs (true|false)
  # Kept as a quoted string rather than a YAML boolean; presumably the value
  # is passed verbatim to ORFfinder - confirm against the snakefile.
  n: 'true'
  # strand (both|plus|minus)
  strand: 'plus'
  # outfile format
  # 0 = list of ORFs in FASTA format (aa)
  # 1 = CDS fasta (nt)
  # 2 = Text ASN.1
  # 3 = Feature table
  outfmt: 1
############################################################################
# ESV x sample table
############################################################################
# VSEARCH params
# The option is nested under VSEARCH_TABLE so that it does not collide with
# the 't' key of the RDP section.
VSEARCH_TABLE:
  # Indicate number of threads to use
  # Do not exceed the number of jobs allotted to run the whole pipeline
  # (the --jobs value given on the snakemake command line)
  t: 24
############################################################################
# Taxonomic assignment
############################################################################
# Uses the RDP classifier
# Do not use old RDP classifier v2.2 from conda, install the newer v2.12 from SourceForge https://sourceforge.net/projects/rdp-classifier/
# rbcL Classifier v1, based on sequences mined from GenBank and compatible with the RDP classifier, is available from GitHub https://github.com/Hajibabaei-Lab/SCVURL_rbcL_metabarcode_pipeline
# rbcL Diatom Classifier v1 based on curated barcode sequences from INRA (Rimet et al., 2016 Database) is available from https://github.com/terrimporter/rbcLdiatomClassifier
# RDP classifier settings.
# The options are nested under RDP so that they form one mapping and the 't'
# key does not collide with the 't' key of VSEARCH_TABLE.
RDP:
  # Path to the RDP classifier jar (v2.12, see the install note above)
  jar: "/path/to/rdp_classifier_2.12/dist/classifier.jar"
  # Path to the rRNAClassifier.properties file of the trained rbcL classifier
  t: "/path/to/rbcLClassifier/v1/mydata/mydata_trained/rRNAClassifier.properties"
############################################################################
# Reformat CSV
############################################################################
# Add the amplicon name to each Zotu id to keep these ids unique when data from many amplicons are combined
# The pattern will prefix the Zotu with the amplicon name
# Ex. sed -e 's/^/amplicon_/g' infile > outfile
# Below, enter the full sed substitution expression (the part in single quotes above),
# with "amplicon_" replaced by the prefix to use, ex. "rbcL_" or "rbcLdiatom_"
SED: 's/^/rbcL_/g'