-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2_bam2gvcf_strelka.pl
executable file
·210 lines (169 loc) · 7.28 KB
/
2_bam2gvcf_strelka.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#!/usr/bin/perl
############################################################################################
# Copyright (C) Nicolas Thierry-Mieg, 2019-2025
#
# This file is part of grexome-TIMC-Primary, written by Nicolas Thierry-Mieg
# (CNRS, France) Nicolas.Thierry-Mieg@univ-grenoble-alpes.fr
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with this program.
# If not, see <https://www.gnu.org/licenses/>.
############################################################################################
# 16/06/2019
# NTM
# Call variants using strelka on BAM files.
# GVCF files and strelka stats will be created in subdirs of $outDir,
# if a subdir matching an inFile already exists this inFile is skipped.
# When anything potentially important happens (eg skipping an infile),
# messages are logged on stdout.
#
# See $USAGE for arguments.
use strict;
use warnings;
use Getopt::Long;
use POSIX qw(strftime);
use File::Basename qw(basename);
# we use $0 in every stderr message but we really only want
# the program name, not the path
$0 = basename($0);
#############################################
## options / params from the command-line
# subdir where BAMs can be found
my $inDir;
# comma-separated list of samples (FASTQs) to process (required)
my $samples = '';
# path+filename of ref genome, currently we recommend the full GRCh38 with
# decoy+alts+unmapped, as produced by Heng Li's run-gen-ref (from bwa-kit)
my $refGenome;
# bgzipped and tabix-indexed BED defining regions where variants should be called,
# any other genomic region is ignored
my $chromsBed;
# dir where GVCF-containing subdirs will be created
my $outDir;
# path+name of configureStrelkaGermlineWorkflow.py from strelka distrib,
# or use a one-liner wrapper eg strelkaGermline.sh that can be in your PATH,
# this is the default (works on fauve, luxor...)
my $strelka = "strelkaGermline.sh";
# type of data: exome or genome
my $datatype = "exome";
# number of parallel jobs to run
my $jobs = 16;
# $real: if not true don't actually process anything, just print INFO
# messages on what would be done. Default to false
my $real = '';
# help: if true just print $USAGE and exit
my $help = '';
my $USAGE = "
Arguments (all can be abbreviated to shortest unambiguous prefixes):
--indir : subdir containing the BAMs
--samples : comma-separated list of sampleIDs to process, for each sample we expect
[sample].bam and [sample].bam.bai files in indir
--genome : ref genome fasta, with path
--chroms : optional, if provided it must be a bgzipped and tabix-indexed BED file
defining regions where variants should be called
--outdir : dir where GVCF-containing subdirs will be created (one subdir per sample)
--strelka [$strelka] : must be either the path+name of configureStrelkaGermlineWorkflow.py
(from strelka distrib), or a wrapper script that can be in your PATH
--datatype [$datatype] : type of data, among {exome,genome}
--jobs [$jobs] : number of cores that strelka can use
--real : actually do the work, otherwise this is a dry run
--help : print this USAGE";
GetOptions ("indir=s" => \$inDir,
"samples=s" => \$samples,
"genome=s" => \$refGenome,
"chroms=s" => \$chromsBed,
"outdir=s" => \$outDir,
"strelka=s" => \$strelka,
"datatype=s" => \$datatype,
"jobs=i" => \$jobs,
"real" => \$real,
"help" => \$help)
or die("E $0: Error in command line arguments\n$USAGE\n");
# make sure required options were provided and sanity check them
($help) && die "$0 $USAGE\n\n";
($inDir) ||
die "E $0: you MUST provide --indir where BAMs can be found\n$USAGE\n";
(-d $inDir) ||
die "E $0: inDir specified is not a folder!";
# save samples in %samples to detect duplicates and allow sorting
my %samples;
foreach my $sample (split(/,/, $samples)) {
if ($samples{$sample}) {
warn "W $0: sample $sample was specified twice, is that a typo? Ignoring the dupe\n";
next;
}
$samples{$sample} = 1;
}
($refGenome) || die "E $0: you must provide a ref genome fasta file\n";
(-f $refGenome) || die "E $0: provided genome fasta file doesn't exist\n";
if ($chromsBed) {
(-f $chromsBed) || die "E $0: provided --chroms file doesn't exist\n";
(-f "$chromsBed.tbi") || (-f "$chromsBed.csi") ||
die "E $0: can't find tabix index for provided --chroms file\n";
}
($datatype eq 'exome') || ($datatype eq 'genome') ||
die "E $0: illegal datatype $datatype, must be among {exome,genome}\n";
($jobs > 0) ||
die "E $0: called with jobs=$jobs but we need at least one thread\n";
($outDir) ||
die "E $0: you MUST specify --outdir where GVCF-containing subdirs will be created\n$USAGE\n";
(-d $outDir) || (mkdir($outDir)) ||
die "E $0: outDir $outDir doesn't exist as a dir and can't be created\n";
# make sure strelka can be found
system("which $strelka &> /dev/null") &&
die "E $0: the strelka python (or shell wrapper) $strelka can't be found\n";
#############################################
## process each sample of interest
my $now = strftime("%F %T", localtime);
warn "I $now: $0 - STARTING TO WORK\n";
foreach my $sample (sort keys(%samples)) {
# make sure we have bam and bai files for $sample, otherwise skip
my $bam = "$inDir/$sample.bam";
((-e $bam) && (-e "$bam.bai")) ||
((warn "W $0: no BAM or BAI for $sample in inDir $inDir, skipping $sample\n") && next);
# strelka will produce a GVCF but also other files (stats etc) in $runDir
my $runDir = "$outDir/$sample/";
# strelka configure command
my $com = "$strelka --bam $bam --referenceFasta $refGenome";
($chromsBed) && ($com .= " --callRegions $chromsBed");
($datatype eq 'exome') && ($com .= " --exome");
$com .= " --runDir $runDir > /dev/null";
# only run the strelka configuration step if runDir doesn't exist
if (! -e $runDir) {
if (! $real) {
warn "I $0: dryrun, would configure strelka for $sample with: $com\n";
}
else {
$now = strftime("%F %T", localtime);
warn "I $now: $0 - configuring strelka for $sample\n";
system($com);
}
}
else {
warn "I $0: runDir $runDir already exists, assuming strelka is already configured\n";
}
# now run strelka (does nothing if it was already completed, but resumes
# if it was interrupted, nice)
# NOTE: we use --quiet since strelka is very verbose and stderr is replicated
# to workspace/pyflow.data/logs/pyflow_log.txt anyways
$com = "$runDir/runWorkflow.py -m local -j $jobs --quiet";
if (! $real) {
warn "I $0: dryrun, would run strelka for $sample with: $com\n";
}
else {
(-e "$runDir/runWorkflow.py") ||
((warn "I $0: want to run strelka for $sample but runDir $runDir doesn't exist, configure failed??\n") && next);
$now = strftime("%F %T", localtime);
warn "I $now: $0 - running strelka for $sample\n";
system($com);
}
}
$now = strftime("%F %T", localtime);
warn "I $now: $0 - ALL DONE\n";