-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path0_grexomizeFastqs.pl
executable file
·254 lines (211 loc) · 10.2 KB
/
0_grexomizeFastqs.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
#!/usr/bin/perl
############################################################################################
# Copyright (C) Nicolas Thierry-Mieg, 2019-2025
#
# This file is part of grexome-TIMC-Primary, written by Nicolas Thierry-Mieg
# (CNRS, France) Nicolas.Thierry-Mieg@univ-grenoble-alpes.fr
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with this program.
# If not, see <https://www.gnu.org/licenses/>.
############################################################################################
# 13/06/2019
# NTM
# This is a house-keeping script used to organize our FASTQ files before
# running the grexome-TIMC-primary pipeline. It's probably not re-usable as-is.
#
# Sometimes we get several pairs of FASTQ files for a single sample,
# eg when the sample was sequenced on several lanes.
# Also, the naming conventions are always different.
#
# This script creates a single pair of "files" (symlinks when possible)
# in $inPath/../$outDir/ for each sample, with uniform naming scheme
# ${sample}_[12].fq.gz .
#
# Basically we want to symlink when there's a single pair of files,
# and otherwise cat @infiles1 > $outDir/${sample}_1.fq.gz
# and same for @infiles2 (in same order).
# NOTE: we CAN just concatenate multiple gzip files and get a sane gzip file,
# see gzip spec https://tools.ietf.org/html/rfc1952 (grep for "members")
#
# We try to find original FASTQs automatically, by looking for .gz files
# in subdirs of $inPath whose name contains specimenID from the xlsx file.
# This is a bit fragile but we sanity-check as much as possible: we make
# sure each source file is used at most once, and we report at the end
# if every source fastq was actually used. Always do a dry run initially
# and examine the log! Then add --real to do the real work.
#
# See $USAGE for arguments.
use strict;
use warnings;
use File::Basename qw(basename);
use FindBin qw($RealBin);
use Getopt::Long;
use lib "$RealBin";
use grexome_metaParse qw(parseSamples);
# make glob case-insensitive (for eg */P17if085*)
use File::Glob qw(:globally :nocase);
# we use $0 in every stderr message but we really only want
# the program name, not the path
$0 = basename($0);
#############################################
## options / params from the command-line
# filename with path to samples metadata xlsx file
my $samplesFile;
# path containing subdirs containing the original fq.gz files
my $inPath = '';
# brother-dir of $inPath where new FASTQs are created/symlinked, must
# be a plain dirname (no slashes allowed)
my $outDir = "FASTQs_All_Grexomized";
# real: if true actually process files, otherwise just print INFO
# messages on what would be done. Default to false (==dry run)
my $real = '';
# help: if true just print $USAGE and exit
my $help = '';
my $USAGE = "
Arguments (all can be abbreviated to shortest unambiguous prefixes):
--samplesFile string : samples metadata xlsx file, with path.
--inpath string : path to a directory containing subdirs, each containing the
actual original fastq.gz files (eg FASTQs_grexome/).
--outdir string [default: $outDir] : name of brother-dir of \$inPath where \$sampleID_*.fq.gz
files are symlinked/produced. NOTE: outDir cannot contain a slash, it will be created
if needed and will be a BROTHER of the final subdir of \$inPath ie \$inPath/../\$outDir.
--real : actually do the work, default is a dry run ie only print info on what would be done.
--help : print this USAGE";
GetOptions ("samplesFile=s" => \$samplesFile,
"inpath=s" => \$inPath,
"outdir=s" => \$outDir,
"real" => \$real,
"help" => \$help)
or die("E $0: Error in command line arguments\n\n$USAGE\n");
# make sure required options were provided and sanity check them
($help) && die "$USAGE\n\n";
($samplesFile) || die "E $0: you must provide a samplesFile file\n";
(-f $samplesFile) || die "E $0: the supplied samplesFile file doesn't exist\n";
($inPath) ||
die "E: you MUST specify the path holding subdirs that contain the original FASTQs, with --inpath\n";
(-d $inPath) ||
die "E: the provided inpath $inPath doesn't exist as a dir\n";
($outDir =~ m~/~) &&
die "$USAGE\n\nE: the provided outdir $outDir has a slash, it must be a plain name\n";
# strip any trailing slashes from $inPath, and split $inPath into $parentDir and $lastDir
$inPath =~ s~/*$~~ ;
my ($parentDir,$lastDir);
if ($inPath =~ m~^(.+/)([^/]+)$~) {
($parentDir,$lastDir) = ($1,$2);
}
else {
($parentDir,$lastDir) = ("",$inPath);
}
# create $outDir if needed
(-d "$parentDir$outDir/") || (mkdir("$parentDir$outDir/")) ||
die "E: outDir $parentDir$outDir doesn't exist as a dir but can't be created\n";
#############################################
## parse $samplesFile
# $sample2specimenR : hashref, key is sampleID, value is specimenID
my $sample2specimenR;
{
my @parsed = &parseSamples($samplesFile);
$sample2specimenR = $parsed[1];
}
#############################################
# now deal with each sample
# make sure we don't use the same infile twice (eg if globbing isn't specific enough)
# key == infile (as found by the glob), value == $sample if file was already used
my %infilesDone = ();
foreach my $sample (sort keys(%$sample2specimenR)) {
my $specimen = $sample2specimenR->{$sample};
# precise filename patterns for each dataset:
# strasbourg: /${specimen}-R1.fastq.gz
# integragen: /Sample_${specimen}_R1_fastq.gz
# FV: /${specimen}_*_R1_001.fastq.gz
# genoscope13: /G430_CP_${specimen}_[0-9]_1_*.fastq.gz
# genoscope14 and 15: /E487_CP_${specimen}_[0-9]_1_*.fastq.gz
# novo16: /s${specimen}_*_1.fq.gz
# novo17: /P${specimen}_*_1.fq.gz
# novo19: /${specimen}_*_1.fq.gz
# Biomnis19: /${specimen}_*_R1_001.fastq.gz
# IBP_2019: /${specimen}_R1_001.fastq.gz (they fucked up the names with i for 1 but I fixed it)
# IBP_2019 second batch (01/08/2019): /${specimen}_R1.fastq.gz
# IBP_2019 later batches (10/11/2020): /${specimen}_*_R1_001.fastq.gz
# IBP_2019 auto-uploaded batches (04/01/2023): /INF-${specimen}_*_R1_001.fastq.gz
# IBP_2019 re-sequenced botched samples (20/01/2023): /INF-${specimen}BIS_*_R1_001.fastq.gz
# BGI_2021: /\w+_L\d_{$specimen}-\d+_1.fq.gz (eg: V300096729_L4_B5EHUMazpEBAAIBAA-522_1.fq.gz)
# Macrogen_2021: /${specimen}_1.fastq.gz
# these are unified into the following globs, update if needed when
# new datasets arrive (we error out if something is fishy)
my @files1 = glob("${inPath}/*/*${specimen}*[_-]R1[_.]fastq.gz ${inPath}/*/*${specimen}_1.fastq.gz ${inPath}/*/*${specimen}_*R1_001.fastq.gz ${inPath}/*/*${specimen}[_-]*_1.fq.gz ${inPath}/*/*${specimen}_[0-9]_1_*.fastq.gz");
my @files2 = glob("${inPath}/*/*${specimen}*[_-]R2[_.]fastq.gz ${inPath}/*/*${specimen}_2.fastq.gz ${inPath}/*/*${specimen}_*R2_001.fastq.gz ${inPath}/*/*${specimen}[_-]*_2.fq.gz ${inPath}/*/*${specimen}_[0-9]_2_*.fastq.gz");
(@files1) ||
((warn "W: no files found for $sample == specimen $specimen, fix the globs, skipping this sample for now\n") && next);
(@files1 == @files2) ||
die "E: different numbers of files for the 2 paired-ends of $sample == specimen $specimen:\n@files1\n@files2\n";
# make sure these files haven't been seen before
foreach my $f (@files1, @files2) {
(defined $infilesDone{$f}) &&
die "E: infile $f found for $sample == specimen $specimen, but it was previously used for $infilesDone{$f}! check the logs for it! Then fix the globs\n";
$infilesDone{$f} = $sample;
}
if ((-e "$parentDir$outDir/${sample}_1.fq.gz") && (-e "$parentDir$outDir/${sample}_2.fq.gz")) {
# 09/09/2019: don't INFO when skipping, it's too much noise
# warn "I: skipping $sample == specimen $specimen because grexomized fastqs already exist\n";
next;
}
elsif ((-e "$parentDir$outDir/${sample}_1.fq.gz") || (-e "$parentDir$outDir/${sample}_2.fq.gz")) {
die "E: $sample already has one grexomized FASTQ but not the other! Something is wrong, investigate!\n";
}
# else keep going, need to make the symlinks/files
if (@files1 == 1) {
# prepare symlink target names
my $f1 = $files1[0];
my $f2 = $files2[0];
# replace $inPath with ../$lastDir/
($f1 =~ s~^$inPath~../$lastDir/~) ||
die "E: cannot replace inPath $inPath from file1 $f1 for specimen $specimen == $sample\n";
($f2 =~ s~^$inPath~../$lastDir/~) ||
die "E: cannot replace inPath $inPath from file2 $f2 for specimen $specimen == $sample\n";
my $com = "cd $parentDir$outDir/; ln -s $f1 ${sample}_1.fq.gz ; ln -s $f2 ${sample}_2.fq.gz" ;
if (! $real) {
warn "I: dryrun, would run: $com\n";
}
else {
warn "I: single pair of files for $sample, symlinking with: $com\n";
system($com);
}
}
else {
# several pairs of files, need to cat them
my $com1 = "cat @files1 > $parentDir$outDir/${sample}_1.fq.gz";
my $com2 = "cat @files2 > $parentDir$outDir/${sample}_2.fq.gz";
if (! $real) {
warn "I: dryrun, would run: $com1\n";
warn "I: dryrun, would then run: $com2\n";
}
else {
warn "I: starting cat of $sample with command: $com1\n";
system($com1);
warn "I: finishing cat of $sample with command: $com2\n";
system($com2);
}
}
}
my $nbInfiles = scalar(keys(%infilesDone));
my $nbFqFiles = `cd $inPath ; /bin/ls -1 */*.gz | wc -l` ;
chomp($nbFqFiles);
# some grexomes have been obsoleted because they were dupes,
# the corresponding FASTQs are still there, don't warn about them.
# number of obsolete FASTQ files: hard-coded here,
my $nbObsoleteFiles = 28;
if ($nbInfiles + $nbObsoleteFiles == $nbFqFiles) {
warn "\nI: nb of examined ($nbInfiles) + skipped obsolete ($nbObsoleteFiles) FASTQ files == nbFqFiles found with ls|wc, good!\n";
}
else {
warn "\nW: examined $nbInfiles FASTQs and skipped $nbObsoleteFiles obsoletes, but we actually have $nbFqFiles! why didn't we examine them all? check this!!\n";
}