-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbcl2fastq.snakemake
176 lines (169 loc) · 6.58 KB
/
bcl2fastq.snakemake
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import itertools
import os
import collections
import json
import glob
from snakemake.utils import R
#from snakemake.utils import min_vession
#min_version("3.2")
from snakemake.exceptions import MissingInputException
#Author: Rajesh Patidar
#Email: patidarr
#This is a pipeline to convert bcl files to fastq and send a report to a email list.
# it also changes names Sample_SampleID_FCID nomenclature.
#
# TODO
# parse original samplesheet to rename files.
# add _E _P _T _W _S in names to show type of data
#
INPUT=os.environ['TARGET']
SOURCE=os.environ['SOURCE']
# Snakemake Base location
shell.prefix("""
#PBS -S /bin/bash
#source /etc/profile.d/modules.sh
source /etc/prolibrary.d/modules.sh
module load snakemake
set -e -o pipefail
sleep 10s
""")
path=INPUT.split("/")
PATH=os.path.dirname(INPUT)
FCID=path[-2]
onstart:
shell("echo 'bcl2fastq conversion started {FCID} '| mutt -s 'bcl2fastq status ' `whoami`@mail.nih.gov ")
onerror:
shell("echo 'bcl2fastq pipeline failed on {FCID} '|mutt -s ' bcl2fastq status' `whoami`@mail.nih.gov ")
#onsuccess:
# shell("echo 'bcl2fastq pipeline finished successfully on {FCID} ' |mutt -s ' bcl2fastq status' `whoami`@mail.nih.gov ")
f = open(PATH+"/SampleSheet.csv", 'r')
FCName=FCID.split("_")
FCName=FCName[-1][1:]
TARGET = []
SAMPLES =[]
# Original Files names has will come from the SampleSheet.csv
#
#
NAMES ={}
for line in f:
if 'Sample_ID' in line:
column = line.split(",")
sampleID =column.index('Sample_ID')
sampleName =column.index('Sample_Name')
for line in f:
column = line.split(",")
SAMPLES += [column[sampleID]]
NAMES[column[sampleID]] = column[sampleName]
TARGET += [PATH+"/Unaligned/Sample_"+column[sampleID]+"_"+FCName+"/Transfer.done"]
TARGET += [PATH+"/Unaligned/Sample_"+column[sampleID]+"_"+FCName+"/Sample_"+column[sampleID]+"_"+FCName+"_R1.fastq.gz"]
TARGET += [PATH+"/Unaligned/Sample_"+column[sampleID]+"_"+FCName+"/Sample_"+column[sampleID]+"_"+FCName+"_R2.fastq.gz"]
NAMES['Undetermined'] = 'Undetermined'
TARGET +=[PATH+"/Unaligned/Sample_Undetermined_"+FCName+"/Sample_Undetermined_"+FCName+"_R2.fastq.gz"]
# For any sample where a ~ is converted to _ in SampleSheet.csv we need to read SampleSheet_ori.csv to make fq files in original names.
f = open(PATH+"/SampleSheet_ori.csv", 'r')
for line in f:
if 'Sample_ID' in line:
column = line.split(",")
sampleID =column.index('Sample_ID')
sampleName =column.index('Sample_Name')
for line in f:
column = line.split(",")
if column[sampleID] not in SAMPLES:
NAMES[column[sampleID]] = column[sampleName]
oldname=column[sampleID].replace("~", "_")
# if a replacement happened the replaced names are not targets anymore
TARGET.remove(PATH+"/Unaligned/Sample_"+oldname+"_"+FCName+"/Sample_"+oldname+"_"+FCName+"_R1.fastq.gz")
TARGET.remove(PATH+"/Unaligned/Sample_"+oldname+"_"+FCName+"/Sample_"+oldname+"_"+FCName+"_R2.fastq.gz")
TARGET.remove(PATH+"/Unaligned/Sample_"+oldname+"_"+FCName+"/Transfer.done")
# Add correct files to target list
TARGET += [PATH+"/Unaligned/Sample_"+column[sampleID]+"_"+FCName+"/Transfer.done"]
TARGET += [PATH+"/Unaligned/Sample_"+column[sampleID]+"_"+FCName+"/Sample_"+column[sampleID]+"_"+FCName+"_R1.fastq.gz"]
TARGET += [PATH+"/Unaligned/Sample_"+column[sampleID]+"_"+FCName+"/Sample_"+column[sampleID]+"_"+FCName+"_R2.fastq.gz"]
#################################
localrules: Final
#################################
# Parse html pages and send report email.
#################################
rule Final:
input: TARGET
params:
rulename = "final",
batch = "-l nodes=1,cput=100:00:00,pvmem=5gb,ncpus=3"
shell: """
############################
chgrp -R "nci-frederick pcc all users" /projects/lihc_hiseq/static/{FCID}/Unaligned/
{SOURCE}/makeReport.pl /projects/lihc_hiseq/static/ {FCID}/Unaligned {FCName} |mutt -e "my_hdr Content-Type: text/html" -s "bcl2fastq status on {FCID}" patidarr@mail.nih.gov -c corinne.camalier@nih.gov -c amanda.peach@nih.gov -c dasbiswa@mail.nih.gov -c li.chen2@nih.gov
############################
"""
#################################
# Transfer data to biowulf
#################################
rule Transfer:
input:
"{run}/Unaligned/Sample_{library}_{FCID}/Sample_{library}_{FCID}_R1.fastq.gz"
output:
"{run}/Unaligned/Sample_{library}_{FCID}/Transfer.done"
params:
rulename = "Transfer",
batch = "-l nodes=1,cput=100:00:00,pvmem=1gb,ncpus=1"
shell: """
############################
rsync -avzr {wildcards.run}/Unaligned/Sample_{wildcards.library}_{wildcards.FCID}/ helix.nih.gov:/data/MoCha/DATA/Sample_{wildcards.library}_{wildcards.FCID}/
ssh biowulf "chgrp -R MoCha /data/MoCha/DATA/Sample_{wildcards.library}_{wildcards.FCID}/"
ssh biowulf "chmod 755 /data/MoCha/DATA/Sample_{wildcards.library}_{wildcards.FCID}"
ssh biowulf "chmod 644 /data/MoCha/DATA/Sample_{wildcards.library}_{wildcards.FCID}/*"
touch {output}
############################
"""
#################################
# Merge fq files if same sample is loaded on to multiple lanes.
# Also changes names to contain FCID in it.
#################################
rule Merge:
input:
"{run}/Unaligned/Reports/html/tree.html",
"{run}/SampleSheet.csv"
output:
R1="{run}/Unaligned/Sample_{library}_{FCID}/Sample_{library}_{FCID}_R1.fastq.gz",
R2="{run}/Unaligned/Sample_{library}_{FCID}/Sample_{library}_{FCID}_R2.fastq.gz",
params:
rulename = "merge",
name = lambda wildcards: NAMES[wildcards.library],
batch = "-l nodes=1,cput=100:00:00,pvmem=5gb,ncpus=3"
shell: """
############################
newlib=`echo {params.name} |sed -e 's/~/_/g'`
R1=`find {wildcards.run}/Unaligned/ -name "${{newlib}}*_R1_*gz*" | tr '\\n' ' '`
R2=`find {wildcards.run}/Unaligned/ -name "${{newlib}}*_R2_*gz*" | tr '\\n' ' '`
count=`find {wildcards.run}/Unaligned/ -name "{wildcards.library}*_R1_*gz*" |wc -l`
if [ ${{count}} >1 ]; then
echo "Working on ${{R1}}"
/bin/zcat ${{R1}} |gzip >{output.R1} &
/bin/zcat ${{R2}} |gzip >{output.R2} &
else
echo "Working on ${{R1}}"
mv ${{R1}} {output.R1}
mv ${{R2}} {output.R2}
fi
wait
############################
"""
#################################
# This is bcl2fastq conversion
#################################
rule BCL2FASTQ:
input:
"{run}/RTAComplete.txt"
output:
"{run}/Unaligned/Reports/html/tree.html"
params:
rulename = "bcl2fastq",
batch = "-l nodes=1,cput=100:00:00,pvmem=72gb,ncpus=16"
shell: """
############################
module load bcl2fastq2
runName=`basename {wildcards.run}`
bcl2fastq -R {wildcards.run} -o {wildcards.run}/Unaligned/
chgrp -R "nci-frederick pcc all users" {wildcards.run}/Unaligned/
############################
"""