-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathsample_qscores.cpp
336 lines (276 loc) · 13.1 KB
/
sample_qscores.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
#include <zlib.h>
#include <cstring>
#include <cassert>
#include <htslib/kstring.h>
#include "RandSampling.h"
#include "sample_qscores.h"
#include "mrand.h"
#include "NGSNGS_misc.h"
#define LENS 10000
#define MAXBINS 100
extern int refToInt[256];
extern char intToRef[4];
double phred2Prob[256];
ransampl_ws ***ReadQuality(char *ntqual, double *ErrProb, int ntcharoffset,const char *freqfile,int &inferredreadcycle){
/*
ReadQuality - reads quality scores and error probabilities from the input frequency file and sets up alias sampling distributions for cycle specific sequencing error depending on nucleotide
The function also initializes a lookup table for Phred scores to probability conversions.
@param ntqual A character array where nucleotide quality scores will be stored.
@param ErrProb An array where error probabilities corresponding to each read position will be stored.
@param ntcharoffset An integer offset to adjust quality scores.
@param freqfile Path to the input frequency file containing quality score data.
@param inferredreadcycle Reference to an integer where the inferred read cycle count will be stored.
@return A 3D array of pointers (`ransampl_ws ***`) representing the alias method
workspaces set up for each of the 5 bins representing possible nucleotide substitution errors (A,G,C,T,N) given read positions.
*/
// Initialize Phred score to probability conversion table
for(int qscore =0 ;qscore<256;qscore++){
double d = qscore;
phred2Prob[qscore] = pow(10,((-d)/10));
}
// Allocate memory for the 3D array of distribution
ransampl_ws ***dists = new ransampl_ws**[5];
// Read lines from the file and store them in the vector
std::vector<char *> all_lines;
gzFile gz = Z_NULL;
assert(((gz = gzopen(freqfile,"rb")))!=Z_NULL);
char buf[LENS];
while(gzgets(gz,buf,LENS))
all_lines.push_back(strdup(buf));
gzclose(gz);
inferredreadcycle = (all_lines.size()-2)/5; // Infer the number of sequencing cycles from the input data
// Loop over each of the 5 bins
int nbins = -1;
double probs[MAXBINS];
for(int b=0;b<5;b++){
dists[b] = new ransampl_ws *[inferredreadcycle];
// Loop over each read position
for(int pos = 0 ; pos<inferredreadcycle;pos++){
int at = 0;
probs[at++] = atof(strtok(all_lines[2+b*inferredreadcycle+pos],"\n\t "));
char *tok = NULL;
// Parse probabilities from the line
while(((tok=strtok(NULL,"\n\t ")))){
probs[at++] = atof(tok);
assert(at<MAXBINS);
}
// Validate that all bins have the same number of columns (the substitutions)
if(nbins==-1){
nbins = at;
}
if(nbins!=at){
fprintf(stderr,"Problems, number of columns is different nbins: %d at: %d\n",nbins,at);
exit(1);
}
// Allocate and set up the alias method for the current position
dists[b][pos] = ransampl_alloc( nbins );
ransampl_set(dists[b][pos],probs);
}
}
// Parse the nucleotide quality scores from the first line
int qualidx = 1;
ntqual[0] = (char) (atoi(strtok(all_lines[0],"\n\t "))+ntcharoffset);
char *qualtok = NULL;
while(((qualtok=strtok(NULL,"\n\t ")))){
ntqual[qualidx++] = (char) (atoi(qualtok)+ntcharoffset);
}
int Err_idx = 1;
// Parse the error probabilities from the second line
ErrProb[0] = atof(strtok(all_lines[1],"\n\t"));
char *Errtok = NULL;
while ((Errtok=strtok (NULL,"\n\t"))){
ErrProb[Err_idx++] = (double) atof(Errtok);
}
// Free the memory allocated by strdup for each line
for (unsigned long i = 0; i < all_lines.size(); i++){
free(all_lines[i]);
}
return dists;
}
int sample_qscores(char *bases, char *qscores,int len,ransampl_ws ***ws,char *NtQuals,mrand_t *mr,int simError, int ntcharoffset){
/*
sample_qscores - Samples quality scores for a given sequence of bases using a precomputed alias structure and generate sequencing error (by replacing nucleotide) based on quality scores
@param bases Character array representing the sequence of bases within original DNA molecule.
@param qscores Character array where sampled quality scores will be stored.
@param len Length of the sequence (`bases` and `qscores` arrays).
@param ws 3D array of alias sampling workspaces for each base and position.
@param NtQuals Array mapping index to nucleotide quality scores.
@param mr Pointer to the random number generator state (type `mrand_t`).
@param simError Boolean flag indicating whether sequencing errors should be simulated.
@param ntcharoffset Offset used to adjust quality scores to Phred scale.
@return Returns 1 if an error was simulated in the sequence, otherwise 0.
*/
int seq_err = 0;
// Iterate over each base in the input sequence
for(int i = 0;i<len;i++){
// Generate two random numbers for sampling
double dtemp1 = mrand_pop(mr);
double dtemp2 = mrand_pop(mr);
// Convert the input base to its integer representation
char inbase = refToInt[bases[i]];
// Draw a quality score index for the current base at position `i`
int qscore_idx = ransampl_draw2(ws[inbase][i],dtemp1,dtemp2);
// Assign the corresponding quality score from NtQuals
qscores[i] = NtQuals[qscore_idx];
// If simulating sequencing errors, determine if an error should be introduced
if (simError){
double tmprand = mrand_pop(mr);
// Check if the generated random number is less than the probability of error
if ( tmprand < phred2Prob[qscores[i]-ntcharoffset]){
// Calculate a random output base, ensuring it differs from the input base and set the error flag
int outbase=(int)floor(4.0*phred2Prob[qscores[i]-ntcharoffset]*tmprand);
while (((outbase=((int)floor(4*mrand_pop(mr))))) == inbase);
bases[i] = intToRef[outbase];
seq_err = 1;
}
}
}
return seq_err;
}
int sample_qscores_amplicon(kstring_t* seq,kstring_t* qual,ransampl_ws ***ws,char *NtQuals,mrand_t *mr,int simError, int ntcharoffset){
/*
sample_qscores_amplicon - Samples quality scores for an amplicon sequence and optionally simulates sequencing errors
@param seq Pointer to the kstring_t sequence representing the empirical sequencing reads.
@param qual Pointer to the kstring_t sequence representing the empirical quality strings reads.
@param ws 3D array of alias sampling workspaces for each base and position.
@param NtQuals Array mapping index to nucleotide quality scores.
@param mr Pointer to the random number generator state (type `mrand_t`).
@param simError Boolean flag indicating whether sequencing errors should be simulated.
@param ntcharoffset Offset used to adjust quality scores to Phred scale.
@return Returns 1 if an error was simulated in the sequence, otherwise 0.
*/
int seq_err = 0;
// Iterate over each base in the empirical sequence read
for(int i = 0;i<seq->l;i++){
// Generate two random numbers for sampling
double dtemp1 = mrand_pop(mr);
double dtemp2 = mrand_pop(mr);
// Convert the input base to its integer representation
char inbase = refToInt[seq->s[i]];
// Draw a quality score index for the current base at position `i`
int qscore_idx = ransampl_draw2(ws[refToInt[seq->s[i]]][i],dtemp1,dtemp2);
// Assign the corresponding quality score from NtQuals
qual->s[i] = NtQuals[qscore_idx];
// If simulating sequencing errors, determine if an error should be introduced
if (simError){
double tmprand = mrand_pop(mr);
// Check if the generated random number is less than the probability of error
if ( tmprand < phred2Prob[qual->s[i]-ntcharoffset]){
// Calculate a random output base, ensuring it differs from the input base and set the error flag
int outbase=(int)floor(4.0*phred2Prob[qual->s[i]-ntcharoffset]*tmprand);//DRAGON
while (((outbase=((int)floor(4*mrand_pop(mr))))) == inbase);
seq->s[i] = intToRef[outbase];
seq_err = 1;
}
}
}
return seq_err;
}
int sample_qscores_fix(char *bases, char *qscores, int qscoresval,int len,mrand_t *mr,int simError, int ntcharoffset){
/*
sample_qscores_fix - Samples quality scores for sequence and optionally simulates sequencing errors using a fixed quality score.
@param bases Pointer to the array of sequence bases.
@param qscores Pointer to the array where quality scores will be stored.
@param qscoresval The fixed integer quality score value to be applied to all bases.
@param len The length of the sequence.
@param mr Pointer to the random number generator state (type `mrand_t`).
@param simError Boolean flag indicating whether sequencing errors should be simulated.
@param ntcharoffset Offset used to adjust quality scores to Phred scale.
@return Returns 1 if an error was simulated in the sequence, otherwise 0.
*/
int seq_err = 0;
if (simError){
// Precompute Phred scale probabilities for all possible quality scores
for(int qscore =0 ;qscore<256;qscore++){
double d = qscore;
phred2Prob[qscore] = pow(10,((-d)/10));
}
}
// Convert fized integer score into the corresponding character considering the offset (different from fq or bam)
char fixedscore = (char)(qscoresval + ntcharoffset);
for(int i = 0;i<len;i++){
qscores[i] = fixedscore;
char inbase = refToInt[bases[i]];
if (simError){
double tmprand = mrand_pop(mr);
// Check if the generated random number is less than the probability of error
if ( tmprand < phred2Prob[fixedscore-ntcharoffset]){
// Calculate a random output base, ensuring it differs from the input base and set the error flag
int outbase=(int)floor(4.0*phred2Prob[fixedscore-ntcharoffset]*tmprand);
while (((outbase=((int)floor(4*mrand_pop(mr))))) == inbase);
bases[i] = intToRef[outbase];
seq_err = 1;
}
}
}
return seq_err;
}
//void add_indel_amplicon_fa(mrand_t *mr,kstring_t* seq,double *pars,int* ops){
//add_indel_amplicon_fqbam(mr,&FQseq->seq,&FQseq->qual,pars,ops,ErrProbTypeOffset);
int sample_qscores_fix_amplicon(mrand_t *mr,kstring_t* seq,int qscoresval,int ntcharoffset){
/*
sample_qscores_fix_amplicon - Simulates sequencing errors for a given sequence based on a fixed quality score for empirical sequencing data (so with fasta input and fastq output).
@param mr Pointer to the random number generator state (type `mrand_t`).
@param seq Pointer to the sequence object (`kstring_t`) to be updated with simulated errors.
@param qscoresval The fixed integer quality score value used to determine error probabilities.
@param ntcharoffset Offset used to adjust quality scores to the Phred scale.
@return Returns 1 if an error was simulated in the sequence, otherwise 0.
*/
int seq_err = 0;
// temporary sequences
kstring_t seq_intermediate;
seq_intermediate.l = seq->l;
seq_intermediate.m = seq->l;
seq_intermediate.s = (char *)malloc((seq->l + 1) * sizeof(char)); // Allocate memory
strcpy(seq_intermediate.s, seq->s); // Copy seq->s to seq_intermediate.s
// Precompute Phred scale probabilities for all possible quality scores
for(int qscore =0 ;qscore<256;qscore++){
double d = qscore;
phred2Prob[qscore] = pow(10,((-d)/10));
}
for(int i = 0;i<seq->l;i++){
char inbase = refToInt[seq->s[i]];
double tmprand = mrand_pop(mr);
// Check if the generated random number is less than the probability of error
if ( tmprand < phred2Prob[qscoresval-ntcharoffset]){
// Calculate a random output base, ensuring it differs from the input base and set the error flag
int outbase=(int)floor(4.0*phred2Prob[qscoresval]*tmprand);
while (((outbase=((int)floor(4*mrand_pop(mr))))) == inbase);
seq_intermediate.s[i] = intToRef[outbase];
seq->s[i] = seq_intermediate.s[i];
seq_err = 1;
}
}
seq->s[seq->l] = '\0';
free(seq_intermediate.s);
seq_intermediate.s = NULL;
seq_intermediate.l = seq_intermediate.m = 0;
return seq_err;
}
#ifdef __WITH_MAIN__
int main(int argc, char **argv){
const char *profile_fname = "Test_Examples/AccFreqL150R1.txt";
char ntquals[1024];
double errorArray[1024];
int maxreadcycles = 150;
//ransampl_ws ***ws = ReadQuality(ntquals,errorArray,33,profile_fname);
mrand_t *mr = mrand_alloc(3,88);
char bases[30];
char qscores[30];
memset(bases,'\0',30);
memset(qscores,'\0',30);
char specific_char = 'I';
char* char_array = (char*)malloc((30 + 1) * sizeof(char));
for(int i=0;i<30;i++){
bases[i] = intToRef[(int)floor(drand48()*4)];
char_array[i] = specific_char;
std::cout << " i " << i << " bases " << bases[i] << std::endl;
}
sample_qscores_fix(bases,qscores,10,30,mr,1,33);
//sample_qscores(bases,qscores,30,ws,ntquals,mr,0,0);
fprintf(stderr,"@readname\n%s\n+\n%s\n",bases,qscores);
//gtggTAGAGATAAAGCACATTCTTTAGGAGTGAATATGGNNTNNCTGCNCGCANANTGNNATTGNNTTGCNNNTNNANCGNNNCNNTNNNGNTTNGCNACAGCNANGNNA
return 0;
}
#endif
//g++ sample_qscores.cpp RandSampling.o mrand.o NGSNGS_misc.o -std=c++11 -lm -lz -D__WITH_MAIN__ -o Scores