Skip to content

Commit bd183f5

Browse files
authored
Merge pull request #394 from psj1997/main
strobealign-aemb module for metagenomic binning
2 parents 79cdd96 + 5a1ab9e commit bd183f5

11 files changed

+192
-47
lines changed

README.md

+6
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,11 @@ strobealign ref.fa reads.1.fastq.gz reads.2.fastq.gz | samtools sort -o sorted.b
113113
This is usually faster than doing the two steps separately because fewer
114114
intermediate files are created.
115115

116+
To output the estimated abundance of every contig, the format of output file is: contig_id \t abundance_value:
117+
```
118+
strobealign ref.fa reads.fq --aemb > abundance.txt # Single-end reads
119+
strobealign ref.fa reads1.fq reads2.fq --aemb > abundance.txt # Paired-end reads
120+
```
116121

117122
## Command-line options
118123

@@ -127,6 +132,7 @@ options. Some important ones are:
127132
* `--eqx`: Emit `=` and `X` CIGAR operations instead of `M`.
128133
* `-x`: Only map reads, do not do no base-level alignment. This switches the
129134
output format from SAM to [PAF](https://github.com/lh3/miniasm/blob/master/PAF.md).
135+
* `--aemb`: Output the estimated abundance value of every contig, the format of output file is: contig_id \t abundance_value.
130136
* `--rg-id=ID`: Add RG tag to each SAM record.
131137
* `--rg=TAG:VALUE`: Add read group metadata to the SAM header. This can be
132138
specified multiple times. Example: `--rg-id=1 --rg=SM:mysamle --rg=LB:mylibrary`.

src/aln.cpp

+107-23
Original file line numberDiff line numberDiff line change
@@ -862,13 +862,17 @@ std::vector<ScoredAlignmentPair> align_paired(
862862
return high_scores;
863863
}
864864

865-
// Only used for PAF output
865+
// Used for PAF and abundances output
866866
inline void get_best_map_location(
867867
std::vector<Nam> &nams1,
868868
std::vector<Nam> &nams2,
869869
InsertSizeDistribution &isize_est,
870870
Nam &best_nam1,
871-
Nam &best_nam2
871+
Nam &best_nam2,
872+
int read1_len,
873+
int read2_len,
874+
std::vector<double> &abundances,
875+
bool output_abundance
872876
) {
873877
std::vector<NamPair> nam_pairs = get_best_scoring_nam_pairs(nams1, nams2, isize_est.mu, isize_est.sigma);
874878
best_nam1.ref_start = -1; //Unmapped until proven mapped
@@ -903,6 +907,52 @@ inline void get_best_map_location(
903907
if (score_joint > score_indiv) { // joint score is better than individual
904908
best_nam1 = n1_joint_max;
905909
best_nam2 = n2_joint_max;
910+
911+
if (output_abundance){
912+
// we loop twice because we need to count the number of best pairs
913+
size_t n_best = 0;
914+
for (auto &[score, n1, n2] : nam_pairs){
915+
if ((n1.score + n2.score) == score_joint){
916+
++n_best;
917+
} else {
918+
break;
919+
}
920+
}
921+
for (auto &[score, n1, n2] : nam_pairs){
922+
if ((n1.score + n2.score) == score_joint){
923+
if (n1.ref_start >= 0) {
924+
abundances[n1.ref_id] += float(read1_len) / float(n_best);
925+
}
926+
if (n2.ref_start >= 0) {
927+
abundances[n2.ref_id] += float(read2_len) / float(n_best);
928+
}
929+
} else {
930+
break;
931+
}
932+
}
933+
}
934+
} else if (output_abundance) {
935+
for (auto &[nams, read_len]: { std::make_pair(std::cref(nams1), read1_len),
936+
std::make_pair(std::cref(nams2), read2_len) }) {
937+
size_t best_score = 0;
938+
// We loop twice because we need to count the number of NAMs with best score
939+
for (auto &nam : nams) {
940+
if (nam.score == nams[0].score){
941+
++best_score;
942+
} else {
943+
break;
944+
}
945+
}
946+
for (auto &nam: nams) {
947+
if (nam.ref_start < 0) {
948+
continue;
949+
}
950+
if (nam.score != nams[0].score){
951+
break;
952+
}
953+
abundances[nam.ref_id] += float(read_len) / float(best_score);
954+
}
955+
}
906956
}
907957

908958
if (isize_est.sample_size < 400 && score_joint > score_indiv) {
@@ -957,7 +1007,8 @@ void align_or_map_paired(
9571007
const IndexParameters& index_parameters,
9581008
const References& references,
9591009
const StrobemerIndex& index,
960-
std::minstd_rand& random_engine
1010+
std::minstd_rand& random_engine,
1011+
std::vector<double> &abundances
9611012
) {
9621013
std::array<Details, 2> details;
9631014
std::array<std::vector<Nam>, 2> nams_pair;
@@ -991,18 +1042,24 @@ void align_or_map_paired(
9911042
}
9921043

9931044
Timer extend_timer;
994-
if (!map_param.is_sam_out) {
1045+
if (map_param.output_format != OutputFormat::SAM) { // PAF or abundance
9951046
Nam nam_read1;
9961047
Nam nam_read2;
997-
get_best_map_location(nams_pair[0], nams_pair[1], isize_est,
998-
nam_read1,
999-
nam_read2);
1000-
output_hits_paf_PE(outstring, nam_read1, record1.name,
1001-
references,
1002-
record1.seq.length());
1003-
output_hits_paf_PE(outstring, nam_read2, record2.name,
1004-
references,
1005-
record2.seq.length());
1048+
get_best_map_location(
1049+
nams_pair[0], nams_pair[1],
1050+
isize_est,
1051+
nam_read1, nam_read2,
1052+
record1.seq.length(), record2.seq.length(),
1053+
abundances,
1054+
map_param.output_format == OutputFormat::Abundance);
1055+
if (map_param.output_format == OutputFormat::PAF) {
1056+
output_hits_paf_PE(outstring, nam_read1, record1.name,
1057+
references,
1058+
record1.seq.length());
1059+
output_hits_paf_PE(outstring, nam_read2, record2.name,
1060+
references,
1061+
record2.seq.length());
1062+
}
10061063
} else {
10071064
Read read1(record1.seq);
10081065
Read read2(record2.seq);
@@ -1082,7 +1139,8 @@ void align_or_map_single(
10821139
const IndexParameters& index_parameters,
10831140
const References& references,
10841141
const StrobemerIndex& index,
1085-
std::minstd_rand& random_engine
1142+
std::minstd_rand& random_engine,
1143+
std::vector<double> &abundances
10861144
) {
10871145
Details details;
10881146
Timer strobe_timer;
@@ -1111,15 +1169,41 @@ void align_or_map_single(
11111169

11121170

11131171
Timer extend_timer;
1114-
if (!map_param.is_sam_out) {
1115-
output_hits_paf(outstring, nams, record.name, references,
1116-
record.seq.length());
1117-
} else {
1118-
align_single(
1119-
aligner, sam, nams, record, index_parameters.syncmer.k,
1120-
references, details, map_param.dropoff_threshold, map_param.max_tries,
1121-
map_param.max_secondary, random_engine
1122-
);
1172+
size_t n_best = 0;
1173+
switch (map_param.output_format) {
1174+
case OutputFormat::Abundance: {
1175+
if (!nams.empty()){
1176+
for (auto &t : nams){
1177+
if (t.score == nams[0].score){
1178+
++n_best;
1179+
}else{
1180+
break;
1181+
}
1182+
}
1183+
1184+
for (auto &nam: nams) {
1185+
if (nam.ref_start < 0) {
1186+
continue;
1187+
}
1188+
if (nam.score != nams[0].score){
1189+
break;
1190+
}
1191+
abundances[nam.ref_id] += float(record.seq.length()) / float(n_best);
1192+
}
1193+
}
1194+
}
1195+
break;
1196+
case OutputFormat::PAF:
1197+
output_hits_paf(outstring, nams, record.name, references,
1198+
record.seq.length());
1199+
break;
1200+
case OutputFormat::SAM:
1201+
align_single(
1202+
aligner, sam, nams, record, index_parameters.syncmer.k,
1203+
references, details, map_param.dropoff_threshold, map_param.max_tries,
1204+
map_param.max_secondary, random_engine
1205+
);
1206+
break;
11231207
}
11241208
statistics.tot_extend += extend_timer.duration();
11251209
statistics += details;

src/aln.hpp

+11-3
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,20 @@ struct AlignmentStatistics {
5656
}
5757
};
5858

59+
enum class OutputFormat {
60+
SAM,
61+
PAF,
62+
Abundance
63+
};
64+
5965
struct MappingParameters {
6066
int r { 150 };
6167
int max_secondary { 0 };
6268
float dropoff_threshold { 0.5 };
6369
int rescue_level { 2 };
6470
int max_tries { 20 };
6571
int rescue_cutoff;
66-
bool is_sam_out { true };
72+
OutputFormat output_format {OutputFormat::SAM};
6773
CigarOps cigar_ops{CigarOps::M};
6874
bool output_unmapped { true };
6975
bool details{false};
@@ -88,7 +94,8 @@ void align_or_map_paired(
8894
const IndexParameters& index_parameters,
8995
const References& references,
9096
const StrobemerIndex& index,
91-
std::minstd_rand& random_engine
97+
std::minstd_rand& random_engine,
98+
std::vector<double> &abundances
9299
);
93100

94101
void align_or_map_single(
@@ -101,7 +108,8 @@ void align_or_map_single(
101108
const IndexParameters& index_parameters,
102109
const References& references,
103110
const StrobemerIndex& index,
104-
std::minstd_rand& random_engine
111+
std::minstd_rand& random_engine,
112+
std::vector<double> &abundances
105113
);
106114

107115
// Private declarations, only needed for tests

src/cmdline.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
2727
args::Flag v(parser, "v", "Verbose output", {'v'});
2828
args::Flag no_progress(parser, "no-progress", "Disable progress report (enabled by default if output is a terminal)", {"no-progress"});
2929
args::Flag x(parser, "x", "Only map reads, no base level alignment (produces PAF file)", {'x'});
30+
args::Flag aemb(parser, "aemb", "Output the estimated abundance value of contigs, the format of output file is: contig_id \t abundance_value", {"aemb"});
3031
args::Flag interleaved(parser, "interleaved", "Interleaved input", {"interleaved"});
3132
args::ValueFlag<std::string> index_statistics(parser, "PATH", "Print statistics of indexing to PATH", {"index-statistics"});
3233
args::Flag i(parser, "index", "Do not map reads; only generate the strobemer index and write it to disk. If read files are provided, they are used to estimate read length", {"create-index", 'i'});
@@ -97,6 +98,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
9798
if (index_statistics) { opt.logfile_name = args::get(index_statistics); }
9899
if (i) { opt.only_gen_index = true; }
99100
if (use_index) { opt.use_index = true; }
101+
if (aemb) {opt.is_abundance_out = true; }
100102

101103
// SAM output
102104
if (eqx) { opt.cigar_eqx = true; }

src/cmdline.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ struct CommandLineOptions {
1818
bool only_gen_index { false };
1919
bool use_index { false };
2020
bool is_sam_out { true };
21+
bool is_abundance_out {false};
2122

2223
// SAM output
2324
bool cigar_eqx { false };

src/main.cpp

+41-15
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,12 @@ InputBuffer get_input_buffer(const CommandLineOptions& opt) {
105105
}
106106
}
107107

108+
void output_abundance(const std::vector<double>& abundances, const References& references){
109+
for (size_t i = 0; i < references.size(); ++i) {
110+
std::cout << references.names[i] << '\t' << std::fixed << std::setprecision(6) << abundances[i] / double(references.sequences[i].size()) << std::endl;
111+
}
112+
}
113+
108114
void show_progress_until_done(std::vector<int>& worker_done, std::vector<AlignmentStatistics>& stats) {
109115
Timer timer;
110116
bool reported = false;
@@ -155,6 +161,11 @@ int run_strobealign(int argc, char **argv) {
155161
if (opt.c >= 64 || opt.c <= 0) {
156162
throw BadParameter("c must be greater than 0 and less than 64");
157163
}
164+
165+
if (!opt.is_sam_out && opt.is_abundance_out){
166+
throw BadParameter("Can not use -x and --aemb at the same time");
167+
}
168+
158169
InputBuffer input_buffer = get_input_buffer(opt);
159170
if (!opt.r_set && !opt.reads_filename1.empty()) {
160171
opt.r = estimate_read_length(input_buffer);
@@ -184,7 +195,10 @@ int run_strobealign(int argc, char **argv) {
184195
map_param.dropoff_threshold = opt.dropoff_threshold;
185196
map_param.rescue_level = opt.rescue_level;
186197
map_param.max_tries = opt.max_tries;
187-
map_param.is_sam_out = opt.is_sam_out;
198+
map_param.output_format = (
199+
opt.is_abundance_out ? OutputFormat::Abundance :
200+
opt.is_sam_out ? OutputFormat::SAM :
201+
OutputFormat::PAF);
188202
map_param.cigar_ops = opt.cigar_eqx ? CigarOps::EQX : CigarOps::M;
189203
map_param.output_unmapped = opt.output_unmapped;
190204
map_param.details = opt.details;
@@ -288,32 +302,31 @@ int run_strobealign(int argc, char **argv) {
288302
}
289303

290304
std::ostream out(buf);
291-
292-
if (map_param.is_sam_out) {
293-
std::stringstream cmd_line;
294-
for(int i = 0; i < argc; ++i) {
295-
cmd_line << argv[i] << " ";
296-
}
297-
298-
out << sam_header(references, opt.read_group_id, opt.read_group_fields);
299-
if (opt.pg_header) {
300-
out << pg_header(cmd_line.str());
301-
}
305+
306+
if (map_param.output_format == OutputFormat::SAM) {
307+
std::stringstream cmd_line;
308+
for(int i = 0; i < argc; ++i) {
309+
cmd_line << argv[i] << " ";
310+
}
311+
out << sam_header(references, opt.read_group_id, opt.read_group_fields);
312+
if (opt.pg_header) {
313+
out << pg_header(cmd_line.str());
314+
}
302315
}
303316

304317
std::vector<AlignmentStatistics> log_stats_vec(opt.n_threads);
305-
318+
306319
logger.info() << "Running in " << (opt.is_SE ? "single-end" : "paired-end") << " mode" << std::endl;
307320

308321
OutputBuffer output_buffer(out);
309-
310322
std::vector<std::thread> workers;
311323
std::vector<int> worker_done(opt.n_threads); // each thread sets its entry to 1 when it’s done
324+
std::vector<std::vector<double>> worker_abundances(opt.n_threads, std::vector<double>(references.size(), 0));
312325
for (int i = 0; i < opt.n_threads; ++i) {
313326
std::thread consumer(perform_task, std::ref(input_buffer), std::ref(output_buffer),
314327
std::ref(log_stats_vec[i]), std::ref(worker_done[i]), std::ref(aln_params),
315328
std::ref(map_param), std::ref(index_parameters), std::ref(references),
316-
std::ref(index), std::ref(opt.read_group_id));
329+
std::ref(index), std::ref(opt.read_group_id), std::ref(worker_abundances[i]));
317330
workers.push_back(std::move(consumer));
318331
}
319332
if (opt.show_progress && isatty(2)) {
@@ -329,6 +342,19 @@ int run_strobealign(int argc, char **argv) {
329342
tot_statistics += it;
330343
}
331344

345+
if (map_param.output_format == OutputFormat::Abundance) {
346+
std::vector<double> abundances(references.size(), 0);
347+
for (size_t i = 0; i < worker_abundances.size(); ++i) {
348+
for (size_t j = 0; j < worker_abundances[i].size(); ++j) {
349+
abundances[j] += worker_abundances[i][j];
350+
}
351+
}
352+
353+
// output the abundance file
354+
output_abundance(abundances, references);
355+
}
356+
357+
332358
logger.info() << "Total mapping sites tried: " << tot_statistics.tot_all_tried << std::endl
333359
<< "Total calls to ssw: " << tot_statistics.tot_aligner_calls << std::endl
334360
<< "Inconsistent NAM ends: " << tot_statistics.inconsistent_nams << std::endl

0 commit comments

Comments
 (0)