Skip to content

Commit bfce78a

Browse files
authored
Merge pull request #476 from ksahlin/mcs-rescue
Rescue using partial hits, even in non-MCS mode
2 parents b2105c6 + affcebf commit bfce78a

File tree

5 files changed

+56
-21
lines changed

5 files changed

+56
-21
lines changed

CHANGES.md

+8-2
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,20 @@
22

33
## development version
44

5+
* #476: Significantly improve speed and accuracy by enabling by default a new
6+
variant of multi-context seeds: When no regular seeds - which consist
7+
of two strobes - can be found for the entire query, strobealign attempts to find
8+
single-strobe ("partial") seeds.
9+
The `--mcs` option is still available for now. It is a bit slower, but
10+
slightly more accurate.
511
* #468: Be less strict when checking reference sequence names.
612

713
## v0.15.0 (2024-12-13)
814

915
* #388 and #426: Increase accuracy and mapping rate for reads shorter than
1016
about 200 bp by introducing multi-context seeds.
11-
Previously, seeds always consisted of two k-mers and would only be found if
12-
both occur in query and reference.
17+
Previously, seeds always consisted of two k-mers ("strobes") and would only
18+
be found if both occur in query and reference.
1319
With this change, strobealign falls back to looking up just one of the k-mers
1420
when appropriate.
1521
This feature is currently *experimental* and only enabled when using the

README.md

+23
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,29 @@ actual mapping:
223223
- Index files are about four times as large as the reference.
224224

225225

226+
## Explanation
227+
228+
### Multi-context seeds
229+
230+
Strobealign uses randstrobes as seeds, which in our case consist of two k-mers
231+
("strobes") that are somewhat close to each other. When a seed is looked up
232+
in the index, it is only found if both strobes match. By changing the way in
233+
which the index is stored in v0.15.0, it became possible to support
234+
*multi-context seeds*. With those changes, strobealign falls back to looking
235+
up only one of the strobes (a "partial seed") if the full seed cannot be found.
236+
This results in better mapping rate and accuracy for read lengths of up to
237+
about 200 nt.
238+
239+
Usage of multi-context seeds is enabled by default in strobealign since v0.16.0.
240+
The strategy is to first search for all full seeds of the query and fall back to
241+
partial seeds if *no* seeds could be found.
242+
243+
A slightly more accurate, but slower mode of using multi-context seeds is
244+
available by using option `--mcs`: With it, the strategy is changed to a
245+
fallback *per seed*: If an individual full seed cannot be found, its partial
246+
version is looked up in the index.
247+
248+
226249
## Changelog
227250

228251
See [Changelog](CHANGES.md).

src/cmdline.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
5454
args::ValueFlag<int> end_bonus(parser, "INT", "Soft clipping penalty [10]", {'L'});
5555

5656
args::Group search(parser, "Search parameters:");
57-
args::Flag mcs(parser, "mcs", "Use multi-context seeds for finding hits", {"mcs"});
57+
args::Flag mcs(parser, "mcs", "Use extended multi-context seed mode for finding hits. Slightly more accurate, but slower", {"mcs"});
5858
args::ValueFlag<float> f(parser, "FLOAT", "Top fraction of repetitive strobemers to filter out from sampling [0.0002]", {'f'});
5959
args::ValueFlag<float> S(parser, "FLOAT", "Try candidate sites with mapping score at least S of maximum mapping score [0.5]", {'S'});
6060
args::ValueFlag<int> M(parser, "INT", "Maximum number of mapping sites to try [20]", {'M'});

src/nam.cpp

+23-17
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,21 @@ std::tuple<float, int, std::vector<Nam>> find_nams(
219219
}
220220
}
221221

222+
// Rescue using partial hits, even in non-MCS mode
223+
if (total_hits == 0 && !use_mcs) {
224+
for (const auto &q : query_randstrobes) {
225+
size_t partial_pos = index.find_partial(q.hash);
226+
if (partial_pos != index.end()) {
227+
total_hits++;
228+
if (index.is_partial_filtered(partial_pos)) {
229+
continue;
230+
}
231+
nr_good_hits++;
232+
add_to_matches_map_partial(matches_map[q.is_reverse], q.start, q.start + index.k(), index, partial_pos);
233+
}
234+
}
235+
}
236+
222237
float nonrepetitive_fraction = total_hits > 0 ? ((float) nr_good_hits) / ((float) total_hits) : 1.0;
223238
auto nams = merge_matches_into_nams_forward_and_reverse(matches_map, index.k(), use_mcs);
224239
return {nonrepetitive_fraction, nr_good_hits, nams};
@@ -249,43 +264,34 @@ std::pair<int, std::vector<Nam>> find_nams_rescue(
249264
}
250265
};
251266
std::array<robin_hood::unordered_map<unsigned int, std::vector<Match>>, 2> matches_map;
252-
std::vector<RescueHit> hits_fw;
253-
std::vector<RescueHit> hits_rc;
267+
std::array<std::vector<RescueHit>, 2> hits;
254268
matches_map[0].reserve(100);
255269
matches_map[1].reserve(100);
256-
hits_fw.reserve(5000);
257-
hits_rc.reserve(5000);
270+
hits[0].reserve(5000);
271+
hits[1].reserve(5000);
258272

259273
for (auto &qr : query_randstrobes) {
260274
size_t position = index.find_full(qr.hash);
261275
if (position != index.end()) {
262276
unsigned int count = index.get_count_full(position);
263277
RescueHit rh{position, count, qr.start, qr.end, false};
264-
if (qr.is_reverse){
265-
hits_rc.push_back(rh);
266-
} else {
267-
hits_fw.push_back(rh);
268-
}
278+
hits[qr.is_reverse].push_back(rh);
269279
}
270280
else if (use_mcs) {
271281
size_t partial_pos = index.find_partial(qr.hash);
272282
if (partial_pos != index.end()) {
273283
unsigned int partial_count = index.get_count_partial(partial_pos);
274284
RescueHit rh{partial_pos, partial_count, qr.start, qr.start + index.k(), true};
275-
if (qr.is_reverse){
276-
hits_rc.push_back(rh);
277-
} else {
278-
hits_fw.push_back(rh);
279-
}
285+
hits[qr.is_reverse].push_back(rh);
280286
}
281287
}
282288
}
283289

284-
std::sort(hits_fw.begin(), hits_fw.end());
285-
std::sort(hits_rc.begin(), hits_rc.end());
290+
std::sort(hits[0].begin(), hits[0].end());
291+
std::sort(hits[1].begin(), hits[1].end());
286292
int n_hits = 0;
287293
size_t is_revcomp = 0;
288-
for (auto& rescue_hits : {hits_fw, hits_rc}) {
294+
for (auto& rescue_hits : hits) {
289295
int cnt = 0;
290296
for (auto &rh : rescue_hits) {
291297
if ((rh.count > rescue_cutoff && cnt >= 5) || rh.count > 1000) {

tests/baseline-commit.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
7f5ac330f68afbe32e0cde4a32bbb786cdd4d01e
1+
acc4cffe5ac2c4db266c58d00b7b6462c6b4189c

0 commit comments

Comments
 (0)