Skip to content

Commit c0695b0

Browse files
authored
Merge pull request #479 from ksahlin/revert-basehash
Revert PR #472 "Always pick first syncmer as base"
2 parents 82af37e + 6b0e8c8 commit c0695b0

9 files changed

+121
-52
lines changed

CHANGES.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22

33
## development version
44

5-
* #476: Significantly improve speed and accuracy by enabling by default a new
6-
variant of multi-context seeds: When no regular seeds - which consist
7-
of two strobes - can be found for the entire query, strobealign attempts to find
5+
* #476: Improve accuracy by enabling (by default) a variant of multi-context
6+
seeds: When no regular seeds - which consist of two strobes - can be found
7+
for the *entire* query, strobealign now always attempts to find
88
single-strobe ("partial") seeds.
99
The `--mcs` option is still available for now. It is a bit slower, but
1010
slightly more accurate.

src/index.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,8 @@ void StrobemerIndex::assign_randstrobes(size_t ref_index, size_t offset) {
309309
randstrobe.hash,
310310
randstrobe.strobe1_pos,
311311
static_cast<uint32_t>(ref_index),
312-
static_cast<uint8_t>(randstrobe.strobe2_pos - randstrobe.strobe1_pos)
312+
static_cast<uint8_t>(randstrobe.strobe2_pos - randstrobe.strobe1_pos),
313+
randstrobe.first_strobe_is_main
313314
};
314315
}
315316
chunk.clear();

src/index.hpp

+9-2
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ struct StrobemerIndex {
100100

101101
auto pos = std::lower_bound(randstrobes.begin() + position_start,
102102
randstrobes.begin() + position_end,
103-
RefRandstrobe{key, 0, 0, 0},
103+
RefRandstrobe{key, 0, 0, 0, 0},
104104
cmp);
105105
if ((pos->hash() & hash_mask) == masked_key) return pos - randstrobes.begin();
106106
return end();
@@ -122,6 +122,10 @@ struct StrobemerIndex {
122122
}
123123
}
124124

125+
bool first_strobe_is_main(bucket_index_t position) const {
126+
return randstrobes[position].first_strobe_is_main();
127+
}
128+
125129
bool is_filtered(bucket_index_t position) const {
126130
return get_hash(position) == get_hash(position + filter_cutoff);
127131
}
@@ -141,6 +145,9 @@ struct StrobemerIndex {
141145
std::pair<int, int> strobe_extent_partial(bucket_index_t position) const {
142146
// Construct the match from the strobe that was selected as the main part of the hash
143147
int ref_start = get_strobe1_position(position);
148+
if (!first_strobe_is_main(position)) {
149+
ref_start += strobe2_offset(position);
150+
}
144151
return {ref_start, ref_start + k()};
145152
}
146153

@@ -198,7 +205,7 @@ struct StrobemerIndex {
198205

199206
auto pos = std::upper_bound(randstrobes.begin() + position,
200207
randstrobes.begin() + position_end,
201-
RefRandstrobe{key, 0, 0, 0},
208+
RefRandstrobe{key, 0, 0, 0, 0},
202209
cmp);
203210
return (pos - randstrobes.begin() - 1) - position + 1;
204211
}

src/nam.cpp

+53-2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,19 @@ bool operator==(const Match& lhs, const Match& rhs) {
1313
return (lhs.query_start == rhs.query_start) && (lhs.query_end == rhs.query_end) && (lhs.ref_start == rhs.ref_start) && (lhs.ref_end == rhs.ref_end);
1414
}
1515

16+
/*
17+
* A partial hit is a hit where not the full randstrobe hash could be found in
18+
* the index but only the "main" hash (only the first aux_len bits).
19+
*/
20+
struct PartialHit {
21+
randstrobe_hash_t hash;
22+
unsigned int start; // position in strobemer index
23+
bool is_reverse;
24+
bool operator==(const PartialHit& rhs) const {
25+
return (hash == rhs.hash) && (start == rhs.start) && (is_reverse == rhs.is_reverse);
26+
}
27+
};
28+
1629
inline void add_to_matches_map_full(
1730
robin_hood::unordered_map<unsigned int, std::vector<Match>>& matches_map,
1831
int query_start,
@@ -34,6 +47,25 @@ inline void add_to_matches_map_full(
3447
}
3548
}
3649

50+
/*
51+
This function often produces the same Match multiple times.
52+
This happens when multiple randstrobes involve a syncmer that has a low hash
53+
value (and which therefore gets chosen as main hash multiple times).
54+
55+
For example, assume we have syncmers starting at positions 14, 21, 39, 51
56+
and randstrobes like this:
57+
58+
14, 21, 39, 51
59+
14------39
60+
21--39
61+
39--51
62+
63+
If 39 now has the lowest hash value, it will be chosen as the main hash for all
64+
three randstrobes. Thus, when looking up that main hash, all three randstrobes
65+
are found. After the start coordinate of the randstrobe is converted to the
66+
start coordinate of the single syncmer (using strobe_extent_partial()), they all
67+
result in a hit that starts at 39 (and has length k).
68+
*/
3769
inline void add_to_matches_map_partial(
3870
robin_hood::unordered_map<unsigned int, std::vector<Match>>& matches_map,
3971
int query_start,
@@ -191,6 +223,10 @@ std::tuple<float, int, std::vector<Nam>> find_nams(
191223
const StrobemerIndex& index,
192224
bool use_mcs
193225
) {
226+
std::vector<PartialHit> partial_queried; // TODO: is a small set more efficient than linear search in a small vector?
227+
if (use_mcs) {
228+
partial_queried.reserve(10);
229+
}
194230
std::array<robin_hood::unordered_map<unsigned int, std::vector<Match>>, 2> matches_map;
195231
matches_map[0].reserve(100);
196232
matches_map[1].reserve(100);
@@ -207,15 +243,22 @@ std::tuple<float, int, std::vector<Nam>> find_nams(
207243
add_to_matches_map_full(matches_map[q.is_revcomp], q.start, q.end, index, position);
208244
}
209245
else if (use_mcs) {
246+
PartialHit ph{q.hash & index.get_main_hash_mask(), q.partial_start, q.is_revcomp};
247+
if (std::find(partial_queried.begin(), partial_queried.end(), ph) != partial_queried.end()) {
248+
// already queried
249+
continue;
250+
}
210251
size_t partial_pos = index.find_partial(q.hash);
211252
if (partial_pos != index.end()) {
212253
total_hits++;
213254
if (index.is_partial_filtered(partial_pos)) {
255+
partial_queried.push_back(ph);
214256
continue;
215257
}
216258
nr_good_hits++;
217-
add_to_matches_map_partial(matches_map[q.is_revcomp], q.start, q.start + index.k(), index, partial_pos);
259+
add_to_matches_map_partial(matches_map[q.is_revcomp], q.partial_start, q.partial_end, index, partial_pos);
218260
}
261+
partial_queried.push_back(ph);
219262
}
220263
}
221264

@@ -263,6 +306,8 @@ std::pair<int, std::vector<Nam>> find_nams_rescue(
263306
< std::tie(rhs.count, rhs.query_start, rhs.query_end);
264307
}
265308
};
309+
std::vector<PartialHit> partial_queried; // TODO: is a small set more efficient than linear search in a small vector?
310+
partial_queried.reserve(10);
266311
std::array<robin_hood::unordered_map<unsigned int, std::vector<Match>>, 2> matches_map;
267312
std::array<std::vector<RescueHit>, 2> hits;
268313
matches_map[0].reserve(100);
@@ -278,12 +323,18 @@ std::pair<int, std::vector<Nam>> find_nams_rescue(
278323
hits[qr.is_revcomp].push_back(rh);
279324
}
280325
else if (use_mcs) {
326+
PartialHit ph = {qr.hash & index.get_main_hash_mask(), qr.partial_start, qr.is_revcomp};
327+
if (std::find(partial_queried.begin(), partial_queried.end(), ph) != partial_queried.end()) {
328+
// already queried
329+
continue;
330+
}
281331
size_t partial_pos = index.find_partial(qr.hash);
282332
if (partial_pos != index.end()) {
283333
unsigned int partial_count = index.get_count_partial(partial_pos);
284-
RescueHit rh{partial_pos, partial_count, qr.start, qr.start + index.k(), true};
334+
RescueHit rh{partial_pos, partial_count, qr.partial_start, qr.partial_end, true};
285335
hits[qr.is_revcomp].push_back(rh);
286336
}
337+
partial_queried.push_back(ph);
287338
}
288339
}
289340

src/randstrobes.cpp

+17-5
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ static inline syncmer_hash_t syncmer_smer_hash(uint64_t packed) {
4949
* This function combines two individual syncmer hashes into a single hash
5050
* for the randstrobe.
5151
*
52-
* One syncmer is designated as the "main", the other is the "auxiliary".
52+
* The syncmer with the smaller hash is designated as the "main", the other is
53+
* the "auxiliary".
5354
* The combined hash is obtained by setting the top bits to the bits of
5455
* the main hash and the bottom bits to the bits of the auxiliary
5556
* hash. Since entries in the index are sorted by randstrobe hash, this allows
@@ -58,6 +59,10 @@ static inline syncmer_hash_t syncmer_smer_hash(uint64_t packed) {
5859
static inline randstrobe_hash_t randstrobe_hash(
5960
syncmer_hash_t hash1, syncmer_hash_t hash2, randstrobe_hash_t main_hash_mask
6061
) {
62+
// Make the function symmetric
63+
if (hash1 > hash2) {
64+
std::swap(hash1, hash2);
65+
}
6166
return ((hash1 & main_hash_mask) | (hash2 & ~main_hash_mask)) & RANDSTROBE_HASH_MASK;
6267
}
6368

@@ -141,7 +146,7 @@ std::vector<Syncmer> canonical_syncmers(
141146

142147
std::ostream& operator<<(std::ostream& os, const Randstrobe& randstrobe) {
143148
os << "Randstrobe(hash=" << randstrobe.hash << ", strobe1_pos=" << randstrobe.strobe1_pos << ", strobe2_pos="
144-
<< randstrobe.strobe2_pos << ")";
149+
<< randstrobe.strobe2_pos << ", first_strobe_is_main=" << randstrobe.first_strobe_is_main << ")";
145150
return os;
146151
}
147152

@@ -156,10 +161,12 @@ std::ostream& operator<<(std::ostream& os, const QueryRandstrobe& randstrobe) {
156161
}
157162

158163
Randstrobe make_randstrobe(Syncmer strobe1, Syncmer strobe2, randstrobe_hash_t main_hash_mask) {
164+
bool first_strobe_is_main = strobe1.hash < strobe2.hash;
159165
return Randstrobe{
160166
randstrobe_hash(strobe1.hash, strobe2.hash, main_hash_mask),
161167
static_cast<uint32_t>(strobe1.position),
162-
static_cast<uint32_t>(strobe2.position)
168+
static_cast<uint32_t>(strobe2.position),
169+
first_strobe_is_main
163170
};
164171
}
165172

@@ -239,9 +246,11 @@ std::vector<QueryRandstrobe> randstrobes_query(const std::string_view seq, const
239246
RandstrobeIterator randstrobe_fwd_iter{syncmers, parameters.randstrobe};
240247
while (randstrobe_fwd_iter.has_next()) {
241248
auto randstrobe = randstrobe_fwd_iter.next();
249+
const unsigned int partial_start = randstrobe.first_strobe_is_main ? randstrobe.strobe1_pos : randstrobe.strobe2_pos;
242250
randstrobes.push_back(
243251
QueryRandstrobe {
244-
randstrobe.hash, randstrobe.strobe1_pos, randstrobe.strobe2_pos + parameters.syncmer.k, false
252+
randstrobe.hash, randstrobe.strobe1_pos, randstrobe.strobe2_pos + parameters.syncmer.k,
253+
partial_start, partial_start + parameters.syncmer.k, false
245254
}
246255
);
247256
}
@@ -262,9 +271,12 @@ std::vector<QueryRandstrobe> randstrobes_query(const std::string_view seq, const
262271
RandstrobeIterator randstrobe_rc_iter{syncmers, parameters.randstrobe};
263272
while (randstrobe_rc_iter.has_next()) {
264273
auto randstrobe = randstrobe_rc_iter.next();
274+
bool first_strobe_is_main = randstrobe.first_strobe_is_main;
275+
const unsigned int partial_start = first_strobe_is_main ? randstrobe.strobe1_pos : randstrobe.strobe2_pos;
265276
randstrobes.push_back(
266277
QueryRandstrobe {
267-
randstrobe.hash, randstrobe.strobe1_pos, randstrobe.strobe2_pos + parameters.syncmer.k, true
278+
randstrobe.hash, randstrobe.strobe1_pos, randstrobe.strobe2_pos + parameters.syncmer.k,
279+
partial_start, partial_start + parameters.syncmer.k, true
268280
}
269281
);
270282
}

src/randstrobes.hpp

+16-6
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,20 @@
1717
using syncmer_hash_t = uint64_t;
1818
using randstrobe_hash_t = uint64_t;
1919

20-
static constexpr uint64_t RANDSTROBE_HASH_MASK = 0xFFFFFFFFFFFFFF00;
20+
static constexpr uint64_t RANDSTROBE_HASH_MASK = 0xFFFFFFFFFFFFFE00;
2121

2222
struct RefRandstrobe {
2323
private:
24-
// packed representation of hash and offset
24+
// packed representation of hash, offset and first_strobe_is_main
2525
randstrobe_hash_t m_hash_offset_flag;
2626
uint32_t m_position;
2727
uint32_t m_ref_index;
2828

2929
public:
3030
RefRandstrobe() : m_hash_offset_flag(0), m_position(0), m_ref_index(0) { }
3131

32-
RefRandstrobe(randstrobe_hash_t hash, uint32_t position, uint32_t ref_index, uint8_t offset)
33-
: m_hash_offset_flag((hash & RANDSTROBE_HASH_MASK) | offset)
32+
RefRandstrobe(randstrobe_hash_t hash, uint32_t position, uint32_t ref_index, uint8_t offset, bool first_strobe_is_main)
33+
: m_hash_offset_flag((hash & RANDSTROBE_HASH_MASK) | (offset << 1) | first_strobe_is_main)
3434
, m_position(position)
3535
, m_ref_index(ref_index)
3636
{ }
@@ -45,12 +45,16 @@ struct RefRandstrobe {
4545
return lhs < rhs;
4646
}
4747

48+
bool first_strobe_is_main() const {
49+
return m_hash_offset_flag & 1;
50+
}
51+
4852
unsigned reference_index() const {
4953
return m_ref_index;
5054
}
5155

5256
unsigned strobe2_offset() const {
53-
return m_hash_offset_flag & 0xff;
57+
return (m_hash_offset_flag >> 1) & 0xff;
5458
}
5559

5660
randstrobe_hash_t hash() const {
@@ -68,6 +72,11 @@ struct QueryRandstrobe {
6872
randstrobe_hash_t hash;
6973
unsigned int start;
7074
unsigned int end;
75+
/* Start and end of the main syncmer (relevant if the randstrobe couldn’t
76+
* be found in the index and we fall back to a partial hit)
77+
*/
78+
unsigned int partial_start;
79+
unsigned int partial_end;
7180
bool is_revcomp;
7281
};
7382

@@ -79,6 +88,7 @@ struct Randstrobe {
7988
randstrobe_hash_t hash;
8089
unsigned int strobe1_pos;
8190
unsigned int strobe2_pos;
91+
bool first_strobe_is_main;
8292

8393
bool operator==(const Randstrobe& other) const {
8494
return hash == other.hash && strobe1_pos == other.strobe1_pos && strobe2_pos == other.strobe2_pos;
@@ -176,7 +186,7 @@ class RandstrobeGenerator {
176186
{ }
177187

178188
Randstrobe next();
179-
Randstrobe end() const { return Randstrobe{0, 0, 0}; }
189+
Randstrobe end() const { return Randstrobe{0, 0, 0, false}; }
180190

181191
private:
182192
SyncmerIterator syncmer_iterator;

tests/baseline-commit.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
acc4cffe5ac2c4db266c58d00b7b6462c6b4189c
1+
b63002e80213afd97517d47162ba5e51f2bed56e

tests/phix.mcs.se.paf

+11-11
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
SRR1377138.32 301 2 293 + NC_001422.1 5386 1434 1725 49 291 255
2-
SRR1377138.33 301 2 297 + NC_001422.1 5386 3818 4113 51 295 255
3-
SRR1377138.34 301 33 299 - NC_001422.1 5386 844 1110 43 266 255
4-
SRR1377138.35 301 5 298 - NC_001422.1 5386 4041 4334 49 293 255
5-
SRR1377138.36 301 3 301 + NC_001422.1 5386 4997 5295 53 298 255
6-
SRR1377138.37 301 2 299 - NC_001422.1 5386 794 1091 43 297 255
7-
SRR1377138.38 301 32 284 - NC_001422.1 5386 4971 5223 44 252 255
8-
SRR1377138.39/1 301 1 295 - NC_001422.1 5386 1791 2085 50 294 255
9-
SRR1377138.40 301 4 293 - NC_001422.1 5386 3020 3309 48 289 255
10-
rescuable.42 301 4 293 - NC_001422.1 5386 3020 3309 48 289 255
11-
not.rescuable 301 4 293 - NC_001422.1 5386 3020 3309 48 289 255
1+
SRR1377138.32 301 2 293 + NC_001422.1 5386 1434 1725 47 291 255
2+
SRR1377138.33 301 2 297 + NC_001422.1 5386 3818 4113 47 295 255
3+
SRR1377138.34 301 33 299 - NC_001422.1 5386 844 1110 40 266 255
4+
SRR1377138.35 301 5 298 - NC_001422.1 5386 4041 4334 47 293 255
5+
SRR1377138.36 301 3 301 + NC_001422.1 5386 4997 5295 51 298 255
6+
SRR1377138.37 301 2 299 - NC_001422.1 5386 794 1091 42 297 255
7+
SRR1377138.38 301 32 284 - NC_001422.1 5386 4971 5223 42 252 255
8+
SRR1377138.39/1 301 1 295 - NC_001422.1 5386 1791 2085 48 294 255
9+
SRR1377138.40 301 4 293 - NC_001422.1 5386 3020 3309 44 289 255
10+
rescuable.42 301 4 293 - NC_001422.1 5386 3020 3309 44 289 255
11+
not.rescuable 301 4 293 - NC_001422.1 5386 3020 3309 44 289 255

tests/test_randstrobes.cpp

+9-21
Original file line numberDiff line numberDiff line change
@@ -39,27 +39,15 @@ TEST_CASE("RefRandstrobe constructor") {
3939
randstrobe_hash_t hash = 0x1234567890ABCDEF & RANDSTROBE_HASH_MASK;
4040
uint32_t position = ~0u;
4141
uint32_t ref_index = RefRandstrobe::max_number_of_references - 1;
42-
SUBCASE("one") {
43-
uint8_t offset = 255;
44-
RefRandstrobe rr{hash, position, ref_index, offset};
45-
46-
CHECK(rr.hash() == hash);
47-
CHECK(rr.position() == position);
48-
CHECK(rr.reference_index() == ref_index);
49-
CHECK(rr.strobe2_offset() == offset);
50-
}
51-
52-
SUBCASE("two") {
53-
uint8_t offset = 0;
54-
RefRandstrobe rr{hash, position, ref_index, offset};
55-
56-
CHECK(rr.hash() == hash);
57-
CHECK(rr.position() == position);
58-
CHECK(rr.reference_index() == ref_index);
59-
CHECK(rr.strobe2_offset() == offset);
60-
}
61-
62-
42+
uint8_t offset = 255;
43+
bool first_strobe_is_main = true;
44+
RefRandstrobe rr{hash, position, ref_index, offset, first_strobe_is_main};
45+
46+
CHECK(rr.hash() == hash);
47+
CHECK(rr.position() == position);
48+
CHECK(rr.reference_index() == ref_index);
49+
CHECK(rr.strobe2_offset() == offset);
50+
CHECK(rr.first_strobe_is_main() == first_strobe_is_main);
6351
}
6452

6553
TEST_CASE("SyncmerIterator") {

0 commit comments

Comments
 (0)