Skip to content

Commit b2105c6

Browse files
authored
Merge pull request #472 from ksahlin/base-hash
Always pick first syncmer as main
2 parents bff84e9 + 9ea6fa5 commit b2105c6

10 files changed

+88
-132
lines changed

src/index.cpp

+2-3
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
#include <sstream>
2222

2323
static Logger& logger = Logger::get();
24-
static const uint32_t STI_FILE_FORMAT_VERSION = 4;
24+
static const uint32_t STI_FILE_FORMAT_VERSION = 5;
2525

2626

2727
namespace {
@@ -309,8 +309,7 @@ void StrobemerIndex::assign_randstrobes(size_t ref_index, size_t offset) {
309309
randstrobe.hash,
310310
randstrobe.strobe1_pos,
311311
static_cast<uint32_t>(ref_index),
312-
static_cast<uint8_t>(randstrobe.strobe2_pos - randstrobe.strobe1_pos),
313-
randstrobe.first_strobe_is_main
312+
static_cast<uint8_t>(randstrobe.strobe2_pos - randstrobe.strobe1_pos)
314313
};
315314
}
316315
chunk.clear();

src/index.hpp

+2-9
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ struct StrobemerIndex {
100100

101101
auto pos = std::lower_bound(randstrobes.begin() + position_start,
102102
randstrobes.begin() + position_end,
103-
RefRandstrobe{key, 0, 0, 0, 0},
103+
RefRandstrobe{key, 0, 0, 0},
104104
cmp);
105105
if ((pos->hash() & hash_mask) == masked_key) return pos - randstrobes.begin();
106106
return end();
@@ -122,10 +122,6 @@ struct StrobemerIndex {
122122
}
123123
}
124124

125-
bool first_strobe_is_main(bucket_index_t position) const {
126-
return randstrobes[position].first_strobe_is_main();
127-
}
128-
129125
bool is_filtered(bucket_index_t position) const {
130126
return get_hash(position) == get_hash(position + filter_cutoff);
131127
}
@@ -145,9 +141,6 @@ struct StrobemerIndex {
145141
std::pair<int, int> strobe_extent_partial(bucket_index_t position) const {
146142
// Construct the match from the strobe that was selected as the main part of the hash
147143
int ref_start = get_strobe1_position(position);
148-
if (!first_strobe_is_main(position)) {
149-
ref_start += strobe2_offset(position);
150-
}
151144
return {ref_start, ref_start + k()};
152145
}
153146

@@ -205,7 +198,7 @@ struct StrobemerIndex {
205198

206199
auto pos = std::upper_bound(randstrobes.begin() + position,
207200
randstrobes.begin() + position_end,
208-
RefRandstrobe{key, 0, 0, 0, 0},
201+
RefRandstrobe{key, 0, 0, 0},
209202
cmp);
210203
return (pos - randstrobes.begin() - 1) - position + 1;
211204
}

src/nam.cpp

+4-54
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,6 @@ bool operator==(const Match& lhs, const Match& rhs) {
1313
return (lhs.query_start == rhs.query_start) && (lhs.query_end == rhs.query_end) && (lhs.ref_start == rhs.ref_start) && (lhs.ref_end == rhs.ref_end);
1414
}
1515

16-
/*
17-
* A partial hit is a hit where not the full randstrobe hash could be found in
18-
* the index but only the "main" hash (only the first aux_len bits).
19-
*/
20-
struct PartialHit {
21-
randstrobe_hash_t hash;
22-
unsigned int start; // position in strobemer index
23-
bool is_reverse;
24-
bool operator==(const PartialHit& rhs) const {
25-
return (hash == rhs.hash) && (start == rhs.start) && (is_reverse == rhs.is_reverse);
26-
}
27-
};
28-
2916
inline void add_to_matches_map_full(
3017
robin_hood::unordered_map<unsigned int, std::vector<Match>>& matches_map,
3118
int query_start,
@@ -47,25 +34,6 @@ inline void add_to_matches_map_full(
4734
}
4835
}
4936

50-
/*
51-
This function often produces the same Match multiple times.
52-
This happens when multiple randstrobes involve a syncmer that has a low hash
53-
value (and which therefore gets chosen as main hash multiple times).
54-
55-
For example, assume we have syncmers starting at positions 14, 21, 39, 51
56-
and randstrobes like this:
57-
58-
14, 21, 39, 51
59-
14------39
60-
21--39
61-
39--51
62-
63-
If 39 now has the lowest hash value, it will be chosen as the main hash for all
64-
three randstrobes. Thus, when looking up that main hash, all three randstrobes
65-
are found. After the start coordinate of the randstrobe is converted to the
66-
start coordinate of the single syncmer (using strobe_extent_partial()), they all
67-
result in a hit that starts at 39 (and has length k).
68-
*/
6937
inline void add_to_matches_map_partial(
7038
robin_hood::unordered_map<unsigned int, std::vector<Match>>& matches_map,
7139
int query_start,
@@ -223,10 +191,6 @@ std::tuple<float, int, std::vector<Nam>> find_nams(
223191
const StrobemerIndex& index,
224192
bool use_mcs
225193
) {
226-
std::vector<PartialHit> partial_queried; // TODO: is a small set more efficient than linear search in a small vector?
227-
if (use_mcs) {
228-
partial_queried.reserve(10);
229-
}
230194
std::array<robin_hood::unordered_map<unsigned int, std::vector<Match>>, 2> matches_map;
231195
matches_map[0].reserve(100);
232196
matches_map[1].reserve(100);
@@ -243,24 +207,18 @@ std::tuple<float, int, std::vector<Nam>> find_nams(
243207
add_to_matches_map_full(matches_map[q.is_reverse], q.start, q.end, index, position);
244208
}
245209
else if (use_mcs) {
246-
PartialHit ph{q.hash & index.get_main_hash_mask(), q.partial_start, q.is_reverse};
247-
if (std::find(partial_queried.begin(), partial_queried.end(), ph) != partial_queried.end()) {
248-
// already queried
249-
continue;
250-
}
251210
size_t partial_pos = index.find_partial(q.hash);
252211
if (partial_pos != index.end()) {
253212
total_hits++;
254213
if (index.is_partial_filtered(partial_pos)) {
255-
partial_queried.push_back(ph);
256214
continue;
257215
}
258216
nr_good_hits++;
259-
add_to_matches_map_partial(matches_map[q.is_reverse], q.partial_start, q.partial_end, index, partial_pos);
217+
add_to_matches_map_partial(matches_map[q.is_reverse], q.start, q.start + index.k(), index, partial_pos);
260218
}
261-
partial_queried.push_back(ph);
262219
}
263220
}
221+
264222
float nonrepetitive_fraction = total_hits > 0 ? ((float) nr_good_hits) / ((float) total_hits) : 1.0;
265223
auto nams = merge_matches_into_nams_forward_and_reverse(matches_map, index.k(), use_mcs);
266224
return {nonrepetitive_fraction, nr_good_hits, nams};
@@ -290,8 +248,6 @@ std::pair<int, std::vector<Nam>> find_nams_rescue(
290248
< std::tie(rhs.count, rhs.query_start, rhs.query_end);
291249
}
292250
};
293-
std::vector<PartialHit> partial_queried; // TODO: is a small set more efficient than linear search in a small vector?
294-
partial_queried.reserve(10);
295251
std::array<robin_hood::unordered_map<unsigned int, std::vector<Match>>, 2> matches_map;
296252
std::vector<RescueHit> hits_fw;
297253
std::vector<RescueHit> hits_rc;
@@ -312,22 +268,16 @@ std::pair<int, std::vector<Nam>> find_nams_rescue(
312268
}
313269
}
314270
else if (use_mcs) {
315-
PartialHit ph = {qr.hash & index.get_main_hash_mask(), qr.partial_start, qr.is_reverse};
316-
if (std::find(partial_queried.begin(), partial_queried.end(), ph) != partial_queried.end()) {
317-
// already queried
318-
continue;
319-
}
320271
size_t partial_pos = index.find_partial(qr.hash);
321272
if (partial_pos != index.end()) {
322273
unsigned int partial_count = index.get_count_partial(partial_pos);
323-
RescueHit rh{partial_pos, partial_count, qr.partial_start, qr.partial_end, true};
274+
RescueHit rh{partial_pos, partial_count, qr.start, qr.start + index.k(), true};
324275
if (qr.is_reverse){
325276
hits_rc.push_back(rh);
326277
} else {
327278
hits_fw.push_back(rh);
328279
}
329280
}
330-
partial_queried.push_back(ph);
331281
}
332282
}
333283

@@ -356,6 +306,6 @@ std::pair<int, std::vector<Nam>> find_nams_rescue(
356306
}
357307

358308
std::ostream& operator<<(std::ostream& os, const Nam& n) {
359-
os << "Nam(ref_id=" << n.ref_id << ", query: " << n.query_start << ".." << n.query_end << ", ref: " << n.ref_start << ".." << n.ref_end << ", score=" << n.score << ")";
309+
os << "Nam(ref_id=" << n.ref_id << ", query: " << n.query_start << ".." << n.query_end << ", ref: " << n.ref_start << ".." << n.ref_end << ", rc=" << static_cast<int>(n.is_rc) << ", score=" << n.score << ")";
360310
return os;
361311
}

src/randstrobes.cpp

+5-17
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,7 @@ static inline syncmer_hash_t syncmer_smer_hash(uint64_t packed) {
4949
* This function combines two individual syncmer hashes into a single hash
5050
* for the randstrobe.
5151
*
52-
* The syncmer with the smaller hash is designated as the "main", the other is
53-
* the "auxiliary".
52+
* One syncmer is designated as the "main", the other is the "auxiliary".
5453
* The combined hash is obtained by setting the top bits to the bits of
5554
* the main hash and the bottom bits to the bits of the auxiliary
5655
* hash. Since entries in the index are sorted by randstrobe hash, this allows
@@ -59,10 +58,6 @@ static inline syncmer_hash_t syncmer_smer_hash(uint64_t packed) {
5958
static inline randstrobe_hash_t randstrobe_hash(
6059
syncmer_hash_t hash1, syncmer_hash_t hash2, randstrobe_hash_t main_hash_mask
6160
) {
62-
// Make the function symmetric
63-
if (hash1 > hash2) {
64-
std::swap(hash1, hash2);
65-
}
6661
return ((hash1 & main_hash_mask) | (hash2 & ~main_hash_mask)) & RANDSTROBE_HASH_MASK;
6762
}
6863

@@ -146,7 +141,7 @@ std::vector<Syncmer> canonical_syncmers(
146141

147142
std::ostream& operator<<(std::ostream& os, const Randstrobe& randstrobe) {
148143
os << "Randstrobe(hash=" << randstrobe.hash << ", strobe1_pos=" << randstrobe.strobe1_pos << ", strobe2_pos="
149-
<< randstrobe.strobe2_pos << ", first_strobe_is_main=" << randstrobe.first_strobe_is_main << ")";
144+
<< randstrobe.strobe2_pos << ")";
150145
return os;
151146
}
152147

@@ -161,12 +156,10 @@ std::ostream& operator<<(std::ostream& os, const QueryRandstrobe& randstrobe) {
161156
}
162157

163158
Randstrobe make_randstrobe(Syncmer strobe1, Syncmer strobe2, randstrobe_hash_t main_hash_mask) {
164-
bool first_strobe_is_main = strobe1.hash < strobe2.hash;
165159
return Randstrobe{
166160
randstrobe_hash(strobe1.hash, strobe2.hash, main_hash_mask),
167161
static_cast<uint32_t>(strobe1.position),
168-
static_cast<uint32_t>(strobe2.position),
169-
first_strobe_is_main
162+
static_cast<uint32_t>(strobe2.position)
170163
};
171164
}
172165

@@ -246,11 +239,9 @@ QueryRandstrobeVector randstrobes_query(const std::string_view seq, const IndexP
246239
RandstrobeIterator randstrobe_fwd_iter{syncmers, parameters.randstrobe};
247240
while (randstrobe_fwd_iter.has_next()) {
248241
auto randstrobe = randstrobe_fwd_iter.next();
249-
const unsigned int partial_start = randstrobe.first_strobe_is_main ? randstrobe.strobe1_pos : randstrobe.strobe2_pos;
250242
randstrobes.push_back(
251243
QueryRandstrobe {
252-
randstrobe.hash, randstrobe.strobe1_pos, randstrobe.strobe2_pos + parameters.syncmer.k,
253-
partial_start, partial_start + parameters.syncmer.k, false
244+
randstrobe.hash, randstrobe.strobe1_pos, randstrobe.strobe2_pos + parameters.syncmer.k, false
254245
}
255246
);
256247
}
@@ -271,12 +262,9 @@ QueryRandstrobeVector randstrobes_query(const std::string_view seq, const IndexP
271262
RandstrobeIterator randstrobe_rc_iter{syncmers, parameters.randstrobe};
272263
while (randstrobe_rc_iter.has_next()) {
273264
auto randstrobe = randstrobe_rc_iter.next();
274-
bool first_strobe_is_main = randstrobe.first_strobe_is_main;
275-
const unsigned int partial_start = first_strobe_is_main ? randstrobe.strobe1_pos : randstrobe.strobe2_pos;
276265
randstrobes.push_back(
277266
QueryRandstrobe {
278-
randstrobe.hash, randstrobe.strobe1_pos, randstrobe.strobe2_pos + parameters.syncmer.k,
279-
partial_start, partial_start + parameters.syncmer.k, true
267+
randstrobe.hash, randstrobe.strobe1_pos, randstrobe.strobe2_pos + parameters.syncmer.k, true
280268
}
281269
);
282270
}

src/randstrobes.hpp

+6-16
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,20 @@
1717
using syncmer_hash_t = uint64_t;
1818
using randstrobe_hash_t = uint64_t;
1919

20-
static constexpr uint64_t RANDSTROBE_HASH_MASK = 0xFFFFFFFFFFFFFE00;
20+
static constexpr uint64_t RANDSTROBE_HASH_MASK = 0xFFFFFFFFFFFFFF00;
2121

2222
struct RefRandstrobe {
2323
private:
24-
// packed representation of hash, offset and first_strobe_is_main
24+
// packed representation of hash and offset
2525
randstrobe_hash_t m_hash_offset_flag;
2626
uint32_t m_position;
2727
uint32_t m_ref_index;
2828

2929
public:
3030
RefRandstrobe() : m_hash_offset_flag(0), m_position(0), m_ref_index(0) { }
3131

32-
RefRandstrobe(randstrobe_hash_t hash, uint32_t position, uint32_t ref_index, uint8_t offset, bool first_strobe_is_main)
33-
: m_hash_offset_flag((hash & RANDSTROBE_HASH_MASK) | (offset << 1) | first_strobe_is_main)
32+
RefRandstrobe(randstrobe_hash_t hash, uint32_t position, uint32_t ref_index, uint8_t offset)
33+
: m_hash_offset_flag((hash & RANDSTROBE_HASH_MASK) | offset)
3434
, m_position(position)
3535
, m_ref_index(ref_index)
3636
{ }
@@ -45,16 +45,12 @@ struct RefRandstrobe {
4545
return lhs < rhs;
4646
}
4747

48-
bool first_strobe_is_main() const {
49-
return m_hash_offset_flag & 1;
50-
}
51-
5248
unsigned reference_index() const {
5349
return m_ref_index;
5450
}
5551

5652
unsigned strobe2_offset() const {
57-
return (m_hash_offset_flag >> 1) & 0xff;
53+
return m_hash_offset_flag & 0xff;
5854
}
5955

6056
randstrobe_hash_t hash() const {
@@ -72,11 +68,6 @@ struct QueryRandstrobe {
7268
randstrobe_hash_t hash;
7369
unsigned int start;
7470
unsigned int end;
75-
/* Start and end of the main syncmer (relevant if the randstrobe couldn’t
76-
* be found in the index and we fall back to a partial hit)
77-
*/
78-
unsigned int partial_start;
79-
unsigned int partial_end;
8071
bool is_reverse;
8172
};
8273

@@ -90,7 +81,6 @@ struct Randstrobe {
9081
randstrobe_hash_t hash;
9182
unsigned int strobe1_pos;
9283
unsigned int strobe2_pos;
93-
bool first_strobe_is_main;
9484

9585
bool operator==(const Randstrobe& other) const {
9686
return hash == other.hash && strobe1_pos == other.strobe1_pos && strobe2_pos == other.strobe2_pos;
@@ -188,7 +178,7 @@ class RandstrobeGenerator {
188178
{ }
189179

190180
Randstrobe next();
191-
Randstrobe end() const { return Randstrobe{0, 0, 0, false}; }
181+
Randstrobe end() const { return Randstrobe{0, 0, 0}; }
192182

193183
private:
194184
SyncmerIterator syncmer_iterator;

tests/baseline-commit.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
92c6105d71549e19e8fb263975086c8ffc77e9c8
1+
7f5ac330f68afbe32e0cde4a32bbb786cdd4d01e

tests/compare-baseline.sh

+11-2
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,18 @@ python3 -c 'import pysam'
1414

1515
ends="pe"
1616
threads=4
17-
while getopts "st:" opt; do
17+
mcs=0
18+
while getopts "st:m" opt; do
1819
case "${opt}" in
1920
t)
2021
threads=$OPTARG
2122
;;
2223
s)
2324
ends=se # single-end reads
2425
;;
26+
m)
27+
mcs=1
28+
;;
2529
\?)
2630
exit 1
2731
;;
@@ -39,10 +43,15 @@ tests/download.sh
3943

4044
baseline_commit=$(< tests/baseline-commit.txt)
4145

42-
baseline_bam=baseline/bam/${baseline_commit}.${ends}.bam
4346
baseline_binary=baseline/strobealign-${baseline_commit}
4447
cmake_options=-DCMAKE_BUILD_TYPE=RelWithDebInfo
48+
extra_ext=""
4549
strobealign_options="-t ${threads}"
50+
if [[ ${mcs} == 1 ]]; then
51+
extra_ext=".mcs"
52+
strobealign_options="${strobealign_options} --mcs"
53+
fi
54+
baseline_bam=baseline/bam/${baseline_commit}.${ends}${extra_ext}.bam
4655

4756
# Generate the baseline BAM if necessary
4857
mkdir -p baseline/bam

tests/phix.mcs.se.paf

+11-11
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
SRR1377138.32 301 2 293 + NC_001422.1 5386 1434 1725 47 291 255
2-
SRR1377138.33 301 2 297 + NC_001422.1 5386 3818 4113 47 295 255
3-
SRR1377138.34 301 33 299 - NC_001422.1 5386 844 1110 40 266 255
4-
SRR1377138.35 301 5 298 - NC_001422.1 5386 4041 4334 47 293 255
5-
SRR1377138.36 301 3 301 + NC_001422.1 5386 4997 5295 51 298 255
6-
SRR1377138.37 301 2 299 - NC_001422.1 5386 794 1091 42 297 255
7-
SRR1377138.38 301 32 284 - NC_001422.1 5386 4971 5223 42 252 255
8-
SRR1377138.39/1 301 1 295 - NC_001422.1 5386 1791 2085 48 294 255
9-
SRR1377138.40 301 4 293 - NC_001422.1 5386 3020 3309 44 289 255
10-
rescuable.42 301 4 293 - NC_001422.1 5386 3020 3309 44 289 255
11-
not.rescuable 301 4 293 - NC_001422.1 5386 3020 3309 44 289 255
1+
SRR1377138.32 301 2 293 + NC_001422.1 5386 1434 1725 49 291 255
2+
SRR1377138.33 301 2 297 + NC_001422.1 5386 3818 4113 51 295 255
3+
SRR1377138.34 301 33 299 - NC_001422.1 5386 844 1110 43 266 255
4+
SRR1377138.35 301 5 298 - NC_001422.1 5386 4041 4334 49 293 255
5+
SRR1377138.36 301 3 301 + NC_001422.1 5386 4997 5295 53 298 255
6+
SRR1377138.37 301 2 299 - NC_001422.1 5386 794 1091 43 297 255
7+
SRR1377138.38 301 32 284 - NC_001422.1 5386 4971 5223 44 252 255
8+
SRR1377138.39/1 301 1 295 - NC_001422.1 5386 1791 2085 50 294 255
9+
SRR1377138.40 301 4 293 - NC_001422.1 5386 3020 3309 48 289 255
10+
rescuable.42 301 4 293 - NC_001422.1 5386 3020 3309 48 289 255
11+
not.rescuable 301 4 293 - NC_001422.1 5386 3020 3309 48 289 255

0 commit comments

Comments
 (0)