Skip to content

Commit 2adbeb4

Browse files
Itolstoganovmarcelm
authored andcommitted
Add multi-context seeds
1 parent f59bcb4 commit 2adbeb4

12 files changed

+174
-30
lines changed

src/arguments.hpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,11 @@ struct SeedingArguments {
2626
"results with non default values.", {'s'}}
2727
, bits{parser, "INT", "No. of top bits of hash to use as bucket indices (8-31)"
2828
"[determined from reference size]", {'b'}}
29+
, aux_len{parser, "INT", "Number of bits to be selected from the second strobe hash [24]", {"aux-len"}}
2930
{
3031
}
3132
args::ArgumentParser& parser;
32-
args::ValueFlag<int> r, m, k, l, u, c, s, bits;
33+
args::ValueFlag<int> r, m, k, l, u, c, s, bits, aux_len;
3334
};
3435

3536
#endif

src/cmdline.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
119119
if (seeding.s) { opt.s = args::get(seeding.s); opt.s_set = true; }
120120
if (seeding.c) { opt.c = args::get(seeding.c); opt.c_set = true; }
121121
if (seeding.bits) { opt.bits = args::get(seeding.bits); }
122+
if (seeding.aux_len) { opt.aux_len = args::get(seeding.aux_len); }
122123

123124
// Alignment
124125
// if (n) { n = args::get(n); }

src/cmdline.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ struct CommandLineOptions {
4646
int u { 7 };
4747
int s { 16 };
4848
int c { 8 };
49+
int aux_len{24};
4950

5051
// Alignment
5152
int A { 2 };

src/dumpstrobes.cpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ int run_dumpstrobes(int argc, char **argv) {
101101
}
102102

103103
// Seeding
104-
int r{150}, k{20}, s{16}, c{8}, l{1}, u{7};
104+
int r{150}, k{20}, s{16}, c{8}, l{1}, u{7}, aux_len{24};
105105
int max_seed_len{};
106106

107107
bool k_set{false}, s_set{false}, c_set{false}, max_seed_len_set{false}, l_set{false}, u_set{false};
@@ -125,7 +125,8 @@ int run_dumpstrobes(int argc, char **argv) {
125125
l_set ? l : IndexParameters::DEFAULT,
126126
u_set ? u : IndexParameters::DEFAULT,
127127
c_set ? c : IndexParameters::DEFAULT,
128-
max_seed_len_set ? max_seed_len : IndexParameters::DEFAULT
128+
max_seed_len_set ? max_seed_len : IndexParameters::DEFAULT,
129+
aux_len ? aux_len : IndexParameters::DEFAULT
129130
);
130131

131132
logger.info() << index_parameters << '\n';

src/index.cpp

+4-1
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,8 @@ void StrobemerIndex::populate(float f, unsigned n_threads) {
150150
}
151151
stats.tot_strobemer_count = total_randstrobes;
152152

153+
logger.info() << "Auxiliary hash length is : " << parameters.randstrobe.aux_len;
154+
153155
logger.debug() << " Total number of randstrobes: " << total_randstrobes << '\n';
154156
uint64_t memory_bytes = references.total_length() + sizeof(RefRandstrobe) * total_randstrobes + sizeof(bucket_index_t) * (1u << bits);
155157
logger.debug() << " Estimated total memory usage: " << memory_bytes / 1E9 << " GB\n";
@@ -241,6 +243,7 @@ void StrobemerIndex::populate(float f, unsigned n_threads) {
241243
filter_cutoff = 30;
242244
}
243245
stats.filter_cutoff = filter_cutoff;
246+
partial_filter_cutoff = 100;
244247
stats.elapsed_hash_index = hash_index_timer.duration();
245248
stats.distinct_strobemers = unique_mers;
246249
}
@@ -304,7 +307,7 @@ void StrobemerIndex::assign_randstrobes(size_t ref_index, size_t offset) {
304307
chunk.push_back(randstrobe);
305308
}
306309
for (auto randstrobe : chunk) {
307-
RefRandstrobe::packed_t packed = ref_index << 8;
310+
RefRandstrobe::packed_t packed = (ref_index << 9) | (randstrobe.main_is_first << 8);
308311
packed = packed + (randstrobe.strobe2_pos - randstrobe.strobe1_pos);
309312
randstrobes[offset++] = RefRandstrobe{randstrobe.hash, randstrobe.strobe1_pos, packed};
310313
}

src/index.hpp

+50
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ struct StrobemerIndex {
3838
using bucket_index_t = uint64_t;
3939
StrobemerIndex(const References& references, const IndexParameters& parameters, int bits=-1)
4040
: filter_cutoff(0)
41+
, partial_filter_cutoff(0)
4142
, parameters(parameters)
4243
, references(references)
4344
, bits(bits == -1 ? pick_bits(references.total_length()) : bits)
@@ -47,6 +48,7 @@ struct StrobemerIndex {
4748
}
4849
}
4950
unsigned int filter_cutoff;
51+
unsigned int partial_filter_cutoff;
5052
mutable IndexCreationStatistics stats;
5153

5254
void write(const std::string& filename) const;
@@ -80,18 +82,66 @@ struct StrobemerIndex {
8082
return end();
8183
}
8284

85+
//Returns the first entry that matches the main hash
86+
size_t partial_find(randstrobe_hash_t key) const {
87+
const unsigned int aux_len = parameters.randstrobe.aux_len;
88+
randstrobe_hash_t key_prefix = key >> aux_len;
89+
90+
constexpr int MAX_LINEAR_SEARCH = 4;
91+
const unsigned int top_N = key >> (64 - bits);
92+
bucket_index_t position_start = randstrobe_start_indices[top_N];
93+
bucket_index_t position_end = randstrobe_start_indices[top_N + 1];
94+
if (position_start == position_end) {
95+
return end();
96+
}
97+
98+
if (position_end - position_start < MAX_LINEAR_SEARCH) {
99+
for ( ; position_start < position_end; ++position_start) {
100+
if (randstrobes[position_start].hash >> aux_len == key_prefix) return position_start;
101+
if (randstrobes[position_start].hash >> aux_len > key_prefix) return end();
102+
}
103+
return end();
104+
}
105+
auto cmp = [&aux_len](const RefRandstrobe lhs, const RefRandstrobe rhs) {
106+
return (lhs.hash >> aux_len) < (rhs.hash >> aux_len); };
107+
108+
auto pos = std::lower_bound(randstrobes.begin() + position_start,
109+
randstrobes.begin() + position_end,
110+
RefRandstrobe{key, 0, 0},
111+
cmp);
112+
if (pos->hash == key) return pos - randstrobes.begin();
113+
return end();
114+
}
115+
83116
randstrobe_hash_t get_hash(bucket_index_t position) const {
84117
if (position < randstrobes.size()) {
85118
return randstrobes[position].hash;
86119
} else {
87120
return end();
88121
}
89122
}
123+
124+
randstrobe_hash_t get_partial_hash(bucket_index_t position) const {
125+
if (position < randstrobes.size()) {
126+
return randstrobes[position].hash >> parameters.randstrobe.aux_len;
127+
} else {
128+
return end();
129+
}
130+
}
131+
132+
bool first_strobe_is_main(bucket_index_t position) const {
133+
return randstrobes[position].main_is_first();
134+
}
90135

91136
bool is_filtered(bucket_index_t position) const {
92137
return get_hash(position) == get_hash(position + filter_cutoff);
93138
}
94139

140+
bool is_partial_filtered(bucket_index_t position) const {
141+
const unsigned int shift = parameters.randstrobe.aux_len;
142+
return (get_hash(position) >> shift) == (get_hash(position + partial_filter_cutoff) >> shift);
143+
}
144+
95145
unsigned int get_strobe1_position(bucket_index_t position) const {
96146
return randstrobes[position].position;
97147
}

src/indexparameters.cpp

+10-4
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ bool RandstrobeParameters::operator==(const RandstrobeParameters& other) const {
1818
&& this->q == other.q
1919
&& this->max_dist == other.max_dist
2020
&& this->w_min == other.w_min
21-
&& this->w_max == other.w_max;
21+
&& this->w_max == other.w_max
22+
&& this->aux_len == other.aux_len;
2223
}
2324

2425
/* Pre-defined index parameters that work well for a certain
@@ -48,7 +49,7 @@ static std::vector<Profile> profiles = {
4849
* k, s, l, u, c and max_seed_len can be used to override determined parameters
4950
* by setting them to a value other than IndexParameters::DEFAULT.
5051
*/
51-
IndexParameters IndexParameters::from_read_length(int read_length, int k, int s, int l, int u, int c, int max_seed_len) {
52+
IndexParameters IndexParameters::from_read_length(int read_length, int k, int s, int l, int u, int c, int max_seed_len, int aux_len) {
5253
const int default_c = 8;
5354
size_t canonical_read_length = 50;
5455
for (const auto& p : profiles) {
@@ -78,8 +79,11 @@ IndexParameters IndexParameters::from_read_length(int read_length, int k, int s,
7879
max_dist = max_seed_len - k; // convert to distance in start positions
7980
}
8081
int q = std::pow(2, c == DEFAULT ? default_c : c) - 1;
82+
if (aux_len == DEFAULT) {
83+
aux_len = 24;
84+
}
8185

82-
return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist);
86+
return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist, aux_len);
8387
}
8488

8589
void IndexParameters::write(std::ostream& os) const {
@@ -90,6 +94,7 @@ void IndexParameters::write(std::ostream& os) const {
9094
write_int_to_ostream(os, randstrobe.u);
9195
write_int_to_ostream(os, randstrobe.q);
9296
write_int_to_ostream(os, randstrobe.max_dist);
97+
write_int_to_ostream(os, randstrobe.aux_len);
9398
}
9499

95100
IndexParameters IndexParameters::read(std::istream& is) {
@@ -100,7 +105,8 @@ IndexParameters IndexParameters::read(std::istream& is) {
100105
int u = read_int_from_istream(is);
101106
int q = read_int_from_istream(is);
102107
int max_dist = read_int_from_istream(is);
103-
return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist);
108+
int aux_len = read_int_from_istream(is);
109+
return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist, aux_len);
104110
}
105111

106112
bool IndexParameters::operator==(const IndexParameters& other) const {

src/indexparameters.hpp

+9-5
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,16 @@ struct RandstrobeParameters {
4343
const int max_dist;
4444
const unsigned w_min;
4545
const unsigned w_max;
46+
const unsigned aux_len;
4647

47-
RandstrobeParameters(int l, int u, uint64_t q, int max_dist, unsigned w_min, unsigned w_max)
48+
RandstrobeParameters(int l, int u, uint64_t q, int max_dist, unsigned w_min, unsigned w_max, unsigned aux_len)
4849
: l(l)
4950
, u(u)
5051
, q(q)
5152
, max_dist(max_dist)
5253
, w_min(w_min)
5354
, w_max(w_max)
55+
, aux_len(aux_len)
5456
{
5557
verify();
5658
}
@@ -65,6 +67,9 @@ struct RandstrobeParameters {
6567
if (w_min > w_max) {
6668
throw BadParameter("w_min is greater than w_max (choose different -l/-u parameters)");
6769
}
70+
if (aux_len > 63) {
71+
throw BadParameter("aux_len is larger than 63");
72+
}
6873
}
6974
};
7075

@@ -77,16 +82,15 @@ class IndexParameters {
7782

7883
static const int DEFAULT = std::numeric_limits<int>::min();
7984

80-
IndexParameters(size_t canonical_read_length, int k, int s, int l, int u, int q, int max_dist)
85+
IndexParameters(size_t canonical_read_length, int k, int s, int l, int u, int q, int max_dist, int aux_len)
8186
: canonical_read_length(canonical_read_length)
8287
, syncmer(k, s)
83-
, randstrobe(l, u, q, max_dist, std::max(0, k / (k - s + 1) + l), k / (k - s + 1) + u)
88+
, randstrobe(l, u, q, max_dist, std::max(0, k / (k - s + 1) + l), k / (k - s + 1) + u, aux_len)
8489
{
8590
}
8691

8792
static IndexParameters from_read_length(
88-
int read_length, int k = DEFAULT, int s = DEFAULT, int l = DEFAULT, int u = DEFAULT, int c = DEFAULT, int max_seed_len = DEFAULT
89-
);
93+
int read_length, int k = DEFAULT, int s = DEFAULT, int l = DEFAULT, int u = DEFAULT, int c = DEFAULT, int max_seed_len = DEFAULT, int aux_len = DEFAULT);
9094
static IndexParameters read(std::istream& os);
9195
std::string filename_extension() const;
9296
void write(std::ostream& os) const;

src/main.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,8 @@ int run_strobealign(int argc, char **argv) {
179179
opt.l_set ? opt.l : IndexParameters::DEFAULT,
180180
opt.u_set ? opt.u : IndexParameters::DEFAULT,
181181
opt.c_set ? opt.c : IndexParameters::DEFAULT,
182-
opt.max_seed_len_set ? opt.max_seed_len : IndexParameters::DEFAULT
182+
opt.max_seed_len_set ? opt.max_seed_len : IndexParameters::DEFAULT,
183+
opt.aux_len ? opt.aux_len : IndexParameters::DEFAULT
183184
);
184185
logger.debug() << index_parameters << '\n';
185186
AlignmentParameters aln_params;

src/nam.cpp

+48-4
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ struct Hit {
99
int ref_end;
1010
};
1111

12-
inline void add_to_hits_per_ref(
12+
inline void add_to_hits_per_ref_full(
1313
robin_hood::unordered_map<unsigned int, std::vector<Hit>>& hits_per_ref,
1414
int query_start,
1515
int query_end,
@@ -22,7 +22,40 @@ inline void add_to_hits_per_ref(
2222
int ref_end = ref_start + index.strobe2_offset(position) + index.k();
2323
int diff = std::abs((query_end - query_start) - (ref_end - ref_start));
2424
if (diff <= min_diff) {
25-
hits_per_ref[index.reference_index(position)].push_back(Hit{query_start, query_end, ref_start, ref_end});
25+
hits_per_ref[index.reference_index(position)].push_back(
26+
Hit{query_start, query_end, ref_start, ref_end}
27+
);
28+
min_diff = diff;
29+
}
30+
}
31+
}
32+
33+
inline void add_to_hits_per_ref_partial(
34+
robin_hood::unordered_map<unsigned int, std::vector<Hit>>& hits_per_ref,
35+
int query_start,
36+
int query_end,
37+
const StrobemerIndex& index,
38+
size_t position
39+
) {
40+
int min_diff = std::numeric_limits<int>::max();
41+
for (const auto hash = index.get_partial_hash(position);
42+
index.get_partial_hash(position) == hash;
43+
++position) {
44+
bool first_strobe_is_main = index.first_strobe_is_main(position);
45+
// Construct the match from the strobe that was selected as the main part of the hash
46+
int adj_ref_start = 0, adj_ref_end = 0;
47+
if (first_strobe_is_main) {
48+
adj_ref_start = index.get_strobe1_position(position);
49+
}
50+
else {
51+
adj_ref_start = index.get_strobe1_position(position) + index.strobe2_offset(position);
52+
}
53+
adj_ref_end = adj_ref_start + index.k();
54+
int diff = std::abs((query_end - query_start) - (adj_ref_end - adj_ref_start));
55+
if (diff <= min_diff) {
56+
hits_per_ref[index.reference_index(position)].push_back(
57+
Hit{query_start, query_end, adj_ref_start, adj_ref_end}
58+
);
2659
min_diff = diff;
2760
}
2861
}
@@ -171,7 +204,18 @@ std::pair<float, std::vector<Nam>> find_nams(
171204
continue;
172205
}
173206
nr_good_hits++;
174-
add_to_hits_per_ref(hits_per_ref[q.is_reverse], q.start, q.end, index, position);
207+
add_to_hits_per_ref_full(hits_per_ref[q.is_reverse], q.start, q.end, index, position);
208+
}
209+
else {
210+
size_t partial_pos = index.partial_find(q.hash);
211+
if (partial_pos != index.end()) {
212+
total_hits++;
213+
if (index.is_partial_filtered(position)) {
214+
continue;
215+
}
216+
nr_good_hits++;
217+
add_to_hits_per_ref_partial(hits_per_ref[q.is_reverse], q.partial_start, q.partial_end, index, partial_pos);
218+
}
175219
}
176220
}
177221
float nonrepetitive_fraction = total_hits > 0 ? ((float) nr_good_hits) / ((float) total_hits) : 1.0;
@@ -231,7 +275,7 @@ std::vector<Nam> find_nams_rescue(
231275
if ((rh.count > rescue_cutoff && cnt >= 5) || rh.count > 1000) {
232276
break;
233277
}
234-
add_to_hits_per_ref(hits_per_ref[is_revcomp], rh.query_start, rh.query_end, index, rh.position);
278+
add_to_hits_per_ref_full(hits_per_ref[is_revcomp], rh.query_start, rh.query_end, index, rh.position);
235279
cnt++;
236280
}
237281
is_revcomp++;

0 commit comments

Comments
 (0)