Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Non-canonical syncmers and randstrobes #474

Draft
wants to merge 9 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 31 additions & 53 deletions src/aln.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,38 +51,7 @@ bool by_score(const T& a, const T& b)
* - If first and last strobe do not match consistently, return false.
*/
bool reverse_nam_if_needed(Nam& nam, const Read& read, const References& references, int k) {
auto read_len = read.size();
std::string ref_start_kmer = references.sequences[nam.ref_id].substr(nam.ref_start, k);
std::string ref_end_kmer = references.sequences[nam.ref_id].substr(nam.ref_end-k, k);

std::string seq, seq_rc;
if (nam.is_rc) {
seq = read.rc;
seq_rc = read.seq;
} else {
seq = read.seq;
seq_rc = read.rc;
}
std::string read_start_kmer = seq.substr(nam.query_start, k);
std::string read_end_kmer = seq.substr(nam.query_end-k, k);
if (ref_start_kmer == read_start_kmer && ref_end_kmer == read_end_kmer) {
return true;
}

// False forward or false reverse (possible due to symmetrical hash values)
// we need two extra checks for this - hopefully this will remove all the false hits we see (true hash collisions should be very few)
int q_start_tmp = read_len - nam.query_end;
int q_end_tmp = read_len - nam.query_start;
// false reverse hit, change coordinates in nam to forward
read_start_kmer = seq_rc.substr(q_start_tmp, k);
read_end_kmer = seq_rc.substr(q_end_tmp - k, k);
if (ref_start_kmer == read_start_kmer && ref_end_kmer == read_end_kmer) {
nam.is_rc = !nam.is_rc;
nam.query_start = q_start_tmp;
nam.query_end = q_end_tmp;
return true;
}
return false;
return true;
}

inline void align_single(
Expand Down Expand Up @@ -1041,6 +1010,7 @@ void align_or_map_paired(
const auto& record = is_revcomp == 0 ? record1 : record2;

Timer strobe_timer;
// FIXME forward only at the moment
auto query_randstrobes = randstrobes_query(record.seq, index_parameters);
statistics.n_randstrobes += query_randstrobes.size();
statistics.tot_construct_strobemers += strobe_timer.duration();
Expand Down Expand Up @@ -1185,27 +1155,35 @@ void align_or_map_single(
) {
Details details;
Timer strobe_timer;
auto query_randstrobes = randstrobes_query(record.seq, index_parameters);
statistics.n_randstrobes += query_randstrobes.size();
statistics.tot_construct_strobemers += strobe_timer.duration();

// Find NAMs
Timer nam_timer;
auto [nonrepetitive_fraction, n_hits, nams] = find_nams(query_randstrobes, index, map_param.use_mcs);
statistics.tot_find_nams += nam_timer.duration();
statistics.n_hits += n_hits;
details.nams = nams.size();

if (map_param.rescue_level > 1) {
Timer rescue_timer;
if (nams.empty() || nonrepetitive_fraction < 0.7) {
std::string seq_revcomp = reverse_complement(record.seq);
std::vector<Nam> nams;
for (bool revcomp : {false, true}) {
const std::string& seq = revcomp ? seq_revcomp : record.seq;
auto query_randstrobes = randstrobes_query(seq, index_parameters);

statistics.n_randstrobes += query_randstrobes.size();
statistics.tot_construct_strobemers += strobe_timer.duration();

// Find NAMs
Timer nam_timer;
auto [nonrepetitive_fraction, n_hits, nams_found] = find_nams(query_randstrobes, index, map_param.use_mcs);
statistics.tot_find_nams += nam_timer.duration();
statistics.n_hits += n_hits;
details.nams += nams_found.size();

if (map_param.rescue_level > 1 && (nams_found.empty() || nonrepetitive_fraction < 0.7)) {
Timer rescue_timer;
int n_rescue_hits;
std::tie(n_rescue_hits, nams) = find_nams_rescue(query_randstrobes, index, map_param.rescue_cutoff, map_param.use_mcs);
statistics.n_rescue_hits += n_rescue_hits;
details.rescue_nams = nams.size();
std::tie(n_rescue_hits, nams_found) = find_nams_rescue(query_randstrobes, index, map_param.rescue_cutoff, map_param.use_mcs);
details.rescue_nams += nams_found.size();
details.nam_rescue = true;
statistics.n_rescue_hits += n_rescue_hits;
statistics.tot_time_rescue += rescue_timer.duration();
}
for (auto &nam : nams_found) {
nam.is_rc = revcomp;
nams.push_back(nam);
}
statistics.tot_time_rescue += rescue_timer.duration();
}

Timer nam_sort_timer;
Expand All @@ -1215,7 +1193,7 @@ void align_or_map_single(

#ifdef TRACE
std::cerr << "Query: " << record.name << '\n';
std::cerr << "Found " << nams.size() << " NAMs\n";
std::cerr << "Found " << nams.size() << " NAMs:\n";
for (auto& nam : nams) {
std::cerr << "- " << nam << '\n';
}
Expand All @@ -1226,10 +1204,10 @@ void align_or_map_single(
switch (map_param.output_format) {
case OutputFormat::Abundance: {
if (!nams.empty()){
for (auto &t : nams){
for (auto &t : nams) {
if (t.score == nams[0].score){
++n_best;
}else{
} else{
break;
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/nam.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,6 @@ std::pair<int, std::vector<Nam>> find_nams_rescue(
}

std::ostream& operator<<(std::ostream& os, const Nam& n) {
os << "Nam(ref_id=" << n.ref_id << ", query: " << n.query_start << ".." << n.query_end << ", ref: " << n.ref_start << ".." << n.ref_end << ", score=" << n.score << ")";
os << "Nam(ref_id=" << n.ref_id << ", query: " << n.query_start << ".." << n.query_end << ", ref: " << n.ref_start << ".." << n.ref_end << ", rc=" << static_cast<int>(n.is_rc) << ", score=" << n.score << ")";
return os;
}
66 changes: 14 additions & 52 deletions src/randstrobes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,11 @@ static inline syncmer_hash_t syncmer_smer_hash(uint64_t packed) {
* hash. Since entries in the index are sorted by randstrobe hash, this allows
* us to search for the main syncmer only by masking out the lower bits.
*/
static inline randstrobe_hash_t randstrobe_hash(
static inline std::pair<randstrobe_hash_t, bool> randstrobe_hash(
syncmer_hash_t hash1, syncmer_hash_t hash2, randstrobe_hash_t main_hash_mask
) {
// Make the function symmetric
if (hash1 > hash2) {
std::swap(hash1, hash2);
}
return ((hash1 & main_hash_mask) | (hash2 & ~main_hash_mask)) & RANDSTROBE_HASH_MASK;
bool first_strobe_is_main = true;
return {((hash1 & main_hash_mask) | (hash2 & ~main_hash_mask)) & RANDSTROBE_HASH_MASK, first_strobe_is_main};
}

std::ostream& operator<<(std::ostream& os, const Syncmer& syncmer) {
Expand All @@ -76,16 +73,13 @@ Syncmer SyncmerIterator::next() {
// for (size_t i = 0; i < seq.length(); i++) {
int c = seq_nt4_table[(uint8_t) seq[i]];
if (c < 4) { // not an "N" base
xk[0] = (xk[0] << 2 | c) & kmask; // forward strand
xk[1] = xk[1] >> 2 | (uint64_t)(3 - c) << kshift; // reverse strand
xs[0] = (xs[0] << 2 | c) & smask; // forward strand
xs[1] = xs[1] >> 2 | (uint64_t)(3 - c) << sshift; // reverse strand
xk = (xk << 2 | c) & kmask; // forward strand
xs = (xs << 2 | c) & smask; // forward strand
if (++l < static_cast<size_t>(parameters.s)) {
continue;
}
// we find an s-mer
uint64_t ys = std::min(xs[0], xs[1]);
uint64_t hash_s = syncmer_smer_hash(ys);
uint64_t hash_s = syncmer_smer_hash(xs);
qs.push_back(hash_s);
// not enough hashes in the queue, yet
if (qs.size() < static_cast<size_t>(parameters.k - parameters.s + 1)) {
Expand Down Expand Up @@ -116,15 +110,14 @@ Syncmer SyncmerIterator::next() {
}
}
if (qs[parameters.t_syncmer - 1] == qs_min_val) { // occurs at t:th position in k-mer
uint64_t yk = std::min(xk[0], xk[1]);
auto syncmer = Syncmer{syncmer_kmer_hash(yk), i - parameters.k + 1};
auto syncmer = Syncmer{syncmer_kmer_hash(xk), i - parameters.k + 1};
i++;
return syncmer;
}
} else {
// if there is an "N", restart
qs_min_val = UINT64_MAX;
l = xs[0] = xs[1] = xk[0] = xk[1] = 0;
l = xs = xk = 0;
qs.clear();
}
}
Expand Down Expand Up @@ -161,9 +154,9 @@ std::ostream& operator<<(std::ostream& os, const QueryRandstrobe& randstrobe) {
}

Randstrobe make_randstrobe(Syncmer strobe1, Syncmer strobe2, randstrobe_hash_t main_hash_mask) {
bool first_strobe_is_main = strobe1.hash < strobe2.hash;
auto [hash, first_strobe_is_main] = randstrobe_hash(strobe1.hash, strobe2.hash, main_hash_mask);
return Randstrobe{
randstrobe_hash(strobe1.hash, strobe2.hash, main_hash_mask),
hash,
static_cast<uint32_t>(strobe1.position),
static_cast<uint32_t>(strobe2.position),
first_strobe_is_main
Expand Down Expand Up @@ -228,24 +221,18 @@ Randstrobe RandstrobeGenerator::next() {
}

/*
* Generate randstrobes for a query sequence and its reverse complement.
* Generate randstrobes for a sequence
*/
QueryRandstrobeVector randstrobes_query(const std::string_view seq, const IndexParameters& parameters) {
QueryRandstrobeVector randstrobes;
if (seq.length() < parameters.randstrobe.w_max) {
return randstrobes;
}

// Generate syncmers for the forward sequence
auto syncmers = canonical_syncmers(seq, parameters.syncmer);
if (syncmers.empty()) {
return randstrobes;
}

// Generate randstrobes for the forward sequence
RandstrobeIterator randstrobe_fwd_iter{syncmers, parameters.randstrobe};
while (randstrobe_fwd_iter.has_next()) {
auto randstrobe = randstrobe_fwd_iter.next();
RandstrobeIterator randstrobe_iter{syncmers, parameters.randstrobe};
while (randstrobe_iter.has_next()) {
auto randstrobe = randstrobe_iter.next();
const unsigned int partial_start = randstrobe.first_strobe_is_main ? randstrobe.strobe1_pos : randstrobe.strobe2_pos;
randstrobes.push_back(
QueryRandstrobe {
Expand All @@ -255,30 +242,5 @@ QueryRandstrobeVector randstrobes_query(const std::string_view seq, const IndexP
);
}

// For the reverse complement, we can re-use the syncmers of the forward
// sequence because canonical syncmers are invariant under reverse
// complementing. Only the coordinates need to be adjusted.
std::reverse(syncmers.begin(), syncmers.end());
for (size_t i = 0; i < syncmers.size(); i++) {
syncmers[i].position = seq.length() - syncmers[i].position - parameters.syncmer.k;
}

// Randstrobes cannot be re-used for the reverse complement:
// If in the forward direction, syncmer[i] and syncmer[j] were paired up, it
// is not necessarily the case that syncmer[j] is going to be paired with
// syncmer[i] in the reverse direction because i is fixed in the forward
// direction and j is fixed in the reverse direction.
RandstrobeIterator randstrobe_rc_iter{syncmers, parameters.randstrobe};
while (randstrobe_rc_iter.has_next()) {
auto randstrobe = randstrobe_rc_iter.next();
bool first_strobe_is_main = randstrobe.first_strobe_is_main;
const unsigned int partial_start = first_strobe_is_main ? randstrobe.strobe1_pos : randstrobe.strobe2_pos;
randstrobes.push_back(
QueryRandstrobe {
randstrobe.hash, randstrobe.strobe1_pos, randstrobe.strobe2_pos + parameters.syncmer.k,
partial_start, partial_start + parameters.syncmer.k, true
}
);
}
return randstrobes;
}
4 changes: 2 additions & 2 deletions src/randstrobes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,8 @@ class SyncmerIterator {
std::deque<uint64_t> qs; // s-mer hashes
uint64_t qs_min_val = UINT64_MAX;
size_t l = 0;
uint64_t xk[2] = {0, 0};
uint64_t xs[2] = {0, 0};
uint64_t xk{0};
uint64_t xs{0};
size_t i = 0;
};

Expand Down
2 changes: 1 addition & 1 deletion src/revcomp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ static unsigned char revcomp_table[256] = {
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'
};

static inline std::string reverse_complement(const std::string &sequence) {
static inline std::string reverse_complement(const std::string_view sequence) {
std::string result;
result.reserve(sequence.size());
for (size_t i = 0; i < sequence.length(); ++i) {
Expand Down
13 changes: 11 additions & 2 deletions tests/compare-baseline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,18 @@ python3 -c 'import pysam'

ends="pe"
threads=4
while getopts "st:" opt; do
mcs=0
while getopts "st:m" opt; do
case "${opt}" in
t)
threads=$OPTARG
;;
s)
ends=se # single-end reads
;;
m)
mcs=1
;;
\?)
exit 1
;;
Expand All @@ -39,10 +43,15 @@ tests/download.sh

baseline_commit=$(< tests/baseline-commit.txt)

baseline_bam=baseline/bam/${baseline_commit}.${ends}.bam
baseline_binary=baseline/strobealign-${baseline_commit}
cmake_options=-DCMAKE_BUILD_TYPE=RelWithDebInfo
extra_ext=""
strobealign_options="-t ${threads}"
if [[ ${mcs} == 1 ]]; then
extra_ext=".mcs"
strobealign_options="${strobealign_options} --mcs"
fi
baseline_bam=baseline/bam/${baseline_commit}.${ends}${extra_ext}.bam

# Generate the baseline BAM if necessary
mkdir -p baseline/bam
Expand Down
35 changes: 25 additions & 10 deletions tests/samdiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,11 @@ def main():
parser.add_argument("after")
parser.add_argument("--truth")
parser.add_argument(
"--limit", metavar="N", type=int, default=10,
help="Report details for at most N changed records"
"--limit",
metavar="N",
type=int,
default=10,
help="Report details for at most N changed records",
)
args = parser.parse_args()
has_truth = bool(args.truth)
Expand Down Expand Up @@ -67,6 +70,9 @@ def main():
continue

if a.is_unmapped:
if reported < limit:
print(b.query_name, "became unmapped", "\n")
reported += 1
became_unmapped += 1
continue

Expand All @@ -84,7 +90,11 @@ def main():

b_score = b.get_tag("AS")
a_score = a.get_tag("AS")
if b.mapping_quality == 0 and a.mapping_quality == 0 and b.is_proper_pair == a.is_proper_pair:
if (
b.mapping_quality == 0
and a.mapping_quality == 0
and b.is_proper_pair == a.is_proper_pair
):
if b_score == a_score:
multimapper_same += 1
continue
Expand Down Expand Up @@ -117,10 +127,7 @@ def main():
reported += 1

if changed > limit:
print(
f"Reporting limit reached, not showing {reported - limit} "
"additional changed records."
)
print(f"Reporting limit reached, not showing additional changed records.")
print()

def stat(description, value, should_be_zero: bool = True):
Expand All @@ -133,9 +140,17 @@ def stat(description, value, should_be_zero: bool = True):
stat("became mapped", became_mapped)
stat("became unmapped", became_unmapped)
stat("were mapped to same locus before and after", identical, False)
stat("were multimapper before and after, same alignment score (AS)", multimapper_same)
stat("were multimapper before and after, better alignment score (AS)", multimapper_better)
stat("were multimapper before and after, worse alignment score (AS)", multimapper_worse)
stat(
"were multimapper before and after, same alignment score (AS)", multimapper_same
)
stat(
"were multimapper before and after, better alignment score (AS)",
multimapper_better,
)
stat(
"were multimapper before and after, worse alignment score (AS)",
multimapper_worse,
)
if has_truth:
stat("were incorrect before and after (relative to truth)", same, False)
stat("became correct (relative to truth)", better)
Expand Down
Loading