Skip to content

Commit 228bcfb

Browse files
committed
Add bitonic sort algorithm
Currently only low-level register-based interface is exposed. Even the 16 lane sort is small enough and uses few enough registers on e.g. AVX2 that it makes sense to inline it and pass both input and gather output via SIMD registers without going to memory. A higher-level, memory based interface can be exposed in the future.
1 parent db48381 commit 228bcfb

File tree

6 files changed

+345
-0
lines changed

6 files changed

+345
-0
lines changed

simdpp/algorithm/bitonic_sort.h

+231
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
/* Copyright (C) 2024 Povilas Kanapickas <povilas@radix.lt>
2+
3+
Distributed under the Boost Software License, Version 1.0.
4+
(See accompanying file LICENSE_1_0.txt or copy at
5+
http://www.boost.org/LICENSE_1_0.txt)
6+
*/
7+
8+
#ifndef LIBSIMDPP_SIMDPP_ALGORITHM_BITONIC_SORT
9+
#define LIBSIMDPP_SIMDPP_ALGORITHM_BITONIC_SORT
10+
11+
#include <simdpp/simd.h>
12+
13+
namespace simdpp {
14+
namespace SIMDPP_ARCH_NAMESPACE {
15+
16+
namespace detail {
17+
18+
template<unsigned N, class T>
19+
SIMDPP_INL T sort_4lane_2el_asc_2el_dec(const any_vec32<N,T>& a)
20+
{
21+
auto& aw = a.wrapped();
22+
23+
T swapped = permute4<1, 0, 3, 2>(aw);
24+
25+
T res_min = min(aw, swapped);
26+
T res_max = max(aw, swapped);
27+
28+
return shuffle4x2<1, 4, 7, 2>(res_min, res_max);
29+
}
30+
31+
template<unsigned N, class T>
32+
SIMDPP_INL T sort_4lane_2el_asc_2el_asc(const any_vec32<N,T>& a)
33+
{
34+
auto& aw = a.wrapped();
35+
36+
T swapped = permute4<1, 0, 3, 2>(aw);
37+
38+
T res_min = min(aw, swapped);
39+
T res_max = max(aw, swapped);
40+
41+
return shuffle4x2<1, 4, 2, 7>(res_min, res_max);
42+
}
43+
44+
template<unsigned N, class T>
45+
SIMDPP_INL T sort_4lane_2el_dec_2el_dec(const any_vec32<N,T>& a)
46+
{
47+
auto& aw = a.wrapped();
48+
49+
T swapped = permute4<1, 0, 3, 2>(aw);
50+
51+
T res_max = max(aw, swapped);
52+
T res_min = min(aw, swapped);
53+
54+
return shuffle4x2<1, 4, 2, 7>(res_max, res_min);
55+
}
56+
57+
template<unsigned N, class T>
58+
SIMDPP_INL T sort_4lane_corresponding_2el_asc(const any_vec32<N,T>& a)
59+
{
60+
auto& aw = a.wrapped();
61+
T swapped = permute4<2, 3, 0, 1>(aw);
62+
63+
T res_min = min(aw, swapped);
64+
T res_max = max(aw, swapped);
65+
66+
return shuffle4x2<0, 1, 4, 5>(res_min, res_max);
67+
}
68+
69+
template<unsigned N, class T>
70+
SIMDPP_INL T sort_4lane_corresponding_2el_dec(const any_vec32<N,T>& a)
71+
{
72+
auto& aw = a.wrapped();
73+
T swapped = permute4<2, 3, 0, 1>(aw);
74+
75+
T res_max = max(aw, swapped);
76+
T res_min = min(aw, swapped);
77+
78+
return shuffle4x2<0, 1, 4, 5>(res_max, res_min);
79+
}
80+
81+
template<unsigned N, class T>
82+
SIMDPP_INL T sort_8lane_corresponding_4el_asc(const any_vec32<N,T>& a)
83+
{
84+
auto& aw = a.wrapped();
85+
T lo = shuffle1_128<0, 0>(aw, aw);
86+
T hi = shuffle1_128<1, 1>(aw, aw);
87+
88+
T res_min = min(lo, hi);
89+
T res_max = max(lo, hi);
90+
91+
return shuffle1_128<0, 0>(res_min, res_max);
92+
}
93+
94+
template<unsigned N, class T>
95+
SIMDPP_INL T sort_8lane_corresponding_4el_dec(const any_vec32<N,T>& a)
96+
{
97+
auto& aw = a.wrapped();
98+
T lo = shuffle1_128<0, 0>(aw, aw);
99+
T hi = shuffle1_128<1, 1>(aw, aw);
100+
101+
T res_max = max(lo, hi);
102+
T res_min = min(lo, hi);
103+
104+
return shuffle1_128<0, 0>(res_max, res_min);
105+
}
106+
107+
template<unsigned N, class T>
108+
SIMDPP_INL T reverse_8lane_top4(const any_vec32<N,T>& a)
109+
{
110+
auto& aw = a.wrapped();
111+
112+
T reversed = permute4<3, 2, 1, 0>(aw);
113+
114+
return shuffle1_128<0, 1>(aw, reversed);
115+
}
116+
117+
template<unsigned N, class T>
118+
SIMDPP_INL T reverse_8lane_bottom4(const any_vec32<N,T>& a)
119+
{
120+
auto& aw = a.wrapped();
121+
122+
T reversed = permute4<3, 2, 1, 0>(aw);
123+
124+
return shuffle1_128<0, 1>(reversed, aw);
125+
}
126+
127+
template<unsigned N, class T>
128+
SIMDPP_INL T sort_8lane_4el_asc_4el_dec(const any_vec32<N,T>& a)
129+
{
130+
auto& aw = a.wrapped();
131+
132+
T step1_res = sort_4lane_2el_asc_2el_dec(aw);
133+
T step2_res = sort_4lane_corresponding_2el_asc(step1_res);
134+
T step3_res = sort_4lane_2el_asc_2el_asc(step2_res);
135+
136+
return reverse_8lane_top4(step3_res);
137+
}
138+
139+
template<unsigned N, class T>
140+
SIMDPP_INL T sort_8lane_4el_dec_4el_asc(const any_vec32<N,T>& a)
141+
{
142+
auto& aw = a.wrapped();
143+
144+
T step1_res = sort_4lane_2el_asc_2el_dec(aw);
145+
T step2_res = sort_4lane_corresponding_2el_asc(step1_res);
146+
T step3_res = sort_4lane_2el_asc_2el_asc(step2_res);
147+
148+
return reverse_8lane_bottom4(step3_res);
149+
}
150+
151+
template<unsigned N, class T>
152+
SIMDPP_INL T bitonic_sort_8lane_finalize_asc(const any_vec32<N, T>& a)
153+
{
154+
auto& aw = a.wrapped();
155+
156+
T step1_res = sort_8lane_corresponding_4el_asc(aw);
157+
T step2_res = sort_4lane_corresponding_2el_asc(step1_res);
158+
return sort_4lane_2el_asc_2el_asc(step2_res);
159+
}
160+
161+
template<unsigned N, class T>
162+
SIMDPP_INL T bitonic_sort_8lane_finalize_dec(const any_vec32<N, T>& a)
163+
{
164+
auto& aw = a.wrapped();
165+
166+
T step1_res = sort_8lane_corresponding_4el_dec(aw);
167+
T step2_res = sort_4lane_corresponding_2el_dec(step1_res);
168+
return sort_4lane_2el_dec_2el_dec(step2_res);
169+
}
170+
171+
} // namespace detail
172+
173+
/** Sorts data in the given SIMD registers in increasing order. Sort is not stable.
174+
*/
175+
template<class T>
176+
void bitonic_sort_asc(any_vec32<8,T>& a0)
177+
{
178+
auto r = detail::sort_8lane_4el_asc_4el_dec(a0.wrapped());
179+
a0.wrapped() = detail::bitonic_sort_8lane_finalize_asc(r);
180+
}
181+
182+
template<class T>
183+
void bitonic_sort_dec(any_vec32<8,T>& a0)
184+
{
185+
auto r = detail::sort_8lane_4el_dec_4el_asc(a0.wrapped());
186+
a0.wrapped() = detail::bitonic_sort_8lane_finalize_dec(r);
187+
}
188+
189+
190+
template<class T>
191+
void bitonic_sort_asc(any_vec32<8,T>& a0, any_vec32<8,T>& a1)
192+
{
193+
auto r0 = a0.wrapped();
194+
auto r1 = a1.wrapped();
195+
r0 = detail::sort_8lane_4el_asc_4el_dec(r0);
196+
r0 = detail::bitonic_sort_8lane_finalize_asc(r0);
197+
r1 = detail::sort_8lane_4el_asc_4el_dec(r1);
198+
r1 = detail::bitonic_sort_8lane_finalize_dec(r1);
199+
200+
T res_max = max(r0, r1);
201+
T res_min = min(r0, r1);
202+
203+
r0 = detail::bitonic_sort_8lane_finalize_asc(res_min);
204+
r1 = detail::bitonic_sort_8lane_finalize_asc(res_max);
205+
a0.wrapped() = r0;
206+
a1.wrapped() = r1;
207+
}
208+
209+
template<class T>
210+
void bitonic_sort_dec(any_vec32<8,T>& a0, any_vec32<8,T>& a1)
211+
{
212+
auto r0 = a0.wrapped();
213+
auto r1 = a1.wrapped();
214+
r0 = detail::sort_8lane_4el_dec_4el_asc(r0);
215+
r0 = detail::bitonic_sort_8lane_finalize_dec(r0);
216+
r1 = detail::sort_8lane_4el_dec_4el_asc(r1);
217+
r1 = detail::bitonic_sort_8lane_finalize_asc(r1);
218+
219+
T res_max = max(r0, r1);
220+
T res_min = min(r0, r1);
221+
222+
r0 = detail::bitonic_sort_8lane_finalize_dec(res_max);
223+
r1 = detail::bitonic_sort_8lane_finalize_dec(res_min);
224+
a0.wrapped() = r0;
225+
a1.wrapped() = r1;
226+
}
227+
228+
} // namespace SIMDPP_ARCH_NAMESPACE
229+
} // namespace simdpp
230+
231+
#endif // LIBSIMDPP_SIMDPP_ALGORITHM_BITONIC_SORT

test/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ set(TEST_INSN_ARCH_SOURCES
7676
insn/test_utils.cc
7777
insn/tests.cc
7878
insn/transpose.cc
79+
algorithm/bitonic_sort.cc
7980
)
8081

8182
set(TEST_INSN_ARCH_GEN_SOURCES "")

test/algorithm/bitonic_sort.cc

+109
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
/* Copyright (C) 2024 Povilas Kanapickas <povilas@radix.lt>
2+
3+
Distributed under the Boost Software License, Version 1.0.
4+
(See accompanying file LICENSE_1_0.txt or copy at
5+
http://www.boost.org/LICENSE_1_0.txt)
6+
*/
7+
8+
#include <simdpp/simd.h>
9+
#include <simdpp/algorithm/bitonic_sort.h>
10+
#include "../insn/tests.h"
11+
#include "../utils/test_helpers.h"
12+
#include <algorithm>
13+
#include <array>
14+
#include <random>
15+
16+
namespace SIMDPP_ARCH_NAMESPACE {
17+
18+
/** Supplies input data and the matching sorted reference for the bitonic
    sort tests.

    `data` initially holds the values 0 .. N-1 converted to the element type
    of V; every call to next() reshuffles it with a fixed-seed generator, so
    the produced sequence of permutations is reproducible. `expected_data`
    holds the same values sorted ascending or descending depending on
    Ascending.

    @tparam V vector type; only V::element_type is used.
    @tparam N number of elements to generate.
    @tparam Ascending selects the order of expected_data.
*/
template<class V, unsigned N, bool Ascending>
struct TestDataPreparer
{
    using E = typename V::element_type;

    TestDataPreparer()
    {
        for (std::uint32_t i = 0; i < data.size(); ++i) {
            data[i] = static_cast<E>(i);
        }
        expected_data = data;
        // Explicit parameter types keep the comparator usable in C++11
        // builds; generic lambdas would require C++14.
        std::sort(expected_data.begin(), expected_data.end(), [](E l, E r)
        {
            return Ascending ? l < r : l > r;
        });
    }

    /// Produces the next permutation of the input values in `data`.
    void next()
    {
        std::shuffle(data.begin(), data.end(), rng);
    }

    // Fixed seed: every test run sees the same permutations.
    std::minstd_rand rng = std::minstd_rand{123};
    std::array<E, N> data;
    std::array<E, N> expected_data;
};
49+
50+
template<class V, bool Ascending>
51+
void test_bitonic_sort_impl1_v(TestReporter& tr)
52+
{
53+
TestDataPreparer<V, 8, Ascending> data_preparer;
54+
55+
for (std::uint32_t i = 0; i < 1000; ++i)
56+
{
57+
data_preparer.next();
58+
59+
V sorted = simdpp::load_u(&data_preparer.data[0]);
60+
if (Ascending) {
61+
simdpp::bitonic_sort_asc(sorted);
62+
} else {
63+
simdpp::bitonic_sort_dec(sorted);
64+
}
65+
V expected = simdpp::load_u(&data_preparer.expected_data[0]);
66+
TEST_EQUAL(tr, sorted, expected);
67+
}
68+
}
69+
70+
template<class V, bool Ascending>
71+
void test_bitonic_sort_impl2_v(TestReporter& tr)
72+
{
73+
TestDataPreparer<V, 16, Ascending> data_preparer;
74+
75+
for (std::uint32_t i = 0; i < 1000; ++i)
76+
{
77+
data_preparer.next();
78+
79+
V sorted1 = simdpp::load_u(&data_preparer.data[0]);
80+
V sorted2 = simdpp::load_u(&data_preparer.data[8]);
81+
if (Ascending) {
82+
simdpp::bitonic_sort_asc(sorted1, sorted2);
83+
} else {
84+
simdpp::bitonic_sort_dec(sorted1, sorted2);
85+
}
86+
V expected1 = simdpp::load_u(&data_preparer.expected_data[0]);
87+
V expected2 = simdpp::load_u(&data_preparer.expected_data[8]);
88+
TEST_EQUAL(tr, sorted1, expected1);
89+
TEST_EQUAL(tr, sorted2, expected2);
90+
}
91+
}
92+
93+
/** Entry point for the bitonic sort tests: exercises the one- and
    two-register variants, ascending and descending, for 8-lane float32,
    uint32 and int32 vectors.
*/
void test_algorithm_bitonic_sort(TestReporter& tr)
{
    using F32 = simdpp::float32<8>;
    using U32 = simdpp::uint32<8>;
    using I32 = simdpp::int32<8>;

    // Single register, ascending then descending.
    test_bitonic_sort_impl1_v<F32, true>(tr);
    test_bitonic_sort_impl1_v<U32, true>(tr);
    test_bitonic_sort_impl1_v<I32, true>(tr);
    test_bitonic_sort_impl1_v<F32, false>(tr);
    test_bitonic_sort_impl1_v<U32, false>(tr);
    test_bitonic_sort_impl1_v<I32, false>(tr);

    // Two registers, ascending then descending.
    test_bitonic_sort_impl2_v<F32, true>(tr);
    test_bitonic_sort_impl2_v<U32, true>(tr);
    test_bitonic_sort_impl2_v<I32, true>(tr);
    test_bitonic_sort_impl2_v<F32, false>(tr);
    test_bitonic_sort_impl2_v<U32, false>(tr);
    test_bitonic_sort_impl2_v<I32, false>(tr);
}
108+
109+
} // namespace SIMDPP_ARCH_NAMESPACE

test/expr/tests.h

+1
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,6 @@ void test_expr_bitwise(TestReporter& ts);
1515
void test_expr_math_float(TestReporter& ts);
1616
void test_expr_math_int(TestReporter& ts);
1717
void test_expr_compare(TestReporter& tr);
18+
void test_algorithm_bitonic_sort(TestReporter& tr);
1819

1920
#endif

test/insn/tests.cc

+2
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ void main_test_function(TestResults& res, TestReporter& tr, const TestOptions& o
7171
test_transpose(res);
7272

7373
test_for_each(res, tr);
74+
75+
test_algorithm_bitonic_sort(tr);
7476
}
7577

7678
} // namespace SIMDPP_ARCH_NAMESPACE

test/insn/tests.h

+1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ void test_permute_generic(TestResults& res);
3636
void test_shuffle_transpose(TestResults& res);
3737
void test_test_utils(TestResults& res);
3838
void test_transpose(TestResults& res);
39+
void test_algorithm_bitonic_sort(TestReporter& tr);
3940

4041
} // namespace SIMDPP_ARCH_NAMESPACE
4142

0 commit comments

Comments
 (0)