
Commit 44c41af (parent 3ee668c)

Improve code generation for partial vector access
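
The change throughout is the same: the runtime-indexed sub-vector accessor `vec(i)` becomes the compile-time accessor `vec<I>()`, so the compiler sees a constant index at every sub-vector access. A minimal sketch of why that matters, using a hypothetical two-half wrapper (the `f64x4` type and its members below are illustrative, not libsimdpp's actual layout):

    #include <cstdio>

    // Hypothetical wrapper holding two halves, mirroring how a wide vector
    // is stored as several native sub-vectors on narrower targets.
    struct f64x4 {
        double lo[2];
        double hi[2];

        // Runtime index: the selection happens at run time.
        double* vec(unsigned i) { return i == 0 ? lo : hi; }

        // Compile-time index: I is a constant, so the selection folds away
        // and each half can stay in its own register.
        template<unsigned I> double* vec() { return I == 0 ? lo : hi; }
    };

    int main() {
        f64x4 a{};
        a.vec<0>()[0] = 1.0;   // index folded at compile time
        a.vec(1)[0] = 2.0;     // index decided at run time
        std::printf("%f %f\n", a.lo[0], a.hi[0]);
    }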


46 files changed: +584 -457 lines

simdpp/detail/altivec/load1.h (+1 -1)

@@ -25,7 +25,7 @@ namespace altivec {
     undefined.

     @code
-    a.vec(0) = *p
+    a.vec<0>() = *p
     @endcode

     @icost{ALTIVEC, 2}

simdpp/detail/insn/combine.h (+20 -3)

@@ -164,13 +164,30 @@ float64<8> i_combine(const float64<4>& a, const float64<4>& b)

 // -----------------------------------------------------------------------------
 // generic implementation
+
+template<unsigned I, unsigned End>
+struct combine_unroll {
+    template<class VR, class VS>
+    static SIMDPP_INL void combine(VR& dst, const VS& src1, const VS& src2)
+    {
+        dst.template vec<I>() = src1.template vec<I>();
+        dst.template vec<I + End>() = src2.template vec<I>();
+        combine_unroll<I + 1, End>::combine(dst, src1, src2);
+    }
+};
+
+template<unsigned End>
+struct combine_unroll<End, End> {
+    template<class VR, class VS>
+    static SIMDPP_INL void combine(VR&, const VS&, const VS&) {}
+};
+
+
 template<class V, class H> SIMDPP_INL
 V i_combine(const H& a1, const H& a2)
 {
     V r;
-    unsigned h = H::vec_length;
-    for (unsigned i = 0; i < h; ++i) { r.vec(i) = a1.vec(i); }
-    for (unsigned i = 0; i < h; ++i) { r.vec(i+h) = a2.vec(i); }
+    combine_unroll<0, H::vec_length>::combine(r, a1, a2);
     return r;
 }
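
The new `combine_unroll` replaces a runtime loop with recursive template instantiation, so every sub-vector index becomes a compile-time constant and the loop is fully unrolled when the template is instantiated. The same pattern in isolation, as a self-contained sketch over plain arrays (array types here stand in for the vector classes):

    #include <array>
    #include <cstdio>

    // Recursive case: copy element I from each source into the destination,
    // then recurse with I + 1. Each instantiation fixes I at compile time.
    template<unsigned I, unsigned End>
    struct combine_unroll {
        template<class VR, class VS>
        static void combine(VR& dst, const VS& s1, const VS& s2)
        {
            dst[I] = s1[I];
            dst[I + End] = s2[I];
            combine_unroll<I + 1, End>::combine(dst, s1, s2);
        }
    };

    // Base case: I == End terminates the recursion.
    template<unsigned End>
    struct combine_unroll<End, End> {
        template<class VR, class VS>
        static void combine(VR&, const VS&, const VS&) {}
    };

    int main() {
        std::array<int, 2> a{1, 2}, b{3, 4};
        std::array<int, 4> r{};
        combine_unroll<0, 2>::combine(r, a, b);   // r = {1, 2, 3, 4}
        std::printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);
    }

Since C++17 the same unrolling could be written with `if constexpr` or a fold over an index sequence; the struct-recursion form has the advantage of working on older language standards, which presumably matters for a portability library.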

simdpp/detail/insn/conv_any_to_float32.h (+22 -22)

@@ -37,29 +37,29 @@ float32<4> i_to_float32(const float64<4>& a)
     return _mm256_cvtpd_ps(a.native());
 #elif SIMDPP_USE_SSE2
     float32x4 r1, r2;
-    r1 = _mm_cvtpd_ps(a.vec(0).native());
-    r2 = _mm_cvtpd_ps(a.vec(1).native());
+    r1 = _mm_cvtpd_ps(a.vec<0>().native());
+    r2 = _mm_cvtpd_ps(a.vec<1>().native());
     return _mm_movelh_ps(r1.native(), r2.native());
 #elif SIMDPP_USE_NEON64
     float32<4> r;
-    r = vcvt_high_f32_f64(vcvt_f32_f64(a.vec(0).native()),
-                          a.vec(1).native());
+    r = vcvt_high_f32_f64(vcvt_f32_f64(a.vec<0>().native()),
+                          a.vec<1>().native());
     return r;
 #elif SIMDPP_USE_VSX_206
     float32<4> lo, hi;
     uint32<4> shuffle_mask;
-    lo = __builtin_vsx_xvcvdpsp(a.vec(0).native());
-    hi = __builtin_vsx_xvcvdpsp(a.vec(1).native());
+    lo = __builtin_vsx_xvcvdpsp(a.vec<0>().native());
+    hi = __builtin_vsx_xvcvdpsp(a.vec<1>().native());
     shuffle_mask = make_shuffle_bytes16_mask<0,2,4,6>(shuffle_mask);
     return shuffle_bytes16(lo, hi, shuffle_mask);
 #elif SIMDPP_USE_MSA
-    return __msa_fexdo_w(a.vec(0).native(), a.vec(1).native());
+    return __msa_fexdo_w(a.vec<0>().native(), a.vec<1>().native());
 #elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
     detail::mem_block<float32x4> r;
-    r[0] = float(a.vec(0).el(0));
-    r[1] = float(a.vec(0).el(1));
-    r[2] = float(a.vec(1).el(0));
-    r[3] = float(a.vec(1).el(1));
+    r[0] = float(a.vec<0>().el(0));
+    r[1] = float(a.vec<0>().el(1));
+    r[2] = float(a.vec<1>().el(0));
+    r[3] = float(a.vec<1>().el(1));
     return r;
 #endif
 }

@@ -72,8 +72,8 @@ float32<8> i_to_float32(const float64<8>& a)
     return _mm512_cvt_roundpd_ps(a.native(), (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
 #else
     float32x4 r1, r2;
-    r1 = i_to_float32(a.vec(0));
-    r2 = i_to_float32(a.vec(1));
+    r1 = i_to_float32(a.vec<0>());
+    r2 = i_to_float32(a.vec<1>());
     return combine(r1, r2);
 #endif
 }

@@ -84,8 +84,8 @@ static SIMDPP_INL
 float32<16> i_to_float32(const float64<16>& a)
 {
     float32<8> r1, r2;
-    r1 = i_to_float32(a.vec(0));
-    r2 = i_to_float32(a.vec(1));
+    r1 = i_to_float32(a.vec<0>());
+    r2 = i_to_float32(a.vec<1>());
     return combine(r1, r2);
 }
 #endif

@@ -155,8 +155,8 @@ static SIMDPP_INL
 float32<16> i_to_float32(const int64<16>& a)
 {
 #if SIMDPP_USE_AVX512DQ
-    float32<8> r0 = _mm512_cvtepi64_ps(a.vec(0).native());
-    float32<8> r1 = _mm512_cvtepi64_ps(a.vec(1).native());
+    float32<8> r0 = _mm512_cvtepi64_ps(a.vec<0>().native());
+    float32<8> r1 = _mm512_cvtepi64_ps(a.vec<1>().native());
     return combine(r0, r1);
 #else
     return i_to_float32(i_to_float64(a));

@@ -205,8 +205,8 @@ static SIMDPP_INL
 float32<16> i_to_float32(const uint64<16>& a)
 {
 #if SIMDPP_USE_AVX512DQ
-    float32<8> r0 = _mm512_cvtepu64_ps(a.vec(0).native());
-    float32<8> r1 = _mm512_cvtepu64_ps(a.vec(1).native());
+    float32<8> r0 = _mm512_cvtepu64_ps(a.vec<0>().native());
+    float32<8> r1 = _mm512_cvtepu64_ps(a.vec<1>().native());
     return combine(r0, r1);
 #else
     return i_to_float32(i_to_float64(a));

@@ -258,8 +258,8 @@ float32x8 i_to_float32(const int32x8& a)
     return _mm256_cvtepi32_ps(a.native());
 #else
     __m256i a1;
-    a1 = _mm256_castsi128_si256(a.vec(0).native());
-    a1 = _mm256_insertf128_si256(a1, a.vec(1).native(), 1);
+    a1 = _mm256_castsi128_si256(a.vec<0>().native());
+    a1 = _mm256_insertf128_si256(a1, a.vec<1>().native(), 1);
     return _mm256_cvtepi32_ps(a1);
 #endif
 }

@@ -361,7 +361,7 @@ float32x8 i_to_float32(const uint32x8& a)
     float32<8> f_a = _mm256_cvtepi32_ps(a.native());
     return add(f_a, bit_and(is_large, splat<float32<8>>(0x100000000)));
 #else
-    return combine(i_to_float32(a.vec(0)), i_to_float32(a.vec(1)));
+    return combine(i_to_float32(a.vec<0>()), i_to_float32(a.vec<1>()));
 #endif
 }
 #endif
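
Aside from the per-ISA intrinsics, every wide overload in this file follows the same recursive shape: convert each half obtained via `vec<0>()`/`vec<1>()`, then stitch the results with `combine`. A scalar sketch of that shape, with `std::array` standing in for the vector types (the `to_float32` helpers below are stand-ins, not the library's API):

    #include <array>
    #include <cstdio>

    using f64x2 = std::array<double, 2>;
    using f32x4 = std::array<float, 4>;

    // Base case: narrow a pair of doubles into the low half of a float vector.
    static f32x4 to_float32(const f64x2& a)
    {
        return {{float(a[0]), float(a[1]), 0.0f, 0.0f}};
    }

    // Wide case: convert each half independently, then combine the results.
    // Mirrors i_to_float32(a.vec<0>()) / i_to_float32(a.vec<1>()) + combine().
    static f32x4 to_float32(const std::array<f64x2, 2>& a)
    {
        f32x4 lo = to_float32(a[0]);
        f32x4 hi = to_float32(a[1]);
        return {{lo[0], lo[1], hi[0], hi[1]}};   // combine(lo, hi)
    }

    int main() {
        std::array<f64x2, 2> a{{{1.5, 2.5}, {3.5, 4.5}}};
        f32x4 r = to_float32(a);
        std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);
    }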

simdpp/detail/insn/conv_any_to_float64.h (+30 -30)

@@ -65,10 +65,10 @@ float64x4 i_to_float64(const float32x4& a)
 #elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
     detail::mem_block<float32x4> ax(a);
     float64x4 r;
-    r.vec(0).el(0) = double(ax[0]);
-    r.vec(0).el(1) = double(ax[1]);
-    r.vec(1).el(0) = double(ax[2]);
-    r.vec(1).el(1) = double(ax[3]);
+    r.vec<0>().el(0) = double(ax[0]);
+    r.vec<0>().el(1) = double(ax[1]);
+    r.vec<1>().el(0) = double(ax[2]);
+    r.vec<1>().el(1) = double(ax[3]);
     return r;
 #endif
 }

@@ -152,31 +152,31 @@ float64x4 i_to_float64(const int32x4& a)
 #elif SIMDPP_USE_VSX_206
 #if SIMDPP_USE_VSX_207
     int64<4> a64 = i_to_int64(a);
-    __vector int64_t b0 = a64.vec(0).native();
-    __vector int64_t b1 = a64.vec(1).native();
+    __vector int64_t b0 = a64.vec<0>().native();
+    __vector int64_t b1 = a64.vec<1>().native();
 #else
     int32<4> sign = shift_r<31>(a);
     __vector int64_t b0 = (__vector int64_t) vec_mergeh(a.native(), sign.native());
     __vector int64_t b1 = (__vector int64_t) vec_mergel(a.native(), sign.native());
 #endif

     float64<4> r;
-    r.vec(0) = vec_ctf(b0, 0);
-    r.vec(1) = vec_ctf(b1, 0);
+    r.vec<0>() = vec_ctf(b0, 0);
+    r.vec<1>() = vec_ctf(b1, 0);
     return r;
 #elif SIMDPP_USE_MSA
     int64<4> a64 = i_to_int64(a);
     float64<4> r;
-    r.vec(0) = __msa_ffint_s_d(a64.vec(0).native());
-    r.vec(1) = __msa_ffint_s_d(a64.vec(1).native());
+    r.vec<0>() = __msa_ffint_s_d(a64.vec<0>().native());
+    r.vec<1>() = __msa_ffint_s_d(a64.vec<1>().native());
     return r;
 #elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
     detail::mem_block<int32x4> ax(a);
     float64x4 r;
-    r.vec(0).el(0) = double(ax[0]);
-    r.vec(0).el(1) = double(ax[1]);
-    r.vec(1).el(0) = double(ax[2]);
-    r.vec(1).el(1) = double(ax[3]);
+    r.vec<0>().el(0) = double(ax[0]);
+    r.vec<0>().el(1) = double(ax[1]);
+    r.vec<1>().el(0) = double(ax[2]);
+    r.vec<1>().el(1) = double(ax[3]);
     return r;
 #endif
 }

@@ -254,8 +254,8 @@ float64<4> i_to_float64(const uint32<4>& a)
 #if SIMDPP_USE_AVX
     f = _mm256_cvtepi32_pd(a.native());
 #else
-    f.vec(0) = _mm_cvtepi32_pd(a.native());
-    f.vec(1) = _mm_cvtepi32_pd(move4_l<2>(a).native());
+    f.vec<0>() = _mm_cvtepi32_pd(a.native());
+    f.vec<1>() = _mm_cvtepi32_pd(move4_l<2>(a).native());
 #endif
     // if result is negative, we converted integer larger than 0x7fffffff
     mask_float64<4> is_large = cmp_lt(f, 0);

@@ -268,31 +268,31 @@ float64<4> i_to_float64(const uint32<4>& a)
 #elif SIMDPP_USE_VSX_206
 #if SIMDPP_USE_VSX_207
     uint64<4> a64 = i_to_uint64(a);
-    __vector uint64_t b0 = a64.vec(0).native();
-    __vector uint64_t b1 = a64.vec(1).native();
+    __vector uint64_t b0 = a64.vec<0>().native();
+    __vector uint64_t b1 = a64.vec<1>().native();
 #else
     uint32<4> zero = make_zero();
     __vector uint64_t b0 = (__vector uint64_t) vec_mergeh(a.native(), zero.native());
     __vector uint64_t b1 = (__vector uint64_t) vec_mergel(a.native(), zero.native());
 #endif

     float64<4> r;
-    r.vec(0) = vec_ctf(b0, 0);
-    r.vec(1) = vec_ctf(b1, 0);
+    r.vec<0>() = vec_ctf(b0, 0);
+    r.vec<1>() = vec_ctf(b1, 0);
     return r;
 #elif SIMDPP_USE_MSA
     uint64<4> a64 = i_to_uint64(a);
     float64<4> r;
-    r.vec(0) = __msa_ffint_u_d(a64.vec(0).native());
-    r.vec(1) = __msa_ffint_u_d(a64.vec(1).native());
+    r.vec<0>() = __msa_ffint_u_d(a64.vec<0>().native());
+    r.vec<1>() = __msa_ffint_u_d(a64.vec<1>().native());
     return r;
 #elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
     detail::mem_block<uint32<4>> ax(a);
     float64x4 r;
-    r.vec(0).el(0) = double(ax[0]);
-    r.vec(0).el(1) = double(ax[1]);
-    r.vec(1).el(0) = double(ax[2]);
-    r.vec(1).el(1) = double(ax[3]);
+    r.vec<0>().el(0) = double(ax[0]);
+    r.vec<0>().el(1) = double(ax[1]);
+    r.vec<1>().el(0) = double(ax[2]);
+    r.vec<1>().el(1) = double(ax[3]);
     return r;
 #endif
 }

@@ -308,8 +308,8 @@ float64<8> i_to_float64(const uint32<8>& a)
     float64<8> f;
     split(a, a0, a1);

-    f.vec(0) = _mm256_cvtepi32_pd(a0.native());
-    f.vec(1) = _mm256_cvtepi32_pd(a1.native());
+    f.vec<0>() = _mm256_cvtepi32_pd(a0.native());
+    f.vec<1>() = _mm256_cvtepi32_pd(a1.native());

     // if result is negative, we converted integer larger than 0x7fffffff
     mask_float64<8> is_large = cmp_lt(f, 0);

@@ -326,8 +326,8 @@ float64<16> i_to_float64(const uint32<16>& a)
     uint32<8> a0, a1;
     split(a, a0, a1);

-    r.vec(0) = _mm512_cvtepu32_pd(a0.native());
-    r.vec(1) = _mm512_cvtepu32_pd(a1.native());
+    r.vec<0>() = _mm512_cvtepu32_pd(a0.native());
+    r.vec<1>() = _mm512_cvtepu32_pd(a1.native());
     return r;
 }
 #endif
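
Several of the uint32 paths above reuse a signed conversion and then repair it: inputs above 0x7fffffff come out negative from the signed convert, so adding 2^32 wherever the result is negative (the `cmp_lt(f, 0)` mask in the diff) restores the unsigned value. A scalar sketch of that correction (the `uint32_via_signed` helper is hypothetical, for illustration only):

    #include <cstdint>
    #include <cstdio>

    // Convert a uint32 through a signed int32 conversion, then correct.
    // Mirrors the cmp_lt(f, 0) mask plus 0x100000000 fixup in the diff above.
    static double uint32_via_signed(uint32_t x)
    {
        double f = double(int32_t(x));   // values > 0x7fffffff come out negative
        if (f < 0)
            f += 4294967296.0;           // add 2^32 to restore the unsigned value
        return f;
    }

    int main() {
        std::printf("%.0f\n", uint32_via_signed(5u));           // 5
        std::printf("%.0f\n", uint32_via_signed(0xFFFFFFFFu));  // 4294967295
    }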

simdpp/detail/insn/conv_extend_to_int16.h (+6 -6)

@@ -47,8 +47,8 @@ SIMDPP_INL uint16<16> i_to_uint16(const uint8<16>& a)
     return combine(r1, r2);
 #elif SIMDPP_USE_NEON
     uint16x16 r;
-    r.vec(0) = vmovl_u8(vget_low_u8(a.native()));
-    r.vec(1) = vmovl_u8(vget_high_u8(a.native()));
+    r.vec<0>() = vmovl_u8(vget_low_u8(a.native()));
+    r.vec<1>() = vmovl_u8(vget_high_u8(a.native()));
     return r;
 #elif (SIMDPP_USE_ALTIVEC && SIMDPP_BIG_ENDIAN)
     uint16x8 r1, r2;

@@ -141,8 +141,8 @@ SIMDPP_INL int16x16 i_to_int16(const int8x16& a)
     return combine(r1, r2);
 #elif SIMDPP_USE_NEON
     int16x16 r;
-    r.vec(0) = vmovl_s8(vget_low_s8(a.native()));
-    r.vec(1) = vmovl_s8(vget_high_s8(a.native()));
+    r.vec<0>() = vmovl_s8(vget_low_s8(a.native()));
+    r.vec<1>() = vmovl_s8(vget_high_s8(a.native()));
     return r;
 #elif SIMDPP_USE_MSA
     int8x16 sign = shift_r<7>(a);

@@ -152,8 +152,8 @@ SIMDPP_INL int16x16 i_to_int16(const int8x16& a)
     return combine(lo, hi);
 #elif SIMDPP_USE_ALTIVEC
     int16x16 r;
-    r.vec(0) = vec_unpackh(a.vec(0).native());
-    r.vec(1) = vec_unpackl(a.vec(0).native());
+    r.vec<0>() = vec_unpackh(a.vec<0>().native());
+    r.vec<1>() = vec_unpackl(a.vec<0>().native());
     return r;
 #endif
 }
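
The widening conversions use the half-at-a-time shape in the other direction: the low half of the source widens into `vec<0>()` and the high half into `vec<1>()` (on NEON, `vmovl_u8` over `vget_low_u8`/`vget_high_u8`). A scalar sketch of the zero-extending case, with plain arrays standing in for the vector types (the `widen_u8_to_u16` helper is hypothetical):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Widen 16 x u8 into two 8 x u16 halves, as the NEON branch does with
    // vmovl_u8(vget_low_u8(...)) / vmovl_u8(vget_high_u8(...)).
    static void widen_u8_to_u16(const std::array<uint8_t, 16>& a,
                                std::array<uint16_t, 8>& lo,
                                std::array<uint16_t, 8>& hi)
    {
        for (int i = 0; i < 8; ++i) {
            lo[i] = a[i];        // zero-extend the low half  -> r.vec<0>()
            hi[i] = a[i + 8];    // zero-extend the high half -> r.vec<1>()
        }
    }

    int main() {
        std::array<uint8_t, 16> a{};
        a[0] = 200; a[8] = 255;
        std::array<uint16_t, 8> lo{}, hi{};
        widen_u8_to_u16(a, lo, hi);
        std::printf("%u %u\n", unsigned(lo[0]), unsigned(hi[0]));   // 200 255
    }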
