diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index af623f69611d0..c38db23eb0089 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -38,8 +38,8 @@ namespace arrow::util::internal { #if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2) template -void ByteStreamSplitDecode128B(const uint8_t* data, int64_t num_values, int64_t stride, - uint8_t* out) { +void ByteStreamSplitDecodeSimd128(const uint8_t* data, int64_t num_values, int64_t stride, + uint8_t* out) { using simd_batch = xsimd::make_sized_batch_t; static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); @@ -92,8 +92,8 @@ void ByteStreamSplitDecode128B(const uint8_t* data, int64_t num_values, int64_t } template -void ByteStreamSplitEncode128B(const uint8_t* raw_values, const int64_t num_values, - uint8_t* output_buffer_raw) { +void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, const int64_t num_values, + uint8_t* output_buffer_raw) { using simd_batch = xsimd::make_sized_batch_t; using simd_arch = typename simd_batch::arch_type; @@ -125,10 +125,10 @@ void ByteStreamSplitEncode128B(const uint8_t* raw_values, const int64_t num_valu // Example run for 32-bit variables: // Step 0, copy: // 0: ABCD ABCD ABCD ABCD 1: ABCD ABCD ABCD ABCD ... - // Step 1: simd_batch::xip_lo and simd_batch::xip_hi: + // Step 1: simd_batch::zip_lo and simd_batch::zip_hi: // 0: AABB CCDD AABB CCDD 1: AABB CCDD AABB CCDD ... // 0: AAAA BBBB CCCC DDDD 1: AAAA BBBB CCCC DDDD ... - // Step 3: simd_batch::xip_lo and simd_batch::xip_hi: + // Step 3: simd_batch::zip_lo and simd_batch::zip_hi: // 0: AAAA AAAA BBBB BBBB 1: CCCC CCCC DDDD DDDD ... // Step 4: simd_batch and simd_batch: // 0: AAAA AAAA AAAA AAAA 1: BBBB BBBB BBBB BBBB ... @@ -223,7 +223,7 @@ void ByteStreamSplitDecodeAvx2(const uint8_t* data, int64_t num_values, int64_t const int64_t size = num_values * kNumStreams; if (size < kBlockSize) // Back to SSE for small size - return ByteStreamSplitDecode128B(data, num_values, stride, out); + return ByteStreamSplitDecodeSimd128(data, num_values, stride, out); const int64_t num_blocks = size / kBlockSize; // First handle suffix. @@ -305,13 +305,13 @@ void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const int64_t num_valu constexpr int kBlockSize = sizeof(__m256i) * kNumStreams; if constexpr (kNumStreams == 8) // Back to SSE, currently no path for double. - return ByteStreamSplitEncode128B(raw_values, num_values, - output_buffer_raw); + return ByteStreamSplitEncodeSimd128(raw_values, num_values, + output_buffer_raw); const int64_t size = num_values * kNumStreams; if (size < kBlockSize) // Back to SSE for small size - return ByteStreamSplitEncode128B(raw_values, num_values, - output_buffer_raw); + return ByteStreamSplitEncodeSimd128(raw_values, num_values, + output_buffer_raw); const int64_t num_blocks = size / kBlockSize; const __m256i* raw_values_simd = reinterpret_cast(raw_values); __m256i* output_buffer_streams[kNumStreams]; @@ -378,7 +378,7 @@ void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values, #if defined(ARROW_HAVE_AVX2) return ByteStreamSplitDecodeAvx2(data, num_values, stride, out); #elif defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON) - return ByteStreamSplitDecode128B(data, num_values, stride, out); + return ByteStreamSplitDecodeSimd128(data, num_values, stride, out); #else #error "ByteStreamSplitDecodeSimd not implemented" #endif @@ -391,8 +391,8 @@ void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, const int64_t n return ByteStreamSplitEncodeAvx2(raw_values, num_values, output_buffer_raw); #elif defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON) - return ByteStreamSplitEncode128B(raw_values, num_values, - output_buffer_raw); + return ByteStreamSplitEncodeSimd128(raw_values, num_values, + output_buffer_raw); #else #error "ByteStreamSplitEncodeSimd not implemented" #endif diff --git a/cpp/src/arrow/util/byte_stream_split_test.cc b/cpp/src/arrow/util/byte_stream_split_test.cc index 607e2191f28d7..83ed8c9ba5fcd 100644 --- a/cpp/src/arrow/util/byte_stream_split_test.cc +++ b/cpp/src/arrow/util/byte_stream_split_test.cc @@ -73,10 +73,8 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { #if defined(ARROW_HAVE_SIMD_SPLIT) encode_funcs_.push_back({"simd", &ByteStreamSplitEncodeSimd}); decode_funcs_.push_back({"simd", &ByteStreamSplitDecodeSimd}); -#endif -#if defined(ARROW_HAVE_SSE4_2) - encode_funcs_.push_back({"sse2", &ByteStreamSplitEncode128B}); - decode_funcs_.push_back({"sse2", &ByteStreamSplitDecode128B}); + encode_funcs_.push_back({"simd128", &ByteStreamSplitEncodeSimd128}); + decode_funcs_.push_back({"simd128", &ByteStreamSplitDecodeSimd128}); #endif #if defined(ARROW_HAVE_AVX2) encode_funcs_.push_back({"avx2", &ByteStreamSplitEncodeAvx2}); diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 26dd8c9270fa9..3069e8c9057a9 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -417,22 +417,22 @@ BENCHMARK(BM_ByteStreamSplitEncode_Double_Scalar)->Range(MIN_RANGE, MAX_RANGE); #if defined(ARROW_HAVE_SSE4_2) static void BM_ByteStreamSplitDecode_Float_Sse2(benchmark::State& state) { BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecode128B); + state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); } static void BM_ByteStreamSplitDecode_Double_Sse2(benchmark::State& state) { BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecode128B); + state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); } static void BM_ByteStreamSplitEncode_Float_Sse2(benchmark::State& state) { BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncode128B); + state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128); } static void BM_ByteStreamSplitEncode_Double_Sse2(benchmark::State& state) { BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncode128B); + state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128); } BENCHMARK(BM_ByteStreamSplitDecode_Float_Sse2)->Range(MIN_RANGE, MAX_RANGE); @@ -471,22 +471,22 @@ BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx2)->Range(MIN_RANGE, MAX_RANGE); #if defined(ARROW_HAVE_NEON) static void BM_ByteStreamSplitDecode_Float_Neon(benchmark::State& state) { BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecode128B); + state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); } static void BM_ByteStreamSplitDecode_Double_Neon(benchmark::State& state) { BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecode128B); + state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); } static void BM_ByteStreamSplitEncode_Float_Neon(benchmark::State& state) { BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncode128B); + state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128); } static void BM_ByteStreamSplitEncode_Double_Neon(benchmark::State& state) { BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncode128B); + state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128); } BENCHMARK(BM_ByteStreamSplitDecode_Float_Neon)->Range(MIN_RANGE, MAX_RANGE);