Skip to content

Commit

Permalink
Merge pull request #20 from mklarqvist/cleanup
Browse files Browse the repository at this point in the history
General cleanup: function names as used in the paper. Fix compilation issues for Windows.
  • Loading branch information
mklarqvist authored Apr 11, 2019
2 parents 35ba0ad + ee72306 commit 76aa5f8
Show file tree
Hide file tree
Showing 10 changed files with 456 additions and 419 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,6 @@
example
bench
instrumented_benchmark
.vscode
CMakeFiles
Testing
20 changes: 20 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@

# Build script for the pospopcnt benchmark (`bench`), the usage example
# (`example`) and the instrumented benchmark (`instrumented_benchmark`),
# plus installation of the public header.
#
# 3.5 is the oldest version current CMake releases still accept as a minimum,
# and it is new enough for CMAKE_CXX_STANDARD (3.1) to actually take effect.
cmake_minimum_required(VERSION 3.5...3.27)
project(libppospopcnt C CXX)

# Default to an optimized build, but only when the user did not pick a build
# type and the generator is single-config. Multi-config generators (Visual
# Studio, Xcode) select the configuration at build time via `--config`.
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  set(CMAKE_BUILD_TYPE Release CACHE STRING
      "Build type (Debug, Release, RelWithDebInfo, MinSizeRel)" FORCE)
endif()

# -march=native ties the binaries to the build host's CPU, so it is a switch
# (ON by default to keep the previous behaviour) rather than a hard-coded flag.
option(POSPOPCNT_NATIVE_ARCH
       "Compile for the build host's instruction set (-march=native)" ON)

set(CMAKE_CXX_STANDARD 11) # enable C++11 standard
set(CMAKE_CXX_STANDARD_REQUIRED ON)

add_executable(bench pospopcnt.c benchmark.cpp)
add_executable(example pospopcnt.c example.c)
add_executable(instrumented_benchmark pospopcnt.c benchmark/linux/instrumented_benchmark.cpp)

# Per-target settings instead of global CMAKE_<LANG>_FLAGS / include_directories:
# user- and toolchain-provided flags are preserved, and the optimization level
# comes from the selected build type (Release by default).
foreach(pospopcnt_target bench example instrumented_benchmark)
  target_include_directories(${pospopcnt_target} PRIVATE
    "${CMAKE_CURRENT_SOURCE_DIR}"
    "${CMAKE_CURRENT_SOURCE_DIR}/benchmark/linux"
  )
  # -march=native is a GCC/Clang-style switch; MSVC-style drivers do not accept it.
  if(POSPOPCNT_NATIVE_ARCH AND NOT MSVC)
    target_compile_options(${pospopcnt_target} PRIVATE -march=native)
  endif()
endforeach()

enable_testing()
add_test(NAME example COMMAND example)
add_test(NAME bench COMMAND bench 100000 10)
add_test(NAME instrumented_benchmark COMMAND instrumented_benchmark -n 100000 -i 10)

# A relative DESTINATION is resolved against the install prefix at install
# time, so `cmake --install --prefix`, DESTDIR and CPack all keep working.
include(GNUInstallDirs)
install(FILES pospopcnt.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/pospopcnt")
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
OPTFLAGS := -O3 -march=native
CFLAGS = -std=c99 $(OPTFLAGS) $(DEBUG_FLAGS)
CPPFLAGS = -std=c++0x $(OPTFLAGS) $(DEBUG_FLAGS)
CPP_SOURCE = main.cpp benchmark/linux/instrumented_benchmark.cpp
CPP_SOURCE = benchmark.cpp benchmark/linux/instrumented_benchmark.cpp
C_SOURCE = pospopcnt.c example.c
OBJECTS = $(CPP_SOURCE:.cpp=.o) $(C_SOURCE:.c=.o)

Expand All @@ -36,8 +36,8 @@ all: bench
benchmark/linux/instrumented_benchmark.o : benchmark/linux/instrumented_benchmark.cpp
$(CXX) $(CPPFLAGS) -I. -Ibenchmark/linux -c -o $@ $<

bench: pospopcnt.o main.o
$(CXX) $(CPPFLAGS) pospopcnt.o main.o -o bench
bench: pospopcnt.o benchmark.o
$(CXX) $(CPPFLAGS) pospopcnt.o benchmark.o -o bench

itest: instrumented_benchmark
$(CXX) --version
Expand Down
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,19 @@ This benchmark shows the speedup of the 3 `pospopcnt` algorithms used on x86 CPU

| Algorithm | 128 | 256 | 512 | 1024 | 2048 | 4096 | 8192 | 65536 |
|-----------------------------------|------|------|------|------|------|------|------|-------|
| pospopcnt_u16_sse_mula_unroll8 | **2.09** | 3.16 | 2.35 | 1.88 | 1.67 | 1.56 | 1.5 | 1.44 |
| pospopcnt_u16_avx512_mula_unroll8 | 1.78 | **3.61** | **3.61** | 3.59 | 3.68 | 3.65 | 3.67 | 3.7 |
| pospopcnt_u16_avx512_mula3 | 0.77 | 0.9 | 3.24 | **3.96** | **4.96** | 5.87 | 6.52 | 7.24 |
| pospopcnt_u16_avx512_csa | 0.52 | 0.74 | 1.83 | 2.64 | 4.06 | **6.43** | **9.41** | **16.28** |
| pospopcnt_u16_sse_blend_popcnt_unroll8 | **2.09** | 3.16 | 2.35 | 1.88 | 1.67 | 1.56 | 1.5 | 1.44 |
| pospopcnt_u16_avx512_blend_popcnt_unroll8 | 1.78 | **3.61** | **3.61** | 3.59 | 3.68 | 3.65 | 3.67 | 3.7 |
| pospopcnt_u16_avx512_adder_forest | 0.77 | 0.9 | 3.24 | **3.96** | **4.96** | 5.87 | 6.52 | 7.24 |
| pospopcnt_u16_avx512_harvey_seal | 0.52 | 0.74 | 1.83 | 2.64 | 4.06 | **6.43** | **9.41** | **16.28** |

Compared to a naive unvectorized solution (`pospopcnt_u16_scalar_naive_nosimd`):

| Algorithm | 128 | 256 | 512 | 1024 | 2048 | 4096 | 8192 | 65536 |
|-----------------------------------|------|-------|-------|-------|-------|-------|-------|--------|
| pospopcnt_u16_sse_mula_unroll8 | **8.28** | 9.84 | 10.55 | 11 | 11.58 | 11.93 | 12.13 | 12.28 |
| pospopcnt_u16_avx512_mula_unroll8 | 7.07 | **11.25** | **16.21** | 21 | 25.49 | 27.91 | 29.73 | 31.55 |
| pospopcnt_u16_avx512_mula3 | 3.05 | 2.82 | 14.53 | **23.13** | **34.37** | 44.91 | 52.78 | 61.68 |
| pospopcnt_u16_avx512_csa | 2.07 | 2.3 | 8.21 | 15.41 | 28.17 | **49.14** | **76.11** | **138.71** |
| pospopcnt_u16_sse_blend_popcnt_unroll8 | **8.28** | 9.84 | 10.55 | 11 | 11.58 | 11.93 | 12.13 | 12.28 |
| pospopcnt_u16_avx512_blend_popcnt_unroll8 | 7.07 | **11.25** | **16.21** | 21 | 25.49 | 27.91 | 29.73 | 31.55 |
| pospopcnt_u16_avx512_adder_forest | 3.05 | 2.82 | 14.53 | **23.13** | **34.37** | 44.91 | 52.78 | 61.68 |
| pospopcnt_u16_avx512_harvey_seal | 2.07 | 2.3 | 8.21 | 15.41 | 28.17 | **49.14** | **76.11** | **138.71** |

The host architecture used is a 10 nm Cannon Lake [Core i3-8121U](https://ark.intel.com/content/www/us/en/ark/products/136863/intel-core-i3-8121u-processor-4m-cache-up-to-3-20-ghz.html) with gcc (GCC) 7.3.1 20180303 (Red Hat 7.3.1-5).

Expand Down
14 changes: 6 additions & 8 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,9 @@ platform:
- x64

build_script:
- cmd: if [%BUILD_ENVIRONMENT%]==[cygwin] (
C:\cygwin\bin\bash -l -c "make;" )
- cmd: if [%BUILD_ENVIRONMENT%]==[cygwin64] (
C:\cygwin64\bin\bash -l -c "make;" )
- cmd: if [%BUILD_ENVIRONMENT%]==[mingw] (
C:\MinGW\msys\1.0\bin\bash -l -c "make;" )
- cmd: if [%BUILD_ENVIRONMENT%]==[mingw-w64] (
C:\msys64\usr\bin\bash -l -c "make;" )
- if "%platform%" == "x86" cmake -G "Visual Studio 15 2017" .
- if "%platform%" == "x64" cmake -G "Visual Studio 15 2017 Win64" .
- cmake --build . --config Release

test_script:
- ctest -C Release
18 changes: 16 additions & 2 deletions main.cpp → benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
#include <cassert>//assert
#include <cstring>//memset

#ifdef _MSC_VER
# include <intrin.h>
#else
# include <x86intrin.h>
#endif

#include "pospopcnt.h"

inline void* aligned_malloc(size_t size, size_t align) {
Expand Down Expand Up @@ -43,8 +49,12 @@ struct bench_unit {

// Returns the current value of the CPU time-stamp counter as a 64-bit integer.
// GCC/Clang-style compilers use inline assembly; MSVC uses the __rdtsc() intrinsic.
uint64_t get_cpu_cycles() {
uint64_t result;
#ifndef _MSC_VER
// ".byte 15;.byte 49" emits the raw opcode bytes 0x0F 0x31 (RDTSC), which
// leaves the low half of the counter in EAX and the high half in EDX.
// shlq/orq then merge the two halves into RAX, which the "=a" output
// constraint binds to `result`; RDX is declared as clobbered.
__asm__ volatile(".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax":"=a"
(result)::"%rdx");
#else
// MSVC has no inline assembly on x64, so read the counter via the intrinsic.
result = __rdtsc();
#endif
return result;
};

Expand Down Expand Up @@ -104,6 +114,7 @@ int pospopcnt_u16_wrapper(pospopcnt_u16_method_type f, int id, int iterations,
std::vector<uint64_t> clocks;
std::vector<uint32_t> times;

#ifndef _MSC_VER
// Intel guide:
// @see: https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
asm volatile ("CPUID\n\t"
Expand All @@ -122,6 +133,7 @@ asm volatile("RDTSCP\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t"
"CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx");
#endif

for (int i = 0; i < iterations; ++i) {
memset(counters, 0, sizeof(uint32_t)*16);
Expand All @@ -139,19 +151,21 @@ asm volatile("RDTSCP\n\t"
/*at this stage we exclusively own the CPU*/
#endif

#ifndef _MSC_VER
asm volatile ("CPUID\n\t"
"RDTSC\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx");

#endif
// Call argument subroutine pointer.
(*f)(data, n, counters);

#ifndef _MSC_VER
asm volatile("RDTSCP\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t"
"CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx");

#endif
#ifdef __linux__
// raw_local_irq_restore(flags);/*we enable hard interrupts on our CPU*/
// preempt_enable();/*we enable preemption*/
Expand Down
40 changes: 22 additions & 18 deletions benchmark/linux/instrumented_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,36 +29,36 @@ pospopcnt_u16_method_type pospopcnt_u16_methods[] = {
pospopcnt_u16_scalar_umul128,
pospopcnt_u16_scalar_umul128_unroll2,
pospopcnt_u16_sse_single,
pospopcnt_u16_sse_mula,
pospopcnt_u16_sse_mula_unroll4,
pospopcnt_u16_sse_mula_unroll8,
pospopcnt_u16_sse_mula_unroll16,
pospopcnt_u16_sse_blend_popcnt,
pospopcnt_u16_sse_blend_popcnt_unroll4,
pospopcnt_u16_sse_blend_popcnt_unroll8,
pospopcnt_u16_sse_blend_popcnt_unroll16,
pospopcnt_u16_sse_sad,
pospopcnt_u16_sse_csa,
pospopcnt_u16_sse_harvey_seal,
pospopcnt_u16_avx2_popcnt,
pospopcnt_u16_avx2,
pospopcnt_u16_avx2_naive_counter,
pospopcnt_u16_avx2_single,
pospopcnt_u16_avx2_lemire,
pospopcnt_u16_avx2_lemire2,
pospopcnt_u16_avx2_mula,
pospopcnt_u16_avx2_mula_unroll4,
pospopcnt_u16_avx2_mula_unroll8,
pospopcnt_u16_avx2_mula_unroll16,
pospopcnt_u16_avx2_mula3,
pospopcnt_u16_avx2_csa,
pospopcnt_u16_avx2_blend_popcnt,
pospopcnt_u16_avx2_blend_popcnt_unroll4,
pospopcnt_u16_avx2_blend_popcnt_unroll8,
pospopcnt_u16_avx2_blend_popcnt_unroll16,
pospopcnt_u16_avx2_adder_forest,
pospopcnt_u16_avx2_harvey_seal,
pospopcnt_u16_avx512,
pospopcnt_u16_avx512bw_popcnt32_mask,
pospopcnt_u16_avx512bw_popcnt64_mask,
pospopcnt_u16_avx512_masked_ops,
pospopcnt_u16_avx512_popcnt,
pospopcnt_u16_avx512bw_mula,
pospopcnt_u16_avx512bw_mula_unroll4,
pospopcnt_u16_avx512bw_mula_unroll8,
pospopcnt_u16_avx512bw_blend_popcnt,
pospopcnt_u16_avx512bw_blend_popcnt_unroll4,
pospopcnt_u16_avx512bw_blend_popcnt_unroll8,
pospopcnt_u16_avx512_mula2,
pospopcnt_u16_avx512bw_mula3,
pospopcnt_u16_avx512bw_csa,
pospopcnt_u16_avx512vbmi_csa};
pospopcnt_u16_avx512bw_adder_forest,
pospopcnt_u16_avx512bw_harvey_seal,
pospopcnt_u16_avx512vbmi_harvey_seal};

void print16(uint32_t *flags) {
for (int k = 0; k < 16; k++)
Expand Down Expand Up @@ -188,6 +188,7 @@ bool benchmark(uint32_t n, uint32_t iterations, pospopcnt_u16_method_type fn, bo
return isok;
}

#if POSPOPCNT_SIMD_VERSION >= 5
void measurepopcnt(uint32_t n, uint32_t iterations, bool verbose) {
std::vector<int> evts;
std::vector<uint16_t> vdata(n);
Expand Down Expand Up @@ -252,6 +253,7 @@ void measurepopcnt(uint32_t n, uint32_t iterations, bool verbose) {
printf("cycles per 16-bit word: %4.3f; ref cycles per 16-bit word: %4.3f \n", double(mins[0]) / n, double(mins[5]) / n);
}
}
#endif

static void print_usage(char *command) {
printf(" Try %s -n 100000 -i 15 -v \n", command);
Expand Down Expand Up @@ -315,8 +317,10 @@ int main(int argc, char **argv) {
printf("array size: %.3f MB\n", array_in_bytes / (1024 * 1024.));
}

#if POSPOPCNT_SIMD_VERSION >= 5
measurepopcnt(n, iterations, verbose);

#endif

for (size_t k = 0; k < PPOPCNT_NUMBER_METHODS; k++) {
printf("%-40s\t", pospopcnt_u16_method_names[k]);
fflush(NULL);
Expand Down
1 change: 1 addition & 0 deletions example.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // memcpy

#include "pospopcnt.h"

Expand Down
Loading

0 comments on commit 76aa5f8

Please sign in to comment.