templated multiply benchmark
svpolonsky committed Dec 23, 2023
1 parent e0784d2 commit bdbfb34
Showing 4 changed files with 149 additions and 76 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -3,7 +3,8 @@
ingo_benchmarks.sh

# sql

benchmark.json
benchmark.sql
internal.sql/

# CMake
189 changes: 117 additions & 72 deletions benchmarks/multiply/benchmark.cu
@@ -11,29 +11,24 @@
#include <nvml.h>
#include </opt/benchmark/include/benchmark/benchmark.h>
#include "/icicle/icicle/primitives/field.cuh"
#include "/icicle/icicle/curves/bn254_params.cuh"
#include "/icicle/icicle/curves/bls12_381_params.cuh"
#include "/icicle/icicle/curves/bls12_377_params.cuh"
#include "/icicle/icicle/curves/bw6_761_params.cuh"
typedef Field<bn254::fp_config> bn254_scalar_t;
typedef Field<bn254::fq_config> bn254_point_field_t;
typedef Field<bls12_381::fp_config> bls12_381_scalar_t;
typedef Field<bls12_381::fq_config> bls12_381_point_field_t;
typedef Field<bls12_377::fp_config> bls12_377_scalar_t;
typedef Field<bls12_377::fq_config> bls12_377_point_field_t;
// typedef Field<bw6_761::fp_config> bw6_761_scalar_t;

#if CURVE == CURVE_BN254

#include "/icicle/icicle/curves/bn254/curve_config.cuh"
using namespace BN254;
const std::string curve = "BN254";
// const std::string curve = "BLS12-377";

#elif CURVE == CURVE_BLS12_381

#include "/icicle/icicle/curves/bls12_381/curve_config.cuh"
using namespace BLS12_381;
const std::string curve = "BLS12-381";

#elif CURVE == CURVE_BLS12_377

#include "/icicle/icicle/curves/bls12_377/curve_config.cuh"
using namespace BLS12_377;
const std::string curve = "BLS12-377";

#endif


#include "/icicle/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh"
#define MAX_THREADS_PER_BLOCK 256

template <typename S, int N>
__global__ void vectorMult(S *vec_a, S *vec_b, S *vec_r, size_t n_elments)
@@ -53,40 +48,115 @@ __global__ void vectorMult(S *vec_a, S *vec_b, S *vec_r, size_t n_elments)
template <typename S, int N = 10>
int vector_mult(S *vec_b, S *vec_a, S *vec_result, size_t n_elments)
{
// Set the grid and block dimensions
int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK);
int threads_per_block = MAX_THREADS_PER_BLOCK;

// Call the kernel to perform element-wise modular multiplication
vectorMult<S, N><<<num_blocks, threads_per_block>>>(vec_a, vec_b, vec_result, n_elments);
return 0;
}
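
A sketch of the same launch-size computation with integer ceiling division instead of the float cast (editor's alternative, not part of the commit); it yields the same block count without relying on float rounding for large element counts:

int threads_per_block = MAX_THREADS_PER_BLOCK;
int num_blocks = (int)((n_elments + threads_per_block - 1) / threads_per_block);
vectorMult<S, N><<<num_blocks, threads_per_block>>>(vec_a, vec_b, vec_result, n_elments);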

// typedef scalar_t S;
typedef point_field_t S;
const std::string multiply_type="point_field";
typedef bls12_377_scalar_t S;
//typedef point_field_t S;

const unsigned nof_mult = 100;
unsigned nof_elements = 1 << 25;
S *vec_a;
S *vec_b;
S *d_vec_b;
S *d_vec_a, *d_result;

nvmlDevice_t device;

static void BM_mult(benchmark::State& state) {
for (auto _ : state) {
vector_mult<S, nof_mult>(d_vec_a, d_vec_b, d_result, nof_elements);
cudaDeviceSynchronize();
}
unsigned int power;
nvmlDeviceGetPowerUsage(device, &power);
state.counters["PowerUsage"] = int(1.0e-3 * power);
unsigned int temperature;
nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature);
state.counters["Temperature"] = int(temperature);
}

BENCHMARK(BM_mult)->MinTime(60.);
// best-practice to reduce redundant code:
// https://github.com/google/benchmark/issues/698

template <typename T>
class MyTemplatedFixture : public benchmark::Fixture {
public:
// host data
T *h_vec_a;
T *h_vec_b;
// device data
T *d_vec_a;
T *d_vec_b;
T *d_result;
int* array;
size_t size;

void SetUp(const ::benchmark::State& state) override {
size = static_cast<size_t>(state.range(0));
// expect size to be a power of two, at least 1 << 10
array = new int[size];
h_vec_a = new T[size];
h_vec_b = new T[size];
// Initialize the arrays with random data
for (unsigned i = 0; i < (1 << 10); i++) {
h_vec_a[i] = T::rand_host();
h_vec_b[i] = T::rand_host();
}
for (unsigned i = 1; i < (size >> 10); i++) {
memcpy((void *)(h_vec_a + (i << 10)), (void *)(h_vec_a + ((i-1) << 10)), sizeof(T) << 10);
memcpy((void *)(h_vec_b + (i << 10)), (void *)(h_vec_b + ((i-1) << 10)), sizeof(T) << 10);
}
// Allocate memory on the device
// std::cout << "Moving data to device" << std::endl;
cudaMalloc(&d_vec_a, size * sizeof(T));
cudaMalloc(&d_vec_b, size * sizeof(T));
cudaMalloc(&d_result, size * sizeof(T));
// Copy the data to the device
cudaMemcpy(d_vec_a, h_vec_a, size * sizeof(T), cudaMemcpyHostToDevice);
cudaMemcpy(d_vec_b, h_vec_b, size * sizeof(T), cudaMemcpyHostToDevice);
}
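
SetUp above randomizes the first 1 << 10 elements and then tiles that block across the array with one memcpy per block. A sketch of the same fill using exponential doubling, which needs only about log2(size / 1024) copies (an alternative, not what this commit does; assumes size is a power of two, at least 1 << 10):

size_t filled = (size_t)1 << 10;  // prefix already filled with rand_host()
while (filled < size) {
    size_t to_copy = (filled < size - filled) ? filled : size - filled;
    memcpy(h_vec_a + filled, h_vec_a, to_copy * sizeof(T));
    memcpy(h_vec_b + filled, h_vec_b, to_copy * sizeof(T));
    filled += to_copy;
}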

void TearDown(const ::benchmark::State& state) override {
delete[] array;
cudaFree(d_vec_a);
cudaFree(d_vec_b);
cudaFree(d_result);
delete[] h_vec_a;
delete[] h_vec_b;
}
protected:
virtual void BenchmarkCase(benchmark::State& state) override {
for (auto _ : state) {
// Benchmark code using the array of type T
vector_mult<T, 100>(d_vec_a, d_vec_b, d_result, size);
cudaError_t error = cudaDeviceSynchronize();
if (error != cudaSuccess) {
fprintf(stderr, "CUDA Error after cudaDeviceSynchronize: %s\n", cudaGetErrorString(error));
// Handle the error
}
}
unsigned int power;
nvmlDeviceGetPowerUsage(device, &power);
state.counters["PowerUsage"] = int(1.0e-3 * power);
unsigned int temperature;
nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature);
state.counters["Temperature"] = int(temperature);
}
};
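
The power and temperature reads in BenchmarkCase discard the NVML return status. A minimal sketch of a checked power read (an assumption about intent, not part of the commit):

unsigned int power_mw = 0;
nvmlReturn_t rc = nvmlDeviceGetPowerUsage(device, &power_mw);
if (rc == NVML_SUCCESS) {
    state.counters["PowerUsage"] = int(1.0e-3 * power_mw);  // NVML reports milliwatts
} else {
    fprintf(stderr, "nvmlDeviceGetPowerUsage failed: %s\n", nvmlErrorString(rc));
}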

// bn254
BENCHMARK_TEMPLATE_DEFINE_F(MyTemplatedFixture, Test1, bn254_scalar_t)(benchmark::State& state){MyTemplatedFixture::BenchmarkCase(state);}
BENCHMARK_REGISTER_F(MyTemplatedFixture, Test1)->Name("BN254:scalar")->MinTime(30.)->Arg(nof_elements);

BENCHMARK_TEMPLATE_DEFINE_F(MyTemplatedFixture, Test2, bn254_point_field_t)(benchmark::State& state){MyTemplatedFixture::BenchmarkCase(state);}
BENCHMARK_REGISTER_F(MyTemplatedFixture, Test2)->Name("BN254:point_field")->MinTime(30.)->Arg(nof_elements);

// bls12_381
BENCHMARK_TEMPLATE_DEFINE_F(MyTemplatedFixture, Test3, bls12_381_scalar_t)(benchmark::State& state){MyTemplatedFixture::BenchmarkCase(state);}
BENCHMARK_REGISTER_F(MyTemplatedFixture, Test3)->Name("BLS12_381:scalar")->MinTime(30.)->Arg(nof_elements);

BENCHMARK_TEMPLATE_DEFINE_F(MyTemplatedFixture, Test4, bls12_381_point_field_t)(benchmark::State& state){MyTemplatedFixture::BenchmarkCase(state);}
BENCHMARK_REGISTER_F(MyTemplatedFixture, Test4)->Name("BLS12_381:point_field")->MinTime(30.)->Arg(nof_elements);

// bls12_377
BENCHMARK_TEMPLATE_DEFINE_F(MyTemplatedFixture, Test5, bls12_377_scalar_t)(benchmark::State& state){MyTemplatedFixture::BenchmarkCase(state);}
BENCHMARK_REGISTER_F(MyTemplatedFixture, Test5)->Name("BLS12_377:scalar")->MinTime(30.)->Arg(nof_elements);

BENCHMARK_TEMPLATE_DEFINE_F(MyTemplatedFixture, Test6, bls12_377_point_field_t)(benchmark::State& state){MyTemplatedFixture::BenchmarkCase(state);}
BENCHMARK_REGISTER_F(MyTemplatedFixture, Test6)->Name("BLS12_377:point_field")->MinTime(30.)->Arg(nof_elements);
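
If the BW6-761 field were benchmarked as well, the same define/register pattern would extend directly. This sketch assumes the bw6_761_scalar_t typedef near the top of the file is uncommented; the fixture name Test7 and the label "BW6_761:scalar" are illustrative, not from the commit:

// requires: typedef Field<bw6_761::fp_config> bw6_761_scalar_t;
BENCHMARK_TEMPLATE_DEFINE_F(MyTemplatedFixture, Test7, bw6_761_scalar_t)(benchmark::State& state){MyTemplatedFixture::BenchmarkCase(state);}
BENCHMARK_REGISTER_F(MyTemplatedFixture, Test7)->Name("BW6_761:scalar")->MinTime(30.)->Arg(nof_elements);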


int main(int argc, char** argv) {
cudaDeviceReset();
@@ -101,45 +171,20 @@ int main(int argc, char** argv) {
nvmlInit();
nvmlDeviceGetHandleByIndex(0, &device); // for GPU 0

std::cout << "Setting host data" << std::endl;

vec_a = (S*)malloc(sizeof(S) * nof_elements);
vec_b = (S*)malloc(sizeof(S) * nof_elements);
for (unsigned i = 0; i < (1 << 10); i++) {
vec_a[i] = S::rand_host();
vec_b[i] = S::rand_host();
}
for (unsigned i = 1; i < (nof_elements >> 10); i++) {
memcpy((void *)(vec_a + (i << 10)), (void *)(vec_a + ((i-1) << 10)), sizeof(S) << 10);
memcpy((void *)(vec_b + (i << 10)), (void *)(vec_b + ((i-1) << 10)), sizeof(S) << 10);
}
// Allocate memory on the device for the input vectors, the output vector, and the modulus
std::cout << "Moving data to device" << std::endl;
cudaMalloc(&d_vec_a, nof_elements * sizeof(S));
cudaMalloc(&d_vec_b, nof_elements * sizeof(S));
cudaMalloc(&d_result, nof_elements * sizeof(S));

// Copy the input vectors and the modulus from the host to the device
cudaMemcpy(d_vec_a, vec_a, nof_elements * sizeof(S), cudaMemcpyHostToDevice);
cudaMemcpy(d_vec_b, vec_b, nof_elements * sizeof(S), cudaMemcpyHostToDevice);
std::cout << "Running benchmark" << std::endl;

// Run all benchmarks
::benchmark::Initialize(&argc, argv);
if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1;
::benchmark::AddCustomContext("team", "Ingonyama");
::benchmark::AddCustomContext("git_repository", "https://github.com/ingonyama-zk/icicle.git");
::benchmark::AddCustomContext("project", "Icicle");
::benchmark::AddCustomContext("runs_on", gpu_name);
::benchmark::AddCustomContext("frequency_MHz", std::to_string(gpu_clock_mhz));
::benchmark::AddCustomContext("uses", curve);
// ::benchmark::AddCustomContext("uses", curve);
::benchmark::AddCustomContext("comment", "on-device API");
::benchmark::AddCustomContext("operation_factor", std::to_string(nof_mult));
::benchmark::AddCustomContext("vector_size", std::to_string(nof_elements));
::benchmark::AddCustomContext("multiply", multiply_type);
::benchmark::RunSpecifiedBenchmarks();

cudaFree(d_vec_a);
cudaFree(d_vec_b);
cudaFree(d_result);
free(vec_a);
free(vec_b);
return 0;
}
16 changes: 13 additions & 3 deletions benchmarks/multiply/run.sh
@@ -15,7 +15,7 @@ git_id=$(cd /icicle && git rev-parse --short HEAD)
echo "Icicle GitID: $git_id"

echo "Running the benchmarks and capturing the output in the file benchmark.json"
/icicle-benchmark/build/benchmark --benchmark_time_unit=s --benchmark_out_format=json --benchmark_out=benchmark.json
#/icicle-benchmark/build/benchmark --benchmark_time_unit=s --benchmark_out_format=json --benchmark_out=benchmark.json

json_data=$(<benchmark.json)
#echo $json_data
@@ -28,6 +28,12 @@ echo "Team: $team"
project=$(jq -r '.context.project' benchmark.json)
echo "Project: $project"

git_repository=$(jq -r '.context.git_repository' benchmark.json)
echo "Git repository: $git_repository"

multiply=$(jq -r '.context.multiply' benchmark.json)
echo "Multiply: $multiply"

runs_on=$(jq -r '.context.runs_on' benchmark.json)
echo "Runs on: $runs_on"

@@ -78,11 +84,13 @@ for ((nof_benchmark = 0; nof_benchmark < nof_benchmarks; nof_benchmark++)); do

chip_temp_C=$(echo "$benchmark" | jq '.Temperature')
echo "Chip temperature (C): $chip_temp_C"

QUERY="INSERT INTO add_benchmark (
QUERY="INSERT INTO multiply_benchmark (
team,
project,
multiply,
test_timestamp,
git_repository,
git_id,
frequency_MHz,
vector_size,
@@ -98,7 +106,9 @@ for ((nof_benchmark = 0; nof_benchmark < nof_benchmarks; nof_benchmark++)); do
VALUES (
'$team',
'$project',
'$multiply',
'$test_timestamp',
'$git_repository',
'$git_id',
$frequency_MHz,
$vector_size,
17 changes: 17 additions & 0 deletions scripts/cli.sh
@@ -1,2 +1,19 @@
#!/bin/bash
# Exit immediately if a command exits with a non-zero status
set -e

echo "${INGO_BENCHMARKS_DB_HOST}"
echo "${INGO_BENCHMARKS_DB_PORT}"
echo "${INGO_BENCHMARKS_DB_NAME}"
echo "${INGO_BENCHMARKS_DB_USER}"
#DB_PASS="${INGO_BENCHMARKS_DB_PASSWORD}"


read -p "Do you want to proceed? (Yes/No) " answer
if [[ $answer =~ ^[Nn] ]]; then
echo "Terminating."
exit 1
fi

export PGPASSWORD=$INGO_BENCHMARKS_DB_PASSWORD
psql -U $INGO_BENCHMARKS_DB_USER -d $INGO_BENCHMARKS_DB_NAME -h $INGO_BENCHMARKS_DB_HOST -p $INGO_BENCHMARKS_DB_PORT
