Skip to content

Commit

Permalink
Switch to CUB for Energy Accumulation (#1474)
Browse files Browse the repository at this point in the history
* Add missing gpuErrchk calls

* Reduce __int128 energies with CUB

* Remove old accumulation code

* Minor clean up

* Separate out assertions

* Remove usage of shared memory in nonbonded kernel
  • Loading branch information
badisa authored Feb 7, 2025
1 parent 9061bc5 commit b1a3108
Show file tree
Hide file tree
Showing 41 changed files with 286 additions and 284 deletions.
15 changes: 0 additions & 15 deletions tests/test_energy_overflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,18 +274,3 @@ def test_energy_overflows_with_summation_of_energies(precision):

# The GPU potentials will overflow, resulting in a nan value
assert np.isnan(nonbonded_gpu(x, nb_params, box))


@pytest.mark.parametrize("size", [1, 32, 257, 1000, 10000])
def test_energy_accumulation(size):
"""Test the the logic used to accumulate energy in int128.
It relies on doing a block level parallel reduce for performance.
"""
rng = np.random.default_rng(2023)

vals = rng.integers(-10000, 10000, size=size, dtype=np.int64)

result = custom_ops._accumulate_energy(vals)

np.testing.assert_equal(np.sum(vals), result)
1 change: 0 additions & 1 deletion timemachine/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@ pybind11_add_module(${LIBRARY_NAME} SHARED NO_EXTRAS
src/nonbonded_all_pairs.cu
src/nonbonded_pair_list.cu
src/nonbonded_interaction_group.cu
src/energy_accumulation.cu
src/local_md_utils.cu
src/neighborlist.cu
src/segmented_sumexp.cu
Expand Down
15 changes: 11 additions & 4 deletions timemachine/cpp/src/barostat.cu
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#include "barostat.hpp"
#include "constants.hpp"
#include "energy_accumulation.hpp"
#include "fixed_point.hpp"
#include "gpu_utils.cuh"
#include "math_utils.cuh"
#include "mol_utils.hpp"
#include <cub/cub.cuh>
#include <stdio.h>
#include <variant>

Expand All @@ -28,7 +28,7 @@ MonteCarloBarostat<RealType>::MonteCarloBarostat(
const double initial_volume_scale_factor)
: Mover(interval), N_(N), adaptive_scaling_enabled_(adaptive_scaling_enabled), bps_(bps),
pressure_(static_cast<RealType>(pressure)), temperature_(static_cast<RealType>(temperature)), seed_(seed),
group_idxs_(group_idxs), num_grouped_atoms_(0), runner_() {
group_idxs_(group_idxs), num_grouped_atoms_(0), sum_storage_bytes_(0), runner_() {

// Trigger check that interval is valid
this->set_interval(interval_);
Expand Down Expand Up @@ -79,6 +79,10 @@ MonteCarloBarostat<RealType>::MonteCarloBarostat(
cudaSafeMalloc(&d_atom_idxs_, num_grouped_atoms_ * sizeof(*d_atom_idxs_));
cudaSafeMalloc(&d_mol_idxs_, num_grouped_atoms_ * sizeof(*d_mol_idxs_));

gpuErrchk(cub::DeviceReduce::Sum(nullptr, sum_storage_bytes_, d_u_buffer_, d_u_buffer_, bps_.size()));

gpuErrchk(cudaMalloc(&d_sum_temp_storage_, sum_storage_bytes_));

gpuErrchk(cudaMemcpy(
d_atom_idxs_,
&flattened_groups[0][0],
Expand Down Expand Up @@ -116,6 +120,7 @@ template <typename RealType> MonteCarloBarostat<RealType>::~MonteCarloBarostat()
gpuErrchk(cudaFree(d_volume_delta_));
gpuErrchk(cudaFree(d_num_accepted_));
gpuErrchk(cudaFree(d_num_attempted_));
gpuErrchk(cudaFree(d_sum_temp_storage_));
curandErrchk(curandDestroyGenerator(cr_rng_));
};

Expand Down Expand Up @@ -212,11 +217,13 @@ void MonteCarloBarostat<RealType>::move(
gpuErrchk(cudaPeekAtLastError());

runner_.execute_potentials(bps_, N_, d_x, d_box, nullptr, nullptr, d_u_buffer_, stream);
accumulate_energy(bps_.size(), d_u_buffer_, d_init_u_, stream);
gpuErrchk(
cub::DeviceReduce::Sum(d_sum_temp_storage_, sum_storage_bytes_, d_u_buffer_, d_init_u_, bps_.size(), stream));

runner_.execute_potentials(
bps_, N_, d_x_proposed_, d_box_proposed_, nullptr, nullptr, d_u_proposed_buffer_, stream);
accumulate_energy(bps_.size(), d_u_proposed_buffer_, d_final_u_, stream);
gpuErrchk(cub::DeviceReduce::Sum(
d_sum_temp_storage_, sum_storage_bytes_, d_u_proposed_buffer_, d_final_u_, bps_.size(), stream));

double pressure = pressure_ * AVOGADRO * 1e-25;
const double kT = BOLTZ * temperature_;
Expand Down
3 changes: 3 additions & 0 deletions timemachine/cpp/src/barostat.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ template <typename RealType> class MonteCarloBarostat : public Mover {

int num_grouped_atoms_;

size_t sum_storage_bytes_;
void *d_sum_temp_storage_;

int *d_num_attempted_;
int *d_num_accepted_;

Expand Down
13 changes: 3 additions & 10 deletions timemachine/cpp/src/centroid_restraint.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#include "centroid_restraint.hpp"
#include "energy_accumulation.hpp"
#include "gpu_utils.cuh"
#include "k_centroid_restraint.cuh"
#include "math_utils.cuh"
Expand All @@ -20,16 +19,13 @@ CentroidRestraint<RealType>::CentroidRestraint(

cudaSafeMalloc(&d_centroid_a_, 3 * sizeof(*d_centroid_a_));
cudaSafeMalloc(&d_centroid_b_, 3 * sizeof(*d_centroid_b_));

cudaSafeMalloc(&d_u_buffer_, sizeof(*d_u_buffer_));
};

template <typename RealType> CentroidRestraint<RealType>::~CentroidRestraint() {
gpuErrchk(cudaFree(d_group_a_idxs_));
gpuErrchk(cudaFree(d_group_b_idxs_));
gpuErrchk(cudaFree(d_centroid_a_));
gpuErrchk(cudaFree(d_centroid_b_));
gpuErrchk(cudaFree(d_u_buffer_));
};

template <typename RealType>
Expand All @@ -41,7 +37,7 @@ void CentroidRestraint<RealType>::execute_device(
const double *d_box,
unsigned long long *d_du_dx,
unsigned long long *d_du_dp,
__int128 *d_u,
__int128 *d_u, // [1]
cudaStream_t stream) {

if (N_B_ + N_A_ > 0) {
Expand All @@ -65,12 +61,9 @@ void CentroidRestraint<RealType>::execute_device(
kb_,
b0_,
d_du_dx,
d_u == nullptr ? nullptr : d_u_buffer_);
d_u // Can write directly to the energy buffer for this potential.
);
gpuErrchk(cudaPeekAtLastError());

if (d_u) {
accumulate_energy(1, d_u_buffer_, d_u, stream);
}
}
};

Expand Down
12 changes: 9 additions & 3 deletions timemachine/cpp/src/chiral_atom_restraint.cu
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
#include "chiral_atom_restraint.hpp"
#include "energy_accumulation.hpp"
#include "gpu_utils.cuh"
#include "k_chiral_restraint.cuh"
#include "kernel_utils.cuh"
#include "math_utils.cuh"
#include <cub/cub.cuh>
#include <vector>

namespace timemachine {

template <typename RealType>
ChiralAtomRestraint<RealType>::ChiralAtomRestraint(const std::vector<int> &idxs) : R_(idxs.size() / 4) {
ChiralAtomRestraint<RealType>::ChiralAtomRestraint(const std::vector<int> &idxs)
: R_(idxs.size() / 4), sum_storage_bytes_(0) {

if (idxs.size() % 4 != 0) {
throw std::runtime_error("idxs.size() must be exactly 4*k!");
Expand All @@ -19,11 +20,16 @@ ChiralAtomRestraint<RealType>::ChiralAtomRestraint(const std::vector<int> &idxs)
gpuErrchk(cudaMemcpy(d_idxs_, &idxs[0], R_ * 4 * sizeof(*d_idxs_), cudaMemcpyHostToDevice));

cudaSafeMalloc(&d_u_buffer_, R_ * sizeof(*d_u_buffer_));

gpuErrchk(cub::DeviceReduce::Sum(nullptr, sum_storage_bytes_, d_u_buffer_, d_u_buffer_, R_));

gpuErrchk(cudaMalloc(&d_sum_temp_storage_, sum_storage_bytes_));
};

template <typename RealType> ChiralAtomRestraint<RealType>::~ChiralAtomRestraint() {
gpuErrchk(cudaFree(d_idxs_));
gpuErrchk(cudaFree(d_u_buffer_));
gpuErrchk(cudaFree(d_sum_temp_storage_));
};

template <typename RealType>
Expand Down Expand Up @@ -53,7 +59,7 @@ void ChiralAtomRestraint<RealType>::execute_device(
gpuErrchk(cudaPeekAtLastError());

if (d_u) {
accumulate_energy(R_, d_u_buffer_, d_u, stream);
gpuErrchk(cub::DeviceReduce::Sum(d_sum_temp_storage_, sum_storage_bytes_, d_u_buffer_, d_u, R_, stream));
}
}
};
Expand Down
4 changes: 3 additions & 1 deletion timemachine/cpp/src/chiral_atom_restraint.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@ namespace timemachine {
template <typename RealType> class ChiralAtomRestraint : public Potential {

private:
const int R_;
int *d_idxs_;
__int128 *d_u_buffer_;

const int R_;
size_t sum_storage_bytes_;
void *d_sum_temp_storage_;

public:
ChiralAtomRestraint(const std::vector<int> &idxs // [R, 4]
Expand Down
11 changes: 8 additions & 3 deletions timemachine/cpp/src/chiral_bond_restraint.cu
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
#include "chiral_bond_restraint.hpp"
#include "energy_accumulation.hpp"
#include "gpu_utils.cuh"
#include "k_chiral_restraint.cuh"
#include "kernel_utils.cuh"
#include "math_utils.cuh"
#include <cub/cub.cuh>
#include <vector>

namespace timemachine {

template <typename RealType>
ChiralBondRestraint<RealType>::ChiralBondRestraint(const std::vector<int> &idxs, const std::vector<int> &signs)
: R_(idxs.size() / 4) {
: R_(idxs.size() / 4), sum_storage_bytes_(0) {

if (idxs.size() % 4 != 0) {
throw std::runtime_error("idxs.size() must be exactly 4*R!");
Expand All @@ -33,12 +33,17 @@ ChiralBondRestraint<RealType>::ChiralBondRestraint(const std::vector<int> &idxs,
gpuErrchk(cudaMemcpy(d_signs_, &signs[0], R_ * sizeof(*d_signs_), cudaMemcpyHostToDevice));

cudaSafeMalloc(&d_u_buffer_, R_ * sizeof(*d_u_buffer_));

gpuErrchk(cub::DeviceReduce::Sum(nullptr, sum_storage_bytes_, d_u_buffer_, d_u_buffer_, R_));

gpuErrchk(cudaMalloc(&d_sum_temp_storage_, sum_storage_bytes_));
};

template <typename RealType> ChiralBondRestraint<RealType>::~ChiralBondRestraint() {
gpuErrchk(cudaFree(d_idxs_));
gpuErrchk(cudaFree(d_signs_));
gpuErrchk(cudaFree(d_u_buffer_));
gpuErrchk(cudaFree(d_sum_temp_storage_));
};

template <typename RealType>
Expand Down Expand Up @@ -68,7 +73,7 @@ void ChiralBondRestraint<RealType>::execute_device(
gpuErrchk(cudaPeekAtLastError());

if (d_u) {
accumulate_energy(R_, d_u_buffer_, d_u, stream);
gpuErrchk(cub::DeviceReduce::Sum(d_sum_temp_storage_, sum_storage_bytes_, d_u_buffer_, d_u, R_, stream));
}
}
};
Expand Down
5 changes: 4 additions & 1 deletion timemachine/cpp/src/chiral_bond_restraint.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ namespace timemachine {
template <typename RealType> class ChiralBondRestraint : public Potential {

private:
const int R_;

int *d_idxs_;
int *d_signs_;
__int128 *d_u_buffer_;

const int R_;
size_t sum_storage_bytes_;
void *d_sum_temp_storage_;

public:
ChiralBondRestraint(
Expand Down
16 changes: 0 additions & 16 deletions timemachine/cpp/src/energy_accumulation.cu

This file was deleted.

11 changes: 0 additions & 11 deletions timemachine/cpp/src/energy_accumulation.hpp

This file was deleted.

15 changes: 12 additions & 3 deletions timemachine/cpp/src/fanout_summed_potential.cu
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
#include "energy_accumulation.hpp"
#include "fanout_summed_potential.hpp"
#include "gpu_utils.cuh"
#include "nonbonded_common.hpp"
#include <cub/cub.cuh>
#include <memory>

namespace timemachine {

FanoutSummedPotential::FanoutSummedPotential(
const std::vector<std::shared_ptr<Potential>> potentials, const bool parallel)
: potentials_(potentials), parallel_(parallel), d_u_buffer_(potentials_.size()) {};
: potentials_(potentials), parallel_(parallel), d_u_buffer_(potentials_.size()), sum_storage_bytes_(0) {

gpuErrchk(
cub::DeviceReduce::Sum(nullptr, sum_storage_bytes_, d_u_buffer_.data, d_u_buffer_.data, potentials_.size()));

gpuErrchk(cudaMalloc(&d_sum_temp_storage_, sum_storage_bytes_));
};

FanoutSummedPotential::~FanoutSummedPotential() { gpuErrchk(cudaFree(d_sum_temp_storage_)); };

const std::vector<std::shared_ptr<Potential>> &FanoutSummedPotential::get_potentials() { return potentials_; }

Expand Down Expand Up @@ -46,7 +54,8 @@ void FanoutSummedPotential::execute_device(
}
}
if (d_u) {
accumulate_energy(potentials_.size(), d_u_buffer_.data, d_u, stream);
gpuErrchk(cub::DeviceReduce::Sum(
d_sum_temp_storage_, sum_storage_bytes_, d_u_buffer_.data, d_u, potentials_.size(), stream));
}
};

Expand Down
5 changes: 5 additions & 0 deletions timemachine/cpp/src/fanout_summed_potential.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,14 @@ class FanoutSummedPotential : public Potential {
DeviceBuffer<__int128> d_u_buffer_;
StreamManager manager_;

size_t sum_storage_bytes_;
void *d_sum_temp_storage_;

public:
FanoutSummedPotential(const std::vector<std::shared_ptr<Potential>> potentials, const bool parallel);

~FanoutSummedPotential();

const std::vector<std::shared_ptr<Potential>> &get_potentials();

virtual void execute_device(
Expand Down
12 changes: 9 additions & 3 deletions timemachine/cpp/src/flat_bottom_bond.cu
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
#include "energy_accumulation.hpp"
#include "flat_bottom_bond.hpp"
#include "gpu_utils.cuh"
#include "k_flat_bottom_bond.cuh"
#include "kernel_utils.cuh"
#include "math_utils.cuh"
#include <cub/cub.cuh>
#include <vector>

namespace timemachine {

template <typename RealType>
FlatBottomBond<RealType>::FlatBottomBond(const std::vector<int> &bond_idxs) : B_(bond_idxs.size() / 2) {
FlatBottomBond<RealType>::FlatBottomBond(const std::vector<int> &bond_idxs)
: B_(bond_idxs.size() / 2), sum_storage_bytes_(0) {

// validate bond_idxs: even length, all idxs non-negative, and no self-edges
if (bond_idxs.size() % 2 != 0) {
Expand All @@ -33,11 +34,16 @@ FlatBottomBond<RealType>::FlatBottomBond(const std::vector<int> &bond_idxs) : B_
gpuErrchk(cudaMemcpy(d_bond_idxs_, &bond_idxs[0], B_ * 2 * sizeof(*d_bond_idxs_), cudaMemcpyHostToDevice));

cudaSafeMalloc(&d_u_buffer_, B_ * sizeof(*d_u_buffer_));

gpuErrchk(cub::DeviceReduce::Sum(nullptr, sum_storage_bytes_, d_u_buffer_, d_u_buffer_, B_));

gpuErrchk(cudaMalloc(&d_sum_temp_storage_, sum_storage_bytes_));
};

template <typename RealType> FlatBottomBond<RealType>::~FlatBottomBond() {
gpuErrchk(cudaFree(d_bond_idxs_));
gpuErrchk(cudaFree(d_u_buffer_));
gpuErrchk(cudaFree(d_sum_temp_storage_));
};

template <typename RealType>
Expand Down Expand Up @@ -70,7 +76,7 @@ void FlatBottomBond<RealType>::execute_device(
gpuErrchk(cudaPeekAtLastError());

if (d_u) {
accumulate_energy(B_, d_u_buffer_, d_u, stream);
gpuErrchk(cub::DeviceReduce::Sum(d_sum_temp_storage_, sum_storage_bytes_, d_u_buffer_, d_u, B_, stream));
}
}
};
Expand Down
5 changes: 4 additions & 1 deletion timemachine/cpp/src/flat_bottom_bond.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,13 @@ namespace timemachine {
template <typename RealType> class FlatBottomBond : public Potential {

private:
int B_; // TBD make this a constant

int *d_bond_idxs_;
__int128 *d_u_buffer_;

int B_;
size_t sum_storage_bytes_;
void *d_sum_temp_storage_;

public:
int num_bonds() const { return B_; }
Expand Down
Loading

0 comments on commit b1a3108

Please sign in to comment.