Skip to content

Commit

Permalink
made it so UF block is the only one computed when torques are not pro…
Browse files Browse the repository at this point in the history
…vided
  • Loading branch information
rykerfish committed Jan 24, 2025
1 parent f8ff7ea commit 3472421
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 39 deletions.
53 changes: 22 additions & 31 deletions solvers/NBody/extra/NbodyRPY.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,11 @@ __global__ void computeRPYBatchedFastGPU(const vecType *pos,
const int numberTiles = N / blobsPerTile;
const int tileOfLastParticle =
thrust::min(lastId / blobsPerTile, numberTiles);
const bool haveTorque = torques;
extern __shared__ char shMem[];
vecType *shPos = (vecType *)(shMem);
vecType *shForce = (vecType *)(shMem + blockDim.x * sizeof(vecType));
vecType *shTorque = (vecType *)(shMem + 2 * blockDim.x * sizeof(vecType));
vecType *shTorque = haveTorque ? (vecType *)(shMem + 2 * blockDim.x * sizeof(vecType)) : nullptr;
const real3 pi = active ? make_real3(pos[id]) : real3();
real3 MF = real3();
real3 MT = real3();
Expand All @@ -55,7 +56,7 @@ __global__ void computeRPYBatchedFastGPU(const vecType *pos,
if (i_load < N) {
shPos[threadIdx.x] = make_real3(pos[i_load]);
shForce[threadIdx.x] = make_real3(forces[i_load]);
shTorque[threadIdx.x] = make_real3(torques[i_load]);
if (haveTorque) shTorque[threadIdx.x] = make_real3(torques[i_load]);
}
__syncthreads();
// Compute interaction with all particles in tile
Expand All @@ -67,8 +68,8 @@ __global__ void computeRPYBatchedFastGPU(const vecType *pos,
if (fiber_id == fiber_j and cur_j < N) {
const real3 fj = shForce[counter];
const real3 pj = shPos[counter];
const real3 tj = shTorque[counter];
mdot_result dot = kernel.dotProduct(pi, pj, fj, tj);
const real3 tj = haveTorque ? shTorque[counter] : real3();
mdot_result dot = kernel.dotProduct(pi, pj, fj, tj, haveTorque);
MF += dot.MF;
MT += dot.MT;
}
Expand All @@ -78,7 +79,7 @@ __global__ void computeRPYBatchedFastGPU(const vecType *pos,
}
if (active)
Mv[id] = MF;
Mw[id] = MT;
if (haveTorque) Mw[id] = MT;
}

template <class HydrodynamicKernel, class vecType>
Expand All @@ -91,7 +92,8 @@ void computeRPYBatchedFast(const vecType *pos, const vecType *force,
int minBlockSize = std::max(nearestWarpMultiple, 32);
int Nthreads = std::min(std::min(minBlockSize, N), 256);
int Nblocks = (N + Nthreads - 1) / Nthreads;
computeRPYBatchedFastGPU<<<Nblocks, Nthreads, 3 * Nthreads * sizeof(real3)>>>(
int sharedMemoryFactor = torque != nullptr ? 3 : 2;
computeRPYBatchedFastGPU<<<Nblocks, Nthreads, sharedMemoryFactor * Nthreads * sizeof(real3)>>>(
pos, force, torque, Mv, Mw, hydrodynamicKernel, Nbatches, NperBatch);
}

Expand All @@ -105,6 +107,7 @@ __global__ void computeRPYBatchedNaiveGPU(const vecType *pos,
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= Nbatches * NperBatch)
return;
const bool haveTorque = torques;
real3 pi = make_real3(pos[tid]);
real3 MF = real3();
real3 MT = real3();
Expand All @@ -114,13 +117,13 @@ __global__ void computeRPYBatchedNaiveGPU(const vecType *pos,
break;
real3 pj = make_real3(pos[i]);
real3 fj = make_real3(forces[i]);
real3 tj = make_real3(torques[i]);
mdot_result dot = kernel.dotProduct(pi, pj, fj, tj);
real3 tj = haveTorque ? make_real3(torques[i]) : real3();
mdot_result dot = kernel.dotProduct(pi, pj, fj, tj, haveTorque);
MF += dot.MF;
MT += dot.MT;
}
Mv[tid] = MF;
Mw[tid] = MT;
if(haveTorque) Mw[tid] = MT;
}

template <class HydrodynamicKernel, class vecType>
Expand All @@ -144,6 +147,7 @@ __global__ void computeRPYBatchedNaiveBlockGPU(
const int tid = blockIdx.x;
if (tid >= Nbatches * NperBatch)
return;
const bool haveTorque = torque != nullptr;
real3 pi = make_real3(pos[tid]);
extern __shared__ real3 sharedMemory[];
real3 MF = real3();
Expand All @@ -154,23 +158,23 @@ __global__ void computeRPYBatchedNaiveBlockGPU(
i += blockDim.x) {
real3 pj = make_real3(pos[i]);
real3 fj = make_real3(forces[i]);
real3 tj = make_real3(torque[i]);
mdot_result dot = kernel.dotProduct(pi, pj, fj, tj);
real3 tj = haveTorque ? make_real3(torque[i]) : real3();
mdot_result dot = kernel.dotProduct(pi, pj, fj, tj, haveTorque);
MF += dot.MF;
MT += dot.MT;
}
sharedMemory[threadIdx.x] = MF;
sharedMemory[threadIdx.x + blockDim.x] = MT;
if (haveTorque) sharedMemory[threadIdx.x + blockDim.x] = MT;
__syncthreads();
if (threadIdx.x == 0) {
auto MFTot = real3();
auto MTTot = real3();
for (int i = 0; i < blockDim.x; i++) {
MFTot += sharedMemory[i];
MTTot += sharedMemory[i + blockDim.x];
if (haveTorque) MTTot += sharedMemory[i + blockDim.x];
}
Mv[tid] = MFTot;
Mw[tid] = MTTot;
if (haveTorque) Mw[tid] = MTTot;
}
}

Expand All @@ -183,8 +187,9 @@ void computeRPYBatchedNaiveBlock(const vecType *pos, const vecType *force,
int minBlockSize = 128;
int Nthreads = minBlockSize < N ? minBlockSize : N;
int Nblocks = N;
int sharedMemoryFactor = torque ? 2 : 1;
computeRPYBatchedNaiveBlockGPU<<<Nblocks, Nthreads,
2 * Nthreads * sizeof(real3)>>>(
sharedMemoryFactor * Nthreads * sizeof(real3)>>>(
pos, force, torque, Mv, Mw, hydrodynamicKernel, Nbatches, NperBatch);
}

Expand All @@ -202,24 +207,10 @@ void batchedNBody(device_span<const real> ipos, device_span<const real> iforces,
device_adapter<const real> torques(itorques, device::cuda);
device_adapter<real> MF(iMF, device::cuda);
device_adapter<real> MT(iMT, device::cuda);
// This algorithm always requires buffers for torques and angular velocities
// If the called did not provide them we allocate them here, discarding the
// result
// TODO: Adjust the kernels to avoid this allocation
using cached_vector =
thrust::device_vector<real, allocator::thrust_cached_allocator<real>>;
cached_vector buffer_MT;
cached_vector buffer_torques;

real *mt_ptr = MT.data();
const real *torques_ptr = torques.data();
if (iMT.size() == 0) {
buffer_MT.resize(pos.size(), 0);
mt_ptr = buffer_MT.data().get();
}
if (itorques.size() == 0) {
buffer_torques.resize(pos.size(), 0);
torques_ptr = buffer_torques.data().get();
}

auto kernel = computeRPYBatchedFast<HydrodynamicKernel, LayoutType>;
if (alg == algorithm::naive)
kernel = computeRPYBatchedNaive<HydrodynamicKernel, LayoutType>;
Expand Down
20 changes: 12 additions & 8 deletions solvers/NBody/extra/hydrodynamicKernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -101,15 +101,17 @@ __device__ real3 RPY_WF(real3 rij,real r, real rh){
real rt0; //rot-trans & trans-rot mobility
public:

__device__ mdot_result dotProduct(real3 pi, real3 pj, real3 fj, real3 tj){
__device__ mdot_result dotProduct(real3 pi, real3 pj, real3 fj, real3 tj, bool haveTorque){
mdot_result result;
real3 rij = make_real3(pi)-make_real3(pj);
const real r = sqrt(dot(rij, rij));

result.MF += dotProduct_UF(rij, r, fj);
result.MF += dotProduct_UT(rij, r, tj);
result.MT += dotProduct_WF(rij, r, fj);
result.MT += dotProduct_WT(rij, r, tj);
if(haveTorque){
result.MF += dotProduct_UT(rij, r, tj);
result.MT += dotProduct_WF(rij, r, fj);
result.MT += dotProduct_WT(rij, r, tj);
}

return result;
}
Expand Down Expand Up @@ -310,7 +312,7 @@ __device__ real3 RPY_WF(real3 rij,real r, real rh){
//The constructor needs a mobility coefficient for each block and an hydrodynamic radius
BottomWall(real t0, real r0, real rt0, real rh):t0(t0), r0(r0), rt0(rt0), rh(rh){}

__device__ mdot_result dotProduct(real3 pi, real3 pj, real3 fj, real3 tj){
__device__ mdot_result dotProduct(real3 pi, real3 pj, real3 fj, real3 tj, bool haveTorque){
mdot_result result;
// implements damping from Appendix 1 in [2] so the matrix is positive definite when a particle overlaps the wall
real bi = min(pi.z/rh, real(1.0));
Expand All @@ -326,9 +328,11 @@ __device__ real3 RPY_WF(real3 rij,real r, real rh){
const real r = sqrt(dot(rij, rij));

result.MF += bi2*dotProduct_UF(rij, r, fj, pj.z);
result.MF += bi2*dotProduct_UT(rij, r, tj, pi.z); // this is correct with pi.z: see note on dotProduct_UT
result.MT += bi2*dotProduct_WF(rij, r, fj, pj.z);
result.MT += bi2*dotProduct_WT(rij, r, tj, pj.z);
if (haveTorque){
result.MF += bi2*dotProduct_UT(rij, r, tj, pi.z); // this is correct with pi.z: see note on dotProduct_UT
result.MT += bi2*dotProduct_WF(rij, r, fj, pj.z);
result.MT += bi2*dotProduct_WT(rij, r, tj, pj.z);
}

return result;
}
Expand Down

0 comments on commit 3472421

Please sign in to comment.