Skip to content

Commit

Permalink
Reduce memory allocations and copies in BoxForceReciprocal
Browse files Browse the repository at this point in the history
  • Loading branch information
LSchwiebert committed Feb 8, 2025
1 parent 63b1631 commit 3cb2b6b
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 46 deletions.
5 changes: 2 additions & 3 deletions src/CalculateEnergy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,15 +390,14 @@ CalculateEnergy::BoxForce(SystemPotential potential, XYZArray const &coords,
aForcex[nParticle] += -(forceLJ.x + forceReal.x);
aForcey[nParticle] += -(forceLJ.y + forceReal.y);
aForcez[nParticle] += -(forceLJ.z + forceReal.z);
}
else if (moveType == mp::MPDISPLACE) {
} else if (moveType == mp::MPDISPLACE) {
mForcex[particleMol[currParticle]] += (forceLJ.x + forceReal.x);
mForcey[particleMol[currParticle]] += (forceLJ.y + forceReal.y);
mForcez[particleMol[currParticle]] += (forceLJ.z + forceReal.z);
mForcex[particleMol[nParticle]] += -(forceLJ.x + forceReal.x);
mForcey[particleMol[nParticle]] += -(forceLJ.y + forceReal.y);
mForcez[particleMol[nParticle]] += -(forceLJ.z + forceReal.z);
}
}
}
}
}
Expand Down
15 changes: 7 additions & 8 deletions src/Ewald.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ along with this program, also can be found at
//
//
// Energy Calculation functions for Ewald summation method
// Calculating self, correction and reciprocal part of ewald
// Calculating self, correction and reciprocal part of Ewald
//
// Developed by Y. Li and Mohammad S. Barhaghi
//
Expand Down Expand Up @@ -125,6 +125,10 @@ void Ewald::Init() {
}

AllocMem();
#ifdef GOMC_CUDA
InitEwaldVariablesCUDA(ff.particles->getCUDAVars(), startMol, lengthMol,
imageTotal);
#endif
// initialize K vectors and reciprocal terms
UpdateVectorsAndRecipTerms(true);
}
Expand Down Expand Up @@ -184,10 +188,6 @@ void Ewald::AllocMem() {
sumRref[b] = new double[imageTotal];
sumIref[b] = new double[imageTotal];
}

#ifdef GOMC_CUDA
InitEwaldVariablesCUDA(ff.particles->getCUDAVars(), imageTotal);
#endif
}

// calculate reciprocal terms for a box. Should be called only at
Expand Down Expand Up @@ -1665,10 +1665,9 @@ void Ewald::BoxForceReciprocal(XYZArray const &molCoords,
}
if (moveType == mp::MPROTATE) {
atomForceRec.Set(p, X, Y, Z);
}
else if (moveType == mp::MPDISPLACE) {
} else if (moveType == mp::MPDISPLACE) {
molForceRec.Add(molIndex, X, Y, Z);
}
}
}
thisMol++;
}
Expand Down
44 changes: 12 additions & 32 deletions src/GPU/CalculateEwaldCUDAKernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,9 @@ void CallMolExchangeReciprocalGPU(VariablesCUDA *vars, uint imageSize, uint box,
cudaMemcpyDeviceToHost);
}

// Note: This implementation assumes that this function is always called after
// BoxForce, so the coordinates have already been copied to the GPU. Otherwise,
// add cudaMemcpy calls to copy the coordinates to gpu_x, gpu_y, and gpu_z.
void CallBoxForceReciprocalGPU(
VariablesCUDA *vars, XYZArray &atomForceRec, XYZArray &molForceRec,
const std::vector<double> &particleCharge,
Expand All @@ -348,34 +351,18 @@ void CallBoxForceReciprocalGPU(
BoxDimensions const &boxAxes, int moveType, int box) {
int atomCount = atomForceRec.Count();
int molCount = molForceRec.Count();
int *gpu_particleUsed;
int *gpu_startMol, *gpu_lengthMol;

// calculate block and grid sizes
int threadsPerBlock = THREADS_PER_BLOCK;
int blocksPerGrid = particleUsed.size();

CUMALLOC((void **)&gpu_particleUsed, particleUsed.size() * sizeof(int));
CUMALLOC((void **)&gpu_startMol, startMol.size() * sizeof(int));
CUMALLOC((void **)&gpu_lengthMol, lengthMol.size() * sizeof(int));

if (moveType == mp::MPDISPLACE) {
cudaMemset(vars->gpu_mForceRecx, 0, molCount * sizeof(double));
cudaMemset(vars->gpu_mForceRecy, 0, molCount * sizeof(double));
cudaMemset(vars->gpu_mForceRecz, 0, molCount * sizeof(double));
}
cudaMemcpy(gpu_particleUsed, &particleUsed[0],
cudaMemcpy(vars->gpu_particleUsed, &particleUsed[0],
sizeof(int) * particleUsed.size(), cudaMemcpyHostToDevice);
cudaMemcpy(vars->gpu_x, molCoords.x, sizeof(double) * atomCount,
cudaMemcpyHostToDevice);
cudaMemcpy(vars->gpu_y, molCoords.y, sizeof(double) * atomCount,
cudaMemcpyHostToDevice);
cudaMemcpy(vars->gpu_z, molCoords.z, sizeof(double) * atomCount,
cudaMemcpyHostToDevice);
cudaMemcpy(gpu_startMol, &startMol[0], sizeof(int) * startMol.size(),
cudaMemcpyHostToDevice);
cudaMemcpy(gpu_lengthMol, &lengthMol[0], sizeof(int) * lengthMol.size(),
cudaMemcpyHostToDevice);

#ifndef NDEBUG
checkLastErrorCUDA(__FILE__, __LINE__);
Expand All @@ -384,14 +371,14 @@ void CallBoxForceReciprocalGPU(
BoxForceReciprocalGPU<<<blocksPerGrid, threadsPerBlock>>>(
vars->gpu_aForceRecx, vars->gpu_aForceRecy, vars->gpu_aForceRecz,
vars->gpu_mForceRecx, vars->gpu_mForceRecy, vars->gpu_mForceRecz,
vars->gpu_particleCharge, vars->gpu_particleMol, gpu_particleUsed,
gpu_startMol, gpu_lengthMol, vars->gpu_alpha, vars->gpu_alphaSq,
constValue, imageSize, vars->gpu_kxRef[box], vars->gpu_kyRef[box],
vars->gpu_kzRef[box], vars->gpu_x, vars->gpu_y, vars->gpu_z,
vars->gpu_prefactRef[box], vars->gpu_sumRnew[box], vars->gpu_sumInew[box],
vars->gpu_isFraction, vars->gpu_molIndex, vars->gpu_lambdaCoulomb,
vars->gpu_cell_x[box], vars->gpu_cell_y[box], vars->gpu_cell_z[box],
vars->gpu_Invcell_x[box], vars->gpu_Invcell_y[box],
vars->gpu_particleCharge, vars->gpu_particleMol, vars->gpu_particleUsed,
vars->gpu_startMol, vars->gpu_lengthMol, vars->gpu_alpha,
vars->gpu_alphaSq, constValue, imageSize, vars->gpu_kxRef[box],
vars->gpu_kyRef[box], vars->gpu_kzRef[box], vars->gpu_x, vars->gpu_y,
vars->gpu_z, vars->gpu_prefactRef[box], vars->gpu_sumRnew[box],
vars->gpu_sumInew[box], vars->gpu_isFraction, vars->gpu_molIndex,
vars->gpu_lambdaCoulomb, vars->gpu_cell_x[box], vars->gpu_cell_y[box],
vars->gpu_cell_z[box], vars->gpu_Invcell_x[box], vars->gpu_Invcell_y[box],
vars->gpu_Invcell_z[box], vars->gpu_nonOrth, boxAxes.GetAxis(box).x,
boxAxes.GetAxis(box).y, boxAxes.GetAxis(box).z, moveType, box);
#ifndef NDEBUG
Expand All @@ -414,13 +401,6 @@ void CallBoxForceReciprocalGPU(
cudaMemcpy(molForceRec.z, vars->gpu_mForceRecz, sizeof(double) * molCount,
cudaMemcpyDeviceToHost);
}

#ifndef NDEBUG
cudaDeviceSynchronize();
#endif
CUFREE(gpu_particleUsed);
CUFREE(gpu_startMol);
CUFREE(gpu_lengthMol);
}

__global__ void BoxForceReciprocalGPU(
Expand Down
17 changes: 16 additions & 1 deletion src/GPU/ConstantDefinitionsCUDAKernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,10 @@ void InitPartVariablesCUDA(VariablesCUDA *vars,
particleCharge.size() * sizeof(double), cudaMemcpyHostToDevice);
}

void InitEwaldVariablesCUDA(VariablesCUDA *vars, uint imageTotal) {
void InitEwaldVariablesCUDA(VariablesCUDA *vars,
const std::vector<int> &startMol,
const std::vector<int> &lengthMol,
uint imageTotal) {
vars->gpu_kx = new double *[BOX_TOTAL];
vars->gpu_ky = new double *[BOX_TOTAL];
vars->gpu_kz = new double *[BOX_TOTAL];
Expand Down Expand Up @@ -235,6 +238,15 @@ void InitEwaldVariablesCUDA(VariablesCUDA *vars, uint imageTotal) {
CUMALLOC((void **)&vars->gpu_wT23, imageTotal * sizeof(double));
CUMALLOC((void **)&vars->gpu_wT33, imageTotal * sizeof(double));
CUMALLOC((void **)&vars->gpu_recipEnergies, imageTotal * sizeof(double));
// The size of startMol and lengthMol are both the number of atoms in the
// system
CUMALLOC((void **)&vars->gpu_startMol, startMol.size() * sizeof(int));
CUMALLOC((void **)&vars->gpu_lengthMol, lengthMol.size() * sizeof(int));
CUMALLOC((void **)&vars->gpu_particleUsed, lengthMol.size() * sizeof(int));
cudaMemcpy(vars->gpu_startMol, &startMol[0], startMol.size() * sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(vars->gpu_lengthMol, &lengthMol[0], lengthMol.size() * sizeof(int),
cudaMemcpyHostToDevice);
// Allocate space for cub reduction operations on the Ewald arrays
// Set to the maximum value
cub::DeviceReduce::Sum(vars->cub_reduce_storage,
Expand Down Expand Up @@ -420,6 +432,9 @@ void DestroyEwaldCUDAVars(VariablesCUDA *vars) {
CUFREE(vars->gpu_wT23);
CUFREE(vars->gpu_wT33);
CUFREE(vars->gpu_recipEnergies);
CUFREE(vars->gpu_startMol);
CUFREE(vars->gpu_lengthMol);
CUFREE(vars->gpu_particleUsed);
CUFREE(vars->cub_reduce_storage);

delete[] vars->gpu_kx;
Expand Down
4 changes: 3 additions & 1 deletion src/GPU/ConstantDefinitionsCUDAKernel.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ void InitGPUForceField(VariablesCUDA &vars, double const *sigmaSq,
double const *alphaSq, int ewald, double diElectric_1);
void InitCoordinatesCUDA(VariablesCUDA *vars, uint maxAtomNumber,
uint maxAtomsInMol, uint maxMolNumber);
void InitEwaldVariablesCUDA(VariablesCUDA *vars, uint imageTotal);
void InitEwaldVariablesCUDA(VariablesCUDA *vars,
const std::vector<int> &startMol,
const std::vector<int> &lengthMol, uint imageTotal);
void InitExp6VariablesCUDA(VariablesCUDA *vars, double *rMin, double *expConst,
double *rMaxSq, uint size);
void InitPartVariablesCUDA(VariablesCUDA *vars,
Expand Down
6 changes: 5 additions & 1 deletion src/GPU/VariablesCUDA.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ public:
gpu_ewald = nullptr;
gpu_diElectric_1 = nullptr;
gpu_finalVal = nullptr;
gpu_startMol = nullptr;
gpu_lengthMol = nullptr;
gpu_particleUsed = nullptr;
gpu_aForcex = nullptr;
gpu_aForcey = nullptr;
gpu_aForcez = nullptr;
Expand Down Expand Up @@ -142,7 +145,8 @@ public:
double *gpu_finalVal;
double *gpu_x, *gpu_y, *gpu_z;
double *gpu_molCharge, *gpu_particleCharge;
int *gpu_particleKind, *gpu_particleMol;
int *gpu_particleKind, *gpu_particleMol, *gpu_particleUsed;
int *gpu_startMol, *gpu_lengthMol;
double *gpu_nx, *gpu_ny, *gpu_nz;
double *gpu_dx, *gpu_dy, *gpu_dz;
double **gpu_kx, **gpu_ky, **gpu_kz;
Expand Down

0 comments on commit 3cb2b6b

Please sign in to comment.