Skip to content

Commit

Permalink
Use less device memory when checkpointing particles (#3238)
Browse files Browse the repository at this point in the history
This chunks the operation per box rather than per rank so as to use less
device memory.

The proposed changes:
- [ ] fix a bug or incorrect behavior in AMReX
- [ ] add new capabilities to AMReX
- [ ] change answers in the test suite to more than roundoff level
- [ ] are likely to significantly affect the results of downstream AMReX
users
- [ ] include documentation in the code and/or rst files, if appropriate

---------

Co-authored-by: Axel Huebl <axel.huebl@plasma.ninja>
Co-authored-by: Ann Almgren <asalmgren@lbl.gov>
  • Loading branch information
3 people authored Mar 29, 2024
1 parent f264290 commit fe6f3de
Showing 1 changed file with 11 additions and 10 deletions.
21 changes: 11 additions & 10 deletions Src/Particle/AMReX_WriteBinaryParticleData.H
Original file line number Diff line number Diff line change
Expand Up @@ -189,16 +189,12 @@ packIOData (Vector<int>& idata, Vector<ParticleReal>& rdata, const PC& pc, int l
const Long rChunkSize = AMREX_SPACEDIM + num_output_real;
rdata.resize(np*rChunkSize);

typename PC::IntVector idata_d(idata.size());
typename PC::RealVector rdata_d(rdata.size());

typename PC::IntVector write_int_comp_d(write_int_comp.size());
typename PC::IntVector write_real_comp_d(write_real_comp.size());
Gpu::copyAsync(Gpu::hostToDevice, write_int_comp.begin(), write_int_comp.end(),
write_int_comp_d.begin());
Gpu::copyAsync(Gpu::hostToDevice, write_real_comp.begin(), write_real_comp.end(),
write_real_comp_d.begin());
Gpu::Device::streamSynchronize();

const auto write_int_comp_d_ptr = write_int_comp_d.data();
const auto write_real_comp_d_ptr = write_real_comp_d.data();
Expand All @@ -211,6 +207,9 @@ packIOData (Vector<int>& idata, Vector<ParticleReal>& rdata, const PC& pc, int l
typename PC::IntVector offsets(np_tile);
int num_copies = Scan::ExclusiveSum(np_tile, pflags.begin(), offsets.begin(), Scan::retSum);

typename PC::IntVector idata_d(num_copies*iChunkSize);
typename PC::RealVector rdata_d(num_copies*rChunkSize);

const auto flag_ptr = pflags.data();

auto idata_d_ptr = idata_d.data();
Expand All @@ -224,11 +223,11 @@ packIOData (Vector<int>& idata, Vector<ParticleReal>& rdata, const PC& pc, int l
const auto p = ptd.getSuperParticle(pindex);

if (flag_ptr[pindex]) {
std::size_t iout_index = (pindex+poffset)*iChunkSize;
std::size_t iout_index = pindex*iChunkSize;
packParticleIDs(&idata_d_ptr[iout_index], p, is_checkpoint);
iout_index += 2;

std::size_t rout_index = (pindex+poffset)*rChunkSize;
std::size_t rout_index = pindex*rChunkSize;
for (int j = 0; j < AMREX_SPACEDIM; j++) {
rdata_d_ptr[rout_index] = p.pos(j);
rout_index++;
Expand Down Expand Up @@ -267,12 +266,14 @@ packIOData (Vector<int>& idata, Vector<ParticleReal>& rdata, const PC& pc, int l
}
});

Gpu::copyAsync(Gpu::deviceToHost, idata_d.begin(), idata_d.end(),
idata.begin() + typename PC::IntVector::difference_type(poffset));
Gpu::copyAsync(Gpu::deviceToHost, rdata_d.begin(), rdata_d.end(),
rdata.begin() + typename PC::RealVector::difference_type(poffset));
Gpu::Device::streamSynchronize();

poffset += num_copies;
}

Gpu::copyAsync(Gpu::deviceToHost, idata_d.begin(), idata_d.end(), idata.begin());
Gpu::copyAsync(Gpu::deviceToHost, rdata_d.begin(), rdata_d.end(), rdata.begin());
Gpu::Device::streamSynchronize();
}

template <class PC>
Expand Down

0 comments on commit fe6f3de

Please sign in to comment.