Tune PermutationForDeposition for MI250X (#3925)

## Summary

PermutationForDeposition was initially developed for the A100. A few tweaks
improve performance on the MI250X, which has a smaller cache but is much less
sensitive to atomic add contention.

TODO: more tests
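
The heart of the change is how each GPU block reserves its slice of the output permutation. The standalone sketch below is only an illustration in plain host C++ with made-up particle data, not the AMReX kernel (the real code does this inside the kernel with a block-wide prefix sum, `Gpu::blockReduceSum`, and `Gpu::Atomic::Add`). Without the compressed layout, every sweep of up to `gpu_block_size` particles grabs its own output range with an atomic add, so sweeps from different blocks can interleave. With the compressed layout, a block counts its particles first, reserves the whole range with a single atomic add, and then writes them contiguously.

```cpp
// Standalone host-side sketch (hypothetical, not the AMReX kernel): it only
// illustrates how the two layouts reserve space in the output permutation.
#include <algorithm>
#include <atomic>
#include <cstddef>
#include <cstdio>
#include <vector>

int main ()
{
    constexpr std::size_t block_size = 4;               // stand-in for gpu_block_size
    const std::vector<std::vector<int>> blocks = {      // particle ids owned by two blocks
        {0, 1, 2, 3, 4, 5}, {10, 11, 12, 13, 14, 15, 16}
    };
    const std::size_t total = 13;

    // Non-compressed layout: every sweep of up to block_size particles does its own
    // atomic add on the global counter, so sweeps from different blocks interleave.
    {
        std::atomic<std::size_t> global_idx{0};
        std::vector<int> perm(total, -1);
        std::size_t max_sweeps = 0;
        for (auto const& b : blocks) {
            max_sweeps = std::max(max_sweeps, (b.size() + block_size - 1) / block_size);
        }
        for (std::size_t sweep = 0; sweep < max_sweeps; ++sweep) {
            for (auto const& b : blocks) {               // round-robin mimics concurrent blocks
                std::size_t first = sweep * block_size;
                if (first >= b.size()) { continue; }
                std::size_t n = std::min(block_size, b.size() - first);
                std::size_t start = global_idx.fetch_add(n);
                for (std::size_t i = 0; i < n; ++i) { perm[start + i] = b[first + i]; }
            }
        }
        std::printf("per-sweep reservation: ");
        for (int p : perm) { std::printf("%d ", p); }    // chunks of the two blocks alternate
        std::printf("\n");
    }

    // Compressed layout: each block counts its particles first, reserves the whole
    // range with one atomic add, then fills it, so its output is contiguous.
    {
        std::atomic<std::size_t> global_idx{0};
        std::vector<int> perm(total, -1);
        for (auto const& b : blocks) {
            std::size_t start = global_idx.fetch_add(b.size());
            for (std::size_t i = 0; i < b.size(); ++i) { perm[start + i] = b[i]; }
        }
        std::printf("up-front reservation:  ");
        for (int p : perm) { std::printf("%d ", p); }
        std::printf("\n");
    }
    return 0;
}
```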

## Additional background


![sp4d_amd](https://github.com/AMReX-Codes/amrex/assets/64009254/df1d08c7-d5a8-4f2a-bec9-a994431f4a9e)


## Checklist

The proposed changes:
- [ ] fix a bug or incorrect behavior in AMReX
- [ ] add new capabilities to AMReX
- [ ] change answers in the test suite to more than roundoff level
- [ ] are likely to significantly affect the results of downstream AMReX users
- [ ] include documentation in the code and/or rst files, if appropriate
AlexanderSinn authored May 22, 2024
1 parent 568efb1 commit fed4bc1
Showing 1 changed file with 50 additions and 19 deletions.
Src/Particle/AMReX_ParticleUtil.H (69 changes: 50 additions & 19 deletions)
@@ -697,9 +697,20 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
 {
     BL_PROFILE("PermutationForDeposition()");

-    constexpr index_type gpu_block_size = 1024;
-    constexpr index_type gpu_block_size_m1 = gpu_block_size - 1;
-    constexpr index_type llist_guard = std::numeric_limits<index_type>::max();
+#if defined(AMREX_USE_HIP)
+    // MI250X has a small L2 cache and is more tolerant of atomic add contention,
+    // so we use a small block size of 64 and the compressed layout.
+    static constexpr index_type gpu_block_size = 64;
+    static constexpr bool compressed_layout = true;
+#else
+    // A100 has a larger L2 cache and is very sensitive to atomic add contention,
+    // so we use a large block size of 1024 and not the compressed layout.
+    static constexpr index_type gpu_block_size = 1024;
+    static constexpr bool compressed_layout = false;
+#endif
+
+    static constexpr index_type gpu_block_size_m1 = gpu_block_size - 1;
+    static constexpr index_type llist_guard = std::numeric_limits<index_type>::max();

     // round up to gpu_block_size
     nbins = (nbins + gpu_block_size_m1) / gpu_block_size * gpu_block_size;
@@ -722,9 +733,34 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n

 #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
     amrex::launch<gpu_block_size>(nbins / gpu_block_size, Gpu::gpuStream(),
-        [=] AMREX_GPU_DEVICE () {
+        [pllist_start,pllist_next,pperm,pglobal_idx] AMREX_GPU_DEVICE () {
             __shared__ index_type sdata[gpu_block_size];
-            index_type current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];
+            __shared__ index_type global_idx_start;
+            __shared__ index_type idx_start;
+
+            index_type current_idx = 0;
+
+            if constexpr (compressed_layout) {
+                // Compressed layout: subsequent sweeps of up to gpu_block_size contiguous particles
+                // are put right next to each other, while without the compressed layout,
+                // there can be other particle sweeps from different locations between them.
+                current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];
+
+                index_type num_particles_thread = 0;
+                while (current_idx != llist_guard) {
+                    ++num_particles_thread;
+                    current_idx = pllist_next[current_idx];
+                }
+
+                index_type num_particles_block =
+                    Gpu::blockReduceSum<gpu_block_size>(num_particles_thread);
+
+                if (threadIdx.x == 0) {
+                    global_idx_start = Gpu::Atomic::Add(pglobal_idx, num_particles_block);
+                }
+            }
+
+            current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];

             while (true) {
                 sdata[threadIdx.x] = index_type(current_idx != llist_guard);
@@ -745,30 +781,25 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
                 if (sdata[gpu_block_size_m1] == 0) {
                     break;
                 }
-                __syncthreads();
                 if (threadIdx.x == gpu_block_size_m1) {
-                    x = sdata[gpu_block_size_m1];
-                    sdata[gpu_block_size_m1] = Gpu::Atomic::Add(pglobal_idx, x);
-                }
-                __syncthreads();
-                if (threadIdx.x < gpu_block_size_m1) {
-                    sdata[threadIdx.x] += sdata[gpu_block_size_m1];
-                }
-                __syncthreads();
-                if (threadIdx.x == gpu_block_size_m1) {
-                    sdata[gpu_block_size_m1] += x;
+                    if constexpr (compressed_layout) {
+                        idx_start = global_idx_start;
+                        global_idx_start += sdata[gpu_block_size_m1];
+                    } else {
+                        idx_start = Gpu::Atomic::Add(pglobal_idx, sdata[gpu_block_size_m1]);
+                    }
                 }
                 __syncthreads();

+                sdata[threadIdx.x] += idx_start;
                 if (current_idx != llist_guard) {
                     pperm[sdata[threadIdx.x] - 1] = current_idx;
                     current_idx = pllist_next[current_idx];
                 }
             }
         });
 #else
-    amrex::ignore_unused(pperm, pglobal_idx);
-    Abort("Not implemented");
+    amrex::ignore_unused(pperm, pglobal_idx, compressed_layout);
+    Abort("PermutationForDeposition only implemented for CUDA and HIP");
 #endif

     Gpu::Device::streamSynchronize();
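
For reference, the kernel above walks per-bin singly linked lists: `pllist_start[b]` holds the most recently inserted particle of bin `b` (or `llist_guard` if the bin is empty) and `pllist_next[i]` points to the previous particle inserted into the same bin. Building those lists is not part of this diff; the sketch below is an assumed illustration of how such lists are typically constructed and traversed, with made-up bins and a plain sequential loop standing in for the atomic head swap a GPU kernel would use.

```cpp
// Hypothetical sketch of per-bin linked lists like pllist_start/pllist_next
// (not taken from this diff; bins and particle counts are made up).
#include <cstdio>
#include <limits>
#include <vector>

using index_type = unsigned int;

int main ()
{
    constexpr index_type llist_guard = std::numeric_limits<index_type>::max();
    const index_type nbins = 3;
    const std::vector<index_type> bin_of = {0, 2, 0, 1, 2, 0};  // bin of each particle

    std::vector<index_type> llist_start(nbins, llist_guard);    // head of each bin's list
    std::vector<index_type> llist_next(bin_of.size());          // next particle in the same bin

    // Insert each particle at the head of its bin's list. A GPU kernel would do the
    // head swap with an atomic exchange so concurrent insertions stay consistent.
    for (index_type i = 0; i < bin_of.size(); ++i) {
        const index_type b = bin_of[i];
        llist_next[i] = llist_start[b];
        llist_start[b] = i;
    }

    // Walk each list the same way the deposition kernel does, stopping at the guard.
    for (index_type b = 0; b < nbins; ++b) {
        std::printf("bin %u:", b);
        for (index_type i = llist_start[b]; i != llist_guard; i = llist_next[i]) {
            std::printf(" %u", i);
        }
        std::printf("\n");
    }
    return 0;
}
```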
