Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure that CCL output is contiguous on modules #666

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions device/alpaka/src/clusterization/clusterization_algorithm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,8 @@ struct CCLKernel {

traccc::alpaka::thread_id1 thread_id(acc);

auto& partition_start =
::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);
auto& partition_end =
::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);
auto& outi = ::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);
auto& shared = ::alpaka::declareSharedVar<
device::details::ccl_kernel_static_smem_parcel, __COUNTER__>(acc);

device::details::index_t* const shared_v =
::alpaka::getDynSharedMem<device::details::index_t>(acc);
Expand All @@ -56,9 +53,8 @@ struct CCLKernel {

alpaka::barrier<TAcc> barry_r(&acc);

device::ccl_kernel(cfg, thread_id, cells_view, modules_view,
partition_start, partition_end, outi, f_view,
gf_view, f_backup_view, gf_backup_view,
device::ccl_kernel(cfg, thread_id, cells_view, modules_view, shared,
f_view, gf_view, f_backup_view, gf_backup_view,
adjc_backup_view, adjv_backup_view, backup_mutex,
barry_r, measurements_view, cell_links);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,13 @@
#include <cstddef>

namespace traccc::device {
namespace details {
/// Aggregate of the static shared-memory state needed by the CCL kernel.
///
/// One instance of this struct lives in block-shared memory per thread block
/// (see the `declareSharedVar` call in the alpaka kernel); bundling the
/// variables lets the kernel request a single shared allocation.
struct ccl_kernel_static_smem_parcel {
    /// First cell index (inclusive) of the partition owned by this block.
    std::size_t partition_start;
    /// One-past-the-last cell index of the partition owned by this block.
    std::size_t partition_end;
    /// Block-wide scratch counter; repurposed over the kernel's lifetime:
    /// first an atomically-incremented count of measurements found by the
    /// block, later the block's starting offset into the global measurement
    /// output array.
    uint32_t outi;
};
} // namespace details

/// Function which reads raw detector cells and turns them into measurements.
///
Expand Down Expand Up @@ -59,7 +66,7 @@ TRACCC_DEVICE inline void ccl_kernel(
const clustering_config cfg, const thread_id_t& thread_id,
const cell_collection_types::const_view cells_view,
const cell_module_collection_types::const_view modules_view,
std::size_t& partition_start, std::size_t& partition_end, std::size_t& outi,
details::ccl_kernel_static_smem_parcel& smem,
vecmem::data::vector_view<details::index_t> f_view,
vecmem::data::vector_view<details::index_t> gf_view,
vecmem::data::vector_view<details::index_t> f_backup_view,
Expand Down
159 changes: 117 additions & 42 deletions device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,19 @@

#include "traccc/clusterization/clustering_config.hpp"
#include "traccc/clusterization/device/aggregate_cluster.hpp"
#include "traccc/clusterization/device/ccl_kernel.hpp"
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
#include "traccc/clusterization/device/reduce_problem_cell.hpp"
#include "traccc/device/concepts/barrier.hpp"
#include "traccc/device/concepts/thread_id.hpp"
#include "traccc/device/mutex.hpp"
#include "traccc/device/sort.hpp"
#include "traccc/device/unique_lock.hpp"
#include "traccc/edm/cell.hpp"
#include "traccc/edm/measurement.hpp"
#include "vecmem/memory/device_atomic_ref.hpp"

namespace traccc::device {

/// Implementation of a FastSV algorithm with the following steps:
/// 1) mix of stochastic and aggressive hooking
/// 2) shortcutting
Expand Down Expand Up @@ -136,15 +138,15 @@ TRACCC_DEVICE void fast_sv_1(const thread_id_t& thread_id,
template <device::concepts::barrier barrier_t,
device::concepts::thread_id1 thread_id_t>
TRACCC_DEVICE inline void ccl_core(
const thread_id_t& thread_id, std::size_t& partition_start,
std::size_t& partition_end, vecmem::device_vector<details::index_t> f,
const thread_id_t& thread_id, details::ccl_kernel_static_smem_parcel& smem,
vecmem::device_vector<details::index_t> f,
vecmem::device_vector<details::index_t> gf,
vecmem::data::vector_view<unsigned int> cell_links, details::index_t* adjv,
unsigned char* adjc, const cell_collection_types::const_device cells_device,
const cell_module_collection_types::const_device modules_device,
measurement_collection_types::device measurements_device,
barrier_t& barrier) {
const details::index_t size = partition_end - partition_start;
const details::index_t size = smem.partition_end - smem.partition_start;

assert(size <= f.size());
assert(size <= gf.size());
Expand All @@ -160,8 +162,8 @@ TRACCC_DEVICE inline void ccl_core(
const details::index_t cid =
tst * thread_id.getBlockDimX() + thread_id.getLocalThreadIdX();
adjc[tst] = 0;
reduce_problem_cell(cells_device, cid, partition_start, partition_end,
adjc[tst], &adjv[8 * tst]);
reduce_problem_cell(cells_device, cid, smem.partition_start,
smem.partition_end, adjc[tst], &adjv[8 * tst]);
}

for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
Expand Down Expand Up @@ -189,20 +191,96 @@ TRACCC_DEVICE inline void ccl_core(

barrier.blockBarrier();

/*
* We will now repurpose the `gf` shared vector to store the cells which
* are parents. For the time being, `outi` will be the size of `gf`.
*/
if (thread_id.getLocalThreadIdX() == 0) {
smem.outi = 0;
}

barrier.blockBarrier();

/*
* We now collect the parents into the `gf` array.
*/
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
const details::index_t cid =
tst * thread_id.getBlockDimX() + thread_id.getLocalThreadIdX();

if (f.at(cid) == cid) {
// Add a new measurement to the output buffer. Remembering its
// position inside of the container.
const measurement_collection_types::device::size_type meas_pos =
measurements_device.push_back({});
// Set up the measurement under the appropriate index.
aggregate_cluster(
cells_device, modules_device, f, partition_start, partition_end,
cid, measurements_device.at(meas_pos), cell_links, meas_pos);
vecmem::device_atomic_ref<unsigned int,
vecmem::device_address_space::local>
atom(smem.outi);
gf.at(atom.fetch_add(1)) = cid;
}
}

barrier.blockBarrier();

uint32_t n_measurements = smem.outi;

barrier.blockBarrier();

    /*
     * The lead thread resizes the measurement vector to contain the total
     * number of measurements. The `outi` variable is now repurposed to contain
     * the start of this block's region in the output array.
     */
if (thread_id.getLocalThreadIdX() == 0) {
smem.outi = measurements_device.bulk_append_implicit(n_measurements);
}

barrier.blockBarrier();

/*
* Sort the measurements so that the modules are ordered, which guarantees
* that they will be contiguous.
*
* TODO: Can this be done more cleverly? Perhaps while collecting the
* elements into `gf`.
*/
blockOddEvenSort(
thread_id, barrier, gf.data(), n_measurements,
[&](const unsigned short lhs, const unsigned short rhs) {
return cells_device.at(smem.partition_start + f.at(lhs))
.module_link <
cells_device.at(smem.partition_start + f.at(rhs))
.module_link;
});

barrier.blockBarrier();

/*
* Aggregate the clusters and write them to global memory.
*/
for (uint32_t i = thread_id.getLocalThreadIdX(); i < n_measurements;
i += thread_id.getBlockDimX()) {
uint32_t meas_pos = smem.outi + i;
// Set up the measurement under the appropriate index.
aggregate_cluster(cells_device, modules_device, f, smem.partition_start,
smem.partition_end, gf.at(i),
measurements_device.at(meas_pos), cell_links,
meas_pos);
}
}

/// Find a valid partition boundary at or near `target_end`.
///
/// Scans the cell array from `begin`, remembering every index that lies on a
/// module boundary, i.e. an index whose cell belongs to a different module
/// than its predecessor, or the one-past-the-end index of the whole array.
/// The scan runs at least until `target_end`, and past it if no boundary has
/// been found yet.
///
/// @param cells       The (module-sorted) device cell collection.
/// @param begin       First candidate index; must be >= 1 so that the
///                    `cells.at(i - 1)` comparison is valid (the caller
///                    clamps with `std::max(1lu, ...)`).
/// @param target_end  The preferred (soft) end of the partition.
/// @return The last module boundary found before `target_end`, or - if none
///         exists in `[begin, target_end)` - the first boundary at or after
///         `target_end`.
TRACCC_DEVICE inline std::size_t ccl_partition_find_helper(
    const cell_collection_types::const_device cells, std::size_t begin,
    std::size_t target_end) {
    bool found_good = false;
    // Initialise to `begin` so that even if asserts are compiled out
    // (NDEBUG) and the scan somehow finds no boundary, we return a defined
    // value rather than reading an uninitialised variable (UB).
    std::size_t last_good = begin;

    // Keep scanning while we are before the soft end OR have not yet seen
    // any boundary; the `i == cells.size()` clause guarantees termination.
    for (std::size_t i = begin; i < target_end || !found_good; ++i) {
        if (i == cells.size() ||
            cells.at(i - 1).module_link != cells.at(i).module_link) {
            found_good = true;
            last_good = i;
        }
    }

    assert(found_good);
    return last_good;
}

template <device::concepts::barrier barrier_t,
Expand All @@ -211,7 +289,7 @@ TRACCC_DEVICE inline void ccl_kernel(
const clustering_config cfg, const thread_id_t& thread_id,
const cell_collection_types::const_view cells_view,
const cell_module_collection_types::const_view modules_view,
std::size_t& partition_start, std::size_t& partition_end, std::size_t& outi,
details::ccl_kernel_static_smem_parcel& smem,
vecmem::data::vector_view<details::index_t> f_view,
vecmem::data::vector_view<details::index_t> gf_view,
vecmem::data::vector_view<details::index_t> f_backup_view,
Expand All @@ -236,8 +314,6 @@ TRACCC_DEVICE inline void ccl_kernel(
mutex<uint32_t> mutex(backup_mutex);
unique_lock lock(mutex, std::defer_lock);

const std::size_t num_cells = cells_device.size();

/*
* First, we determine the exact range of cells that is to be examined
* by this block of threads. We start from an initial range determined
Expand All @@ -247,42 +323,42 @@ TRACCC_DEVICE inline void ccl_kernel(
* amounts.
*/
if (thread_id.getLocalThreadIdX() == 0) {
std::size_t start =
thread_id.getBlockIdX() * cfg.target_partition_size();
assert(start < num_cells);
std::size_t end =
std::min(num_cells, start + cfg.target_partition_size());
outi = 0;
const std::size_t num_cells = cells_device.size();

smem.outi = 0;

std::size_t boundary_beg =
std::max(1lu, std::min(num_cells, thread_id.getBlockIdX() *
cfg.target_partition_size()));
std::size_t boundary_mid =
std::min(num_cells, (thread_id.getBlockIdX() + 1) *
cfg.target_partition_size());
std::size_t boundary_end =
std::min(num_cells, (thread_id.getBlockIdX() + 2) *
cfg.target_partition_size());

/*
* Next, shift the starting point to a position further in the
* array; the purpose of this is to ensure that we are not operating
* on any cells that have been claimed by the previous block (if
* any).
*/
while (start != 0 &&
cells_device[start - 1].module_link ==
cells_device[start].module_link &&
cells_device[start].channel1 <=
cells_device[start - 1].channel1 + 1) {
++start;
if (thread_id.getBlockIdX() == 0) {
smem.partition_start = 0;
} else {
smem.partition_start = ccl_partition_find_helper(
cells_device, boundary_beg, boundary_mid);
}

/*
* Then, claim as many cells as we need past the naive end of the
* current block to ensure that we do not end our partition on a
* cell that is not a possible boundary!
*/
while (end < num_cells &&
cells_device[end - 1].module_link ==
cells_device[end].module_link &&
cells_device[end].channel1 <=
cells_device[end - 1].channel1 + 1) {
++end;
}
partition_start = start;
partition_end = end;
assert(partition_start <= partition_end);
smem.partition_end =
ccl_partition_find_helper(cells_device, boundary_mid, boundary_end);

assert(smem.partition_start <= smem.partition_end);
}

barrier.blockBarrier();
Expand All @@ -303,7 +379,7 @@ TRACCC_DEVICE inline void ccl_kernel(
// into a return. As such, we cannot use returns in this kernel.

// Get partition for this thread group
const details::index_t size = partition_end - partition_start;
const details::index_t size = smem.partition_end - smem.partition_start;

// If the size is zero, we can just retire the whole block.
if (size == 0) {
Expand Down Expand Up @@ -342,8 +418,7 @@ TRACCC_DEVICE inline void ccl_kernel(
use_scratch = false;
}

ccl_core(thread_id, partition_start, partition_end,
use_scratch ? f_backup : f_primary,
ccl_core(thread_id, smem, use_scratch ? f_backup : f_primary,
use_scratch ? gf_backup : gf_primary, cell_links, adjv, adjc,
cells_device, modules_device, measurements_device, barrier);

Expand Down
86 changes: 86 additions & 0 deletions device/common/include/traccc/device/sort.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/**
* traccc library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#pragma once

#include <cstdint>
#include <vecmem/memory/device_atomic_ref.hpp>

#include "traccc/definitions/qualifiers.hpp"
#include "traccc/device/concepts/barrier.hpp"
#include "traccc/device/concepts/thread_id.hpp"

namespace traccc::device {
/**
* @brief Swap two values of arbitrary type.
*
* @tparam T The type of values to swap.
*
* @param a The first object in the swap (will take the value of b).
* @param b The second object in the swap (will take the value of a).
*/
template <std::movable T>
TRACCC_DEVICE void swap(T& a, T& b) {
    // Three-move exchange: stash `b`, shift `a` across, then restore.
    T stash = std::move(b);
    b = std::move(a);
    a = std::move(stash);
}

/**
* @brief Perform a block-wide odd-even key sorting.
*
* This function performs a sorting operation across the entire block, assuming
* that all the threads in the block are currently active.
*
* @warning The behaviour of this function is ill-defined if any of the threads
* in the block have exited.
*
* @warning This method is efficient for sorting small arrays, preferably in
* shared memory, but given the O(n^2) worst-case performance this should not
* be used on larger arrays.
*
* @tparam T The thread identifier type.
* @tparam B The barrier type
* @tparam K The type of keys to sort.
* @tparam C The type of the comparison function.
*
* @param thread_id The thread identifier object.
* @param barrier The barrier to use for block synchronization.
* @param keys An array of keys to sort.
* @param num_keys The number of keys in the array to sort.
* @param comparison A comparison function.
*/
template <concepts::thread_id1 T, concepts::barrier B, std::movable K,
          std::strict_weak_order<K, K> C>
TRACCC_DEVICE void blockOddEvenSort(T& thread_id, B& barrier, K* keys,
                                    uint32_t num_keys, C&& comparison) {
    // Fewer than two keys are trivially sorted. This guard also prevents the
    // unsigned underflow of `num_keys - 1` when `num_keys == 0`, which would
    // otherwise loop over ~2^32 elements out of bounds. `num_keys` must be
    // uniform across the block (see the warnings above), so every thread
    // takes this early exit together and no barrier is skipped by only a
    // subset of the block.
    if (num_keys < 2) {
        return;
    }

    bool sorted;

    do {
        sorted = true;

        // Odd phase: compare-and-swap adjacent pairs starting at odd indices.
        for (uint32_t j = 2 * thread_id.getLocalThreadIdX() + 1;
             j < num_keys - 1; j += 2 * thread_id.getBlockDimX()) {
            if (comparison(keys[j + 1], keys[j])) {
                swap(keys[j + 1], keys[j]);
                sorted = false;
            }
        }

        barrier.blockBarrier();

        // Even phase: compare-and-swap adjacent pairs starting at even
        // indices.
        for (uint32_t j = 2 * thread_id.getLocalThreadIdX(); j < num_keys - 1;
             j += 2 * thread_id.getBlockDimX()) {
            if (comparison(keys[j + 1], keys[j])) {
                swap(keys[j + 1], keys[j]);
                sorted = false;
            }
        }
        // Repeat until no thread in the block performed a swap in either
        // phase of this iteration.
    } while (barrier.blockOr(!sorted));
}
} // namespace traccc::device
Loading
Loading