From 7bffdaccbc3bee42dba5d96009debeb099535140 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 25 Jul 2024 22:54:29 +0200 Subject: [PATCH] Add timeout to UCXX generic operations (#2398) https://github.com/rapidsai/ucxx/pull/238 introduced a new timeout argument for `registerGeneric{Pre,Post}` that can be used to prevent blocking indefinitely should there be no UCX worker progress wakeup events. This should also result in new RAFT packages with updated symbols. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2398 --- cpp/include/raft/comms/detail/std_comms.hpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/comms/detail/std_comms.hpp b/cpp/include/raft/comms/detail/std_comms.hpp index cb1accc95e..c5d64f6a29 100644 --- a/cpp/include/raft/comms/detail/std_comms.hpp +++ b/cpp/include/raft/comms/detail/std_comms.hpp @@ -307,13 +307,16 @@ class std_comms : public comms_iface { bool restart = false; // resets the timeout when any progress was made if (worker->isProgressThreadRunning()) { - // Wait for a UCXX progress thread roundtrip + // Wait for a UCXX progress thread roundtrip, prevent waiting for longer + // than 10ms for each operation, will retry in next iteration. ucxx::utils::CallbackNotifier callbackNotifierPre{}; - worker->registerGenericPre([&callbackNotifierPre]() { callbackNotifierPre.set(); }); + worker->registerGenericPre([&callbackNotifierPre]() { callbackNotifierPre.set(); }, + 10000000 /* 10ms */); callbackNotifierPre.wait(); ucxx::utils::CallbackNotifier callbackNotifierPost{}; - worker->registerGenericPost([&callbackNotifierPost]() { callbackNotifierPost.set(); }); + worker->registerGenericPost([&callbackNotifierPost]() { callbackNotifierPost.set(); }, + 10000000 /* 10ms */); callbackNotifierPost.wait(); } else { // Causes UCXX to progress through the send/recv message queue