Skip to content

Commit

Permalink
prov/util: Add name field to memory monitors:
Browse files Browse the repository at this point in the history
    Log when monitor states are changed.
    Log name of default monitor.

Signed-off-by: Mike Uttormark <mike.uttormark@hpe.com>
  • Loading branch information
muttormark committed Jan 22, 2024
1 parent a0b06ed commit 4729af9
Show file tree
Hide file tree
Showing 9 changed files with 73 additions and 15 deletions.
1 change: 1 addition & 0 deletions include/ofi_mr.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ struct ofi_mem_monitor {
*/
bool (*valid)(struct ofi_mem_monitor *notifier,
const struct ofi_mr_info *info, struct ofi_mr_entry *entry);
const char *name;
};

void ofi_monitor_init(struct ofi_mem_monitor *monitor);
Expand Down
1 change: 1 addition & 0 deletions prov/util/src/cuda_ipc_monitor.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ static struct ofi_mem_monitor cuda_ipc_monitor_ = {
.subscribe = ofi_monitor_subscribe_no_op,
.unsubscribe = ofi_monitor_unsubscribe_no_op,
.valid = cuda_ipc_monitor_valid,
.name = "cuda_ipc",
};

struct ofi_mem_monitor *cuda_ipc_monitor = &cuda_ipc_monitor_;
1 change: 1 addition & 0 deletions prov/util/src/cuda_mem_monitor.c
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ static struct ofi_mem_monitor cuda_mm = {
.subscribe = cuda_mm_subscribe,
.unsubscribe = cuda_mm_unsubscribe,
.valid = cuda_mm_valid,
.name = "cuda",
};

struct ofi_mem_monitor *cuda_monitor = &cuda_mm;
1 change: 1 addition & 0 deletions prov/util/src/rocr_ipc_monitor.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ static struct ofi_mem_monitor rocr_ipc_monitor_ = {
.subscribe = ofi_monitor_subscribe_no_op,
.unsubscribe = ofi_monitor_unsubscribe_no_op,
.valid = rocr_ipc_monitor_valid,
.name = "rocr_ipc",
};

struct ofi_mem_monitor *rocr_ipc_monitor = &rocr_ipc_monitor_;
1 change: 1 addition & 0 deletions prov/util/src/rocr_mem_monitor.c
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,7 @@ static struct ofi_mem_monitor rocr_mm = {
.subscribe = rocr_mm_subscribe,
.unsubscribe = rocr_mm_unsubscribe,
.valid = rocr_mm_valid,
.name = "rocr",
};

struct ofi_mem_monitor *rocr_monitor = &rocr_mm;
Expand Down
1 change: 1 addition & 0 deletions prov/util/src/util_mem_hooks.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ struct ofi_memhooks memhooks = {
.monitor.cleanup = ofi_monitor_cleanup,
.monitor.start = ofi_memhooks_start,
.monitor.stop = ofi_memhooks_stop,
.monitor.name = "memhooks",
};
struct ofi_mem_monitor *memhooks_monitor = &memhooks.monitor;

Expand Down
80 changes: 65 additions & 15 deletions prov/util/src/util_mem_monitor.c
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ static struct ofi_uffd uffd = {
.monitor.cleanup = ofi_monitor_cleanup,
.monitor.start = ofi_uffd_start,
.monitor.stop = ofi_uffd_stop,
.monitor.name = "uffd",
};
struct ofi_mem_monitor *uffd_monitor = &uffd.monitor;

Expand All @@ -64,6 +65,9 @@ struct ofi_mem_monitor *default_cuda_monitor;
struct ofi_mem_monitor *default_rocr_monitor;
struct ofi_mem_monitor *default_ze_monitor;

/*
 * Table of all memory monitors and its entry count.
 * Allocated and filled by initialize_monitor_list(), released and reset
 * to NULL / 0 by cleanup_monitor_list(). ofi_monitors_init() and
 * ofi_monitors_cleanup() walk it to call each monitor's init/cleanup hook.
 * NOTE(review): no extern declaration is visible here; if nothing outside
 * this file uses these, they could be made static -- confirm.
 */
struct ofi_mem_monitor **monitor_list;
size_t monitor_list_size;

static size_t ofi_default_cache_size(void)
{
long cpu_cnt;
Expand Down Expand Up @@ -125,6 +129,9 @@ static int ofi_monitors_update(struct ofi_mem_monitor **monitors)
assert(monitor->state != FI_MM_STATE_UNSPEC);
switch (monitor->state) {
case FI_MM_STATE_STARTING:
FI_INFO(&core_prov, FI_LOG_MR,
"Starting memory monitor: %s\n",
monitor->name);
ret = monitor->start(monitor);
if (ret) {
monitor->state = FI_MM_STATE_IDLE;
Expand All @@ -137,6 +144,9 @@ static int ofi_monitors_update(struct ofi_mem_monitor **monitors)
monitor->state = FI_MM_STATE_RUNNING;
break;
case FI_MM_STATE_STOPPING:
FI_INFO(&core_prov, FI_LOG_MR,
"Stopping memory monitor: %s\n",
monitor->name);
monitor->stop(monitor);
monitor->state = FI_MM_STATE_IDLE;
break;
Expand All @@ -162,6 +172,40 @@ void ofi_monitor_cleanup(struct ofi_mem_monitor *monitor)
assert(monitor->state == FI_MM_STATE_IDLE);
}

static void initialize_monitor_list()
{
/* Save a copy of the monitor list for cleanup time.
* This list can not be static because the pointer
* initialization is spread across mulitple modules.
*/

struct ofi_mem_monitor *monitors[] = {
uffd_monitor,
memhooks_monitor,
cuda_monitor,
cuda_ipc_monitor,
rocr_monitor,
rocr_ipc_monitor,
xpmem_monitor,
ze_monitor,
import_monitor,
};

monitor_list_size = ARRAY_SIZE(monitors);
monitor_list = calloc(monitor_list_size, sizeof(*monitor_list));

for (size_t i = 0; i < monitor_list_size; i++) {
monitor_list[i] = monitors[i];
assert(monitor_list[i]->name);
}
}

/*
 * Release the monitor table and reset the bookkeeping globals so the
 * list reads as empty (NULL pointer, zero entries) afterwards.
 */
static void cleanup_monitor_list()
{
	struct ofi_mem_monitor **stale = monitor_list;

	/* Mark the list empty first, then hand the old storage back. */
	monitor_list_size = 0;
	monitor_list = NULL;
	free(stale);
}

/*
* Initialize all available memory monitors
*/
Expand All @@ -171,15 +215,14 @@ void ofi_monitors_init(void)
pthread_mutex_init(&mm_state_lock, NULL);
pthread_rwlock_init(&mm_list_rwlock, NULL);

uffd_monitor->init(uffd_monitor);
memhooks_monitor->init(memhooks_monitor);
cuda_monitor->init(cuda_monitor);
cuda_ipc_monitor->init(cuda_ipc_monitor);
rocr_monitor->init(rocr_monitor);
rocr_ipc_monitor->init(rocr_ipc_monitor);
xpmem_monitor->init(xpmem_monitor);
ze_monitor->init(ze_monitor);
import_monitor->init(import_monitor);
initialize_monitor_list();

for (size_t i = 0; i < monitor_list_size; i++) {
FI_INFO(&core_prov, FI_LOG_MR,
"Initializing memory monitor %s\n",
monitor_list[i]->name);
monitor_list[i]->init(monitor_list[i]);
}

fi_param_define(NULL, "mr_cache_max_size", FI_PARAM_SIZE_T,
"Defines the total number of bytes for all memory"
Expand Down Expand Up @@ -259,6 +302,10 @@ void ofi_monitors_init(void)
}
}

FI_INFO(&core_prov, FI_LOG_MR,
"Default memory monitor is: %s\n",
(default_monitor) ? default_monitor->name : "disabled");

if (cache_params.cuda_monitor_enabled)
default_cuda_monitor = cuda_monitor;
else
Expand All @@ -277,12 +324,14 @@ void ofi_monitors_init(void)

void ofi_monitors_cleanup(void)
{
uffd_monitor->cleanup(uffd_monitor);
memhooks_monitor->cleanup(memhooks_monitor);
cuda_monitor->cleanup(cuda_monitor);
rocr_monitor->cleanup(rocr_monitor);
ze_monitor->cleanup(ze_monitor);
import_monitor->cleanup(import_monitor);
for (size_t i = 0; i < monitor_list_size; i++) {
FI_INFO(&core_prov, FI_LOG_MR,
"Cleaning up memory monitor %s\n",
monitor_list[i]->name);
monitor_list[i]->cleanup(monitor_list[i]);
}

cleanup_monitor_list();

pthread_rwlock_destroy(&mm_list_rwlock);
pthread_mutex_destroy(&mm_state_lock);
Expand Down Expand Up @@ -735,6 +784,7 @@ static struct ofi_import_monitor impmon = {
.monitor.subscribe = ofi_import_monitor_subscribe,
.monitor.unsubscribe = ofi_import_monitor_unsubscribe,
.monitor.valid = ofi_import_monitor_valid,
.monitor.name = "import",
};

struct ofi_mem_monitor *import_monitor = &impmon.monitor;
Expand Down
1 change: 1 addition & 0 deletions prov/util/src/xpmem_monitor.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ static struct ofi_mem_monitor xpmem_monitor_ = {
.subscribe = ofi_monitor_subscribe_no_op,
.unsubscribe = ofi_monitor_unsubscribe_no_op,
.valid = xpmem_monitor_valid,
.name = "xpmem",
};

struct ofi_mem_monitor *xpmem_monitor = &xpmem_monitor_;
1 change: 1 addition & 0 deletions prov/util/src/ze_mem_monitor.c
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ static struct ofi_mem_monitor ze_mm = {
.subscribe = ze_mm_subscribe,
.unsubscribe = ze_mm_unsubscribe,
.valid = ze_mm_valid,
.name = "ze",
};

struct ofi_mem_monitor *ze_monitor = &ze_mm;

0 comments on commit 4729af9

Please sign in to comment.