Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch from fine grained locking throughout the code base to device and domain level locking #743

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 7 additions & 35 deletions include/nccl_ofi_deque.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,6 @@ struct nccl_ofi_deque_t {
* locations.
*/
nccl_ofi_deque_elem_t head;
/* Lock for deque operations */
pthread_mutex_t lock;
};
typedef struct nccl_ofi_deque_t nccl_ofi_deque_t;

Expand Down Expand Up @@ -79,17 +77,13 @@ static inline int nccl_ofi_deque_insert_back(nccl_ofi_deque_t *deque, nccl_ofi_d
assert(deque);
assert(deque_elem);

nccl_net_ofi_mutex_lock(&deque->lock);

deque_elem->next = &deque->head;
deque_elem->prev = deque->head.prev;

assert(deque->head.prev);
deque->head.prev->next = deque_elem;
deque->head.prev = deque_elem;

nccl_net_ofi_mutex_unlock(&deque->lock);

return 0;
}

Expand All @@ -104,22 +98,18 @@ static inline int nccl_ofi_deque_insert_front(nccl_ofi_deque_t *deque, nccl_ofi_
assert(deque);
assert(deque_elem);

nccl_net_ofi_mutex_lock(&deque->lock);

deque_elem->next = deque->head.next;
deque_elem->prev = &deque->head;

assert(deque->head.next);
deque->head.next->prev = deque_elem;
deque->head.next = deque_elem;

nccl_net_ofi_mutex_unlock(&deque->lock);

return 0;
}

/*
* Check if the deque is empty. This call does not take the mutex.
* Check if the deque is empty.
*
* @return true if empty, false if not
*/
Expand All @@ -138,28 +128,21 @@ static inline int nccl_ofi_deque_remove_front(nccl_ofi_deque_t *deque, nccl_ofi_
assert(deque);
assert(deque_elem);

/* Shortcut to avoid taking mutex for empty deque */
if (nccl_ofi_deque_isempty(deque)) {
*deque_elem = NULL;
return 0;
}

nccl_net_ofi_mutex_lock(&deque->lock);

/* Check for empty deque. We need to do this again because the check above
was before we acquired the lock. */
if (nccl_ofi_deque_isempty(deque)) {
*deque_elem = NULL;
goto unlock;
}

*deque_elem = deque->head.next;
/* gcc 7.3.1 (AL2) is terrified *deque_elem is NULL here.
* None of us can figure out why, but add this check to make
* the compiler happy. */
if (OFI_UNLIKELY(*deque_elem == NULL)) {
return 0;
}
deque->head.next = (*deque_elem)->next;
(*deque_elem)->next->prev = &deque->head;

unlock:
nccl_net_ofi_mutex_unlock(&deque->lock);

return 0;
}

Expand All @@ -171,8 +154,6 @@ static inline void nccl_ofi_deque_remove(nccl_ofi_deque_t *deque, nccl_ofi_deque
assert(deque);
assert(deque_elem);

nccl_net_ofi_mutex_lock(&deque->lock);

assert(deque_elem->prev && deque_elem->next);

deque_elem->prev->next = deque_elem->next;
Expand All @@ -183,8 +164,6 @@ static inline void nccl_ofi_deque_remove(nccl_ofi_deque_t *deque, nccl_ofi_deque
/* Reset deque_elem pointers to avoid dangling pointers */
deque_elem->prev = NULL;
deque_elem->next = NULL;

nccl_net_ofi_mutex_unlock(&deque->lock);
}

/**
Expand All @@ -196,15 +175,12 @@ static inline nccl_ofi_deque_elem_t *nccl_ofi_deque_get_front(nccl_ofi_deque_t *

nccl_ofi_deque_elem_t *ret_elem = NULL;

nccl_net_ofi_mutex_lock(&deque->lock);

if (nccl_ofi_deque_isempty(deque)) {
ret_elem = NULL;
} else {
ret_elem = deque->head.next;
}

nccl_net_ofi_mutex_unlock(&deque->lock);
return ret_elem;
}

Expand All @@ -218,15 +194,11 @@ static inline nccl_ofi_deque_elem_t *nccl_ofi_deque_get_next(nccl_ofi_deque_t *d

nccl_ofi_deque_elem_t *ret_elem = NULL;

nccl_net_ofi_mutex_lock(&deque->lock);

ret_elem = deque_elem->next;
if (ret_elem == (&deque->head)) {
ret_elem = NULL;
}

nccl_net_ofi_mutex_unlock(&deque->lock);

return ret_elem;
}

Expand Down
13 changes: 1 addition & 12 deletions include/nccl_ofi_freelist.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,6 @@ struct nccl_ofi_freelist_t {
void *regmr_opaque;

size_t memcheck_redzone_size;

pthread_mutex_t lock;
};
typedef struct nccl_ofi_freelist_t nccl_ofi_freelist_t;

Expand Down Expand Up @@ -194,13 +192,11 @@ static inline nccl_ofi_freelist_elem_t *nccl_ofi_freelist_entry_alloc

assert(freelist);

nccl_net_ofi_mutex_lock(&freelist->lock);

if (!freelist->entries) {
ret = nccl_ofi_freelist_add(freelist, freelist->increase_entry_count);
if (ret != 0) {
NCCL_OFI_WARN("Could not extend freelist: %d", ret);
goto cleanup;
return NULL;
}
}

Expand All @@ -210,9 +206,6 @@ static inline nccl_ofi_freelist_elem_t *nccl_ofi_freelist_entry_alloc
freelist->entries = entry->next;
nccl_ofi_freelist_entry_set_undefined(freelist, entry->ptr);

cleanup:
nccl_net_ofi_mutex_unlock(&freelist->lock);

return entry;
}

Expand All @@ -231,14 +224,10 @@ static inline void nccl_ofi_freelist_entry_free(nccl_ofi_freelist_t *freelist,
assert(freelist);
assert(entry);

nccl_net_ofi_mutex_lock(&freelist->lock);

entry->next = freelist->entries;
freelist->entries = entry;

nccl_net_ofi_mem_noaccess(entry->ptr, user_entry_size);

nccl_net_ofi_mutex_unlock(&freelist->lock);
}

#ifdef __cplusplus
Expand Down
3 changes: 0 additions & 3 deletions include/nccl_ofi_idpool.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,6 @@ typedef struct nccl_ofi_idpool {
/* ID pool bit array. A bit set in the array indicates
that the ID corresponding to its index is available.*/
uint64_t *ids;

/* Lock for concurrency */
pthread_mutex_t lock;
} nccl_ofi_idpool_t;

/*
Expand Down
1 change: 0 additions & 1 deletion include/nccl_ofi_mr.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,6 @@ typedef struct nccl_ofi_mr_cache {
size_t used;
uint32_t hit_count;
uint32_t miss_count;
pthread_mutex_t lock;
} nccl_ofi_mr_cache_t;

/**
Expand Down
2 changes: 0 additions & 2 deletions include/nccl_ofi_msgbuff.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,6 @@ typedef struct {
uint16_t msg_last_incomplete;
// Points to the message after the inserted message with highest sequence number.
uint16_t msg_next;
// Mutex for this msg buffer -- locks all non-init operations
pthread_mutex_t lock;
} nccl_ofi_msgbuff_t;

/**
Expand Down
11 changes: 3 additions & 8 deletions include/nccl_ofi_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ typedef uint16_t nccl_ofi_rdma_msg_type_t;
* allocate a RDMA memory registration handle with `num_rails`+`num_control_rails` rails.
*/
typedef struct nccl_net_ofi_rdma_mr_handle {
struct nccl_net_ofi_rdma_device *device;

int num_rails;

Expand Down Expand Up @@ -394,12 +395,6 @@ typedef struct nccl_net_ofi_rdma_req {
/* Size of completed request */
size_t size;

/*
* Protect updating critical fields such as size and ncompls when
* network xfer happened over multiple rails
*/
pthread_mutex_t req_lock;

/* State of request */
nccl_net_ofi_rdma_req_state_t state;

Expand Down Expand Up @@ -533,7 +528,6 @@ typedef struct nccl_net_ofi_rdma_send_comm {

nccl_ofi_deque_elem_t cleanup_list_elem;

pthread_mutex_t ctrl_recv_lock;
bool received_close_message;
/* Counters for total sent and received control messages */
uint64_t n_ctrl_received;
Expand Down Expand Up @@ -613,7 +607,6 @@ typedef struct nccl_net_ofi_rdma_recv_comm {
nccl_ofi_deque_elem_t cleanup_list_elem;

/* Counters for total sent and received control messages */
pthread_mutex_t ctrl_counter_lock;
uint64_t n_ctrl_sent;
uint64_t n_ctrl_delivered;

Expand Down Expand Up @@ -838,6 +831,8 @@ typedef struct nccl_net_ofi_rdma_domain {

/* List of endpoints and set of addresses they have connections to */
nccl_ofi_ep_addr_list_t *ep_addr_list;

pthread_mutex_t rdma_domain_lock;
} nccl_net_ofi_rdma_domain_t;


Expand Down
3 changes: 1 addition & 2 deletions include/nccl_ofi_scheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,7 @@ typedef struct nccl_net_ofi_threshold_scheduler {
nccl_net_ofi_scheduler_t base;
/* Round robin counter */
unsigned int rr_counter;
/* Lock for round robin counter */
pthread_mutex_t rr_lock;

/* Minimum size of the message in bytes before message is
* multiplexed */
size_t min_stripe_size;
Expand Down
1 change: 1 addition & 0 deletions m4/check_pkg_libfabric.m4
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ AC_DEFUN([CHECK_PKG_LIBFABRIC], [
FI_OPT_MAX_MSG_SIZE,
FI_OPT_SHARED_MEMORY_PERMITTED,
FI_MR_DMABUF,
FI_PROGRESS_CONTROL_UNIFIED,
FI_OPT_INJECT_RMA_SIZE],
[], [], [AC_INCLUDES_DEFAULT
[#include <rdma/fi_endpoint.h>
Expand Down
15 changes: 0 additions & 15 deletions src/nccl_ofi_deque.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,6 @@ int nccl_ofi_deque_init(nccl_ofi_deque_t **deque_p)
deque->head.prev = &deque->head;
deque->head.next = &deque->head;

int ret = pthread_mutex_init(&deque->lock, NULL);
if (ret != 0) {
NCCL_OFI_WARN("Failed to initialize deque mutex.");
free(deque);
return -ret;
}

assert(deque_p);
*deque_p = deque;

Expand All @@ -40,14 +33,6 @@ int nccl_ofi_deque_finalize(nccl_ofi_deque_t *deque)
{
assert(deque);

/* Since user allocates all memory used for deque elements, we don't need to
deallocate any entries here. :D */
int ret = pthread_mutex_destroy(&deque->lock);
if (ret != 0) {
NCCL_OFI_WARN("Failed to destroy deque mutex.");
return -ret;
}

free(deque);
return 0;
}
18 changes: 1 addition & 17 deletions src/nccl_ofi_ep_addr_list.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,11 @@ typedef struct ep_pair_list_elem {
} ep_pair_list_elem_t;

/**
* Outer structure storing the ep list and a mutex to protect access
* Outer structure storing the ep list
*/
struct nccl_ofi_ep_addr_list {
ep_pair_list_elem_t *ep_pair_list;
size_t max_addr_size;
pthread_mutex_t mutex;
};

nccl_ofi_ep_addr_list_t *nccl_ofi_ep_addr_list_init(size_t max_addr_size)
Expand All @@ -50,11 +49,6 @@ nccl_ofi_ep_addr_list_t *nccl_ofi_ep_addr_list_init(size_t max_addr_size)
ret_list->ep_pair_list = NULL;
ret_list->max_addr_size = max_addr_size;

if (nccl_net_ofi_mutex_init(&ret_list->mutex, NULL) != 0) {
NCCL_OFI_WARN("Failed to init mutex");
goto error;
}

goto exit;

error:
Expand Down Expand Up @@ -86,8 +80,6 @@ int nccl_ofi_ep_addr_list_get(nccl_ofi_ep_addr_list_t *ep_list, void *addr_in,
return ret;
}

nccl_net_ofi_mutex_lock(&ep_list->mutex);

char addr_padded[ep_list->max_addr_size];
memcpy(&addr_padded, addr_in, addr_size);
zero_pad_address(addr_padded, addr_size, ep_list->max_addr_size);
Expand Down Expand Up @@ -122,8 +114,6 @@ int nccl_ofi_ep_addr_list_get(nccl_ofi_ep_addr_list_t *ep_list, void *addr_in,
}

exit:
nccl_net_ofi_mutex_unlock(&ep_list->mutex);

*ep = ret_ep;
return ret;
}
Expand All @@ -142,8 +132,6 @@ int nccl_ofi_ep_addr_list_insert(nccl_ofi_ep_addr_list_t *ep_list,
return ret;
}

nccl_net_ofi_mutex_lock(&ep_list->mutex);

hashed_addr_t *new_addr = (hashed_addr_t *)malloc(sizeof(hashed_addr_t)
+ ep_list->max_addr_size);
if (!new_addr) {
Expand All @@ -170,7 +158,6 @@ int nccl_ofi_ep_addr_list_insert(nccl_ofi_ep_addr_list_t *ep_list,
DL_APPEND(ep_list->ep_pair_list, new_pair);

unlock:
nccl_net_ofi_mutex_unlock(&ep_list->mutex);
return ret;
}

Expand All @@ -189,7 +176,6 @@ static void delete_ep_list_entry(nccl_ofi_ep_addr_list_t *ep_list, ep_pair_list_
int nccl_ofi_ep_addr_list_delete(nccl_ofi_ep_addr_list_t *ep_list, nccl_net_ofi_ep_t *ep)
{
int ret = 0;
nccl_net_ofi_mutex_lock(&ep_list->mutex);

ep_pair_list_elem_t *ep_pair, *ep_pair_tmp;
DL_FOREACH_SAFE(ep_list->ep_pair_list, ep_pair, ep_pair_tmp) {
Expand All @@ -203,7 +189,6 @@ int nccl_ofi_ep_addr_list_delete(nccl_ofi_ep_addr_list_t *ep_list, nccl_net_ofi_
ret = -ENOENT;

exit:
nccl_net_ofi_mutex_unlock(&ep_list->mutex);
return ret;
}

Expand All @@ -219,6 +204,5 @@ void nccl_ofi_ep_addr_list_fini(nccl_ofi_ep_addr_list_t *ep_list)
/* After this, the ep list should be NULL */
assert(ep_list->ep_pair_list == NULL);

nccl_net_ofi_mutex_destroy(&ep_list->mutex);
free(ep_list);
}
Loading
Loading