From fa8173bc8886260b5723a34d5bff7bebfe9ec8e9 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 5 Apr 2024 03:54:15 +0000 Subject: [PATCH 1/2] Add libpthread to library list Add libpthread to the library list, so it will be directly linked in both the net plugin and executables. The plugin uses pthreads, so should have been including this by default. We got lucky because the pthread calls were in something we were building as a .so and not in the unit tests. But breaking that in a future commit breaks the getting lucky part of the equation. Signed-off-by: Brian Barrett --- configure.ac | 1 + 1 file changed, 1 insertion(+) diff --git a/configure.ac b/configure.ac index 91a90be3c..e1de0037c 100644 --- a/configure.ac +++ b/configure.ac @@ -61,6 +61,7 @@ AC_FUNC_MALLOC AC_CHECK_FUNC([memset], [], [AC_MSG_ERROR([NCCL OFI Plugin requires memset function.])]) AC_CHECK_FUNC([realpath], [], [AC_MSG_ERROR([NCCL OFI Plugin requires realpath function.])]) AC_SEARCH_LIBS([dlopen], [dl], [], [AC_MSG_ERROR([NCCL OFI Plugin requires dlopen])]) +AC_SEARCH_LIBS([pthread_getspecific], [pthread], [], [AC_MSG_ERROR([NCCL OFI Plugin requires pthreads.])]) # Check for GCC builtin functions CHECK_GCC_BUILTIN([__builtin_expect]) From a4f1819617d3e2d558610b379eb60a0cbebc1778 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 5 Apr 2024 03:54:30 +0000 Subject: [PATCH 2/2] Build plugins as DSOs instead of shared libraries Build the plugins (tuner and net) as DSOs instead of shared libraries by adding the `-module -avoid-version` flags to the library LDFLAGS. Of course, now that the net plugin is not a shared library, we can't link against it in the tests. Rather than build a bunch of infrastructure to dlopen() the net plugin (which would mean all kinds of fun path work to make "make check" work), build the net plugin by creating a helper library that contains all the source, and building the net plugin from that. Now the helper library can be used to static link the plugin into the tests (both functional and unit). Signed-off-by: Brian Barrett --- src/Makefile.am | 33 +++++++++++++++++++++++++++++---- tests/functional/Makefile.am | 4 ++-- tests/unit/Makefile.am | 2 +- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/src/Makefile.am b/src/Makefile.am index 7b42a8738..852bea85e 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -6,6 +6,9 @@ AM_CPPFLAGS = -I$(top_srcdir)/include -DXML_DIR=\"${pkgdatadir}/xml\" +# +# net plugin +# sources = \ nccl_ofi_api.c \ nccl_ofi_net.c \ @@ -25,15 +28,36 @@ sources += platform-aws.c endif if ENABLE_NEURON - lib_LTLIBRARIES = libnccom-net.la sources += nccl_ofi_interface_neuron.c - libnccom_net_la_SOURCES = $(sources) else - lib_LTLIBRARIES = libnccl-net.la sources += nccl_ofi_cuda.c \ nccl_ofi_interface_nvidia.c - libnccl_net_la_SOURCES = $(sources) +endif + +# Build an internal-only library that can be used by unit tests as +# well as the actual nccl_net.so / nccom_net.so libraries. This saves +# us writing dlopen() handlers for simple unit tests. +noinst_LTLIBRARIES = libinternal_net_plugin.la +libinternal_net_plugin_la_SOURCES = $(sources) +libinternal_net_plugin_la_LDFLAGS = -avoid-version +if ENABLE_NEURON + lib_LTLIBRARIES = libnccom-net.la + libnccom_net_la_SOURCES = + libnccom_net_la_LIBADD = libinternal_net_plugin.la + libnccom_net_la_LDFLAGS = -module -avoid-version +else + lib_LTLIBRARIES = libnccl-net.la + libnccl_net_la_SOURCES = + libnccl_net_la_LIBADD = libinternal_net_plugin.la + libnccl_net_la_LDFLAGS = -module -avoid-version +endif + + +# +# Tuner +# +if HAVE_CUDA if WANT_PLATFORM_AWS # NCCL tuner plugin lib_LTLIBRARIES += libnccl-ofi-tuner.la @@ -41,5 +65,6 @@ if WANT_PLATFORM_AWS tuner/nccl_ofi_model.c \ tuner/nccl_ofi_tuner.c libnccl_ofi_tuner_la_SOURCES = $(tuner_sources) + libnccl_ofi_tuner_la_LDFLAGS = -module -avoid-version endif endif diff --git a/tests/functional/Makefile.am b/tests/functional/Makefile.am index 14e618763..fc17ace7f 100644 --- a/tests/functional/Makefile.am +++ b/tests/functional/Makefile.am @@ -7,7 +7,7 @@ AM_CPPFLAGS = -I$(top_srcdir)/include $(MPI_CPPFLAGS) AM_LDFLAGS = $(MPI_LDFLAGS) $(CUDA_LDFLAGS) -LDADD = $(top_builddir)/src/libnccl-net.la $(MPI_LIBS) $(CUDA_LIBS) +LDADD = $(top_builddir)/src/libinternal_net_plugin.la $(MPI_LIBS) $(CUDA_LIBS) CC = $(MPICC) noinst_HEADERS = test-common.h @@ -25,4 +25,4 @@ cuda_check_SOURCES = cuda_check.c # Override the LDADD for this check to avoid the -lcudart used by the # other tests, since the purpose of this test is to make sure we # didn't leak direct cuda dependencies into the plugin. -cuda_check_LDADD = $(top_builddir)/src/libnccl-net.la +cuda_check_LDADD = $(top_builddir)/src/libinternal_net_plugin.la diff --git a/tests/unit/Makefile.am b/tests/unit/Makefile.am index 30e9fece9..5b0b152de 100644 --- a/tests/unit/Makefile.am +++ b/tests/unit/Makefile.am @@ -5,7 +5,7 @@ # AM_CPPFLAGS = -I$(top_srcdir)/include -LDADD = $(top_builddir)/src/lib$(OFI_NCCL_ARCHIVE).la +LDADD = $(top_builddir)/src/libinternal_net_plugin.la noinst_HEADERS = test-common.h