diff --git a/.github/workflows/srpm.yaml b/.github/workflows/srpm.yaml
new file mode 100644
index 000000000..60a4ea9f0
--- /dev/null
+++ b/.github/workflows/srpm.yaml
@@ -0,0 +1,119 @@
+# Builds the source RPM with packit, then rebuilds it in mock (one job per
+# dist/<cfg>.cfg root) and attempts a Debian build with debbuild.
+name: generate SRPM
+on:
+  push:
+  pull_request:
+    types: [opened, reopened]
+jobs:
+  # Creates the .src.rpm once and publishes it for the jobs below.
+  srpm:
+    runs-on: ubuntu-latest
+    container:
+      image: fedora:rawhide
+      options: --privileged
+    steps:
+      # git must be on PATH before checkout so a real clone is created;
+      # the packit create-archive action runs `git archive`.
+      - run: dnf install -y packit git nodejs
+      - uses: actions/checkout@v4
+      - run: packit srpm
+      - uses: actions/upload-artifact@v4
+        with:
+          name: "libnccl-net-ofi.src.rpm"
+          path: "*.src.rpm"
+          if-no-files-found: 'error'
+          compression-level: '0'
+  rpm-packit:
+    # The SRPM artifact only exists after the srpm job has finished.
+    needs: srpm
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        cfg:
+          - al2-x86_64-cuda-ofi
+          - al2-x86_64-cuda-ofi-aws
+          - al2-x86_64-neuron-ofi
+          - al2-x86_64-neuron-ofi-aws
+          - al2023-aarch64-neuron-ofi
+          - al2023-aarch64-neuron-ofi-aws
+          - al2023-x86_64-cuda-ofi
+          - al2023-x86_64-cuda-ofi-aws
+          - al2023-x86_64-neuron-ofi
+          - al2023-x86_64-neuron-ofi-aws
+          - f39-aarch64-cuda-ofi
+          - f39-aarch64-cuda-ofi-aws
+          - f39-aarch64-neuron-ofi
+          - f39-aarch64-neuron-ofi-aws
+          - f39-x86_64-cuda-ofi
+          - f39-x86_64-cuda-ofi-aws
+          - f39-x86_64-neuron-ofi
+          - f39-x86_64-neuron-ofi-aws
+          - rawhide-aarch64-cuda-ofi
+          - rawhide-aarch64-cuda-ofi-aws
+          - rawhide-aarch64-neuron-ofi
+          - rawhide-aarch64-neuron-ofi-aws
+          - rawhide-x86_64-cuda-ofi
+          - rawhide-x86_64-cuda-ofi-aws
+          - rawhide-x86_64-neuron-ofi
+          - rawhide-x86_64-neuron-ofi-aws
+    container:
+      image: fedora:rawhide
+      options: --privileged
+    steps:
+      # Installed before checkout: sparse checkout needs git, and packit is
+      # the tool that drives mock in the build step.
+      - run: dnf install -y packit mock git
+      - uses: actions/checkout@v4
+        with:
+          sparse-checkout: 'dist'
+          sparse-checkout-cone-mode: false
+      - uses: actions/download-artifact@v4
+        with:
+          name: "libnccl-net-ofi.src.rpm"
+          path: .
+      - run: packit build in-mock --root=dist/${{ matrix.cfg }}.cfg
+      - uses: actions/upload-artifact@v4
+        with:
+          name: "RPM - ${{ matrix.cfg }}"
+          path: "*.rpm"
+          if-no-files-found: 'error'
+          compression-level: '0'
+  deb:
+    # NOTE(review): several steps below (the .rpm passed to apt, dnf, unzip,
+    # debbuild never being installed) look unfinished -- confirm intent.
+    needs: srpm
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        image:
+          - ubuntu:focal
+          - ubuntu:jammy
+          - ubuntu:noble
+          - ubuntu:oracular
+          - debian:bullseye
+          - debian:bookworm
+          - debian:stable
+          - debian:testing
+          - debian:sid
+    container:
+      image: ${{ matrix.image }}
+      options: --privileged
+    steps:
+      # Base images ship without package lists, so refresh them first.
+      - run: apt update
+      - run: apt install -y build-essential libhwloc-dev autoconf automake curl ca-certificates
+      - run: curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz
+      - run: tar xvf ./aws-efa-installer-latest.tar.gz
+      - run: apt install -y aws-efa-installer/DEBS/DEBIAN/x86_64/libfabric-aws-1.22.0amzn1.0-1.x86_64.rpm
+      - run: dnf -y install cuda-cudart-devel-12-6
+      - uses: actions/download-artifact@v4
+        with:
+          name: "libnccl-net-ofi.src.rpm"
+          path: .
+      - run: ls -lart
+      - run: unzip libnccl-net-ofi.src.rpm
+      # Container jobs default to sh, which has no brace expansion.
+      - run: mkdir -p ~/debbuild/{OTHER,SOURCES,DEBS,SDEBS,BUILD,SPECS}
+        shell: bash
+      - run: debbuild -ra *.src.rpm --define 'with_cuda 1' --define 'with_platform_aws 1'
diff --git a/.gitignore b/.gitignore
index 7f8ecadfc..eced3ce78 100644
--- a/.gitignore
+++ b/.gitignore
@@ -75,3 +75,5 @@ m4/lt~obsolete.m4
 .idea/
 .devenv/
 .direnv
+*.src.rpm
+dist/*.t*gz
diff --git a/.packit.yml b/.packit.yml
new file mode 100644
index 000000000..8b94dddba
--- /dev/null
+++ b/.packit.yml
@@ -0,0 +1,21 @@
+---
+# vi:ts=2 sw=2 et:
+#
+# Docs: https://packit.dev/docs/
+
+
+srpm_build_deps:
+  - git
+
+actions:
+  get-current-version:
+    - bash -c "echo $(cat GIT_VERSION)"
+  create-archive:
+    - bash -c "git archive --prefix=libnccl-net-ofi/ --format=tgz --output=./dist/libnccl-net-ofi_${PACKIT_PROJECT_VERSION}.tar.gz HEAD src/ include/ doc/ m4/ tests/ topology/ configure.ac Makefile.am LICENSE NOTICE"
+    - bash -c "echo dist/libnccl-net-ofi_${PACKIT_PROJECT_VERSION}.tar.gz"
+
+specfile_path: dist/libnccl-net-ofi.spec
+upstream_package_name: libnccl-net-ofi
+downstream_package_name: libnccl-net-ofi
+release_suffix: "{PACKIT_PROJECT_BRANCH}"
+update_release: false
diff --git a/GIT_VERSION b/GIT_VERSION
new file mode 100644
index 000000000..6a0b77403
--- /dev/null
+++ b/GIT_VERSION
@@ -0,0 +1 @@
+1.11.0pre
diff --git a/dist/libnccl-net-ofi.spec b/dist/libnccl-net-ofi.spec
new file mode 100644
index 000000000..99e03a266
--- /dev/null
+++ b/dist/libnccl-net-ofi.spec
@@ -0,0 +1,102 @@
+# Build switches (override with e.g. --define 'with_neuron 1'):
+#   with_neuron        build against Neuron instead of CUDA. Default: 0
+#   with_cuda          build against CUDA. Default: 1 unless with_neuron is 1
+#   with_platform_aws  enable the AWS platform code. Default: 0
+# Every switch is normalised to 0/1 here so later tests can compare values
+# rather than rely on whether the macro happens to be defined.
+%{!?with_neuron: %global with_neuron 0}
+%if "%{with_neuron}" == "1"
+%{!?with_cuda: %global with_cuda 0}
+%else
+%{!?with_cuda: %global with_cuda 1}
+%endif
+%{!?with_platform_aws: %global with_platform_aws 0}
+
+%if "%{with_cuda}" == "1" && "%{with_neuron}" == "1"
+%{error:Neuron and CUDA must not be enabled together}
+%endif
+
+%if "%{with_cuda}" == "0" && "%{with_neuron}" == "0"
+%{error:One of Neuron or CUDA must be enabled}
+%endif
+
+%if "%{with_neuron}" == "1"
+%{!?target: %global target nccom}
+%global pname_base libnccom-net-ofi
+%else
+%{!?target: %global target nccl}
+%global pname_base libnccl-net-ofi
+%endif
+
+# AWS builds get their own package name and install prefix.
+%if "%{with_platform_aws}" == "1"
+%global pname %{pname_base}-aws
+%global _prefix /opt/amazon/%{pname_base}
+%else
+%global pname %{pname_base}
+%endif
+
+# (CUDA only) what toolkit package to declare a build dependency on. Default: 12-6
+%{!?_cuda_toolkit_version: %global _cuda_toolkit_version 12-6}
+
+Name:          %{pname}
+Version:       1.11.0pre
+Release:       0%{?dist}
+Summary:       NCCL + libfabric compatibility layer
+License:       Apache-2.0
+URL:           https://github.com/aws/aws-ofi-nccl
+Source0:       libnccl-net-ofi_1.11.0pre45a7e10.dirty.tar.gz
+%if "%{_vendor}" == "debbuild"
+Group:         devel
+%else
+Group:         Development/Tools%{?suse_version:/Building}
+BuildRequires: hwloc-devel
+BuildRequires: libfabric-devel
+BuildRequires: autoconf
+BuildRequires: automake
+BuildRequires: libtool
+%if "%{with_cuda}" == "1"
+BuildRequires: cuda-cudart-devel-%{_cuda_toolkit_version}
+%endif
+%endif
+Requires:      hwloc
+%if "%{with_platform_aws}" == "1"
+Requires:      libfabric-aws
+%else
+Requires:      libfabric
+%endif
+%description
+This is a plugin which lets EC2 developers use libfabric as network provider while running NCCL applications.
+
+%prep
+%setup -n libnccl-net-ofi
+%build
+autoreconf -ivf
+%configure \
+    --prefix="%{_prefix}" \
+%if "%{with_cuda}" == "1"
+    --with-cuda=/usr/local/cuda-12 \
+    --enable-neuron=no \
+%else
+    --with-cuda=no \
+    --enable-neuron=yes \
+%endif
+    --disable-tests \
+    --with-mpi=no \
+%if "%{with_platform_aws}" == "1"
+    --enable-platform-aws
+%else
+    --disable-platform-aws
+%endif
+
+%install
+%make_install
+%files
+%{_libdir}/*.so
+%{_datadir}/aws-ofi-nccl/xml/*.xml
+%license LICENSE NOTICE
+%doc
+
+%changelog
+* Thu Aug 08 2024 Nicholas Sielicki
+Initial Package