Skip to content

Commit

Permalink
Merge pull request kokkos#7333 from ndellingwood/master-release-4.4.01
Browse files Browse the repository at this point in the history
Master release 4.4.01
  • Loading branch information
crtrott authored Sep 13, 2024
2 parents 08ceff9 + 5cb2fa3 commit 15dc143
Show file tree
Hide file tree
Showing 26 changed files with 481 additions and 101 deletions.
1 change: 1 addition & 0 deletions .github/workflows/continuous-integration-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ jobs:
cmake -B builddir \
-DCMAKE_INSTALL_PREFIX=/usr \
${{ matrix.clang-tidy }} \
-DBUILD_SHARED_LIBS=ON \
-Ddesul_ROOT=/usr/desul-install/ \
-DKokkos_ENABLE_DESUL_ATOMICS_EXTERNAL=ON \
-DKokkos_ENABLE_HWLOC=ON \
Expand Down
36 changes: 12 additions & 24 deletions .github/workflows/releases.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,33 +13,26 @@ jobs:
hashes: ${{ steps.hash.outputs.hashes }}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: Build artifacts
run: |
git archive -o kokkos-${{ github.ref_name }}.zip HEAD
git archive -o kokkos-${{ github.ref_name }}.tar.gz HEAD
git archive --prefix=kokkos-${{ github.ref_name }}/ -o kokkos-${{ github.ref_name }}.zip HEAD
git archive --prefix=kokkos-${{ github.ref_name }}/ -o kokkos-${{ github.ref_name }}.tar.gz HEAD
- name: Generate hashes
shell: bash
id: hash
run: |
# sha256sum generates sha256 hash for all artifacts.
# base64 -w0 encodes to base64 and outputs on a single line.
echo "hashes=$(sha256sum kokkos-${{ github.ref_name }}.zip kokkos-${{ github.ref_name }}.tar.gz | base64 -w0)" >> "$GITHUB_OUTPUT"
sha256sum kokkos-${{ github.ref_name }}.zip kokkos-${{ github.ref_name }}.tar.gz > kokkos-${{ github.ref_name }}-SHA-256.txt
echo "hashes=$(base64 -w0 kokkos-${{ github.ref_name }}-SHA-256.txt)" >> "$GITHUB_OUTPUT"
- name: Upload source code (zip)
uses: actions/upload-artifact@89ef406dd8d7e03cfd12d9e0a4a378f454709029 # v4.3.5
- name: Upload artifacts
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: kokkos-${{ github.ref_name }}.zip
path: kokkos-${{ github.ref_name }}.zip
if-no-files-found: error
retention-days: 5

- name: Upload source code (tar.gz)
uses: actions/upload-artifact@89ef406dd8d7e03cfd12d9e0a4a378f454709029 # v4.3.5
with:
name: kokkos-${{ github.ref_name }}.tar.gz
path: kokkos-${{ github.ref_name }}.tar.gz
name: release-artifacts
path: kokkos-${{ github.ref_name }}*
if-no-files-found: error
retention-days: 5

Expand All @@ -65,19 +58,14 @@ jobs:
runs-on: ubuntu-latest
if: startsWith(github.ref, 'refs/tags/')
steps:
- name: Download kokkos-${{ github.ref_name }}.zip
- name: Download artifacts
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
name: kokkos-${{ github.ref_name }}.zip

- name: Download kokkos-${{ github.ref_name }}.tar.gz
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
name: kokkos-${{ github.ref_name }}.tar.gz

name: release-artifacts
- name: Upload assets
uses: softprops/action-gh-release@c062e08bd532815e2082a85e87e3ef29c3e6d191 # v2.0.8
with:
files: |
kokkos-${{ github.ref_name }}.zip
kokkos-${{ github.ref_name }}.tar.gz
kokkos-${{ github.ref_name }}-SHA-256.txt
1 change: 1 addition & 0 deletions .jenkins
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,7 @@ pipeline {
-DKokkos_ENABLE_CUDA=ON \
-DKokkos_ENABLE_CUDA_LAMBDA=ON \
-DKokkos_ENABLE_LIBDL=OFF \
-DKokkos_ENABLE_OPENMP=ON \
-DKokkos_ENABLE_IMPL_MDSPAN=OFF \
-DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=OFF \
.. && \
Expand Down
15 changes: 15 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,20 @@
# CHANGELOG

## [4.4.01](https://github.com/kokkos/kokkos/tree/4.4.01)
[Full Changelog](https://github.com/kokkos/kokkos/compare/4.0.00...4.4.01)

### Features:
* Introduce new SequentialHostInit view allocation property [\#7229](https://github.com/kokkos/kokkos/pull/7229)

### Backend and Architecture Enhancements:

#### CUDA:
* Experimental support for unified memory mode (intended for Grace-Hopper etc.) [\#6823](https://github.com/kokkos/kokkos/pull/6823)

### Bug Fixes
* OpenMP: Fix issue related to the visibility of an internal symbol with shared libraries that affected `ScatterView` in particular [\#7284](https://github.com/kokkos/kokkos/pull/7284)
* Fix implicit copy assignment operators in few AVX2 masks being deleted [#7296](https://github.com/kokkos/kokkos/pull/7296)

## [4.4.00](https://github.com/kokkos/kokkos/tree/4.4.00)
[Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.01...4.4.00)

Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ ENDIF()

set(Kokkos_VERSION_MAJOR 4)
set(Kokkos_VERSION_MINOR 4)
set(Kokkos_VERSION_PATCH 0)
set(Kokkos_VERSION_PATCH 1)
set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
message(STATUS "Kokkos version: ${Kokkos_VERSION}")
math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
Expand Down
2 changes: 1 addition & 1 deletion Makefile.kokkos
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

KOKKOS_VERSION_MAJOR = 4
KOKKOS_VERSION_MINOR = 4
KOKKOS_VERSION_PATCH = 0
KOKKOS_VERSION_PATCH = 1
KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)

# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
Expand Down
1 change: 1 addition & 0 deletions cmake/KokkosCore_config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA // deprecated
#cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR
#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC
#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
#cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
#cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS
#cmakedefine KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY
Expand Down
4 changes: 3 additions & 1 deletion cmake/kokkos_enable_options.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda
# resolved but we keep the option around a bit longer to be safe.
KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)")
KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler")
KOKKOS_ENABLE_OPTION(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA")

KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" )
KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" )
KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP")
Expand Down Expand Up @@ -135,7 +137,7 @@ FUNCTION(check_device_specific_options)
ENDIF()
ENDFUNCTION()

CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC)
CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC IMPL_CUDA_UNIFIED_MEMORY)
CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE)
CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH)

Expand Down
12 changes: 12 additions & 0 deletions containers/unit_tests/TestWithoutInitializing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,17 @@
#endif
///@}

/// Some tests are skipped for unified memory space
#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE \
if constexpr (std::is_same_v<typename TEST_EXECSPACE::memory_space, \
Kokkos::CudaSpace>) \
GTEST_SKIP() << "skipping since unified memory requires additional " \
"fences";
#else
#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE
#endif

TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) {
using namespace Kokkos::Test::Tools;
listen_tool_events(Config::DisableAll(), Config::EnableKernels());
Expand Down Expand Up @@ -657,6 +668,7 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview) {

TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynamicview) {
GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE
GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE

using namespace Kokkos::Test::Tools;
listen_tool_events(Config::DisableAll(), Config::EnableKernels(),
Expand Down
39 changes: 36 additions & 3 deletions core/src/Cuda/Kokkos_CudaSpace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
#include <algorithm>
#include <atomic>

//#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
#include <impl/Kokkos_Error.hpp>

#include <impl/Kokkos_Tools.hpp>
Expand Down Expand Up @@ -178,6 +177,29 @@ void *impl_allocate_common(const int device_id,
cudaError_t error_code = cudaSuccess;
#ifndef CUDART_VERSION
#error CUDART_VERSION undefined!
#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
// This is intended for Grace-Hopper (and future unified memory architectures)
// The idea is to use host allocator and then advise to keep it in HBM on the
// device, but that requires CUDA 12.2
static_assert(CUDART_VERSION >= 12020,
"CUDA runtime version >=12.2 required when "
"Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY is set. "
"Please update your CUDA runtime version or "
"reconfigure with "
"-D Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY=OFF");
if (arg_alloc_size) { // cudaMemAdvise_v2 does not work with nullptr
error_code = cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal);
if (error_code == cudaSuccess) {
// One would think cudaMemLocation{device_id,
// cudaMemLocationTypeDevice} would work but it doesn't. I.e. the order of
// members doesn't seem to be defined.
cudaMemLocation loc;
loc.id = device_id;
loc.type = cudaMemLocationTypeDevice;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemAdvise_v2(
ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, loc));
}
}
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
if (arg_alloc_size >= memory_threshold_g) {
error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream);
Expand All @@ -190,9 +212,13 @@ void *impl_allocate_common(const int device_id,
"Kokkos::Cuda: backend fence after async malloc");
}
}
} else
} else {
error_code = cudaMalloc(&ptr, arg_alloc_size);
}
#else
error_code = cudaMalloc(&ptr, arg_alloc_size);
#endif
{ error_code = cudaMalloc(&ptr, arg_alloc_size); }

if (error_code != cudaSuccess) { // TODO tag as unlikely branch
// This is the only way to clear the last error, which
// we should do here since we're turning it into an
Expand Down Expand Up @@ -326,6 +352,9 @@ void CudaSpace::impl_deallocate(
}
#ifndef CUDART_VERSION
#error CUDART_VERSION undefined!
#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
if (arg_alloc_size >= memory_threshold_g) {
Impl::cuda_device_synchronize(
Expand Down Expand Up @@ -436,8 +465,12 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,

#include <impl/Kokkos_SharedAlloc_timpl.hpp>

#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
Kokkos::CudaSpace);
#else
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::CudaSpace);
#endif
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
Kokkos::CudaUVMSpace);
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
Expand Down
23 changes: 22 additions & 1 deletion core/src/Cuda/Kokkos_CudaSpace.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,19 @@ class CudaSpace {
void* allocate(const char* arg_label, const size_t arg_alloc_size,
const size_t arg_logical_size = 0) const;

#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
template <typename ExecutionSpace>
void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const {
return allocate(arg_alloc_size);
}
template <typename ExecutionSpace>
void* allocate(const ExecutionSpace&, const char* arg_label,
const size_t arg_alloc_size,
const size_t arg_logical_size = 0) const {
return allocate(arg_label, arg_alloc_size, arg_logical_size);
}
#endif

/**\brief Deallocate untracked memory in the cuda space */
void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const;
void deallocate(const char* arg_label, void* const arg_alloc_ptr,
Expand Down Expand Up @@ -337,7 +350,11 @@ static_assert(
template <>
struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaSpace> {
enum : bool { assignable = false };
enum : bool { accessible = false };
#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
enum : bool{accessible = false};
#else
enum : bool { accessible = true };
#endif
enum : bool { deepcopy = true };
};

Expand Down Expand Up @@ -558,8 +575,12 @@ struct DeepCopy<HostSpace, MemSpace, ExecutionSpace,
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION(
Kokkos::CudaSpace);
#else
KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaSpace);
#endif
KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaUVMSpace);
KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaHostPinnedSpace);

Expand Down
20 changes: 20 additions & 0 deletions core/src/Cuda/Kokkos_Cuda_Instance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,22 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default

//----------------------------------

#ifdef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
// Check if unified memory is available
int cuda_result;
cudaDeviceGetAttribute(&cuda_result, cudaDevAttrConcurrentManagedAccess,
cuda_device_id);
if (cuda_result == 0) {
Kokkos::abort(
"Kokkos::Cuda::initialize ERROR: Unified memory is not available on "
"this device\n"
"Please recompile Kokkos with "
"-DKokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY=OFF\n");
}
#endif

//----------------------------------

cudaStream_t singleton_stream;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream));
Expand Down Expand Up @@ -705,6 +721,10 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const {
#else
os << "no\n";
#endif
#ifdef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
os << " KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY: ";
os << "yes\n";
#endif

os << "\nCuda Runtime Configuration:\n";

Expand Down
2 changes: 2 additions & 0 deletions core/src/Kokkos_View.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,8 @@ inline constexpr Kokkos::ALL_t ALL{};
#pragma omp end declare target
#endif

inline constexpr Kokkos::Impl::SequentialHostInit_t SequentialHostInit{};

inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{};

inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{};
Expand Down
2 changes: 1 addition & 1 deletion core/src/OpenMP/Kokkos_OpenMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ int OpenMP::impl_thread_pool_size() const noexcept {
}

int OpenMP::impl_max_hardware_threads() noexcept {
return Impl::g_openmp_hardware_max_threads;
return Impl::OpenMPInternal::max_hardware_threads();
}

namespace Impl {
Expand Down
Loading

0 comments on commit 15dc143

Please sign in to comment.