Skip to content

Commit

Permalink
BUG: several bug fixes for CUDA
Browse files Browse the repository at this point in the history
  • Loading branch information
Adrian-Diaz committed Dec 12, 2024
1 parent d3a7761 commit eba94b6
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 48 deletions.
18 changes: 4 additions & 14 deletions examples/ann_distributed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ void forward_propagate_layer(TpetraDFArray<real_t> &inputs,
const size_t num_j = outputs.submap_size();

//perform comms to get full input vector for row vector products on matrix
//VERY SIMPLE EXAMPLE OF COMMS; THIS IS A TERRIBLE WAY TO DECOMPOSE THE PROBLEM
//VERY SIMPLE EXAMPLE OF COMMS; THIS IS A NON-IDEAL WAY TO DECOMPOSE THE PROBLEM


FOR_ALL(j, 0, num_j,{
Expand All @@ -173,8 +173,6 @@ void forward_propagate_layer(TpetraDFArray<real_t> &inputs,

}); // end parallel for



// For a GPU, use the nested parallelism below here
/*
using team_t = typename Kokkos::TeamPolicy<>::member_type;
Expand Down Expand Up @@ -276,7 +274,7 @@ int main(int argc, char* argv[])
});

ANNLayers(layer).output_partition_map = TpetraPartitionMap<>(all_current_layer_indices);
ANNLayers(layer).output_unique_map = TpetraPartitionMap<>(num_nodes_in_layer[layer+1]);
ANNLayers(layer).output_unique_map = TpetraPartitionMap<>(num_nodes_in_layer[layer+1]);
ANNLayers(layer).distributed_outputs = TpetraDFArray<real_t> (ANNLayers(layer).output_partition_map);
//coming from subview requires both the original map and the submap to be composed of contiguous indices
ANNLayers(layer).distributed_outputs.own_comm_setup(ANNLayers(layer).output_unique_map);
Expand Down Expand Up @@ -308,6 +306,7 @@ int main(int argc, char* argv[])

inputs.update_device(); // copy inputs to device
inputs.perform_comms(); //distribute to full map for row-vector product
//inputs.print();

// for (size_t i=0; i<num_nodes_in_layer[0]; i++) {
// std::cout << "input at " << i << " is " << inputs(i) << "\n";
Expand Down Expand Up @@ -380,16 +379,7 @@ int main(int argc, char* argv[])
// std::cout << "output values grid: \n";
std::flush(std::cout);
MPI_Barrier(MPI_COMM_WORLD);
std::stringstream output_stream;
size_t local_output_size = ANNLayers(num_layers-1).distributed_outputs.submap_size();
for (size_t val=0; val < local_output_size; val++){
int global_index = ANNLayers(num_layers-1).distributed_outputs.getSubMapGlobalIndex(val);
int local_index = ANNLayers(num_layers-1).distributed_outputs.getMapLocalIndex(global_index);
output_stream << " " << ANNLayers(num_layers-1).distributed_outputs.host(local_index);
if(val%10==0) output_stream << std::endl;
} // end for
std::cout << output_stream.str();
std::flush(std::cout);
ANNLayers(num_layers-1).distributed_outputs.print();

//test repartition; assume a 10 by 10 grid of outputs from ANN
//assign coords to each grid point, find a partition of the grid, then repartition output layer using new map
Expand Down
7 changes: 4 additions & 3 deletions examples/test_tpetra_mesh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,14 +300,15 @@ void setup_maps(mesh_data &mesh)
mesh.ghost_comms.execute_comms();

//convert nodes in elem to local node ids to avoid excessive map conversion calls
FOR_ALL(ielem,0,mesh.rnum_elem, {
for(int ielem = 0; ielem < mesh.rnum_elem; ielem++) {
for(int inode=0; inode < 8; inode++){
//recall that nodes in elem is storing global indices in this implementation
//you may just want to store local indices in your case to avoid the map call
nodes_in_elem_distributed(ielem,inode) = all_node_map.getLocalIndex(nodes_in_elem_distributed(ielem,inode));
nodes_in_elem_distributed.host(ielem,inode) = all_node_map.getLocalIndex(nodes_in_elem_distributed.host(ielem,inode));
}
});
}

nodes_in_elem_distributed.update_device();
// std::cout << "number of patches = " << mesh->num_patches() << std::endl;
if (process_rank == 0)
{
Expand Down
1 change: 1 addition & 0 deletions scripts/build-matar.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ show_help() {
echo " --machine=<darwin|chicoma|linux|mac>. Default is 'linux'"
echo " --intel_mkl=<enabled|disabled>. Default is 'disabled'"
echo " --build_cores=<Integers greater than 0>. Default is set 1"
echo " --trilinos=<enabled|disabled>. Default is 'disabled'"
echo " --help: Display this help message"
echo " "
echo " "
Expand Down
11 changes: 8 additions & 3 deletions scripts/trilinos-install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@ then
mkdir -p ${TRILINOS_BUILD_DIR}
fi


# Select the MPI C++ compiler wrapper for GPU builds before Trilinos is configured.
if [ "$kokkos_build_type" = "cuda" ] || [ "$kokkos_build_type" = "cuda_mpi" ]; then
    # nvcc_wrapper lets OpenMPI's mpicxx drive nvcc for CUDA-enabled Kokkos/Trilinos.
    export OMPI_CXX=${TRILINOS_SOURCE_DIR}/packages/kokkos/bin/nvcc_wrapper
    export CUDA_LAUNCH_BLOCKING=1
# BUG FIX: POSIX `[ "$x" = *"hip"* ]` compares literally against the string "*hip*"
# and can never match, so the hip branch was dead code; bash `[[ ... == pattern ]]`
# is required for glob/pattern matching.
elif [[ "$kokkos_build_type" == *"hip"* ]] || [[ "$kokkos_build_type" == *"hip_mpi"* ]]; then
    export OMPI_CXX=hipcc
fi

#check if Trilinos library files were installed, install them otherwise.
[ -d "${TRILINOS_BUILD_DIR}/lib" ] && echo "Directory ${TRILINOS_BUILD_DIR}/lib exists, assuming successful installation; delete build folder and run build script again if there was an environment error that has been corrected."

Expand Down Expand Up @@ -138,13 +146,10 @@ if [ "$kokkos_build_type" = "openmp" ] || [ "$kokkos_build_type" = "openmp_mpi"
${OPENMP_ADDITIONS[@]}
)
elif [ "$kokkos_build_type" = "cuda" ] || [ "$kokkos_build_type" = "cuda_mpi" ]; then
export OMPI_CXX=${TRILINOS_SOURCE_DIR}/packages/kokkos/bin/nvcc_wrapper
export CUDA_LAUNCH_BLOCKING=1
cmake_options+=(
${CUDA_ADDITIONS[@]}
)
elif [ "$kokkos_build_type" = *"hip"* ] || [ "$kokkos_build_type" = *"hip_mpi"* ]; then
export OMPI_CXX=hipcc
cmake_options+=(
${HIP_ADDITIONS[@]}
)
Expand Down
28 changes: 0 additions & 28 deletions src/include/tpetra_wrapper_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,22 +142,16 @@ class TpetraPartitionMap {
KOKKOS_INLINE_FUNCTION
size_t extent() const;

KOKKOS_INLINE_FUNCTION
int getLocalIndex(int global_index) const;

KOKKOS_INLINE_FUNCTION
long long int getGlobalIndex(int local_index) const;

KOKKOS_INLINE_FUNCTION
long long int getMinGlobalIndex() const;

KOKKOS_INLINE_FUNCTION
long long int getMaxGlobalIndex() const;

KOKKOS_INLINE_FUNCTION
bool isProcessGlobalIndex(int global_index) const;

KOKKOS_INLINE_FUNCTION
bool isProcessLocalIndex(int local_index) const;

// Method returns the raw device pointer of the Kokkos DualView
Expand Down Expand Up @@ -280,47 +274,41 @@ void TpetraPartitionMap<ExecSpace,MemoryTraits>::print() const {

// Translate a global index into this process/rank's local index.
template <typename ExecSpace, typename MemoryTraits>
KOKKOS_INLINE_FUNCTION
int TpetraPartitionMap<ExecSpace,MemoryTraits>::getLocalIndex(int global_index) const {
    // Delegate directly to the wrapped Tpetra map; no intermediate needed.
    return tpetra_map->getLocalElement(global_index);
}

// Return the global index corresponding to the input local (on this process/rank) index.
template <typename ExecSpace, typename MemoryTraits>
KOKKOS_INLINE_FUNCTION
long long int TpetraPartitionMap<ExecSpace,MemoryTraits>::getGlobalIndex(int local_index) const {
    // BUG FIX: the temporary was declared `int`, silently truncating global
    // ordinals larger than INT_MAX even though this function returns long long.
    long long int global_index = tpetra_map->getGlobalElement(local_index);
    return global_index;
}

// Return the smallest global index owned by this process/rank.
template <typename ExecSpace, typename MemoryTraits>
KOKKOS_INLINE_FUNCTION
long long int TpetraPartitionMap<ExecSpace,MemoryTraits>::getMinGlobalIndex() const {
    // BUG FIX: store in a 64-bit type; an `int` temporary truncates global
    // ordinals beyond INT_MAX although the return type is long long.
    long long int global_index = tpetra_map->getMinGlobalIndex();
    return global_index;
}

// Return the largest global index owned by this process/rank.
template <typename ExecSpace, typename MemoryTraits>
KOKKOS_INLINE_FUNCTION
long long int TpetraPartitionMap<ExecSpace,MemoryTraits>::getMaxGlobalIndex() const {
    // BUG FIX: store in a 64-bit type; an `int` temporary truncates global
    // ordinals beyond INT_MAX although the return type is long long.
    long long int global_index = tpetra_map->getMaxGlobalIndex();
    return global_index;
}

// Return true if the given global index is owned by (resident on) this process/rank.
template <typename ExecSpace, typename MemoryTraits>
KOKKOS_INLINE_FUNCTION
bool TpetraPartitionMap<ExecSpace,MemoryTraits>::isProcessGlobalIndex(int global_index) const {
bool belongs = tpetra_map->isNodeGlobalElement(global_index);
return belongs;
}

// Return true if the given local index is valid on this process/rank.
template <typename ExecSpace, typename MemoryTraits>
KOKKOS_INLINE_FUNCTION
bool TpetraPartitionMap<ExecSpace,MemoryTraits>::isProcessLocalIndex(int local_index) const {
    // BUG FIX: a local index must be checked with isNodeLocalElement;
    // isNodeGlobalElement would misinterpret the argument as a global index.
    bool belongs = tpetra_map->isNodeLocalElement(local_index);
    return belongs;
Expand Down Expand Up @@ -565,16 +553,12 @@ class TpetraDCArray {

size_t submap_size() const;

KOKKOS_INLINE_FUNCTION
long long int getSubMapGlobalIndex(int local_index) const;

KOKKOS_INLINE_FUNCTION
long long int getMapGlobalIndex(int local_index) const;

KOKKOS_INLINE_FUNCTION
int getSubMapLocalIndex(long long int local_index) const;

KOKKOS_INLINE_FUNCTION
int getMapLocalIndex(long long int local_index) const;

// Host Method
Expand Down Expand Up @@ -1166,31 +1150,27 @@ T& TpetraDCArray<T,Layout,ExecSpace,MemoryTraits>::operator()(size_t i, size_t j

// Translate a local index in the communication sub-map (the map this vector
// comms from) into its global index.
template <typename T, typename Layout, typename ExecSpace, typename MemoryTraits>
KOKKOS_INLINE_FUNCTION
long long int TpetraDCArray<T,Layout,ExecSpace,MemoryTraits>::getSubMapGlobalIndex(int local_index) const {
    return tpetra_comm_pmap->getGlobalElement(local_index);
}

// Translate a local index on this process/rank into its global index
// using the primary partition map.
template <typename T, typename Layout, typename ExecSpace, typename MemoryTraits>
KOKKOS_INLINE_FUNCTION
long long int TpetraDCArray<T,Layout,ExecSpace,MemoryTraits>::getMapGlobalIndex(int local_index) const {
    return tpetra_pmap->getGlobalElement(local_index);
}

// Return the local index within the communication sub-map (the map this vector
// comms from) corresponding to the input global index.
template <typename T, typename Layout, typename ExecSpace, typename MemoryTraits>
KOKKOS_INLINE_FUNCTION
int TpetraDCArray<T,Layout,ExecSpace,MemoryTraits>::getSubMapLocalIndex(long long int global_index) const {
int local_index = tpetra_comm_pmap->getLocalElement(global_index);
return local_index;
}

// Return the local index (on this process/rank) corresponding to the input
// global index, via the primary partition map.
template <typename T, typename Layout, typename ExecSpace, typename MemoryTraits>
KOKKOS_INLINE_FUNCTION
int TpetraDCArray<T,Layout,ExecSpace,MemoryTraits>::getMapLocalIndex(long long int global_index) const {
int local_index = tpetra_pmap->getLocalElement(global_index);
return local_index;
Expand Down Expand Up @@ -1780,16 +1760,12 @@ class TpetraDFArray {

size_t submap_size() const;

KOKKOS_INLINE_FUNCTION
long long int getSubMapGlobalIndex(int local_index) const;

KOKKOS_INLINE_FUNCTION
long long int getMapGlobalIndex(int local_index) const;

KOKKOS_INLINE_FUNCTION
int getSubMapLocalIndex(long long int local_index) const;

KOKKOS_INLINE_FUNCTION
int getMapLocalIndex(long long int local_index) const;

// Host Method
Expand Down Expand Up @@ -2386,31 +2362,27 @@ T& TpetraDFArray<T,Layout,ExecSpace,MemoryTraits>::operator()(size_t i, size_t j

// Translate a local index in the communication sub-map (the map this vector
// comms from) into its global index.
template <typename T, typename Layout, typename ExecSpace, typename MemoryTraits>
KOKKOS_INLINE_FUNCTION
long long int TpetraDFArray<T,Layout,ExecSpace,MemoryTraits>::getSubMapGlobalIndex(int local_index) const {
    return tpetra_comm_pmap->getGlobalElement(local_index);
}

// Translate a local index on this process/rank into its global index
// using the primary partition map.
template <typename T, typename Layout, typename ExecSpace, typename MemoryTraits>
KOKKOS_INLINE_FUNCTION
long long int TpetraDFArray<T,Layout,ExecSpace,MemoryTraits>::getMapGlobalIndex(int local_index) const {
    return tpetra_pmap->getGlobalElement(local_index);
}

// Return the local index within the communication sub-map (the map this vector
// comms from) corresponding to the input global index.
template <typename T, typename Layout, typename ExecSpace, typename MemoryTraits>
KOKKOS_INLINE_FUNCTION
int TpetraDFArray<T,Layout,ExecSpace,MemoryTraits>::getSubMapLocalIndex(long long int global_index) const {
int local_index = tpetra_comm_pmap->getLocalElement(global_index);
return local_index;
}

// Return the local index (on this process/rank) corresponding to the input
// global index, via the primary partition map.
template <typename T, typename Layout, typename ExecSpace, typename MemoryTraits>
KOKKOS_INLINE_FUNCTION
int TpetraDFArray<T,Layout,ExecSpace,MemoryTraits>::getMapLocalIndex(long long int global_index) const {
int local_index = tpetra_pmap->getLocalElement(global_index);
return local_index;
Expand Down

0 comments on commit eba94b6

Please sign in to comment.