diff --git a/Docs/sphinx_documentation/source/FFT.rst b/Docs/sphinx_documentation/source/FFT.rst index ac24e1c818..863539b6e3 100644 --- a/Docs/sphinx_documentation/source/FFT.rst +++ b/Docs/sphinx_documentation/source/FFT.rst @@ -197,3 +197,120 @@ support non-uniform cell size in the z-direction. For most applications, Similar to :cpp:`FFT::R2C`, the Poisson solvers should be cached for reuse, and one might need to use :cpp:`std::unique_ptr>` because there is no default constructor. + +.. _sec:FFT:rawptr: + +Raw Pointer Interface +===================== + +If you only want to use AMReX as an FFT library without using other +functionalities and data containers, you could use the raw pointer +interface. Below is an example. + +.. highlight:: c++ + +:: + + MPI_Init(&argc, &argv); + + // We don't need to call the full-blown amrex::Initialize + amrex::Init_FFT(MPI_COMM_WORLD); + + int nprocs, myproc; + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &myproc); + + using RT = double; + using CT = std::complex; // or cufftDoubleComplex, etc. + + std::array domain_size{128,128,128}; + + // FFT between real and complex. + // Domain decomposition is flexible. The only constraint for the raw + // pointer interface is that there can be only zero or one local box + // per process, whereas the MultiFab interface can take any number of + // boxes. In this case, we choose to do manual domain decomposition for + // the real (i.e., forward) domain, and use the domain decomposition + // provided by amrex for the complex (i.e., backward) domain. + { + amrex::FFT::R2C r2c(domain_size); + + int nx = (domain_size[0] + nprocs - 1) / nprocs; + int xlo = nx * myproc; + nx = std::max(std::min(nx,domain_size[0]-xlo), 0); + std::array local_start{xlo,0,0}; + std::array local_size{nx,domain_size[1],domain_size[2]}; + + // Let amrex know the domain decomposition in the forward domain. + r2c.setLocalDomain(local_start,local_size); + + // Use amrex's domain decomposition in the backward domain. + auto const& [local_start_sp, local_size_sp] = r2c.getLocalSpectralDomain(); + + auto nr = std::size_t(local_size[0]) + * std::size_t(local_size[1]) + * std::size_t(local_size[2]); + auto* pr = (RT*)std::malloc(sizeof(RT)*nr); // or use cudaMalloc + // Initialize data ... + + auto nc = std::size_t(local_size_sp[0]) + * std::size_t(local_size_sp[1]) + * std::size_t(local_size_sp[2]); + auto* pc = (CT*)std::malloc(sizeof(CT)*nc); // or use cudaMalloc + + r2c.forward(pr, pc); // forward transform from real to complex + + // work on the complex data pointed by pc ... + + r2c.backward(pc, pr); // backward transform from complex to real + + std::free(pr); + std::free(pc); + } + + // Batched FFT between complex and complex. + // In this case, we choose to use the domain decomposition provided + // by amrex for the forward domain, and do manual domain decomposition + // for the backward domain. + int nbatch = 3; // batch size + { + amrex::FFT::Info info{}; + info.setBatchSize(nbatch); + amrex::FFT::C2C c2c(domain_size,info); + + // Use amrex's domain decomposition in the forward domain. + auto const& [local_start, local_size] = c2c.getLocalDomain(); + + int nx = (domain_size[0] + nprocs - 1) / nprocs; + int xlo = nx * myproc; + nx = std::max(std::min(nx,domain_size[0]-xlo), 0); + std::array local_start_sp{xlo,0,0}; + std::array local_size_sp{nx,domain_size[1],domain_size[2]}; + + // Let amrex know the domain decomposition in the backward domain. + c2c.setLocalSpectralDomain(local_start_sp, local_size_sp); + + auto nf = std::size_t(local_size[0]) + * std::size_t(local_size[1]) + * std::size_t(local_size[2]); + auto* pf = (CT*)std::malloc(sizeof(CT)*nf*nbatch); // or use cudaMalloc + // Initialize data ... + + auto nb = std::size_t(local_size_sp[0]) + * std::size_t(local_size_sp[1]) + * std::size_t(local_size_sp[2]); + auto* pb = (CT*)std::malloc(sizeof(CT)*nb*nbatch); + + c2c.forward(pf, pb); // forward transform + + // work on the data pointed by pb + + c2c.backward(pb, pf); // backward transform + + std::free(pf); + std::free(pb); + } + + amrex::Finalize_FFT(); + + MPI_Finalize(); diff --git a/Src/Base/AMReX.H b/Src/Base/AMReX.H index a1124ce69d..aeac696076 100644 --- a/Src/Base/AMReX.H +++ b/Src/Base/AMReX.H @@ -87,6 +87,18 @@ namespace amrex std::ostream& a_oserr = std::cerr, ErrorHandler a_errhandler = nullptr); + // \brief Minimal version of initialization. + // + // This version is intended for users who only need AMReX for some + // specific functionalities such as FFT. It's the user's responsibility + // to initialize MPI. For multiple-GPU systems, it's the user's + // responsibility to properly set the GPU devices to be used. We will not + // try to pre-allocate memory arenas. We will not install a signal + // handler. Functionalities like random number generator and async I/O + // will be work. However, functionalities like FFT and linear solvers do + // work. + void Init_minimal (MPI_Comm mpi_comm = MPI_COMM_WORLD); + /** \brief Returns true if there are any currently-active and initialized AMReX instances (i.e. one for which amrex::Initialize has been called, @@ -96,6 +108,10 @@ namespace amrex void Finalize (AMReX* pamrex); void Finalize (); // Finalize the current top + // For initialization with Init_minimal, Finalize_minimal should be used + // for finalization. + void Finalize_minimal(); + /** * \brief We maintain a stack of functions that need to be called in Finalize(). * The functions are called in LIFO order. The idea here is to allow diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp index cf4d1eb3c0..af403447fd 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -125,6 +125,11 @@ namespace system } } +namespace { + long long init_minimal_called = 0; + bool initialization_by_init_minimal = false; +} + namespace { std::string command_line; std::vector command_arguments; @@ -339,20 +344,37 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, ErrorHandler a_errhandler) { system::exename.clear(); + if (initialization_by_init_minimal) { + system::verbose = 0; + system::regtest_reduction = false; + system::signal_handling = false; + system::handle_sigsegv = false; + system::handle_sigterm = false; + system::handle_sigint = false; + system::handle_sigabrt = false; + system::handle_sigfpe = false; + system::handle_sigill = false; + system::call_addr2line = false; + system::throw_exception = false; + system::osout = &std::cout; + system::oserr = &std::cerr; + system::error_handler = nullptr; + } else { // system::verbose = 0; - system::regtest_reduction = false; - system::signal_handling = true; - system::handle_sigsegv = true; - system::handle_sigterm = false; - system::handle_sigint = true; - system::handle_sigabrt = true; - system::handle_sigfpe = true; - system::handle_sigill = true; - system::call_addr2line = true; - system::throw_exception = false; - system::osout = &a_osout; - system::oserr = &a_oserr; - system::error_handler = a_errhandler; + system::regtest_reduction = false; + system::signal_handling = true; + system::handle_sigsegv = true; + system::handle_sigterm = false; + system::handle_sigint = true; + system::handle_sigabrt = true; + system::handle_sigfpe = true; + system::handle_sigill = true; + system::call_addr2line = true; + system::throw_exception = false; + system::osout = &a_osout; + system::oserr = &a_oserr; + system::error_handler = a_errhandler; + } ParallelDescriptor::StartParallel(&argc, &argv, mpi_comm); @@ -510,7 +532,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, #ifdef AMREX_USE_GPU // Initialize after ParmParse so that we can read inputs. - Gpu::Device::Initialize(); + Gpu::Device::Initialize(initialization_by_init_minimal); #ifdef AMREX_USE_CUPTI CuptiInitialize(); #endif @@ -640,13 +662,15 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, ParallelDescriptor::Initialize(); BL_TINY_PROFILE_MEMORYINITIALIZE(); - Arena::Initialize(); + Arena::Initialize(initialization_by_init_minimal); amrex_mempool_init(); // // Initialize random seed after we're running in parallel. // - amrex::InitRandom(ParallelDescriptor::MyProc()+1, ParallelDescriptor::NProcs()); + if (!initialization_by_init_minimal) { + amrex::InitRandom(ParallelDescriptor::MyProc()+1, ParallelDescriptor::NProcs()); + } // For thread safety, we should do these initializations here. BaseFab_Initialize(); @@ -658,7 +682,9 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, MultiFab::Initialize(); iMultiFab::Initialize(); VisMF::Initialize(); - AsyncOut::Initialize(); + if (!initialization_by_init_minimal) { + AsyncOut::Initialize(); + } VectorGrowthStrategy::Initialize(); #ifdef AMREX_USE_FFT @@ -998,4 +1024,25 @@ FPExcept enableFPExcept (FPExcept excepts) return prev; } +void Init_minimal (MPI_Comm mpi_comm) +{ + ++init_minimal_called; + + if (Initialized()) { return; } + + initialization_by_init_minimal = true; + Initialize(mpi_comm); +} + +void Finalize_minimal () +{ + if (init_minimal_called > 0) { + --init_minimal_called; + } + if (init_minimal_called == 0 && initialization_by_init_minimal) { + Finalize(); + initialization_by_init_minimal = false; + } +} + } diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H index 12a1e79af7..f0419ee973 100644 --- a/Src/Base/AMReX_Arena.H +++ b/Src/Base/AMReX_Arena.H @@ -187,7 +187,7 @@ public: */ static std::size_t align (std::size_t sz); - static void Initialize (); + static void Initialize (bool minimal); static void PrintUsage (); static void PrintUsageToFiles (std::string const& filename, std::string const& message); static void Finalize (); diff --git a/Src/Base/AMReX_Arena.cpp b/Src/Base/AMReX_Arena.cpp index 8ca168bb1a..ff33163884 100644 --- a/Src/Base/AMReX_Arena.cpp +++ b/Src/Base/AMReX_Arena.cpp @@ -36,7 +36,7 @@ namespace { Arena* the_cpu_arena = nullptr; Arena* the_comms_arena = nullptr; - Long the_arena_init_size = 0L; + Long the_arena_init_size = 1024*1024*8; Long the_device_arena_init_size = 1024*1024*8; Long the_managed_arena_init_size = 1024*1024*8; Long the_pinned_arena_init_size = 1024*1024*8; @@ -280,7 +280,7 @@ namespace { } void -Arena::Initialize () +Arena::Initialize (bool minimal) { if (initialized) { return; } initialized = true; @@ -294,11 +294,18 @@ Arena::Initialize () BL_ASSERT(the_cpu_arena == nullptr || the_cpu_arena == The_BArena()); BL_ASSERT(the_comms_arena == nullptr || the_comms_arena == The_BArena()); + if (minimal) { + the_pinned_arena_init_size = 0; + } else { #ifdef AMREX_USE_GPU - the_arena_init_size = Gpu::Device::totalGlobalMem() / Gpu::Device::numDevicePartners() / 4L * 3L; + the_arena_init_size = Gpu::Device::totalGlobalMem() / Gpu::Device::numDevicePartners() / 4L * 3L; #ifdef AMREX_USE_SYCL - the_arena_init_size = std::min(the_arena_init_size, Gpu::Device::maxMemAllocSize()); + the_arena_init_size = std::min(the_arena_init_size, Gpu::Device::maxMemAllocSize()); +#endif #endif + } + +#ifdef AMREX_USE_GPU the_pinned_arena_release_threshold = Gpu::Device::totalGlobalMem() / Gpu::Device::numDevicePartners() / 2L; #endif diff --git a/Src/Base/AMReX_GpuDevice.H b/Src/Base/AMReX_GpuDevice.H index a7aef5a924..eb2b95943e 100644 --- a/Src/Base/AMReX_GpuDevice.H +++ b/Src/Base/AMReX_GpuDevice.H @@ -53,7 +53,7 @@ class Device public: - static void Initialize (); + static void Initialize (bool minimal); static void Finalize (); #if defined(AMREX_USE_GPU) @@ -184,7 +184,7 @@ public: private: - static void initialize_gpu (); + static void initialize_gpu (bool minimal); static AMREX_EXPORT int device_id; static AMREX_EXPORT int num_devices_used; diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp index 961fdb0406..7857c26aab 100644 --- a/Src/Base/AMReX_GpuDevice.cpp +++ b/Src/Base/AMReX_GpuDevice.cpp @@ -145,8 +145,9 @@ namespace { #endif void -Device::Initialize () +Device::Initialize (bool minimal) { + amrex::ignore_unused(minimal); #ifdef AMREX_USE_GPU #if defined(AMREX_USE_CUDA) && (defined(AMREX_PROFILING) || defined(AMREX_TINY_PROFILING)) @@ -201,7 +202,11 @@ Device::Initialize () int n_local_procs = 1; amrex::ignore_unused(n_local_procs); - if (ParallelDescriptor::NProcs() == 1) { + if (minimal) { + device_id = 0; + AMREX_HIP_OR_CUDA(AMREX_HIP_SAFE_CALL (hipGetDevice(&device_id));, + AMREX_CUDA_SAFE_CALL(cudaGetDevice(&device_id)); ); + } else if (ParallelDescriptor::NProcs() == 1) { device_id = 0; } else if (gpu_device_count == 1) { @@ -217,7 +222,7 @@ Device::Initialize () } } - if (gpu_device_count > 1){ + if (gpu_device_count > 1 && ! minimal) { if (Machine::name() == "nersc.perlmutter") { // The CPU/GPU mapping on perlmutter has the reverse order. device_id = gpu_device_count - device_id - 1; @@ -241,19 +246,23 @@ Device::Initialize () } } - AMREX_HIP_OR_CUDA(AMREX_HIP_SAFE_CALL (hipSetDevice(device_id));, - AMREX_CUDA_SAFE_CALL(cudaSetDevice(device_id)); ); +#if !defined(AMREX_USE_SYCL) + if ( ! minimal) { + AMREX_HIP_OR_CUDA(AMREX_HIP_SAFE_CALL (hipSetDevice(device_id));, + AMREX_CUDA_SAFE_CALL(cudaSetDevice(device_id)); ); + } +#endif #ifdef AMREX_USE_ACC amrex_initialize_acc(device_id); #endif - initialize_gpu(); + initialize_gpu(minimal); num_devices_used = ParallelDescriptor::NProcs(); #ifdef AMREX_USE_MPI - if (ParallelDescriptor::NProcs() > 1) { + if (ParallelDescriptor::NProcs() > 1 && ! minimal) { #if defined(HIP_VERSION_MAJOR) && defined(HIP_VERSION_MINOR) && ((HIP_VERSION_MAJOR < 5) || ((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR < 2))) @@ -336,7 +345,7 @@ Device::Initialize () } #endif /* AMREX_USE_MPI */ - if (amrex::Verbose()) { + if (amrex::Verbose() && ! minimal) { #if defined(AMREX_USE_CUDA) amrex::Print() << "CUDA" #elif defined(AMREX_USE_HIP) @@ -401,8 +410,10 @@ Device::Finalize () } void -Device::initialize_gpu () +Device::initialize_gpu (bool minimal) { + amrex::ignore_unused(minimal); + #ifdef AMREX_USE_GPU gpu_stream_pool.resize(max_gpu_streams); @@ -436,10 +447,12 @@ Device::initialize_gpu () #endif #if (__CUDACC_VER_MAJOR__ < 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ < 4)) - if (sizeof(Real) == 8) { - AMREX_CUDA_SAFE_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); - } else if (sizeof(Real) == 4) { - AMREX_CUDA_SAFE_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); + if ( ! minimal ) { + if (sizeof(Real) == 8) { + AMREX_CUDA_SAFE_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); + } else if (sizeof(Real) == 4) { + AMREX_CUDA_SAFE_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); + } } #endif diff --git a/Src/FFT/AMReX_FFT.H b/Src/FFT/AMReX_FFT.H index 11bf4f4cc8..7a7c85f5e9 100644 --- a/Src/FFT/AMReX_FFT.H +++ b/Src/FFT/AMReX_FFT.H @@ -2,11 +2,33 @@ #define AMREX_FFT_H_ #include +#include #include #include #include #include +namespace amrex +{ + /** + * \brief Initialize FFT + * + * This is needed only when the user wants to use amrex::FFT, but does + * not want to call amrex::Initialize to initialize the full version of + * AMReX. Note that one usually only needs to call Init_FFT and + * Finalize_FFT once in the entire program. + */ +#ifdef AMREX_USE_MPI + inline void Init_FFT (MPI_Comm comm ) { amrex::Init_minimal(comm); } +#else + inline void Init_FFT () { amrex::Init_minimal(); } +#endif + + //! If Init_FFT is called, this should be called after all the FFT works + //! are done. + inline void Finalize_FFT () { amrex::Finalize_minimal(); } +} + namespace amrex::FFT { void Initialize (); diff --git a/Src/FFT/AMReX_FFT_R2C.H b/Src/FFT/AMReX_FFT_R2C.H index fa071fea6d..eb32afc13a 100644 --- a/Src/FFT/AMReX_FFT_R2C.H +++ b/Src/FFT/AMReX_FFT_R2C.H @@ -26,6 +26,12 @@ template class PoissonHybrid; * convention, where applying the forward transform followed by the backward * transform scales the original data by the size of the input array. * + * The arrays are assumed to be in column-major, which is different from + * FFTW's row-major layout. Because the complex domain data have the + * Hermitian symmetry, only half of the data in the complex domain are + * stored. If the real domain size is nx * ny * nz, the complex domain's + * size will be (nx/2+1) * ny * nz. + * * For more details, we refer the users to * https://amrex-codes.github.io/amrex/docs_html/FFT_Chapter.html. */ @@ -50,6 +56,18 @@ public: */ explicit R2C (Box const& domain, Info const& info = Info{}); + /** + * \brief Constructor + * + * If AMREX_SPACEDIM is 3 and you want to do 2D FFT, you just need to + * set the size of one of the dimensions to 1. + * + * \param domain_size size of the forward domain (i.e., the real data domain) + * \param info optional information + */ + explicit R2C (std::array const& domain_size, + Info const& info = Info{}); + ~R2C (); R2C (R2C const&) = delete; @@ -57,6 +75,97 @@ public: R2C& operator= (R2C const&) = delete; R2C& operator= (R2C &&) = delete; + /** + * \brief Set local domain + * + * This is needed, only if one uses the raw pointer interfaces, not the + * amrex::MulitFab interfaces. This may contain collective MPI calls. So + * all processes even if their local size is zero should call. This + * function informs AMReX the domain decomposition chosen by the user + * for the forward domain (i.e., real data domain). There is no + * constraint on the domain decomposition strategy. One can do 1D, 2D or + * 3D domain decomposition. Alternatively, one could also let AMReX + * choose for you by calling getLocalDomain. The latter could + * potentially reduce data communication. + * + * Again, this is needed, only if one uses the raw pointer + * interfaces. Only one of the functions, setLocalDomain and + * getLocalDomain, should be called. + * + * This should only be called once unless the domain decomposition + * changes. + * + * local_start starting indices of the local domain + * local_size size of the local domain + */ + void setLocalDomain (std::array const& local_start, + std::array const& local_size); + + /** + * \brief Get local domain + * + * This function returns the domain decomposition chosen by AMReX. The + * first part of the pair is the local starting indices, and the second + * part is the local domain size. + * + * This is needed, only if one uses the raw pointer interfaces, not the + * amrex::MulitFab interfaces. Only one of the functions, setLocalDomain + * and getLocalDomain, should be called. + */ + std::pair,std::array> + getLocalDomain () const; + + /** + * \brief Set local spectral domain + * + * This is needed, only if one uses the raw pointer interfaces, not the + * amrex::MulitFab interfaces. This may contain collective MPI calls. So + * all processes even if their local size is zero should call. This + * function informs AMReX the domain decomposition chosen by the user + * for the complex data domain. There is no constraint on the domain + * decomposition strategy. One can do 1D, 2D or 3D domain + * decomposition. Alternatively, one could also let AMReX choose for you + * by calling getLocalSpectralDomain. The latter could potentially + * reduce data communication. + * + * Again, this is needed, only if one uses the raw pointer + * interfaces. Only one of the functions, setLocalSpectralDomain and + * getLocalSpectralDomain, should be called. Note that one could use + * this function together with getLocalDomain. That is the user is + * allowed to choose their own spectral domain decomposition, while let + * AMReX choose the real data domain decomposition. Also note that the + * entire spectral domain has the size of (nx+1)/2 * ny * nz, if the real + * domain is nx * ny * nz. + * + * This should only be called once unless the domain decomposition + * changes. + * + * local_start starting indices of the local domain + * local_size size of the local domain + */ + void setLocalSpectralDomain (std::array const& local_start, + std::array const& local_size); + + /** + * \brief Get local spectral domain + * + * This function returns the domain decomposition chosen by AMReX for + * the complex data spectral domain. The returned pair contains the + * local starting indices and the local domain size. + * + * This is needed, only if one uses the raw pointer interfaces, not the + * amrex::MulitFab interfaces. Only one of the functions, + * setLocalSpectralDomain and getLocalSpectralDomain, should be called. + * Note that one could use this function together with + * setLocalDomain. That is the user is allowed to choose their own real + * domain decomposition, while let AMReX choose the spectral data domain + * decomposition. Also note that the entire spectral domain has the size + * of (nx+1)/2 * ny * nz, if the real domain is nx * ny * nz. + * + */ + std::pair,std::array> + getLocalSpectralDomain () const; + /** * \brief Forward and then backward transform * @@ -113,6 +222,26 @@ public: DIR == Direction::both, int> = 0> void forward (MF const& inmf, cMF& outmf, int incomp = 0, int outcomp = 0); + /** + * \brief Forward transform + * + * This raw pointer version of forward requires + * setLocalDomain/getLocalDomain and + * setLocalSpectralDomain/getLocalSpectralDomain have been called + * already. Note that one is allowed to call this function multiple + * times after the set/get domain functions are called only once, unless + * the domain decomposition changes. In fact, that is the preferred way + * because it has better performance. All processes need to call this + * function even if their local size is zero. If the local size is zero, + * one can pass nullptrs. + */ + template = 0> + void forward (RT const* in, CT* out); + /** * \brief Backward transform * @@ -137,6 +266,26 @@ public: DIR == Direction::both, int> = 0> void backward (cMF const& inmf, MF& outmf, int incomp = 0, int outcomp = 0); + /** + * \brief Backward transform + * + * This raw pointer version of backward requires + * setLocalDomain/getLocalDomain and + * setLocalSpectralDomain/getLocalSpectralDomain have been called + * already. Note that one is allowed to call this function multiple + * times after the set/get domain functions are called only once unless + * the domain decomposition changes. In fact, that is the preferred way + * because it has better performance. All processes need to call this + * function even if their local size is zero. If the local size is zero, + * one can pass nullptrs. + */ + template = 0> + void backward (CT const* in, RT* out); + //! Scaling factor. If the data goes through forward and then backward, //! the result multiplied by the scaling factor is equal to the original //! data. @@ -153,7 +302,7 @@ public: */ template = 0> - std::pair getSpectralData (); + std::pair getSpectralData () const; /** * \brief Get BoxArray and DistributionMapping for spectral data @@ -231,6 +380,14 @@ private: } } + static std::pair + make_layout_from_local_domain (std::array const& local_start, + std::array const& local_size); + + template + std::pair,std::size_t> + install_raw_ptr (FA& fa, RT const* p); + Plan m_fft_fwd_x{}; Plan m_fft_bwd_x{}; Plan m_fft_fwd_y{}; @@ -263,6 +420,9 @@ private: cMF m_cy; cMF m_cz; + mutable MF m_raw_mf; + mutable cMF m_raw_cmf; + std::unique_ptr m_data_1; std::unique_ptr m_data_2; @@ -529,6 +689,11 @@ R2C::R2C (Box const& domain, Info const& info) } } +template +R2C::R2C (std::array const& domain_size, Info const& info) + : R2C(Box(IntVect(0),IntVect(domain_size)-1), info) +{} + template R2C::~R2C () { @@ -550,6 +715,89 @@ R2C::~R2C () m_fft_fwd_x_half.destroy(); } +template +std::pair +R2C::make_layout_from_local_domain (std::array const& local_start, + std::array const& local_size) +{ + IntVect lo(local_start); + IntVect len(local_size); + Box bx(lo, lo+len-1); +#ifdef AMREX_USE_MPI + Vector allboxes(ParallelDescriptor::NProcs()); + MPI_Allgather(&bx, 1, ParallelDescriptor::Mpi_typemap::type(), + allboxes.data(), 1, ParallelDescriptor::Mpi_typemap::type(), + ParallelDescriptor::Communicator()); + Vector pmap; + pmap.reserve(allboxes.size()); + for (int i = 0; i < allboxes.size(); ++i) { + if (allboxes[i].ok()) { + pmap.push_back(i); + } + } + allboxes.erase(std::remove_if(allboxes.begin(), allboxes.end(), + [=] (Box const& b) { return b.isEmpty(); }), + allboxes.end()); + BoxList bl(std::move(allboxes)); + return std::make_pair(BoxArray(std::move(bl)), DistributionMapping(std::move(pmap))); +#else + return std::make_pair(BoxArray(bx), DistributionMapping(Vector({0}))); +#endif +} + +template +void R2C::setLocalDomain (std::array const& local_start, + std::array const& local_size) +{ + auto const& [ba, dm] = make_layout_from_local_domain(local_start, local_size); + m_raw_mf = MF(ba, dm, m_rx.nComp(), 0, MFInfo().SetAlloc(false)); +} + +template +std::pair,std::array> +R2C::getLocalDomain () const +{ + m_raw_mf = MF(m_rx.boxArray(), m_rx.DistributionMap(), m_rx.nComp(), 0, + MFInfo{}.SetAlloc(false)); + + auto const myproc = ParallelDescriptor::MyProc(); + if (myproc < m_rx.size()) { + Box const& box = m_rx.box(myproc); + return std::make_pair(box.smallEnd().toArray(), + box.length().toArray()); + } else { + return std::make_pair(std::array{AMREX_D_DECL(0,0,0)}, + std::array{AMREX_D_DECL(0,0,0)}); + } +} + +template +void R2C::setLocalSpectralDomain (std::array const& local_start, + std::array const& local_size) +{ + auto const& [ba, dm] = make_layout_from_local_domain(local_start, local_size); + m_raw_cmf = cMF(ba, dm, m_rx.nComp(), 0, MFInfo().SetAlloc(false)); +} + +template +std::pair,std::array> +R2C::getLocalSpectralDomain () const +{ + auto const ncomp = m_info.batch_size; + auto const& [ba, dm] = getSpectralDataLayout(); + + m_raw_cmf = cMF(ba, dm, ncomp, 0, MFInfo{}.SetAlloc(false)); + + auto const myproc = ParallelDescriptor::MyProc(); + if (myproc < m_raw_cmf.size()) { + Box const& box = m_raw_cmf.box(myproc); + return std::make_pair(box.smallEnd().toArray(), box.length().toArray()); + } else { + return std::make_pair(std::array{AMREX_D_DECL(0,0,0)}, + std::array{AMREX_D_DECL(0,0,0)}); + } +} + template void R2C::prepare_openbc () { @@ -678,6 +926,76 @@ void R2C::forward (MF const& inmf, int incomp) m_fft_fwd_z.template compute_c2c(); } +template +template +std::pair,std::size_t> +R2C::install_raw_ptr (FA& fa, RT const* p) +{ + AMREX_ALWAYS_ASSERT(!fa.empty()); + + using FAB = typename FA::FABType::value_type; + using T_FAB = typename FAB::value_type; + static_assert((sizeof(T_FAB) == sizeof(RT))); + + auto const ncomp = m_info.batch_size; + auto const& ia = fa.IndexArray(); + + T_FAB* pp = nullptr; + std::size_t sz = 0; + + if ( ! ia.empty() ) { + int K = ia[0]; + Box const& box = fa.fabbox(K); + if ((alignof(T_FAB) == alignof(RT)) || amrex::is_aligned(p,alignof(T_FAB))) { + pp = (T_FAB*)p; + } else { + sz = sizeof(T_FAB) * box.numPts() * ncomp; + pp = (T_FAB*) The_Arena()->alloc(sz); + } + fa.setFab(K, FAB(box,ncomp,pp)); + } + + if (sz == 0) { + return std::make_pair(std::unique_ptr{},std::size_t(0)); + } else { + return std::make_pair(std::unique_ptr + {(char*)pp,DataDeleter{The_Arena()}}, sz); + } +} + + +template +template > +void R2C::forward (RT const* in, CT* out) +{ + auto [rdata, rsz] = install_raw_ptr(m_raw_mf, in); + auto [cdata, csz] = install_raw_ptr(m_raw_cmf, out); + + if (rsz > 0) { +#ifdef AMREX_USE_GPU + Gpu::dtod_memcpy_async(rdata.get(),in,rsz); + Gpu::streamSynchronize(); +#else + std::memcpy(rdata.get(),in,rsz); +#endif + } + + forward(m_raw_mf, m_raw_cmf); + + if (csz) { +#ifdef AMREX_USE_GPU + Gpu::dtod_memcpy_async(out,cdata.get(),csz); + Gpu::streamSynchronize(); +#else + std::memcpy(out,cdata.get(),csz); +#endif + } +} + template template > void R2C::backward (MF& outmf, int outcomp) @@ -745,6 +1063,38 @@ void R2C::backward_doit (MF& outmf, IntVect const& ngout, amrex::elemwiseMin(ngout,outmf.nGrowVect()), period); } +template +template > +void R2C::backward (CT const* in, RT* out) +{ + auto [rdata, rsz] = install_raw_ptr(m_raw_mf, out); + auto [cdata, csz] = install_raw_ptr(m_raw_cmf, in); + + if (csz) { +#ifdef AMREX_USE_GPU + Gpu::dtod_memcpy_async(cdata.get(),in,csz); + Gpu::streamSynchronize(); +#else + std::memcpy(cdata.get(),in,csz); +#endif + } + + backward(m_raw_cmf, m_raw_mf); + + if (rsz > 0) { +#ifdef AMREX_USE_GPU + Gpu::dtod_memcpy_async(out,rdata.get(),rsz); + Gpu::streamSynchronize(); +#else + std::memcpy(out,rdata.get(),rsz); +#endif + } +} + template std::pair, Plan> R2C::make_c2c_plans (cMF& inout, int ndims) const @@ -895,7 +1245,7 @@ template template > std::pair::cMF *, IntVect> -R2C::getSpectralData () +R2C::getSpectralData () const { #if (AMREX_SPACEDIM > 1) if (m_r2c_sub) { @@ -904,11 +1254,11 @@ R2C::getSpectralData () } else #endif if (!m_cz.empty()) { - return std::make_pair(&m_cz, IntVect{AMREX_D_DECL(2,0,1)}); + return std::make_pair(const_cast(&m_cz), IntVect{AMREX_D_DECL(2,0,1)}); } else if (!m_cy.empty()) { - return std::make_pair(&m_cy, IntVect{AMREX_D_DECL(1,0,2)}); + return std::make_pair(const_cast(&m_cy), IntVect{AMREX_D_DECL(1,0,2)}); } else { - return std::make_pair(&m_cx, IntVect{AMREX_D_DECL(0,1,2)}); + return std::make_pair(const_cast(&m_cx), IntVect{AMREX_D_DECL(0,1,2)}); } } diff --git a/Src/LinearSolvers/AMReX_SpMatrix.H b/Src/LinearSolvers/AMReX_SpMatrix.H index d0c082759b..88b44339d9 100644 --- a/Src/LinearSolvers/AMReX_SpMatrix.H +++ b/Src/LinearSolvers/AMReX_SpMatrix.H @@ -36,7 +36,7 @@ public: //! Define a matrix with CSR format data. Note that mat and col_index //! should contains nnz elements. The number of elements in row_index - //! should the numbrer of local rows plus 1. The data can be freed after + //! should be the number of local rows plus 1. The data can be freed after //! this function call. For GPU builds, the data are expected to be in //! GPU memory. void define (AlgPartition partition, T const* mat, Long const* col_index, diff --git a/Tests/FFT/C2C/main.cpp b/Tests/FFT/C2C/main.cpp index df87d4e641..9c92428200 100644 --- a/Tests/FFT/C2C/main.cpp +++ b/Tests/FFT/C2C/main.cpp @@ -129,6 +129,61 @@ int main (int argc, char* argv[]) #endif AMREX_ALWAYS_ASSERT(error < eps); } + + { + auto sba = amrex::decompose(domain, ParallelDescriptor::NProcs()); + DistributionMapping sdm{sba}; + cMultiFab smf(sba,sdm,1,0); + smf.ParallelCopy(mf); + + auto domain_size = domain.length().toArray(); + FFT::C2C c2c(domain_size); + + GpuComplex* pio = nullptr; + std::array local_start{AMREX_D_DECL(0,0,0)}; + std::array local_size{AMREX_D_DECL(0,0,0)}; + auto const& imap = smf.IndexArray(); + AMREX_ALWAYS_ASSERT(imap.size() <= 1); + if (imap.size() == 1) { + pio = smf[imap[0]].dataPtr(); + auto const& box = sba[imap[0]]; + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + local_start[idim] = box.smallEnd(idim); + local_size[idim] = box.length(idim); + } + } + c2c.setLocalDomain(local_start, local_size); + + auto const& [sp_start, sp_size, sp_order] = c2c.getLocalSpectralDomain(); + auto npts = AMREX_D_TERM(std::size_t(sp_size[0]), + *std::size_t(sp_size[1]), + *std::size_t(sp_size[2])); + Gpu::DeviceVector> spv(npts); + + c2c.forward(pio, spv.data()); + c2c.backward(spv.data(), pio); + + amrex::Scale(smf, -scaling, 0, 1, 0); + smf.ParallelAdd(mf); + + MultiFab errmf(sba,sdm,1,0); + auto const& errma = errmf.arrays(); + + auto const& sma = smf.const_arrays(); + ParallelFor(smf, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k) + { + errma[b](i,j,k) = amrex::norm(sma[b](i,j,k)); + }); + + auto error = errmf.norminf(); + amrex::Print() << " Expected to be close to zero: " << error << "\n"; +#ifdef AMREX_USE_FLOAT + auto eps = 1.e-6f; +#else + auto eps = 1.e-13; +#endif + AMREX_ALWAYS_ASSERT(error < eps); + } } amrex::Finalize(); } diff --git a/Tests/FFT/R2C/main.cpp b/Tests/FFT/R2C/main.cpp index 1f3a0e6854..08861cea68 100644 --- a/Tests/FFT/R2C/main.cpp +++ b/Tests/FFT/R2C/main.cpp @@ -166,6 +166,72 @@ int main (int argc, char* argv[]) auto eps = 1.e-6f; #else auto eps = 1.e-13; +#endif + AMREX_ALWAYS_ASSERT(error < eps); + } + + // Test raw pointer interface + { + auto sba = amrex::decompose(domain, ParallelDescriptor::NProcs()); + DistributionMapping sdm{sba}; + MultiFab smf(sba,sdm,1,0); + smf.ParallelCopy(mf); + + Box cdomain(IntVect(0),IntVect(AMREX_D_DECL(domain.length(0)/2, + domain.length(1)-1, + domain.length(2)-1))); + auto sba_c = amrex::decompose(cdomain, ParallelDescriptor::NProcs()); + DistributionMapping sdm_c{sba_c}; + FabArray>> smf_c(sba_c,sdm_c,1,0); + + auto domain_size = domain.length().toArray(); + FFT::R2C r2c(domain_size); + + Real* preal = nullptr; + std::array local_start{AMREX_D_DECL(0,0,0)}; + std::array local_size{AMREX_D_DECL(0,0,0)}; + auto const& imap = smf.IndexArray(); + AMREX_ALWAYS_ASSERT(imap.size() <= 1); + if (imap.size() == 1) { + preal = smf[imap[0]].dataPtr(); + auto const& box = sba[imap[0]]; + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + local_start[idim] = box.smallEnd(idim); + local_size[idim] = box.length(idim); + } + } + r2c.setLocalDomain(local_start, local_size); + + GpuComplex* pcomplex = nullptr; + std::array local_start_c{AMREX_D_DECL(0,0,0)}; + std::array local_size_c{AMREX_D_DECL(0,0,0)}; + auto const& imap_c = smf_c.IndexArray(); + AMREX_ALWAYS_ASSERT(imap_c.size() <= 1); + if (imap_c.size() == 1) { + pcomplex = smf_c[imap_c[0]].dataPtr(); + auto const& box = sba_c[imap_c[0]]; + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + local_start_c[idim] = box.smallEnd(idim); + local_size_c[idim] = box.length(idim); + } + } + r2c.setLocalSpectralDomain(local_start_c, local_size_c); + + r2c.forward(preal, pcomplex); + + amrex::Scale(smf_c, scaling, 0, 1, 0); + + r2c.backward(pcomplex, preal); + + mf2.ParallelCopy(smf); + MultiFab::Subtract(mf2, mf, 0, 0, 1, 0); + + auto error = mf2.norminf(); + amrex::Print() << " Expected to be close to zero: " << error << "\n"; +#ifdef AMREX_USE_FLOAT + auto eps = 1.e-6f; +#else + auto eps = 1.e-13; #endif AMREX_ALWAYS_ASSERT(error < eps); } diff --git a/Tests/FFT/RawPtr/CMakeLists.txt b/Tests/FFT/RawPtr/CMakeLists.txt new file mode 100644 index 0000000000..21a9d3b268 --- /dev/null +++ b/Tests/FFT/RawPtr/CMakeLists.txt @@ -0,0 +1,10 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + set(_sources main.cpp) + + set(_input_files) + + setup_test(${D} _sources _input_files) + + unset(_sources) + unset(_input_files) +endforeach() diff --git a/Tests/FFT/RawPtr/GNUmakefile b/Tests/FFT/RawPtr/GNUmakefile new file mode 100644 index 0000000000..93376f4485 --- /dev/null +++ b/Tests/FFT/RawPtr/GNUmakefile @@ -0,0 +1,26 @@ +AMREX_HOME := ../../.. + +DEBUG = FALSE + +DIM = 3 + +COMP = gcc + +USE_MPI = TRUE +USE_OMP = FALSE +USE_CUDA = FALSE +USE_HIP = FALSE +USE_SYCL = FALSE + +USE_FFT = TRUE + +BL_NO_FORT = TRUE + +TINY_PROFILE = FALSE + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/FFT/RawPtr/Make.package b/Tests/FFT/RawPtr/Make.package new file mode 100644 index 0000000000..6b4b865e8f --- /dev/null +++ b/Tests/FFT/RawPtr/Make.package @@ -0,0 +1 @@ +CEXE_sources += main.cpp diff --git a/Tests/FFT/RawPtr/main.cpp b/Tests/FFT/RawPtr/main.cpp new file mode 100644 index 0000000000..29adbffa70 --- /dev/null +++ b/Tests/FFT/RawPtr/main.cpp @@ -0,0 +1,139 @@ +#include + +int main (int argc, char* argv[]) +{ +#ifdef AMREX_USE_MPI + MPI_Init(&argc, &argv); + amrex::Init_FFT(MPI_COMM_WORLD); +#else + amrex::Init_FFT(); +#endif + + int nprocs, myproc; +#ifdef AMREX_USE_MPI + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &myproc); +#else + nprocs = 1; + myproc = 0; +#endif + + using RT = amrex::Real; + using CT = amrex::GpuComplex; + + std::array domain_size{AMREX_D_DECL(128,128,128)}; + + { + amrex::FFT::R2C r2c(domain_size); + + int nx = (domain_size[0] + nprocs - 1) / nprocs; + int xlo = nx * myproc; + nx = std::max(std::min(nx,domain_size[0]-xlo), 0); + std::array local_start{AMREX_D_DECL(xlo,0,0)}; + std::array local_size{AMREX_D_DECL(nx,domain_size[1],domain_size[2])}; + + r2c.setLocalDomain(local_start,local_size); + + auto const& [local_start_sp, local_size_sp] = r2c.getLocalSpectralDomain(); + amrex::ignore_unused(local_start_sp); + + auto nr = AMREX_D_TERM(std::size_t(local_size[0]), + *std::size_t(local_size[1]), + *std::size_t(local_size[2])); + auto* pr = (RT*)amrex::The_Arena()->alloc(sizeof(RT)*nr); + amrex::ParallelFor(nr, [=] AMREX_GPU_DEVICE (std::size_t i) + { + pr[i] = std::sin(i); + }); + + auto nc = AMREX_D_TERM(std::size_t(local_size_sp[0]), + *std::size_t(local_size_sp[1]), + *std::size_t(local_size_sp[2])); + auto* pc = (CT*)amrex::The_Arena()->alloc(sizeof(CT)*nc); + + r2c.forward(pr, pc); + + auto scaling = r2c.scalingFactor(); + amrex::ParallelFor(nc, [=] AMREX_GPU_DEVICE (std::size_t i) + { + pc[i] *= scaling; + }); + + r2c.backward(pc, pr); + + auto error = amrex::Reduce::Max + (nr, [=] AMREX_GPU_DEVICE (std::size_t i) + { + return std::abs(pr[i]-std::sin(i)); + }); + amrex::Print() << " Expected to be close to zero: " << error << "\n"; +#ifdef AMREX_USE_FLOAT + auto eps = 1.e-6f; +#else + auto eps = 1.e-13; +#endif + amrex::ParallelDescriptor::Barrier(); + AMREX_ALWAYS_ASSERT(error < eps); + } + + int nbatch = 3; + { + amrex::FFT::Info info{}; + info.setBatchSize(nbatch); + amrex::FFT::C2C c2c(domain_size,info); + + auto const& [local_start, local_size] = c2c.getLocalDomain(); + + int nx = (domain_size[0] + nprocs - 1) / nprocs; + int xlo = nx * myproc; + nx = std::max(std::min(nx,domain_size[0]-xlo), 0); + std::array local_start_sp{AMREX_D_DECL(xlo,0,0)}; + std::array local_size_sp{AMREX_D_DECL(nx,domain_size[1],domain_size[2])}; + + c2c.setLocalSpectralDomain(local_start_sp, local_size_sp); + + auto nf = AMREX_D_TERM(std::size_t(local_size[0]), + *std::size_t(local_size[1]), + *std::size_t(local_size[2])); + auto* pf = (CT*)amrex::The_Arena()->alloc(sizeof(CT)*nf*nbatch); + amrex::ParallelFor(nf, [=] AMREX_GPU_DEVICE (std::size_t i) + { + pf[i] = amrex::GpuComplex(std::sin(i), std::cos(i)); + }); + + auto nb = AMREX_D_TERM(std::size_t(local_size_sp[0]), + *std::size_t(local_size_sp[1]), + *std::size_t(local_size_sp[2])); + auto* pb = (CT*)amrex::The_Arena()->alloc(sizeof(CT)*nb*nbatch); + + c2c.forward(pf, pb); + + auto scaling = c2c.scalingFactor(); + amrex::ParallelFor(nb, [=] AMREX_GPU_DEVICE (std::size_t i) + { + pb[i] *= scaling; + }); + + c2c.backward(pb, pf); + + auto error = amrex::Reduce::Max + (nf, [=] AMREX_GPU_DEVICE (std::size_t i) + { + return amrex::norm(pf[i]-amrex::GpuComplex(std::sin(i),std::cos(i))); + }); + amrex::Print() << " Expected to be close to zero: " << error << "\n"; +#ifdef AMREX_USE_FLOAT + auto eps = 1.e-6f; +#else + auto eps = 1.e-13; +#endif + amrex::ParallelDescriptor::Barrier(); + AMREX_ALWAYS_ASSERT(error < eps); + } + + amrex::Finalize_FFT(); + +#ifdef AMREX_USE_MPI + MPI_Finalize(); +#endif +}