diff --git a/Docs/sphinx_documentation/source/FFT.rst b/Docs/sphinx_documentation/source/FFT.rst
index ac24e1c818..863539b6e3 100644
--- a/Docs/sphinx_documentation/source/FFT.rst
+++ b/Docs/sphinx_documentation/source/FFT.rst
@@ -197,3 +197,120 @@ support non-uniform cell size in the z-direction. For most applications,
 Similar to :cpp:`FFT::R2C`, the Poisson solvers should be cached for reuse,
 and one might need to use :cpp:`std::unique_ptr<FFT::Poisson<MultiFab>>`
 because there is no default constructor.
+
+.. _sec:FFT:rawptr:
+
+Raw Pointer Interface
+=====================
+
+If you only want to use AMReX as an FFT library without using other
+functionalities and data containers, you could use the raw pointer
+interface. Below is an example.
+
+.. highlight:: c++
+
+::
+
+    MPI_Init(&argc, &argv);
+
+    // We don't need to call the full-blown amrex::Initialize
+    amrex::Init_FFT(MPI_COMM_WORLD);
+
+    int nprocs, myproc;
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+
+    using RT = double;
+    using CT = std::complex<RT>; // or cufftDoubleComplex, etc.
+
+    std::array<int,3> domain_size{128,128,128};
+
+    // FFT between real and complex.
+    // Domain decomposition is flexible. The only constraint for the raw
+    // pointer interface is that there can be only zero or one local box
+    // per process, whereas the MultiFab interface can take any number of
+    // boxes. In this case, we choose to do manual domain decomposition for
+    // the real (i.e., forward) domain, and use the domain decomposition
+    // provided by amrex for the complex (i.e., backward) domain.
+    {
+        amrex::FFT::R2C<RT,amrex::FFT::Direction::both> r2c(domain_size);
+
+        int nx = (domain_size[0] + nprocs - 1) / nprocs;
+        int xlo = nx * myproc;
+        nx = std::max(std::min(nx,domain_size[0]-xlo), 0);
+        std::array<int,3> local_start{xlo,0,0};
+        std::array<int,3> local_size{nx,domain_size[1],domain_size[2]};
+
+        // Let amrex know the domain decomposition in the forward domain.
+        r2c.setLocalDomain(local_start,local_size);
+
+        // Use amrex's domain decomposition in the backward domain.
+        auto const& [local_start_sp, local_size_sp] = r2c.getLocalSpectralDomain();
+
+        auto nr = std::size_t(local_size[0])
+                * std::size_t(local_size[1])
+                * std::size_t(local_size[2]);
+        auto* pr = (RT*)std::malloc(sizeof(RT)*nr); // or use cudaMalloc
+        // Initialize data ...
+
+        auto nc = std::size_t(local_size_sp[0])
+                * std::size_t(local_size_sp[1])
+                * std::size_t(local_size_sp[2]);
+        auto* pc = (CT*)std::malloc(sizeof(CT)*nc); // or use cudaMalloc
+
+        r2c.forward(pr, pc); // forward transform from real to complex
+
+        // work on the complex data pointed by pc ...
+
+        r2c.backward(pc, pr); // backward transform from complex to real
+
+        std::free(pr);
+        std::free(pc);
+    }
+
+    // Batched FFT between complex and complex.
+    // In this case, we choose to use the domain decomposition provided
+    // by amrex for the forward domain, and do manual domain decomposition
+    // for the backward domain.
+    int nbatch = 3; // batch size
+    {
+        amrex::FFT::Info info{};
+        info.setBatchSize(nbatch);
+        amrex::FFT::C2C<RT,amrex::FFT::Direction::both> c2c(domain_size,info);
+
+        // Use amrex's domain decomposition in the forward domain.
+        auto const& [local_start, local_size] = c2c.getLocalDomain();
+
+        int nx = (domain_size[0] + nprocs - 1) / nprocs;
+        int xlo = nx * myproc;
+        nx = std::max(std::min(nx,domain_size[0]-xlo), 0);
+        std::array<int,3> local_start_sp{xlo,0,0};
+        std::array<int,3> local_size_sp{nx,domain_size[1],domain_size[2]};
+
+        // Let amrex know the domain decomposition in the backward domain.
+        c2c.setLocalSpectralDomain(local_start_sp, local_size_sp);
+
+        auto nf = std::size_t(local_size[0])
+                * std::size_t(local_size[1])
+                * std::size_t(local_size[2]);
+        auto* pf = (CT*)std::malloc(sizeof(CT)*nf*nbatch); // or use cudaMalloc
+        // Initialize data ...
+
+        auto nb = std::size_t(local_size_sp[0])
+                * std::size_t(local_size_sp[1])
+                * std::size_t(local_size_sp[2]);
+        auto* pb = (CT*)std::malloc(sizeof(CT)*nb*nbatch);
+
+        c2c.forward(pf, pb); // forward transform
+
+        // work on the data pointed by pb
+
+        c2c.backward(pb, pf); // backward transform
+
+        std::free(pf);
+        std::free(pb);
+    }
+
+    amrex::Finalize_FFT();
+
+    MPI_Finalize();
diff --git a/Src/Base/AMReX.H b/Src/Base/AMReX.H
index a1124ce69d..aeac696076 100644
--- a/Src/Base/AMReX.H
+++ b/Src/Base/AMReX.H
@@ -87,6 +87,18 @@ namespace amrex
                        std::ostream& a_oserr = std::cerr,
                        ErrorHandler a_errhandler = nullptr);
 
+    // \brief Minimal version of initialization.
+    //
+    // This version is intended for users who only need AMReX for some
+    // specific functionalities such as FFT. It's the user's responsibility
+    // to initialize MPI. For multiple-GPU systems, it's the user's
+    // responsibility to properly set the GPU devices to be used. We will not
+    // try to pre-allocate memory arenas. We will not install a signal
+    // handler. Functionalities like random number generator and async I/O
+    // will be work. However, functionalities like FFT and linear solvers do
+    // work.
+    void Init_minimal (MPI_Comm mpi_comm = MPI_COMM_WORLD);
+
     /**
        \brief Returns true if there are any currently-active and initialized
        AMReX instances (i.e. one for which amrex::Initialize has been called,
@@ -96,6 +108,10 @@ namespace amrex
 
     void Finalize (AMReX* pamrex);
     void Finalize (); // Finalize the current top
+    // For initialization with Init_minimal, Finalize_minimal should be used
+    // for finalization.
+    void Finalize_minimal();
+
     /**
     * \brief We maintain a stack of functions that need to be called in Finalize().
     * The functions are called in LIFO order.  The idea here is to allow
diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp
index cf4d1eb3c0..af403447fd 100644
--- a/Src/Base/AMReX.cpp
+++ b/Src/Base/AMReX.cpp
@@ -125,6 +125,11 @@ namespace system
 }
 }
 
+namespace {
+    long long init_minimal_called = 0;
+    bool initialization_by_init_minimal = false;
+}
+
 namespace {
     std::string command_line;
     std::vector<std::string> command_arguments;
@@ -339,20 +344,37 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse,
                    ErrorHandler a_errhandler)
 {
     system::exename.clear();
+    if (initialization_by_init_minimal) {
+        system::verbose = 0;
+        system::regtest_reduction = false;
+        system::signal_handling = false;
+        system::handle_sigsegv = false;
+        system::handle_sigterm = false;
+        system::handle_sigint  = false;
+        system::handle_sigabrt = false;
+        system::handle_sigfpe  = false;
+        system::handle_sigill  = false;
+        system::call_addr2line = false;
+        system::throw_exception = false;
+        system::osout = &std::cout;
+        system::oserr = &std::cerr;
+        system::error_handler = nullptr;
+    } else {
 //    system::verbose = 0;
-    system::regtest_reduction = false;
-    system::signal_handling = true;
-    system::handle_sigsegv = true;
-    system::handle_sigterm = false;
-    system::handle_sigint  = true;
-    system::handle_sigabrt = true;
-    system::handle_sigfpe  = true;
-    system::handle_sigill  = true;
-    system::call_addr2line = true;
-    system::throw_exception = false;
-    system::osout = &a_osout;
-    system::oserr = &a_oserr;
-    system::error_handler = a_errhandler;
+        system::regtest_reduction = false;
+        system::signal_handling = true;
+        system::handle_sigsegv = true;
+        system::handle_sigterm = false;
+        system::handle_sigint  = true;
+        system::handle_sigabrt = true;
+        system::handle_sigfpe  = true;
+        system::handle_sigill  = true;
+        system::call_addr2line = true;
+        system::throw_exception = false;
+        system::osout = &a_osout;
+        system::oserr = &a_oserr;
+        system::error_handler = a_errhandler;
+    }
 
     ParallelDescriptor::StartParallel(&argc, &argv, mpi_comm);
 
@@ -510,7 +532,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse,
 
 #ifdef AMREX_USE_GPU
     // Initialize after ParmParse so that we can read inputs.
-    Gpu::Device::Initialize();
+    Gpu::Device::Initialize(initialization_by_init_minimal);
 #ifdef AMREX_USE_CUPTI
     CuptiInitialize();
 #endif
@@ -640,13 +662,15 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse,
     ParallelDescriptor::Initialize();
 
     BL_TINY_PROFILE_MEMORYINITIALIZE();
-    Arena::Initialize();
+    Arena::Initialize(initialization_by_init_minimal);
     amrex_mempool_init();
 
     //
     // Initialize random seed after we're running in parallel.
     //
-    amrex::InitRandom(ParallelDescriptor::MyProc()+1, ParallelDescriptor::NProcs());
+    if (!initialization_by_init_minimal) {
+        amrex::InitRandom(ParallelDescriptor::MyProc()+1, ParallelDescriptor::NProcs());
+    }
 
     // For thread safety, we should do these initializations here.
     BaseFab_Initialize();
@@ -658,7 +682,9 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse,
     MultiFab::Initialize();
     iMultiFab::Initialize();
     VisMF::Initialize();
-    AsyncOut::Initialize();
+    if (!initialization_by_init_minimal) {
+        AsyncOut::Initialize();
+    }
     VectorGrowthStrategy::Initialize();
 
 #ifdef AMREX_USE_FFT
@@ -998,4 +1024,25 @@ FPExcept enableFPExcept (FPExcept excepts)
     return prev;
 }
 
+void Init_minimal (MPI_Comm mpi_comm)
+{
+    ++init_minimal_called;
+
+    if (Initialized()) { return; }
+
+    initialization_by_init_minimal = true;
+    Initialize(mpi_comm);
+}
+
+void Finalize_minimal ()
+{
+    if (init_minimal_called > 0) {
+        --init_minimal_called;
+    }
+    if (init_minimal_called == 0 && initialization_by_init_minimal) {
+        Finalize();
+        initialization_by_init_minimal = false;
+    }
+}
+
 }
diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H
index 12a1e79af7..f0419ee973 100644
--- a/Src/Base/AMReX_Arena.H
+++ b/Src/Base/AMReX_Arena.H
@@ -187,7 +187,7 @@ public:
     */
     static std::size_t align (std::size_t sz);
 
-    static void Initialize ();
+    static void Initialize (bool minimal);
     static void PrintUsage ();
     static void PrintUsageToFiles (std::string const& filename, std::string const& message);
     static void Finalize ();
diff --git a/Src/Base/AMReX_Arena.cpp b/Src/Base/AMReX_Arena.cpp
index 8ca168bb1a..ff33163884 100644
--- a/Src/Base/AMReX_Arena.cpp
+++ b/Src/Base/AMReX_Arena.cpp
@@ -36,7 +36,7 @@ namespace {
     Arena* the_cpu_arena = nullptr;
     Arena* the_comms_arena = nullptr;
 
-    Long the_arena_init_size = 0L;
+    Long the_arena_init_size = 1024*1024*8;
     Long the_device_arena_init_size = 1024*1024*8;
     Long the_managed_arena_init_size = 1024*1024*8;
     Long the_pinned_arena_init_size = 1024*1024*8;
@@ -280,7 +280,7 @@ namespace {
 }
 
 void
-Arena::Initialize ()
+Arena::Initialize (bool minimal)
 {
     if (initialized) { return; }
     initialized = true;
@@ -294,11 +294,18 @@ Arena::Initialize ()
     BL_ASSERT(the_cpu_arena == nullptr || the_cpu_arena == The_BArena());
     BL_ASSERT(the_comms_arena == nullptr || the_comms_arena == The_BArena());
 
+    if (minimal) {
+        the_pinned_arena_init_size = 0;
+    } else {
 #ifdef AMREX_USE_GPU
-    the_arena_init_size = Gpu::Device::totalGlobalMem() / Gpu::Device::numDevicePartners() / 4L * 3L;
+        the_arena_init_size = Gpu::Device::totalGlobalMem() / Gpu::Device::numDevicePartners() / 4L * 3L;
 #ifdef AMREX_USE_SYCL
-    the_arena_init_size = std::min(the_arena_init_size, Gpu::Device::maxMemAllocSize());
+        the_arena_init_size = std::min(the_arena_init_size, Gpu::Device::maxMemAllocSize());
+#endif
 #endif
+    }
+
+#ifdef AMREX_USE_GPU
     the_pinned_arena_release_threshold = Gpu::Device::totalGlobalMem() / Gpu::Device::numDevicePartners() / 2L;
 #endif
 
diff --git a/Src/Base/AMReX_GpuDevice.H b/Src/Base/AMReX_GpuDevice.H
index a7aef5a924..eb2b95943e 100644
--- a/Src/Base/AMReX_GpuDevice.H
+++ b/Src/Base/AMReX_GpuDevice.H
@@ -53,7 +53,7 @@ class Device
 
 public:
 
-    static void Initialize ();
+    static void Initialize (bool minimal);
     static void Finalize ();
 
 #if defined(AMREX_USE_GPU)
@@ -184,7 +184,7 @@ public:
 
 private:
 
-    static void initialize_gpu ();
+    static void initialize_gpu (bool minimal);
 
     static AMREX_EXPORT int device_id;
     static AMREX_EXPORT int num_devices_used;
diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp
index 961fdb0406..7857c26aab 100644
--- a/Src/Base/AMReX_GpuDevice.cpp
+++ b/Src/Base/AMReX_GpuDevice.cpp
@@ -145,8 +145,9 @@ namespace {
 #endif
 
 void
-Device::Initialize ()
+Device::Initialize (bool minimal)
 {
+    amrex::ignore_unused(minimal);
 #ifdef AMREX_USE_GPU
 
 #if defined(AMREX_USE_CUDA) && (defined(AMREX_PROFILING) || defined(AMREX_TINY_PROFILING))
@@ -201,7 +202,11 @@ Device::Initialize ()
     int n_local_procs = 1;
     amrex::ignore_unused(n_local_procs);
 
-    if (ParallelDescriptor::NProcs() == 1) {
+    if (minimal) {
+        device_id = 0;
+        AMREX_HIP_OR_CUDA(AMREX_HIP_SAFE_CALL (hipGetDevice(&device_id));,
+                          AMREX_CUDA_SAFE_CALL(cudaGetDevice(&device_id)); );
+    } else if (ParallelDescriptor::NProcs() == 1) {
         device_id = 0;
     }
     else if (gpu_device_count == 1) {
@@ -217,7 +222,7 @@ Device::Initialize ()
         }
     }
 
-    if (gpu_device_count > 1){
+    if (gpu_device_count > 1 && ! minimal) {
         if (Machine::name() == "nersc.perlmutter") {
             // The CPU/GPU mapping on perlmutter has the reverse order.
             device_id = gpu_device_count - device_id - 1;
@@ -241,19 +246,23 @@ Device::Initialize ()
         }
     }
 
-    AMREX_HIP_OR_CUDA(AMREX_HIP_SAFE_CALL (hipSetDevice(device_id));,
-                      AMREX_CUDA_SAFE_CALL(cudaSetDevice(device_id)); );
+#if !defined(AMREX_USE_SYCL)
+    if ( ! minimal) {
+        AMREX_HIP_OR_CUDA(AMREX_HIP_SAFE_CALL (hipSetDevice(device_id));,
+                          AMREX_CUDA_SAFE_CALL(cudaSetDevice(device_id)); );
+    }
+#endif
 
 #ifdef AMREX_USE_ACC
     amrex_initialize_acc(device_id);
 #endif
 
-    initialize_gpu();
+    initialize_gpu(minimal);
 
     num_devices_used = ParallelDescriptor::NProcs();
 
 #ifdef AMREX_USE_MPI
-    if (ParallelDescriptor::NProcs() > 1) {
+    if (ParallelDescriptor::NProcs() > 1 && ! minimal) {
 
 #if defined(HIP_VERSION_MAJOR) && defined(HIP_VERSION_MINOR) && ((HIP_VERSION_MAJOR < 5) || ((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR < 2)))
 
@@ -336,7 +345,7 @@ Device::Initialize ()
     }
 #endif /* AMREX_USE_MPI */
 
-    if (amrex::Verbose()) {
+    if (amrex::Verbose() && ! minimal) {
 #if defined(AMREX_USE_CUDA)
         amrex::Print() << "CUDA"
 #elif defined(AMREX_USE_HIP)
@@ -401,8 +410,10 @@ Device::Finalize ()
 }
 
 void
-Device::initialize_gpu ()
+Device::initialize_gpu (bool minimal)
 {
+    amrex::ignore_unused(minimal);
+
 #ifdef AMREX_USE_GPU
 
     gpu_stream_pool.resize(max_gpu_streams);
@@ -436,10 +447,12 @@ Device::initialize_gpu ()
 #endif
 
 #if (__CUDACC_VER_MAJOR__ < 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ < 4))
-    if (sizeof(Real) == 8) {
-        AMREX_CUDA_SAFE_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
-    } else if (sizeof(Real) == 4) {
-        AMREX_CUDA_SAFE_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte));
+    if ( ! minimal ) {
+        if (sizeof(Real) == 8) {
+            AMREX_CUDA_SAFE_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
+        } else if (sizeof(Real) == 4) {
+            AMREX_CUDA_SAFE_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte));
+        }
     }
 #endif
 
diff --git a/Src/FFT/AMReX_FFT.H b/Src/FFT/AMReX_FFT.H
index 11bf4f4cc8..7a7c85f5e9 100644
--- a/Src/FFT/AMReX_FFT.H
+++ b/Src/FFT/AMReX_FFT.H
@@ -2,11 +2,33 @@
 #define AMREX_FFT_H_
 #include <AMReX_Config.H>
 
+#include <AMReX.H>
 #include <AMReX_FFT_LocalR2C.H>
 #include <AMReX_FFT_OpenBCSolver.H>
 #include <AMReX_FFT_R2C.H>
 #include <AMReX_FFT_R2X.H>
 
+namespace amrex
+{
+    /**
+     * \brief Initialize FFT
+     *
+     * This is needed only when the user wants to use amrex::FFT, but does
+     * not want to call amrex::Initialize to initialize the full version of
+     * AMReX. Note that one usually only needs to call Init_FFT and
+     * Finalize_FFT once in the entire program.
+     */
+#ifdef AMREX_USE_MPI
+    inline void Init_FFT (MPI_Comm comm ) { amrex::Init_minimal(comm); }
+#else
+    inline void Init_FFT () { amrex::Init_minimal(); }
+#endif
+
+    //! If Init_FFT is called, this should be called after all the FFT works
+    //! are done.
+    inline void Finalize_FFT () { amrex::Finalize_minimal(); }
+}
+
 namespace amrex::FFT
 {
     void Initialize ();
diff --git a/Src/FFT/AMReX_FFT_R2C.H b/Src/FFT/AMReX_FFT_R2C.H
index fa071fea6d..eb32afc13a 100644
--- a/Src/FFT/AMReX_FFT_R2C.H
+++ b/Src/FFT/AMReX_FFT_R2C.H
@@ -26,6 +26,12 @@ template <typename T> class PoissonHybrid;
  * convention, where applying the forward transform followed by the backward
  * transform scales the original data by the size of the input array.
  *
+ * The arrays are assumed to be in column-major, which is different from
+ * FFTW's row-major layout. Because the complex domain data have the
+ * Hermitian symmetry, only half of the data in the complex domain are
+ * stored. If the real domain size is nx * ny * nz, the complex domain's
+ * size will be (nx/2+1) * ny * nz.
+ *
  * For more details, we refer the users to
  * https://amrex-codes.github.io/amrex/docs_html/FFT_Chapter.html.
  */
@@ -50,6 +56,18 @@ public:
      */
     explicit R2C (Box const& domain, Info const& info = Info{});
 
+    /**
+     * \brief Constructor
+     *
+     * If AMREX_SPACEDIM is 3 and you want to do 2D FFT, you just need to
+     * set the size of one of the dimensions to 1.
+     *
+     * \param domain_size size of the forward domain (i.e., the real data domain)
+     * \param info optional information
+     */
+    explicit R2C (std::array<int,AMREX_SPACEDIM> const& domain_size,
+                  Info const& info = Info{});
+
     ~R2C ();
 
     R2C (R2C const&) = delete;
@@ -57,6 +75,97 @@ public:
     R2C& operator= (R2C const&) = delete;
     R2C& operator= (R2C &&) = delete;
 
+    /**
+     * \brief Set local domain
+     *
+     * This is needed, only if one uses the raw pointer interfaces, not the
+     * amrex::MulitFab interfaces. This may contain collective MPI calls. So
+     * all processes even if their local size is zero should call. This
+     * function informs AMReX the domain decomposition chosen by the user
+     * for the forward domain (i.e., real data domain). There is no
+     * constraint on the domain decomposition strategy. One can do 1D, 2D or
+     * 3D domain decomposition. Alternatively, one could also let AMReX
+     * choose for you by calling getLocalDomain. The latter could
+     * potentially reduce data communication.
+     *
+     * Again, this is needed, only if one uses the raw pointer
+     * interfaces. Only one of the functions, setLocalDomain and
+     * getLocalDomain, should be called.
+     *
+     * This should only be called once unless the domain decomposition
+     * changes.
+     *
+     * local_start starting indices of the local domain
+     * local_size size of the local domain
+     */
+    void setLocalDomain (std::array<int,AMREX_SPACEDIM> const& local_start,
+                         std::array<int,AMREX_SPACEDIM> const& local_size);
+
+    /**
+     * \brief Get local domain
+     *
+     * This function returns the domain decomposition chosen by AMReX. The
+     * first part of the pair is the local starting indices, and the second
+     * part is the local domain size.
+     *
+     * This is needed, only if one uses the raw pointer interfaces, not the
+     * amrex::MulitFab interfaces. Only one of the functions, setLocalDomain
+     * and getLocalDomain, should be called.
+     */
+    std::pair<std::array<int,AMREX_SPACEDIM>,std::array<int,AMREX_SPACEDIM>>
+    getLocalDomain () const;
+
+    /**
+     * \brief Set local spectral domain
+     *
+     * This is needed, only if one uses the raw pointer interfaces, not the
+     * amrex::MulitFab interfaces. This may contain collective MPI calls. So
+     * all processes even if their local size is zero should call. This
+     * function informs AMReX the domain decomposition chosen by the user
+     * for the complex data domain. There is no constraint on the domain
+     * decomposition strategy. One can do 1D, 2D or 3D domain
+     * decomposition. Alternatively, one could also let AMReX choose for you
+     * by calling getLocalSpectralDomain. The latter could potentially
+     * reduce data communication.
+     *
+     * Again, this is needed, only if one uses the raw pointer
+     * interfaces. Only one of the functions, setLocalSpectralDomain and
+     * getLocalSpectralDomain, should be called. Note that one could use
+     * this function together with getLocalDomain. That is the user is
+     * allowed to choose their own spectral domain decomposition, while let
+     * AMReX choose the real data domain decomposition. Also note that the
+     * entire spectral domain has the size of (nx+1)/2 * ny * nz, if the real
+     * domain is nx * ny * nz.
+     *
+     * This should only be called once unless the domain decomposition
+     * changes.
+     *
+     * local_start starting indices of the local domain
+     * local_size size of the local domain
+     */
+    void setLocalSpectralDomain (std::array<int,AMREX_SPACEDIM> const& local_start,
+                                 std::array<int,AMREX_SPACEDIM> const& local_size);
+
+    /**
+     * \brief Get local spectral domain
+     *
+     * This function returns the domain decomposition chosen by AMReX for
+     * the complex data spectral domain. The returned pair contains the
+     * local starting indices and the local domain size.
+     *
+     * This is needed, only if one uses the raw pointer interfaces, not the
+     * amrex::MulitFab interfaces. Only one of the functions,
+     * setLocalSpectralDomain and getLocalSpectralDomain, should be called.
+     * Note that one could use this function together with
+     * setLocalDomain. That is the user is allowed to choose their own real
+     * domain decomposition, while let AMReX choose the spectral data domain
+     * decomposition. Also note that the entire spectral domain has the size
+     * of (nx+1)/2 * ny * nz, if the real domain is nx * ny * nz.
+     *
+     */
+    std::pair<std::array<int,AMREX_SPACEDIM>,std::array<int,AMREX_SPACEDIM>>
+    getLocalSpectralDomain () const;
+
     /**
      * \brief Forward and then backward transform
      *
@@ -113,6 +222,26 @@ public:
                                                 DIR == Direction::both, int> = 0>
     void forward (MF const& inmf, cMF& outmf, int incomp = 0, int outcomp = 0);
 
+    /**
+     * \brief Forward transform
+     *
+     * This raw pointer version of forward requires
+     * setLocalDomain/getLocalDomain and
+     * setLocalSpectralDomain/getLocalSpectralDomain have been called
+     * already. Note that one is allowed to call this function multiple
+     * times after the set/get domain functions are called only once, unless
+     * the domain decomposition changes. In fact, that is the preferred way
+     * because it has better performance. All processes need to call this
+     * function even if their local size is zero. If the local size is zero,
+     * one can pass nullptrs.
+     */
+    template <typename RT, typename CT, Direction DIR=D, bool CP=C,
+              std::enable_if_t<(DIR == Direction::forward ||
+                                DIR == Direction::both)
+                               && ((sizeof(RT)*2 == sizeof(CT) && !CP) ||
+                                   (sizeof(RT) == sizeof(CT) && CP)), int> = 0>
+    void forward (RT const* in, CT* out);
+
     /**
      * \brief Backward transform
      *
@@ -137,6 +266,26 @@ public:
                                                 DIR == Direction::both, int> = 0>
     void backward (cMF const& inmf, MF& outmf, int incomp = 0, int outcomp = 0);
 
+    /**
+     * \brief Backward transform
+     *
+     * This raw pointer version of backward requires
+     * setLocalDomain/getLocalDomain and
+     * setLocalSpectralDomain/getLocalSpectralDomain have been called
+     * already. Note that one is allowed to call this function multiple
+     * times after the set/get domain functions are called only once unless
+     * the domain decomposition changes. In fact, that is the preferred way
+     * because it has better performance. All processes need to call this
+     * function even if their local size is zero. If the local size is zero,
+     * one can pass nullptrs.
+     */
+    template <typename CT, typename RT, Direction DIR=D, bool CP=C,
+              std::enable_if_t<(DIR == Direction::backward ||
+                                DIR == Direction::both)
+                               && ((sizeof(RT)*2 == sizeof(CT) && !CP) ||
+                                   (sizeof(RT) == sizeof(CT) && CP)), int> = 0>
+    void backward (CT const* in, RT* out);
+
     //! Scaling factor. If the data goes through forward and then backward,
     //! the result multiplied by the scaling factor is equal to the original
     //! data.
@@ -153,7 +302,7 @@ public:
      */
     template <Direction DIR=D, std::enable_if_t<DIR == Direction::forward ||
                                                 DIR == Direction::both, int> = 0>
-    std::pair<cMF*,IntVect> getSpectralData ();
+    std::pair<cMF*,IntVect> getSpectralData () const;
 
     /**
      * \brief Get BoxArray and DistributionMapping for spectral data
@@ -231,6 +380,14 @@ private:
         }
     }
 
+    static std::pair<BoxArray,DistributionMapping>
+    make_layout_from_local_domain (std::array<int,AMREX_SPACEDIM> const& local_start,
+                                   std::array<int,AMREX_SPACEDIM> const& local_size);
+
+    template <typename FA, typename RT>
+    std::pair<std::unique_ptr<char,DataDeleter>,std::size_t>
+    install_raw_ptr (FA& fa, RT const* p);
+
     Plan<T> m_fft_fwd_x{};
     Plan<T> m_fft_bwd_x{};
     Plan<T> m_fft_fwd_y{};
@@ -263,6 +420,9 @@ private:
     cMF m_cy;
     cMF m_cz;
 
+    mutable MF m_raw_mf;
+    mutable cMF m_raw_cmf;
+
     std::unique_ptr<char,DataDeleter> m_data_1;
     std::unique_ptr<char,DataDeleter> m_data_2;
 
@@ -529,6 +689,11 @@ R2C<T,D,C>::R2C (Box const& domain, Info const& info)
     }
 }
 
+template <typename T, Direction D, bool C>
+R2C<T,D,C>::R2C (std::array<int,AMREX_SPACEDIM> const& domain_size, Info const& info)
+    : R2C<T,D,C>(Box(IntVect(0),IntVect(domain_size)-1), info)
+{}
+
 template <typename T, Direction D, bool C>
 R2C<T,D,C>::~R2C ()
 {
@@ -550,6 +715,89 @@ R2C<T,D,C>::~R2C ()
     m_fft_fwd_x_half.destroy();
 }
 
+template <typename T, Direction D, bool C>
+std::pair<BoxArray,DistributionMapping>
+R2C<T,D,C>::make_layout_from_local_domain (std::array<int,AMREX_SPACEDIM> const& local_start,
+                                           std::array<int,AMREX_SPACEDIM> const& local_size)
+{
+    IntVect lo(local_start);
+    IntVect len(local_size);
+    Box bx(lo, lo+len-1);
+#ifdef AMREX_USE_MPI
+    Vector<Box> allboxes(ParallelDescriptor::NProcs());
+    MPI_Allgather(&bx, 1, ParallelDescriptor::Mpi_typemap<Box>::type(),
+                  allboxes.data(), 1, ParallelDescriptor::Mpi_typemap<Box>::type(),
+                  ParallelDescriptor::Communicator());
+    Vector<int> pmap;
+    pmap.reserve(allboxes.size());
+    for (int i = 0; i < allboxes.size(); ++i) {
+        if (allboxes[i].ok()) {
+            pmap.push_back(i);
+        }
+    }
+    allboxes.erase(std::remove_if(allboxes.begin(), allboxes.end(),
+                                  [=] (Box const& b) { return b.isEmpty(); }),
+                   allboxes.end());
+    BoxList bl(std::move(allboxes));
+    return std::make_pair(BoxArray(std::move(bl)), DistributionMapping(std::move(pmap)));
+#else
+    return std::make_pair(BoxArray(bx), DistributionMapping(Vector<int>({0})));
+#endif
+}
+
+template <typename T, Direction D, bool C>
+void R2C<T,D,C>::setLocalDomain (std::array<int,AMREX_SPACEDIM> const& local_start,
+                                 std::array<int,AMREX_SPACEDIM> const& local_size)
+{
+    auto const& [ba, dm] = make_layout_from_local_domain(local_start, local_size);
+    m_raw_mf = MF(ba, dm, m_rx.nComp(), 0, MFInfo().SetAlloc(false));
+}
+
+template <typename T, Direction D, bool C>
+std::pair<std::array<int,AMREX_SPACEDIM>,std::array<int,AMREX_SPACEDIM>>
+R2C<T,D,C>::getLocalDomain () const
+{
+    m_raw_mf = MF(m_rx.boxArray(), m_rx.DistributionMap(), m_rx.nComp(), 0,
+                  MFInfo{}.SetAlloc(false));
+
+    auto const myproc = ParallelDescriptor::MyProc();
+    if (myproc < m_rx.size()) {
+        Box const& box = m_rx.box(myproc);
+        return std::make_pair(box.smallEnd().toArray(),
+                              box.length().toArray());
+    } else {
+        return std::make_pair(std::array<int,AMREX_SPACEDIM>{AMREX_D_DECL(0,0,0)},
+                              std::array<int,AMREX_SPACEDIM>{AMREX_D_DECL(0,0,0)});
+    }
+}
+
+template <typename T, Direction D, bool C>
+void R2C<T,D,C>::setLocalSpectralDomain (std::array<int,AMREX_SPACEDIM> const& local_start,
+                                         std::array<int,AMREX_SPACEDIM> const& local_size)
+{
+    auto const& [ba, dm] = make_layout_from_local_domain(local_start, local_size);
+    m_raw_cmf = cMF(ba, dm, m_rx.nComp(), 0, MFInfo().SetAlloc(false));
+}
+
+template <typename T, Direction D, bool C>
+std::pair<std::array<int,AMREX_SPACEDIM>,std::array<int,AMREX_SPACEDIM>>
+R2C<T,D,C>::getLocalSpectralDomain () const
+{
+    auto const ncomp = m_info.batch_size;
+    auto const& [ba, dm] = getSpectralDataLayout();
+
+    m_raw_cmf = cMF(ba, dm, ncomp, 0, MFInfo{}.SetAlloc(false));
+
+    auto const myproc = ParallelDescriptor::MyProc();
+    if (myproc < m_raw_cmf.size()) {
+        Box const& box = m_raw_cmf.box(myproc);
+        return std::make_pair(box.smallEnd().toArray(), box.length().toArray());
+    } else {
+        return std::make_pair(std::array<int,AMREX_SPACEDIM>{AMREX_D_DECL(0,0,0)},
+                              std::array<int,AMREX_SPACEDIM>{AMREX_D_DECL(0,0,0)});
+    }
+}
+
 template <typename T, Direction D, bool C>
 void R2C<T,D,C>::prepare_openbc ()
 {
@@ -678,6 +926,76 @@ void R2C<T,D,C>::forward (MF const& inmf, int incomp)
     m_fft_fwd_z.template compute_c2c<Direction::forward>();
 }
 
+template <typename T, Direction D, bool C>
+template <typename FA, typename RT>
+std::pair<std::unique_ptr<char,DataDeleter>,std::size_t>
+R2C<T,D,C>::install_raw_ptr (FA& fa, RT const* p)
+{
+    AMREX_ALWAYS_ASSERT(!fa.empty());
+
+    using FAB = typename FA::FABType::value_type;
+    using T_FAB = typename FAB::value_type;
+    static_assert((sizeof(T_FAB) == sizeof(RT)));
+
+    auto const ncomp = m_info.batch_size;
+    auto const& ia = fa.IndexArray();
+
+    T_FAB* pp = nullptr;
+    std::size_t sz = 0;
+
+    if ( ! ia.empty() ) {
+        int K = ia[0];
+        Box const& box = fa.fabbox(K);
+        if ((alignof(T_FAB) == alignof(RT)) || amrex::is_aligned(p,alignof(T_FAB))) {
+            pp = (T_FAB*)p;
+        } else {
+            sz = sizeof(T_FAB) * box.numPts() * ncomp;
+            pp = (T_FAB*) The_Arena()->alloc(sz);
+        }
+        fa.setFab(K, FAB(box,ncomp,pp));
+    }
+
+    if (sz == 0) {
+        return std::make_pair(std::unique_ptr<char,DataDeleter>{},std::size_t(0));
+    } else {
+        return std::make_pair(std::unique_ptr<char,DataDeleter>
+                              {(char*)pp,DataDeleter{The_Arena()}}, sz);
+    }
+}
+
+
+template <typename T, Direction D, bool C>
+template <typename RT, typename CT, Direction DIR, bool CP,
+          std::enable_if_t<(DIR == Direction::forward ||
+                            DIR == Direction::both)
+                           && ((sizeof(RT)*2 == sizeof(CT) && !CP) ||
+                               (sizeof(RT) == sizeof(CT) && CP)), int> >
+void R2C<T,D,C>::forward (RT const* in, CT* out)
+{
+    auto [rdata, rsz] = install_raw_ptr(m_raw_mf, in);
+    auto [cdata, csz] = install_raw_ptr(m_raw_cmf, out);
+
+    if (rsz > 0) {
+#ifdef AMREX_USE_GPU
+        Gpu::dtod_memcpy_async(rdata.get(),in,rsz);
+        Gpu::streamSynchronize();
+#else
+        std::memcpy(rdata.get(),in,rsz);
+#endif
+    }
+
+    forward(m_raw_mf, m_raw_cmf);
+
+    if (csz) {
+#ifdef AMREX_USE_GPU
+        Gpu::dtod_memcpy_async(out,cdata.get(),csz);
+        Gpu::streamSynchronize();
+#else
+        std::memcpy(out,cdata.get(),csz);
+#endif
+    }
+}
+
 template <typename T, Direction D, bool C>
 template <Direction DIR, std::enable_if_t<DIR == Direction::both, int> >
 void R2C<T,D,C>::backward (MF& outmf, int outcomp)
@@ -745,6 +1063,38 @@ void R2C<T,D,C>::backward_doit (MF& outmf, IntVect const& ngout,
                        amrex::elemwiseMin(ngout,outmf.nGrowVect()), period);
 }
 
+template <typename T, Direction D, bool C>
+template <typename CT, typename RT, Direction DIR, bool CP,
+          std::enable_if_t<(DIR == Direction::backward ||
+                            DIR == Direction::both)
+                           && ((sizeof(RT)*2 == sizeof(CT) && !CP) ||
+                               (sizeof(RT) == sizeof(CT) && CP)), int> >
+void R2C<T,D,C>::backward (CT const* in, RT* out)
+{
+    auto [rdata, rsz] = install_raw_ptr(m_raw_mf, out);
+    auto [cdata, csz] = install_raw_ptr(m_raw_cmf, in);
+
+    if (csz) {
+#ifdef AMREX_USE_GPU
+        Gpu::dtod_memcpy_async(cdata.get(),in,csz);
+        Gpu::streamSynchronize();
+#else
+        std::memcpy(cdata.get(),in,csz);
+#endif
+    }
+
+    backward(m_raw_cmf, m_raw_mf);
+
+    if (rsz > 0) {
+#ifdef AMREX_USE_GPU
+        Gpu::dtod_memcpy_async(out,rdata.get(),rsz);
+        Gpu::streamSynchronize();
+#else
+        std::memcpy(out,rdata.get(),rsz);
+#endif
+    }
+}
+
 template <typename T, Direction D, bool C>
 std::pair<Plan<T>, Plan<T>>
 R2C<T,D,C>::make_c2c_plans (cMF& inout, int ndims) const
@@ -895,7 +1245,7 @@ template <typename T, Direction D, bool C>
 template <Direction DIR, std::enable_if_t<DIR == Direction::forward ||
                                           DIR == Direction::both, int> >
 std::pair<typename R2C<T,D,C>::cMF *, IntVect>
-R2C<T,D,C>::getSpectralData ()
+R2C<T,D,C>::getSpectralData () const
 {
 #if (AMREX_SPACEDIM > 1)
     if (m_r2c_sub) {
@@ -904,11 +1254,11 @@ R2C<T,D,C>::getSpectralData ()
    } else
 #endif
     if (!m_cz.empty()) {
-        return std::make_pair(&m_cz, IntVect{AMREX_D_DECL(2,0,1)});
+        return std::make_pair(const_cast<cMF*>(&m_cz), IntVect{AMREX_D_DECL(2,0,1)});
     } else if (!m_cy.empty()) {
-        return std::make_pair(&m_cy, IntVect{AMREX_D_DECL(1,0,2)});
+        return std::make_pair(const_cast<cMF*>(&m_cy), IntVect{AMREX_D_DECL(1,0,2)});
     } else {
-        return std::make_pair(&m_cx, IntVect{AMREX_D_DECL(0,1,2)});
+        return std::make_pair(const_cast<cMF*>(&m_cx), IntVect{AMREX_D_DECL(0,1,2)});
     }
 }
 
diff --git a/Src/LinearSolvers/AMReX_SpMatrix.H b/Src/LinearSolvers/AMReX_SpMatrix.H
index d0c082759b..88b44339d9 100644
--- a/Src/LinearSolvers/AMReX_SpMatrix.H
+++ b/Src/LinearSolvers/AMReX_SpMatrix.H
@@ -36,7 +36,7 @@ public:
 
     //! Define a matrix with CSR format data. Note that mat and col_index
     //! should contains nnz elements. The number of elements in row_index
-    //! should the numbrer of local rows plus 1. The data can be freed after
+    //! should be the number of local rows plus 1. The data can be freed after
     //! this function call. For GPU builds, the data are expected to be in
     //! GPU memory.
     void define (AlgPartition partition, T const* mat, Long const* col_index,
diff --git a/Tests/FFT/C2C/main.cpp b/Tests/FFT/C2C/main.cpp
index df87d4e641..9c92428200 100644
--- a/Tests/FFT/C2C/main.cpp
+++ b/Tests/FFT/C2C/main.cpp
@@ -129,6 +129,61 @@ int main (int argc, char* argv[])
 #endif
             AMREX_ALWAYS_ASSERT(error < eps);
         }
+
+        {
+            auto sba = amrex::decompose(domain, ParallelDescriptor::NProcs());
+            DistributionMapping sdm{sba};
+            cMultiFab smf(sba,sdm,1,0);
+            smf.ParallelCopy(mf);
+
+            auto domain_size = domain.length().toArray();
+            FFT::C2C<Real,FFT::Direction::both> c2c(domain_size);
+
+            GpuComplex<Real>* pio = nullptr;
+            std::array<int,AMREX_SPACEDIM> local_start{AMREX_D_DECL(0,0,0)};
+            std::array<int,AMREX_SPACEDIM> local_size{AMREX_D_DECL(0,0,0)};
+            auto const& imap = smf.IndexArray();
+            AMREX_ALWAYS_ASSERT(imap.size() <= 1);
+            if (imap.size() == 1) {
+                pio = smf[imap[0]].dataPtr();
+                auto const& box = sba[imap[0]];
+                for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+                    local_start[idim] = box.smallEnd(idim);
+                    local_size[idim] = box.length(idim);
+                }
+            }
+            c2c.setLocalDomain(local_start, local_size);
+
+            auto const& [sp_start, sp_size, sp_order] = c2c.getLocalSpectralDomain();
+            auto npts = AMREX_D_TERM(std::size_t(sp_size[0]),
+                                    *std::size_t(sp_size[1]),
+                                    *std::size_t(sp_size[2]));
+            Gpu::DeviceVector<GpuComplex<Real>> spv(npts);
+
+            c2c.forward(pio, spv.data());
+            c2c.backward(spv.data(), pio);
+
+            amrex::Scale(smf, -scaling, 0, 1, 0);
+            smf.ParallelAdd(mf);
+
+            MultiFab errmf(sba,sdm,1,0);
+            auto const& errma = errmf.arrays();
+
+            auto const& sma = smf.const_arrays();
+            ParallelFor(smf, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
+            {
+                errma[b](i,j,k) = amrex::norm(sma[b](i,j,k));
+            });
+
+            auto error = errmf.norminf();
+            amrex::Print() << "  Expected to be close to zero: " << error << "\n";
+#ifdef AMREX_USE_FLOAT
+            auto eps = 1.e-6f;
+#else
+            auto eps = 1.e-13;
+#endif
+            AMREX_ALWAYS_ASSERT(error < eps);
+        }
     }
     amrex::Finalize();
 }
diff --git a/Tests/FFT/R2C/main.cpp b/Tests/FFT/R2C/main.cpp
index 1f3a0e6854..08861cea68 100644
--- a/Tests/FFT/R2C/main.cpp
+++ b/Tests/FFT/R2C/main.cpp
@@ -166,6 +166,72 @@ int main (int argc, char* argv[])
             auto eps = 1.e-6f;
 #else
             auto eps = 1.e-13;
+#endif
+            AMREX_ALWAYS_ASSERT(error < eps);
+        }
+
+        // Test raw pointer interface
+        {
+            auto sba = amrex::decompose(domain, ParallelDescriptor::NProcs());
+            DistributionMapping sdm{sba};
+            MultiFab smf(sba,sdm,1,0);
+            smf.ParallelCopy(mf);
+
+            Box cdomain(IntVect(0),IntVect(AMREX_D_DECL(domain.length(0)/2,
+                                                        domain.length(1)-1,
+                                                        domain.length(2)-1)));
+            auto sba_c = amrex::decompose(cdomain, ParallelDescriptor::NProcs());
+            DistributionMapping sdm_c{sba_c};
+            FabArray<BaseFab<GpuComplex<Real>>> smf_c(sba_c,sdm_c,1,0);
+
+            auto domain_size = domain.length().toArray();
+            FFT::R2C<Real,FFT::Direction::both> r2c(domain_size);
+
+            Real* preal = nullptr;
+            std::array<int,AMREX_SPACEDIM> local_start{AMREX_D_DECL(0,0,0)};
+            std::array<int,AMREX_SPACEDIM> local_size{AMREX_D_DECL(0,0,0)};
+            auto const& imap = smf.IndexArray();
+            AMREX_ALWAYS_ASSERT(imap.size() <= 1);
+            if (imap.size() == 1) {
+                preal = smf[imap[0]].dataPtr();
+                auto const& box = sba[imap[0]];
+                for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+                    local_start[idim] = box.smallEnd(idim);
+                    local_size[idim] = box.length(idim);
+                }
+            }
+            r2c.setLocalDomain(local_start, local_size);
+
+            GpuComplex<Real>* pcomplex = nullptr;
+            std::array<int,AMREX_SPACEDIM> local_start_c{AMREX_D_DECL(0,0,0)};
+            std::array<int,AMREX_SPACEDIM> local_size_c{AMREX_D_DECL(0,0,0)};
+            auto const& imap_c = smf_c.IndexArray();
+            AMREX_ALWAYS_ASSERT(imap_c.size() <= 1);
+            if (imap_c.size() == 1) {
+                pcomplex = smf_c[imap_c[0]].dataPtr();
+                auto const& box = sba_c[imap_c[0]];
+                for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+                    local_start_c[idim] = box.smallEnd(idim);
+                    local_size_c[idim] = box.length(idim);
+                }
+            }
+            r2c.setLocalSpectralDomain(local_start_c, local_size_c);
+
+            r2c.forward(preal, pcomplex);
+
+            amrex::Scale(smf_c, scaling, 0, 1, 0);
+
+            r2c.backward(pcomplex, preal);
+
+            mf2.ParallelCopy(smf);
+            MultiFab::Subtract(mf2, mf, 0, 0, 1, 0);
+
+            auto error = mf2.norminf();
+            amrex::Print() << "  Expected to be close to zero: " << error << "\n";
+#ifdef AMREX_USE_FLOAT
+            auto eps = 1.e-6f;
+#else
+            auto eps = 1.e-13;
 #endif
             AMREX_ALWAYS_ASSERT(error < eps);
         }
diff --git a/Tests/FFT/RawPtr/CMakeLists.txt b/Tests/FFT/RawPtr/CMakeLists.txt
new file mode 100644
index 0000000000..21a9d3b268
--- /dev/null
+++ b/Tests/FFT/RawPtr/CMakeLists.txt
@@ -0,0 +1,10 @@
+foreach(D IN LISTS AMReX_SPACEDIM)
+    set(_sources  main.cpp)
+
+    set(_input_files)
+
+    setup_test(${D} _sources _input_files)
+
+    unset(_sources)
+    unset(_input_files)
+endforeach()
diff --git a/Tests/FFT/RawPtr/GNUmakefile b/Tests/FFT/RawPtr/GNUmakefile
new file mode 100644
index 0000000000..93376f4485
--- /dev/null
+++ b/Tests/FFT/RawPtr/GNUmakefile
@@ -0,0 +1,26 @@
+AMREX_HOME := ../../..
+
+DEBUG	= FALSE
+
+DIM	= 3
+
+COMP    = gcc
+
+USE_MPI   = TRUE
+USE_OMP   = FALSE
+USE_CUDA  = FALSE
+USE_HIP   = FALSE
+USE_SYCL  = FALSE
+
+USE_FFT = TRUE
+
+BL_NO_FORT = TRUE
+
+TINY_PROFILE = FALSE
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+include ./Make.package
+include $(AMREX_HOME)/Src/Base/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/Tests/FFT/RawPtr/Make.package b/Tests/FFT/RawPtr/Make.package
new file mode 100644
index 0000000000..6b4b865e8f
--- /dev/null
+++ b/Tests/FFT/RawPtr/Make.package
@@ -0,0 +1 @@
+CEXE_sources += main.cpp
diff --git a/Tests/FFT/RawPtr/main.cpp b/Tests/FFT/RawPtr/main.cpp
new file mode 100644
index 0000000000..29adbffa70
--- /dev/null
+++ b/Tests/FFT/RawPtr/main.cpp
@@ -0,0 +1,139 @@
+#include <AMReX_FFT.H>
+
+int main (int argc, char* argv[])
+{
+#ifdef AMREX_USE_MPI
+    MPI_Init(&argc, &argv);
+    amrex::Init_FFT(MPI_COMM_WORLD);
+#else
+    amrex::Init_FFT();
+#endif
+
+    int nprocs, myproc;
+#ifdef AMREX_USE_MPI
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#else
+    nprocs = 1;
+    myproc = 0;
+#endif
+
+    using RT = amrex::Real;
+    using CT = amrex::GpuComplex<RT>;
+
+    std::array<int,AMREX_SPACEDIM> domain_size{AMREX_D_DECL(128,128,128)};
+
+    {
+        amrex::FFT::R2C<RT,amrex::FFT::Direction::both> r2c(domain_size);
+
+        int nx = (domain_size[0] + nprocs - 1) / nprocs;
+        int xlo = nx * myproc;
+        nx = std::max(std::min(nx,domain_size[0]-xlo), 0);
+        std::array<int,AMREX_SPACEDIM> local_start{AMREX_D_DECL(xlo,0,0)};
+        std::array<int,AMREX_SPACEDIM> local_size{AMREX_D_DECL(nx,domain_size[1],domain_size[2])};
+
+        r2c.setLocalDomain(local_start,local_size);
+
+        auto const& [local_start_sp, local_size_sp] = r2c.getLocalSpectralDomain();
+        amrex::ignore_unused(local_start_sp);
+
+        auto nr = AMREX_D_TERM(std::size_t(local_size[0]),
+                              *std::size_t(local_size[1]),
+                              *std::size_t(local_size[2]));
+        auto* pr = (RT*)amrex::The_Arena()->alloc(sizeof(RT)*nr);
+        amrex::ParallelFor(nr, [=] AMREX_GPU_DEVICE (std::size_t i)
+        {
+            pr[i] = std::sin(i);
+        });
+
+        auto nc = AMREX_D_TERM(std::size_t(local_size_sp[0]),
+                              *std::size_t(local_size_sp[1]),
+                              *std::size_t(local_size_sp[2]));
+        auto* pc = (CT*)amrex::The_Arena()->alloc(sizeof(CT)*nc);
+
+        r2c.forward(pr, pc);
+
+        auto scaling = r2c.scalingFactor();
+        amrex::ParallelFor(nc, [=] AMREX_GPU_DEVICE (std::size_t i)
+        {
+            pc[i] *= scaling;
+        });
+
+        r2c.backward(pc, pr);
+
+        auto error = amrex::Reduce::Max<RT>
+            (nr, [=] AMREX_GPU_DEVICE (std::size_t i)
+                     {
+                         return std::abs(pr[i]-std::sin(i));
+                     });
+        amrex::Print() << "  Expected to be close to zero: " << error << "\n";
+#ifdef AMREX_USE_FLOAT
+        auto eps = 1.e-6f;
+#else
+        auto eps = 1.e-13;
+#endif
+        amrex::ParallelDescriptor::Barrier();
+        AMREX_ALWAYS_ASSERT(error < eps);
+    }
+
+    int nbatch = 3;
+    {
+        amrex::FFT::Info info{};
+        info.setBatchSize(nbatch);
+        amrex::FFT::C2C<RT,amrex::FFT::Direction::both> c2c(domain_size,info);
+
+        auto const& [local_start, local_size] = c2c.getLocalDomain();
+
+        int nx = (domain_size[0] + nprocs - 1) / nprocs;
+        int xlo = nx * myproc;
+        nx = std::max(std::min(nx,domain_size[0]-xlo), 0);
+        std::array<int,AMREX_SPACEDIM> local_start_sp{AMREX_D_DECL(xlo,0,0)};
+        std::array<int,AMREX_SPACEDIM> local_size_sp{AMREX_D_DECL(nx,domain_size[1],domain_size[2])};
+
+        c2c.setLocalSpectralDomain(local_start_sp, local_size_sp);
+
+        auto nf = AMREX_D_TERM(std::size_t(local_size[0]),
+                              *std::size_t(local_size[1]),
+                              *std::size_t(local_size[2]));
+        auto* pf = (CT*)amrex::The_Arena()->alloc(sizeof(CT)*nf*nbatch);
+        amrex::ParallelFor(nf, [=] AMREX_GPU_DEVICE (std::size_t i)
+        {
+            pf[i] = amrex::GpuComplex(std::sin(i), std::cos(i));
+        });
+
+        auto nb = AMREX_D_TERM(std::size_t(local_size_sp[0]),
+                              *std::size_t(local_size_sp[1]),
+                              *std::size_t(local_size_sp[2]));
+        auto* pb = (CT*)amrex::The_Arena()->alloc(sizeof(CT)*nb*nbatch);
+
+        c2c.forward(pf, pb);
+
+        auto scaling = c2c.scalingFactor();
+        amrex::ParallelFor(nb, [=] AMREX_GPU_DEVICE (std::size_t i)
+        {
+            pb[i] *= scaling;
+        });
+
+        c2c.backward(pb, pf);
+
+        auto error = amrex::Reduce::Max<RT>
+            (nf, [=] AMREX_GPU_DEVICE (std::size_t i)
+                     {
+                         return amrex::norm(pf[i]-amrex::GpuComplex(std::sin(i),std::cos(i)));
+                     });
+        amrex::Print() << "  Expected to be close to zero: " << error << "\n";
+#ifdef AMREX_USE_FLOAT
+        auto eps = 1.e-6f;
+#else
+        auto eps = 1.e-13;
+#endif
+        amrex::ParallelDescriptor::Barrier();
+        AMREX_ALWAYS_ASSERT(error < eps);
+    }
+
+    amrex::Finalize_FFT();
+
+#ifdef AMREX_USE_MPI
+    MPI_Finalize();
+#endif
+}