diff --git a/.github/workflows/catalyst.yml b/.github/workflows/catalyst.yml new file mode 100644 index 00000000000..108b76dff10 --- /dev/null +++ b/.github/workflows/catalyst.yml @@ -0,0 +1,33 @@ +name: 🐧 Catalyst + +on: [push, pull_request] + +concurrency: + group: ${{ github.ref }}-${{ github.head_ref }}-insituvis-catalyst + cancel-in-progress: true + +jobs: + catalyst: + name: Catalyst + runs-on: ubuntu-22.04 + if: github.event.pull_request.draft == false + env: + CXX: g++ + CC: gcc + CMAKE_PREFIX_PATH: "/opt/conduit:/opt/catalyst" + container: + image: kitware/paraview:ci-catalyst-amrex-warpx-20240701 + steps: + - uses: actions/checkout@v4 + - name: Configure + run: | + cmake -S . -B build \ + -DCMAKE_BUILD_TYPE=Debug \ + -DAMReX_ENABLE_TESTS=ON \ + -DAMReX_FORTRAN=OFF \ + -DAMReX_CATALYST=ON \ + -DAMReX_CONDUIT=ON + - name: Build + run: | + cmake --build build -j 4 + diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml index c996db63c1c..d02342899b8 100644 --- a/.github/workflows/clang.yml +++ b/.github/workflows/clang.yml @@ -126,6 +126,53 @@ jobs: ccache -s du -hs ~/.cache/ccache + tests_cxx20: + name: Clang C++20 [tests] + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - name: Dependencies + run: | + .github/workflows/dependencies/dependencies.sh + .github/workflows/dependencies/dependencies_clang.sh 15 + .github/workflows/dependencies/dependencies_ccache.sh + - name: Set Up Cache + uses: actions/cache@v4 + with: + path: ~/.cache/ccache + key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} + restore-keys: | + ccache-${{ github.workflow }}-${{ github.job }}-git- + - name: Build & Install + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor"} + run: | + export CCACHE_COMPRESS=1 + export CCACHE_COMPRESSLEVEL=10 + export CCACHE_MAXSIZE=100M + export CCACHE_EXTRAFILES=${{ github.workspace }}/.clang-tidy + export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt + ccache -z + + mkdir build + cd build + cmake .. 
\ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_EB=ON \ + -DAMReX_ENABLE_TESTS=ON \ + -DAMReX_FORTRAN=OFF \ + -DAMReX_MPI=ON \ + -DCMAKE_CXX_STANDARD=20 \ + -DCMAKE_C_COMPILER=$(which clang-15) \ + -DCMAKE_CXX_COMPILER=$(which clang++-15) \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache + make -j 4 + + ctest --output-on-failure + + ccache -s + du -hs ~/.cache/ccache + # Build 2D libamrex with configure configure-2d: name: Clang NOMPI Release [configure 2D] diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index 7e4fe8d40bf..5d8c9ad9c30 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -641,7 +641,7 @@ jobs: - name: Run tests run: | cd build - ctest --output-on-failure -R + ctest --output-on-failure test_hdf5: name: GNU@9.3 HDF5 I/O Test [tests] diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index 168670eda30..4d1612d0ec4 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -24,7 +24,11 @@ jobs: restore-keys: | ccache-${{ github.workflow }}-${{ github.job }}-git- - name: Build & Install - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor"} + # /tmp/icpx-2d34de0e47/global_vars-header-4390fb.h:25:36: error: zero size arrays are an extension [-Werror,-Wzero-length-array] + # 25 | const char* const kernel_names[] = { + # | ^ + # 1 error generated. + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor -Wno-zero-length-array"} run: | export CCACHE_COMPRESS=1 export CCACHE_COMPRESSLEVEL=10 @@ -68,7 +72,11 @@ jobs: restore-keys: | ccache-${{ github.workflow }}-${{ github.job }}-git- - name: Build & Install - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor"} + # /tmp/icpx-2d34de0e47/global_vars-header-4390fb.h:25:36: error: zero size arrays are an extension [-Werror,-Wzero-length-array] + # 25 | const char* const kernel_names[] = { + # | ^ + # 1 error generated. + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor -Wno-zero-length-array"} run: | export CCACHE_COMPRESS=1 export CCACHE_COMPRESSLEVEL=10 diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 4090f9c2ec0..e0e7bf69118 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -87,16 +87,17 @@ jobs: cmake -S . 
-B build ` -DCMAKE_VERBOSE_MAKEFILE=ON ` + -DCMAKE_BUILD_TYPE=RelWithDebInfo ` -DCMAKE_CXX_FLAGS=" /D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR" ` -DAMReX_EB=ON ` -DAMReX_ENABLE_TESTS=ON ` -DAMReX_FORTRAN=OFF ` -DAMReX_MPI=OFF #-DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build --config RelWithDebInfo -j 4 + cmake --build build -j 4 - cmake --build build --config RelWithDebInfo --target install - cmake --build build --config RelWithDebInfo --target test_install + cmake --build build --target install + cmake --build build --target test_install #ccache -s diff --git a/CHANGES b/CHANGES index 0c6b093820c..d5e4dc8ab4b 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,70 @@ +# 24.08 + + -- Catalyst support (#4011) + + -- Fix typos in FaceDivFree interior interpolation. (#4048) + + -- ParmParse: Read IntVect & RealVect from array (#4050) + ParmParse: Fix assertion in new queryarr for IntVect & RealVect (#4051) + + -- IParser: Use long long int (#4046) + + -- ParmParse: Math expression support (#4032) + + -- AmrData: remove dependency on Fortran (#4049) + + -- Remove reliance on managed memory in HDF5 mesh IO (#4047) + + -- New Feature in MLEBNodeFDLaplacian: MultiFab sigma coefficient (#3968) + + -- Nodal Solver: Use multi-color Gauss-Seidel on GPU (#4043) + + -- Use reduction to compute min and max particle distances in NeighborParticles test. (#3212) + + -- ParmParse: Refactoring II (#4035) + + -- Fix deadlock for CUDA (#4044) + + -- ParmParse: Refactoring (#4031) + + -- MLMG: Keep ghost cell values in user's inputs (#4040) + + -- Geometry: Fix operator>> (#4042) + + -- FillPatch for Face Data: Fix Geometry used in creating mask (#4039) + + -- Parser: Allow dots in variable names (#4038) + + -- HDF5: Remove unnecessary casting (#4036) + + -- HDF5: Fix chunking warning (#4033) (#4034) + + -- New parameter amrex::init_snan (#4030) + + -- Change the default max_grid_size in 3D on GPU from 32 to 64 (#4029) + + -- GpuArray: add operator+= (#4028) + + -- FPE traps: Add functions for enabling and disabling FPE traps (#4025) + + -- HDF5: Correctly determine max_mfi_count (#4024) + + -- HDF5: expose ZLIB compression without SZ or ZFP flags (#4015) + + -- Hypre: Use Gpu::hypreSynchronize (#4022) + + -- AmrMesh: add const to a few functions (#4021) + + -- Convexify AMR data (#4013) + + -- Fix BottomSolver::bicgcg (#4020) + + -- Fix some warnings issued by Coverity (#4017) + + -- convert IndexType to IndexTypeND (#3988) + + -- Workaround MSVC issue in tupleToArray (#4000) + # 24.07 -- HDF5: Correctly handle writing only some components when writing Particle (#4005) diff --git a/Docs/sphinx_documentation/source/Basics.rst b/Docs/sphinx_documentation/source/Basics.rst index 5887df56fb5..97e68fd1e68 100644 --- a/Docs/sphinx_documentation/source/Basics.rst +++ b/Docs/sphinx_documentation/source/Basics.rst @@ -224,17 +224,17 @@ file is a series of definitions in the form of ``prefix.name = value value ....`` For each line, text after # are comments. Here is an example inputs file. - .. highlight:: python +.. 
highlight:: python - :: +:: - nsteps = 100 # integer - nsteps = 1000 # nsteps appears a second time - dt = 0.03 # floating point number - ncells = 128 64 32 # a list of 3 ints - xrange = -0.5 0.5 # a list of 2 reals - title = "Three Kingdoms" # a string - hydro.cfl = 0.8 # with prefix, hydro + nsteps = 100 # integer + nsteps = 1000 # nsteps appears a second time + dt = 0.03 # floating point number + ncells = 128 64 32 # a list of 3 ints + xrange = -0.5 0.5 # a list of 2 reals + title = "Three Kingdoms" # a string + hydro.cfl = 0.8 # with prefix, hydro The following code shows how to use :cpp:`ParmParse` to get/query the values. @@ -275,6 +275,200 @@ by default returns the last one. The difference between :cpp:`query` and get the value, whereas :cpp:`query` returns an error code without generating a runtime error that will abort the run. +Math Expressions +---------------- + +.. versionadded:: 24.08 + Math expression support in :cpp:`ParmParse`. + +:cpp:`ParmParse` supports math expressions for integers and floating point +numbers. For example, + +.. highlight:: python + +:: + + # three numbers. whitespaces inside `""` are okay. + f = 3+4 99 "5 + 6" + + # two numbers. `\` is for continuation + g = 3.1+4.1 \ + 5.0+6.6 + + # two numbers unless using [query|get]WithParser + w = 1 -2 + + my_constants.alpha = 5. + amrex.c = c + + # must use [query|get]WithParser + amrex.foo = sin( pi/2 ) + alpha + -amrex.c**2/c^2 + + # either [query|get] or [query|get]WithParser is okay + amrex.bar = sin(pi/2)+alpha+-amrex.c**2/c^2 + + geom.prob_lo = 2*sin(pi/4)/sqrt(2) sin(pi/2)+cos(pi/2) -(sin(pi*3/2)+cos(pi*3/2)) + + # three numbers. `\` is for continuation + geom.prob_hi = "2*sin(pi/4)/sqrt(2)" \ + "sin(pi/2) + cos(pi/2)" \ + -(sin(pi*3/2)+cos(pi*3/2)) + +can be processed by + +.. highlight:: c++ + +:: + + { + ParmParse::SetParserPrefix("physical_constants"); + ParmParse pp("physical_constants"); + pp.add("c", 299792458.); + pp.add("pi", 3.14159265358979323846); + } + { + ParmParse pp; + + double f0 = -1; + pp.query("f", f0); + std::cout << " double f = " << f0 << '\n'; + + std::vector f; + pp.queryarr("f", f); + std::cout << " int f[3] = {" << f[0] << ", " << f[1] << ", " + << f[2] << "}\n"; + + std::vector g; + pp.queryarr("g", g); + std::cout << " double g[] = " << g[0] << " " << g[1] << '\n'; + + double w; + pp.query("w", w); + std::cout << " w = " << w << " with query\n"; + pp.queryWithParser("w", w); + std::cout << " w = " << w << " with queryParser\n"; + } + { + ParmParse pp("amrex", "my_constants"); + double foo = -1, bar; + pp.getWithParser("foo", foo); + pp.get("bar", bar); + std::cout << " foo = " << foo << ", bar = " << bar << '\n'; + } + { + ParmParse pp; + std::array prob_lo, prob_hi; + pp.get("geom.prob_lo", prob_lo); + pp.get("geom.prob_hi", prob_hi); + std::cout << " double prob_lo[] = {" << prob_lo[0] << ", " + << prob_lo[1] << ", " << prob_lo[2] << "}\n" + << " double prob_hi[] = {" << prob_hi[0] << ", " + << prob_hi[1] << ", " << prob_hi[2] << "}\n"; + } + +The results will be + +.. highlight:: console + +:: + + double f = 7 + int f[3] = {7, 99, 11} + double g[] = 7.2 11.6 + w = 1 with query + w = -1 with queryParser + foo = 5, bar = 5 + double prob_lo[] = {1, 1, 1} + double prob_hi[] = {1, 1, 1} + +Note that the empty spaces are significant for math expressions unless they +are inside a pair of ``"`` or explicitly parsed by +:cpp:`ParmParse::queryWithParser` or :cpp:`ParmParse::getWithParser`. If the +expression contains another variable, it will be looked up by +:cpp:`ParmParse`. 
:cpp:`ParmParse`'s constructor accepts an optional second +argument, ``parser_prefix``. When a variable in a math expression is being +looked up, it will first try to find it by using the exact name of the +variable. If this attempt fails and the :cpp:`ParmParse` object has a +non-empty non-static member ``parser_prefix``, it will try again, this time +looking up the variable by prefixing its name with the value of +``parser_prefix`` followed by a ``.``. If this attempt also fails and the +:cpp:`ParmParse` class has a non-empty static member ``ParserPrefix`` (which +can be set by :cpp:`ParmParse::SetParserPrefix`), it will try again, this +time looking up the variable by prefixing its name with the value of +``ParserPrefix`` followed by a ``.``. + +The variables in :cpp:`ParmParse` math expressions are not evaluated until +they are referenced. If a variable is defined multiple times, the last +occurrence will override previous ones even if it appears after the variable +has been referenced. This behavior is demonstrated in the following example. + +.. highlight:: python + +:: + + foo.a = 1 + foo.b = foo.a + foo.a = 2 + +will become + +.. highlight:: python + +:: + + foo.a = 2 + foo.b = 2 + +Enum Class +---------- + +.. versionadded:: 24.09 + Enum class support in :cpp:`ParmParse`. + +AMReX provides a macro :cpp:`AMREX_ENUM` for defining :cpp:`enum class` that +supports reflection. For example, + +.. highlight:: c++ + +:: + + AMREX_ENUM(MyColor, red, green, blue); + + void f () + { + MyColor color = amrex::getEnum("red"); // MyColor::red + std::string name = amrex::getEnumNameString(MyColor::blue); // "blue" + std::vector names = amrex::getEnumNameStrings(); + // names = {"red", "green", "blue"}; + std::string class_name = amrex::getEnumClassName(); // "MyColor" + } + +This allows us to read :cpp:`ParmParse` parameters into enum class objects. + +.. highlight:: python + +:: + + color1 = red + color2 = BLue + +The following code shows how to query the enumerators. + +.. highlight:: c++ + +:: + + AMREX_ENUM(MyColor, none, red, green, blue); + + void f (MyColor& c1, MyColor& c2) + { + ParmParse pp; + pp.query("color1", c1); // c1 becomes MyColor::red + pp.query_enum_case_insensitive("color2", c2); // c2 becomes MyColor::blue + MyColor default_color; // MyColor::none + pp.query("color3", default_color); // Still MyColor::none + } + Overriding Parameters with Command-Line Arguments ------------------------------------------------- @@ -453,7 +647,14 @@ Besides :cpp:`amrex::Parser` for floating point numbers, AMReX also provides similarity, but floating point number specific functions (e.g., ``sqrt``, ``sin``, etc.) are not supported in ``IParser``. In addition to ``/`` whose result truncates towards zero, the integer parser also supports ``//`` whose -result truncates towards negative infinity. +result truncates towards negative infinity. Single quotes ``'`` are allowed +as a separator for :cpp:`IParser` numbers just like C++ integer +literals. Additionally, a floating point like number with a positive +exponent may be accepted as an integer if it is reasonable to do so. For +example, it's okay to have ``1.234e3``, but ``1.234e2`` is an error. + + .. versionadded:: 24.08 + Support for ``'`` and ``e`` in :cpp:`IParser` integers. .. _sec:basics:initialize: @@ -912,7 +1113,7 @@ an :cpp:`int` pointer or array specifying periodicity. 
If a :cpp:`RealBox` is no given in the first constructor, AMReX will construct one based on :cpp:`ParmParse` parameters, ``geometry.prob_lo`` / ``geometry.prob_hi`` / ``geometry.prob_extent``, where each of the parameter is an array of ``AMREX_SPACEDIM`` real numbers. -See the section on :ref:`sec:inputs:pd` for more details about how to specify these. +See the section on :ref:`sec:inputs:geom` for more details about how to specify these. The argument for coordinate system is an integer type with valid values being 0 (Cartesian), or 1 (cylindrical), or 2 (spherical). If it diff --git a/Docs/sphinx_documentation/source/BuildingAMReX.rst b/Docs/sphinx_documentation/source/BuildingAMReX.rst index 3176730320e..90fb4d6eb30 100644 --- a/Docs/sphinx_documentation/source/BuildingAMReX.rst +++ b/Docs/sphinx_documentation/source/BuildingAMReX.rst @@ -508,6 +508,8 @@ The list of available options is reported in the :ref:`table ` bel +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_CONDUIT | Enable Conduit support | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_CATALYST | Enable Catalyst support | NO | YES, NO | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_ASCENT | Enable Ascent support | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_HYPRE | Enable HYPRE interfaces | NO | YES, NO | diff --git a/Docs/sphinx_documentation/source/IO.rst b/Docs/sphinx_documentation/source/IO.rst index 60fb0568851..185b48955d7 100644 --- a/Docs/sphinx_documentation/source/IO.rst +++ b/Docs/sphinx_documentation/source/IO.rst @@ -156,7 +156,7 @@ Currently supported compression libraries include `SZ`_ and `ZFP`_. To enable HDF5 output, AMReX must be compiled and linked to an HDF5 library with parallel I/O support, by adding ``USE_HDF5=TRUE`` and ``HDF5_HOME=/path/to/hdf5/install/dir`` to the GNUMakefile. -many HPC systems have an HDF5 module available that can be loaded with +Many HPC systems have an HDF5 module available that can be loaded with ``module load hdf5`` or ``module load cray-hdf5-parallel``. To download and compile HDF5 from source code, please go to `HDF5 Download`_ webpage and follow the instructions (latest version is recommended and remember @@ -199,7 +199,7 @@ chapter on :ref:`Chap:Visualization`) HDF5 Plotfile Compression ------------------------- -To enable data compression on the HDF5 datasets, the corresponding compression +To enable SZ or ZFP data compression on the HDF5 datasets, the corresponding compression library and its HDF5 plugin must be available. To compile `SZ`_ or `ZFP`_ plugin, please refer to their documentation: `H5Z-SZ`_ and `H5Z-ZFP`_, and adding ``USE_HDF5_SZ=TRUE``, ``SZ_HOME=``, or ``USE_HDF5_ZFP=TRUE``, ``ZFP_HOME=``, @@ -210,12 +210,18 @@ please refer to their documentation: `H5Z-SZ`_ and `H5Z-ZFP`_, and adding .. _`H5Z-SZ`: https://github.com/szcompressor/SZ/tree/master/hdf5-filter/H5Z-SZ .. _`H5Z-ZFP`: https://github.com/LLNL/H5Z-ZFP +ZLIB compression is available without external libraries or other make flags. +Different compression levels (at the cost of read/write time) can be used, just +like GZIP. 
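As a concrete illustration, here is a minimal sketch of requesting ZLIB compression when writing a single-level HDF5 plotfile (the available compression strings are listed below). It assumes one of the two HDF5 plotfile-writing functions referred to above, taken here to be :cpp:`WriteSingleLevelPlotfileHDF5` with the compression string following the step number, and a build with ``USE_HDF5=TRUE``; the header name, field name, and compression level are illustrative only.

.. highlight:: c++

::

    #include <AMReX_Geometry.H>
    #include <AMReX_MultiFab.H>
    #include <AMReX_Vector.H>
    #include <AMReX_PlotFileUtilHDF5.H>

    void write_compressed_plotfile (const amrex::MultiFab& mf,
                                    const amrex::Geometry& geom,
                                    amrex::Real time, int step)
    {
        amrex::Vector<std::string> varnames{"phi"};
        // "ZLIB@6" requests ZLIB compression at level 6; higher levels trade
        // write time for smaller files, analogous to gzip levels.
        amrex::WriteSingleLevelPlotfileHDF5("plt_hdf5", mf, varnames, geom,
                                            time, step, "ZLIB@6");
    }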
+ The string argument :cpp:`compression` in the above two functions controls whether to enable data compression and its parameters. Currently supported options include: * No compression * ``None@0`` +* ZLIB compression + * ``ZLIB@compression_level`` * SZ compression * ``SZ@/path/to/sz.config`` * ZFP compression @@ -224,6 +230,9 @@ options include: * ``ZFP_ACCURACY@accuracy`` * ``ZFP_REVERSIBLE@reversible`` +Using compression requires data to be stored in a chunked format. The size of these +chunks can (and generally should) be configured by changing the ``HDF5_CHUNK_SIZE`` +environment variable, with a default value of 1024 elements provided. HDF5 Asynchronous Output ------------------------ diff --git a/Docs/sphinx_documentation/source/InputsCheckpoint.rst b/Docs/sphinx_documentation/source/InputsCheckpoint.rst deleted file mode 100644 index 13bae0e467f..00000000000 --- a/Docs/sphinx_documentation/source/InputsCheckpoint.rst +++ /dev/null @@ -1,18 +0,0 @@ -.. _Chap:InputsCheckpoint: - -Checkpoint/Restart -================== - -The following inputs must be preceded by "amr" and control checkpoint/restart. - -+------------------+-----------------------------------------------------------------------+-------------+-----------+ -| | Description | Type | Default | -+==================+=======================================================================+=============+===========+ -| restart | If present, then the name of file to restart from | String | None | -+------------------+-----------------------------------------------------------------------+-------------+-----------+ -| check_int | Frequency of checkpoint output; | Int | -1 | -| | if -1 then no checkpoints will be written | | | -+------------------+-----------------------------------------------------------------------+-------------+-----------+ -| check_file | Prefix to use for checkpoint output | String | chk | -+------------------+-----------------------------------------------------------------------+-------------+-----------+ - diff --git a/Docs/sphinx_documentation/source/InputsComputeBackends.rst b/Docs/sphinx_documentation/source/InputsComputeBackends.rst deleted file mode 100644 index 26e5d527508..00000000000 --- a/Docs/sphinx_documentation/source/InputsComputeBackends.rst +++ /dev/null @@ -1,21 +0,0 @@ -.. _Chap:InputsComputeBackends: - -Compute Backends -================ - -The following inputs must be preceded by ``amrex.`` and determine runtime options of CPU or GPU compute implementations. - -+------------------------+-----------------------------------------------------------------------+-------------+------------+ -| Parameter | Description | Type | Default | -+========================+=======================================================================+=============+============+ -| ``omp_threads`` | If OpenMP is enabled, this can be used to set the default number of | String | ``system`` | -| | threads. The special value ``nosmt`` can be used to avoid using | or Int | | -| | threads for virtual cores (aka Hyperthreading or SMT), as is default | | | -| | in OpenMP, and instead only spawns threads equal to the number of | | | -| | physical cores in the system. | | | -| | For the values ``system`` and ``nosmt``, the environment variable | | | -| | ``OMP_NUM_THREADS`` takes precedence. For Integer values, | | | -| | ``OMP_NUM_THREADS`` is ignored. 
| | | -+------------------------+-----------------------------------------------------------------------+-------------+------------+ - -For GPU-specific parameters, see also the :ref:`GPU chapter `. diff --git a/Docs/sphinx_documentation/source/InputsLoadBalancing.rst b/Docs/sphinx_documentation/source/InputsLoadBalancing.rst deleted file mode 100644 index ea7ff2ebaae..00000000000 --- a/Docs/sphinx_documentation/source/InputsLoadBalancing.rst +++ /dev/null @@ -1,56 +0,0 @@ -.. _Chap:InputsLoadBalancing: - -Gridding and Load Balancing -=========================== - -The following inputs must be preceded by "amr" and determine how we create the grids and how often we regrid. - -+------------------------+-----------------------------------------------------------------------+-------------+-----------+ -| Parameter | Description | Type | Default | -+========================+=======================================================================+=============+===========+ -| regrid_int | How often to regrid (in number of steps at level 0) | Int | -1 | -| | if regrid_int = -1 then no regridding will occur | | | -+------------------------+-----------------------------------------------------------------------+-------------+-----------+ -| max_grid_size_x | Maximum number of cells at level 0 in each grid in x-direction | Int | 32 | -+------------------------+-----------------------------------------------------------------------+-------------+-----------+ -| max_grid_size_y | Maximum number of cells at level 0 in each grid in y-direction | Int | 32 | -+------------------------+-----------------------------------------------------------------------+-------------+-----------+ -| max_grid_size_z | Maximum number of cells at level 0 in each grid in z-direction | Int | 32 | -+------------------------+-----------------------------------------------------------------------+-------------+-----------+ -| blocking_factor_x | Each grid must be divisible by blocking_factor_x in x-direction | Int | 8 | -| | (must be 1 or power of 2) | | | -+------------------------+-----------------------------------------------------------------------+-------------+-----------+ -| blocking_factor_y | Each grid must be divisible by blocking_factor_y in y-direction | Int | 8 | -| | (must be 1 or power of 2) | | | -+------------------------+-----------------------------------------------------------------------+-------------+-----------+ -| blocking_factor_z | Each grid must be divisible by blocking_factor_z in z-direction | Int | 8 | -| | (must be 1 or power of 2) | | | -+------------------------+-----------------------------------------------------------------------+-------------+-----------+ -| refine_grid_layout | Split grids in half until the number of grids is no less than the | Bool | true | -| | number of procs. (Will be overridden if refine_grid_layout_[x,y,z] | | | -| | is specified) | | | -+------------------------+-----------------------------------------------------------------------+-------------+-----------+ -| refine_grid_layout_x | Allow grids to be split in the x-dimension when refining the layout. | Int | 1 | -| | (1 to allow or 0 to disallow) | | | -+------------------------+-----------------------------------------------------------------------+-------------+-----------+ -| refine_grid_layout_y | Allow grids to be split in the y-dimension when refining the layout. 
| Int | 1 | -| | (1 to allow or 0 to disallow) | | | -+------------------------+-----------------------------------------------------------------------+-------------+-----------+ -| refine_grid_layout_z | Allow grids to be split in the z-dimension when refining the layout. | Int | 1 | -| | (1 to allow or 0 to disallow) | | | -+------------------------+-----------------------------------------------------------------------+-------------+-----------+ - -The following inputs must be preceded by "particles". - -+-------------------+-----------------------------------------------------------------------+-------------+-----------+ -| Parameter | Description | Type | Default | -+===================+=======================================================================+=============+===========+ -| max_grid_size_x | Maximum number of cells at level 0 in each grid in x-direction | Int | 32 | -| | for grids in the ParticleBoxArray if dual_grid is true | | | -+-------------------+-----------------------------------------------------------------------+-------------+-----------+ -| max_grid_size_y | Maximum number of cells at level 0 in each grid in y-direction | Int | 32 | -| | for grids in the ParticleBoxArray if dual_grid is true | | | -+-------------------+-----------------------------------------------------------------------+-------------+-----------+ -| max_grid_size_z | Maximum number of cells at level 0 in each grid in z-direction | Int | 32 | -| | for grids in the ParticleBoxArray if dual_grid is true. | | | -+-------------------+-----------------------------------------------------------------------+-------------+-----------+ diff --git a/Docs/sphinx_documentation/source/InputsPlotFiles.rst b/Docs/sphinx_documentation/source/InputsPlotFiles.rst deleted file mode 100644 index 9e8789a90ac..00000000000 --- a/Docs/sphinx_documentation/source/InputsPlotFiles.rst +++ /dev/null @@ -1,18 +0,0 @@ -.. _Chap:InputsPlotfiles: - -Plotfiles and Other Output -========================== - -The following inputs must be preceded by "amr" and control the frequency and naming of plotfile generation, as well -as whether a plotfile should be written out immediately after restarting a simulation. - -+---------------------+-----------------------------------------------------------------------+-------------+-----------+ -| | Description | Type | Default | -+=====================+=======================================================================+=============+===========+ -| plot_int | Frequency of plotfile output; | Int | -1 | -| | if -1 then no plotfiles will be written | | | -+---------------------+-----------------------------------------------------------------------+-------------+-----------+ -| plotfile_on_restart | Should we write a plotfile when we restart (only used if plot_int>0) | Bool | 0 (false) | -+---------------------+-----------------------------------------------------------------------+-------------+-----------+ -| plot_file | Prefix to use for plotfile output | String | plt | -+---------------------+-----------------------------------------------------------------------+-------------+-----------+ diff --git a/Docs/sphinx_documentation/source/InputsProblemDefinition.rst b/Docs/sphinx_documentation/source/InputsProblemDefinition.rst deleted file mode 100644 index 3e67ffdb38e..00000000000 --- a/Docs/sphinx_documentation/source/InputsProblemDefinition.rst +++ /dev/null @@ -1,41 +0,0 @@ -.. 
_sec:inputs:pd: - -Problem Definition -================== - -The following inputs must be preceded by "amr." - -+-------------------+-----------------------------------------------------------------------+-------------+-----------+ -| | Description | Type | Default | -+===================+=======================================================================+=============+===========+ -| n_cell | Number of cells at level 0 in each coordinate direction | Int Int Int | None | -+-------------------+-----------------------------------------------------------------------+-------------+-----------+ -| max_level | Maximum level of refinement allowed (0 when single-level) | Int | None | -+-------------------+-----------------------------------------------------------------------+-------------+-----------+ - -The following inputs must be preceded by "geometry." - -+-----------------+-----------------------------------------------------------------------+-------------+-----------+ -| | Description | Type | Default | -+=================+=======================================================================+=============+===========+ -| coord_sys | 0 for Cartesian | Int | 0 | -+-----------------+-----------------------------------------------------------------------+-------------+-----------+ -| is_periodic | 1 for true, 0 for false (one value for each coordinate direction) | Ints | 0 0 0 | -+-----------------+-----------------------------------------------------------------------+-------------+-----------+ -| prob_lo | Low corner of physical domain (physical not index space) | Reals | 0 0 0 | -+-----------------+-----------------------------------------------------------------------+-------------+-----------+ -| prob_hi | High corner of physical domain (physical not index space) | Reals | None | -+-----------------+-----------------------------------------------------------------------+-------------+-----------+ -| prob_extent | Extent of physical domain (physical not index space) | Reals | None | -+-----------------+-----------------------------------------------------------------------+-------------+-----------+ - -Note that internally ``prob_lo`` and ``prob_hi`` are the variables carried by the ``Geometry`` class. -In the inputs file (or command line), one can specify -1) ``geometry.prob_hi`` only or -2) ``geometry.prob_extent`` only or -3) ``geometry.prob_lo`` and ``geometry.prob_hi`` or -4) ``geometry.prob_lo`` and ``geometry.prob_extent``. -If ``geometry.prob_lo`` is not specified then it will be 0 in each coordinate direction. -If ``geometry.prob_extent`` is specified (and ``geometry.prob_hi`` is not) then internally -"prob_hi" will be set to "prob_lo" + "prob_extent". - diff --git a/Docs/sphinx_documentation/source/InputsTimeStepping.rst b/Docs/sphinx_documentation/source/InputsTimeStepping.rst deleted file mode 100644 index 144ef0fe6dd..00000000000 --- a/Docs/sphinx_documentation/source/InputsTimeStepping.rst +++ /dev/null @@ -1,18 +0,0 @@ -.. sec:InputsTimeStepping: - -Time Stepping -============= - -The following inputs must be preceded by "amr." Note that if both are specified, both criteria -are used and the simulation still stop when the first criterion is hit. In the case of unsteady flow, -the simulation will stop when either the number of steps reaches max_step or time reaches stop_time. -In the case of unsteady flow, the simulation will stop when either the tolerance (difference between -subsequent steps) is reached or the number of iterations reaches the maximum number specified. 
-
-+------------------+-----------------------------------------------------------------------+-------------+-----------+
-| | Description | Type | Default |
-+==================+=======================================================================+=============+===========+
-| max_step | Maximum number of time steps to take | Int | -1 |
-+------------------+-----------------------------------------------------------------------+-------------+-----------+
-| stop_time | Maximum time to reach | Real | -1.0 |
-+------------------+-----------------------------------------------------------------------+-------------+-----------+
diff --git a/Docs/sphinx_documentation/source/Inputs_Chapter.rst b/Docs/sphinx_documentation/source/Inputs_Chapter.rst
deleted file mode 100644
index 43ead40b3c6..00000000000
--- a/Docs/sphinx_documentation/source/Inputs_Chapter.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-.. _Chap:Inputs:
-
-Run-time Inputs
-===============
-
-.. toctree::
-   :maxdepth: 1
-
-   InputsProblemDefinition
-   InputsTimeStepping
-   InputsLoadBalancing
-   InputsComputeBackends
-   InputsPlotFiles
-   InputsCheckpoint
-
diff --git a/Docs/sphinx_documentation/source/LinearSolvers.rst b/Docs/sphinx_documentation/source/LinearSolvers.rst
index ab0ba3506a9..ee3ebe0efcf 100644
--- a/Docs/sphinx_documentation/source/LinearSolvers.rst
+++ b/Docs/sphinx_documentation/source/LinearSolvers.rst
@@ -568,6 +568,19 @@ residual correction form of the original problem. To build Hypre, follow the nex
     5.- Create an environment variable with the HYPRE directory --
         HYPRE_DIR=/hypre_path/hypre/src/hypre
 
+To use Hypre with CUDA, the nvcc compiler is needed along with all the other requirements for CPU builds (e.g., gcc, mpicc). It is very important that the GPU architecture used to build Hypre matches that of AMReX. By default, Hypre assumes compute capability 70, so it is best to build Hypre for multiple architectures by specifying multiple compute capability numbers (e.g., 80 and 90).
+
+::
+
+    1.- git clone https://github.com/hypre-space/hypre.git
+    2.- cd hypre/src
+    3.- ./configure --with-cuda --with-gpu-arch='80 90'
+        (you can determine the GPU architecture from the command line using
+        nvidia-smi --query-gpu=compute_cap --format=csv; if it reports 9.0, the gpu-arch is 90)
+    4.- make install
+    5.- Create an environment variable with the HYPRE directory --
+        HYPRE_DIR=/hypre_path/hypre/src/hypre
+
 To use hypre, one must include ``amrex/Src/Extern/HYPRE`` in the build
 system. For examples of using hypre, we refer the reader to
 `ABecLaplacian`_ or `NodeTensorLap`_.
diff --git a/Docs/sphinx_documentation/source/RuntimeParameters.rst b/Docs/sphinx_documentation/source/RuntimeParameters.rst
new file mode 100644
index 00000000000..c57c74c8fb1
--- /dev/null
+++ b/Docs/sphinx_documentation/source/RuntimeParameters.rst
@@ -0,0 +1,1238 @@
+
+.. _chap:inputs:
+
+Runtime Parameters
+==================
+
+.. role:: cpp(code)
+   :language: c++
+
+This chapter contains a list of AMReX :cpp:`ParmParse` runtime parameters
+and their **default** values. They can be set by including them in an
+inputs file, by specifying them on the command line, or by passing a
+function to :cpp:`amrex::Initialize` that adds parameters to AMReX's
+:cpp:`ParmParse` parameter database. For more information on
+:cpp:`ParmParse`, see :ref:`sec:basics:parmparse`.
+
+..
important:: AMReX reserves the following prefixes in :cpp:`ParmParse` + parameters: ``amr``, ``amrex``, ``blprofiler``, ``device``, + ``DistributionMapping``, ``eb2``, ``fab``, ``fabarray``, + ``geometry``, ``particles``, ``tiny_profiler``, and + ``vismf``. + +AMR +--- + +AMReX applications with AMR use either :cpp:`class AmrCore` or the more +specialized :cpp:`class Amr`. Since :cpp:`class Amr` is derived from +:cpp:`class AmrCore`, the parameters for the :cpp:`AmrCore` class also apply +to the :cpp:`Amr` class. Additionally, :cpp:`class AmrCore` is derived from +:cpp:`class AmrMesh`, so :cpp:`AmrMesh` member functions are also available +to :cpp:`AmrCore` and :cpp:`Amr`. + +AmrCore Class +^^^^^^^^^^^^^ + +Below are a list of important :cpp:`ParmParse` parameters. However, AMReX +applications can choose to avoid them entirely by use this :cpp:`AMRCore` +constructor :cpp:`AmrCore(Geometry const& level_0_geom, AmrInfo const& +amr_info)`, where :cpp:`struct AmrInfo` contains all the information that +can be set via :cpp:`ParmParse`. + +.. py:data:: amr.verbose + :type: int + :value: 0 + + This controls the verbosity level of :cpp:`AmrCore` functions. + +.. py:data:: amr.n_cell + :type: int array + :value: [none] + + This parameter is used only when ``n_cell`` is not provided as an + argument to :cpp:`AmrCore` constructors. It specifies the number of cells + in each dimension on Level 0. + +.. py:data:: amr.max_level + :type: int + :value: [none] + + This parameter is used only when ``max_level`` is not provided as an + argument to :cpp:`AmrCore` constructors. It specifies the maximum level + of refinement allowed. Note that the total number of levels, including + the base level 0, is ``max_level+1``. + +.. py:data:: amr.ref_ratio + :type: int array + :value: 2 2 2 ... 2 + + If the refinement ratio is not provided as an argument to :cpp:`AmrCore` + constructors and :py:data:`amr.ref_ratio_vect` is not found in the + :cpp:`ParmParse` database, this parameter will be used to set the + refinement ratios between AMR levels. If there are more AMR levels than + the size of the integer parameter array, the last integer will be used as + the refinement ratio for the unspecified levels. For example, if + ``max_level`` is 4 and the provided ``amr.ref_ratio`` parameter is ``2 + 4``, the refinement ratios are 2, 4, 4 and 4, for levels 0/1, 1/2, 2/3 + and 3/4, respectively. + +.. py:data:: amr.ref_ratio_vect + :type: int array + :value: [none] + + If the refinement ratio is not provided as an argument to :cpp:`AmrCore` + constructors and :py:data:`amr.ref_ratio_vect` is found in the + :cpp:`ParmParse` database, it will be used to set the refinement ratios + between AMR levels. It's an error if the size of the integer array, if + found, is less than ``max_level*AMREX_SPACEDIM``. The first + ``AMREX_SPACEDIM`` numbers specify the refinement ratios in the + ``AMREX_SPACEDIM`` dimensions between levels 0 and 1, the next + ``AMREX_SPACEDIM`` numbers specify the ratios for levels 1 and 2, and so + on. + +.. py:data:: amr.max_grid_size + :type: int array + :value: [build dependent] + + This controls the maximum grid size on AMR levels, one value for each + level. If the size of the integer array is less than the total number of + levels, the last integer will be used for the unspecified levels. The + default value is 128 for 1D and 2D runs. For 3D runs, the default value + is 64 and 32, for GPU and CPU runs, respectively. 
Note that the user can + also call :cpp:`AmrMesh::SetMaxGridSize` to set the maximum grid + sizes. Additionally, the values set by this parameter can be overridden + by :py:data:`amr.max_grid_size_x`, :py:data:`amr.max_grid_size_y` and + :py:data:`amr.max_grid_size_z`. + +.. py:data:: amr.max_grid_size_x + :type: int array + :value: [none] + + If provided, this will override the maximum grid size in the x-direction + set by :py:data:`amr.max_grid_size`. If the size of the integer array is + less than the total number of levels, the last integer will be used for + the unspecified levels. + +.. py:data:: amr.max_grid_size_y + :type: int array + :value: [none] + + If provided, this will override the maximum grid size in the y-direction + set by :py:data:`amr.max_grid_size`. If the size of the integer array is + less than the total number of levels, the last integer will be used for + the unspecified levels. + +.. py:data:: amr.max_grid_size_z + :type: int array + :value: [none] + + If provided, this will override the maximum grid size in the z-direction + set by :py:data:`amr.max_grid_size`. If the size of the integer array is + less than the total number of levels, the last integer will be used for + the unspecified levels. + +.. py:data:: amr.blocking_factor + :type: int array + :value: [build dependent] + + This controls the blocking factor on AMR levels, one value for each + level. If the size of the integer array is less than the total number of + levels, the last integer will be used for the unspecified levels. The + default value is 8. Note that the user can also call + :cpp:`AmrMesh::SetBlockingFactor` to set the blocking + factors. Additionally, the values set by this parameter can be overridden + by :py:data:`amr.blocking_factor_x`, :py:data:`amr.blocking_factor_y` and + :py:data:`amr.blocking_factor_z`. + +.. py:data:: amr.blocking_factor_x + :type: int array + :value: [none] + + If provided, this will override the blocking factor in the x-direction + set by :py:data:`amr.blocking_factor`. If the size of the integer array + is less than the total number of levels, the last integer will be used + for the unspecified levels. + +.. py:data:: amr.blocking_factor_y + :type: int array + :value: [none] + + If provided, this will override the blocking factor in the y-direction + set by :py:data:`amr.blocking_factor`. If the size of the integer array + is less than the total number of levels, the last integer will be used + for the unspecified levels. + +.. py:data:: amr.blocking_factor_z + :type: int array + :value: [none] + + If provided, this will override the blocking factor in the z-direction + set by :py:data:`amr.blocking_factor`. If the size of the integer array + is less than the total number of levels, the last integer will be used + for the unspecified levels. + +.. py:data:: amr.n_proper + :type: int + :value: 1 + + This parameter controls the proper nesting of grids on AMR levels. For + example, if we have ``blocking_factor = 8``, ``ref_ratio = 2`` and + ``n_proper = 1``, there will be at least ``8/2*1 = 4`` coarse level cells + outside the fine level grids except at the physical boundaries. Note that + the user can also call :cpp:`AmrMesh::SetNProper(int)` to set the proper + nesting parameter. + +.. py:data:: amr.grid_eff + :type: amrex::Real + :value: 0.7 + + This parameter controls the grid efficiency threshold during grid + creation. 
While a higher value can enhance efficiency, it may negatively + impact overall performance, especially for GPU runs, because it tends to + create smaller grids. Note that the user can also call + :cpp:`AmrMesh::SetGridEff(Real)` to set the grid efficiency threshold. + +.. py:data:: amr.n_error_buf + :type: int array + :value: 1 1 1 ... 1 + + This parameter controls how many extra cells will be tagged around every + tagged cell. For example, if ``n_error_buf = 2``, tagging cell + ``(i,j,k)`` will result in the tagging of the region of from lower corner + ``(i-2,j-2,k-2)`` to upper corner ``(i+2,j+2,k+2)``. If the size of the + integer array is less than the number of levels, the last integer will be + used for the unspecified levels. Note that the values set by this + parameter can be overridden by :py:data:`amr.n_error_buf_x`, + :py:data:`amr.n_error_buf_y` and :py:data:`amr.n_error_buf_z`. + + +.. py:data:: amr.n_error_buf_x + :type: int array + :value: [none] + + This parameter controls the error buffer size in the x-direction. If the + size of the integer array is less than the number of levels, the last + integer will be used for the unspecified levels. + +.. py:data:: amr.n_error_buf_y + :type: int array + :value: [none] + + This parameter controls the error buffer size in the y-direction. If the + size of the integer array is less than the number of levels, the last + integer will be used for the unspecified levels. + +.. py:data:: amr.n_error_buf_z + + This parameter controls the error buffer size in the z-direction. If the + size of the integer array is less than the number of levels, the last + integer will be used for the unspecified levels. + +.. py:data:: amr.refine_grid_layout + :type: bool + :value: true + + If it's true, AMReX will attempt to chop new grids into smaller chunks + ensuring at least one grid per MPI process, provided this does not + violate the blocking factor constraint. + +.. py:data:: amr.refine_grid_layout_x + :type: bool + :value: [none] + + This parameter, if found, will override the + :py:data:`amrex.refine_grid_layout` parameter in the x-direction. + +.. py:data:: amr.refine_grid_layout_y + :type: bool + :value: [none] + + This parameter, if found, will override the + :py:data:`amrex.refine_grid_layout` parameter in the y-direction. + +.. py:data:: amr.refine_grid_layout_z + :type: bool + :value: [none] + + This parameter, if found, will override the + :py:data:`amrex.refine_grid_layout` parameter in the z-direction. + +.. py:data:: amr.check_input + :type: bool + :value: true + + If this is true, AMReX will check if the various parameters in + :cpp:`AmrMesh` are reasonable. + +Amr Class +^^^^^^^^^ + +.. warning:: These parameters are specific to :cpp:`class Amr` based + applications. If your application use :cpp:`class AmrCore` + directly, they do not apply unless you have provided + implementations for them. + +Subcycling +"""""""""" + +.. py:data:: amr.subcycling_mode + :type: string + :value: Auto + + This controls the subcycling mode of :cpp:`class Amr`. Possible value + are ``None`` for no subcycling, or ``Auto`` for subcycling. + +Regrid +"""""" + +.. py:data:: amr.regrid_int + :type: int array + :value: 1 1 1 ... 1 + + This controls how often we perform the regrid operation on AMR levels 0 + to ``max_level-1``. If the parameter is a single value, it will be used + on all levels. If the parameter is an array of more than one values, the + size must be at least ``max_level`` and values after the first + ``max_level`` elements are ignored. + +.. 
py:data:: amr.regrid_on_restart + :type: bool + :value: false + + This controls whether we perform regrid immediately after restart. + +.. py:data:: amr.force_regrid_level_zero + :type: bool + :value: false + + This controls whether we perform regrid on level 0. + +.. py:data:: amr.compute_new_dt_on_regrid + :type: bool + :value: false + + This controls whether we re-compute ``dt`` after regrid. + +.. py:data:: amr.initial_grid_file + :type: string + :value: [none] + + If this is set, the initial grids will be read from the specified file. + +.. py:data:: amr.regrid_file + :type: string + :value: [none] + + If this is set, regrid will use the grids in the specified file. + +I/O +""" + +.. py:data:: amr.restart + :type: string + :value: [none] + + If this is set, the simulation will restart from the specified checkpoint + file. + +.. py:data:: amr.plotfile_on_restart + :type: bool + :value: false + + If this is set to true, a plotfile will be written after restart. + +.. py:data:: amr.file_name_digits + :type: int + :value: 5 + + This parameter specifies the minimum number of digits in checkpoint and + plotfile names. + +.. py:data:: amr.checkpoint_files_output + :type: bool + :value: true + + This controls whether we write checkpoint files. + +.. py:data:: amr.check_file + :type: string + :value: chk + + This sets the "root" of checkpoint file names. For example, the + checkpoint files are named ``chk00000``, ``chk001000``, etc. by default. + +.. py:data:: amr.check_int + :type: int + :value: -1 + + This controls the interval of writing checkpoint files, defined as the + number of level 0 steps between each checkpoint. A value less than 1 + indicates no checkpoint files will be written. + +.. py:data:: amr.check_per + :type: amrex::Real + :value: -1 + + This controls the interval of writing checkpoint files, defined as the + time (not the wall time) elapsed between each checkpoint. A value less + or equal to 0 indicates no checkpoint files will be written. + +.. py:data:: amr.checkpoint_nfiles + :type: int + :value: 64 + + This is the maximum number of binary files per :cpp:`MultiFab` when + writing checkpoint files. + +.. py:data:: amr.plot_files_output + :type: bool + :value: true + + This controls whether we write plot files. + +.. py:data:: amr.plot_file + :type: string + :value: plt + + This sets the "root" of plot file names. For example, the plot files are + named ``plt00000``, ``plt001000``, etc. by default. + +.. py:data:: amr.plot_int + :type: int + :value: -1 + + This controls the interval of writing plot files, defined as the number + of level 0 steps between each plot file. A value less than 1 indicates no + plot files will be written. + +.. py:data:: amr.plot_per + :type: amrex::Real + :value: -1 + + This controls the interval of writing plot files, defined as the time + (not the wall time) elapsed between each plot file. A value less or equal + to 0 indicates no plot files will be written. + +.. py:data:: amr.plot_log_per + :type: amrex::Real + :value: -1 + + This controls the interval of writing plot files, defined as the + ``log10`` time (not the wall time) elapsed between each plot file. A + value less or equal to 0 indicates no plot files will be written. + +.. py:data:: amr.plot_max_level + :type: int + :value: amr.max_level + + This controls the finest level in a plot file. For example, if the finest + level in a run is 3, but this parameter is set to 1, only levels 0 and 1 + will be saved in a plot file. + +.. 
py:data:: amr.plot_nfiles + :type: int + :value: 64 + + This is the maximum number of binary files per :cpp:`MultiFab` when + writing plot files. + +.. py:data:: amr.plot_vars + :type: string array + :value: [none] + + If this parameter is set, the variables specified in the string array + will be the state variables saved in the plot files. The special values + ``ALL`` and ``NONE`` mean that all or none of the state variables will be + saved. If this parameter is not set, all state variables will be saved. + +.. py:data:: amr.derive_plot_vars + :type: string array + :value: [none] + + If this parameter is set, the variables specified in the string array + will be the derive variables saved in the plot files. The special values + ``ALL`` and ``NONE`` mean that all or none of the derive variables will + be saved. If this parameter is not set, none of the derive variables will + be saved. + +.. py:data:: amr.small_plot_file + :type: string + :value: smallplt + + This sets the "root" of small plot file names. For example, the small + plot files are named ``smallplt00000``, ``smallplt001000``, etc. by + default. + +.. py:data:: amr.small_plot_int + :type: int + :value: -1 + + This controls the interval of writing small plot files, defined as the + number of level 0 steps between each small plot file. A value less than 1 + indicates no small plot files will be written. + +.. py:data:: amr.small_plot_per + :type: amrex::Real + :value: -1 + + This controls the interval of writing small plot files, defined as the + time (not the wall time) elapsed between each small plot file. A value + less or equal to 0 indicates no small plot files will be written. + +.. py:data:: amr.small_plot_log_per + :type: amrex::Real + :value: -1 + + This controls the interval of writing small plot files, defined as the + ``log10`` time (not the wall time) elapsed between each small plot + file. A value less or equal to 0 indicates no small plot files will be + written. + +.. py:data:: amr.small_plot_vars + :type: string array + :value: [none] + + If this parameter is set, the variables specified in the string array + will be the state variables saved in the small plot files. The special + values ``ALL`` and ``NONE`` mean that all or none of the state variables + will be saved. If this parameter is not set, none of the state variables + will be saved. + +.. py:data:: amr.derive_small_plot_vars + :type: string array + :value: [none] + + If this parameter is set, the variables specified in the string array + will be the derive variables saved in the small plot files. The special + values ``ALL`` and ``NONE`` mean that all or none of the derive variables + will be saved. If this parameter is not set, none of the derive variables + will be saved. + +.. py:data:: amr.message_int + :type: int + :value: 10 + + This controls the interval of checking messages during a run, defined as + the number of level 0 steps between checks. A value less than 1 indicates + no checking will be performed. A message refers to a file created by the + user on the disk, where only the file name is checked, not its + content. If the file name matches one of the following predefined names, + appropriate actions will be taken. + + dump_and_continue + Make a checkpoint file and continue running the simulation. + + stop_run + Stop the simulation. + + dump_and_stop + Make a checkpoint file and stop the simulation. + + plot_and_continue + Make a plot file and continue running the simulation. 
+ + small_plot_and_continue + Make a small plot file and continue running the simulation. + +.. py:data:: amr.write_plotfile_with_checkpoint + :type: bool + :value: true + + This parameter is for the message action discussed in + :py:data:`amr.message_int`. It controls whether an action will make a + plot file as well when asked to make a checkpoint file. + +.. py:data:: amr.run_log + :type: string + :value: [none] + + If this parameter is set, the run log will be enabled and this is the log + file name. + +.. py:data:: amr.run_log_terse + :type: string + :value: [none] + + If this parameter is set, the terse run log will be enabled and this is + the log file name. + +.. py:data:: amr.grid_log + :type: string + :value: [none] + + If this parameter is set, the grid log will be enabled and this is the + log file name. + +.. py:data:: amr.data_log + :type: string + :value: [none] + + If this parameter is set, the data log will be enabled and this is the + log file name. + +Basic Controls +-------------- + +.. py:data:: amrex.verbose + :type: int + :value: 1 + + This controls the verbosity level of AMReX. Besides using + :cpp:`ParmParse`, you can also call :cpp:`amrex::SetVerbose(int)` to set + it. + +.. py:data:: amrex.init_snan + :type: bool + :value: [build dependent] + + This controls whether :cpp:`MultiFab`, :cpp:`FArrayBox`, + :cpp:`BaseFab`, :cpp:`PODVectors`, + :cpp:`Gpu::DeviceVector`, etc. will be initialized to + signaling NaNs at construction. The default value is true for debug + builds. For non-debug builds, the default is false unless ``TEST=TRUE`` + for GNU Make or ``AMReX_TESTING`` is enabled for CMake. + +.. py:data:: amrex.abort_on_unused_inputs + :type: bool + :value: false + + If this is true and there are unused :cpp:`ParmParse` parameters, AMReX + will abort during :cpp:`amrex::Finalize`. + +.. py:data:: amrex.parmparse.verbose + :type: int + :value: amrex.verbose + + If this is greater than zero, unused :cpp:`ParmParse` variables will be + printed out during :cpp:`amrex::Finalize` or + :cpp:`ParmParse::QueryUnusedInputs`. The parameter can also be set by + calling :cpp:`amrex::ParmParse::SetVerbose(int)`. + +.. py:data:: amrex.device.verbose + :type: int + :value: 0 + + This controls whether AMReX prints out GPU device properties such name, + vendor, total memory size, etc. This is only relevant for GPU runs. + +.. py:data:: amrex.max_gpu_streams + :type: int + :value: 4 + + This controls the number of GPU streams used by AMReX. It's only relevant + for GPU runs. + +.. py:data:: amrex.omp_threads + :type: string + :value: system + + If OpenMP is enabled, this can be used to set the default number of + threads. Possible values are ``system``, ``nosmt``, or an integer + string. The special value ``nosmt`` can be used to avoid using threads + for virtual cores (aka Hyperthreading or SMT), as is default in OpenMP, + and instead only spawns threads equal to the number of physical cores in + the system. For the values ``system`` and ``nosmt``, the environment + variable ``OMP_NUM_THREADS`` takes precedence. If the string can be + converted to an integer, ``OMP_NUM_THREADS`` is ignored. + +.. py:data:: amrex.memory_log + :type: string + :value: memlog + + This is the name of the memory log file when memory profiling is enabled. + +Communication +------------- + +.. py:data:: amrex.use_gpu_aware_mpi + :type: bool + :value: false + + For GPU runs, this controls the memory type used for AMReX's + communication buffers. 
When this is true, AMReX uses GPU device memory + for communication data in MPI function calls. When this is false, the + data are placed in pinned memory. Note that this flag does not enable + GPU-aware MPI by itself. Enabling GPU-aware MPI is system + dependent. Users should consult their system's documentation for + instructions on setting up the environment and linking to GPU-aware MPI + libraries. + +Distribution Mapping +-------------------- + +.. py:data:: DistributionMapping.verbose + :type: int + :value: 0 + + This controls the verbosity level of :cpp:`DistributionMapping` + functions. + +.. py:data:: DistributionMapping.strategy + :type: string + :value: SFC + + This is the default :cpp:`DistributionMapping` strategy. Possible values + are ``SFC``, ``KNAPSACK``, ``ROUNDROBIN``, or ``RRSFC``. Note that the + default strategy can also be set by calling + :cpp:`DistributionMapping::strategy(DistributionMapping::Strategy)`. + +Embedded Boundary +----------------- + +.. py:data:: eb2.max_grid_size + :type: int + :value: 64 + + This parameter specifies the maximum grid size in AMReX's internal EB + database, not the user's data. + +.. py:data:: eb2.extend_domain_face + :type: bool + :value: true + + This controls the behavior of the embedded boundary outside the + domain. If this is true, the embedded boundary outside the domain is + extended perpendicularly from the domain face. Otherwise, it's generated + with the user provided implicit function. Note that this parameter can be + overridden by the user when calling :cpp:`amrex::EB2::Build` with the + optional parameter ``bool extend_domain_face``. + +.. py:data:: eb2.num_coarsen_opt + :type: int + :value: 0 + + If it is greater than 0, this parameter can speed up the EB + generation. It indicates that the search for EB can be performed on grids + coarsened by this factor and then the EB information details will be + generated on the original grids. However, the user should be aware that + setting this parameter too high could result in erroneous results. Also + note that this parameter can be overridden by the user when calling + :cpp:`amrex::EB2::Build` with the optional parameter ``int + num_coarsen_opt``. + +.. py:data:: eb2.geom_type + :type: string + :value: [none] + + There are two versions of the `amrex::EB2::Build` function that can be + used to build EB. One version is a function template that takes a user + provided :cpp:`GeometryShop`, while the other uses :cpp:`ParmParse` + parameters to build EB. For the latter version, this parameter specifies + the type of the EB. Possible values include the following. + + all_regular + The entire domain is regular without any EB objects. + + parser + The embedded boundary is describe by :py:data:`eb2.parser_function`. + + stl + The embedded boundary will be built using an STL file specified by + :py:data:`eb2.stl_file`. + +.. py:data:: eb2.parser_function + :type: string + :value: [none] + + When ``eb2.geom_type = parser``, this parameter is a parser function + string that contains a math expression describing the surface of the EB. + + .. seealso:: Section :ref:`sec:basics:parser`. + +.. py:data:: eb2.stl_file + :type: string + :value: [none] + + When ``eb2.geom_type = stl``, this is a required string parameter + specifying the STL file name. + +.. py:data:: eb2.stl_scale + :type: amrex:Real + :value: 1 + + When building EB using STL, the triangles in the STL file will be scaled + by the given value of this optional parameter. + +.. 
+
+.. py:data:: eb2.stl_file
+   :type: string
+   :value: [none]
+
+   When ``eb2.geom_type = stl``, this is a required string parameter
+   specifying the STL file name.
+
+.. py:data:: eb2.stl_scale
+   :type: amrex::Real
+   :value: 1
+
+   When building EB using STL, the triangles in the STL file will be scaled
+   by the given value of this optional parameter.
+
+.. py:data:: eb2.stl_center
+   :type: amrex::Real array
+   :value: 0 0 0
+
+   When building EB using STL, this optional parameter specifies the shifted
+   center. The original coordinates in the STL file will be shifted by the
+   provided values.
+
+.. py:data:: eb2.stl_reverse_normal
+   :type: bool
+   :value: false
+
+   When building EB using STL, the normal direction of the triangles in the
+   STL file will be reversed if this optional parameter is set to true.
+
+.. py:data:: eb2.small_volfrac
+   :type: amrex::Real
+   :value: [depends on the type of amrex::Real]
+
+   This parameter specifies the threshold for small cells that will be
+   converted to covered cells. The default value is ``1.e-14`` if
+   :cpp:`amrex::Real` is ``double``, or ``1.e-5`` if :cpp:`amrex::Real` is
+   ``float``.
+
+.. py:data:: eb2.cover_multiple_cuts
+   :type: bool
+   :value: false
+
+   If this parameter is set to true, multi-cut cells will be converted to
+   covered cells.
+
+   .. tip:: Because AMReX currently does not support multi-cut cells, it is a
+      runtime error if multi-cut cells are left unfixed.
+
+.. py:data:: eb2.maxiter
+   :type: int
+   :value: 32
+
+   Fixing small and multi-cut cells is an iterative process. This parameter
+   specifies the maximum number of iterations for the fix-up process.
+
+Error Handling
+--------------
+
+By default AMReX installs a signal handler that will be run when a signal
+such as a segfault is received. You can also enable floating point exception
+trapping. The signal handler will print out backtraces that can be useful
+for debugging.
+
+.. note:: Floating point exception trapping is not enabled by default,
+   because compilers might generate optimized SIMD code that raises the
+   exceptions.
+
+.. py:data:: amrex.signal_handling
+   :type: bool
+   :value: true
+
+   This controls whether AMReX should handle signals.
+
+.. py:data:: amrex.handle_sigsegv
+   :type: bool
+   :value: true
+
+   If both this flag and ``amrex.signal_handling`` are true, ``SIGSEGV``
+   will be handled by AMReX.
+
+.. py:data:: amrex.handle_sigterm
+   :type: bool
+   :value: false
+
+   If both this flag and ``amrex.signal_handling`` are true, ``SIGTERM``
+   will be handled by AMReX. This flag is false by default because this
+   could generate lots of backtrace files on some batch systems that issue
+   ``SIGTERM`` for jobs running out of wall clock time.
+
+.. py:data:: amrex.handle_sigint
+   :type: bool
+   :value: true
+
+   If both this flag and ``amrex.signal_handling`` are true, ``SIGINT``
+   will be handled by AMReX.
+
+.. py:data:: amrex.handle_sigabrt
+   :type: bool
+   :value: true
+
+   If both this flag and ``amrex.signal_handling`` are true, ``SIGABRT``
+   will be handled by AMReX.
+
+.. py:data:: amrex.handle_sigfpe
+   :type: bool
+   :value: true
+
+   If both this flag and ``amrex.signal_handling`` are true, ``SIGFPE``
+   will be handled by AMReX.
+
+   .. seealso::
+      Use :py:data:`amrex.fpe_trap_invalid`, :py:data:`amrex.fpe_trap_zero`
+      and :py:data:`amrex.fpe_trap_overflow` to enable ``FE_INVALID``,
+      ``FE_DIVBYZERO`` and ``FE_OVERFLOW`` trapping, respectively.
+
+.. py:data:: amrex.handle_sigill
+   :type: bool
+   :value: true
+
+   If both this flag and ``amrex.signal_handling`` are true, ``SIGILL``
+   will be handled by AMReX.
+
+.. py:data:: amrex.throw_exception
+   :type: bool
+   :value: false
+
+   If this flag is true and ``amrex.signal_handling`` is false,
+   :cpp:`amrex::Abort` and :cpp:`amrex::Error` will throw
+   :cpp:`std::runtime_error` instead of aborting immediately. Note that
+   according to the C++ standard, if an exception is thrown and not caught,
+   :cpp:`std::terminate` will be called.
+
+.. py:data:: amrex.fpe_trap_invalid
+   :type: bool
+   :value: false
+
+   If ``SIGFPE`` is handled by AMReX and this flag is true, ``FE_INVALID``
+   (e.g., ``0/0``) trapping will be enabled. This flag has no effect on
+   Windows.
+
+.. py:data:: amrex.fpe_trap_zero
+   :type: bool
+   :value: false
+
+   If ``SIGFPE`` is handled by AMReX and this flag is true,
+   ``FE_DIVBYZERO`` (e.g., ``1/0``) trapping will be enabled. This flag has
+   no effect on Windows.
+
+.. py:data:: amrex.fpe_trap_overflow
+   :type: bool
+   :value: false
+
+   If ``SIGFPE`` is handled by AMReX and this flag is true, ``FE_OVERFLOW``
+   (i.e., the result is too large to be representable) trapping will be
+   enabled. This flag has no effect on Windows.
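+
+As a minimal sketch, trapping of all three floating point exceptions can be
+requested in an inputs file like this (assuming ``amrex.handle_sigfpe`` is
+left at its default value of true)::
+
+   amrex.fpe_trap_invalid  = 1
+   amrex.fpe_trap_zero     = 1
+   amrex.fpe_trap_overflow = 1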
+
+Extern
+------
+
+Hypre
+^^^^^
+
+These parameters are relevant only when Hypre support is enabled.
+
+.. py:data:: amrex.init_hypre
+   :type: bool
+   :value: true
+
+   This controls whether AMReX should call ``HYPRE_Init()`` during
+   :cpp:`amrex::Initialize`.
+
+.. py:data:: amrex.hypre_spgemm_use_vendor
+   :type: bool
+   :value: false
+
+   This controls whether HYPRE should use the vendor's ``SpGemm``
+   functionality.
+
+.. py:data:: amrex.hypre_spmv_use_vendor
+   :type: bool
+   :value: false
+
+   This controls whether HYPRE should use the vendor's ``SpMV``
+   functionality.
+
+.. py:data:: amrex.hypre_sptrans_use_vendor
+   :type: bool
+   :value: false
+
+   This controls whether HYPRE should use the vendor's ``SpTrans``
+   functionality.
+
+.. _sec:inputs:geom:
+
+Geometry
+--------
+
+All these parameters are optional for constructing a :ref:`Geometry `
+object. They are only used if the information is not provided via function
+arguments.
+
+.. py:data:: geometry.coord_sys
+   :type: int
+   :value: 0
+
+   This specifies the coordinate system type, with valid values being 0
+   (Cartesian), 1 (cylindrical), or 2 (spherical).
+
+.. py:data:: geometry.prob_lo
+   :type: amrex::Real array
+   :value: 0 0 0
+
+   This specifies the position of the lower corner of the physical domain.
+
+.. py:data:: geometry.prob_hi
+   :type: amrex::Real array
+   :value: [none]
+
+   This specifies the position of the upper corner of the physical
+   domain. If this is provided, :py:data:`geometry.prob_extent` will be
+   ignored.
+
+.. py:data:: geometry.prob_extent
+   :type: amrex::Real array
+   :value: [none]
+
+   This specifies the length of the physical domain. If
+   :py:data:`geometry.prob_hi` is provided, this will be ignored.
+
+.. py:data:: geometry.is_periodic
+   :type: int array
+   :value: 0 0 0
+
+   These integer parameters are boolean flags to indicate whether the domain
+   is periodic in each direction. It's considered true (i.e., periodic) if
+   its value is non-zero, and false (i.e., non-periodic) if its value is
+   zero.
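+
+For example, a minimal sketch of these inputs for a unit-cube domain that is
+periodic only in the z-direction (the values are purely illustrative) would
+be::
+
+   geometry.coord_sys   = 0          # Cartesian
+   geometry.prob_lo     = 0.0 0.0 0.0
+   geometry.prob_hi     = 1.0 1.0 1.0
+   geometry.is_periodic = 0   0   1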
+
+I/O
+---
+
+.. py:data:: amrex.async_out
+   :type: bool
+   :value: false
+
+   If this is true, AMReX's native mesh and particle plotfiles will be
+   written asynchronously by a background thread.
+
+.. py:data:: amrex.async_out_nfiles
+   :type: int
+   :value: 64
+
+   This is the maximum number of binary files on each AMR level that will be
+   used when AMReX writes a plotfile asynchronously.
+
+.. py:data:: vismf.verbose
+   :type: int
+   :value: 0
+
+   This controls the verbosity level of :cpp:`VisMF` functions.
+
+Memory
+------
+
+.. py:data:: amrex.the_arena_init_size
+   :type: long
+   :value: [system dependent]
+
+   This controls the main memory arena's initial size in bytes. For CPU
+   runs, the default is 0, whereas for GPU runs, the default is set at run
+   time to 3/4 of the system's device memory.
+
+   .. tip:: Since ``amrex v24.08``, instead of
+      ``amrex.the_arena_init_size=10000000000``, one can use
+      ``amrex.the_arena_init_size=10'000'000'000`` or
+      ``amrex.the_arena_init_size=1e10`` to set :cpp:`ParmParse`
+      integer parameters like this one.
+
+.. py:data:: amrex.the_device_arena_init_size
+   :type: long
+   :value: 8388608 [8 MB]
+
+   This controls the GPU device arena's initial size in bytes. For CPU runs,
+   this is ignored. If the main arena uses the device memory (as opposed to
+   managed memory), this parameter is also ignored.
+
+.. py:data:: amrex.the_managed_arena_init_size
+   :type: long
+   :value: 8388608 [8 MB]
+
+   This controls the managed device arena's initial size in bytes. For CPU
+   runs, this is ignored. If the main arena uses the managed memory (as
+   opposed to device memory), this parameter is also ignored.
+
+.. py:data:: amrex.the_pinned_arena_init_size
+   :type: long
+   :value: [system dependent]
+
+   This controls the pinned host memory arena's initial size in bytes. The
+   default is 8 MB for CPU runs. For GPU runs it's set to half of the GPU
+   device memory by default.
+
+.. py:data:: amrex.the_comms_arena_init_size
+   :type: long
+   :value: 8388608 [8 MB]
+
+   This controls the MPI communication memory arena's initial size in bytes.
+
+.. py:data:: amrex.the_arena_release_threshold
+   :type: long
+   :value: LONG_MAX
+
+   This controls the release threshold of the main arena.
+
+.. py:data:: amrex.the_device_arena_release_threshold
+   :type: long
+   :value: LONG_MAX
+
+   This controls the release threshold of the device arena.
+
+.. py:data:: amrex.the_managed_arena_release_threshold
+   :type: long
+   :value: LONG_MAX
+
+   This controls the release threshold of the managed arena.
+
+.. py:data:: amrex.the_pinned_arena_release_threshold
+   :type: long
+   :value: LONG_MAX
+
+   This controls the release threshold of the pinned arena.
+
+.. py:data:: amrex.the_comms_arena_release_threshold
+   :type: long
+   :value: LONG_MAX
+
+   This controls the release threshold of the communication arena.
+
+.. py:data:: amrex.the_async_arena_release_threshold
+   :type: long
+   :value: LONG_MAX
+
+   This controls the release threshold of the asynchronous arena. Note that
+   this is only relevant for the CUDA (>= 11.2) and HIP backends that
+   support a stream-ordered memory allocator.
+
+.. py:data:: amrex.the_arena_is_managed
+   :type: bool
+   :value: false
+
+   This controls whether AMReX uses managed memory for the main arena. This
+   is only relevant for GPU runs.
+
+.. py:data:: amrex.abort_on_out_of_gpu_memory
+   :type: bool
+   :value: false
+
+   This controls whether AMReX should simply abort when the reported free
+   device memory is less than the amount an arena is asked to allocate. Note
+   that for managed memory it's possible to allocate more than the amount of
+   free device memory available. However, the code will be very slow. This
+   parameter is only relevant for GPU runs.
+
+.. py:data:: amrex.mf.alloc_single_chunk
+   :type: bool
+   :value: false
+
+   This controls whether all the data in a :cpp:`FabArray` (including
+   :cpp:`MultiFab`) are stored in a single contiguous chunk of memory.
+
+.. py:data:: amrex.vector_growth_factor
+   :type: amrex::Real
+   :value: 1.5
+
+   This controls the growth factor of :cpp:`amrex::PODVector` and its
+   derived classes such as :cpp:`amrex::Gpu::DeviceVector`,
+   :cpp:`amrex::Gpu::ManagedVector`, etc.
A smaller value can avoid wasting + memory, but it may result in a performance penalty during resizing. + +Particles +--------- + +.. py:data:: particles.do_tiling + :type: bool + :value: false + + This controls whether tiling is enabled for particle containers. + +.. py:data:: particles.tile_size + :type: int array + :value: 1024000 8 8 + + When tiling is enabled, this is the default tile size. Note that a big + number like 1024000 effectively turns tiling off in that direction. + +.. py:data:: particles.do_mem_efficient_sort + :type: bool + :value: true + + This parameter controls whether the more memory efficient method will be + used for sorting particles. + +.. py:data:: particles.particles_nfiles + :type: int + :value: 256 + + This is the maximum number of binary files per level for a particle + container when writing checkpoint and plot files for particles. The + special value of ``-1`` indicates one file per process. + +Tiling +------ + +.. py:data:: fabarray.mfiter_tile_size + :type: int array + :value: [build dependent] + + This is the default size for :ref:`tiling `. For GPU + runs, it is disabled by default. For CPU runs, it is disabled by default + in 1D and 2D, but enabled in 3D with a tile size of 8 in the y and + z-directions. + +.. py:data:: fabarray.comm_tile_size + :type: int array + :value: [build dependent] + + This is the default tiling size used in moving data in and out of the MPI + communication buffer . It is disabled by default for GPU runs, but + enabled for CPU runs with a tile size of 8 in the y and z-directions (if + they exist). + +Tiny Profiler +------------- + +These parameters are ignored unless profiling with :cpp:`TinyProfiler` is +enabled. + +.. py:data:: tiny_profiler.verbose + :type: int + :value: 0 + + If this value is greater than 0, messages about entering or leaving + profiled regions will be printed on the I/O process. + +.. py:data:: tiny_profiler.print_threshold + :type: double + :value: 1.0 + + In the profiling report, regions with very small run times are not listed + individually. Instead, they are included in a section named "Other". This + parameter specifies the maximum inclusive run time that the "Other" + section can take in percent relative to the total run time. + +.. py:data:: tiny_profiler.device_synchronize_around_region + :type: bool + :value: false + + This parameter is only relevant for GPU runs. If it is set to true, the + current GPU stream is synchronized when entering and leaving a profiling + region. Because GPU kernels are asynchronous, time measurements without + synchronization could be misleading. Enabling this parameter can provide + more accurate measurements. However, the added synchronization points, + which are unnecessary for correctness, could potentially degrade the + performance. + +.. py:data:: tiny_profiler.enabled + :type: bool + :value: true + + .. versionadded:: 24.09 + Runtime parameter `tiny_profiler.enabled``. + + This parameter can be used to disable tiny profiling including + :cpp:`CArena` memory profiling at run time. + +.. py:data:: tiny_profiler.memprof_enabled + :type: bool + :value: true + + .. versionadded:: 24.09 + Runtime parameter ``tiny_profiler.memprof_enabled``. + + This parameter can be used to disable :cpp:`CArena` memory profiling at + run time. If ``tiny_profiler.enabled`` is false, this parameter has no + effects. + +.. py:data:: tiny_profiler.output_file + :type: string + :value: [empty] + + .. versionadded:: 24.09 + Runtime parameter ``tiny_profiler.output_file``. 
+ + If this parameter is empty, the output of tiny profiling is dumped on the + default out stream of AMReX. If it's not empty, it specifies the file + name for the output. Note that ``/dev/null`` is a special name that mean + a null file. diff --git a/Docs/sphinx_documentation/source/TimeIntegration_Chapter.rst b/Docs/sphinx_documentation/source/TimeIntegration_Chapter.rst index 720c312e2db..ef311de0439 100644 --- a/Docs/sphinx_documentation/source/TimeIntegration_Chapter.rst +++ b/Docs/sphinx_documentation/source/TimeIntegration_Chapter.rst @@ -16,13 +16,8 @@ A Simple Time Integrator Setup ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This is best shown with some sample code that sets up a time integrator and -asks it to step forwards by some interval ``dt``. The user needs to supply at -minimum, the right-hand side function using the ``TimeIntegrator::set_rhs()`` -function. By using the ``TimeIntegrator::set_post_update()`` function, a user -can also supply a post update function which is called on state data immediately -before evaluating the right-hand side. This post update function is a good -opportunity to fill boundary conditions for Runge-Kutta stage solution data so that -ghost cells are filled when the right hand side function is called on that solution data. +asks it to step forward by some interval ``dt``. The user needs to supply the +right-hand side function using the ``TimeIntegrator::set_rhs()`` function. .. highlight:: c++ @@ -30,171 +25,26 @@ ghost cells are filled when the right hand side function is called on that solut #include - MultiFab Sborder; // MultiFab containing old-time state data and ghost cells - MultiFab Snew; // MultiFab where we want new-time state data - Geometry geom; // The domain (or level) geometry + MultiFab Sold; // MultiFab containing old-time state data + MultiFab Snew; // MultiFab where we want new-time state data - // [Fill Sborder here] + // [Fill Sold here] // Create a time integrator that will work with // MultiFabs with the same BoxArray, DistributionMapping, // and number of components as the state_data MultiFab. - TimeIntegrator integrator(Sborder); + TimeIntegrator integrator(Sold); - // Create a RHS source function we will integrate - auto source_fun = [&](MultiFab& rhs, const MultiFab& state, const Real time){ - // User function to calculate the rhs MultiFab given the state MultiFab - fill_rhs(rhs, state, time); + // Create a function that fills the state BCs and computes the RHS + auto rhs_fun = [&](MultiFab& rhs, MultiFab& state, const Real time){ + // [Calculate the rhs MultiFab given the state MultiFab] }; - // Create a function to call after updating a state - auto post_update_fun = [&](MultiFab& S_data, const Real time) { - // Call user function to update state MultiFab, e.g. fill BCs - post_update(S_data, time, geom); - }; - - // Attach the right hand side and post-update functions - // to the integrator + // Attach the right hand side function to the integrator integrator.set_rhs(source_fun); - integrator.set_post_update(post_update_fun); - - // integrate forward one step from `time` by `dt` to fill S_new - integrator.advance(Sborder, S_new, time, dt); - -.. _sec:time_int:sundials: - -Using SUNDIALS -^^^^^^^^^^^^^^ - -The AMReX Time Integration interface also supports a SUNDIALS backend that -wraps both the explicit Runge-Kutta (ERK) and multirate (MRI) integration -schemes in the SUNDIALS ARKODE package. To use either of them, the user needs -to compile AMReX with `USE_SUNDIALS=TRUE` and use SUNDIALS v. 6.0 or later. 
- -There are only minor changes to the code above required to use the SUNDIALS -interface. The first change is that the integration datatype is now a -`Vector` type instead of simply `MultiFab`. The reason for -introducing a `Vector` in this case, is to permit integrating state -data with different spatial centering (e.g. cell centered, face centered, node -centered) concurrently. Shown here is sample code equivalent to the code above, -suitable for the SUNDIALS explicit Runge-Kutta integrator: - -.. highlight:: c++ - -:: - - #include - - Vector Sborder; // MultiFab(s) containing old-time state data and ghost cells - Vector Snew; // MultiFab(s) where we want new-time state data - Geometry geom; // The domain (or level) geometry - - // [Fill Sborder here] - - // Create a time integrator that will work with - // MultiFabs with the same BoxArray, DistributionMapping, - // and number of components as the state_data MultiFab. - TimeIntegrator > integrator(Sborder); - - // Create a RHS source function we will integrate - auto source_fun = [&](Vector& rhs, const Vector& state, const Real time){ - // User function to calculate the rhs MultiFab given the state MultiFab - fill_rhs(rhs, state, time); - }; - - // Create a function to call after updating a state - auto post_update_fun = [&](Vector& S_data, const Real time) { - // Call user function to update state MultiFab, e.g. fill BCs - post_update(S_data, time, geom); - }; - - // Attach the right hand side and post-update functions - // to the integrator - integrator.set_rhs(source_fun); - integrator.set_post_update(post_update_fun); - - // integrate forward one step from `time` by `dt` to fill S_new - integrator.advance(Sborder, S_new, time, dt); - -Afterwards, to select the ERK integrator, one needs only to add the following -two input parameters at runtime: - -:: - - integration.type = SUNDIALS - integration.sundials.strategy = ERK - -If instead one wishes to use the SUNDIALS multirate integrator, then the user -will need to use the following runtime inputs parameters: - -:: - - integration.type = SUNDIALS - integration.sundials.strategy = MRI - -In addition, to set up the multirate problem, the user needs to supply a fast -timescale right-hand-side function in addition to the usual right hand side -function (which is interpreted as the slow timescale right-hand side). The user -will also need to supply the ratio of the slow timestep size to the fast -timestep size, which is an integer corresponding to the number of fast -timesteps the integrator will take per every slow timestep. An example code -snippet would look as follows: - -.. highlight:: c++ - -:: - - #include - - Vector Sborder; // Vector of MultiFab(s) containing old-time state data and ghost cells - Vector Snew; // Vector of MultiFab(s) where we want new-time state data - Geometry geom; // The domain (or level) geometry - - // [Fill Sborder here] - - // Create a time integrator that will work with - // MultiFabs with the same BoxArray, DistributionMapping, - // and number of components as the state_data MultiFab. 
- TimeIntegrator > integrator(Sborder); - - // Create a slow timescale RHS function we will integrate - auto rhs_fun = [&](Vector& rhs, const Vector& state, const Real time){ - // User function to calculate the rhs MultiFab given the state MultiFab(s) - fill_rhs(rhs, state, time); - }; - - // Create a fast timescale RHS function to integrate - auto rhs_fun_fast = [&](Vector& rhs, - const Vector& stage_data, - const Vector& state, const Real time) { - // User function to calculate the fast-timescale rhs MultiFab given - // the state MultiFab and stage_data which holds the previously - // accessed slow-timescale stage state data. - fill_fast_rhs(rhs, stage_data, state, time); - }; - - // The post update function is called after updating state data or - // immediately before using state data to calculate a fast or slow right hand side. - // (it is a good place to e.g. fill boundary conditions) - auto post_update_fun = [&](Vector& S_data, const Real time) { - // Call user function to update state MultiFab(s), e.g. fill BCs - post_update(S_data, time, geom); - }; - - // Attach the slow and fast right hand side functions to integrator - integrator.set_rhs(rhs_fun); - integrator.set_fast_rhs(rhs_fun_fast); - - // This sets the ratio of slow timestep size to fast timestep size as an integer, - // or equivalently, the number of fast timesteps per slow timestep. - integrator.set_slow_fast_timestep_ratio(2); - - // Attach the post update function to the integrator - integrator.set_post_update(post_update_fun); - - // integrate forward one step from `time` by `dt` to fill S_new - integrator.advance(Sborder, S_new, time, dt); + // integrate forward one step from `time` by `dt` to fill Snew + integrator.advance(Sold, Snew, time, dt); Picking A Time Integration Method ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -205,12 +55,6 @@ a generic explicit Runge-Kutta method. If Runge-Kutta is selected, then the user can choose which of a set of predefined Butcher Tables to use, or can choose to use a custom table and supply it manually. -When AMReX is compiled with SUNDIALS v.6 or later, the user also has an option -to use the SUNDIALS ARKODE integrator as a backend for the AMReX Time Integrator -class. The features of this interface evolve with the needs of our codes, so -they may not yet support all SUNDIALS configurations available. If you find you -need SUNDIALS options we have not implemented, please let us know. - The full set of integrator options are detailed as follows: :: @@ -222,7 +66,7 @@ The full set of integrator options are detailed as follows: ## (without the quotation marks) ## "ForwardEuler" or "0" = Native Forward Euler Integrator ## "RungeKutta" or "1" = Native Explicit Runge Kutta - ## "SUNDIALS" or "2" = SUNDIALS ARKODE Integrator + ## "SUNDIALS" or "2" = SUNDIALS Integrator ## for example: integration.type = RungeKutta @@ -246,35 +90,104 @@ The full set of integrator options are detailed as follows: integration.rk.nodes = 0 integration.rk.tableau = 0.0 - ## *** Parameters Needed For SUNDIALS ARKODE Integrator *** - ## integration.sundials.strategy specifies which ARKODE strategy to use. 
- ## The available options are (without the quotations): - ## "ERK" = Explicit Runge Kutta - ## "MRI" = Multirate Integrator - ## "MRITEST" = Tests the Multirate Integrator by setting a zero-valued fast RHS function - ## for example: - integration.sundials.strategy = ERK - - ## *** Parameters Specific to SUNDIALS ERK Strategy *** - ## (Requires integration.type=SUNDIALS and integration.sundials.strategy=ERK) - ## integration.sundials.erk.method specifies which explicit Runge Kutta method - ## for SUNDIALS to use. The following options are supported: - ## "SSPRK3" = 3rd order strong stability preserving RK (default) - ## "Trapezoid" = 2nd order trapezoidal rule - ## "ForwardEuler" = 1st order forward euler - ## for example: - integration.sundials.erk.method = SSPRK3 - - ## *** Parameters Specific to SUNDIALS MRI Strategy *** - ## (Requires integration.type=SUNDIALS and integration.sundials.strategy=MRI) - ## integration.sundials.mri.implicit_inner specifies whether or not to use an implicit inner solve - ## integration.sundials.mri.outer_method specifies which outer (slow) method to use - ## integration.sundials.mri.inner_method specifies which inner (fast) method to use - ## The following options are supported for both the inner and outer methods: - ## "KnothWolke3" = 3rd order Knoth-Wolke method (default for outer method) - ## "Trapezoid" = 2nd order trapezoidal rule - ## "ForwardEuler" = 1st order forward euler (default for inner method) - ## for example: - integration.sundials.mri.implicit_inner = false - integration.sundials.mri.outer_method = KnothWolke3 - integration.sundials.mri.inner_method = Trapezoid +.. _sec:time_int:sundials: + +Using SUNDIALS +^^^^^^^^^^^^^^ + +The AMReX Time Integration interface also supports a SUNDIALS backend that +provides explicit, implicit, and implicit-explicit (ImEx) Runge-Kutta methods +as well a multirate (MRI) methods from the ARKODE package in SUNDIALS. +To use SUNDIALS integrators, the user needs to compile AMReX with +``USE_SUNDIALS=TRUE`` and use SUNDIALS v6.0 or later. + +The SUNDIALS interface supports ``MultiFab`` or ``Vector`` data +types. Using a ``Vector`` permits integrating state data with +different spatial centering (e.g. cell centered, face centered, node centered) +concurrently. + +The same code as above can be used with SUNDIALS explicit or implicit +Runge-Kutta methods without any modification. 
To select a SUNDIALS explicit
+Runge-Kutta integrator, one needs only to add the following two input parameters
+at runtime:
+
+::
+
+    integration.type = SUNDIALS
+    integration.sundials.type = ERK
+
+One can select a different method type by changing the value of
+``integration.sundials.type`` to one of the following values:
+
++------------------------+--------------------------------------------------+
+| Input Option           | SUNDIALS Method Type                             |
++========================+==================================================+
+| ERK                    | Explicit Runge-Kutta method                      |
++------------------------+--------------------------------------------------+
+| DIRK                   | Diagonally Implicit Runge-Kutta method           |
++------------------------+--------------------------------------------------+
+| IMEX-RK                | Implicit-Explicit Additive Runge-Kutta method    |
++------------------------+--------------------------------------------------+
+| EX-MRI                 | Explicit Multirate Infinitesimal method          |
++------------------------+--------------------------------------------------+
+| IM-MRI                 | Implicit Multirate Infinitesimal method          |
++------------------------+--------------------------------------------------+
+| IMEX-MRI               | Implicit-Explicit Multirate Infinitesimal method |
++------------------------+--------------------------------------------------+
+
+For ImEx methods, the user needs to supply two right-hand side functions, an
+implicit and an explicit function, using the function
+``TimeIntegrator::set_imex_rhs()``. Similarly for multirate methods, the user
+needs to supply slow and fast right-hand side functions using
+``TimeIntegrator::set_rhs()`` to set the slow function and
+``TimeIntegrator::set_fast_rhs()`` to set the fast function. With multirate
+methods, one also needs to select the fast time scale method type using the
+input option ``integration.sundials.fast_type`` which may be set to ``ERK`` or
+``DIRK``.
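+
+The sketch below illustrates how these functions might be attached. It reuses
+the ``integrator``, ``Sold``, ``Snew``, ``time``, and ``dt`` variables from the
+earlier example; the argument order assumed for ``set_imex_rhs()`` (implicit
+function first, then explicit) is illustrative rather than authoritative, so
+consult the ``TimeIntegrator`` header for the exact interface.
+
+.. highlight:: c++
+
+::
+
+    // Implicit (stiff) part of the right-hand side
+    auto rhs_implicit = [&](MultiFab& rhs, MultiFab& state, const Real time){
+        // [Calculate the stiff part of the rhs given the state MultiFab]
+    };
+
+    // Explicit (non-stiff) part of the right-hand side
+    auto rhs_explicit = [&](MultiFab& rhs, MultiFab& state, const Real time){
+        // [Calculate the non-stiff part of the rhs given the state MultiFab]
+    };
+
+    // For an ImEx method (integration.sundials.type = IMEX-RK)
+    integrator.set_imex_rhs(rhs_implicit, rhs_explicit);
+
+    // For a multirate method (EX-MRI, IM-MRI, or IMEX-MRI), one would
+    // instead attach slow and fast right-hand side functions:
+    //   integrator.set_rhs(rhs_slow);
+    //   integrator.set_fast_rhs(rhs_fast);
+
+    // Integrate forward one step from `time` by `dt` to fill Snew
+    integrator.advance(Sold, Snew, time, dt);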
+
+To select a specific SUNDIALS method, use the input option
+``integration.sundials.method`` for ERK and DIRK methods as well as for the
+slow time scale method with an MRI integrator. Use
+``integration.sundials.method_i`` and ``integration.sundials.method_e`` to set
+the implicit and explicit methods in an ImEx method, and
+``integration.sundials.fast_method`` to set the ERK or DIRK method used at the
+fast time scale with an MRI integrator. These options may be set to any valid
+SUNDIALS method name; see the following sections in the SUNDIALS
+documentation for more details:
+
+* `ERK methods `_
+* `DIRK methods `_
+* `ImEx methods `_
+* `MRI methods `_
+
+The full set of integrator options is detailed as follows:
+
+::
+
+    # INTEGRATION WITH SUNDIALS
+
+    # *** Select the SUNDIALS integrator backend ***
+    integration.type = SUNDIALS
+
+    # *** Select the SUNDIALS method type ***
+    # ERK      = Explicit Runge-Kutta method
+    # DIRK     = Diagonally Implicit Runge-Kutta method
+    # IMEX-RK  = Implicit-Explicit Additive Runge-Kutta method
+    # EX-MRI   = Explicit Multirate Infinitesimal method
+    # IM-MRI   = Implicit Multirate Infinitesimal method
+    # IMEX-MRI = Implicit-Explicit Multirate Infinitesimal method
+    integration.sundials.type = ERK
+
+    # *** Select a specific SUNDIALS ERK method ***
+    integration.sundials.method = ARKODE_BOGACKI_SHAMPINE_4_2_3
+
+    # *** Select a specific SUNDIALS ImEx method ***
+    integration.sundials.method_i = ARKODE_ARK2_DIRK_3_1_2
+    integration.sundials.method_e = ARKODE_ARK2_ERK_3_1_2
+
+    # *** Select a specific SUNDIALS MRI method ***
+    integration.sundials.method = ARKODE_MIS_KW3
+    integration.sundials.fast_method = ARKODE_KNOTH_WOLKE_3_3
+
+The features of this interface evolve with the needs of our codes, so they may
+not yet support all SUNDIALS configurations available. If you find you need
+SUNDIALS options we have not implemented, please let us know.
diff --git a/Docs/sphinx_documentation/source/index.rst b/Docs/sphinx_documentation/source/index.rst
index d302af07468..203545cf40a 100644
--- a/Docs/sphinx_documentation/source/index.rst
+++ b/Docs/sphinx_documentation/source/index.rst
@@ -57,7 +57,7 @@ Documentation on migration from BoxLib is available in the AMReX repository at D
    Visualization_Chapter
    Post_Processing_Chapter
    Debugging
-   Inputs_Chapter
+   RuntimeParameters
    AMReX_Profiling_Tools_Chapter
    External_Profiling_Tools_Chapter
    External_Frameworks_Chapter
@@ -69,13 +69,6 @@ Documentation on migration from BoxLib is available in the AMReX repository at D
    :maxdepth: 1
    :caption: API
 
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
-
 The copyright notice of AMReX is included in the AMReX home
 directory as README.txt.
 Your use of this software is under the 3-clause BSD license -- the license agreement is included in the
diff --git a/README.md b/README.md
index 02ea15dcbfb..64aad625bfe 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ refinement applications.
[Gallery](#Gallery) - [Get Help](#get-help) - [Contribute](#Contribute) - +[Copyright Notice](#copyright-notice) - [License](#License) - [Citation](#Citation) diff --git a/Src/Amr/AMReX_Amr.H b/Src/Amr/AMReX_Amr.H index 3d9df9484ec..627285f9d9a 100644 --- a/Src/Amr/AMReX_Amr.H +++ b/Src/Amr/AMReX_Amr.H @@ -181,7 +181,7 @@ public: static void fillDerivePlotVarList (); static void fillDeriveSmallPlotVarList (); - static void setComputeNewDtOnRegrid (int flag) { compute_new_dt_on_regrid = flag; } + static void setComputeNewDtOnRegrid (bool flag) { compute_new_dt_on_regrid = flag; } static void Initialize (); static void Finalize (); @@ -430,7 +430,7 @@ protected: int small_plot_int; //!< How often small plotfile (# of time steps) Real small_plot_per; //!< How often small plotfile (in units of time) Real small_plot_log_per; //!< How often small plotfile (in units of log10(time)) - int write_plotfile_with_checkpoint; //!< Write out a plotfile whenever we checkpoint + bool write_plotfile_with_checkpoint; //!< Write out a plotfile whenever we checkpoint int file_name_digits; //!< How many digits to use in the plotfile and checkpoint names int message_int; //!< How often checking messages touched by user, such as "stop_run" std::string plot_file_root; //!< Root name of plotfile. @@ -473,7 +473,7 @@ protected: static Vector initial_ba; //! Array of BoxArrays read in to externally define grid hierarchy at each regrid static Vector regrid_ba; - static int compute_new_dt_on_regrid; + static bool compute_new_dt_on_regrid; #if defined(AMREX_USE_SENSEI_INSITU) && !defined(AMREX_NO_SENSEI_AMR_INST) static AmrInSituBridge *insitu_bridge; diff --git a/Src/Amr/AMReX_Amr.cpp b/Src/Amr/AMReX_Amr.cpp index 8df6f5cd423..2f8c110359b 100644 --- a/Src/Amr/AMReX_Amr.cpp +++ b/Src/Amr/AMReX_Amr.cpp @@ -59,7 +59,7 @@ bool Amr::first_plotfile; bool Amr::first_smallplotfile; Vector Amr::initial_ba; Vector Amr::regrid_ba; -int Amr::compute_new_dt_on_regrid; +bool Amr::compute_new_dt_on_regrid; #if defined(AMREX_USE_SENSEI_INSITU) && !defined(AMREX_NO_SENSEI_AMR_INST) AmrInSituBridge* Amr::insitu_bridge; #endif @@ -80,12 +80,12 @@ namespace #endif bool plot_files_output; int checkpoint_nfiles; - int regrid_on_restart; - int force_regrid_level_zero; - int use_efficient_regrid; - int plotfile_on_restart; - int insitu_on_restart; - int checkpoint_on_restart; + bool regrid_on_restart; + bool force_regrid_level_zero; + bool use_efficient_regrid; + bool plotfile_on_restart; + bool insitu_on_restart; + bool checkpoint_on_restart; bool checkpoint_files_output; bool precreateDirectories; bool prereadFAHeaders; @@ -117,14 +117,14 @@ Amr::Initialize () #endif plot_files_output = true; checkpoint_nfiles = 64; - regrid_on_restart = 0; - force_regrid_level_zero = 0; - use_efficient_regrid = 0; - plotfile_on_restart = 0; - insitu_on_restart = 0; - checkpoint_on_restart = 0; + regrid_on_restart = false; + force_regrid_level_zero = false; + use_efficient_regrid = false; + plotfile_on_restart = false; + insitu_on_restart = false; + checkpoint_on_restart = false; checkpoint_files_output = true; - compute_new_dt_on_regrid = 0; + compute_new_dt_on_regrid = false; precreateDirectories = true; prereadFAHeaders = true; plot_headerversion = VisMF::Header::Version_v1; @@ -257,17 +257,17 @@ Amr::InitAmr () // pp.queryAdd("regrid_on_restart",regrid_on_restart); pp.queryAdd("force_regrid_level_zero",force_regrid_level_zero); - pp.queryAdd("use_efficient_regrid",use_efficient_regrid); + pp.query("use_efficient_regrid",use_efficient_regrid); 
pp.queryAdd("plotfile_on_restart",plotfile_on_restart); - pp.queryAdd("insitu_on_restart",insitu_on_restart); + pp.query("insitu_on_restart",insitu_on_restart); pp.queryAdd("checkpoint_on_restart",checkpoint_on_restart); pp.queryAdd("compute_new_dt_on_regrid",compute_new_dt_on_regrid); - pp.queryAdd("mffile_nstreams", mffile_nstreams); + pp.query("mffile_nstreams", mffile_nstreams); #ifndef AMREX_NO_PROBINIT - pp.queryAdd("probinit_natonce", probinit_natonce); + pp.query("probinit_natonce", probinit_natonce); probinit_natonce = std::max(1, std::min(ParallelDescriptor::NProcs(), probinit_natonce)); #endif @@ -502,13 +502,13 @@ Amr::InitAmr () } loadbalance_with_workestimates = 0; - pp.queryAdd("loadbalance_with_workestimates", loadbalance_with_workestimates); + pp.query("loadbalance_with_workestimates", loadbalance_with_workestimates); loadbalance_level0_int = 2; - pp.queryAdd("loadbalance_level0_int", loadbalance_level0_int); + pp.query("loadbalance_level0_int", loadbalance_level0_int); loadbalance_max_fac = 1.5; - pp.queryAdd("loadbalance_max_fac", loadbalance_max_fac); + pp.query("loadbalance_max_fac", loadbalance_max_fac); } int @@ -2008,7 +2008,7 @@ Amr::timeStep (int level, // if (plotfile_on_restart && ! (restart_chkfile.empty()) ) { - plotfile_on_restart = 0; + plotfile_on_restart = false; writePlotFile(); } // @@ -2610,7 +2610,7 @@ Amr::regrid (int lbase, // // If use_efficient_regrid flag is set and grids are unchanged, then don't do anything more here. // - if (use_efficient_regrid == 1 && grids_unchanged ) + if (use_efficient_regrid == true && grids_unchanged ) { if (verbose > 0) { amrex::Print() << "Regridding at level lbase = " << lbase @@ -2673,7 +2673,7 @@ Amr::regrid (int lbase, { // // Init with data from old structure then remove old structure. - // NOTE: The init function may use a filPatch from the old level, + // NOTE: The init function may use a fillPatch from the old level, // which therefore needs remain in the hierarchy during the call. // a->init(*amr_level[lev]); @@ -2820,7 +2820,7 @@ Amr::InstallNewDistributionMap (int lev, const DistributionMapping& newdm) void Amr::regrid_level_0_on_restart() { - regrid_on_restart = 0; + regrid_on_restart = false; // // Coarsening before we split the grids ensures that each resulting // grid will have an even number of cells in each direction. 
@@ -3121,6 +3121,7 @@ Amr::initSubcycle () BL_PROFILE("Amr::initSubcycle()"); ParmParse pp("amr"); sub_cycle = true; + subcycling_mode = "Auto"; if (pp.contains("nosub")) { if (verbose) { @@ -3136,11 +3137,8 @@ Amr::initSubcycle () } subcycling_mode = "None"; } - else - { - subcycling_mode = "Auto"; - pp.queryAdd("subcycling_mode",subcycling_mode); - } + + pp.queryAdd("subcycling_mode",subcycling_mode); if (subcycling_mode == "None") { @@ -3292,25 +3290,25 @@ Amr::initPltAndChk () } } - write_plotfile_with_checkpoint = 1; + write_plotfile_with_checkpoint = true; pp.queryAdd("write_plotfile_with_checkpoint",write_plotfile_with_checkpoint); stream_max_tries = 4; - pp.queryAdd("stream_max_tries",stream_max_tries); + pp.query("stream_max_tries",stream_max_tries); stream_max_tries = std::max(stream_max_tries, 1); abort_on_stream_retry_failure = false; - pp.queryAdd("abort_on_stream_retry_failure",abort_on_stream_retry_failure); + pp.query("abort_on_stream_retry_failure",abort_on_stream_retry_failure); - pp.queryAdd("precreateDirectories", precreateDirectories); - pp.queryAdd("prereadFAHeaders", prereadFAHeaders); + pp.query("precreateDirectories", precreateDirectories); + pp.query("prereadFAHeaders", prereadFAHeaders); int phvInt(plot_headerversion), chvInt(checkpoint_headerversion); - pp.queryAdd("plot_headerversion", phvInt); + pp.query("plot_headerversion", phvInt); if(phvInt != plot_headerversion) { plot_headerversion = static_cast (phvInt); } - pp.queryAdd("checkpoint_headerversion", chvInt); + pp.query("checkpoint_headerversion", chvInt); if(chvInt != checkpoint_headerversion) { checkpoint_headerversion = static_cast (chvInt); } diff --git a/Src/AmrCore/AMReX_AmrMesh.H b/Src/AmrCore/AMReX_AmrMesh.H index f5d49f5c5da..c915f6011d4 100644 --- a/Src/AmrCore/AMReX_AmrMesh.H +++ b/Src/AmrCore/AMReX_AmrMesh.H @@ -26,7 +26,11 @@ struct AmrInfo { //! Blocking factor in grid generation (by level). Vector blocking_factor {{IntVect(8)}}; //! Maximum allowable grid size (by level). +#if defined(AMREX_USE_GPU) + Vector max_grid_size {{IntVect(AMREX_D_PICK(128,128,64))}}; +#else Vector max_grid_size {{IntVect(AMREX_D_PICK(128,128,32))}}; +#endif //! Buffer cells around each tagged cell. Vector n_error_buf {{IntVect(1)}}; //! Grid efficiency. @@ -178,7 +182,7 @@ public: void SetGeometry (int lev, const Geometry& geom_in) noexcept; //! Given domain box, return AMR level. Return -1 if there is no match. - int GetLevel (Box const& domain) noexcept; + [[nodiscard]] int GetLevel (Box const& domain) const noexcept; void ClearDistributionMap (int lev) noexcept; void ClearBoxArray (int lev) noexcept; @@ -201,7 +205,7 @@ public: //! Return the largest allowable grid. [[nodiscard]] const IntVect& maxGridSize (int lev) const noexcept { return max_grid_size[lev]; } - [[nodiscard]] bool LevelDefined (int lev) noexcept; + [[nodiscard]] bool LevelDefined (int lev) const noexcept; //! Should we keep the coarser grids fixed (and not regrid those levels) at all? 
[[nodiscard]] bool useFixedCoarseGrids () const noexcept { return use_fixed_coarse_grids; } @@ -255,7 +259,7 @@ public: [[nodiscard]] virtual BoxArray GetAreaNotToTag (int /*lev*/) { return BoxArray(); } - [[nodiscard]] long CountCells (int lev) noexcept; + [[nodiscard]] Long CountCells (int lev) const noexcept; [[nodiscard]] virtual DistributionMapping MakeDistributionMap (int lev, BoxArray const& ba); diff --git a/Src/AmrCore/AMReX_AmrMesh.cpp b/Src/AmrCore/AMReX_AmrMesh.cpp index 0ed59002f2e..388ba55dcf2 100644 --- a/Src/AmrCore/AMReX_AmrMesh.cpp +++ b/Src/AmrCore/AMReX_AmrMesh.cpp @@ -78,7 +78,9 @@ AmrMesh::InitAmrMesh (int max_level_in, const Vector& n_cell_in, { ParmParse pp("amr"); - pp.queryAdd("v",verbose); + if (! pp.query("verbose", "v", verbose)) { + pp.add("verbose", verbose); + } if (max_level_in == -1) { pp.get("max_level", max_level); @@ -89,28 +91,17 @@ AmrMesh::InitAmrMesh (int max_level_in, const Vector& n_cell_in, int nlev = max_level + 1; - blocking_factor.resize(nlev); - max_grid_size.resize(nlev); - n_error_buf.resize(nlev); + AmrInfo def_amr_info; + + blocking_factor.resize(nlev, def_amr_info.blocking_factor.back()); + max_grid_size.resize (nlev, def_amr_info.max_grid_size.back()); + n_error_buf.resize (nlev, def_amr_info.n_error_buf.back()); + ref_ratio.resize (nlev, def_amr_info.ref_ratio.back()); geom.resize(nlev); dmap.resize(nlev); grids.resize(nlev); - for (int i = 0; i < nlev; ++i) { - n_error_buf[i] = IntVect{AMREX_D_DECL(1,1,1)}; - blocking_factor[i] = IntVect{AMREX_D_DECL(8,8,8)}; - max_grid_size[i] = (AMREX_SPACEDIM == 2) ? IntVect{AMREX_D_DECL(128,128,128)} - : IntVect{AMREX_D_DECL(32,32,32)}; - } - - // Make the default ref_ratio = 2 for all levels. - ref_ratio.resize(max_level); - for (int i = 0; i < max_level; ++i) - { - ref_ratio[i] = 2 * IntVect::TheUnitVector(); - } - pp.queryAdd("n_proper",n_proper); pp.queryAdd("grid_eff",grid_eff); int cnt = pp.countval("n_error_buf"); @@ -371,11 +362,15 @@ AmrMesh::InitAmrMesh (int max_level_in, const Vector& n_cell_in, { pp.queryAdd("refine_grid_layout", refine_grid_layout); - refine_grid_layout_dims = IntVect(refine_grid_layout); - AMREX_D_TERM(pp.queryAdd("refine_grid_layout_x", refine_grid_layout_dims[0]);, - pp.queryAdd("refine_grid_layout_y", refine_grid_layout_dims[1]);, - pp.queryAdd("refine_grid_layout_z", refine_grid_layout_dims[2])); - + std::array tmp{AMREX_D_DECL(refine_grid_layout, + refine_grid_layout, + refine_grid_layout)}; + AMREX_D_TERM(pp.query("refine_grid_layout_x", tmp[0]);, + pp.query("refine_grid_layout_y", tmp[1]);, + pp.query("refine_grid_layout_z", tmp[2])); + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + refine_grid_layout_dims[idim] = tmp[idim]; + } refine_grid_layout = refine_grid_layout_dims != 0; } @@ -421,9 +416,9 @@ AmrMesh::SetGeometry (int lev, const Geometry& geom_in) noexcept } int -AmrMesh::GetLevel (Box const& domain) noexcept +AmrMesh::GetLevel (Box const& domain) const noexcept { - Box ccdomain = amrex::enclosedCells(domain); + Box const& ccdomain = amrex::enclosedCells(domain); for (int lev = 0; lev < geom.size(); ++lev) { if (geom[lev].Domain() == ccdomain) { return lev; } } @@ -443,7 +438,7 @@ AmrMesh::ClearBoxArray (int lev) noexcept } bool -AmrMesh::LevelDefined (int lev) noexcept +AmrMesh::LevelDefined (int lev) const noexcept { return lev <= max_level && !grids[lev].empty() && !dmap[lev].empty(); } @@ -476,6 +471,9 @@ AmrMesh::ChopGrids (int lev, BoxArray& ba, int target_size) const IntVect chunk = max_grid_size[lev]; 
chunk.min(Geom(lev).Domain().length()); + // Note that ba already satisfies the max_grid_size requirement and it's + // coarsenable if it's a fine level BoxArray. + while (ba.size() < target_size) { IntVect chunk_prev = chunk; @@ -490,11 +488,24 @@ AmrMesh::ChopGrids (int lev, BoxArray& ba, int target_size) const int idim = chunk_dir[idx].second; if (refine_grid_layout_dims[idim]) { int new_chunk_size = chunk[idim] / 2; + int rr = (lev > 0) ? ref_ratio[lev-1][idim] : 1; + if (rr > 1) { + new_chunk_size = (new_chunk_size/rr) * rr; + } if (new_chunk_size != 0 && new_chunk_size%blocking_factor[lev][idim] == 0) { chunk[idim] = new_chunk_size; - ba.maxSize(chunk); + if (rr == 1) { + ba.maxSize(chunk); + } else { + IntVect bf(1); + bf[idim] = rr; + // Note that only idim-direction will be chopped by + // minmaxSize because the sizes in other directions + // are already smaller than chunk. + ba.minmaxSize(bf, chunk); + } break; } } @@ -1216,8 +1227,8 @@ AmrMesh::checkInput () } } -long -AmrMesh::CountCells (int lev) noexcept +Long +AmrMesh::CountCells (int lev) const noexcept { return grids[lev].numPts(); } diff --git a/Src/AmrCore/AMReX_ErrorList.cpp b/Src/AmrCore/AMReX_ErrorList.cpp index c9d1a7feef8..7e72cd956a7 100644 --- a/Src/AmrCore/AMReX_ErrorList.cpp +++ b/Src/AmrCore/AMReX_ErrorList.cpp @@ -258,9 +258,9 @@ AMRErrorTag::operator() (TagBoxArray& tba, auto const& tagma = tba.arrays(); if (m_test == BOX) { - const auto plo = geom.ProbLoArray(); - const auto dx = geom.CellSizeArray(); - const auto tag_rb = m_info.m_realbox; + const auto& plo = geom.ProbLoArray(); + const auto& dx = geom.CellSizeArray(); + const auto& tag_rb = m_info.m_realbox; ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept { GpuArray pt @@ -277,7 +277,7 @@ AMRErrorTag::operator() (TagBoxArray& tba, auto const& datma = mf->const_arrays(); auto threshold = m_value[level]; auto const volume_weighting = m_info.m_volume_weighting; - auto geomdata = geom.data(); + auto const& geomdata = geom.data(); auto tag_update = tagval; if (m_info.m_derefine) { tag_update = clearval; diff --git a/Src/AmrCore/AMReX_FillPatchUtil_I.H b/Src/AmrCore/AMReX_FillPatchUtil_I.H index 02b5523cab5..15aaf3004dd 100644 --- a/Src/AmrCore/AMReX_FillPatchUtil_I.H +++ b/Src/AmrCore/AMReX_FillPatchUtil_I.H @@ -519,7 +519,7 @@ namespace detail { const FabArrayBase::CPC mask_cpc( mf_solution, IntVect::TheZeroVector(), mf_known, IntVect::TheZeroVector(), - fgeom.periodicity()); + cgeom.periodicity()); solve_mask.setVal(1); // Values to solve. solve_mask.setVal(0, mask_cpc, 0, 1); // Known values. @@ -681,7 +681,7 @@ namespace detail { const FabArrayBase::CPC mask_cpc( mf_solution, IntVect::TheZeroVector(), mf_known, IntVect::TheZeroVector(), - fgeom.periodicity() ); + cgeom.periodicity() ); solve_mask[d].setVal(1); // Values to solve. solve_mask[d].setVal(0, mask_cpc, 0, 1); // Known values. 
diff --git a/Src/AmrCore/AMReX_Interp_3D_C.H b/Src/AmrCore/AMReX_Interp_3D_C.H index b29ba4aca40..382962d11b2 100644 --- a/Src/AmrCore/AMReX_Interp_3D_C.H +++ b/Src/AmrCore/AMReX_Interp_3D_C.H @@ -294,13 +294,13 @@ facediv_int (int ci, int cj, int ck, int nf, + dy*(2*dx*dx+dy*dy)/(8*dz*xspys)*(w000+w012-w002-w010) + dy3/(8*dz*xspys)*(w100+w112-w102-w110); - fine[1](fi+1, fj+1, fk , nf) = Real(0.5)*(v001+v021) + fine[1](fi+1, fj+1, fk , nf) = Real(0.5)*(v100+v120) + dy*(2*dz*dz+dy*dy)/(8*dx*yspzs)*(u000+u210-u010-u200) + dy3/(8*dx*yspzs)*(u001+u211-u011-u201) + dy*(2*dx*dx+dy*dy)/(8*dz*xspys)*(w100+w112-w102-w110) + dy3/(8*dz*xspys)*(w000+w012-w002-w010); - fine[1](fi , fj+1, fk+1, nf) = Real(0.5)*(v100+v120) + fine[1](fi , fj+1, fk+1, nf) = Real(0.5)*(v001+v021) + dy*(2*dz*dz+dy*dy)/(8*dx*yspzs)*(u001+u211-u011-u201) + dy3/(8*dx*yspzs)*(u000+u210-u010-u200) + dy*(2*dx*dx+dy*dy)/(8*dz*xspys)*(w000+w012-w002-w010) @@ -319,17 +319,17 @@ facediv_int (int ci, int cj, int ck, int nf, + dz3/(8*dy*zspxs)*(v100+v121-v101-v120); fine[2](fi , fj+1, fk+1, nf) = Real(0.5)*(w010+w012) - + dz*(2*dy*dy+dz*dz)/(8*dx*yspzs)*(u000+u201-u001-u200) - + dz3/(8*dx*yspzs)*(u010+u211-u011-u210) - + dz*(2*dx*dx+dz*dz)/(8*dy*zspxs)*(v100+v121-v101-v120) - + dz3/(8*dy*zspxs)*(v000+v021-v001-v020); - - fine[2](fi+1, fj , fk+1, nf) = Real(0.5)*(w100+w102) + dz*(2*dy*dy+dz*dz)/(8*dx*yspzs)*(u010+u211-u011-u210) + dz3/(8*dx*yspzs)*(u000+u201-u001-u200) + dz*(2*dx*dx+dz*dz)/(8*dy*zspxs)*(v000+v021-v001-v020) + dz3/(8*dy*zspxs)*(v100+v121-v101-v120); + fine[2](fi+1, fj , fk+1, nf) = Real(0.5)*(w100+w102) + + dz*(2*dy*dy+dz*dz)/(8*dx*yspzs)*(u000+u201-u001-u200) + + dz3/(8*dx*yspzs)*(u010+u211-u011-u210) + + dz*(2*dx*dx+dz*dz)/(8*dy*zspxs)*(v100+v121-v101-v120) + + dz3/(8*dy*zspxs)*(v000+v021-v001-v020); + fine[2](fi+1, fj+1, fk+1, nf) = Real(0.5)*(w110+w112) + dz*(2*dy*dy+dz*dz)/(8*dx*yspzs)*(u010+u211-u011-u210) + dz3/(8*dx*yspzs)*(u000+u201-u001-u200) diff --git a/Src/AmrCore/AMReX_Interpolater.H b/Src/AmrCore/AMReX_Interpolater.H index c13fb283a35..6333112aae8 100644 --- a/Src/AmrCore/AMReX_Interpolater.H +++ b/Src/AmrCore/AMReX_Interpolater.H @@ -565,6 +565,9 @@ public: * of the divergence of the underlying crse cell. All fine cells overlying * a given coarse cell will have the same divergence, even when the coarse * grid divergence is spatially varying. +* Based on Vanella et. al. (doi:10.1016/j.jcp.2010.05.003, section 3.2), +* but solves the interior closure problem using least squares with an +* initial guess equal to the average of fine face values across the cell. 
*/ class FaceDivFree : diff --git a/Src/Base/AMReX.H b/Src/Base/AMReX.H index 2b88553bcdf..4094d4f2ff2 100644 --- a/Src/Base/AMReX.H +++ b/Src/Base/AMReX.H @@ -10,10 +10,12 @@ #include #include +#include #include #include #include #include +#include #include // @@ -47,23 +49,25 @@ namespace amrex extern AMREX_EXPORT int verbose; - extern AMREX_EXPORT int signal_handling; - extern AMREX_EXPORT int handle_sigsegv; - extern AMREX_EXPORT int handle_sigterm; - extern AMREX_EXPORT int handle_sigint; - extern AMREX_EXPORT int handle_sigabrt; - extern AMREX_EXPORT int handle_sigfpe; + extern AMREX_EXPORT bool signal_handling; + extern AMREX_EXPORT bool handle_sigsegv; + extern AMREX_EXPORT bool handle_sigterm; + extern AMREX_EXPORT bool handle_sigint; + extern AMREX_EXPORT bool handle_sigabrt; + extern AMREX_EXPORT bool handle_sigfpe; - extern AMREX_EXPORT int call_addr2line; - extern AMREX_EXPORT int throw_exception; + extern AMREX_EXPORT bool call_addr2line; + extern AMREX_EXPORT bool throw_exception; - extern AMREX_EXPORT int regtest_reduction; + extern AMREX_EXPORT bool regtest_reduction; extern AMREX_EXPORT std::ostream* osout; extern AMREX_EXPORT std::ostream* oserr; extern AMREX_EXPORT ErrorHandler error_handler; - extern AMREX_EXPORT int abort_on_unused_inputs; + extern AMREX_EXPORT bool abort_on_unused_inputs; + + extern AMREX_EXPORT bool init_snan; } /** the AMReX "git describe" version */ @@ -197,6 +201,9 @@ namespace amrex [[nodiscard]] int Verbose () noexcept; void SetVerbose (int v) noexcept; + [[nodiscard]] bool InitSNaN () noexcept; + void SetInitSNaN (bool v) noexcept; + // ! Get the entire command line including the executable [[nodiscard]] std::string get_command (); @@ -277,6 +284,65 @@ namespace amrex Geometry* m_geom = nullptr; }; + + enum struct FPExcept : std::uint8_t { + none = 0B0000, + invalid = 0B0001, + zero = 0B0010, + overflow = 0B0100, + all = 0B0111 + }; + + [[nodiscard]] inline bool any (FPExcept a) { return a != FPExcept::none; } + + [[nodiscard]] inline FPExcept operator| (FPExcept a, FPExcept b) + { + using T = std::underlying_type_t; + return static_cast(static_cast(a) | static_cast(b)); + } + + [[nodiscard]] inline FPExcept operator& (FPExcept a, FPExcept b) + { + using T = std::underlying_type_t; + return static_cast(static_cast(a) & static_cast(b)); + } + + //! Return currently enabled FP exceptions. Linux only. + [[nodiscard]] FPExcept getFPExcept (); + + //! Set FP exception traps. Linux only. This enables set flags and + //! DISABLES unset flags. This can be used to restore previous settings. + FPExcept setFPExcept (FPExcept excepts); + + /** + * \brief Disable FP exceptions. Linux Only + * + * This function disables given exception traps and keeps the status of + * the others. The example below disables FPE invalid and + * divide-by-zero, and later restores the previous settings. + * + \verbatim + auto prev_excepts = disableFPExcept(FPExcept::invalid | FPExcept::zero); + // .... + setFPExcept(prev_excepts); // restore previous settings + \endverbatim + */ + [[nodiscard]] FPExcept disableFPExcept (FPExcept excepts); + + /** + * \brief Enable FP exceptions. Linux Only + * + * This function enables given exception traps and keeps the status of + * the others. The example below enables all FPE traps, and later + * restores the previous settings. + * + \verbatim + auto prev_excepts = disableFPExcept(FPExcept::all); + // .... 
+ setFPExcept(prev_excepts); // restore previous settings + \endverbatim + */ + [[nodiscard]] FPExcept enableFPExcept (FPExcept excepts); } #endif /*BL_AMREX_H*/ diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp index 2d6e7626c80..b4aa8f44909 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -99,20 +99,25 @@ namespace system { std::string exename; int verbose = 1; - int signal_handling; - int handle_sigsegv; - int handle_sigterm; - int handle_sigint; - int handle_sigabrt; - int handle_sigfpe; - int handle_sigill; - int call_addr2line; - int throw_exception; - int regtest_reduction; - int abort_on_unused_inputs = 0; + bool signal_handling; + bool handle_sigsegv; + bool handle_sigterm; + bool handle_sigint; + bool handle_sigabrt; + bool handle_sigfpe; + bool handle_sigill; + bool call_addr2line; + bool throw_exception; + bool regtest_reduction; + bool abort_on_unused_inputs = false; std::ostream* osout = &std::cout; std::ostream* oserr = &std::cerr; ErrorHandler error_handler = nullptr; +#if defined(AMREX_DEBUG) || defined(AMREX_TESTING) + bool init_snan = true; +#else + bool init_snan = false; +#endif } } @@ -143,11 +148,11 @@ namespace { #ifdef AMREX_USE_HYPRE namespace { - int init_hypre = 1; + bool init_hypre = true; #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - int hypre_spgemm_use_vendor = 0; - int hypre_spmv_use_vendor = 0; - int hypre_sptrans_use_vendor = 0; + bool hypre_spgemm_use_vendor = false; + bool hypre_spmv_use_vendor = false; + bool hypre_sptrans_use_vendor = false; #endif } #endif @@ -156,6 +161,10 @@ int amrex::Verbose () noexcept { return amrex::system::verbose; } void amrex::SetVerbose (int v) noexcept { amrex::system::verbose = v; } +bool amrex::InitSNaN () noexcept { return amrex::system::init_snan; } + +void amrex::SetInitSNaN (bool v) noexcept { amrex::system::init_snan = v; } + void amrex::SetErrorHandler (amrex::ErrorHandler f) { amrex::system::error_handler = f; } @@ -327,16 +336,16 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, { system::exename.clear(); // system::verbose = 0; - system::regtest_reduction = 0; - system::signal_handling = 1; - system::handle_sigsegv = 1; - system::handle_sigterm = 0; - system::handle_sigint = 1; - system::handle_sigabrt = 1; - system::handle_sigfpe = 1; - system::handle_sigill = 1; - system::call_addr2line = 1; - system::throw_exception = 0; + system::regtest_reduction = false; + system::signal_handling = true; + system::handle_sigsegv = true; + system::handle_sigterm = false; + system::handle_sigint = true; + system::handle_sigabrt = true; + system::handle_sigfpe = true; + system::handle_sigill = true; + system::call_addr2line = true; + system::throw_exception = false; system::osout = &a_osout; system::oserr = &a_oserr; system::error_handler = a_errhandler; @@ -442,8 +451,10 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, { ParmParse pp("amrex"); - pp.queryAdd("v", system::verbose); - pp.queryAdd("verbose", system::verbose); + if (! 
pp.query("verbose", "v", system::verbose)) { + pp.add("verbose", system::verbose); + } + pp.queryAdd("init_snan", system::init_snan); } if (system::verbose > 0) { @@ -501,10 +512,10 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, { ParmParse pp("amrex"); - pp.queryAdd("regtest_reduction", system::regtest_reduction); + pp.query("regtest_reduction", system::regtest_reduction); pp.queryAdd("signal_handling", system::signal_handling); pp.queryAdd("throw_exception", system::throw_exception); - pp.queryAdd("call_addr2line", system::call_addr2line); + pp.query("call_addr2line", system::call_addr2line); pp.queryAdd("abort_on_unused_inputs", system::abort_on_unused_inputs); #ifdef AMREX_USE_SYCL @@ -550,10 +561,21 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, prev_handler_sigabrt = SIG_ERR; // NOLINT(performance-no-int-to-ptr) } - prev_handler_sigfpe = SIG_ERR; // NOLINT(performance-no-int-to-ptr) + if (system::handle_sigfpe) { + prev_handler_sigfpe = std::signal(SIGFPE, BLBackTrace::handler); + } else { + prev_handler_sigfpe = SIG_ERR; // NOLINT(performance-no-int-to-ptr) + } + + if (system::handle_sigill) { + prev_handler_sigill = std::signal(SIGILL, BLBackTrace::handler); + } else { + prev_handler_sigill = SIG_ERR; // NOLINT(performance-no-int-to-ptr) + } + if (system::handle_sigfpe) { - int invalid = 0, divbyzero=0, overflow=0; + bool invalid = false, divbyzero=false, overflow=false; pp.queryAdd("fpe_trap_invalid", invalid); pp.queryAdd("fpe_trap_zero", divbyzero); pp.queryAdd("fpe_trap_overflow", overflow); @@ -566,7 +588,6 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, prev_fpe_excepts = fegetexcept(); if (curr_fpe_excepts != 0) { feenableexcept(curr_fpe_excepts); // trap floating point exceptions - prev_handler_sigfpe = std::signal(SIGFPE, BLBackTrace::handler); } #elif defined(__APPLE__) && defined(__x86_64__) @@ -577,16 +598,14 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, if (overflow) { curr_fpe_excepts |= _MM_MASK_OVERFLOW; } if (curr_fpe_excepts != 0u) { _MM_SET_EXCEPTION_MASK(prev_fpe_mask & ~curr_fpe_excepts); - prev_handler_sigfpe = std::signal(SIGFPE, BLBackTrace::handler); } #endif } - prev_handler_sigill = SIG_ERR; // NOLINT(performance-no-int-to-ptr) +#if defined(__APPLE__) && defined(__aarch64__) if (system::handle_sigill) { -#if defined(__APPLE__) && defined(__aarch64__) - int invalid = 0, divbyzero=0, overflow=0; + bool invalid = false, divbyzero=false, overflow=false; pp.queryAdd("fpe_trap_invalid", invalid); pp.queryAdd("fpe_trap_zero", divbyzero); pp.queryAdd("fpe_trap_overflow", overflow); @@ -598,9 +617,8 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, if (overflow) { env.__fpcr |= __fpcr_trap_overflow; } fesetenv(&env); // SIGILL ref: https://developer.apple.com/forums/thread/689159 -#endif - prev_handler_sigill = std::signal(SIGILL, BLBackTrace::handler); } +#endif } #ifdef AMREX_USE_HYPRE @@ -913,4 +931,63 @@ AMReX::erase (AMReX* pamrex) } } +FPExcept getFPExcept () +{ + auto r = FPExcept::none; +#if defined(__linux__) + auto excepts = fegetexcept(); + if (excepts & FE_INVALID ) { r = r | FPExcept::invalid ; } + if (excepts & FE_DIVBYZERO) { r = r | FPExcept::zero ; } + if (excepts & FE_OVERFLOW ) { r = r | FPExcept::overflow; } +#endif + return r; +} + +FPExcept setFPExcept (FPExcept excepts) +{ + auto prev = getFPExcept(); +#if defined(__linux__) + int flags = FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW; + fedisableexcept(flags); + flags = 0; + if 
(any(excepts & FPExcept::invalid )) { flags |= FE_INVALID ; } + if (any(excepts & FPExcept::zero )) { flags |= FE_DIVBYZERO; } + if (any(excepts & FPExcept::overflow)) { flags |= FE_OVERFLOW ; } + feenableexcept(flags); +#else + amrex::ignore_unused(excepts); +#endif + return prev; +} + +FPExcept disableFPExcept (FPExcept excepts) +{ + auto prev = getFPExcept(); +#if defined(__linux__) + int flags = 0; + if (any(excepts & FPExcept::invalid )) { flags |= FE_INVALID ; } + if (any(excepts & FPExcept::zero )) { flags |= FE_DIVBYZERO; } + if (any(excepts & FPExcept::overflow)) { flags |= FE_OVERFLOW ; } + fedisableexcept(flags); +#else + amrex::ignore_unused(excepts); +#endif + return prev; +} + +FPExcept enableFPExcept (FPExcept excepts) +{ + auto prev = getFPExcept(); +#if defined(__linux__) + int flags = 0; + if (any(excepts & FPExcept::invalid )) { flags |= FE_INVALID ; } + if (any(excepts & FPExcept::zero )) { flags |= FE_DIVBYZERO; } + if (any(excepts & FPExcept::overflow)) { flags |= FE_OVERFLOW ; } + feenableexcept(flags); +#else + amrex::ignore_unused(excepts); +#endif + return prev; +} + } diff --git a/Src/Base/AMReX_Array.H b/Src/Base/AMReX_Array.H index 525133cde87..15ddde4d1e9 100644 --- a/Src/Base/AMReX_Array.H +++ b/Src/Base/AMReX_Array.H @@ -127,6 +127,15 @@ namespace amrex { return p; } + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + GpuArray& operator+= (GpuArray const& a) noexcept + { + for (unsigned int i = 0; i < N; ++i) { + arr[i] += a.arr[i]; + } + return *this; + } + T arr[amrex::max(N,1U)]; }; } diff --git a/Src/Base/AMReX_AsyncOut.cpp b/Src/Base/AMReX_AsyncOut.cpp index b235baa6580..3835c0b137e 100644 --- a/Src/Base/AMReX_AsyncOut.cpp +++ b/Src/Base/AMReX_AsyncOut.cpp @@ -10,7 +10,7 @@ namespace amrex::AsyncOut { namespace { -int s_asyncout = false; +bool s_asyncout = false; int s_noutfiles = 64; MPI_Comm s_comm = MPI_COMM_NULL; diff --git a/Src/Base/AMReX_BLBackTrace.cpp b/Src/Base/AMReX_BLBackTrace.cpp index d511a192728..d065bd71c05 100644 --- a/Src/Base/AMReX_BLBackTrace.cpp +++ b/Src/Base/AMReX_BLBackTrace.cpp @@ -13,12 +13,13 @@ #include #endif +#include +#include +#include +#include +#include #include #include -#include -#include -#include -#include #if !(defined(_MSC_VER) && defined(__CUDACC__)) //MSVC can't pre-processor cfenv with `Zc:preprocessor` @@ -177,6 +178,18 @@ namespace { } return r; } + +#ifdef __linux__ + bool command_exists(std::string const &cmd) + { + // command -v is part of POSIX so should be available + std::string check_command = "command -v " + cmd + " > /dev/null 2>&1"; + int r = std::system(check_command.c_str()); + // return value of std::system is implementation defined and can be + // decoded using WEXITSTATUS but it should be 0 on success + return r == 0; + } +#endif } #endif @@ -209,19 +222,32 @@ BLBackTrace::print_backtrace_info (FILE* f) int have_addr2line = 0; std::string eu_cmd; { - have_eu_addr2line = file_exists("/usr/bin/eu-addr2line"); + if (command_exists("eu-addr2line")) { + have_eu_addr2line = 1; + eu_cmd = "eu-addr2line"; + } else { + std::string eu_fallback_path = "/usr/bin/eu-addr2line"; + have_eu_addr2line = file_exists(eu_fallback_path.c_str()); + eu_cmd = std::move(eu_fallback_path); + } if (have_eu_addr2line) { const pid_t pid = getpid(); // cmd = "/usr/bin/eu-addr2line -C -f -i --pretty-print -p " - eu_cmd = "/usr/bin/eu-addr2line -C -f -i -p " - + std::to_string(pid); + eu_cmd += " -C -f -i -p " + std::to_string(pid); } } std::string cmd; { - have_addr2line = file_exists("/usr/bin/addr2line"); + if 
(command_exists("addr2line")) { + have_addr2line = 1; + cmd = "addr2line"; + } else { + std::string fallback_path = "/usr/bin/addr2line"; + have_addr2line = file_exists(fallback_path.c_str()); + cmd = std::move(fallback_path); + } if (have_addr2line) { - cmd = "/usr/bin/addr2line -Cpfie " + amrex::system::exename; + cmd += " -Cpfie " + amrex::system::exename; } } diff --git a/Src/Base/AMReX_BaseFab.H b/Src/Base/AMReX_BaseFab.H index c1212fb7a0a..db9289894ca 100644 --- a/Src/Base/AMReX_BaseFab.H +++ b/Src/Base/AMReX_BaseFab.H @@ -457,6 +457,15 @@ public: void getVal (T* data, const IntVect& pos, int N, int numcomp) const noexcept; //! Same as above, except that starts at component 0 and copies all comps. void getVal (T* data, const IntVect& pos) const noexcept; + +#if defined(AMREX_USE_GPU) + template || std::is_same_v,int> FOO = 0> + void fill_snan () noexcept; + /** * \brief The setVal functions set sub-regions in the BaseFab to a * constant value. This most general form specifies the sub-box, @@ -1829,6 +1838,15 @@ BaseFab::shiftHalf (int idir, int n_cell) noexcept return *this; } +template +template || std::is_same_v, int> FOO> +void +BaseFab::fill_snan () noexcept +{ + amrex::fill_snan(this->dptr, this->truesize); +} + template template void @@ -1924,6 +1942,20 @@ BaseFab::define () placementNew(this->dptr, this->truesize); amrex::update_fab_stats(this->domain.numPts(), this->truesize, sizeof(T)); + + if constexpr (std::is_same_v || std::is_same_v) { + if (amrex::InitSNaN() && this->truesize > 0) { +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion() && arena()->isDeviceAccessible()) { + this->template fill_snan(); + Gpu::streamSynchronize(); + } else +#endif + { + this->template fill_snan(); + } + } + } } template diff --git a/Src/Base/AMReX_BaseFwd.H b/Src/Base/AMReX_BaseFwd.H index 824a6c504e7..d187673869a 100644 --- a/Src/Base/AMReX_BaseFwd.H +++ b/Src/Base/AMReX_BaseFwd.H @@ -1,5 +1,6 @@ #ifndef AMREX_BASE_FWD_H_ #define AMREX_BASE_FWD_H_ +#include namespace amrex { @@ -21,11 +22,15 @@ class BoxList; class DistributionMapping; class Geometry; -class Box; +template +class BoxND; +using Box = BoxND; template class IntVectND; using IntVect = IntVectND; -class IndexType; +template +class IndexTypeND; +using IndexType = IndexTypeND; struct Dim3; struct XDim3; class RealBox; diff --git a/Src/Base/AMReX_Box.H b/Src/Base/AMReX_Box.H index f29428b1022..7d4cb30c43e 100644 --- a/Src/Base/AMReX_Box.H +++ b/Src/Base/AMReX_Box.H @@ -21,59 +21,61 @@ namespace amrex { +template +class BoxND; +using Box = BoxND; class BoxCommHelper; /** * \brief A Rectangular Domain on an Integer Lattice * -* A Box is an abstraction for defining discrete regions of -* SPACEDIM indexing space. Boxes have an IndexType, which defines +* A BoxND is an abstraction for defining discrete regions of +* dim indexing space. Boxes have an IndexType, which defines * IndexType::CELL or IndexType::NODE based points for each direction * and a low and high INTVECT which defines the lower and upper corners -* of the Box. Boxes can exist in positive and negative indexing space. +* of the BoxND. Boxes can exist in positive and negative indexing space. * -* Box is a dimension dependent class, so SPACEDIM must be +* Box is a dimension dependent alias to BoxND, so AMREX_SPACEDIM must be * defined as either 1, 2, or 3 when compiling. */ -class Box +template +class BoxND { friend MPI_Datatype ParallelDescriptor::Mpi_typemap::type(); friend class BoxCommHelper; public: /* - * \brief The default constructor. 
For safety, the constructed Box is + * \brief The default constructor. For safety, the constructed BoxND is * invalid and may be tested for validity with ok(). * DO NOT CHANGE THIS BEHAVIOR! */ AMREX_GPU_HOST_DEVICE - constexpr Box () noexcept + constexpr BoxND () noexcept : smallend(1), bigend(0) {} - //! Construct cell-centered type Box. + //! Construct cell-centered type BoxND. AMREX_GPU_HOST_DEVICE - constexpr Box (const IntVect& small, const IntVect& big) noexcept + constexpr BoxND (const IntVectND& small, const IntVectND& big) noexcept : smallend(small), bigend(big) {} - //! Construct box with specified lengths. + //! Construct BoxND with specified lengths. AMREX_GPU_HOST_DEVICE - Box (const IntVect& small, const int* vec_len) noexcept + BoxND (const IntVectND& small, const int* vec_len) noexcept : smallend(small), - bigend(AMREX_D_DECL(small[0]+vec_len[0]-1, - small[1]+vec_len[1]-1, - small[2]+vec_len[2]-1)) + bigend(small + IntVectND(vec_len) - 1) {} /** - * \brief Construct Box with given type. small and big are expected + * \brief Construct BoxND with given type. small and big are expected * to be consistent with given type. */ AMREX_GPU_HOST_DEVICE - Box (const IntVect& small, const IntVect& big, const IntVect& typ) noexcept + BoxND (const IntVectND& small, const IntVectND& big, const IntVectND& typ) noexcept : smallend(small), bigend(big), btype(typ) @@ -83,27 +85,27 @@ public: //! Construct dimension specific Boxes. AMREX_GPU_HOST_DEVICE - Box (const IntVect& small, const IntVect& big, IndexType t) noexcept + BoxND (const IntVectND& small, const IntVectND& big, IndexTypeND t) noexcept : smallend(small), bigend(big), btype(t) {} - template + template = 0> AMREX_GPU_HOST_DEVICE - explicit Box (Array4 const& a) noexcept - : smallend(AMREX_D_DECL(a.begin.x,a.begin.y,a.begin.z)), - bigend (AMREX_D_DECL(a.end.x-1,a.end.y-1,a.end.z-1)) + explicit BoxND (Array4 const& a) noexcept + : smallend(a.begin), + bigend(IntVectND(a.end) - 1) {} // dtor, copy-ctor, copy-op=, move-ctor, and move-op= are compiler generated. - //! Get the smallend of the box. + //! Get the smallend of the BoxND. [[nodiscard]] AMREX_GPU_HOST_DEVICE - const IntVect& smallEnd () const& noexcept { return smallend; } + const IntVectND& smallEnd () const& noexcept { return smallend; } - //! Get the smallend of the box. - [[nodiscard]] const IntVect& smallEnd () && = delete; + //! Get the smallend of the BoxND. + [[nodiscard]] const IntVectND& smallEnd () && = delete; //! Returns the coordinate of the low end in the given direction. [[nodiscard]] AMREX_GPU_HOST_DEVICE @@ -111,10 +113,10 @@ public: //! Get the bigend. [[nodiscard]] AMREX_GPU_HOST_DEVICE - const IntVect& bigEnd () const& noexcept { return bigend; } + const IntVectND& bigEnd () const& noexcept { return bigend; } //! Get the bigend. - [[nodiscard]] const IntVect& bigEnd () && = delete; + [[nodiscard]] const IntVectND& bigEnd () && = delete; //! Returns the coordinate of the high end in the given direction. [[nodiscard]] AMREX_GPU_HOST_DEVICE @@ -122,69 +124,53 @@ public: //! Returns the indexing type. [[nodiscard]] AMREX_GPU_HOST_DEVICE - IndexType ixType () const noexcept { return btype; } + IndexTypeND ixType () const noexcept { return btype; } //! Returns the indexing type. [[nodiscard]] AMREX_GPU_HOST_DEVICE - IntVect type () const noexcept { return btype.ixType(); } + IntVectND type () const noexcept { return btype.ixType(); } //! Returns the indexing type in the specified direction. 
[[nodiscard]] AMREX_GPU_HOST_DEVICE IndexType::CellIndex type (int dir) const noexcept { return btype.ixType(dir); } - //! Return the length of the Box. + //! Return the length of the BoxND. [[nodiscard]] AMREX_GPU_HOST_DEVICE - IntVect size () const noexcept + IntVectND size () const noexcept { - return IntVect(AMREX_D_DECL(bigend[0]-smallend[0] + 1, - bigend[1]-smallend[1] + 1, - bigend[2]-smallend[2] + 1)); + return bigend - smallend + 1; } - //! Return the length of the Box. + //! Return the length of the BoxND. [[nodiscard]] AMREX_GPU_HOST_DEVICE - IntVect length () const noexcept + IntVectND length () const noexcept { - return IntVect(AMREX_D_DECL(bigend[0]-smallend[0] + 1, - bigend[1]-smallend[1] + 1, - bigend[2]-smallend[2] + 1)); + return bigend - smallend + 1; } - //! Return the length of the Box in given direction. + //! Return the length of the BoxND in given direction. [[nodiscard]] AMREX_GPU_HOST_DEVICE int length (int dir) const noexcept { return bigend[dir] - smallend[dir] + 1; } + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE GpuArray length3d () const noexcept { -#if (AMREX_SPACEDIM == 1) - return {{bigend[0]-smallend[0]+1, 1, 1}}; -#elif (AMREX_SPACEDIM == 2) - return {{bigend[0]-smallend[0]+1, bigend[1]-smallend[1]+1, 1}}; -#elif (AMREX_SPACEDIM == 3) - return {{bigend[0]-smallend[0]+1, bigend[1]-smallend[1]+1, bigend[2]-smallend[2]+1}}; -#endif + Dim3 len3d = length().dim3(1); + return {{len3d.x, len3d.y, len3d.z}}; } + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE GpuArray loVect3d () const noexcept { -#if (AMREX_SPACEDIM == 1) - return {{smallend[0], 0, 0}}; -#elif (AMREX_SPACEDIM == 2) - return {{smallend[0], smallend[1], 0}}; -#elif (AMREX_SPACEDIM == 3) - return {{smallend[0], smallend[1], smallend[2]}}; -#endif + Dim3 lo3d = smallend.dim3(0); + return {{lo3d.x, lo3d.y, lo3d.z}}; } + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE GpuArray hiVect3d () const noexcept { -#if (AMREX_SPACEDIM == 1) - return {{bigend[0], 0, 0}}; -#elif (AMREX_SPACEDIM == 2) - return {{bigend[0], bigend[1], 0}}; -#elif (AMREX_SPACEDIM == 3) - return {{bigend[0], bigend[1], bigend[2]}}; -#endif + Dim3 hi3d = bigend.dim3(0); + return {{hi3d.x, hi3d.y, hi3d.z}}; } //! Returns a constant pointer the array of low end coordinates. Useful for calls to FORTRAN. @@ -205,77 +191,77 @@ public: return face.isLow() ? smallend[dir] : bigend[dir]; } - //! Checks if it is an empty box. + //! Checks if it is an empty BoxND. [[nodiscard]] AMREX_GPU_HOST_DEVICE bool isEmpty () const noexcept { return !ok(); } - //! Checks if it is a proper Box (including a valid type). + //! Checks if it is a proper BoxND (including a valid type). [[nodiscard]] AMREX_GPU_HOST_DEVICE bool ok () const noexcept { return bigend.allGE(smallend) && btype.ok(); } - //! Returns true if argument is contained within Box. + //! Returns true if argument is contained within BoxND. [[nodiscard]] AMREX_GPU_HOST_DEVICE - bool contains (const IntVect& p) const noexcept { return p.allGE(smallend) && p.allLE(bigend); } + bool contains (const IntVectND& p) const noexcept { + return p.allGE(smallend) && p.allLE(bigend); + } - //! Returns true if argument is contained within Box. + //! Returns true if argument is contained within BoxND. 
+ template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE bool contains (const Dim3& p) const noexcept { - return AMREX_D_TERM(p.x >= smallend[0] && p.x <= bigend[0], - && p.y >= smallend[1] && p.y <= bigend[1], - && p.z >= smallend[2] && p.z <= bigend[2]); + IntVectND piv{p}; + return contains(piv); } - //! Returns true if argument is contained within Box. + //! Returns true if argument is contained within BoxND. + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE -#if (AMREX_SPACEDIM == 1) - bool contains (int i, int, int) const noexcept { -#elif (AMREX_SPACEDIM == 2) - bool contains (int i, int j, int) const noexcept { -#else bool contains (int i, int j, int k) const noexcept { -#endif - return AMREX_D_TERM(i >= smallend[0] && i <= bigend[0], - && j >= smallend[1] && j <= bigend[1], - && k >= smallend[2] && k <= bigend[2]); + Dim3 p3d{i, j, k}; + return contains(p3d); } - /** \brief Returns true if argument is contained within Box. + /** \brief Returns true if argument is contained within BoxND. * It is an error if the Boxes have different types. */ [[nodiscard]] AMREX_GPU_HOST_DEVICE - bool contains (const Box& b) const noexcept + bool contains (const BoxND& b) const noexcept { BL_ASSERT(sameType(b)); return b.smallend.allGE(smallend) && b.bigend.allLE(bigend); } - //! Returns true if argument is strictly contained within Box. + //! Returns true if argument is strictly contained within BoxND. [[nodiscard]] AMREX_GPU_HOST_DEVICE - bool strictly_contains (const IntVect& p) const noexcept { return p.allGT(smallend) && p.allLT(bigend); } + bool strictly_contains (const IntVectND& p) const noexcept { + return p.allGT(smallend) && p.allLT(bigend); + } /** - * \brief Returns true if argument is strictly contained within Box. + * \brief Returns true if argument is strictly contained within BoxND. * It is an error if the Boxes have different types. */ [[nodiscard]] AMREX_GPU_HOST_DEVICE - bool strictly_contains (const Box& b) const noexcept + bool strictly_contains (const BoxND& b) const noexcept { BL_ASSERT(sameType(b)); return b.smallend.allGT(smallend) && b.bigend.allLT(bigend); } - //! Returns true if argument is strictly contained within Box. + //! Returns true if argument is strictly contained within BoxND. + template = 0> + [[nodiscard]] AMREX_GPU_HOST_DEVICE + bool strictly_contains (const Dim3& p) const noexcept { + IntVectND piv{p}; + return strictly_contains(piv); + } + + //! Returns true if argument is strictly contained within BoxND. + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE -#if (AMREX_SPACEDIM == 1) - bool strictly_contains (int i, int, int) const noexcept { -#elif (AMREX_SPACEDIM == 2) - bool strictly_contains (int i, int j, int) const noexcept { -#else bool strictly_contains (int i, int j, int k) const noexcept { -#endif - return AMREX_D_TERM(i > smallend[0] && i < bigend[0], - && j > smallend[1] && j < bigend[1], - && k > smallend[2] && k < bigend[2]); + Dim3 p3d{i, j, k}; + return strictly_contains(p3d); } /** @@ -283,34 +269,32 @@ public: * It is an error if the Boxes have different types. */ [[nodiscard]] AMREX_GPU_HOST_DEVICE - bool intersects (const Box& b) const noexcept { Box isect(*this); isect &= b; return isect.ok(); } + bool intersects (const BoxND& b) const noexcept { BoxND isect(*this); isect &= b; return isect.ok(); } /** * \brief Returns true is Boxes same size, ie translates of each other,. * It is an error if they have different types. 
*/ [[nodiscard]] AMREX_GPU_HOST_DEVICE - bool sameSize (const Box& b) const noexcept { + bool sameSize (const BoxND& b) const noexcept { BL_ASSERT(sameType(b)); - return AMREX_D_TERM(length(0) == b.length(0), - && length(1) == b.length(1), - && length(2) == b.length(2)); + return length() == b.length(); } //! Returns true if Boxes have same type. [[nodiscard]] AMREX_GPU_HOST_DEVICE - bool sameType (const Box &b) const noexcept { return btype == b.btype; } + bool sameType (const BoxND &b) const noexcept { return btype == b.btype; } //! Returns true if Boxes are identical (including type). [[nodiscard]] AMREX_GPU_HOST_DEVICE - bool operator== (const Box& b) const noexcept { return smallend == b.smallend && bigend == b.bigend && b.btype == btype; } + bool operator== (const BoxND& b) const noexcept { return smallend == b.smallend && bigend == b.bigend && b.btype == btype; } //! Returns true if Boxes differ (including type). [[nodiscard]] AMREX_GPU_HOST_DEVICE - bool operator!= (const Box& b) const noexcept { return !operator==(b); } + bool operator!= (const BoxND& b) const noexcept { return !operator==(b); } [[nodiscard]] AMREX_GPU_HOST_DEVICE - bool operator< (const Box& rhs) const noexcept + bool operator< (const BoxND& rhs) const noexcept { return btype < rhs.btype || ((btype == rhs.btype) && @@ -318,67 +302,106 @@ public: ((smallend == rhs.smallend) && (bigend < rhs.bigend)) )); } [[nodiscard]] AMREX_GPU_HOST_DEVICE - bool operator <= (const Box& rhs) const noexcept { + bool operator <= (const BoxND& rhs) const noexcept { return !(rhs < *this); } [[nodiscard]] AMREX_GPU_HOST_DEVICE - bool operator> (const Box& rhs) const noexcept { + bool operator> (const BoxND& rhs) const noexcept { return rhs < *this; } [[nodiscard]] AMREX_GPU_HOST_DEVICE - bool operator>= (const Box& rhs) const noexcept { + bool operator>= (const BoxND& rhs) const noexcept { return !(*this < rhs); } - //! Returns true if Box is cell-centered in all indexing directions. + //! Returns true if BoxND is cell-centered in all indexing directions. [[nodiscard]] AMREX_GPU_HOST_DEVICE bool cellCentered () const noexcept { return !btype.any(); } + //! Assert that there are no int/Long overflows when calling length or numPts. + void checkOverflow () const noexcept { + if (ok()) { + for (int i = 0; i < dim; ++i) { + auto lo = static_cast(smallend[i]); + auto hi = static_cast(bigend[i]); + Long len = hi - lo + 1; + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(len>=0 && len::max(), + "Overflow when computing length of box"); + } + auto num_pts = static_cast(length(0)); + for (int i = 1; i < dim; ++i) { + auto len = static_cast(length(i)); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(num_pts == 0 || len == 0 || + num_pts <= std::numeric_limits::max() / len, + "Overflow when computing numPts of box"); + num_pts *= len; + } + } + } + /** - * \brief Returns the number of points contained in the Box. + * \brief Returns the number of points contained in the BoxND. */ [[nodiscard]] AMREX_GPU_HOST_DEVICE Long numPts () const noexcept { - return ok() ? AMREX_D_TERM( static_cast(length(0)), - *static_cast(length(1)), - *static_cast(length(2))) - : Long(0); +#if defined(AMREX_DEBUG) || defined(AMREX_USE_ASSERTION) + AMREX_IF_ON_HOST((checkOverflow();)) +#endif + if (ok()) { + auto num_pts = static_cast(length(0)); + for (int i = 1; i < dim; ++i) { + num_pts *= static_cast(length(i)); + } + return num_pts; + } else { + return Long(0); + } } /** - * \brief Returns the number of points contained in the Box. 
+ * \brief Returns the number of points contained in the BoxND. * This is intended for use only in diagnostic messages. */ [[nodiscard]] AMREX_GPU_HOST_DEVICE double d_numPts () const noexcept { - return ok() ? AMREX_D_TERM( double(length(0)), - *double(length(1)), - *double(length(2))) - : 0.0; + if (ok()) { + auto num_pts = static_cast(length(0)); + for (int i = 1; i < dim; ++i) { + num_pts *= static_cast(length(i)); + } + return num_pts; + } else { + return 0.0; + } } /** * \brief Return the volume, in indexing space, of region enclosed by - * this Box. This is identical to numPts() for CELL centered - * Box; otherwise, numPts() > volume(). + * this BoxND. This is identical to numPts() for CELL centered + * BoxND; otherwise, numPts() > volume(). */ [[nodiscard]] AMREX_GPU_HOST_DEVICE Long volume () const noexcept { - return ok() ? AMREX_D_TERM( static_cast(length(0)-btype[0]), - *static_cast(length(1)-btype[1]), - *static_cast(length(2)-btype[2])) - : Long(0); + if (ok()) { + auto num_pts = static_cast(length(0)-btype[0]); + for (int i = 1; i < dim; ++i) { + num_pts *= static_cast(length(i)-btype[i]); + } + return num_pts; + } else { + return Long(0); + } } /** * \brief Returns length of longest side. dir is modified to give - * direction with longest side: 0...SPACEDIM-1. Ignores type. + * direction with longest side: 0...dim-1. Ignores type. */ [[nodiscard]] AMREX_GPU_HOST_DEVICE int longside (int& dir) const noexcept { int maxlen = length(0); dir = 0; - for (int i = 1; i < AMREX_SPACEDIM; i++) + for (int i = 1; i < dim; i++) { if (length(i) > maxlen) { @@ -398,13 +421,13 @@ public: /** * \brief Returns length of shortest side. dir is modified to give - * direction with shortest side: 0...SPACEDIM-1. Ignores type. + * direction with shortest side: 0...dim-1. Ignores type. */ [[nodiscard]] AMREX_GPU_HOST_DEVICE int shortside (int& dir) const noexcept { int minlen = length(0); dir = 0; - for (int i = 1; i < AMREX_SPACEDIM; i++) + for (int i = 1; i < dim; i++) { if (length(i) < minlen) { @@ -428,56 +451,57 @@ public: * Is used in accessing FArrayBox. */ [[nodiscard]] AMREX_GPU_HOST_DEVICE - Long index (const IntVect& v) const noexcept; + Long index (const IntVectND& v) const noexcept; - //! Given the offset, compute IntVect + //! Given the offset, compute IntVectND [[nodiscard]] AMREX_GPU_HOST_DEVICE - IntVect atOffset (Long offset) const noexcept; + IntVectND atOffset (Long offset) const noexcept; + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE GpuArray atOffset3d (Long offset) const noexcept; - //! Redefine the small end of the Box. + //! Redefine the small end of the BoxND. AMREX_GPU_HOST_DEVICE - Box& setSmall (const IntVect& sm) noexcept { smallend = sm; return *this; } + BoxND& setSmall (const IntVectND& sm) noexcept { smallend = sm; return *this; } - //! Redefine the small end of the Box. + //! Redefine the small end of the BoxND. AMREX_GPU_HOST_DEVICE - Box& setSmall (int dir, int sm_index) noexcept { smallend.setVal(dir,sm_index); return *this; } + BoxND& setSmall (int dir, int sm_index) noexcept { smallend.setVal(dir,sm_index); return *this; } - //! Redefine the big end of the Box. + //! Redefine the big end of the BoxND. AMREX_GPU_HOST_DEVICE - Box& setBig (const IntVect& bg) noexcept { bigend = bg; return *this; } + BoxND& setBig (const IntVectND& bg) noexcept { bigend = bg; return *this; } - //! Redefine the big end of the Box. + //! Redefine the big end of the BoxND. 
AMREX_GPU_HOST_DEVICE - Box& setBig (int dir, int bg_index) noexcept { bigend.setVal(dir,bg_index); return *this; } + BoxND& setBig (int dir, int bg_index) noexcept { bigend.setVal(dir,bg_index); return *this; } /** * \brief Set the entire range in a given direction, starting at * sm_index with length n_cells. NOTE: This will yield an - * illegal Box if n_cells <= 0. + * illegal BoxND if n_cells <= 0. */ AMREX_GPU_HOST_DEVICE - Box& setRange (int dir, + BoxND& setRange (int dir, int sm_index, int n_cells = 1) noexcept; //! Set indexing type AMREX_GPU_HOST_DEVICE - Box& setType (const IndexType& t) noexcept { btype = t; return *this; } + BoxND& setType (const IndexTypeND& t) noexcept { btype = t; return *this; } - //! Shift this Box nzones indexing positions in coordinate direction dir. + //! Shift this BoxND nzones indexing positions in coordinate direction dir. AMREX_GPU_HOST_DEVICE - Box& shift (int dir, int nzones) noexcept { smallend.shift(dir,nzones); bigend.shift(dir,nzones); return *this; } + BoxND& shift (int dir, int nzones) noexcept { smallend.shift(dir,nzones); bigend.shift(dir,nzones); return *this; } //! Equivalent to b.shift(0,iv[0]).shift(1,iv[1]) .... AMREX_GPU_HOST_DEVICE - Box& shift (const IntVect& iv) noexcept { smallend.shift(iv); bigend.shift(iv); return *this; } + BoxND& shift (const IntVectND& iv) noexcept { smallend.shift(iv); bigend.shift(iv); return *this; } /** - * \brief This member shifts the Box by "half" indices, thereby - * converting the Box from type CELL to NODE and visa-versa. + * \brief This member shifts the BoxND by "half" indices, thereby + * converting the BoxND from type CELL to NODE and visa-versa. * b.shiftHalf(0,1) shifts b to the right by 1/2 cells. * b.shiftHalf(1,-3) shifts b in the -j direction by 3/2 cells. * NOTE: If num_halfs is EVEN the shift is num_halfs/2 full @@ -485,64 +509,64 @@ public: * This is: b.shifthalf(4) == b.shift(2). */ AMREX_GPU_HOST_DEVICE - Box& shiftHalf (int dir, int num_halfs) noexcept; + BoxND& shiftHalf (int dir, int num_halfs) noexcept; //! Equivalent to b.shiftHalf(0,iv[0]).shiftHalf(1,iv[1]) ... AMREX_GPU_HOST_DEVICE - Box& shiftHalf (const IntVect& iv) noexcept; + BoxND& shiftHalf (const IntVectND& iv) noexcept; /** - * \brief Convert the Box from the current type into the - * argument type. This may change the Box coordinates: + * \brief Convert the BoxND from the current type into the + * argument type. This may change the BoxND coordinates: * type CELL -> NODE : increase coordinate by one on high end * type NODE -> CELL : reduce coordinate by one on high end * other type mappings make no change. */ AMREX_GPU_HOST_DEVICE - Box& convert (IndexType typ) noexcept; + BoxND& convert (IndexTypeND typ) noexcept; /** - * \brief Convert the Box from the current type into the - * argument type. This may change the Box coordinates: + * \brief Convert the BoxND from the current type into the + * argument type. This may change the BoxND coordinates: * type CELL -> NODE : increase coordinate by one on high end * type NODE -> CELL : reduce coordinate by one on high end * other type mappings make no change. */ AMREX_GPU_HOST_DEVICE - Box& convert (const IntVect& typ) noexcept; + BoxND& convert (const IntVectND& typ) noexcept; //! Convert to NODE type in all directions. AMREX_GPU_HOST_DEVICE - Box& surroundingNodes () noexcept; + BoxND& surroundingNodes () noexcept; //! Convert to NODE type in given direction. 
AMREX_GPU_HOST_DEVICE - Box& surroundingNodes (int dir) noexcept; + BoxND& surroundingNodes (int dir) noexcept; AMREX_GPU_HOST_DEVICE - Box& surroundingNodes (Direction d) noexcept { return surroundingNodes(static_cast(d)); } + BoxND& surroundingNodes (Direction d) noexcept { return surroundingNodes(static_cast(d)); } //! Convert to CELL type in all directions. AMREX_GPU_HOST_DEVICE - Box& enclosedCells () noexcept; + BoxND& enclosedCells () noexcept; //! Convert to CELL type in given direction. AMREX_GPU_HOST_DEVICE - Box& enclosedCells (int dir) noexcept; + BoxND& enclosedCells (int dir) noexcept; AMREX_GPU_HOST_DEVICE - Box& enclosedCells (Direction d) noexcept { return enclosedCells(static_cast(d)); } + BoxND& enclosedCells (Direction d) noexcept { return enclosedCells(static_cast(d)); } /** - * \brief Return Box that is intersection of this Box + * \brief Return BoxND that is intersection of this BoxND * and argument. The Boxes MUST be of same type. */ AMREX_GPU_HOST_DEVICE - Box operator& (const Box& rhs) const noexcept { Box lhs(*this); lhs &= rhs; return lhs; } + BoxND operator& (const BoxND& rhs) const noexcept { BoxND lhs(*this); lhs &= rhs; return lhs; } - //! Intersect this Box with its argument. The Boxes MUST be of the same type. + //! Intersect this BoxND with its argument. The Boxes MUST be of the same type. AMREX_GPU_HOST_DEVICE - Box& operator&= (const Box& rhs) noexcept + BoxND& operator&= (const BoxND& rhs) noexcept { BL_ASSERT(sameType(rhs)); smallend.max(rhs.smallend); @@ -551,12 +575,12 @@ public: } /** - * \brief Modify Box to that of the minimum Box containing both - * the original Box and the argument. + * \brief Modify BoxND to that of the minimum BoxND containing both + * the original BoxND and the argument. * Both Boxes must have identical type. */ AMREX_GPU_HOST_DEVICE - Box& minBox (const Box& b) noexcept { + BoxND& minBox (const BoxND& b) noexcept { // BoxArray may call this with not ok boxes. BL_ASSERT(b.ok() && ok()); BL_ASSERT(sameType(b)); smallend.min(b.smallend); @@ -564,82 +588,82 @@ public: return *this; } - //! Shift Box (relative) by given IntVect. + //! Shift BoxND (relative) by given IntVectND. AMREX_GPU_HOST_DEVICE - Box& operator+= (const IntVect& v) noexcept { smallend += v; bigend += v; return *this; } + BoxND& operator+= (const IntVectND& v) noexcept { smallend += v; bigend += v; return *this; } - //! Shift Box (relative) by given IntVect. + //! Shift BoxND (relative) by given IntVectND. AMREX_GPU_HOST_DEVICE - Box operator+ (const IntVect& v) const noexcept { Box r(*this); r += v; return r; } + BoxND operator+ (const IntVectND& v) const noexcept { BoxND r(*this); r += v; return r; } - //! Shift Box (relative) by given IntVect. + //! Shift BoxND (relative) by given IntVectND. AMREX_GPU_HOST_DEVICE - Box& operator-= (const IntVect& v) noexcept { smallend -= v; bigend -= v; return *this; } + BoxND& operator-= (const IntVectND& v) noexcept { smallend -= v; bigend -= v; return *this; } - //! Shift Box (relative) by given IntVect. + //! Shift BoxND (relative) by given IntVectND. AMREX_GPU_HOST_DEVICE - Box operator- (const IntVect& v) const noexcept { Box r(*this); r -= v; return r; } + BoxND operator- (const IntVectND& v) const noexcept { BoxND r(*this); r -= v; return r; } /** - * \brief Chop the Box at the chop_pnt in the dir direction - * returns one Box, modifies the object Box. - * The union of the two is the original Box. 
- * The modified Box is the low end, the returned Box + * \brief Chop the BoxND at the chop_pnt in the dir direction + * returns one BoxND, modifies the object BoxND. + * The union of the two is the original BoxND. + * The modified BoxND is the low end, the returned BoxND * is the high end. If type(dir) = CELL, the Boxes are disjoint - * with the chop_pnt included in the high end (new Box). - * It is an ERROR if chop_pnt is the low end of the orig Box. + * with the chop_pnt included in the high end (new BoxND). + * It is an ERROR if chop_pnt is the low end of the orig BoxND. * If type(dir) = NODE, the chop_pnt is included in both Boxes * but is the only point in common. It is also an error if the - * chop_pnt is an end node of the Box. + * chop_pnt is an end node of the BoxND. */ AMREX_GPU_HOST_DEVICE - Box chop (int dir, int chop_pnt) noexcept; + BoxND chop (int dir, int chop_pnt) noexcept; /* - * \brief Grow Box in all directions by given amount. - * NOTE: n_cell negative shrinks the Box by that number of cells. + * \brief Grow BoxND in all directions by given amount. + * NOTE: n_cell negative shrinks the BoxND by that number of cells. */ AMREX_GPU_HOST_DEVICE - Box& grow (int i) noexcept { smallend.diagShift(-i); bigend.diagShift(i); return *this; } + BoxND& grow (int i) noexcept { smallend.diagShift(-i); bigend.diagShift(i); return *this; } - //! Grow Box in each direction by specified amount. + //! Grow BoxND in each direction by specified amount. AMREX_GPU_HOST_DEVICE - Box& grow (const IntVect& v) noexcept { smallend -= v; bigend += v; return *this;} + BoxND& grow (const IntVectND& v) noexcept { smallend -= v; bigend += v; return *this;} /** - * \brief Grow the Box on the low and high end by n_cell cells + * \brief Grow the BoxND on the low and high end by n_cell cells * in direction idir. */ AMREX_GPU_HOST_DEVICE - Box& grow (int idir, int n_cell) noexcept { smallend.shift(idir, -n_cell); bigend.shift(idir, n_cell); return *this; } + BoxND& grow (int idir, int n_cell) noexcept { smallend.shift(idir, -n_cell); bigend.shift(idir, n_cell); return *this; } AMREX_GPU_HOST_DEVICE - Box& grow (Direction d, int n_cell) noexcept { return grow(static_cast(d), n_cell); } + BoxND& grow (Direction d, int n_cell) noexcept { return grow(static_cast(d), n_cell); } /** - * \brief Grow the Box on the low end by n_cell cells in direction idir. - * NOTE: n_cell negative shrinks the Box by that number of cells. + * \brief Grow the BoxND on the low end by n_cell cells in direction idir. + * NOTE: n_cell negative shrinks the BoxND by that number of cells. */ AMREX_GPU_HOST_DEVICE - Box& growLo (int idir, int n_cell = 1) noexcept { smallend.shift(idir, -n_cell); return *this; } + BoxND& growLo (int idir, int n_cell = 1) noexcept { smallend.shift(idir, -n_cell); return *this; } AMREX_GPU_HOST_DEVICE - Box& growLo (Direction d, int n_cell = 1) noexcept { return growLo(static_cast(d), n_cell); } + BoxND& growLo (Direction d, int n_cell = 1) noexcept { return growLo(static_cast(d), n_cell); } /** - * \brief Grow the Box on the high end by n_cell cells in - * direction idir. NOTE: n_cell negative shrinks the Box by that + * \brief Grow the BoxND on the high end by n_cell cells in + * direction idir. NOTE: n_cell negative shrinks the BoxND by that * number of cells. 
*/ AMREX_GPU_HOST_DEVICE - Box& growHi (int idir, int n_cell = 1) noexcept { bigend.shift(idir,n_cell); return *this; } + BoxND& growHi (int idir, int n_cell = 1) noexcept { bigend.shift(idir,n_cell); return *this; } AMREX_GPU_HOST_DEVICE - Box& growHi (Direction d, int n_cell = 1) noexcept { return growHi(static_cast(d), n_cell); } + BoxND& growHi (Direction d, int n_cell = 1) noexcept { return growHi(static_cast(d), n_cell); } //! Grow in the direction of the given face. AMREX_GPU_HOST_DEVICE - Box& grow (Orientation face, int n_cell = 1) noexcept { + BoxND& grow (Orientation face, int n_cell = 1) noexcept { int idir = face.coordDir(); if (face.isLow()) { smallend.shift(idir, -n_cell); @@ -650,80 +674,80 @@ public: } /** - * \brief Refine Box by given (positive) refinement ratio. + * \brief Refine BoxND by given (positive) refinement ratio. * NOTE: if type(dir) = CELL centered: lo <- lo*ratio and * hi <- (hi+1)*ratio - 1. * NOTE: if type(dir) = NODE centered: lo <- lo*ratio and * hi <- hi*ratio. */ AMREX_GPU_HOST_DEVICE - Box& refine (int ref_ratio) noexcept { - return this->refine(IntVect(ref_ratio)); + BoxND& refine (int ref_ratio) noexcept { + return this->refine(IntVectND(ref_ratio)); } /* - * \brief Refine Box by given (positive) refinement ratio. + * \brief Refine BoxND by given (positive) refinement ratio. * NOTE: if type(dir) = CELL centered: lo <- lo*ratio and * hi <- (hi+1)*ratio - 1. * NOTE: if type(dir) = NODE centered: lo <- lo*ratio and * hi <- hi*ratio. */ AMREX_GPU_HOST_DEVICE - Box& refine (const IntVect& ref_ratio) noexcept; + BoxND& refine (const IntVectND& ref_ratio) noexcept; /** - * \brief Coarsen Box by given (positive) refinement ratio. + * \brief Coarsen BoxND by given (positive) refinement ratio. * NOTE: if type(dir) = CELL centered: lo <- lo/ratio and * hi <- hi/ratio. * NOTE: if type(dir) = NODE centered: lo <- lo/ratio and * hi <- hi/ratio + ((hi%ratio)==0 ? 0 : 1). - * That is, refinement of coarsened Box must contain - * the original Box. + * That is, refinement of coarsened BoxND must contain + * the original BoxND. */ AMREX_GPU_HOST_DEVICE - Box& coarsen (int ref_ratio) noexcept { - return this->coarsen(IntVect(ref_ratio)); + BoxND& coarsen (int ref_ratio) noexcept { + return this->coarsen(IntVectND(ref_ratio)); } /** - * \brief Coarsen Box by given (positive) refinement ratio. + * \brief Coarsen BoxND by given (positive) refinement ratio. * NOTE: if type(dir) = CELL centered: lo <- lo/ratio and * hi <- hi/ratio. * NOTE: if type(dir) = NODE centered: lo <- lo/ratio and * hi <- hi/ratio + ((hi%ratio)==0 ? 0 : 1). - * That is, refinement of coarsened Box must contain - * the original Box. + * That is, refinement of coarsened BoxND must contain + * the original BoxND. */ AMREX_GPU_HOST_DEVICE - Box& coarsen (const IntVect& ref_ratio) noexcept; + BoxND& coarsen (const IntVectND& ref_ratio) noexcept; /** * \brief Step through the rectangle. It is a runtime error to give * a point not inside rectangle. Iteration may not be efficient. */ AMREX_GPU_HOST_DEVICE - void next (IntVect &) const noexcept; + void next (IntVectND &) const noexcept; /** * \brief This static member function returns a constant reference to - * an object of type Box representing the unit box in - * AMREX_SPACEDIM-dimensional space. + * an object of type BoxND representing the unit BoxND in + * dim-dimensional space. 
*/ AMREX_GPU_HOST_DEVICE - static Box TheUnitBox () noexcept { - return Box(IntVect::TheZeroVector(),IntVect::TheZeroVector()); + static BoxND TheUnitBox () noexcept { + return BoxND(IntVectND::TheZeroVector(),IntVectND::TheZeroVector()); } [[nodiscard]] AMREX_GPU_HOST_DEVICE bool isSquare() const noexcept; [[nodiscard]] AMREX_GPU_HOST_DEVICE - bool coarsenable(const IntVect& refrat, const IntVect& min_width) const noexcept + bool coarsenable(const IntVectND& refrat, const IntVectND& min_width) const noexcept { if (!size().allGE(refrat*min_width)) { return false; } else { - Box testBox = *this; + BoxND testBox = *this; testBox.coarsen(refrat); testBox.refine (refrat); return (*this == testBox); @@ -732,19 +756,19 @@ public: [[nodiscard]] AMREX_GPU_HOST_DEVICE bool coarsenable(int refrat, int min_width=1) const noexcept { - return coarsenable(IntVect(refrat), IntVect(min_width)); + return coarsenable(IntVectND(refrat), IntVectND(min_width)); } [[nodiscard]] AMREX_GPU_HOST_DEVICE - bool coarsenable(const IntVect& refrat, int min_width=1) const noexcept + bool coarsenable(const IntVectND& refrat, int min_width=1) const noexcept { - return coarsenable(refrat, IntVect(min_width)); + return coarsenable(refrat, IntVectND(min_width)); } AMREX_GPU_HOST_DEVICE void normalize () noexcept { - for (int idim=0; idim < AMREX_SPACEDIM; ++idim) { + for (int idim=0; idim < dim; ++idim) { if (this->length(idim) == 0) { this->growHi(idim,1); } @@ -752,37 +776,80 @@ public: } AMREX_GPU_HOST_DEVICE - Box& makeSlab (int direction, int slab_index) noexcept + BoxND& makeSlab (int direction, int slab_index) noexcept { smallend[direction] = slab_index; bigend[direction] = slab_index; return *this; } - AMREX_GPU_HOST_DEVICE friend AMREX_FORCE_INLINE Dim3 lbound (Box const& box) noexcept; - AMREX_GPU_HOST_DEVICE friend AMREX_FORCE_INLINE Dim3 ubound (Box const& box) noexcept; - AMREX_GPU_HOST_DEVICE friend AMREX_FORCE_INLINE Dim3 begin (Box const& box) noexcept; - AMREX_GPU_HOST_DEVICE friend AMREX_FORCE_INLINE Dim3 end (Box const& box) noexcept; - AMREX_GPU_HOST_DEVICE friend AMREX_FORCE_INLINE Dim3 length (Box const& box) noexcept; - AMREX_GPU_HOST_DEVICE friend AMREX_FORCE_INLINE Dim3 max_lbound (Box const&, Box const&) noexcept; - AMREX_GPU_HOST_DEVICE friend AMREX_FORCE_INLINE Dim3 max_lbound (Box const&, Dim3 const&) noexcept; - AMREX_GPU_HOST_DEVICE friend AMREX_FORCE_INLINE Dim3 min_ubound (Box const&, Box const&) noexcept; - AMREX_GPU_HOST_DEVICE friend AMREX_FORCE_INLINE Dim3 min_ubound (Box const&, Dim3 const&) noexcept; - AMREX_GPU_HOST_DEVICE friend AMREX_FORCE_INLINE Box minBox (Box const&, Box const&, IndexType) noexcept; + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + static constexpr std::size_t ndims () noexcept { + return static_cast(dim); + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + static constexpr int indims () noexcept { + return dim; + } + + /** + * \brief Returns a new BoxND of dimension new_dim and + * assigns the first new_dim dimension of this BoxND to it. + */ + template + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + BoxND shrink () const noexcept { + static_assert(new_dim <= dim); + auto lo = smallend.template shrink(); + auto hi = bigend.template shrink(); + auto typ = btype.template shrink(); + return BoxND(lo, hi, typ); + } + + /** + * \brief Returns a new BoxND of size new_dim and + * assigns all values of this BoxND to it and + * (small=0, big=0, typ=CELL) to the remaining elements. 
+ */ + template + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + BoxND expand () const noexcept { + static_assert(new_dim >= dim); + auto lo = smallend.template expand(0); + auto hi = bigend.template expand(0); + auto typ = btype.template expand(IndexType::CellIndex::CELL); + return BoxND(lo, hi, typ); + } + + /** + * \brief Returns a new BoxND of size new_dim + * by either shrinking or expanding this BoxND + */ + template + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + BoxND resize () const noexcept { + if constexpr (new_dim > dim) { + return expand(); + } else { + return shrink(); + } + } private: - IntVect smallend; - IntVect bigend; - IndexType btype; + IntVectND smallend; + IntVectND bigend; + IndexTypeND btype; }; +template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box& -Box::refine (const IntVect& ref_ratio) noexcept +BoxND& +BoxND::refine (const IntVectND& ref_ratio) noexcept { if (ref_ratio != 1) { - IntVect shft(1); + IntVectND shft(1); shft -= btype.ixType(); smallend *= ref_ratio; bigend += shft; @@ -792,10 +859,11 @@ Box::refine (const IntVect& ref_ratio) noexcept return *this; } +template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box& -Box::coarsen (const IntVect& ref_ratio) noexcept +BoxND& +BoxND::coarsen (const IntVectND& ref_ratio) noexcept { if (ref_ratio != 1) { @@ -803,8 +871,8 @@ Box::coarsen (const IntVect& ref_ratio) noexcept if (btype.any()) { - IntVect off(0); - for (int dir = 0; dir < AMREX_SPACEDIM; dir++) + IntVectND off(0); + for (int dir = 0; dir < dim; dir++) { if (btype[dir]) { if (bigend[dir]%ref_ratio[dir]) { @@ -824,38 +892,41 @@ Box::coarsen (const IntVect& ref_ratio) noexcept return *this; } +template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box& -Box::convert (const IntVect& typ) noexcept +BoxND& +BoxND::convert (const IntVectND& typ) noexcept { BL_ASSERT(typ.allGE(0) && typ.allLE(1)); - IntVect shft(typ - btype.ixType()); + IntVectND shft(typ - btype.ixType()); bigend += shft; - btype = IndexType(typ); + btype = IndexTypeND(typ); return *this; } +template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box& -Box::convert (IndexType t) noexcept +BoxND& +BoxND::convert (IndexTypeND t) noexcept { - for (int dir = 0; dir < AMREX_SPACEDIM; dir++) + for (int dir = 0; dir < dim; dir++) { const auto typ = t[dir]; const auto bitval = btype[dir]; const int off = typ - bitval; bigend.shift(dir,off); - btype.setType(dir, (IndexType::CellIndex) typ); + btype.setType(dir, static_cast(typ)); } return *this; } +template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box& -Box::surroundingNodes (int dir) noexcept +BoxND& +BoxND::surroundingNodes (int dir) noexcept { if (!(btype[dir])) { @@ -868,12 +939,13 @@ Box::surroundingNodes (int dir) noexcept return *this; } +template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box& -Box::surroundingNodes () noexcept +BoxND& +BoxND::surroundingNodes () noexcept { - for (int i = 0; i < AMREX_SPACEDIM; ++i) { + for (int i = 0; i < dim; ++i) { if ((btype[i] == 0)) { bigend.shift(i,1); } @@ -882,10 +954,11 @@ Box::surroundingNodes () noexcept return *this; } +template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box& -Box::enclosedCells (int dir) noexcept +BoxND& +BoxND::enclosedCells (int dir) noexcept { if (btype[dir]) { @@ -898,12 +971,13 @@ Box::enclosedCells (int dir) noexcept return *this; } +template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box& -Box::enclosedCells () noexcept +BoxND& +BoxND::enclosedCells () noexcept { - for (int i = 0 ; i < AMREX_SPACEDIM; ++i) { + for (int i = 0 ; i < dim; ++i) { if (btype[i]) { 
bigend.shift(i,-1); } @@ -912,78 +986,64 @@ Box::enclosedCells () noexcept return *this; } +template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Long -Box::index (const IntVect& v) const noexcept -{ - Long result = v[0]-smallend[0]; -#if AMREX_SPACEDIM==2 - result += length(0)*Long(v[1]-smallend[1]); -#elif AMREX_SPACEDIM==3 - result += length(0)*((v[1]-smallend[1]) - +Long(v[2]-smallend[2])*length(1)); -#endif +BoxND::index (const IntVectND& v) const noexcept +{ + IntVectND vz = v - smallend; + Long result = vz[0]; + Long mult = length(0); + for (int i = 1 ; i < dim; ++i) { + result += mult * vz[i]; + mult *= length(i); + } return result; } +template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -IntVect -Box::atOffset (Long offset) const noexcept -{ -#if (AMREX_SPACEDIM == 1) - return IntVect{static_cast(offset+smallend[0])}; -#elif (AMREX_SPACEDIM == 2) - int xlen = bigend[0]-smallend[0]+1; - Long j = offset / xlen; - Long i = offset - j*xlen; - return IntVect{static_cast(i+smallend[0]), - static_cast(j+smallend[1])}; -#elif (AMREX_SPACEDIM == 3) - int xlen = bigend[0]-smallend[0]+1; - int ylen = bigend[1]-smallend[1]+1; - Long k = offset / (xlen*ylen); - Long j = (offset - k*(xlen*ylen)) / xlen; - Long i = (offset - k*(xlen*ylen)) - j*xlen; - return IntVect{static_cast(i+smallend[0]), - static_cast(j+smallend[1]), - static_cast(k+smallend[2])}; -#endif +IntVectND +BoxND::atOffset (Long offset) const noexcept +{ + IntVectND result = smallend; + + if constexpr (dim > 1) { + GpuArray mult{}; + mult[0] = length(0); + for (int i = 1 ; i < dim-1; ++i) { + mult[i] = mult[i-1] * length(i); + } + for (int i = dim-1 ; i > 0; --i) { + Long idx = offset / mult[i-1]; + offset -= idx * mult[i-1]; + result[i] += static_cast(idx); + } + } + + result[0] += static_cast(offset); + + return result; } +template +template > AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE GpuArray -Box::atOffset3d (Long offset) const noexcept -{ -#if (AMREX_SPACEDIM == 1) - return {{static_cast(offset+smallend[0]), - static_cast(0), - static_cast(0)}}; -#elif (AMREX_SPACEDIM == 2) - int xlen = bigend[0]-smallend[0]+1; - Long j = offset / xlen; - Long i = offset - j*xlen; - return {{static_cast(i+smallend[0]), - static_cast(j+smallend[1]), - static_cast(0)}}; -#elif (AMREX_SPACEDIM == 3) - int xlen = bigend[0]-smallend[0]+1; - int ylen = bigend[1]-smallend[1]+1; - Long k = offset / (xlen*ylen); - Long j = (offset - k*(xlen*ylen)) / xlen; - Long i = (offset - k*(xlen*ylen)) - j*xlen; - return {{static_cast(i+smallend[0]), - static_cast(j+smallend[1]), - static_cast(k+smallend[2])}}; -#endif +BoxND::atOffset3d (Long offset) const noexcept +{ + Dim3 iv3d = atOffset(offset).dim3(0); + return {{iv3d.x, iv3d.y, iv3d.z}}; } +template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box& -Box::setRange (int dir, +BoxND& +BoxND::setRange (int dir, int sm_index, int n_cells) noexcept { @@ -992,93 +1052,92 @@ Box::setRange (int dir, return *this; } +template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void -Box::next (IntVect& p) const noexcept // NOLINT(readability-convert-member-functions-to-static) +BoxND::next (IntVectND& p) const noexcept // NOLINT(readability-convert-member-functions-to-static) { BL_ASSERT(contains(p)); ++p[0]; -#if (AMREX_SPACEDIM >= 2) - if (p[0] > bigend[0]) - { - p[0] = smallend[0]; - ++p[1]; -#if (AMREX_SPACEDIM == 3) - if (p[1] > bigend[1]) - { - p[1] = smallend[1]; - ++p[2]; + for (int i = 0 ; i < dim-1; ++i) { + if (p[i] > bigend[i]) { + p[i] = smallend[i]; + ++p[i+1]; + } else { + break; } -#endif } -#endif } +template 
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool -Box::isSquare () const noexcept // NOLINT(readability-convert-member-functions-to-static) -{ -#if AMREX_SPACEDIM==1 - return false; // can't build a square in 1-D -#elif AMREX_SPACEDIM==2 - const IntVect& sz = this->size(); - return (sz[0] == sz[1]); -#elif AMREX_SPACEDIM==3 - const IntVect& sz = this->size(); - return (sz[0] == sz[1] && (sz[1] == sz[2])); -#endif +BoxND::isSquare () const noexcept // NOLINT(readability-convert-member-functions-to-static) +{ + if constexpr (dim == 1) { + return false; // can't build a square in 1-D + } else { + bool is_square = true; + const IntVectND& sz = this->size(); + for (int i = 0 ; i < dim-1; ++i) { + is_square = is_square && (sz[i] == sz[i+1]); + } + return is_square; + } } // -// Modified Box is low end, returned Box is high end. +// Modified BoxND is low end, returned BoxND is high end. // If CELL: chop_pnt included in high end. // If NODE: chop_pnt included in both Boxes. // +template AMREX_GPU_HOST_DEVICE inline -Box -Box::chop (int dir, int chop_pnt) noexcept +BoxND +BoxND::chop (int dir, int chop_pnt) noexcept { // - // Define new high end Box including chop_pnt. + // Define new high end BoxND including chop_pnt. // - IntVect sm(smallend); - IntVect bg(bigend); + IntVectND sm(smallend); + IntVectND bg(bigend); sm.setVal(dir,chop_pnt); if (btype[dir]) { // - // NODE centered Box. + // NODE centered BoxND. // BL_ASSERT(chop_pnt > smallend[dir] && chop_pnt < bigend[dir]); // - // Shrink original Box to just contain chop_pnt. + // Shrink original BoxND to just contain chop_pnt. // bigend.setVal(dir,chop_pnt); } else { // - // CELL centered Box. + // CELL centered BoxND. // BL_ASSERT(chop_pnt > smallend[dir] && chop_pnt <= bigend[dir]); // - // Shrink original Box to one below chop_pnt. + // Shrink original BoxND to one below chop_pnt. // bigend.setVal(dir,chop_pnt-1); } - return Box(sm,bg,btype); + return BoxND(sm,bg,btype); } +template AMREX_GPU_HOST_DEVICE inline -Box& -Box::shiftHalf (int dir, int num_halfs) noexcept +BoxND& +BoxND::shiftHalf (int dir, int num_halfs) noexcept { const int nbit = (num_halfs<0 ? -num_halfs : num_halfs)%2; int nshift = num_halfs/2; @@ -1099,12 +1158,13 @@ Box::shiftHalf (int dir, int num_halfs) noexcept return *this; } +template AMREX_GPU_HOST_DEVICE inline -Box& -Box::shiftHalf (const IntVect& iv) noexcept +BoxND& +BoxND::shiftHalf (const IntVectND& iv) noexcept { - for (int i = 0; i < AMREX_SPACEDIM; i++) { + for (int i = 0; i < dim; i++) { shiftHalf(i,iv[i]); } return *this; @@ -1140,308 +1200,332 @@ public: void AllGatherBoxes (Vector& bxs, int n_extra_reserve=0); /** - * \brief Grow Box in all directions by given amount. + * \brief Grow BoxND in all directions by given amount. - * NOTE: n_cell negative shrinks the Box by that number of cells. + * NOTE: n_cell negative shrinks the BoxND by that number of cells. */ +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box grow (const Box& b, int i) noexcept +BoxND grow (const BoxND& b, int i) noexcept { - Box result = b; + BoxND result = b; result.grow(i); return result; } - //! Grow Box in each direction by specified amount. + //! Grow BoxND in each direction by specified amount. +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box grow (const Box& b, const IntVect& v) noexcept +BoxND grow (const BoxND& b, const IntVectND& v) noexcept { - Box result = b; + BoxND result = b; result.grow(v); return result; } - //! Grow Box in direction idir be n_cell cells + //! 
Grow BoxND in direction idir be n_cell cells +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box grow (const Box& b, int idir, int n_cell) noexcept +BoxND grow (const BoxND& b, int idir, int n_cell) noexcept { - Box result = b; + BoxND result = b; result.grow(idir, n_cell); return result; } +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box grow (const Box& b, Direction d, int n_cell) noexcept +BoxND grow (const BoxND& b, Direction d, int n_cell) noexcept { return grow(b, static_cast(d), n_cell); } +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box growLo (const Box& b, int idir, int n_cell) noexcept +BoxND growLo (const BoxND& b, int idir, int n_cell) noexcept { - Box result = b; + BoxND result = b; result.growLo(idir, n_cell); return result; } +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box growLo (const Box& b, Direction d, int n_cell) noexcept +BoxND growLo (const BoxND& b, Direction d, int n_cell) noexcept { return growLo(b, static_cast(d), n_cell); } +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box growHi (const Box& b, int idir, int n_cell) noexcept +BoxND growHi (const BoxND& b, int idir, int n_cell) noexcept { - Box result = b; + BoxND result = b; result.growHi(idir, n_cell); return result; } +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box growHi (const Box& b, Direction d, int n_cell) noexcept +BoxND growHi (const BoxND& b, Direction d, int n_cell) noexcept { return growHi(b, static_cast(d), n_cell); } /** - * \brief Coarsen Box by given (positive) refinement ratio. + * \brief Coarsen BoxND by given (positive) refinement ratio. * NOTE: if type(dir) = CELL centered: lo <- lo/ratio and * hi <- hi/ratio. * NOTE: if type(dir) = NODE centered: lo <- lo/ratio and * hi <- hi/ratio + ((hi%ratio)==0 ? 0 : 1). - * That is, refinement of coarsened Box must contain - * the original Box. + * That is, refinement of coarsened BoxND must contain + * the original BoxND. */ +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box coarsen (const Box& b, int ref_ratio) noexcept +BoxND coarsen (const BoxND& b, int ref_ratio) noexcept { - Box result = b; - result.coarsen(IntVect(ref_ratio)); + BoxND result = b; + result.coarsen(IntVectND(ref_ratio)); return result; } /** - * \brief Coarsen Box by given (positive) refinement ratio. + * \brief Coarsen BoxND by given (positive) refinement ratio. * NOTE: if type(dir) = CELL centered: lo <- lo/ratio and * hi <- hi/ratio. * NOTE: if type(dir) = NODE centered: lo <- lo/ratio and * hi <- hi/ratio + ((hi%ratio)==0 ? 0 : 1). - * That is, refinement of coarsened Box must contain - * the original Box. + * That is, refinement of coarsened BoxND must contain + * the original BoxND. */ +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box coarsen (const Box& b, const IntVect& ref_ratio) noexcept +BoxND coarsen (const BoxND& b, const IntVectND& ref_ratio) noexcept { - Box result = b; + BoxND result = b; result.coarsen(ref_ratio); return result; } /** - * Refine Box by given (positive) refinement ratio. + * Refine BoxND by given (positive) refinement ratio. * NOTE: if type(dir) = CELL centered: lo <- lo*ratio and * hi <- (hi+1)*ratio - 1. * NOTE: if type(dir) = NODE centered: lo <- lo*ratio and * hi <- hi*ratio. 
*/ +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box refine (const Box& b, int ref_ratio) noexcept +BoxND refine (const BoxND& b, int ref_ratio) noexcept { - Box result = b; - result.refine(IntVect(ref_ratio)); + BoxND result = b; + result.refine(IntVectND(ref_ratio)); return result; } /** - * \brief Refine Box by given (positive) refinement ratio. + * \brief Refine BoxND by given (positive) refinement ratio. * NOTE: if type(dir) = CELL centered: lo <- lo*ratio and * hi <- (hi+1)*ratio - 1. * NOTE: if type(dir) = NODE centered: lo <- lo*ratio and * hi <- hi*ratio. */ +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box refine (const Box& b, const IntVect& ref_ratio) noexcept +BoxND refine (const BoxND& b, const IntVectND& ref_ratio) noexcept { - Box result = b; + BoxND result = b; result.refine(ref_ratio); return result; } - //! Return a Box with indices shifted by nzones in dir direction. + //! Return a BoxND with indices shifted by nzones in dir direction. +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box shift (const Box& b, int dir, int nzones) noexcept +BoxND shift (const BoxND& b, int dir, int nzones) noexcept { - Box result = b; + BoxND result = b; result.shift(dir, nzones); return result; } +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box shift (const Box& b, const IntVect& nzones) noexcept +BoxND shift (const BoxND& b, const IntVectND& nzones) noexcept { - Box result = b; + BoxND result = b; result.shift(nzones); return result; } /** - * \brief Returns a Box with NODE based coordinates in direction dir - * that encloses Box b. NOTE: equivalent to b.convert(dir,NODE) + * \brief Returns a BoxND with NODE based coordinates in direction dir + * that encloses BoxND b. NOTE: equivalent to b.convert(dir,NODE) * NOTE: error if b.type(dir) == NODE. */ +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box surroundingNodes (const Box& b, int dir) noexcept +BoxND surroundingNodes (const BoxND& b, int dir) noexcept { - Box bx(b); + BoxND bx(b); bx.surroundingNodes(dir); return bx; } +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box surroundingNodes (const Box& b, Direction d) noexcept +BoxND surroundingNodes (const BoxND& b, Direction d) noexcept { return surroundingNodes(b, static_cast(d)); } /** - * \brief Returns a Box with NODE based coordinates in all - * directions that encloses Box b. + * \brief Returns a BoxND with NODE based coordinates in all + * directions that encloses BoxND b. */ +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box surroundingNodes (const Box& b) noexcept +BoxND surroundingNodes (const BoxND& b) noexcept { - Box bx(b); + BoxND bx(b); bx.surroundingNodes(); return bx; } - //! Returns a Box with different type + //! Returns a BoxND with different type +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box convert (const Box& b, const IntVect& typ) noexcept +BoxND convert (const BoxND& b, const IntVectND& typ) noexcept { - Box bx(b); + BoxND bx(b); bx.convert(typ); return bx; } +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box convert (const Box& b, const IndexType& typ) noexcept +BoxND convert (const BoxND& b, const IndexTypeND& typ) noexcept { - Box bx(b); + BoxND bx(b); bx.convert(typ); return bx; } /** - * \brief Returns a Box with CELL based coordinates in + * \brief Returns a BoxND with CELL based coordinates in * direction dir that is enclosed by b. 
* NOTE: equivalent to b.convert(dir,CELL) * NOTE: error if b.type(dir) == CELL. */ +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box enclosedCells (const Box& b, int dir) noexcept +BoxND enclosedCells (const BoxND& b, int dir) noexcept { - Box bx(b); + BoxND bx(b); bx.enclosedCells(dir); return bx; } +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box enclosedCells (const Box& b, Direction d) noexcept +BoxND enclosedCells (const BoxND& b, Direction d) noexcept { return enclosedCells(b, static_cast(d)); } /** - * \brief Returns a Box with CELL based coordinates in all + * \brief Returns a BoxND with CELL based coordinates in all * directions that is enclosed by b. */ +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box enclosedCells (const Box& b) noexcept +BoxND enclosedCells (const BoxND& b) noexcept { - Box bx(b); + BoxND bx(b); bx.enclosedCells(); return bx; } /** - * \brief Returns the edge-centered Box (in direction dir) defining - * the low side of Box b. + * \brief Returns the edge-centered BoxND (in direction dir) defining + * the low side of BoxND b. */ +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box bdryLo (const Box& b, int dir, int len=1) noexcept +BoxND bdryLo (const BoxND& b, int dir, int len=1) noexcept { - IntVect low(b.smallEnd()); - IntVect hi(b.bigEnd()); + IntVectND low(b.smallEnd()); + IntVectND hi(b.bigEnd()); int sm = low[dir]; low.setVal(dir,sm-len+1); hi.setVal(dir,sm); // // set dir'th bit to 1 = IndexType::NODE. // - IndexType typ(b.ixType()); + IndexTypeND typ(b.ixType()); typ.set(dir); - return Box(low,hi,typ); + return BoxND(low,hi,typ); } /** - * \brief Returns the edge-centered Box (in direction dir) defining - * the high side of Box b. + * \brief Returns the edge-centered BoxND (in direction dir) defining + * the high side of BoxND b. */ +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box bdryHi (const Box& b, int dir, int len=1) noexcept +BoxND bdryHi (const BoxND& b, int dir, int len=1) noexcept { - IntVect low(b.smallEnd()); - IntVect hi(b.bigEnd()); + IntVectND low(b.smallEnd()); + IntVectND hi(b.bigEnd()); auto const bitval = b.type()[dir]; int bg = hi[dir] + 1 - bitval%2; low.setVal(dir,bg); @@ -1449,23 +1533,24 @@ Box bdryHi (const Box& b, int dir, int len=1) noexcept // // Set dir'th bit to 1 = IndexType::NODE. // - IndexType typ(b.ixType()); + IndexTypeND typ(b.ixType()); typ.set(dir); - return Box(low,hi,typ); + return BoxND(low,hi,typ); } /** * \brief Similar to bdryLo and bdryHi except that it operates on the * given face of box b. */ +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box bdryNode (const Box& b, Orientation face, int len=1) noexcept +BoxND bdryNode (const BoxND& b, Orientation face, int len=1) noexcept { int dir = face.coordDir(); - IntVect low(b.smallEnd()); - IntVect hi(b.bigEnd()); + IntVectND low(b.smallEnd()); + IntVectND hi(b.bigEnd()); if (face.isLow()) { int sm = low[dir]; @@ -1482,51 +1567,53 @@ Box bdryNode (const Box& b, Orientation face, int len=1) noexcept // // Set dir'th bit to 1 = IndexType::NODE. // - IndexType typ(b.ixType()); + IndexTypeND typ(b.ixType()); typ.set(dir); - return Box(low,hi,typ); + return BoxND(low,hi,typ); } /** - * \brief Returns the cell centered Box of length len adjacent + * \brief Returns the cell centered BoxND of length len adjacent * to b on the low end along the coordinate direction dir. - * The return Box is identical to b in the other directions. 
- * The return Box and b have an empty intersection. + * The return BoxND is identical to b in the other directions. + * The return BoxND and b have an empty intersection. * NOTE: len >= 1 - * NOTE: Box retval = b.adjCellLo(b,dir,len) + * NOTE: BoxND retval = b.adjCellLo(b,dir,len) * is equivalent to the following set of operations: - * Box retval(b); - * retval.convert(dir,Box::CELL); + * BoxND retval(b); + * retval.convert(dir,BoxND::CELL); * retval.setrange(dir,retval.smallEnd(dir)-len,len); */ +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box adjCellLo (const Box& b, int dir, int len=1) noexcept +BoxND adjCellLo (const BoxND& b, int dir, int len=1) noexcept { BL_ASSERT(len > 0); - IntVect low(b.smallEnd()); - IntVect hi(b.bigEnd()); + IntVectND low(b.smallEnd()); + IntVectND hi(b.bigEnd()); int sm = low[dir]; low.setVal(dir,sm - len); hi.setVal(dir,sm - 1); // // Set dir'th bit to 0 = IndexType::CELL. // - IndexType typ(b.ixType()); + IndexTypeND typ(b.ixType()); typ.unset(dir); - return Box(low,hi,typ); + return BoxND(low,hi,typ); } - //! Similar to adjCellLo but builds an adjacent Box on the high end. + //! Similar to adjCellLo but builds an adjacent BoxND on the high end. +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box adjCellHi (const Box& b, int dir, int len=1) noexcept +BoxND adjCellHi (const BoxND& b, int dir, int len=1) noexcept { BL_ASSERT(len > 0); - IntVect low(b.smallEnd()); - IntVect hi(b.bigEnd()); + IntVectND low(b.smallEnd()); + IntVectND hi(b.bigEnd()); int bitval = b.type()[dir]; int bg = hi[dir] + 1 - bitval%2; low.setVal(dir,bg); @@ -1534,20 +1621,21 @@ Box adjCellHi (const Box& b, int dir, int len=1) noexcept // // Set dir'th bit to 0 = IndexType::CELL. // - IndexType typ(b.ixType()); + IndexTypeND typ(b.ixType()); typ.unset(dir); - return Box(low,hi,typ); + return BoxND(low,hi,typ); } //! Similar to adjCellLo and adjCellHi; operates on given face. +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box adjCell (const Box& b, Orientation face, int len=1) noexcept +BoxND adjCell (const BoxND& b, Orientation face, int len=1) noexcept { BL_ASSERT(len > 0); - IntVect low(b.smallEnd()); - IntVect hi(b.bigEnd()); + IntVectND low(b.smallEnd()); + IntVectND hi(b.bigEnd()); int dir = face.coordDir(); if (face.isLow()) { @@ -1565,225 +1653,310 @@ Box adjCell (const Box& b, Orientation face, int len=1) noexcept // // Set dir'th bit to 0 = IndexType::CELL. // - IndexType typ(b.ixType()); + IndexTypeND typ(b.ixType()); typ.unset(dir); - return Box(low,hi,typ); + return BoxND(low,hi,typ); } /** - * \brief Modify Box to that of the minimum Box containing both - * the original Box and the argument. - * Both Boxes must have identical type. + * \brief Modify BoxND to that of the minimum BoxND containing both + * the original BoxND and the argument. + * Both BoxNDes must have identical type. */ +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box minBox (const Box& b1, const Box& b2) noexcept +BoxND minBox (const BoxND& b1, const BoxND& b2) noexcept { - Box result = b1; + BoxND result = b1; result.minBox(b2); return result; } - //! Write an ASCII representation to the ostream. 
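Illustrative sketch, not part of the patch: the hunk above generalizes the free-function Box helpers (grow, growLo/growHi, coarsen, refine, shift, surroundingNodes, enclosedCells, bdryLo/bdryHi, adjCell*, minBox) from the fixed-dimension Box to the dimension-templated BoxND. The template parameter lists were stripped when this diff was rendered; the sketch below assumes the conventional spelling, i.e. template <int dim> functions taking BoxND<dim> and IntVectND<dim>, with amrex::Box remaining an alias for the AMREX_SPACEDIM instantiation. Usage on a 3D build would then look like:

    #include <AMReX_Box.H>

    amrex::Long box_helpers_demo ()
    {
        // Cell-centered 64^3 box; amrex::Box is assumed to stay a BoxND<AMREX_SPACEDIM> alias.
        amrex::Box cc(amrex::IntVect(0,0,0), amrex::IntVect(63,63,63));

        amrex::Box grown = amrex::grow(cc, 0, 2);        // two extra cells on both ends in x
        amrex::Box crse  = amrex::coarsen(cc, 2);        // (0,0,0)-(31,31,31)
        amrex::Box fine  = amrex::refine(crse, 2);       // refining the coarsened box recovers cc
        amrex::Box nodal = amrex::surroundingNodes(cc);  // (0,0,0)-(64,64,64), node-centered

        return grown.numPts() + fine.numPts() + nodal.numPts();
    }
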
-std::ostream& operator<< (std::ostream& os, const Box& bx); +namespace detail { + std::ostream& box_write (std::ostream& os, const int * smallend, const int * bigend, + const int * type, int dim); + std::istream& box_read (std::istream& is, int * smallend, int * bigend, int * type, int dim); + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE constexpr + auto BoxSplit_imp (std::index_sequence, + const T& lo, const T& hi, const U& typ) noexcept { + return makeTuple(BoxND(get(lo), get(hi), get(typ))...); + } +} + +//! Write an ASCII representation to the ostream. +template +std::ostream& operator<< (std::ostream& os, const BoxND& bx) +{ + IntVectND type = bx.type(); + return detail::box_write(os, bx.smallEnd().begin(), bx.bigEnd().begin(), type.begin(), dim); +} + +//! Read from istream. +template +std::istream& operator>> (std::istream& is, BoxND& bx) { + IntVectND small; + IntVectND big; + IntVectND type; + detail::box_read(is, small.begin(), big.begin(), type.begin(), dim); + bx = BoxND{small, big, type}; + return is; +} + +/** +* \brief Returns a BoxND obtained by concatenating the input BoxNDs. +* The dimension of the return value equals the sum of the dimensions of the inputted BoxNDs. +*/ +template +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +constexpr BoxND()> +BoxCat (const BoxND& bx, const BoxND&...boxes) noexcept { + auto lo = IntVectCat(bx.smallEnd(), boxes.smallEnd()...); + auto hi = IntVectCat(bx.bigEnd(), boxes.bigEnd()...); + auto typ = IndexTypeCat(bx.ixType(), boxes.ixType()...); + return BoxND()>{lo, hi, typ}; +} + +/** +* \brief Returns a tuple of BoxNDs obtained by splitting the input BoxND +* according to the dimensions specified by the template arguments. +*/ +template +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +constexpr GpuTuple, BoxND...> +BoxSplit (const BoxND()>& bx) noexcept { + auto lo = IntVectSplit(bx.smallEnd()); + auto hi = IntVectSplit(bx.bigEnd()); + auto typ = IndexTypeSplit(bx.ixType()); + return detail::BoxSplit_imp(std::make_index_sequence<1 + sizeof...(dims)>(), lo, hi, typ); +} + +/** +* \brief Returns a new BoxND of dimension new_dim and +* assigns the first new_dim dimension of this BoxND to it. +*/ +template +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +constexpr BoxND +BoxShrink (const BoxND& bx) noexcept { + return bx.template shrink(); +} + +/** +* \brief Returns a new BoxND of size new_dim and +* assigns all values of this BoxND to it and +* (small=0, big=0, typ=CELL) to the remaining elements. +*/ +template +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +constexpr BoxND +BoxExpand (const BoxND& bx) noexcept { + return bx.template expand(); +} - //! Read from istream. 
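Illustrative sketch, not part of the patch: the hunk above adds BoxCat, BoxSplit, BoxShrink, BoxExpand and BoxResize for converting between BoxND objects of different dimensions. Because the template argument lists are garbled in this rendering, the call syntax below is an assumption reconstructed from the surrounding IntVectCat/IntVectSplit usage rather than a verbatim quote of the patch:

    #include <AMReX_Box.H>
    #include <AMReX_Tuple.H>

    void box_cat_split_demo ()
    {
        amrex::BoxND<2> bxy(amrex::IntVectND<2>(0,0), amrex::IntVectND<2>(15,15));
        amrex::BoxND<1> bz (amrex::IntVectND<1>(0),   amrex::IntVectND<1>(31));

        // Concatenate a 2D and a 1D box into a 3D box: (0,0,0)-(15,15,31).
        auto b3 = amrex::BoxCat(bxy, bz);

        // Split it back into its 2D and 1D pieces.
        auto pieces = amrex::BoxSplit<2,1>(b3);
        amrex::BoxND<2> b2 = amrex::get<0>(pieces);
        amrex::BoxND<1> b1 = amrex::get<1>(pieces);

        AMREX_ALWAYS_ASSERT(b2 == bxy && b1 == bz);
    }
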
-std::istream& operator>> (std::istream& is, Box& bx); +/** +* \brief Returns a new BoxND of size new_dim +* by either shrinking or expanding this BoxND +*/ +template +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +constexpr BoxND +BoxResize (const BoxND& bx) noexcept { + return bx.template resize(); +} +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Dim3 lbound (Box const& box) noexcept +IntVectND lbound_iv (BoxND const& box) noexcept { -#if (AMREX_SPACEDIM == 1) - return {box.smallend[0], 0, 0}; -#elif (AMREX_SPACEDIM == 2) - return {box.smallend[0], box.smallend[1], 0}; -#elif (AMREX_SPACEDIM == 3) - return {box.smallend[0], box.smallend[1], box.smallend[2]}; -#endif + return box.smallEnd(); } +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Dim3 ubound (Box const& box) noexcept +IntVectND ubound_iv (BoxND const& box) noexcept { -#if (AMREX_SPACEDIM == 1) - return {box.bigend[0], 0, 0}; -#elif (AMREX_SPACEDIM == 2) - return {box.bigend[0], box.bigend[1], 0}; -#elif (AMREX_SPACEDIM == 3) - return {box.bigend[0], box.bigend[1], box.bigend[2]}; -#endif + return box.bigEnd(); } +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Dim3 begin (Box const& box) noexcept +IntVectND begin_iv (BoxND const& box) noexcept { -#if (AMREX_SPACEDIM == 1) - return {box.smallend[0], 0, 0}; -#elif (AMREX_SPACEDIM == 2) - return {box.smallend[0], box.smallend[1], 0}; -#elif (AMREX_SPACEDIM == 3) - return {box.smallend[0], box.smallend[1], box.smallend[2]}; -#endif + return box.smallEnd(); } +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Dim3 end (Box const& box) noexcept +IntVectND end_iv (BoxND const& box) noexcept { -#if (AMREX_SPACEDIM == 1) - return {box.bigend[0]+1, 1, 1}; -#elif (AMREX_SPACEDIM == 2) - return {box.bigend[0]+1, box.bigend[1]+1, 1}; -#elif (AMREX_SPACEDIM == 3) - return {box.bigend[0]+1, box.bigend[1]+1, box.bigend[2]+1}; -#endif + return box.bigEnd() + 1; } +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Dim3 length (Box const& box) noexcept +IntVectND length_iv (BoxND const& box) noexcept { -#if (AMREX_SPACEDIM == 1) - return {box.bigend[0]-box.smallend[0]+1, 1, 1}; -#elif (AMREX_SPACEDIM == 2) - return {box.bigend[0]-box.smallend[0]+1, - box.bigend[1]-box.smallend[1]+1, 1}; -#elif (AMREX_SPACEDIM == 3) - return {box.bigend[0]-box.smallend[0]+1, - box.bigend[1]-box.smallend[1]+1, - box.bigend[2]-box.smallend[2]+1}; -#endif + return box.bigEnd() - box.smallEnd() + 1; } // Max of lower bound +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Dim3 max_lbound (Box const& b1, Box const& b2) noexcept +IntVectND max_lbound_iv (BoxND const& b1, BoxND const& b2) noexcept { -#if (AMREX_SPACEDIM == 1) - return {amrex::max(b1.smallend[0], b2.smallend[0]), - 0, 0}; -#elif (AMREX_SPACEDIM == 2) - return {amrex::max(b1.smallend[0], b2.smallend[0]), - amrex::max(b1.smallend[1], b2.smallend[1]), - 0}; -#elif (AMREX_SPACEDIM == 3) - return {amrex::max(b1.smallend[0], b2.smallend[0]), - amrex::max(b1.smallend[1], b2.smallend[1]), - amrex::max(b1.smallend[2], b2.smallend[2])}; -#endif + return max(b1.smallEnd(), b2.smallEnd()); } +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Dim3 max_lbound (Box const& b1, Dim3 const& lo) noexcept +IntVectND max_lbound_iv (BoxND const& b1, IntVectND const& lo) noexcept { -#if (AMREX_SPACEDIM == 1) - return {amrex::max(b1.smallend[0], lo.x), - 0, 0}; -#elif (AMREX_SPACEDIM == 2) - return {amrex::max(b1.smallend[0], lo.x), - amrex::max(b1.smallend[1], lo.y), 
- 0}; -#elif (AMREX_SPACEDIM == 3) - return {amrex::max(b1.smallend[0], lo.x), - amrex::max(b1.smallend[1], lo.y), - amrex::max(b1.smallend[2], lo.z)}; -#endif + return max(b1.smallEnd(), lo); } // Min of upper bound +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Dim3 min_ubound (Box const& b1, Box const& b2) noexcept +IntVectND min_ubound_iv (BoxND const& b1, BoxND const& b2) noexcept { -#if (AMREX_SPACEDIM == 1) - return {amrex::min(b1.bigend[0], b2.bigend[0]), - 0, 0}; -#elif (AMREX_SPACEDIM == 2) - return {amrex::min(b1.bigend[0], b2.bigend[0]), - amrex::min(b1.bigend[1], b2.bigend[1]), - 0}; -#elif (AMREX_SPACEDIM == 3) - return {amrex::min(b1.bigend[0], b2.bigend[0]), - amrex::min(b1.bigend[1], b2.bigend[1]), - amrex::min(b1.bigend[2], b2.bigend[2])}; -#endif + return min(b1.bigEnd(), b2.bigEnd()); } +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Dim3 min_ubound (Box const& b1, Dim3 const& hi) noexcept +IntVectND min_ubound_iv (BoxND const& b1, IntVectND const& hi) noexcept { -#if (AMREX_SPACEDIM == 1) - return {amrex::min(b1.bigend[0], hi.x), - 0, 0}; -#elif (AMREX_SPACEDIM == 2) - return {amrex::min(b1.bigend[0], hi.x), - amrex::min(b1.bigend[1], hi.y), - 0}; -#elif (AMREX_SPACEDIM == 3) - return {amrex::min(b1.bigend[0], hi.x), - amrex::min(b1.bigend[1], hi.y), - amrex::min(b1.bigend[2], hi.z)}; -#endif + return min(b1.bigEnd(), hi); } +template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box minBox (Box const& b1, Box const& b2, IndexType typ) noexcept -{ -#if (AMREX_SPACEDIM == 1) - return Box(IntVect(amrex::max(b1.smallend[0], b2.smallend[0])), - IntVect(amrex::min(b1.bigend [0], b2.bigend [0])), - typ); -#elif (AMREX_SPACEDIM == 2) - return Box(IntVect(amrex::max(b1.smallend[0], b2.smallend[0]), - amrex::max(b1.smallend[1], b2.smallend[1])), - IntVect(amrex::min(b1.bigend [0], b2.bigend [0]), - amrex::min(b1.bigend [1], b2.bigend [1])), - typ); -#elif (AMREX_SPACEDIM == 3) - return Box(IntVect(amrex::max(b1.smallend[0], b2.smallend[0]), - amrex::max(b1.smallend[1], b2.smallend[1]), - amrex::max(b1.smallend[2], b2.smallend[2])), - IntVect(amrex::min(b1.bigend [0], b2.bigend [0]), - amrex::min(b1.bigend [1], b2.bigend [1]), - amrex::min(b1.bigend [2], b2.bigend [2])), - typ); -#endif +Dim3 lbound (BoxND const& box) noexcept +{ + return box.smallEnd().dim3(); +} + +template = 0> +[[nodiscard]] +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +Dim3 ubound (BoxND const& box) noexcept +{ + return box.bigEnd().dim3(); +} + +template = 0> +[[nodiscard]] +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +Dim3 begin (BoxND const& box) noexcept +{ + return box.smallEnd().dim3(); +} + +template = 0> +[[nodiscard]] +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +Dim3 end (BoxND const& box) noexcept +{ + return (box.bigEnd() + 1).dim3(1); +} + +template = 0> +[[nodiscard]] +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +Dim3 length (BoxND const& box) noexcept +{ + return (box.bigEnd() - box.smallEnd() + 1).dim3(1); } -// Returns a Box that covers all the argument Boxes in index +// Max of lower bound +template = 0> +[[nodiscard]] +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +Dim3 max_lbound (BoxND const& b1, BoxND const& b2) noexcept +{ + return max(b1.smallEnd(), b2.smallEnd()).dim3(); +} + +template = 0> +[[nodiscard]] +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +Dim3 max_lbound (BoxND const& b1, Dim3 const& lo) noexcept +{ + return max(b1.smallEnd(), IntVectND(lo)).dim3(); +} + +// Min of upper bound +template = 0> +[[nodiscard]] +AMREX_GPU_HOST_DEVICE 
+AMREX_FORCE_INLINE +Dim3 min_ubound (BoxND const& b1, BoxND const& b2) noexcept +{ + return min(b1.bigEnd(), b2.bigEnd()).dim3(); +} + +template = 0> +[[nodiscard]] +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +Dim3 min_ubound (BoxND const& b1, Dim3 const& hi) noexcept +{ + return min(b1.bigEnd(), IntVectND(hi)).dim3(); +} + +// Returns a BoxND that covers all the argument Boxes in index // space. The types are ignored. Thus, the arguments can have -// different index types, and the returned Box's index type has no +// different index types, and the returned BoxND's index type has no // meaning. +template [[nodiscard]] AMREX_FORCE_INLINE -Box getIndexBounds (Box const& b1) noexcept +BoxND getIndexBounds (BoxND const& b1) noexcept { return b1; } +template [[nodiscard]] AMREX_FORCE_INLINE -Box getIndexBounds (Box const& b1, Box const& b2) noexcept +BoxND getIndexBounds (BoxND const& b1, BoxND const& b2) noexcept { - Box b = b1; + BoxND b = b1; b.setType(b2.ixType()); b.minBox(b2); return b; @@ -1792,16 +1965,17 @@ Box getIndexBounds (Box const& b1, Box const& b2) noexcept template [[nodiscard]] AMREX_FORCE_INLINE -Box getIndexBounds (T const& b1, T const& b2, Ts const& ... b3) noexcept +auto getIndexBounds (T const& b1, T const& b2, Ts const& ... b3) noexcept { return getIndexBounds(getIndexBounds(b1,b2),b3...); } +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -IntVect getCell (Box const* boxes, int nboxes, Long icell) noexcept +IntVectND getCell (BoxND const* boxes, int nboxes, Long icell) noexcept { int ibox; Long ncells_subtotal = 0; @@ -1817,117 +1991,113 @@ IntVect getCell (Box const* boxes, int nboxes, Long icell) noexcept return boxes[ibox].atOffset(offset); } +template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box makeSlab (Box const& b, int direction, int slab_index) noexcept +BoxND makeSlab (BoxND const& b, int direction, int slab_index) noexcept { - Box r = b; + BoxND r = b; r.makeSlab(direction,slab_index); return r; } +template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Box makeSingleCellBox (int i, int j, int k, IndexType typ = IndexType::TheCellType()) +BoxND makeSingleCellBox (int i, int j, int k, IndexTypeND typ = IndexTypeND::TheCellType()) { -#if (AMREX_SPACEDIM == 1) - amrex::ignore_unused(j,k); -#elif (AMREX_SPACEDIM == 2) - amrex::ignore_unused(k); -#endif - return Box(IntVect(AMREX_D_DECL(i,j,k)),IntVect(AMREX_D_DECL(i,j,k)),typ); + Dim3 p3d{i, j, k}; + IntVectND vect{p3d}; + return BoxND{vect, vect, typ}; } -struct BoxIndexer +template +[[nodiscard]] +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +BoxND makeSingleCellBox (IntVectND const& vect, IndexTypeND typ = IndexTypeND::TheCellType()) { - std::uint64_t npts; + return BoxND{vect, vect, typ}; +} -#if (AMREX_SPACEDIM == 3) - Math::FastDivmodU64 fdxy; - Math::FastDivmodU64 fdx; - IntVect lo; +template +struct BoxIndexerND +{ + std::uint64_t npts; + Math::FastDivmodU64 fdm[dim-1]; + IntVectND lo; - BoxIndexer (Box const& box) + BoxIndexerND (BoxND const& box) : npts(box.numPts()), - fdxy(std::uint64_t(box.length(0))*std::uint64_t(box.length(1))), - fdx (std::uint64_t(box.length(0))), lo (box.smallEnd()) - {} - - [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - Dim3 operator() (std::uint64_t icell) const - { - std::uint64_t x, y, z, rem; - fdxy(z, rem, icell); - fdx(y, x, rem); - return {int(x)+lo[0], int(y)+lo[1], int(z)+lo[2]}; - } + { + std::uint64_t mult = 1; + for (int i=0; i intVect (std::uint64_t icell) const { - std::uint64_t x, y, z, rem; - fdxy(z, rem, 
icell); - fdx(y, x, rem); - return {int(x)+lo[0], int(y)+lo[1], int(z)+lo[2]}; - } + IntVectND retval = lo; -#elif (AMREX_SPACEDIM == 2) + for (int i=dim-1; i>0; --i) { + std::uint64_t quotient, remainder; + fdm[i-1](quotient, remainder, icell); + retval[i] += quotient; + icell = remainder; + } - Math::FastDivmodU64 fdx; - IntVect lo; + retval[0] += icell; - BoxIndexer (Box const& box) - : npts(box.numPts()), - fdx (std::uint64_t(box.length(0))), - lo (box.smallEnd()) - {} + return retval; + } + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 operator() (std::uint64_t icell) const { - std::uint64_t x, y; - fdx(y, x, icell); - return {int(x)+lo[0], int(y)+lo[1], 0}; + return intVect(icell).dim3(); } [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - IntVect intVect (std::uint64_t icell) const - { - std::uint64_t x, y; - fdx(y, x, icell); - return {int(x)+lo[0], int(y)+lo[1]}; - } + std::uint64_t numPts () const { return npts; } +}; -#elif (AMREX_SPACEDIM == 1) +template<> +struct BoxIndexerND<1> +{ + std::uint64_t npts; int lo; - BoxIndexer (Box const& box) + BoxIndexerND (BoxND<1> const& box) : npts(box.numPts()), lo(box.smallEnd(0)) {} [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - Dim3 operator() (std::uint64_t icell) const + IntVectND<1> intVect (std::uint64_t icell) const { - return {int(icell)+lo, 0, 0}; + return IntVectND<1>{int(icell)+lo}; } [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - IntVect intVect (std::uint64_t icell) const + Dim3 operator() (std::uint64_t icell) const { - return IntVect{int(icell)+lo}; + return {int(icell)+lo, 0, 0}; } -#endif - [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::uint64_t numPts () const { return npts; } }; +using BoxIndexer = BoxIndexerND; + } #endif /*AMREX_BOX_H*/ diff --git a/Src/Base/AMReX_Box.cpp b/Src/Base/AMReX_Box.cpp index b0db9caa994..0f53bda5e99 100644 --- a/Src/Base/AMReX_Box.cpp +++ b/Src/Base/AMReX_Box.cpp @@ -10,19 +10,23 @@ namespace amrex { +namespace detail { + // // I/O functions. 
// std::ostream& -operator<< (std::ostream& os, - const Box& b) +box_write (std::ostream& os, + const int * smallend, + const int * bigend, + const int * type, + int dim) { - os << '(' - << b.smallEnd() << ' ' - << b.bigEnd() << ' ' - << b.type() - << ')'; + os << '('; + int_vector_write(os, smallend, dim) << ' '; + int_vector_write(os, bigend, dim) << ' '; + int_vector_write(os, type, dim) << ')'; if (os.fail()) { amrex::Error("operator<<(ostream&,Box&) failed"); @@ -37,37 +41,44 @@ operator<< (std::ostream& os, #define BL_IGNORE_MAX 100000 std::istream& -operator>> (std::istream& is, - Box& b) +box_read (std::istream& is, + int * smallend, + int * bigend, + int * type, + int dim) { - IntVect lo, hi, typ; - is >> std::ws; char c; is >> c; + for (int i=0; i> lo >> hi; + int_vector_read(is, smallend, dim); + int_vector_read(is, bigend, dim); is >> c; // Read an optional IndexType is.putback(c); if ( c == '(' ) { - is >> typ; + int_vector_read(is, type, dim); } is.ignore(BL_IGNORE_MAX,')'); } else if (c == '<') { is.putback(c); - is >> lo >> hi; + int_vector_read(is, smallend, dim); + int_vector_read(is, bigend, dim); is >> c; // Read an optional IndexType is.putback(c); if ( c == '<' ) { - is >> typ; + int_vector_read(is, type, dim); } //is.ignore(BL_IGNORE_MAX,'>'); } @@ -76,8 +87,6 @@ operator>> (std::istream& is, amrex::Error("operator>>(istream&,Box&): expected \'(\'"); } - b = Box(lo,hi,typ); - if (is.fail()) { amrex::Error("operator>>(istream&,Box&) failed"); } @@ -85,6 +94,8 @@ operator>> (std::istream& is, return is; } +} // namespace detail + BoxCommHelper::BoxCommHelper (const Box& bx, int* p_) : p(p_) { diff --git a/Src/Base/AMReX_BoxArray.H b/Src/Base/AMReX_BoxArray.H index 19cee3cefb3..b3b339c33bb 100644 --- a/Src/Base/AMReX_BoxArray.H +++ b/Src/Base/AMReX_BoxArray.H @@ -615,6 +615,11 @@ public: BoxArray& maxSize (const IntVect& block_size); + //! Forces each Box in BoxArray to have sizes >= min_size and <= + //! max_size. It's the caller's responsibility to make sure both the + //! BoxArray and max_size are coarsenable by min_size. + BoxArray& minmaxSize (const IntVect& min_size, const IntVect& max_size); + //! Refine each Box in the BoxArray to the specified ratio. BoxArray& refine (int refinement_ratio); diff --git a/Src/Base/AMReX_BoxArray.cpp b/Src/Base/AMReX_BoxArray.cpp index ecffd06d8a3..9bca5943526 100644 --- a/Src/Base/AMReX_BoxArray.cpp +++ b/Src/Base/AMReX_BoxArray.cpp @@ -555,12 +555,27 @@ BoxArray::maxSize (const IntVect& block_size) blst.maxSize(block_size); const int N = static_cast(blst.size()); if (size() != N) { // If size doesn't change, do nothing. - BoxList bak = (m_simplified_list) ? 
*m_simplified_list : BoxList(); + std::shared_ptr bak; + bak.swap(m_simplified_list); define(std::move(blst)); - if (bak.isNotEmpty()) { - m_simplified_list = std::make_shared(std::move(bak)); - } + m_simplified_list = std::move(bak); + } + return *this; +} + +BoxArray& +BoxArray::minmaxSize (const IntVect& min_size, const IntVect& max_size) +{ + AMREX_ASSERT(this->coarsenable(min_size) && + (max_size/min_size)*min_size == max_size); + std::shared_ptr bak; + if (m_bat.is_simple() && crseRatio() == IntVect::TheUnitVector()) { + bak.swap(m_simplified_list); } + this->coarsen(min_size); + this->maxSize(max_size/min_size); + this->refine(min_size); + m_simplified_list = std::move(bak); return *this; } diff --git a/Src/Base/AMReX_CArena.cpp b/Src/Base/AMReX_CArena.cpp index c47f8f5ed26..42987f47a86 100644 --- a/Src/Base/AMReX_CArena.cpp +++ b/Src/Base/AMReX_CArena.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #ifdef AMREX_TINY_PROFILING @@ -57,7 +58,11 @@ CArena::alloc_protected (std::size_t nbytes) } #endif - if (static_cast(m_used+nbytes) >= arena_info.release_threshold) { + if (static_cast(m_used+nbytes) >= arena_info.release_threshold +#ifdef AMREX_USE_GPU + && (MFIter::currentDepth() == 0) +#endif + ) { freeUnused_protected(); } @@ -393,7 +398,11 @@ CArena::hasFreeDeviceMemory (std::size_t sz) std::size_t nbytes = Arena::align(sz == 0 ? 1 : sz); - if (static_cast(m_used+nbytes) >= arena_info.release_threshold) { + if (static_cast(m_used+nbytes) >= arena_info.release_threshold +#ifdef AMREX_USE_GPU + && (MFIter::currentDepth() == 0) +#endif + ) { freeUnused_protected(); } diff --git a/Src/Base/AMReX_DistributionMapping.cpp b/Src/Base/AMReX_DistributionMapping.cpp index 958f7ef9ec6..cfe16b346a1 100644 --- a/Src/Base/AMReX_DistributionMapping.cpp +++ b/Src/Base/AMReX_DistributionMapping.cpp @@ -118,14 +118,15 @@ DistributionMapping::Initialize () ParmParse pp("DistributionMapping"); - pp.queryAdd("v" , verbose); - pp.queryAdd("verbose", verbose); - pp.queryAdd("efficiency", max_efficiency); - pp.queryAdd("sfc_threshold", sfc_threshold); - pp.queryAdd("node_size", node_size); - pp.queryAdd("verbose_mapper", flag_verbose_mapper); - - std::string theStrategy; + if (! 
pp.query("verbose", "v", verbose)) { + pp.add("verbose", verbose); + } + pp.query("efficiency", max_efficiency); + pp.query("sfc_threshold", sfc_threshold); + pp.query("node_size", node_size); + pp.query("verbose_mapper", flag_verbose_mapper); + + std::string theStrategy("SFC"); if (pp.query("strategy", theStrategy)) { diff --git a/Src/Base/AMReX_Enum.H b/Src/Base/AMReX_Enum.H new file mode 100644 index 00000000000..09583f5b73f --- /dev/null +++ b/Src/Base/AMReX_Enum.H @@ -0,0 +1,81 @@ +#ifndef AMREX_ENUM_H_ +#define AMREX_ENUM_H_ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +template +using amrex_enum_traits = decltype(amrex_get_enum_traits(std::declval())); + +namespace amrex { + template , + std::enable_if_t = 0> + T getEnum (std::string_view const& s) + { + auto pos = ET::enum_names.find(s); + if (pos == std::string_view::npos) { + std::string error_msg("amrex::getEnum: Unknown enum: "); + error_msg.append(s).append(" in AMREX_ENUM(").append(ET::class_name) + .append(", ").append(ET::enum_names).append(")."); + throw std::runtime_error(error_msg); + } + auto count = std::count(ET::enum_names.begin(), + ET::enum_names.begin()+pos, ','); + return static_cast(count); + } + + template , + std::enable_if_t = 0> + std::string getEnumNameString (T const& v) + { + auto n = static_cast(v); + std::size_t pos = 0; + for (int i = 0; i < n; ++i) { + pos = ET::enum_names.find(',', pos); + if (pos == std::string::npos) { + std::string error_msg("amrex::getEnum: Unknown enum value: "); + error_msg.append(std::to_string(n)).append(" in AMREX_ENUM(") + .append(ET::class_name).append(", ").append(ET::enum_names) + .append(")."); + throw std::runtime_error(error_msg); + } + ++pos; + } + auto pos2 = ET::enum_names.find(',', pos); + return amrex::trim(std::string(ET::enum_names.substr(pos,pos2-pos))); + } + + template , + std::enable_if_t = 0> + std::vector getEnumNameStrings () + { + return amrex::split(std::string(ET::enum_names), ", "); + } + + template , + std::enable_if_t = 0> + std::string getEnumClassName () + { + return std::string(ET::class_name); + } +} + +#define AMREX_ENUM(CLASS, ...) \ + enum class CLASS : int { __VA_ARGS__ }; \ + struct CLASS##_EnumTraits { \ + using enum_class_t = CLASS; \ + static constexpr bool value = true; \ + static constexpr std::string_view class_name{#CLASS}; \ + static constexpr std::string_view enum_names{#__VA_ARGS__}; \ + }; \ + CLASS##_EnumTraits amrex_get_enum_traits(CLASS) + +#endif diff --git a/Src/Base/AMReX_FArrayBox.cpp b/Src/Base/AMReX_FArrayBox.cpp index ecb7fc0f4fc..3341c3da5dd 100644 --- a/Src/Base/AMReX_FArrayBox.cpp +++ b/Src/Base/AMReX_FArrayBox.cpp @@ -24,10 +24,8 @@ bool FArrayBox::initialized = false; #if defined(AMREX_DEBUG) || defined(AMREX_TESTING) bool FArrayBox::do_initval = true; -bool FArrayBox::init_snan = true; #else bool FArrayBox::do_initval = false; -bool FArrayBox::init_snan = false; #endif Real FArrayBox::initval; @@ -144,6 +142,9 @@ FArrayBox::FArrayBox (const Box& b, int ncomp, Real const* p) noexcept void FArrayBox::initVal () noexcept { + // If amrex::InitSNaN is true, snans have been filled by BaseFab. 
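Illustrative sketch, not part of the patch: the new AMReX_Enum.H added above couples an enum class to its string table so values can be converted to and from their names (handy for ParmParse-style inputs). A minimal usage sketch, with the template brackets that this rendering stripped from the getEnum signatures restored as an assumption:

    #include <AMReX_Enum.H>
    #include <iostream>

    // Declares `enum class Flavor : int` plus the Flavor_EnumTraits type that
    // amrex::getEnum / getEnumNameString find through amrex_get_enum_traits().
    AMREX_ENUM(Flavor, vanilla, chocolate, strawberry);

    int main ()
    {
        Flavor f = amrex::getEnum<Flavor>("chocolate");      // string -> enum value
        std::cout << amrex::getEnumNameString(f) << '\n';    // prints "chocolate"
        for (auto const& name : amrex::getEnumNameStrings<Flavor>()) {
            std::cout << name << ' ';                        // vanilla chocolate strawberry
        }
        std::cout << '\n';
        return 0;
    }
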
+ if (amrex::InitSNaN()) { return; } + Real * p = dataPtr(); Long s = size(); if (p && s > 0) { @@ -158,22 +159,7 @@ FArrayBox::initVal () noexcept runon = RunOn::Cpu; #endif - if (init_snan) { -#if defined(AMREX_USE_GPU) - if (runon == RunOn::Gpu) - { - amrex::ParallelFor(s, [=] AMREX_GPU_DEVICE (Long i) noexcept - { - p[i] = std::numeric_limits::signaling_NaN(); - }); - Gpu::streamSynchronize(); - } - else -#endif - { - amrex_array_init_snan(p, s); - } - } else if (do_initval) { + if (do_initval) { const Real x = initval; AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FLAG (runon, s, i, { @@ -435,9 +421,8 @@ FArrayBox::Initialize () ? std::numeric_limits::quiet_NaN() : std::numeric_limits::max(); - pp.queryAdd("initval", initval); - pp.queryAdd("do_initval", do_initval); - pp.queryAdd("init_snan", init_snan); + pp.query("initval", initval); + pp.query("do_initval", do_initval); amrex::ExecOnFinalize(FArrayBox::Finalize); } diff --git a/Src/Base/AMReX_FEIntegrator.H b/Src/Base/AMReX_FEIntegrator.H index f8a002ef534..0e89ffb9d5a 100644 --- a/Src/Base/AMReX_FEIntegrator.H +++ b/Src/Base/AMReX_FEIntegrator.H @@ -53,9 +53,6 @@ public: // So we initialize S_new by copying the old state. IntegratorOps::Copy(S_new, S_old); - // Call the pre RHS hook - BaseT::pre_rhs_action(S_new, time); - // F = RHS(S, t) T& F = *F_nodes[0]; BaseT::Rhs(F, S_new, time); diff --git a/Src/Base/AMReX_FabArrayBase.cpp b/Src/Base/AMReX_FabArrayBase.cpp index d8820e97e6f..9acf440d98f 100644 --- a/Src/Base/AMReX_FabArrayBase.cpp +++ b/Src/Base/AMReX_FabArrayBase.cpp @@ -27,10 +27,7 @@ namespace amrex { -// -// Set default values in Initialize()!!! -// -int FabArrayBase::MaxComp; +int FabArrayBase::MaxComp = 25; #if defined(AMREX_USE_GPU) @@ -99,11 +96,6 @@ FabArrayBase::Initialize () if (initialized) { return; } initialized = true; - // - // Set default values here!!! 
- // - FabArrayBase::MaxComp = 25; - ParmParse pp("fabarray"); Vector tilesize(AMREX_SPACEDIM); @@ -112,13 +104,25 @@ FabArrayBase::Initialize () { for (int i=0; i{AMREX_D_DECL(FabArrayBase::mfiter_tile_size[0], + FabArrayBase::mfiter_tile_size[1], + FabArrayBase::mfiter_tile_size[2])}); + } if (pp.queryarr("comm_tile_size", tilesize, 0, AMREX_SPACEDIM)) { for (int i=0; i{AMREX_D_DECL(FabArrayBase::comm_tile_size[0], + FabArrayBase::comm_tile_size[1], + FabArrayBase::comm_tile_size[2])}); + } - pp.queryAdd("maxcomp", FabArrayBase::MaxComp); + pp.query("maxcomp", FabArrayBase::MaxComp); if (MaxComp < 1) { MaxComp = 1; @@ -341,7 +345,7 @@ FabArrayBase::CPC::define (const BoxArray& ba_dst, const DistributionMapping& dm std::vector< std::pair > isects; - const std::vector& pshifts = m_period.shiftIntVect(); + const std::vector& pshifts = m_period.shiftIntVect(ng_dst); auto& send_tags = *m_SndTags; @@ -668,7 +672,7 @@ FabArrayBase::define_fb_metadata (CommMetaData& cmd, const IntVect& nghost, const IntVect ng_ng =nghost - 1; std::vector< std::pair > isects; - const std::vector& pshifts = period.shiftIntVect(); + const std::vector& pshifts = period.shiftIntVect(nghost); auto& send_tags = *cmd.m_SndTags; @@ -897,7 +901,7 @@ FabArrayBase::FB::define_epo (const FabArrayBase& fa) const IndexType& typ = ba.ixType(); std::vector< std::pair > isects; - const std::vector& pshifts = m_period.shiftIntVect(); + const std::vector& pshifts = m_period.shiftIntVect(ng); auto& send_tags = *m_SndTags; @@ -1049,7 +1053,7 @@ void FabArrayBase::FB::tag_one_box (int krcv, BoxArray const& ba, DistributionMa std::vector > isects2; std::vector > isects3; - auto const& pshifts = m_period.shiftIntVect(); + auto const& pshifts = m_period.shiftIntVect(m_ngrow); for (auto const& shft: pshifts) { ba.intersections(gbx+shft, isects2); for (auto const& is2 : isects2) { @@ -1140,7 +1144,7 @@ FabArrayBase::FB::define_os (const FabArrayBase& fa) #ifdef AMREX_USE_MPI if (ParallelDescriptor::NProcs() > 1) { - const std::vector& pshifts = m_period.shiftIntVect(); + const std::vector& pshifts = m_period.shiftIntVect(m_ngrow); std::vector< std::pair > isects; std::set my_receiver; diff --git a/Src/Base/AMReX_ForkJoin.cpp b/Src/Base/AMReX_ForkJoin.cpp index f0cc03ae04b..ce51203ab35 100644 --- a/Src/Base/AMReX_ForkJoin.cpp +++ b/Src/Base/AMReX_ForkJoin.cpp @@ -68,8 +68,8 @@ ForkJoin::ForkJoin (const Vector &task_rank_pct) void ForkJoin::init(const Vector &task_rank_n) { - ParmParse pp("forkjoin"); - pp.queryAdd("verbose", flag_verbose); + ParmParse pp("amrex.forkjoin"); + pp.query("verbose", flag_verbose); const auto task_n = task_rank_n.size(); AMREX_ALWAYS_ASSERT_WITH_MESSAGE(task_n > 0, diff --git a/Src/Base/AMReX_Geometry.H b/Src/Base/AMReX_Geometry.H index 01759e1b502..6231fbd1f9b 100644 --- a/Src/Base/AMReX_Geometry.H +++ b/Src/Base/AMReX_Geometry.H @@ -425,6 +425,14 @@ public: */ void computeRoundoffDomain (); + //! Returns roundoff domain's lower end + [[nodiscard]] GpuArray const& + RoundOffLo () const { return roundoff_lo; } + + //! 
Returns roundoff domain's higher end + [[nodiscard]] GpuArray const& + RoundOffHi () const { return roundoff_hi; } + private: void read_params (); @@ -440,6 +448,8 @@ private: // Box domain; + + friend std::istream& operator>> (std::istream&, Geometry&); }; diff --git a/Src/Base/AMReX_Geometry.cpp b/Src/Base/AMReX_Geometry.cpp index 2ec216ae109..4b63f29fdd8 100644 --- a/Src/Base/AMReX_Geometry.cpp +++ b/Src/Base/AMReX_Geometry.cpp @@ -15,22 +15,18 @@ namespace amrex { std::ostream& -operator<< (std::ostream& os, - const Geometry& g) +operator<< (std::ostream& os, const Geometry& g) { os << (CoordSys&) g << g.ProbDomain() << g.Domain() << 'P' << IntVect(g.isPeriodic()); return os; } std::istream& -operator>> (std::istream& is, - Geometry& g) +operator>> (std::istream& is, Geometry& g) { - Box bx; - RealBox rb; - is >> (CoordSys&) g >> rb >> bx; - g.Domain(bx); - g.ProbDomain(rb); + is >> (CoordSys&) g >> g.prob_domain >> g.domain; + + g.computeRoundoffDomain(); int ic = is.peek(); if (ic == static_cast('P')) { diff --git a/Src/Base/AMReX_GpuDevice.H b/Src/Base/AMReX_GpuDevice.H index 7c17c918a70..a7aef5a9243 100644 --- a/Src/Base/AMReX_GpuDevice.H +++ b/Src/Base/AMReX_GpuDevice.H @@ -14,6 +14,7 @@ #include #include #include +#include #include #define AMREX_GPU_MAX_STREAMS 8 @@ -318,6 +319,52 @@ dtod_memcpy (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept void hypreSynchronize (); #endif +//! Copy `nbytes` bytes from host to device global variable. `offset` is the +//! offset in bytes from the start of the device global variable. +template +void memcpy_from_host_to_device_global_async (T& dg, const void* src, + std::size_t nbytes, + std::size_t offset = 0) +{ +#if defined(AMREX_USE_CUDA) + AMREX_CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(dg, src, nbytes, offset, + cudaMemcpyHostToDevice, + Device::gpuStream())); +#elif defined(AMREX_USE_HIP) + AMREX_HIP_SAFE_CALL(hipMemcpyToSymbolAsync(dg, src, nbytes, offset, + hipMemcpyHostToDevice, + Device::gpuStream())); +#elif defined(AMREX_USE_SYCL) + Device::streamQueue().memcpy(dg, src, nbytes, offset); +#else + auto* p = (char*)(&dg); + std::memcpy(p+offset, src, nbytes); +#endif +} + +//! Copy `nbytes` bytes from device global variable to host. `offset` is the +//! offset in bytes from the start of the device global variable. +template +void memcpy_from_device_global_to_host_async (void* dst, T const& dg, + std::size_t nbytes, + std::size_t offset = 0) +{ +#if defined(AMREX_USE_CUDA) + AMREX_CUDA_SAFE_CALL(cudaMemcpyFromSymbolAsync(dst, dg, nbytes, offset, + cudaMemcpyDeviceToHost, + Device::gpuStream())); +#elif defined(AMREX_USE_HIP) + AMREX_HIP_SAFE_CALL(hipMemcpyFromSymbolAsync(dst, dg, nbytes, offset, + hipMemcpyDeviceToHost, + Device::gpuStream())); +#elif defined(AMREX_USE_SYCL) + Device::streamQueue().memcpy(dst, dg, nbytes, offset); +#else + auto const* p = (char const*)(&dg); + std::memcpy(dst, p+offset, nbytes); +#endif +} + } #endif diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp index fa190a22c1c..d911349a61f 100644 --- a/Src/Base/AMReX_GpuDevice.cpp +++ b/Src/Base/AMReX_GpuDevice.cpp @@ -19,7 +19,11 @@ #if defined(AMREX_USE_CUDA) #include #if defined(AMREX_PROFILING) || defined (AMREX_TINY_PROFILING) -#include +#if __has_include() +# include +#else +# include +#endif #endif #endif @@ -155,9 +159,9 @@ Device::Initialize () max_gpu_streams = std::max(max_gpu_streams, 1); ParmParse pp("device"); - - pp.queryAdd("v", verbose); - pp.queryAdd("verbose", verbose); + if (! 
pp.query("verbose", "v", verbose)) { + pp.add("verbose", verbose); + } if (amrex::Verbose()) { AMREX_HIP_OR_CUDA_OR_SYCL @@ -519,9 +523,9 @@ Device::initialize_gpu () int ny = 0; int nz = 0; - pp.queryAdd("numThreads.x", nx); - pp.queryAdd("numThreads.y", ny); - pp.queryAdd("numThreads.z", nz); + pp.query("numThreads.x", nx); + pp.query("numThreads.y", ny); + pp.query("numThreads.z", nz); numThreadsOverride.x = (int) nx; numThreadsOverride.y = (int) ny; @@ -531,9 +535,9 @@ Device::initialize_gpu () ny = 0; nz = 0; - pp.queryAdd("numBlocks.x", nx); - pp.queryAdd("numBlocks.y", ny); - pp.queryAdd("numBlocks.z", nz); + pp.query("numBlocks.x", nx); + pp.query("numBlocks.y", ny); + pp.query("numBlocks.z", nz); numBlocksOverride.x = (int) nx; numBlocksOverride.y = (int) ny; @@ -542,8 +546,8 @@ Device::initialize_gpu () // Graph initialization int graph_init = 0; int graph_size = 10000; - pp.queryAdd("graph_init", graph_init); - pp.queryAdd("graph_init_nodes", graph_size); + pp.query("graph_init", graph_init); + pp.query("graph_init_nodes", graph_size); if (graph_init) { diff --git a/Src/Base/AMReX_GpuLaunch.nolint.H b/Src/Base/AMReX_GpuLaunch.nolint.H index c7df1737517..bb1bbb2453b 100644 --- a/Src/Base/AMReX_GpuLaunch.nolint.H +++ b/Src/Base/AMReX_GpuLaunch.nolint.H @@ -1,9 +1,8 @@ // Do not include this header anywhere other than AMReX_GpuLaunch.H. // The purpose of this file is to avoid clang-tidy. -#define AMREX_WRONG_NUM_ARGS(...) static_assert(false,"Wrong number of arguments to macro") -#define AMREX_GET_MACRO(_1,_2,_3,_4,_5,_6,_7,_8,_9,NAME,...) NAME -#define AMREX_LAUNCH_DEVICE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\ +#define AMREX_GET_LAUNCH_MACRO(_1,_2,_3,_4,_5,_6,_7,_8,_9,NAME,...) NAME +#define AMREX_LAUNCH_DEVICE_LAMBDA(...) AMREX_GET_LAUNCH_MACRO(__VA_ARGS__,\ AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_3, \ AMREX_WRONG_NUM_ARGS, \ AMREX_WRONG_NUM_ARGS, \ @@ -14,7 +13,7 @@ AMREX_WRONG_NUM_ARGS, \ AMREX_WRONG_NUM_ARGS)(__VA_ARGS__) -#define AMREX_LAUNCH_HOST_DEVICE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\ +#define AMREX_LAUNCH_HOST_DEVICE_LAMBDA(...) 
AMREX_GET_LAUNCH_MACRO(__VA_ARGS__,\ AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_3, \ AMREX_WRONG_NUM_ARGS, \ AMREX_WRONG_NUM_ARGS, \ diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 1850ff6ff96..927b729f7ed 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -5,53 +5,115 @@ namespace amrex { namespace detail { + + // call_f_scalar_handler + template - AMREX_GPU_DEVICE - auto call_f (F const& f, N i) + AMREX_FORCE_INLINE + auto call_f_scalar_handler (F const& f, N i) noexcept -> decltype(f(0)) { f(i); } template - AMREX_GPU_DEVICE - auto call_f (F const& f, N i) + AMREX_FORCE_INLINE + auto call_f_scalar_handler (F const& f, N i) noexcept -> decltype(f(0,Gpu::Handler{})) { - f(i,Gpu::Handler{}); + f(i, Gpu::Handler{}); + } + + // call_f_intvect_inner + + template + AMREX_FORCE_INLINE + auto call_f_intvect_inner (std::index_sequence, F const& f, IntVectND<1> iv, Args...args) + noexcept -> decltype(f(0, 0, 0, args...)) + { + f(iv[0], 0, 0, args...); + } + + template + AMREX_FORCE_INLINE + auto call_f_intvect_inner (std::index_sequence, F const& f, IntVectND<2> iv, Args...args) + noexcept -> decltype(f(0, 0, 0, args...)) + { + f(iv[0], iv[1], 0, args...); + } + + template + AMREX_FORCE_INLINE + auto call_f_intvect_inner (std::index_sequence, F const& f, IntVectND iv, Args...args) + noexcept -> decltype(f(iv, args...)) + { + f(iv, args...); + } + + template + AMREX_FORCE_INLINE + auto call_f_intvect_inner (std::index_sequence, F const& f, IntVectND iv, Args...args) + noexcept -> decltype(f(iv[Ns]..., args...)) + { + f(iv[Ns]..., args...); + } + + // call_f_intvect_engine + + template + AMREX_FORCE_INLINE + auto call_f_intvect_engine (F const& f, IntVectND iv, RandomEngine engine) + noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence(), f, iv, engine)) + { + call_f_intvect_inner(std::make_index_sequence(), f, iv, engine); + } + + // call_f_intvect_handler + + template + AMREX_FORCE_INLINE + auto call_f_intvect_handler (F const& f, IntVectND iv) + noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence(), f, iv)) + { + call_f_intvect_inner(std::make_index_sequence(), f, iv); } - template - AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k) - noexcept -> decltype(f(0,0,0)) + template + AMREX_FORCE_INLINE + auto call_f_intvect_handler (F const& f, IntVectND iv) + noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence(), f, iv, Gpu::Handler{})) { - f(i,j,k); + call_f_intvect_inner(std::make_index_sequence(), f, iv, Gpu::Handler{}); } - template - AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k) - noexcept -> decltype(f(0,0,0,Gpu::Handler{})) + // call_f_intvect_ncomp_engine + + template + AMREX_FORCE_INLINE + auto call_f_intvect_ncomp_engine (F const& f, IntVectND iv, T n, RandomEngine engine) + noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence(), f, iv, n, engine)) { - f(i,j,k,Gpu::Handler{}); + call_f_intvect_inner(std::make_index_sequence(), f, iv, n, engine); } - template - AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, T n) - noexcept -> decltype(f(0,0,0,0)) + // call_f_intvect_ncomp_handler + + template + AMREX_FORCE_INLINE + auto call_f_intvect_ncomp_handler (F const& f, IntVectND iv, T n) + noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence(), f, iv, n)) { - f(i,j,k,n); + call_f_intvect_inner(std::make_index_sequence(), f, iv, n); } - template - AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int 
j, int k, T n) - noexcept -> decltype(f(0,0,0,0,Gpu::Handler{})) + template + AMREX_FORCE_INLINE + auto call_f_intvect_ncomp_handler (F const& f, IntVectND iv, T n) + noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence(), f, iv, n, Gpu::Handler{})) { - f(i,j,k,n,Gpu::Handler{}); + call_f_intvect_inner(std::make_index_sequence(), f, iv, n, Gpu::Handler{}); } + } template @@ -72,7 +134,7 @@ AMREX_ATTRIBUTE_FLATTEN_FOR void For (T n, L const& f) noexcept { for (T i = 0; i < n; ++i) { - detail::call_f(f,i); + detail::call_f_scalar_handler(f,i); } } @@ -102,7 +164,7 @@ void ParallelFor (T n, L const& f) noexcept { AMREX_PRAGMA_SIMD for (T i = 0; i < n; ++i) { - detail::call_f(f,i); + detail::call_f_scalar_handler(f,i); } } @@ -126,182 +188,294 @@ void ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept ParallelFor(n, std::forward(f)); } -template +namespace detail { + +template +AMREX_FORCE_INLINE +void For_impND (L const& f, IntVectND const lo, IntVectND const hi, IntVectND iv) noexcept +{ + if constexpr (idim == 1) { + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_handler(f,iv); + } + } else if constexpr (idim == 2) { + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_handler(f,iv); + }} + } else if constexpr (idim == 3) { + for (int i2 = lo[2], h2 = hi[2]; i2 <= h2; ++i2) { iv[2] = i2; + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_handler(f,iv); + }}} + } else { + for (int id = lo[idim-1], hd = hi[idim-1]; id <= hd; ++id) { iv[idim-1] = id; + For_impND(f, lo, hi, iv); + } + } +} + +} + +template AMREX_ATTRIBUTE_FLATTEN_FOR -void For (Box const& box, L const& f) noexcept +void For (BoxND const& box, L const& f) noexcept { - const auto lo = amrex::lbound(box); - const auto hi = amrex::ubound(box); - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - for (int i = lo.x; i <= hi.x; ++i) { - detail::call_f(f,i,j,k); - }}} + const auto lo = amrex::lbound_iv(box); + const auto hi = amrex::ubound_iv(box); + IntVectND iv; + detail::For_impND(f, lo, hi, iv); } -template -void For (Box const& box, L&& f) noexcept +template +void For (BoxND const& box, L&& f) noexcept { amrex::ignore_unused(MT); For(box, std::forward(f)); } -template -void For (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +template +void For (Gpu::KernelInfo const&, BoxND const& box, L&& f) noexcept { For(box, std::forward(f)); } -template -void For (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +template +void For (Gpu::KernelInfo const&, BoxND const& box, L&& f) noexcept { amrex::ignore_unused(MT); For(box, std::forward(f)); } -template +namespace detail { + +template +AMREX_FORCE_INLINE +void ParallelFor_impND (L const& f, IntVectND const lo, IntVectND const hi, IntVectND iv) noexcept +{ + if constexpr (idim == 1) { + AMREX_PRAGMA_SIMD + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_handler(f,iv); + } + } else if constexpr (idim == 2) { + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + AMREX_PRAGMA_SIMD + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_handler(f,iv); + }} + } else if constexpr (idim == 3) { + for (int i2 = lo[2], h2 = hi[2]; i2 <= h2; ++i2) { iv[2] = i2; + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + AMREX_PRAGMA_SIMD + 
for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_handler(f,iv); + }}} + } else { + for (int id = lo[idim-1], hd = hi[idim-1]; id <= hd; ++id) { iv[idim-1] = id; + ParallelFor_impND(f, lo, hi, iv); + } + } +} + +} + +template AMREX_ATTRIBUTE_FLATTEN_FOR -void ParallelFor (Box const& box, L const& f) noexcept +void ParallelFor (BoxND const& box, L const& f) noexcept { - const auto lo = amrex::lbound(box); - const auto hi = amrex::ubound(box); - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - detail::call_f(f,i,j,k); - }}} + const auto lo = amrex::lbound_iv(box); + const auto hi = amrex::ubound_iv(box); + IntVectND iv; + detail::ParallelFor_impND(f, lo, hi, iv); } -template -void ParallelFor (Box const& box, L&& f) noexcept +template +void ParallelFor (BoxND const& box, L&& f) noexcept { amrex::ignore_unused(MT); ParallelFor(box, std::forward(f)); } -template -void ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +template +void ParallelFor (Gpu::KernelInfo const&, BoxND const& box, L&& f) noexcept { ParallelFor(box, std::forward(f)); } -template -void ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +template +void ParallelFor (Gpu::KernelInfo const&, BoxND const& box, L&& f) noexcept { amrex::ignore_unused(MT); ParallelFor(box, std::forward(f)); } -template > > +namespace detail { + +template +AMREX_FORCE_INLINE +void For_impND (L const& f, IntVectND const lo, IntVectND const hi, IntVectND iv, T n) noexcept +{ + if constexpr (idim == 1) { + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp_handler(f,iv,n); + } + } else if constexpr (idim == 2) { + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp_handler(f,iv,n); + }} + } else if constexpr (idim == 3) { + for (int i2 = lo[2], h2 = hi[2]; i2 <= h2; ++i2) { iv[2] = i2; + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp_handler(f,iv,n); + }}} + } else { + for (int id = lo[idim-1], hd = hi[idim-1]; id <= hd; ++id) { iv[idim-1] = id; + For_impND(f, lo, hi, iv, n); + } + } +} + +} + +template > > AMREX_ATTRIBUTE_FLATTEN_FOR -void For (Box const& box, T ncomp, L const& f) noexcept +void For (BoxND const& box, T ncomp, L const& f) noexcept { - const auto lo = amrex::lbound(box); - const auto hi = amrex::ubound(box); + const auto lo = amrex::lbound_iv(box); + const auto hi = amrex::ubound_iv(box); + IntVectND iv; for (T n = 0; n < ncomp; ++n) { - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - for (int i = lo.x; i <= hi.x; ++i) { - detail::call_f(f,i,j,k,n); - }}} + detail::For_impND(f, lo, hi, iv, n); } } -template > > -void For (Box const& box, T ncomp, L&& f) noexcept +template > > +void For (BoxND const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); For(box, ncomp, std::forward(f)); } -template > > -void For (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +template > > +void For (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L&& f) noexcept { For(box, ncomp, std::forward(f)); } -template > > -void For (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +template > > +void For (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); 
For(box, ncomp, std::forward(f)); } -template > > -AMREX_ATTRIBUTE_FLATTEN_FOR -void ParallelFor (Box const& box, T ncomp, L const& f) noexcept +namespace detail { + +template +AMREX_FORCE_INLINE +void ParallelFor_impND (L const& f, IntVectND const lo, IntVectND const hi, IntVectND iv, T n) noexcept { - const auto lo = amrex::lbound(box); - const auto hi = amrex::ubound(box); - for (T n = 0; n < ncomp; ++n) { - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { + if constexpr (idim == 1) { AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - detail::call_f(f,i,j,k,n); + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp_handler(f,iv,n); + } + } else if constexpr (idim == 2) { + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + AMREX_PRAGMA_SIMD + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp_handler(f,iv,n); + }} + } else if constexpr (idim == 3) { + for (int i2 = lo[2], h2 = hi[2]; i2 <= h2; ++i2) { iv[2] = i2; + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + AMREX_PRAGMA_SIMD + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp_handler(f,iv,n); }}} + } else { + for (int id = lo[idim-1], hd = hi[idim-1]; id <= hd; ++id) { iv[idim-1] = id; + ParallelFor_impND(f, lo, hi, iv, n); + } } } -template > > -void ParallelFor (Box const& box, T ncomp, L&& f) noexcept +} + +template > > +AMREX_ATTRIBUTE_FLATTEN_FOR +void ParallelFor (BoxND const& box, T ncomp, L const& f) noexcept +{ + const auto lo = amrex::lbound_iv(box); + const auto hi = amrex::ubound_iv(box); + IntVectND iv; + for (T n = 0; n < ncomp; ++n) { + detail::ParallelFor_impND(f, lo, hi, iv, n); + } +} + +template > > +void ParallelFor (BoxND const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); ParallelFor(box, ncomp, std::forward(f)); } -template > > -void ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +template > > +void ParallelFor (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L&& f) noexcept { ParallelFor(box, ncomp, std::forward(f)); } -template > > -void ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +template > > +void ParallelFor (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); ParallelFor(box, ncomp, std::forward(f)); } -template -void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void For (BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { For(box1, std::forward(f1)); For(box2, std::forward(f2)); } -template -void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void For (BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { amrex::ignore_unused(MT); For(box1, std::forward(f1)); For(box2, std::forward(f2)); } -template -void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void For (Gpu::KernelInfo const&, BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { For (box1, box2, std::forward(f1), std::forward(f2)); } -template -void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void For (Gpu::KernelInfo const&, BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { amrex::ignore_unused(MT); For (box1, box2, std::forward(f1), std::forward(f2)); } -template -void For (Box const& box1, Box const& box2, Box const& box3, 
L1&& f1, L2&& f2, L3&& f3) noexcept +template +void For (BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { For(box1, std::forward(f1)); For(box2, std::forward(f2)); For(box3, std::forward(f3)); } -template -void For (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +template +void For (BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { amrex::ignore_unused(MT); For(box1, std::forward(f1)); @@ -309,81 +483,81 @@ void For (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L For(box3, std::forward(f3)); } -template -void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +template +void For (Gpu::KernelInfo const&, BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { For(box1, box2, box3, std::forward(f1), std::forward(f2), std::forward(f3)); } -template -void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +template +void For (Gpu::KernelInfo const&, BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { amrex::ignore_unused(MT); For(box1, box2, box3, std::forward(f1), std::forward(f2), std::forward(f3)); } -template >, typename M2=std::enable_if_t> > -void For (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept +void For (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { For(box1, ncomp1, std::forward(f1)); For(box2, ncomp2, std::forward(f2)); } -template >, typename M2=std::enable_if_t> > -void For (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept +void For (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { amrex::ignore_unused(MT); For(box1, ncomp1, std::forward(f1)); For(box2, ncomp2, std::forward(f2)); } -template >, typename M2=std::enable_if_t> > void For (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template >, typename M2=std::enable_if_t> > void For (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { amrex::ignore_unused(MT); For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template >, typename M2=std::enable_if_t>, typename M3=std::enable_if_t> > -void For (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept +void For (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { For(box1, ncomp1, std::forward(f1)); For(box2, ncomp2, std::forward(f2)); For(box3, ncomp3, std::forward(f3)); } -template >, typename M2=std::enable_if_t>, typename M3=std::enable_if_t> > -void For (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept +void For (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { 
amrex::ignore_unused(MT); For(box1, ncomp1, std::forward(f1)); @@ -391,28 +565,28 @@ void For (Box const& box1, T1 ncomp1, L1&& f1, For(box3, ncomp3, std::forward(f3)); } -template >, typename M2=std::enable_if_t>, typename M3=std::enable_if_t> > void For (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { For(box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } -template >, typename M2=std::enable_if_t>, typename M3=std::enable_if_t> > void For (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { amrex::ignore_unused(MT); For(box1,ncomp1,std::forward(f1), @@ -420,44 +594,44 @@ void For (Gpu::KernelInfo const&, box3,ncomp3,std::forward(f3)); } -template -void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void ParallelFor (BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(box1, std::forward(f1)); ParallelFor(box2, std::forward(f2)); } -template -void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void ParallelFor (BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { amrex::ignore_unused(MT); ParallelFor(box1, std::forward(f1)); ParallelFor(box2, std::forward(f2)); } -template -void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void ParallelFor (Gpu::KernelInfo const&, BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); } -template -void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void ParallelFor (Gpu::KernelInfo const&, BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { amrex::ignore_unused(MT); ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); } -template -void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +template +void ParallelFor (BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { ParallelFor(box1, std::forward(f1)); ParallelFor(box2, std::forward(f2)); ParallelFor(box3, std::forward(f3)); } -template -void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +template +void ParallelFor (BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { amrex::ignore_unused(MT); ParallelFor(box1, std::forward(f1)); @@ -465,83 +639,83 @@ void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2 ParallelFor(box3, std::forward(f3)); } -template -void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +template +void ParallelFor (Gpu::KernelInfo const&, BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } -template -void ParallelFor (Gpu::KernelInfo const&, Box const& 
box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +template +void ParallelFor (Gpu::KernelInfo const&, BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { amrex::ignore_unused(MT); ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } -template >, typename M2=std::enable_if_t> > -void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept +void ParallelFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { ParallelFor(box1, ncomp1, std::forward(f1)); ParallelFor(box2, ncomp2, std::forward(f2)); } -template >, typename M2=std::enable_if_t> > -void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept +void ParallelFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { amrex::ignore_unused(MT); ParallelFor(box1, ncomp1, std::forward(f1)); ParallelFor(box2, ncomp2, std::forward(f2)); } -template >, typename M2=std::enable_if_t> > void ParallelFor (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { ParallelFor(box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2)); } -template >, typename M2=std::enable_if_t> > void ParallelFor (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { amrex::ignore_unused(MT); ParallelFor(box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2)); } -template >, typename M2=std::enable_if_t>, typename M3=std::enable_if_t> > -void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept +void ParallelFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { ParallelFor(box1, ncomp1, std::forward(f1)); ParallelFor(box2, ncomp2, std::forward(f2)); ParallelFor(box3, ncomp3, std::forward(f3)); } -template >, typename M2=std::enable_if_t>, typename M3=std::enable_if_t> > -void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept +void ParallelFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { amrex::ignore_unused(MT); ParallelFor(box1, ncomp1, std::forward(f1)); @@ -549,28 +723,28 @@ void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, ParallelFor(box3, ncomp3, std::forward(f3)); } -template >, typename M2=std::enable_if_t>, typename M3=std::enable_if_t> > void ParallelFor (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { ParallelFor(box1, ncomp1, std::forward(f1), box2, ncomp2, std::forward(f2), box3, ncomp3, std::forward(f3)); } -template >, typename M2=std::enable_if_t>, typename M3=std::enable_if_t> > void ParallelFor (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 
ncomp3, L3&& f3) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { amrex::ignore_unused(MT); ParallelFor(box1, ncomp1, std::forward(f1), @@ -591,99 +765,99 @@ void HostDeviceParallelFor (T n, L&& f) noexcept ParallelFor(n,std::forward(f)); } -template -void HostDeviceParallelFor (Box const& box, L&& f) noexcept +template +void HostDeviceParallelFor (BoxND const& box, L&& f) noexcept { ParallelFor(box,std::forward(f)); } -template -void HostDeviceParallelFor (Box const& box, L&& f) noexcept +template +void HostDeviceParallelFor (BoxND const& box, L&& f) noexcept { amrex::ignore_unused(MT); ParallelFor(box,std::forward(f)); } -template > > -void HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept +template > > +void HostDeviceParallelFor (BoxND const& box, T ncomp, L&& f) noexcept { ParallelFor(box,ncomp,std::forward(f)); } -template > > -void HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept +template > > +void HostDeviceParallelFor (BoxND const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); ParallelFor(box,ncomp,std::forward(f)); } -template -void HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void HostDeviceParallelFor (BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); } -template -void HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void HostDeviceParallelFor (BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { amrex::ignore_unused(MT); ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); } -template -void HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, +template +void HostDeviceParallelFor (BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } -template -void HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, +template +void HostDeviceParallelFor (BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { amrex::ignore_unused(MT); ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } -template >, typename M2=std::enable_if_t> > -void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept +void HostDeviceParallelFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { ParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template >, typename M2=std::enable_if_t> > -void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept +void HostDeviceParallelFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { amrex::ignore_unused(MT); ParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template >, typename M2=std::enable_if_t>, typename M3=std::enable_if_t> > -void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept +void HostDeviceParallelFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { ParallelFor(box1,ncomp1,std::forward(f1), 
box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } -template >, typename M2=std::enable_if_t>, typename M3=std::enable_if_t> > -void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept +void HostDeviceParallelFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { amrex::ignore_unused(MT); ParallelFor(box1,ncomp1,std::forward(f1), @@ -704,99 +878,99 @@ void HostDeviceFor (T n, L&& f) noexcept For(n,std::forward(f)); } -template -void HostDeviceFor (Box const& box, L&& f) noexcept +template +void HostDeviceFor (BoxND const& box, L&& f) noexcept { For(box,std::forward(f)); } -template -void HostDeviceFor (Box const& box, L&& f) noexcept +template +void HostDeviceFor (BoxND const& box, L&& f) noexcept { amrex::ignore_unused(MT); For(box,std::forward(f)); } -template > > -void HostDeviceFor (Box const& box, T ncomp, L&& f) noexcept +template > > +void HostDeviceFor (BoxND const& box, T ncomp, L&& f) noexcept { For(box,ncomp,std::forward(f)); } -template > > -void HostDeviceFor (Box const& box, T ncomp, L&& f) noexcept +template > > +void HostDeviceFor (BoxND const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); For(box,ncomp,std::forward(f)); } -template -void HostDeviceFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void HostDeviceFor (BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { For(box1,box2,std::forward(f1),std::forward(f2)); } -template -void HostDeviceFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void HostDeviceFor (BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { amrex::ignore_unused(MT); For(box1,box2,std::forward(f1),std::forward(f2)); } -template -void HostDeviceFor (Box const& box1, Box const& box2, Box const& box3, +template +void HostDeviceFor (BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { For(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } -template -void HostDeviceFor (Box const& box1, Box const& box2, Box const& box3, +template +void HostDeviceFor (BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { amrex::ignore_unused(MT); For(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } -template >, typename M2=std::enable_if_t> > -void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept +void HostDeviceFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template >, typename M2=std::enable_if_t> > -void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept +void HostDeviceFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { amrex::ignore_unused(MT); For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template >, typename M2=std::enable_if_t>, typename M3=std::enable_if_t> > -void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept +void HostDeviceFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { For(box1,ncomp1,std::forward(f1), 
box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } -template >, typename M2=std::enable_if_t>, typename M3=std::enable_if_t> > -void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept +void HostDeviceFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { amrex::ignore_unused(MT); For(box1,ncomp1,std::forward(f1), @@ -817,105 +991,105 @@ void HostDeviceParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept ParallelFor(n,std::forward(f)); } -template -void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, BoxND const& box, L&& f) noexcept { ParallelFor(box,std::forward(f)); } -template -void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, BoxND const& box, L&& f) noexcept { amrex::ignore_unused(MT); ParallelFor(box,std::forward(f)); } -template > > -void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +template > > +void HostDeviceParallelFor (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L&& f) noexcept { ParallelFor(box,ncomp,std::forward(f)); } -template > > -void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +template > > +void HostDeviceParallelFor (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); ParallelFor(box,ncomp,std::forward(f)); } -template -void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); } -template -void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { amrex::ignore_unused(MT); ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); } -template +template void HostDeviceParallelFor (Gpu::KernelInfo const&, - Box const& box1, Box const& box2, Box const& box3, + BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } -template +template void HostDeviceParallelFor (Gpu::KernelInfo const&, - Box const& box1, Box const& box2, Box const& box3, + BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { amrex::ignore_unused(MT); ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } -template >, typename M2=std::enable_if_t> > void HostDeviceParallelFor (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { ParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template >, typename M2=std::enable_if_t> > void HostDeviceParallelFor (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 
ncomp2, L2&& f2) noexcept { amrex::ignore_unused(MT); ParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template >, typename M2=std::enable_if_t>, typename M3=std::enable_if_t> > void HostDeviceParallelFor (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { ParallelFor(box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } -template >, typename M2=std::enable_if_t>, typename M3=std::enable_if_t> > void HostDeviceParallelFor (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { amrex::ignore_unused(MT); ParallelFor(box1,ncomp1,std::forward(f1), @@ -936,105 +1110,105 @@ void HostDeviceFor (Gpu::KernelInfo const&, T n, L&& f) noexcept For(n,std::forward(f)); } -template -void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +template +void HostDeviceFor (Gpu::KernelInfo const&, BoxND const& box, L&& f) noexcept { For(box,std::forward(f)); } -template -void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +template +void HostDeviceFor (Gpu::KernelInfo const&, BoxND const& box, L&& f) noexcept { amrex::ignore_unused(MT); For(box,std::forward(f)); } -template > > -void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +template > > +void HostDeviceFor (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L&& f) noexcept { For(box,ncomp,std::forward(f)); } -template > > -void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +template > > +void HostDeviceFor (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); For(box,ncomp,std::forward(f)); } -template -void HostDeviceFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void HostDeviceFor (Gpu::KernelInfo const&, BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { For(box1,box2,std::forward(f1),std::forward(f2)); } -template -void HostDeviceFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void HostDeviceFor (Gpu::KernelInfo const&, BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { amrex::ignore_unused(MT); For(box1,box2,std::forward(f1),std::forward(f2)); } -template +template void HostDeviceFor (Gpu::KernelInfo const&, - Box const& box1, Box const& box2, Box const& box3, + BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { For(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } -template +template void HostDeviceFor (Gpu::KernelInfo const&, - Box const& box1, Box const& box2, Box const& box3, + BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { amrex::ignore_unused(MT); For(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } -template >, typename M2=std::enable_if_t> > void HostDeviceFor (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept + BoxND const& box1, T1 ncomp1, L1&& 
f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template >, typename M2=std::enable_if_t> > void HostDeviceFor (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { amrex::ignore_unused(MT); For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template >, typename M2=std::enable_if_t>, typename M3=std::enable_if_t> > void HostDeviceFor (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { For(box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } -template >, typename M2=std::enable_if_t>, typename M3=std::enable_if_t> > void HostDeviceFor (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { amrex::ignore_unused(MT); For(box1,ncomp1,std::forward(f1), @@ -1051,31 +1225,81 @@ void ParallelForRNG (T n, L const& f) noexcept } } -template +namespace detail { + +template +AMREX_FORCE_INLINE +void ParallelForRNG_impND (L const& f, IntVectND const lo, IntVectND const hi, IntVectND iv) noexcept +{ + if constexpr (idim == 1) { + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_engine(f,iv,RandomEngine{}); + } + } else if constexpr (idim == 2) { + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_engine(f,iv,RandomEngine{}); + }} + } else if constexpr (idim == 3) { + for (int i2 = lo[2], h2 = hi[2]; i2 <= h2; ++i2) { iv[2] = i2; + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_engine(f,iv,RandomEngine{}); + }}} + } else { + for (int id = lo[idim-1], hd = hi[idim-1]; id <= hd; ++id) { iv[idim-1] = id; + ParallelForRNG_impND(f, lo, hi, iv); + } + } +} + +template +AMREX_FORCE_INLINE +void ParallelForRNG_impND (L const& f, IntVectND const lo, IntVectND const hi, IntVectND iv, T n) noexcept +{ + if constexpr (idim == 1) { + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp_engine(f,iv,n,RandomEngine{}); + } + } else if constexpr (idim == 2) { + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp_engine(f,iv,n,RandomEngine{}); + }} + } else if constexpr (idim == 3) { + for (int i2 = lo[2], h2 = hi[2]; i2 <= h2; ++i2) { iv[2] = i2; + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp_engine(f,iv,n,RandomEngine{}); + }}} + } else { + for (int id = lo[idim-1], hd = hi[idim-1]; id <= hd; ++id) { iv[idim-1] = id; + ParallelForRNG_impND(f, lo, hi, iv, n); + } + } +} + +} + +template AMREX_ATTRIBUTE_FLATTEN_FOR -void ParallelForRNG (Box const& box, L const& f) noexcept +void ParallelForRNG (BoxND const& box, L const& f) noexcept { - 
const auto lo = amrex::lbound(box); - const auto hi = amrex::ubound(box); - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - for (int i = lo.x; i <= hi.x; ++i) { - f(i,j,k,RandomEngine{}); - }}} + const auto lo = amrex::lbound_iv(box); + const auto hi = amrex::ubound_iv(box); + IntVectND iv; + detail::ParallelForRNG_impND(f, lo, hi, iv); } -template > > +template > > AMREX_ATTRIBUTE_FLATTEN_FOR -void ParallelForRNG (Box const& box, T ncomp, L const& f) noexcept +void ParallelForRNG (BoxND const& box, T ncomp, L const& f) noexcept { - const auto lo = amrex::lbound(box); - const auto hi = amrex::ubound(box); + const auto lo = amrex::lbound_iv(box); + const auto hi = amrex::ubound_iv(box); + IntVectND iv; for (T n = 0; n < ncomp; ++n) { - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - for (int i = lo.x; i <= hi.x; ++i) { - f(i,j,k,n,RandomEngine{}); - }}} + detail::ParallelForRNG_impND(f, lo, hi, iv, n); } } diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index dbc36e47383..7955410f8ba 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -4,6 +4,146 @@ namespace amrex { +namespace detail { + + // call_f_scalar_handler + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const&) + noexcept -> decltype(f(0)) + { + f(i); + } + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const& handler) + noexcept -> decltype(f(0,Gpu::Handler{})) + { + f(i, handler); + } + + // call_f_intvect_inner + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect_inner (std::index_sequence, F const& f, IntVectND<1> iv, Args...args) + noexcept -> decltype(f(0, 0, 0, args...)) + { + f(iv[0], 0, 0, args...); + } + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect_inner (std::index_sequence, F const& f, IntVectND<2> iv, Args...args) + noexcept -> decltype(f(0, 0, 0, args...)) + { + f(iv[0], iv[1], 0, args...); + } + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect_inner (std::index_sequence, F const& f, IntVectND iv, Args...args) + noexcept -> decltype(f(iv, args...)) + { + f(iv, args...); + } + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect_inner (std::index_sequence, F const& f, IntVectND iv, Args...args) + noexcept -> decltype(f(iv[Ns]..., args...)) + { + f(iv[Ns]..., args...); + } + + // call_f_intvect + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect (F const& f, IntVectND iv) + noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence(), f, iv)) + { + call_f_intvect_inner(std::make_index_sequence(), f, iv); + } + + // call_f_intvect_engine + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect_engine (F const& f, IntVectND iv, RandomEngine engine) + noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence(), f, iv, engine)) + { + call_f_intvect_inner(std::make_index_sequence(), f, iv, engine); + } + + // call_f_intvect_handler + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect_handler (F const& f, IntVectND iv, Gpu::Handler const&) + noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence(), f, iv)) + { + call_f_intvect_inner(std::make_index_sequence(), f, iv); + } + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect_handler (F const& f, IntVectND iv, Gpu::Handler const& 
handler) + noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence(), f, iv, Gpu::Handler{})) + { + call_f_intvect_inner(std::make_index_sequence(), f, iv, handler); + } + + // call_f_intvect_ncomp + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect_ncomp (F const& f, IntVectND iv, T ncomp) + noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence(), f, iv, 0)) + { + for (T n = 0; n < ncomp; ++n) { + call_f_intvect_inner(std::make_index_sequence(), f, iv, n); + } + } + + // call_f_intvect_ncomp_engine + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect_ncomp_engine (F const& f, IntVectND iv, T ncomp, RandomEngine engine) + noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence(), f, iv, 0, engine)) + { + for (T n = 0; n < ncomp; ++n) { + call_f_intvect_inner(std::make_index_sequence(), f, iv, n, engine); + } + } + + // call_f_intvect_ncomp_handler + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect_ncomp_handler (F const& f, IntVectND iv, T ncomp, Gpu::Handler const&) + noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence(), f, iv, 0)) + { + for (T n = 0; n < ncomp; ++n) { + call_f_intvect_inner(std::make_index_sequence(), f, iv, n); + } + } + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect_ncomp_handler (F const& f, IntVectND iv, T ncomp, Gpu::Handler const& handler) + noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence(), f, iv, 0, Gpu::Handler{})) + { + for (T n = 0; n < ncomp; ++n) { + call_f_intvect_inner(std::make_index_sequence(), f, iv, n, handler); + } + } + +} + #ifdef AMREX_USE_SYCL template @@ -137,56 +277,6 @@ void launch (T const& n, L const& f) noexcept } } -namespace detail { - template - AMREX_GPU_DEVICE - auto call_f (F const& f, N i, Gpu::Handler const&) - noexcept -> decltype(f(0)) - { - f(i); - } - - template - AMREX_GPU_DEVICE - auto call_f (F const& f, N i, Gpu::Handler const& handler) - noexcept -> decltype(f(0,Gpu::Handler{})) - { - f(i,handler); - } - - template - AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, Gpu::Handler const&) - noexcept -> decltype(f(0,0,0)) - { - f(i,j,k); - } - - template - AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, Gpu::Handler const& handler) - noexcept -> decltype(f(0,0,0,Gpu::Handler{})) - { - f(i,j,k,handler); - } - - template - AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, T ncomp, Gpu::Handler const&) - noexcept -> decltype(f(0,0,0,0)) - { - for (T n = 0; n < ncomp; ++n) f(i,j,k,n); - } - - template - AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, T ncomp, Gpu::Handler const& handler) - noexcept -> decltype(f(0,0,0,0,Gpu::Handler{})) - { - for (T n = 0; n < ncomp; ++n) f(i,j,k,n,handler); - } -} - template ::value> > void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept { @@ -210,7 +300,7 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept i < std::size_t(n); i += stride) { int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0), item.get_local_range(0)); - detail::call_f(f, T(i), Gpu::Handler{&item, shared_data.get_multi_ptr().get(), + detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } }); @@ -225,7 +315,7 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept { for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0); i < std::size_t(n); i += 
stride) { - detail::call_f(f, T(i), Gpu::Handler{&item}); + detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item}); } }); }); @@ -235,11 +325,11 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept } } -template -void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L const& f) noexcept +template +void ParallelFor (Gpu::KernelInfo const& info, BoxND const& box, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - const BoxIndexer indexer(box); + const BoxIndexerND indexer(box); const auto ec = Gpu::makeExecutionConfig(box.numPts()); const auto nthreads_per_block = ec.numThreads.x; const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; @@ -257,10 +347,10 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L const& f) noexc { for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < indexer.numPts(); icell += stride) { - auto [i, j, k] = indexer(icell); + auto iv = indexer.intVect(icell); int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)), std::uint64_t(item.get_local_range(0))); - detail::call_f(f, i, j, k, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), + detail::call_f_intvect_handler(f, iv, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } }); @@ -275,8 +365,8 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L const& f) noexc { for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < indexer.numPts(); icell += stride) { - auto [i, j, k] = indexer(icell); - detail::call_f(f,i,j,k,Gpu::Handler{&item}); + auto iv = indexer.intVect(icell); + detail::call_f_intvect_handler(f,iv,Gpu::Handler{&item}); } }); }); @@ -286,11 +376,11 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L const& f) noexc } } -template ::value> > -void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L const& f) noexcept +template ::value> > +void ParallelFor (Gpu::KernelInfo const& info, BoxND const& box, T ncomp, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - const BoxIndexer indexer(box); + const BoxIndexerND indexer(box); const auto ec = Gpu::makeExecutionConfig(box.numPts()); const auto nthreads_per_block = ec.numThreads.x; const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; @@ -308,10 +398,10 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L const& { for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < indexer.numPts(); icell += stride) { - auto [i, j, k] = indexer(icell); + auto iv = indexer.intVect(icell); int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)), std::uint64_t(item.get_local_range(0))); - detail::call_f(f, i, j, k, ncomp, + detail::call_f_intvect_ncomp_handler(f, iv, ncomp, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } @@ -327,8 +417,8 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L const& { for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < indexer.numPts(); icell += stride) { - auto [i, j, k] = indexer(icell); - detail::call_f(f,i,j,k,ncomp,Gpu::Handler{&item}); + auto iv = indexer.intVect(icell); + detail::call_f_intvect_ncomp_handler(f,iv,ncomp,Gpu::Handler{&item}); } }); }); @@ -371,11 +461,11 @@ void ParallelForRNG (T n, L const& f) noexcept } } -template -void 
ParallelForRNG (Box const& box, L const& f) noexcept +template +void ParallelForRNG (BoxND const& box, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - const BoxIndexer indexer(box); + const BoxIndexerND indexer(box); const auto ec = Gpu::ExecutionConfig(box.numPts()); const auto nthreads_per_block = ec.numThreads.x; const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); @@ -395,8 +485,8 @@ void ParallelForRNG (Box const& box, L const& f) noexcept RandomEngine rand_eng{&engine}; for (std::uint64_t icell = tid, stride = item.get_global_range(0); icell < indexer.numPts(); icell += stride) { - auto [i, j, k] = indexer(icell); - f(i,j,k,rand_eng); + auto iv = indexer.intVect(icell); + detail::call_f_intvect_engine(f,iv,rand_eng); } engine_acc.store(engine, tid); }); @@ -407,11 +497,11 @@ void ParallelForRNG (Box const& box, L const& f) noexcept } } -template ::value> > -void ParallelForRNG (Box const& box, T ncomp, L const& f) noexcept +template ::value> > +void ParallelForRNG (BoxND const& box, T ncomp, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - const BoxIndexer indexer(box); + const BoxIndexerND indexer(box); const auto ec = Gpu::ExecutionConfig(box.numPts()); const auto nthreads_per_block = ec.numThreads.x; const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); @@ -431,10 +521,8 @@ void ParallelForRNG (Box const& box, T ncomp, L const& f) noexcept RandomEngine rand_eng{&engine}; for (std::uint64_t icell = tid, stride = item.get_global_range(0); icell < indexer.numPts(); icell += stride) { - auto [i, j, k] = indexer(icell); - for (T n = 0; n < ncomp; ++n) { - f(i,j,k,n,rand_eng); - } + auto iv = indexer.intVect(icell); + detail::call_f_intvect_ncomp_engine(f,iv,ncomp,rand_eng); } engine_acc.store(engine, tid); }); @@ -445,12 +533,12 @@ void ParallelForRNG (Box const& box, T ncomp, L const& f) noexcept } } -template -void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void ParallelFor (Gpu::KernelInfo const& /*info*/, BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - const BoxIndexer indexer1(box1); - const BoxIndexer indexer2(box2); + const BoxIndexerND indexer1(box1); + const BoxIndexerND indexer2(box2); const auto ec = Gpu::makeExecutionConfig(std::max(box1.numPts(), box2.numPts())); const auto nthreads_per_block = ec.numThreads.x; const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; @@ -467,12 +555,12 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& b for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { if (icell < indexer1.numPts()) { - auto [i, j, k] = indexer1(icell); - f1(i,j,k); + auto iv = indexer1.intVect(icell); + detail::call_f_intvect(f1,iv); } if (icell < indexer2.numPts()) { - auto [i, j, k] = indexer2(icell); - f2(i,j,k); + auto iv = indexer2.intVect(icell); + detail::call_f_intvect(f2,iv); } } }); @@ -482,15 +570,15 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& b } } -template +template void ParallelFor (Gpu::KernelInfo const& /*info*/, - Box const& box1, Box const& box2, Box const& box3, + BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { if 
(amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - const BoxIndexer indexer1(box1); - const BoxIndexer indexer2(box2); - const BoxIndexer indexer3(box3); + const BoxIndexerND indexer1(box1); + const BoxIndexerND indexer2(box2); + const BoxIndexerND indexer3(box3); const auto ec = Gpu::makeExecutionConfig(std::max({box1.numPts(),box2.numPts(),box3.numPts()})); const auto nthreads_per_block = ec.numThreads.x; const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; @@ -507,16 +595,16 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { if (icell < indexer1.numPts()) { - auto [i, j, k] = indexer1(icell); - f1(i,j,k); + auto iv = indexer1.intVect(icell); + detail::call_f_intvect(f1,iv); } if (icell < indexer2.numPts()) { - auto [i, j, k] = indexer2(icell); - f2(i,j,k); + auto iv = indexer2.intVect(icell); + detail::call_f_intvect(f2,iv); } if (icell < indexer3.numPts()) { - auto [i, j, k] = indexer3(icell); - f3(i,j,k); + auto iv = indexer3.intVect(icell); + detail::call_f_intvect(f3,iv); } } }); @@ -526,16 +614,16 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, } } -template ::value>, typename M2=std::enable_if_t::value> > void ParallelFor (Gpu::KernelInfo const& /*info*/, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - const BoxIndexer indexer1(box1); - const BoxIndexer indexer2(box2); + const BoxIndexerND indexer1(box1); + const BoxIndexerND indexer2(box2); const auto ec = Gpu::makeExecutionConfig(std::max(box1.numPts(),box2.numPts())); const auto nthreads_per_block = ec.numThreads.x; const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; @@ -552,16 +640,12 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { if (icell < indexer1.numPts()) { - auto [i, j, k] = indexer1(icell); - for (T1 n = 0; n < ncomp1; ++n) { - f1(i,j,k,n); - } + auto iv = indexer1.intVect(icell); + detail::call_f_intvect_ncomp(f1,iv,ncomp1); } if (icell < indexer2.numPts()) { - auto [i, j, k] = indexer2(icell); - for (T2 n = 0; n < ncomp2; ++n) { - f2(i,j,k,n); - } + auto iv = indexer2.intVect(icell); + detail::call_f_intvect_ncomp(f2,iv,ncomp2); } } }); @@ -571,19 +655,19 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, } } -template ::value>, typename M2=std::enable_if_t::value>, typename M3=std::enable_if_t::value> > void ParallelFor (Gpu::KernelInfo const& /*info*/, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - const BoxIndexer indexer1(box1); - const BoxIndexer indexer2(box2); - const BoxIndexer indexer3(box3); + const BoxIndexerND indexer1(box1); + const BoxIndexerND indexer2(box2); + const BoxIndexerND indexer3(box3); const auto ec = Gpu::makeExecutionConfig(std::max({box1.numPts(),box2.numPts(),box3.numPts()})); const auto nthreads_per_block = ec.numThreads.x; const auto nthreads_total = 
std::size_t(nthreads_per_block) * ec.numBlocks.x; @@ -600,22 +684,16 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { if (icell < indexer1.numPts()) { - auto [i, j, k] = indexer1(icell); - for (T1 n = 0; n < ncomp1; ++n) { - f1(i,j,k,n); - } + auto iv = indexer1.intVect(icell); + detail::call_f_intvect_ncomp(f1,iv,ncomp1); } if (icell < indexer2.numPts()) { - auto [i, j, k] = indexer2(icell); - for (T2 n = 0; n < ncomp2; ++n) { - f2(i,j,k,n); - } + auto iv = indexer2.intVect(icell); + detail::call_f_intvect_ncomp(f2,iv,ncomp2); } if (icell < indexer3.numPts()) { - auto [i, j, k] = indexer3(icell); - for (T3 n = 0; n < ncomp3; ++n) { - f3(i,j,k,n); - } + auto iv = indexer3.intVect(icell); + detail::call_f_intvect_ncomp(f3,iv,ncomp3); } } }); @@ -683,56 +761,6 @@ void launch (T const& n, L const& f) noexcept AMREX_GPU_ERROR_CHECK(); } -namespace detail { - template - AMREX_GPU_DEVICE - auto call_f (F const& f, N i, std::uint64_t /*nleft*/) - noexcept -> decltype(f(0)) - { - f(i); - } - - template - AMREX_GPU_DEVICE - auto call_f (F const& f, N i, std::uint64_t nleft) - noexcept -> decltype(f(0,Gpu::Handler{})) - { - f(i,Gpu::Handler(amrex::min(nleft,(std::uint64_t)blockDim.x))); - } - - template - AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, std::uint64_t /*nleft*/) - noexcept -> decltype(f(0,0,0)) - { - f(i,j,k); - } - - template - AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, std::uint64_t nleft) - noexcept -> decltype(f(0,0,0,Gpu::Handler{})) - { - f(i,j,k,Gpu::Handler(amrex::min(nleft,(std::uint64_t)blockDim.x))); - } - - template - AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, T ncomp, std::uint64_t /*nleft*/) - noexcept -> decltype(f(0,0,0,0)) - { - for (T n = 0; n < ncomp; ++n) f(i,j,k,n); - } - - template - AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, T ncomp, std::uint64_t nleft) - noexcept -> decltype(f(0,0,0,0,Gpu::Handler{})) - { - for (T n = 0; n < ncomp; ++n) f(i,j,k,n,Gpu::Handler(amrex::min(nleft,(std::uint64_t)blockDim.x))); - } -} - template ::value> > std::enable_if_t::value> ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept @@ -743,44 +771,50 @@ ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept [=] AMREX_GPU_DEVICE () noexcept { for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x; i < Long(n); i += stride) { - detail::call_f(f, T(i), (Long(n)-i+(Long)threadIdx.x)); + detail::call_f_scalar_handler(f, T(i), + Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x), + (std::uint64_t)blockDim.x))); } }); AMREX_GPU_ERROR_CHECK(); } -template +template std::enable_if_t::value> -ParallelFor (Gpu::KernelInfo const&, Box const& box, L const& f) noexcept +ParallelFor (Gpu::KernelInfo const&, BoxND const& box, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - const BoxIndexer indexer(box); + const BoxIndexerND indexer(box); const auto ec = Gpu::makeExecutionConfig(box.numPts()); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) { - auto [i, j, k] = indexer(icell); - detail::call_f(f, i, j, k, (indexer.numPts()-icell+(std::uint64_t)threadIdx.x)); + auto iv = 
indexer.intVect(icell); + detail::call_f_intvect_handler(f, iv, + Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x), + (std::uint64_t)blockDim.x))); } }); AMREX_GPU_ERROR_CHECK(); } -template ::value> > +template ::value> > std::enable_if_t::value> -ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L const& f) noexcept +ParallelFor (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - const BoxIndexer indexer(box); + const BoxIndexerND indexer(box); const auto ec = Gpu::makeExecutionConfig(box.numPts()); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) { - auto [i, j, k] = indexer(icell); - detail::call_f(f, i, j, k, ncomp, (indexer.numPts()-icell+(std::uint64_t)threadIdx.x)); + auto iv = indexer.intVect(icell); + detail::call_f_intvect_ncomp_handler(f, iv, ncomp, + Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x), + (std::uint64_t)blockDim.x))); } }); AMREX_GPU_ERROR_CHECK(); @@ -807,13 +841,13 @@ ParallelForRNG (T n, L const& f) noexcept AMREX_GPU_ERROR_CHECK(); } -template +template std::enable_if_t::value> -ParallelForRNG (Box const& box, L const& f) noexcept +ParallelForRNG (BoxND const& box, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } randState_t* rand_state = getRandState(); - const BoxIndexer indexer(box); + const BoxIndexerND indexer(box); const auto ec = Gpu::ExecutionConfig(box.numPts()); AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), @@ -822,21 +856,21 @@ ParallelForRNG (Box const& box, L const& f) noexcept auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) { - auto [i, j, k] = indexer(icell); - f(i,j,k,engine); + auto iv = indexer.intVect(icell); + detail::call_f_intvect_engine(f, iv, engine); } }); Gpu::streamSynchronize(); // To avoid multiple streams using RNG AMREX_GPU_ERROR_CHECK(); } -template ::value> > +template ::value> > std::enable_if_t::value> -ParallelForRNG (Box const& box, T ncomp, L const& f) noexcept +ParallelForRNG (BoxND const& box, T ncomp, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } randState_t* rand_state = getRandState(); - const BoxIndexer indexer(box); + const BoxIndexerND indexer(box); const auto ec = Gpu::ExecutionConfig(box.numPts()); AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), @@ -845,24 +879,22 @@ ParallelForRNG (Box const& box, T ncomp, L const& f) noexcept auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) { - auto [i, j, k] = indexer(icell); - for (T n = 0; n < ncomp; ++n) { - f(i,j,k,n,engine); - } + auto iv = indexer.intVect(icell); + detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine); } }); Gpu::streamSynchronize(); // To avoid multiple streams using RNG AMREX_GPU_ERROR_CHECK(); } -template +template std::enable_if_t::value && MaybeDeviceRunnable::value> ParallelFor (Gpu::KernelInfo const&, 
- Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept + BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - const BoxIndexer indexer1(box1); - const BoxIndexer indexer2(box2); + const BoxIndexerND indexer1(box1); + const BoxIndexerND indexer2(box2); const auto ec = Gpu::makeExecutionConfig(std::max(box1.numPts(),box2.numPts())); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { @@ -870,28 +902,28 @@ ParallelFor (Gpu::KernelInfo const&, for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { if (icell < indexer1.numPts()) { - auto [i, j, k] = indexer1(icell); - f1(i,j,k); + auto iv = indexer1.intVect(icell); + detail::call_f_intvect(f1, iv); } if (icell < indexer2.numPts()) { - auto [i, j, k] = indexer2(icell); - f2(i,j,k); + auto iv = indexer2.intVect(icell); + detail::call_f_intvect(f2, iv); } } }); AMREX_GPU_ERROR_CHECK(); } -template +template std::enable_if_t::value && MaybeDeviceRunnable::value && MaybeDeviceRunnable::value> ParallelFor (Gpu::KernelInfo const&, - Box const& box1, Box const& box2, Box const& box3, + BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - const BoxIndexer indexer1(box1); - const BoxIndexer indexer2(box2); - const BoxIndexer indexer3(box3); + const BoxIndexerND indexer1(box1); + const BoxIndexerND indexer2(box2); + const BoxIndexerND indexer3(box3); const auto ec = Gpu::makeExecutionConfig(std::max({box1.numPts(),box2.numPts(),box3.numPts()})); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { @@ -899,33 +931,33 @@ ParallelFor (Gpu::KernelInfo const&, for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { if (icell < indexer1.numPts()) { - auto [i, j, k] = indexer1(icell); - f1(i,j,k); + auto iv = indexer1.intVect(icell); + detail::call_f_intvect(f1, iv); } if (icell < indexer2.numPts()) { - auto [i, j, k] = indexer2(icell); - f2(i,j,k); + auto iv = indexer2.intVect(icell); + detail::call_f_intvect(f2, iv); } if (icell < indexer3.numPts()) { - auto [i, j, k] = indexer3(icell); - f3(i,j,k); + auto iv = indexer3.intVect(icell); + detail::call_f_intvect(f3, iv); } } }); AMREX_GPU_ERROR_CHECK(); } -template ::value>, typename M2=std::enable_if_t::value> > std::enable_if_t::value && MaybeDeviceRunnable::value> ParallelFor (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - const BoxIndexer indexer1(box1); - const BoxIndexer indexer2(box2); + const BoxIndexerND indexer1(box1); + const BoxIndexerND indexer2(box2); const auto ec = Gpu::makeExecutionConfig(std::max(box1.numPts(),box2.numPts())); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { @@ -933,36 +965,32 @@ ParallelFor (Gpu::KernelInfo const&, for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { if 
(icell < indexer1.numPts()) { - auto [i, j, k] = indexer1(icell); - for (T1 n = 0; n < ncomp1; ++n) { - f1(i,j,k,n); - } + auto iv = indexer1.intVect(icell); + detail::call_f_intvect_ncomp(f1, iv, ncomp1); } if (icell < indexer2.numPts()) { - auto [i, j, k] = indexer2(icell); - for (T2 n = 0; n < ncomp2; ++n) { - f2(i,j,k,n); - } + auto iv = indexer2.intVect(icell); + detail::call_f_intvect_ncomp(f2, iv, ncomp2); } } }); AMREX_GPU_ERROR_CHECK(); } -template ::value>, typename M2=std::enable_if_t::value>, typename M3=std::enable_if_t::value> > std::enable_if_t::value && MaybeDeviceRunnable::value && MaybeDeviceRunnable::value> ParallelFor (Gpu::KernelInfo const&, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - const BoxIndexer indexer1(box1); - const BoxIndexer indexer2(box2); - const BoxIndexer indexer3(box3); + const BoxIndexerND indexer1(box1); + const BoxIndexerND indexer2(box2); + const BoxIndexerND indexer3(box3); const auto ec = Gpu::makeExecutionConfig(std::max({box1.numPts(),box2.numPts(),box3.numPts()})); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { @@ -970,22 +998,16 @@ ParallelFor (Gpu::KernelInfo const&, for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { if (icell < indexer1.numPts()) { - auto [i, j, k] = indexer1(icell); - for (T1 n = 0; n < ncomp1; ++n) { - f1(i,j,k,n); - } + auto iv = indexer1.intVect(icell); + detail::call_f_intvect_ncomp(f1, iv, ncomp1); } if (icell < indexer2.numPts()) { - auto [i, j, k] = indexer2(icell); - for (T2 n = 0; n < ncomp2; ++n) { - f2(i,j,k,n); - } + auto iv = indexer2.intVect(icell); + detail::call_f_intvect_ncomp(f2, iv, ncomp2); } if (icell < indexer3.numPts()) { - auto [i, j, k] = indexer3(icell); - for (T3 n = 0; n < ncomp3; ++n) { - f3(i,j,k,n); - } + auto iv = indexer3.intVect(icell); + detail::call_f_intvect_ncomp(f3, iv, ncomp3); } } }); @@ -1013,60 +1035,60 @@ ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept ParallelFor(info, n, std::forward(f)); } -template +template std::enable_if_t::value> -ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const& info, BoxND const& box, L&& f) noexcept { ParallelFor(info, box, std::forward(f)); } -template ::value> > +template ::value> > std::enable_if_t::value> -ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const& info, BoxND const& box, T ncomp, L&& f) noexcept { ParallelFor(info, box, ncomp, std::forward(f)); } -template +template std::enable_if_t::value && MaybeDeviceRunnable::value> ParallelFor (Gpu::KernelInfo const& info, - Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept + BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(info, box1, box2, std::forward(f1), std::forward(f2)); } -template +template std::enable_if_t::value && MaybeDeviceRunnable::value && MaybeDeviceRunnable::value> ParallelFor (Gpu::KernelInfo const& info, - Box const& box1, Box const& box2, Box const& box3, + BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& 
f3) noexcept { ParallelFor(info, box1, box2, box3, std::forward(f1), std::forward(f2), std::forward(f3)); } -template ::value>, typename M2=std::enable_if_t::value> > std::enable_if_t::value && MaybeDeviceRunnable::value> ParallelFor (Gpu::KernelInfo const& info, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { ParallelFor(info, box1, ncomp1, std::forward(f1), box2, ncomp2, std::forward(f2)); } -template ::value>, typename M2=std::enable_if_t::value>, typename M3=std::enable_if_t::value> > std::enable_if_t::value && MaybeDeviceRunnable::value && MaybeDeviceRunnable::value> ParallelFor (Gpu::KernelInfo const& info, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { ParallelFor(info, box1, ncomp1, std::forward(f1), box2, ncomp2, std::forward(f2), @@ -1085,88 +1107,88 @@ void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept ParallelFor(info, n,std::forward(f)); } -template -void For (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +template +void For (Gpu::KernelInfo const& info, BoxND const& box, L&& f) noexcept { ParallelFor(info, box,std::forward(f)); } -template -void For (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +template +void For (Gpu::KernelInfo const& info, BoxND const& box, L&& f) noexcept { ParallelFor(info, box,std::forward(f)); } -template ::value> > -void For (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +template ::value> > +void For (Gpu::KernelInfo const& info, BoxND const& box, T ncomp, L&& f) noexcept { ParallelFor(info,box,ncomp,std::forward(f)); } -template ::value> > -void For (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +template ::value> > +void For (Gpu::KernelInfo const& info, BoxND const& box, T ncomp, L&& f) noexcept { ParallelFor(info,box,ncomp,std::forward(f)); } -template +template void For (Gpu::KernelInfo const& info, - Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept + BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); } -template +template void For (Gpu::KernelInfo const& info, - Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept + BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); } -template +template void For (Gpu::KernelInfo const& info, - Box const& box1, Box const& box2, Box const& box3, + BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { ParallelFor(info,box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } -template +template void For (Gpu::KernelInfo const& info, - Box const& box1, Box const& box2, Box const& box3, + BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { ParallelFor(info,box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } -template ::value>, typename M2=std::enable_if_t::value> > void For (Gpu::KernelInfo const& info, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { 
ParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template ::value>, typename M2=std::enable_if_t::value> > void For (Gpu::KernelInfo const& info, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { ParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template ::value>, typename M2=std::enable_if_t::value>, typename M3=std::enable_if_t::value> > void For (Gpu::KernelInfo const& info, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { ParallelFor(info, box1,ncomp1,std::forward(f1), @@ -1174,14 +1196,14 @@ void For (Gpu::KernelInfo const& info, box3,ncomp3,std::forward(f3)); } -template ::value>, typename M2=std::enable_if_t::value>, typename M3=std::enable_if_t::value> > void For (Gpu::KernelInfo const& info, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { ParallelFor(info, box1,ncomp1,std::forward(f1), @@ -1201,81 +1223,81 @@ void ParallelFor (T n, L&& f) noexcept ParallelFor(Gpu::KernelInfo{}, n, std::forward(f)); } -template -void ParallelFor (Box const& box, L&& f) noexcept +template +void ParallelFor (BoxND const& box, L&& f) noexcept { ParallelFor(Gpu::KernelInfo{}, box, std::forward(f)); } -template -void ParallelFor (Box const& box, L&& f) noexcept +template +void ParallelFor (BoxND const& box, L&& f) noexcept { ParallelFor(Gpu::KernelInfo{}, box, std::forward(f)); } -template ::value> > -void ParallelFor (Box const& box, T ncomp, L&& f) noexcept +template ::value> > +void ParallelFor (BoxND const& box, T ncomp, L&& f) noexcept { ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); } -template ::value> > -void ParallelFor (Box const& box, T ncomp, L&& f) noexcept +template ::value> > +void ParallelFor (BoxND const& box, T ncomp, L&& f) noexcept { ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); } -template -void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void ParallelFor (BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); } -template -void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void ParallelFor (BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); } -template -void ParallelFor (Box const& box1, Box const& box2, Box const& box3, +template +void ParallelFor (BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } -template -void ParallelFor (Box const& box1, Box const& box2, Box const& box3, +template +void ParallelFor (BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } -template ::value>, typename M2=std::enable_if_t::value> > -void 
ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept +void ParallelFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { ParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template ::value>, typename M2=std::enable_if_t::value> > -void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept +void ParallelFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { ParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template ::value>, typename M2=std::enable_if_t::value>, typename M3=std::enable_if_t::value> > -void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept +void ParallelFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { ParallelFor(Gpu::KernelInfo{}, box1,ncomp1,std::forward(f1), @@ -1283,13 +1305,13 @@ void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, box3,ncomp3,std::forward(f3)); } -template ::value>, typename M2=std::enable_if_t::value>, typename M3=std::enable_if_t::value> > -void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept +void ParallelFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { ParallelFor(Gpu::KernelInfo{}, box1,ncomp1,std::forward(f1), @@ -1309,81 +1331,81 @@ void For (T n, L&& f) noexcept ParallelFor(Gpu::KernelInfo{}, n,std::forward(f)); } -template -void For (Box const& box, L&& f) noexcept +template +void For (BoxND const& box, L&& f) noexcept { ParallelFor(Gpu::KernelInfo{}, box,std::forward(f)); } -template -void For (Box const& box, L&& f) noexcept +template +void For (BoxND const& box, L&& f) noexcept { ParallelFor(Gpu::KernelInfo{}, box,std::forward(f)); } -template ::value> > -void For (Box const& box, T ncomp, L&& f) noexcept +template ::value> > +void For (BoxND const& box, T ncomp, L&& f) noexcept { ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); } -template ::value> > -void For (Box const& box, T ncomp, L&& f) noexcept +template ::value> > +void For (BoxND const& box, T ncomp, L&& f) noexcept { ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); } -template -void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void For (BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); } -template -void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void For (BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); } -template -void For (Box const& box1, Box const& box2, Box const& box3, +template +void For (BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } -template -void For (Box const& box1, Box const& box2, Box const& box3, +template +void For (BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { 
ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } -template ::value>, typename M2=std::enable_if_t::value> > -void For (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept +void For (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { ParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template ::value>, typename M2=std::enable_if_t::value> > -void For (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept +void For (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { ParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template ::value>, typename M2=std::enable_if_t::value>, typename M3=std::enable_if_t::value> > -void For (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept +void For (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { ParallelFor(Gpu::KernelInfo{}, box1,ncomp1,std::forward(f1), @@ -1391,13 +1413,13 @@ void For (Box const& box1, T1 ncomp1, L1&& f1, box3,ncomp3,std::forward(f3)); } -template ::value>, typename M2=std::enable_if_t::value>, typename M3=std::enable_if_t::value> > -void For (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept +void For (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { ParallelFor(Gpu::KernelInfo{}, box1,ncomp1,std::forward(f1), @@ -1451,9 +1473,9 @@ HostDeviceParallelFor (T n, L&& f) noexcept HostDeviceParallelFor(Gpu::KernelInfo{}, n, std::forward(f)); } -template +template std::enable_if_t::value> -HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND const& box, L&& f) noexcept { if (Gpu::inLaunchRegion()) { ParallelFor(info, box,std::forward(f)); @@ -1466,9 +1488,9 @@ HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexc } } -template +template std::enable_if_t::value> -HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND const& box, L&& f) noexcept { if (Gpu::inLaunchRegion()) { ParallelFor(info, box,std::forward(f)); @@ -1481,9 +1503,9 @@ HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexc } } -template ::value> > +template ::value> > std::enable_if_t::value> -HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND const& box, T ncomp, L&& f) noexcept { if (Gpu::inLaunchRegion()) { ParallelFor(info, box,ncomp,std::forward(f)); @@ -1496,9 +1518,9 @@ HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& } } -template ::value> > +template ::value> > std::enable_if_t::value> -HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND const& box, T ncomp, L&& f) noexcept { if (Gpu::inLaunchRegion()) { ParallelFor(info, box,ncomp,std::forward(f)); @@ -1511,10 +1533,10 @@ HostDeviceParallelFor (Gpu::KernelInfo const& info, Box 
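The overloads above generalize ParallelFor/For from the 3D Box to the dimension-templated BoxND, with detail::call_f_intvect_ncomp forwarding to the callable. A minimal usage sketch (assuming the standard AMReX headers, and that, as in the dispatch helpers, a callable may take either plain (i,j,k[,n]) indices for dim <= 3 or an IntVectND for general dim):

    #include <AMReX_Gpu.H>
    #include <AMReX_Array4.H>
    #include <AMReX_Box.H>

    void parallel_for_sketch (amrex::Box const& bx, amrex::Array4<amrex::Real> const& a)
    {
        // Classic 3D-style callable still works for the default BoxND<AMREX_SPACEDIM>.
        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
        {
            a(i,j,k) = 0.0;
        });

        // For a general BoxND<dim>, the callable can take an IntVectND<dim> instead.
        amrex::BoxND<2> bx2(amrex::IntVectND<2>(0,0), amrex::IntVectND<2>(15,15));
        amrex::ParallelFor(bx2, [=] AMREX_GPU_DEVICE (amrex::IntVectND<2> const& iv) noexcept
        {
            amrex::ignore_unused(iv);
        });
    }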
const& box, T ncomp, L&& } } -template +template std::enable_if_t::value && MaybeHostDeviceRunnable::value> HostDeviceParallelFor (Gpu::KernelInfo const& info, - Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept + BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { if (Gpu::inLaunchRegion()) { ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); @@ -1528,10 +1550,10 @@ HostDeviceParallelFor (Gpu::KernelInfo const& info, } } -template +template std::enable_if_t::value && MaybeHostDeviceRunnable::value> HostDeviceParallelFor (Gpu::KernelInfo const& info, - Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept + BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { if (Gpu::inLaunchRegion()) { ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); @@ -1545,10 +1567,10 @@ HostDeviceParallelFor (Gpu::KernelInfo const& info, } } -template +template std::enable_if_t::value && MaybeHostDeviceRunnable::value && MaybeHostDeviceRunnable::value> HostDeviceParallelFor (Gpu::KernelInfo const& info, - Box const& box1, Box const& box2, Box const& box3, + BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { if (Gpu::inLaunchRegion()) { @@ -1565,13 +1587,13 @@ HostDeviceParallelFor (Gpu::KernelInfo const& info, } } -template ::value>, typename M2=std::enable_if_t::value> > std::enable_if_t::value && MaybeHostDeviceRunnable::value> HostDeviceParallelFor (Gpu::KernelInfo const& info, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { if (Gpu::inLaunchRegion()) { ParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); @@ -1585,13 +1607,13 @@ HostDeviceParallelFor (Gpu::KernelInfo const& info, } } -template ::value>, typename M2=std::enable_if_t::value> > std::enable_if_t::value && MaybeHostDeviceRunnable::value> HostDeviceParallelFor (Gpu::KernelInfo const& info, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { if (Gpu::inLaunchRegion()) { ParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); @@ -1605,15 +1627,15 @@ HostDeviceParallelFor (Gpu::KernelInfo const& info, } } -template ::value>, typename M2=std::enable_if_t::value>, typename M3=std::enable_if_t::value> > std::enable_if_t::value && MaybeHostDeviceRunnable::value && MaybeHostDeviceRunnable::value> HostDeviceParallelFor (Gpu::KernelInfo const& info, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { if (Gpu::inLaunchRegion()) { ParallelFor(info, @@ -1631,15 +1653,15 @@ HostDeviceParallelFor (Gpu::KernelInfo const& info, } } -template ::value>, typename M2=std::enable_if_t::value>, typename M3=std::enable_if_t::value> > std::enable_if_t::value && MaybeHostDeviceRunnable::value && MaybeHostDeviceRunnable::value> HostDeviceParallelFor (Gpu::KernelInfo const& info, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { if (Gpu::inLaunchRegion()) { 
ParallelFor(info, @@ -1669,90 +1691,90 @@ void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept HostDeviceParallelFor(info,n,std::forward(f)); } -template -void HostDeviceFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +template +void HostDeviceFor (Gpu::KernelInfo const& info, BoxND const& box, L&& f) noexcept { HostDeviceParallelFor(info,box,std::forward(f)); } -template -void HostDeviceFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +template +void HostDeviceFor (Gpu::KernelInfo const& info, BoxND const& box, L&& f) noexcept { HostDeviceParallelFor(info,box,std::forward(f)); } -template ::value> > -void HostDeviceFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +template ::value> > +void HostDeviceFor (Gpu::KernelInfo const& info, BoxND const& box, T ncomp, L&& f) noexcept { HostDeviceParallelFor(info,box,ncomp,std::forward(f)); } -template ::value> > -void HostDeviceFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +template ::value> > +void HostDeviceFor (Gpu::KernelInfo const& info, BoxND const& box, T ncomp, L&& f) noexcept { HostDeviceParallelFor(info,box,ncomp,std::forward(f)); } -template +template void HostDeviceFor (Gpu::KernelInfo const& info, - Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept + BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { HostDeviceParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); } -template +template void HostDeviceFor (Gpu::KernelInfo const& info, - Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept + BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { HostDeviceParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); } -template +template void HostDeviceFor (Gpu::KernelInfo const& info, - Box const& box1, Box const& box2, Box const& box3, + BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { HostDeviceParallelFor(info, box1,box2,box3, std::forward(f1),std::forward(f2),std::forward(f3)); } -template +template void HostDeviceFor (Gpu::KernelInfo const& info, - Box const& box1, Box const& box2, Box const& box3, + BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { HostDeviceParallelFor(info, box1,box2,box3, std::forward(f1),std::forward(f2),std::forward(f3)); } -template ::value>, typename M2=std::enable_if_t::value> > void HostDeviceFor (Gpu::KernelInfo const& info, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { HostDeviceParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template ::value>, typename M2=std::enable_if_t::value> > void HostDeviceFor (Gpu::KernelInfo const& info, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { HostDeviceParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template ::value>, typename M2=std::enable_if_t::value>, typename M3=std::enable_if_t::value> > void HostDeviceFor (Gpu::KernelInfo const& info, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept 
{ HostDeviceParallelFor(info, box1,ncomp1,std::forward(f1), @@ -1760,14 +1782,14 @@ void HostDeviceFor (Gpu::KernelInfo const& info, box3,ncomp3,std::forward(f3)); } -template ::value>, typename M2=std::enable_if_t::value>, typename M3=std::enable_if_t::value> > void HostDeviceFor (Gpu::KernelInfo const& info, - Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept + BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { HostDeviceParallelFor(info, box1,ncomp1,std::forward(f1), @@ -1787,83 +1809,83 @@ void HostDeviceParallelFor (T n, L&& f) noexcept HostDeviceParallelFor(Gpu::KernelInfo{},n,std::forward(f)); } -template -void HostDeviceParallelFor (Box const& box, L&& f) noexcept +template +void HostDeviceParallelFor (BoxND const& box, L&& f) noexcept { HostDeviceParallelFor(Gpu::KernelInfo{},box,std::forward(f)); } -template -void HostDeviceParallelFor (Box const& box, L&& f) noexcept +template +void HostDeviceParallelFor (BoxND const& box, L&& f) noexcept { HostDeviceParallelFor(Gpu::KernelInfo{},box,std::forward(f)); } -template ::value> > -void HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept +template ::value> > +void HostDeviceParallelFor (BoxND const& box, T ncomp, L&& f) noexcept { HostDeviceParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); } -template ::value> > -void HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept +template ::value> > +void HostDeviceParallelFor (BoxND const& box, T ncomp, L&& f) noexcept { HostDeviceParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); } -template -void HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void HostDeviceParallelFor (BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { HostDeviceParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); } -template -void HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +template +void HostDeviceParallelFor (BoxND const& box1, BoxND const& box2, L1&& f1, L2&& f2) noexcept { HostDeviceParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); } -template -void HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, +template +void HostDeviceParallelFor (BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { HostDeviceParallelFor(Gpu::KernelInfo{}, box1,box2,box3, std::forward(f1),std::forward(f2),std::forward(f3)); } -template -void HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, +template +void HostDeviceParallelFor (BoxND const& box1, BoxND const& box2, BoxND const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { HostDeviceParallelFor(Gpu::KernelInfo{}, box1,box2,box3, std::forward(f1),std::forward(f2),std::forward(f3)); } -template ::value>, typename M2=std::enable_if_t::value> > -void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept +void HostDeviceParallelFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { HostDeviceParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template ::value>, typename M2=std::enable_if_t::value> > -void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2) noexcept +void HostDeviceParallelFor 
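HostDeviceParallelFor keeps its existing behavior of dispatching on Gpu::inLaunchRegion(): inside a launch region it calls the device ParallelFor, otherwise it falls back to the host loop. A sketch of forcing the host path with the existing Gpu::LaunchSafeGuard (the field name is illustrative):

    void host_fallback_sketch (amrex::Box const& bx, amrex::Array4<amrex::Real> const& a)
    {
        amrex::Gpu::LaunchSafeGuard lsg(false);   // temporarily leave the launch region
        amrex::HostDeviceParallelFor(bx,
            [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept
        {
            a(i,j,k) += 1.0;   // runs on the host here because inLaunchRegion() is false
        });
    }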
(BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2) noexcept { HostDeviceParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } -template ::value>, typename M2=std::enable_if_t::value>, typename M3=std::enable_if_t::value> > -void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept +void HostDeviceParallelFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { HostDeviceParallelFor(Gpu::KernelInfo{}, box1,ncomp1,std::forward(f1), @@ -1871,13 +1893,13 @@ void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, box3,ncomp3,std::forward(f3)); } -template ::value>, typename M2=std::enable_if_t::value>, typename M3=std::enable_if_t::value> > -void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3) noexcept +void HostDeviceParallelFor (BoxND const& box1, T1 ncomp1, L1&& f1, + BoxND const& box2, T2 ncomp2, L2&& f2, + BoxND const& box3, T3 ncomp3, L3&& f3) noexcept { HostDeviceParallelFor(Gpu::KernelInfo{}, box1,ncomp1,std::forward(f1), diff --git a/Src/Base/AMReX_GpuQualifiers.H b/Src/Base/AMReX_GpuQualifiers.H index 4fba23a849a..3e10bec54df 100644 --- a/Src/Base/AMReX_GpuQualifiers.H +++ b/Src/Base/AMReX_GpuQualifiers.H @@ -64,4 +64,22 @@ # include #endif +#define AMREX_WRONG_NUM_ARGS(...) static_assert(false,"Wrong number of arguments to macro") + +#define AMREX_GET_DGV_MACRO(_1,_2,_3,NAME,...) NAME +#define AMREX_DEVICE_GLOBAL_VARIABLE(...) AMREX_GET_DGV_MACRO(__VA_ARGS__,\ + AMREX_DGVARR, AMREX_DGV,\ + AMREX_WRONG_NUM_ARGS)(__VA_ARGS__) + +#ifdef AMREX_USE_SYCL +# define AMREX_DGV(type,name) SYCL_EXTERNAL sycl::ext::oneapi::experimental::device_global name +# define AMREX_DGVARR(type,num,name) SYCL_EXTERNAL sycl::ext::oneapi::experimental::device_global name +#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) +# define AMREX_DGV(type,name) __device__ type name +# define AMREX_DGVARR(type,num,name) __device__ type name[num] +#else +# define AMREX_DGV(type,name) type name +# define AMREX_DGVARR(type,num,name) type name[num] +#endif + #endif diff --git a/Src/Base/AMReX_GpuRange.H b/Src/Base/AMReX_GpuRange.H index ecf9a32fd2e..0b7fcd5a9c8 100644 --- a/Src/Base/AMReX_GpuRange.H +++ b/Src/Base/AMReX_GpuRange.H @@ -13,7 +13,8 @@ namespace amrex { template ,int> = 0> bool isEmpty (T n) noexcept { return n <= 0; } -AMREX_FORCE_INLINE bool isEmpty (Box const& b) noexcept { return b.isEmpty(); } +template +AMREX_FORCE_INLINE bool isEmpty (BoxND const& b) noexcept { return b.isEmpty(); } namespace Gpu { @@ -28,9 +29,10 @@ template ,int> = 0> AMREX_GPU_HOST_DEVICE Long at (T const& /*b*/, Long offset) noexcept { return offset; } -//! Box version +//! 
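The AMREX_DEVICE_GLOBAL_VARIABLE macro added in AMReX_GpuQualifiers.H dispatches on argument count: two arguments declare a scalar device global (AMREX_DGV), three declare a fixed-size array (AMREX_DGVARR), with backend-specific expansions for SYCL, CUDA/HIP, and host builds. A declaration sketch (the names dg_x and dg_vec are made up for illustration):

    AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Real, dg_x);       // scalar: expands via AMREX_DGV
    AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Real, 4, dg_vec);   // array of 4: expands via AMREX_DGVARR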
BoxND version +template AMREX_GPU_HOST_DEVICE -AMREX_FORCE_INLINE Long size (Box const& b) noexcept +AMREX_FORCE_INLINE Long size (BoxND const& b) noexcept { AMREX_IF_ON_DEVICE((return b.numPts();)) AMREX_IF_ON_HOST(( @@ -39,19 +41,13 @@ AMREX_FORCE_INLINE Long size (Box const& b) noexcept )) } +template AMREX_GPU_HOST_DEVICE -AMREX_FORCE_INLINE Box at (Box const& b, Long offset) noexcept +AMREX_FORCE_INLINE BoxND at (BoxND const& b, Long offset) noexcept { AMREX_IF_ON_DEVICE(( - auto len = b.length3d(); - Long k = offset / (len[0]*len[1]); - Long j = (offset - k*(len[0]*len[1])) / len[0]; - Long i = (offset - k*(len[0]*len[1])) - j*len[0]; - IntVect iv{AMREX_D_DECL(static_cast(i), - static_cast(j), - static_cast(k))}; - iv += b.smallEnd(); - return Box(iv,iv,b.type()); + auto iv = b.atOffset(offset); + return BoxND(iv,iv,b.ixType()); )) AMREX_IF_ON_HOST(( amrex::ignore_unused(offset); diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index ecb992983ba..e01303656ff 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -84,7 +84,7 @@ struct Handler struct Handler { - AMREX_GPU_HOST_DEVICE constexpr Handler (int n_active_threads = -1) + AMREX_GPU_HOST_DEVICE constexpr explicit Handler (int n_active_threads = -1) : numActiveThreads(n_active_threads) {} AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE diff --git a/Src/Base/AMReX_IArrayBox.H b/Src/Base/AMReX_IArrayBox.H index db0f26d5080..b070f5c1f96 100644 --- a/Src/Base/AMReX_IArrayBox.H +++ b/Src/Base/AMReX_IArrayBox.H @@ -94,7 +94,6 @@ public: void readFrom (std::istream& is); - //! Initialize from ParmParse with "fab" prefix. static void Initialize (); static void Finalize (); diff --git a/Src/Base/AMReX_IArrayBox.cpp b/Src/Base/AMReX_IArrayBox.cpp index 40b2762729d..1b9eea50658 100644 --- a/Src/Base/AMReX_IArrayBox.cpp +++ b/Src/Base/AMReX_IArrayBox.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include @@ -32,7 +31,6 @@ void IArrayBox::Initialize () { if (initialized) { return; } -// ParmParse pp("iab"); ifabio = std::make_unique(); diff --git a/Src/Base/AMReX_IndexType.H b/Src/Base/AMReX_IndexType.H index 0fd613d2a99..ea310b9b430 100644 --- a/Src/Base/AMReX_IndexType.H +++ b/Src/Base/AMReX_IndexType.H @@ -3,145 +3,364 @@ #define BL_INDEXTYPE_H #include -#include #include #include +#include #include namespace amrex { +/** +* \brief Type for defining CellIndex so that all IndexTypeND with different dimensions +* have the same CellIndex type. +*/ +struct CellIndexEnum { + //! The cell index type: one of CELL or NODE. + enum CellIndex { CELL = 0, NODE = 1 }; +}; + /** * \brief Cell-Based or Node-Based Indices * -* The class IndexType defines an index as being cell based or node (edge) -* based in each of the AMREX_SPACEDIM directions. This class defines an +* The class IndexTypeND defines an index as being cell based or node (edge) +* based in each of the dim directions. This class defines an * enumerated type CellIndex to be either CELL or NODE; i.e. each of the -* AMREX_SPACEDIM dimensions must be either CELL or NODE. +* dim dimensions must be either CELL or NODE. */ -class IndexType +template +class IndexTypeND : public CellIndexEnum { - friend MPI_Datatype ParallelDescriptor::Mpi_typemap::type(); - public: - //! The cell index type: one of CELL or NODE. - enum CellIndex { CELL = 0, NODE = 1 }; + static_assert(1 <= dim && dim <= 31, "The number of dimensions of IndexTypeND must be positive" + " and less than 32"); + //! 
The default constructor AMREX_GPU_HOST_DEVICE - constexpr IndexType () noexcept = default; - //! Construct an IndexType identical to an IntVect. + constexpr IndexTypeND () noexcept = default; + //! Construct an IndexTypeND identical to an IntVectND. AMREX_GPU_HOST_DEVICE - explicit IndexType (const IntVect& iv) noexcept - : itype(AMREX_D_TERM((iv[0]?1:0), | ((iv[1]?1:0)<<1), | ((iv[2]?1:0)<<2))) - {} + explicit IndexTypeND (const IntVectND& iv) noexcept { + for (int i=0; i, + int> = 0> AMREX_GPU_HOST_DEVICE - constexpr IndexType (AMREX_D_DECL(CellIndex i, CellIndex j, CellIndex k)) noexcept - : itype(AMREX_D_TERM(i, | (j<<1), | (k<<2))) - {} + constexpr IndexTypeND (CellIndex i, Args...js) noexcept { + CellIndex locarr[dim] = {i, static_cast(js)...}; + for (int s=0; s> dir); } - //! Return an integer representing the IndexType in direction dir. + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE constexpr + CellIndex ixType (int dir) const noexcept { return (CellIndex) ((itype & (1U<> dir); } + //! Return an integer representing the IndexTypeND in direction dir. [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int operator[] (int dir) const noexcept { return test(dir); } - //! Fill an IntVect of size AMREX_SPACEDIM with IndexTypes. + //! Returns the i'th CellIndex of the IndexTypeND. Used by structured bindings. + template + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE constexpr + CellIndex get () const noexcept { static_assert(0<=i && i>1)&1, (itype>>2)&1)); } - //! Fill an IntVect of size AMREX_SPACEDIM with IndexTypes. + IntVectND ixType () const noexcept { + IntVectND retval(0); + for (int i=0; i>1)&1, (itype>>2)&1)); } + IntVectND toIntVect () const noexcept { + IntVectND retval(0); + for (int i=0; i TheCellType () noexcept { + return IndexTypeND{}; } /** - * \brief This static member function returns an IndexType object of value - * IndexType::NODE. It is provided as a convenience to our users + * \brief This static member function returns an IndexTypeND object of value + * IndexTypeND::NODE. It is provided as a convenience to our users * when defining a Box all of whose faces should be of type - * IndexType::NODE. + * IndexTypeND::NODE. + */ + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + static constexpr IndexTypeND TheNodeType () noexcept { + IndexTypeND retval{}; + retval.setall(); + return retval; + } + + //! Return the size of this IndexTypeND. + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + static constexpr std::size_t size () noexcept { + return static_cast(dim); + } + + //! Return the size of this IndexTypeND. + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + static constexpr int isize () noexcept { + return dim; + } + + using value_type = CellIndex; + + /** + * \brief Returns a new IndexTypeND of size new_dim and + * assigns the first new_dim values of this IndexTypeND to it. + */ + template + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + IndexTypeND shrink () const noexcept { + static_assert(new_dim <= dim); + IndexTypeND retval{}; + retval.getBits() = itype & ((1U << new_dim) - 1); + return retval; + } + + /** + * \brief Returns a new IndexTypeND of size new_dim and + * assigns all values of this IndexTypeND to it and fill_extra to the remaining elements. 
+ */ + template + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + IndexTypeND expand (CellIndex fill_extra=CellIndex::CELL) const noexcept { + static_assert(new_dim >= dim); + IndexTypeND retval{}; + retval.getBits() = itype; + if (fill_extra == CellIndex::NODE) { + retval.getBits() |= (1U << new_dim) - (1U << dim); + } + return retval; + } + + /** + * \brief Returns a new IndexTypeND of size new_dim + * by either shrinking or expanding this IndexTypeND */ + template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - static constexpr IndexType TheNodeType () noexcept { - return IndexType(AMREX_D_DECL(IndexType::NODE, - IndexType::NODE, - IndexType::NODE)); + IndexTypeND resize (CellIndex fill_extra=CellIndex::CELL) const noexcept { + if constexpr (new_dim > dim) { + return expand(fill_extra); + } else { + return shrink(); + } } + //! Return the bit field representing the underlying data + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE constexpr + unsigned int& getBits () noexcept { return itype; } + + //! Return the bit field representing the underlying data + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE constexpr + const unsigned int& getBits () const noexcept { return itype; } + private: //! Returns 1<> (std::istream& is, IndexType& itype); +using IndexType = IndexTypeND; + +// Template deduction guide for IndexTypeND +template +AMREX_GPU_HOST_DEVICE // __device__ for HIP +IndexTypeND(const IntVectND&) -> IndexTypeND; + +// Template deduction guide for IndexTypeND +template , + int> = 0> +AMREX_GPU_HOST_DEVICE // __device__ for HIP +IndexTypeND(IndexType::CellIndex, Args...) -> IndexTypeND; + +namespace detail { + std::ostream& index_type_write (std::ostream& os, const unsigned int& iv, int dim); + std::istream& index_type_read (std::istream& is, unsigned int& iv, int dim); + + template + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE constexpr + T IndexTypeSplit_imp (T& retval, std::index_sequence, unsigned int src) noexcept { + int dim_shift = 0; + ( + ( + amrex::get(retval).getBits() = + (src >> dim_shift) & ((1U << amrex::get(retval).isize()) - 1), + dim_shift += amrex::get(retval).isize() + ), ... + ); + return retval; + } +} + +//! Write an IndexTypeND to an ostream in ASCII. +template +std::ostream& operator<< (std::ostream& os, const IndexTypeND& it) { + return detail::index_type_write(os, it.getBits(), dim); +} +//! Read an IndexTypeND from an istream. +template +std::istream& operator>> (std::istream& is, IndexTypeND& it) { + return detail::index_type_read(is, it.getBits(), dim); +} + +/** +* \brief Returns a IndexTypeND obtained by concatenating the input IndexTypeNDs. +* The dimension of the return value equals the sum of the dimensions of the inputted IndexTypeNDs. +*/ +template +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +constexpr IndexTypeND()> +IndexTypeCat (const IndexTypeND& v, const IndexTypeND&...vects) noexcept { + IndexTypeND()> retval{}; + retval.getBits() |= v.getBits(); + int dim_shift = v.isize(); + ( + ( + retval.getBits() |= (vects.getBits() << dim_shift), + dim_shift += vects.isize() + ), ... + ); + return retval; +} + +/** +* \brief Returns a tuple of IndexTypeND obtained by splitting the input IndexTypeND +* according to the dimensions specified by the template arguments. 
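IndexTypeND<dim> stores one CELL/NODE bit per direction in a single unsigned int, which is why the class static_asserts 1 <= dim <= 31 and exposes getBits(). A small interface sketch (inside some function body):

    amrex::IndexTypeND<3> t(amrex::IndexType::CELL, amrex::IndexType::NODE, amrex::IndexType::CELL);
    bool dir1_is_node = t.test(1);                              // true
    auto nodal        = amrex::IndexTypeND<3>::TheNodeType();   // NODE in every direction
    auto first_two    = t.shrink<2>();                          // IndexTypeND<2>: (CELL, NODE)
    auto [a, b, c]    = t;                                      // structured bindings via get<i>()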
+*/ +template +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +constexpr GpuTuple, IndexTypeND...> +IndexTypeSplit (const IndexTypeND()>& v) noexcept { + GpuTuple, IndexTypeND...> retval{}; + return detail::IndexTypeSplit_imp(retval, + std::make_index_sequence<1 + sizeof...(dims)>(), + v.getBits()); +} + +/** +* \brief Returns a new IndexTypeND of size new_dim and +* assigns the first new_dim values of v to it. +*/ +template +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +constexpr IndexTypeND +IndexTypeShrink (const IndexTypeND& v) noexcept { + return v.template shrink(); +} + +/** +* \brief Returns a new IndexTypeND of size new_dim and +* assigns all values of iv to it and fill_extra to the remaining elements. +*/ +template +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +constexpr IndexTypeND +IndexTypeExpand (const IndexTypeND& v, + IndexType::CellIndex fill_extra=IndexType::CellIndex::CELL) noexcept { + return v.template expand(fill_extra); +} +/** +* \brief Returns a new IndexTypeND of size new_dim +* by either shrinking or expanding iv. +*/ +template +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +constexpr IndexTypeND +IndexTypeResize (const IndexTypeND& v, + IndexType::CellIndex fill_extra=IndexType::CellIndex::CELL) noexcept { + return v.template resize(fill_extra); } +} // namespace amrex + +// Spcialize std::tuple_size for IndexTypeND. Used by structured bindings. +template +struct std::tuple_size> { + static constexpr std::size_t value = dim; +}; + +// Spcialize std::tuple_element for IndexTypeND. Used by structured bindings. +template +struct std::tuple_element> { + using type = typename amrex::IndexTypeND::CellIndex; +}; + #endif /*BL_INDEXTYPE_H*/ diff --git a/Src/Base/AMReX_IndexType.cpp b/Src/Base/AMReX_IndexType.cpp index e96052a4d14..e2cd7097ecf 100644 --- a/Src/Base/AMReX_IndexType.cpp +++ b/Src/Base/AMReX_IndexType.cpp @@ -4,16 +4,16 @@ #include #include -namespace amrex { +namespace amrex::detail { std::ostream& -operator<< (std::ostream& os, - const IndexType& it) +index_type_write (std::ostream& os, const unsigned int& iv, int dim) { - os << '(' - << AMREX_D_TERM( (it.test(0)?'N':'C'), - << ',' << (it.test(1)?'N':'C'), - << ',' << (it.test(2)?'N':'C')) << ')' << std::flush; + os << '(' << (((iv & 1U) != 0) ? 'N' : 'C'); + for (int i=1; i> (std::istream& is, - IndexType& it) +index_type_read (std::istream& is, unsigned int& iv, int dim) { - char AMREX_D_DECL(t0,t1,t2); - - AMREX_D_EXPR( is.ignore(BL_IGNORE_MAX, '(') >> t0, - is.ignore(BL_IGNORE_MAX, ',') >> t1, - is.ignore(BL_IGNORE_MAX, ',') >> t2); + char t = '0'; + is.ignore(BL_IGNORE_MAX, '(') >> t; + BL_ASSERT(t == 'C' || t == 'N'); + t == 'N' ? (iv |= 1U) : (iv &= ~1U); + for (int i=1; i> t; + BL_ASSERT(t == 'C' || t == 'N'); + t == 'N' ? 
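IndexTypeCat and IndexTypeSplit are the IndexTypeND analogues of the existing IntVectCat/IntVectSplit helpers: concatenation ORs the bit fields together with a running shift, and splitting reads them back out. For example (a sketch based on the semantics documented above):

    amrex::IndexTypeND<1> ta(amrex::IndexType::NODE);
    amrex::IndexTypeND<2> tb(amrex::IndexType::CELL, amrex::IndexType::NODE);
    auto tcat     = amrex::IndexTypeCat(ta, tb);          // IndexTypeND<3>: (NODE, CELL, NODE)
    auto [t1, t2] = amrex::IndexTypeSplit<1, 2>(tcat);    // back to an IndexTypeND<1> and an IndexTypeND<2>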
(iv |= (1U << i)) : (iv &= ~(1U << i)); + } is.ignore(BL_IGNORE_MAX, ')'); - AMREX_D_TERM( - BL_ASSERT(t0 == 'C' || t0 == 'N'); t0=='N'?it.set(0):it.unset(0); , - BL_ASSERT(t1 == 'C' || t1 == 'N'); t1=='N'?it.set(1):it.unset(1); , - BL_ASSERT(t2 == 'C' || t2 == 'N'); t2=='N'?it.set(2):it.unset(2)); if (is.fail()) { amrex::Error("operator>>(ostream&,IndexType&) failed"); diff --git a/Src/Base/AMReX_IntVect.H b/Src/Base/AMReX_IntVect.H index 2cf48053cd2..6452f29b036 100644 --- a/Src/Base/AMReX_IntVect.H +++ b/Src/Base/AMReX_IntVect.H @@ -167,6 +167,18 @@ public: } } + template = 0> + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + Dim3 dim3 ([[maybe_unused]] int fill_extra) const noexcept { + if constexpr (dim == 1) { + return Dim3{vect[0],fill_extra,fill_extra}; + } else if constexpr (dim == 2) { + return Dim3{vect[0],vect[1],fill_extra}; + } else { + return Dim3{vect[0],vect[1],vect[2]}; + } + } + #if __cplusplus >= 201402L template< typename T = int > AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE @@ -1108,7 +1120,7 @@ namespace detail { template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE constexpr T IntVectSplit_imp (T& retval, std::index_sequence, const int * src) noexcept { - (IntVectSplit_imp2(std::get(retval), src), ...); + (IntVectSplit_imp2(amrex::get(retval), src), ...); return retval; } diff --git a/Src/Base/AMReX_IntegratorBase.H b/Src/Base/AMReX_IntegratorBase.H index d9af8053d7e..af29dc82cca 100644 --- a/Src/Base/AMReX_IntegratorBase.H +++ b/Src/Base/AMReX_IntegratorBase.H @@ -165,31 +165,25 @@ protected: /** * \brief Rhs is the right-hand-side function the integrator will use. */ - std::function Rhs; + std::function Rhs; /** * \brief RhsIm is the implicit right-hand-side function an ImEx integrator * will use. */ - std::function RhsIm; + std::function RhsIm; /** * \brief RhsEx is the explicit right-hand-side function an ImEx integrator * will use. */ - std::function RhsEx; + std::function RhsEx; /** * \brief RhsFast is the fast timescale right-hand-side function a multirate * integrator will use. */ - std::function RhsFast; - - /** - * \brief The pre_rhs_action function is called by the integrator on state - * data before using it to evaluate a right-hand side. 
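The new dim3(fill_extra) overload on IntVectND pads the components that do not exist for dim < 3 with a caller-chosen value rather than an implicit default, which is convenient when building Dim3 loop bounds. A quick sketch:

    amrex::IntVectND<2> iv(7, 11);
    amrex::Dim3 d0 = iv.dim3(0);    // {7, 11, 0}
    amrex::Dim3 d1 = iv.dim3(-1);   // {7, 11, -1}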
- */ - std::function pre_rhs_action; + std::function RhsFast; /** * \brief The post_stage_action function is called by the integrator on @@ -283,28 +277,23 @@ public: virtual ~IntegratorBase () = default; - void set_rhs (std::function F) + void set_rhs (std::function F) { Rhs = F; } - void set_imex_rhs (std::function Fi, - std::function Fe) + void set_imex_rhs (std::function Fi, + std::function Fe) { RhsIm = Fi; RhsEx = Fe; } - void set_fast_rhs (std::function F) + void set_fast_rhs (std::function F) { RhsFast = F; } - void set_pre_rhs_action (std::function A) - { - pre_rhs_action = A; - } - void set_post_stage_action (std::function A) { post_stage_action = A; @@ -325,12 +314,6 @@ public: post_fast_step_action = A; } - void set_post_update (std::function A) - { - set_post_stage_action(A); - set_post_step_action(A); - } - amrex::Real get_time_step () { return time_step; diff --git a/Src/Base/AMReX_Loop.H b/Src/Base/AMReX_Loop.H index 076985caac3..fe76b8c9882 100644 --- a/Src/Base/AMReX_Loop.H +++ b/Src/Base/AMReX_Loop.H @@ -7,6 +7,117 @@ namespace amrex { +namespace loop_detail { + + // call_f_intvect_inner + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect_inner (std::index_sequence, F const& f, IntVectND<1> iv, Args...args) + noexcept -> decltype(f(0, 0, 0, args...)) + { + f(iv[0], 0, 0, args...); + } + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect_inner (std::index_sequence, F const& f, IntVectND<2> iv, Args...args) + noexcept -> decltype(f(0, 0, 0, args...)) + { + f(iv[0], iv[1], 0, args...); + } + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect_inner (std::index_sequence, F const& f, IntVectND iv, Args...args) + noexcept -> decltype(f(iv, args...)) + { + f(iv, args...); + } + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect_inner (std::index_sequence, F const& f, IntVectND iv, Args...args) + noexcept -> decltype(f(iv[Ns]..., args...)) + { + f(iv[Ns]..., args...); + } + + // call_f_intvect + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect (F const& f, IntVectND iv) + noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence(), f, iv)) + { + call_f_intvect_inner(std::make_index_sequence(), f, iv); + } + + // call_f_intvect_ncomp + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect_ncomp (F const& f, IntVectND iv, int n) + noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence(), f, iv, 0)) + { + call_f_intvect_inner(std::make_index_sequence(), f, iv, n); + } + + // call_f_intvect_inner_cpu + + template + AMREX_FORCE_INLINE + auto call_f_intvect_inner_cpu (std::index_sequence, F const& f, IntVectND<1> iv, Args...args) + noexcept -> decltype(f(0, 0, 0, args...)) + { + f(iv[0], 0, 0, args...); + } + + template + AMREX_FORCE_INLINE + auto call_f_intvect_inner_cpu (std::index_sequence, F const& f, IntVectND<2> iv, Args...args) + noexcept -> decltype(f(0, 0, 0, args...)) + { + f(iv[0], iv[1], 0, args...); + } + + template + AMREX_FORCE_INLINE + auto call_f_intvect_inner_cpu (std::index_sequence, F const& f, IntVectND iv, Args...args) + noexcept -> decltype(f(iv, args...)) + { + f(iv, args...); + } + + template + AMREX_FORCE_INLINE + auto call_f_intvect_inner_cpu (std::index_sequence, F const& f, IntVectND iv, Args...args) + noexcept -> decltype(f(iv[Ns]..., args...)) + { + f(iv[Ns]..., args...); + } + + // call_f_intvect_cpu + + template + AMREX_FORCE_INLINE + auto call_f_intvect_cpu (F const& f, IntVectND iv) 
+ noexcept -> decltype(call_f_intvect_inner_cpu(std::make_index_sequence(), f, iv)) + { + call_f_intvect_inner_cpu(std::make_index_sequence(), f, iv); + } + + // call_f_intvect_ncomp_cpu + + template + AMREX_FORCE_INLINE + auto call_f_intvect_ncomp_cpu (F const& f, IntVectND iv, int n) + noexcept -> decltype(call_f_intvect_inner_cpu(std::make_index_sequence(), f, iv, 0)) + { + call_f_intvect_inner_cpu(std::make_index_sequence(), f, iv, n); + } +} + template AMREX_GPU_HOST_DEVICE AMREX_ATTRIBUTE_FLATTEN_FOR @@ -59,64 +170,178 @@ void LoopConcurrent (Dim3 lo, Dim3 hi, int ncomp, F const& f) noexcept }}}} } -template +namespace loop_detail { + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void Loop_impND (L const& f, IntVectND const lo, IntVectND const hi, IntVectND iv) noexcept +{ + if constexpr (idim == 1) { + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect(f,iv); + } + } else if constexpr (idim == 2) { + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect(f,iv); + }} + } else if constexpr (idim == 3) { + for (int i2 = lo[2], h2 = hi[2]; i2 <= h2; ++i2) { iv[2] = i2; + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect(f,iv); + }}} + } else { + for (int id = lo[idim-1], hd = hi[idim-1]; id <= hd; ++id) { iv[idim-1] = id; + Loop_impND(f, lo, hi, iv); + } + } +} + +} + +template AMREX_GPU_HOST_DEVICE AMREX_ATTRIBUTE_FLATTEN_FOR -void Loop (Box const& bx, F const& f) noexcept +void Loop (BoxND const& bx, F const& f) noexcept { - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - for (int i = lo.x; i <= hi.x; ++i) { - f(i,j,k); - }}} + const auto lo = amrex::lbound_iv(bx); + const auto hi = amrex::ubound_iv(bx); + IntVectND iv; + loop_detail::Loop_impND(f, lo, hi, iv); } -template +namespace loop_detail { + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void Loop_impND (L const& f, IntVectND const lo, IntVectND const hi, IntVectND iv, int n) noexcept +{ + if constexpr (idim == 1) { + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp(f,iv,n); + } + } else if constexpr (idim == 2) { + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp(f,iv,n); + }} + } else if constexpr (idim == 3) { + for (int i2 = lo[2], h2 = hi[2]; i2 <= h2; ++i2) { iv[2] = i2; + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp(f,iv,n); + }}} + } else { + for (int id = lo[idim-1], hd = hi[idim-1]; id <= hd; ++id) { iv[idim-1] = id; + Loop_impND(f, lo, hi, iv, n); + } + } +} + +} + +template AMREX_GPU_HOST_DEVICE AMREX_ATTRIBUTE_FLATTEN_FOR -void Loop (Box const& bx, int ncomp, F const& f) noexcept +void Loop (BoxND const& bx, int ncomp, F const& f) noexcept { - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); + const auto lo = amrex::lbound_iv(bx); + const auto hi = amrex::ubound_iv(bx); + IntVectND iv; for (int n = 0; n < ncomp; ++n) { - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - for (int i = lo.x; i <= hi.x; ++i) { - f(i,j,k,n); - }}}} + loop_detail::Loop_impND(f, lo, hi, iv, n); + } } -template +namespace 
loop_detail { + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void LoopConcurrent_impND (L const& f, IntVectND const lo, IntVectND const hi, IntVectND iv) noexcept +{ + if constexpr (idim == 1) { + AMREX_PRAGMA_SIMD + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect(f,iv); + } + } else if constexpr (idim == 2) { + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + AMREX_PRAGMA_SIMD + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect(f,iv); + }} + } else if constexpr (idim == 3) { + for (int i2 = lo[2], h2 = hi[2]; i2 <= h2; ++i2) { iv[2] = i2; + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + AMREX_PRAGMA_SIMD + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect(f,iv); + }}} + } else { + for (int id = lo[idim-1], hd = hi[idim-1]; id <= hd; ++id) { iv[idim-1] = id; + LoopConcurrent_impND(f, lo, hi, iv); + } + } +} + +} + +template AMREX_GPU_HOST_DEVICE AMREX_ATTRIBUTE_FLATTEN_FOR -void LoopConcurrent (Box const& bx, F const& f) noexcept +void LoopConcurrent (BoxND const& bx, F const& f) noexcept { - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - f(i,j,k); - }}} + const auto lo = amrex::lbound_iv(bx); + const auto hi = amrex::ubound_iv(bx); + IntVectND iv; + loop_detail::LoopConcurrent_impND(f, lo, hi, iv); } -template +namespace loop_detail { + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void LoopConcurrent_impND (L const& f, IntVectND const lo, IntVectND const hi, IntVectND iv, int n) noexcept +{ + if constexpr (idim == 1) { + AMREX_PRAGMA_SIMD + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp(f,iv,n); + } + } else if constexpr (idim == 2) { + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + AMREX_PRAGMA_SIMD + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp(f,iv,n); + }} + } else if constexpr (idim == 3) { + for (int i2 = lo[2], h2 = hi[2]; i2 <= h2; ++i2) { iv[2] = i2; + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + AMREX_PRAGMA_SIMD + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp(f,iv,n); + }}} + } else { + for (int id = lo[idim-1], hd = hi[idim-1]; id <= hd; ++id) { iv[idim-1] = id; + LoopConcurrent_impND(f, lo, hi, iv, n); + } + } +} + +} + +template AMREX_GPU_HOST_DEVICE AMREX_ATTRIBUTE_FLATTEN_FOR -void LoopConcurrent (Box const& bx, int ncomp, F const& f) noexcept +void LoopConcurrent (BoxND const& bx, int ncomp, F const& f) noexcept { - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); + const auto lo = amrex::lbound_iv(bx); + const auto hi = amrex::ubound_iv(bx); + IntVectND iv; for (int n = 0; n < ncomp; ++n) { - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - f(i,j,k,n); - }}}} + loop_detail::LoopConcurrent_impND(f, lo, hi, iv, n); + } } // The functions above are __host__ __device__ functions. 
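The loop_detail::call_f_intvect* helpers above let the same Loop/LoopConcurrent/LoopOnCpu templates accept either the classic (i,j,k[,n]) callable (for dim <= 3, missing indices are passed as 0) or a callable taking an IntVectND<dim> for arbitrary dim. A sketch, assuming a standard Array4 field:

    void loop_sketch (amrex::Box const& bx, amrex::Array4<amrex::Real> const& a)
    {
        // Classic callable: unchanged for the default BoxND<AMREX_SPACEDIM>.
        amrex::LoopOnCpu(bx, [&] (int i, int j, int k) { a(i,j,k) += 1.0; });

        // Dimension-agnostic callable: receives the IntVectND directly.
        amrex::BoxND<2> bx2(amrex::IntVectND<2>(0,0), amrex::IntVectND<2>(7,7));
        amrex::LoopOnCpu(bx2, [&] (amrex::IntVectND<2> const& iv) {
            amrex::ignore_unused(iv);
        });
    }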
If f is not a @@ -172,60 +397,174 @@ void LoopConcurrentOnCpu (Dim3 lo, Dim3 hi, int ncomp, F const& f) noexcept }}}} } -template +namespace loop_detail { + +template +AMREX_FORCE_INLINE +void LoopOnCpu_impND (L const& f, IntVectND const lo, IntVectND const hi, IntVectND iv) noexcept +{ + if constexpr (idim == 1) { + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_cpu(f,iv); + } + } else if constexpr (idim == 2) { + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_cpu(f,iv); + }} + } else if constexpr (idim == 3) { + for (int i2 = lo[2], h2 = hi[2]; i2 <= h2; ++i2) { iv[2] = i2; + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_cpu(f,iv); + }}} + } else { + for (int id = lo[idim-1], hd = hi[idim-1]; id <= hd; ++id) { iv[idim-1] = id; + LoopOnCpu_impND(f, lo, hi, iv); + } + } +} + +} + +template AMREX_ATTRIBUTE_FLATTEN_FOR -void LoopOnCpu (Box const& bx, F const& f) noexcept +void LoopOnCpu (BoxND const& bx, F const& f) noexcept { - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - for (int i = lo.x; i <= hi.x; ++i) { - f(i,j,k); - }}} + const auto lo = amrex::lbound_iv(bx); + const auto hi = amrex::ubound_iv(bx); + IntVectND iv; + loop_detail::LoopOnCpu_impND(f, lo, hi, iv); } -template +namespace loop_detail { + +template +AMREX_FORCE_INLINE +void LoopOnCpu_impND (L const& f, IntVectND const lo, IntVectND const hi, IntVectND iv, int n) noexcept +{ + if constexpr (idim == 1) { + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp_cpu(f,iv,n); + } + } else if constexpr (idim == 2) { + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp_cpu(f,iv,n); + }} + } else if constexpr (idim == 3) { + for (int i2 = lo[2], h2 = hi[2]; i2 <= h2; ++i2) { iv[2] = i2; + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp_cpu(f,iv,n); + }}} + } else { + for (int id = lo[idim-1], hd = hi[idim-1]; id <= hd; ++id) { iv[idim-1] = id; + LoopOnCpu_impND(f, lo, hi, iv, n); + } + } +} + +} + +template AMREX_ATTRIBUTE_FLATTEN_FOR -void LoopOnCpu (Box const& bx, int ncomp, F const& f) noexcept +void LoopOnCpu (BoxND const& bx, int ncomp, F const& f) noexcept { - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); + const auto lo = amrex::lbound_iv(bx); + const auto hi = amrex::ubound_iv(bx); + IntVectND iv; for (int n = 0; n < ncomp; ++n) { - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - for (int i = lo.x; i <= hi.x; ++i) { - f(i,j,k,n); - }}}} + loop_detail::LoopOnCpu_impND(f, lo, hi, iv, n); + } } -template +namespace loop_detail { + +template +AMREX_FORCE_INLINE +void LoopConcurrentOnCpu_impND (L const& f, IntVectND const lo, IntVectND const hi, IntVectND iv) noexcept +{ + if constexpr (idim == 1) { + AMREX_PRAGMA_SIMD + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_cpu(f,iv); + } + } else if constexpr (idim == 2) { + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + AMREX_PRAGMA_SIMD + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + 
call_f_intvect_cpu(f,iv); + }} + } else if constexpr (idim == 3) { + for (int i2 = lo[2], h2 = hi[2]; i2 <= h2; ++i2) { iv[2] = i2; + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + AMREX_PRAGMA_SIMD + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_cpu(f,iv); + }}} + } else { + for (int id = lo[idim-1], hd = hi[idim-1]; id <= hd; ++id) { iv[idim-1] = id; + LoopConcurrentOnCpu_impND(f, lo, hi, iv); + } + } +} + +} + +template AMREX_ATTRIBUTE_FLATTEN_FOR -void LoopConcurrentOnCpu (Box const& bx, F const& f) noexcept +void LoopConcurrentOnCpu (BoxND const& bx, F const& f) noexcept { - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - f(i,j,k); - }}} + const auto lo = amrex::lbound_iv(bx); + const auto hi = amrex::ubound_iv(bx); + IntVectND iv; + loop_detail::LoopConcurrentOnCpu_impND(f, lo, hi, iv); } -template +namespace loop_detail { + +template +AMREX_FORCE_INLINE +void LoopConcurrentOnCpu_impND (L const& f, IntVectND const lo, IntVectND const hi, IntVectND iv, int n) noexcept +{ + if constexpr (idim == 1) { + AMREX_PRAGMA_SIMD + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp_cpu(f,iv,n); + } + } else if constexpr (idim == 2) { + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + AMREX_PRAGMA_SIMD + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp_cpu(f,iv,n); + }} + } else if constexpr (idim == 3) { + for (int i2 = lo[2], h2 = hi[2]; i2 <= h2; ++i2) { iv[2] = i2; + for (int i1 = lo[1], h1 = hi[1]; i1 <= h1; ++i1) { iv[1] = i1; + AMREX_PRAGMA_SIMD + for (int i0 = lo[0], h0 = hi[0]; i0 <= h0; ++i0) { iv[0] = i0; + call_f_intvect_ncomp_cpu(f,iv,n); + }}} + } else { + for (int id = lo[idim-1], hd = hi[idim-1]; id <= hd; ++id) { iv[idim-1] = id; + LoopConcurrentOnCpu_impND(f, lo, hi, iv, n); + } + } +} + +} + +template AMREX_ATTRIBUTE_FLATTEN_FOR -void LoopConcurrentOnCpu (Box const& bx, int ncomp, F const& f) noexcept +void LoopConcurrentOnCpu (BoxND const& bx, int ncomp, F const& f) noexcept { - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); + const auto lo = amrex::lbound_iv(bx); + const auto hi = amrex::ubound_iv(bx); + IntVectND iv; for (int n = 0; n < ncomp; ++n) { - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - f(i,j,k,n); - }}}} + loop_detail::LoopConcurrentOnCpu_impND(f, lo, hi, iv, n); + } } // Implementation of "constexpr for" based on diff --git a/Src/Base/AMReX_MFIter.H b/Src/Base/AMReX_MFIter.H index 7f0ca4d3539..fd21b0ee7c2 100644 --- a/Src/Base/AMReX_MFIter.H +++ b/Src/Base/AMReX_MFIter.H @@ -167,6 +167,8 @@ public: static int allowMultipleMFIters (int allow); + static int currentDepth (); + void Finalize (); protected: diff --git a/Src/Base/AMReX_MFIter.cpp b/Src/Base/AMReX_MFIter.cpp index f68ab9ce35f..8180318dadd 100644 --- a/Src/Base/AMReX_MFIter.cpp +++ b/Src/Base/AMReX_MFIter.cpp @@ -17,6 +17,17 @@ MFIter::allowMultipleMFIters (int allow) return allow; } +int +MFIter::currentDepth () +{ + int r; +#ifdef AMREX_USE_OMP +#pragma omp atomic read +#endif + r = MFIter::depth; + return r; +} + MFIter::MFIter (const FabArrayBase& fabarray_, unsigned char flags_) : @@ -222,13 +233,6 @@ MFIter::Finalize () // mark as invalid currentIndex = endIndex; -#ifdef AMREX_USE_OMP 
-#pragma omp master -#endif - { - depth = 0; - } - #ifdef BL_USE_TEAM if ( ! (flags & NoTeamBarrier) ) ParallelDescriptor::MyTeam().MemoryBarrier(); @@ -257,6 +261,13 @@ MFIter::Finalize () if (m_fa) { m_fa.reset(nullptr); } + +#ifdef AMREX_USE_OMP +#pragma omp master +#endif + { + depth = 0; + } } void diff --git a/Src/Base/AMReX_Machine.cpp b/Src/Base/AMReX_Machine.cpp index 97de84f30c8..a7cc0d3993e 100644 --- a/Src/Base/AMReX_Machine.cpp +++ b/Src/Base/AMReX_Machine.cpp @@ -327,9 +327,9 @@ class Machine void get_params () { - ParmParse pp("machine"); - pp.queryAdd("verbose", flag_verbose); - pp.queryAdd("very_verbose", flag_very_verbose); + ParmParse pp("amrex.machine"); + pp.query("verbose", flag_verbose); + pp.query("very_verbose", flag_very_verbose); } static std::string get_env_str (const std::string& env_key) diff --git a/Src/Base/AMReX_Math.H b/Src/Base/AMReX_Math.H index 42762001268..c4d8d524af0 100644 --- a/Src/Base/AMReX_Math.H +++ b/Src/Base/AMReX_Math.H @@ -274,7 +274,7 @@ std::uint64_t umulhi (std::uint64_t a, std::uint64_t b) /// struct FastDivmodU64 { - std::uint64_t divisor; + std::uint64_t divisor = 0; #ifdef AMREX_INT128_SUPPORTED std::uint64_t multiplier = 1U; @@ -324,6 +324,9 @@ struct FastDivmodU64 #endif + /// Default construct an invalid FastDivmodU64 + FastDivmodU64 () = default; + /// Returns the quotient of floor(dividend / divisor) [[nodiscard]] AMREX_GPU_HOST_DEVICE std::uint64_t divide (std::uint64_t dividend) const diff --git a/Src/Base/AMReX_MemPool.H b/Src/Base/AMReX_MemPool.H index 9a3b92fae04..5a87793aa8f 100644 --- a/Src/Base/AMReX_MemPool.H +++ b/Src/Base/AMReX_MemPool.H @@ -2,8 +2,12 @@ #define BL_MEMPOOL_H #include +#include #include +#include +#include + extern "C" { void amrex_mempool_init (); void amrex_mempool_finalize (); @@ -14,4 +18,40 @@ extern "C" { void amrex_array_init_snan (amrex_real* p, size_t nelems); } +namespace amrex { + template || std::is_same_v, int> FOO = 0> + void fill_snan (T* p, std::size_t nelems) + { + if (p == nullptr || nelems == 0) { return; } +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion() && run_on == RunOn::Device) { + amrex::ParallelFor(nelems, [=] AMREX_GPU_DEVICE (Long i) noexcept + { + p[i] = std::numeric_limits::signaling_NaN(); + }); + } else +#endif + { + if constexpr (std::is_same_v) { +#ifdef UINT32_MAX + const uint32_t snan = UINT32_C(0x7fa00000); + static_assert(sizeof(float) == sizeof(uint32_t), "MemPool: sizeof float != sizeof uint32_t"); + for (size_t i = 0; i < nelems; ++i) { + std::memcpy(p++, &snan, sizeof(float)); + } +#endif + } else if constexpr (std::is_same_v) { +#ifdef UINT64_MAX + const uint64_t snan = UINT64_C(0x7ff0000080000001); + static_assert(sizeof(double) == sizeof(uint64_t), "MemPool: sizeof double != sizeof uint64_t"); + for (size_t i = 0; i < nelems; ++i) { + std::memcpy(p++, &snan, sizeof(double)); + } +#endif + } + } + } +} + #endif diff --git a/Src/Base/AMReX_MemPool.cpp b/Src/Base/AMReX_MemPool.cpp index e1e26ce7b6d..3d4bfde5247 100644 --- a/Src/Base/AMReX_MemPool.cpp +++ b/Src/Base/AMReX_MemPool.cpp @@ -23,11 +23,6 @@ using namespace amrex; namespace { Vector > the_memory_pool; -#if defined(AMREX_TESTING) || defined(AMREX_DEBUG) - int init_snan = 1; -#else - int init_snan = 0; -#endif bool initialized = false; } @@ -41,9 +36,6 @@ void amrex_mempool_init () initialized = true; - ParmParse pp("fab"); - pp.queryAdd("init_snan", init_snan); - int nthreads = OpenMP::get_max_threads(); the_memory_pool.resize(nthreads); @@ -109,31 +101,12 @@ void amrex_mempool_get_stats (int& mp_min, 
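MFIter::currentDepth() exposes the nesting-depth counter (read under `omp atomic read`) so code outside the iterator can check whether an MFIter loop is currently active. A possible call-site sketch:

    if (amrex::MFIter::currentDepth() == 0) {
        // no MFIter is active here, so it is safe to do work that must not
        // run inside an MFIter/tiling loop
    }

Separately, note that the machine verbosity knobs queried above now live under the amrex.machine ParmParse prefix, e.g. `amrex.machine.verbose = 1` in an inputs file, instead of the old `machine` prefix.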
int& mp_max, int& mp_tot) // min, max void amrex_real_array_init (Real* p, size_t nelems) { - if (init_snan) { amrex_array_init_snan(p, nelems); } + if (amrex::InitSNaN()) { amrex_array_init_snan(p, nelems); } } void amrex_array_init_snan (Real* p, size_t nelems) { -#ifdef BL_USE_DOUBLE - -#ifdef UINT64_MAX - const uint64_t snan = UINT64_C(0x7ff0000080000001); - static_assert(sizeof(double) == sizeof(uint64_t), "MemPool: sizeof double != sizeof uint64_t"); - for (size_t i = 0; i < nelems; ++i) { - std::memcpy(p++, &snan, sizeof(double)); - } -#endif - -#else - -#ifdef UINT32_MAX - const uint32_t snan = UINT32_C(0x7fa00000); - static_assert(sizeof(float) == sizeof(uint32_t), "MemPool: sizeof float != sizeof uint32_t"); - for (size_t i = 0; i < nelems; ++i) { - std::memcpy(p++, &snan, sizeof(float)); - } -#endif - -#endif + amrex::fill_snan(p, nelems); } + } diff --git a/Src/Base/AMReX_MultiFabUtil.H b/Src/Base/AMReX_MultiFabUtil.H index d4b3a3271ca..228070a13c9 100644 --- a/Src/Base/AMReX_MultiFabUtil.H +++ b/Src/Base/AMReX_MultiFabUtil.H @@ -398,6 +398,20 @@ namespace amrex * \param stddev standard deviation of normal distribution */ void FillRandomNormal (MultiFab& mf, int scomp, int ncomp, Real mean, Real stddev); + + /** + * \brief Convexify AMR data + * + * This function "convexifies" the AMR data by removing cells that are + * covered by fine levels from coarse level MultiFabs. This could be + * useful for visualization. The returned MultiFabs on coarse levels + * have different BoxArrays from the original BoxArrays. For the finest + * level, the data is simply copied to the returned object. The returned + * MultiFabs have no ghost cells. For nodal data, the nodes on the + * coarse/fine interface exist on both levels. + */ + [[nodiscard]] Vector convexify (Vector const& mf, + Vector const& refinement_ratio); } namespace amrex { diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp index a2a1f14ed8d..86a1e290548 100644 --- a/Src/Base/AMReX_MultiFabUtil.cpp +++ b/Src/Base/AMReX_MultiFabUtil.cpp @@ -1216,4 +1216,57 @@ namespace amrex FillRandomNormal(p, npts, mean, stddev); } } + + Vector convexify (Vector const& mf, + Vector const& refinement_ratio) + { + if (mf.empty()) { return Vector{}; } + + const auto nlevels = int(mf.size()); + Vector rmf(nlevels); + + const int ncomp = mf[nlevels-1]->nComp(); + rmf[nlevels-1].define(mf[nlevels-1]->boxArray(), + mf[nlevels-1]->DistributionMap(), ncomp, 0); + MultiFab::Copy(rmf[nlevels-1], *mf[nlevels-1], 0, 0, ncomp, 0); + + for (int ilev = nlevels-2; ilev >= 0; --ilev) { + BoxArray fba = mf[ilev+1]->boxArray(); + BoxArray cba = mf[ilev ]->boxArray(); + AMREX_ASSERT(fba.ixType() == cba.ixType()); + AMREX_ASSERT(mf[ilev]->nComp() == ncomp); + + fba.convert(IntVect(0)).coarsen(refinement_ratio[ilev]); + cba.convert(IntVect(0)); + auto const& cdm = mf[ilev]->DistributionMap(); + + BoxList blnew, bltmp; + Vector procmap; + Vector localmap; + for (int ibox = 0; ibox < int(cba.size()); ++ibox) { + fba.complementIn(bltmp, cba[ibox]); + blnew.join(bltmp); + procmap.resize(procmap.size()+bltmp.size(), cdm[ibox]); + if (ParallelDescriptor::MyProc() == cdm[ibox]) { + localmap.resize(localmap.size()+bltmp.size(), ibox); + } + } + + if (blnew.isNotEmpty()) { + BoxArray banew(std::move(blnew)); + banew.convert(mf[ilev]->ixType()); + DistributionMapping dmnew(std::move(procmap)); + rmf[ilev].define(banew, dmnew, ncomp, 0); +#ifdef AMREX_USE_OMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for (MFIter mfi(rmf[ilev], 
TilingIfNotGPU()); mfi.isValid(); ++mfi) { + rmf[ilev][mfi].template copy + ((*mf[ilev])[localmap[mfi.LocalIndex()]], mfi.tilebox()); + } + } + } + + return rmf; + } } diff --git a/Src/Base/AMReX_OpenMP.cpp b/Src/Base/AMReX_OpenMP.cpp index 53782c8c347..56225253fa7 100644 --- a/Src/Base/AMReX_OpenMP.cpp +++ b/Src/Base/AMReX_OpenMP.cpp @@ -175,7 +175,7 @@ namespace amrex::OpenMP } } else { std::optional num_omp_threads = to_int(omp_threads); - if (num_omp_threads.has_value()) { + if (num_omp_threads.has_value() && num_omp_threads.value() > 0) { omp_set_num_threads(num_omp_threads.value()); } else { diff --git a/Src/Base/AMReX_PODVector.H b/Src/Base/AMReX_PODVector.H index 0e7458fe5b0..464bb552ee1 100644 --- a/Src/Base/AMReX_PODVector.H +++ b/Src/Base/AMReX_PODVector.H @@ -2,10 +2,12 @@ #define AMREX_PODVECTOR_H_ #include +#include #include #include #include #include +#include #include #include @@ -196,6 +198,33 @@ namespace amrex #endif std::memmove(dst, src, count); } + + template class Allocator> + void maybe_init_snan (T* data, Size count, Allocator const& allocator) + { + amrex::ignore_unused(data, count, allocator); + if constexpr (std::is_same_v> || + std::is_same_v>) { + if (amrex::InitSNaN()) { +#ifdef AMREX_USE_GPU + if constexpr (RunOnGpu>::value) { + amrex::fill_snan(data, count); + Gpu::streamSynchronize(); + return; + } else if constexpr (IsPolymorphicArenaAllocator>::value) { + if (allocator.arena()->isManaged() || + allocator.arena()->isDevice()) + { + amrex::fill_snan(data, count); + Gpu::streamSynchronize(); + return; + } + } +#endif + amrex::fill_snan(data, count); + } + } + } } namespace VectorGrowthStrategy @@ -254,6 +283,7 @@ namespace amrex { if (a_size != 0) { m_data = allocate(m_size); + detail::maybe_init_snan(m_data, m_size, (Allocator const&)(*this)); } } @@ -366,7 +396,7 @@ namespace amrex else { // if the allocators are not the same we give up and copy - *this = a_vector; + *this = a_vector; // must copy instead of move } return *this; @@ -594,16 +624,18 @@ namespace amrex void resize (size_type a_new_size) { - if (m_capacity < a_new_size) { - reserve(a_new_size); + auto old_size = m_size; + resize_without_init_snan(a_new_size); + if (old_size < a_new_size) { + detail::maybe_init_snan(m_data + old_size, + m_size - old_size, (Allocator const&)(*this)); } - m_size = a_new_size; } void resize (size_type a_new_size, const T& a_val) { size_type old_size = m_size; - resize(a_new_size); + resize_without_init_snan(a_new_size); if (old_size < a_new_size) { detail::uninitializedFillNImpl(m_data + old_size, @@ -738,6 +770,14 @@ namespace amrex m_size = new_size; m_capacity = new_capacity; } + + void resize_without_init_snan (size_type a_new_size) + { + if (m_capacity < a_new_size) { + reserve(a_new_size); + } + m_size = a_new_size; + } }; } diff --git a/Src/Base/AMReX_ParallelDescriptor.H b/Src/Base/AMReX_ParallelDescriptor.H index f949ae6f45c..a41d393833d 100644 --- a/Src/Base/AMReX_ParallelDescriptor.H +++ b/Src/Base/AMReX_ParallelDescriptor.H @@ -107,7 +107,7 @@ while ( false ) void Initialize (); void Finalize (); - extern AMREX_EXPORT int use_gpu_aware_mpi; + extern AMREX_EXPORT bool use_gpu_aware_mpi; inline bool UseGpuAwareMpi () { return use_gpu_aware_mpi; } //! 
Split the process pool into teams diff --git a/Src/Base/AMReX_ParallelDescriptor.cpp b/Src/Base/AMReX_ParallelDescriptor.cpp index 60f08d4ca35..f6ac26e7984 100644 --- a/Src/Base/AMReX_ParallelDescriptor.cpp +++ b/Src/Base/AMReX_ParallelDescriptor.cpp @@ -57,9 +57,9 @@ namespace amrex::ParallelDescriptor { #endif #ifdef AMREX_USE_GPU - int use_gpu_aware_mpi = false; + bool use_gpu_aware_mpi = false; #else - int use_gpu_aware_mpi = false; + bool use_gpu_aware_mpi = false; #endif ProcessTeam m_Team; @@ -1537,9 +1537,9 @@ StartTeams () int do_team_reduce = 0; #if defined(BL_USE_MPI3) - ParmParse pp("team"); - pp.queryAdd("size", team_size); - pp.queryAdd("reduce", do_team_reduce); + ParmParse pp("amrex.team"); + pp.query("size", team_size); + pp.query("reduce", do_team_reduce); if (nprocs % team_size != 0) { amrex::Abort("Number of processes not divisible by team size"); } diff --git a/Src/Base/AMReX_ParmParse.H b/Src/Base/AMReX_ParmParse.H index 11524eca689..cc9588793da 100644 --- a/Src/Base/AMReX_ParmParse.H +++ b/Src/Base/AMReX_ParmParse.H @@ -1,25 +1,31 @@ - -#ifndef BL_PARMPARSE_H -#define BL_PARMPARSE_H +#ifndef AMREX_PARMPARSE_H_ +#define AMREX_PARMPARSE_H_ #include #include +#include +#include +#include +#include #include +#include +#include +#include #include -#include #include -#include +#include #include -#include -#include namespace amrex { -class Box; +template +class BoxND; +using Box = BoxND; template class IntVectND; using IntVect = IntVectND; +class RealVect; // // ParmParse class implements a simple database for the storage and @@ -268,28 +274,31 @@ using IntVect = IntVectND; * t = 1.5 * #endif * +* Math expression is supported for integers and reals. For example +* +* n_cell = 128 +* amrex.n_cell = n_cell*2 8 16**2 +* +* becomes +* +* n_cell = 128 +* amrex.n_cell = 256 8 256 +* +* More details can be found at https://amrex-codes.github.io/amrex/docs_html/Basics.html#parmparse */ class ParmParse { public: - class Frame; - class Record; - enum { LAST = -1, FIRST = 0, ALL = -1 }; /** * \brief Construct an additional ParmParse object sharing the same - * internal table as any other such objects in existence. If - * prefix is specified, load this string as the code prefix - * for this particular ParmParse object. + * internal table as any other such objects in existence. If prefix is + * specified, load this string as the code prefix for this particular + * ParmParse object. If parser_prefix is specified, it will be used as + * prefixed in math expression evaluations. */ - explicit ParmParse (const std::string& prefix = std::string()); - - ParmParse (ParmParse const& rhs) = default; - ~ParmParse () = default; - - ParmParse (ParmParse && rhs) = delete; - ParmParse& operator= (const ParmParse& rhs) = delete; - ParmParse& operator= (ParmParse&& rhs) = delete; + explicit ParmParse (std::string prefix = std::string(), + std::string parser_prefix = std::string()); //! Returns true if name is in table. [[nodiscard]] bool contains (const char* name) const; @@ -305,15 +314,6 @@ public: */ [[nodiscard]] int countname (const std::string& name) const; /** - * \brief Returns the number of records of the given name (prepended with prefix) appears - * in the table - */ - [[nodiscard]] int countRecords (const std::string& name) const; - //! Returns the nth record of the given name, or zero if none found. - [[nodiscard]] Record getRecord (const std::string& name, int n = LAST) const; - //! Write the contents of the table in ASCII to the ostream. 
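
A minimal sketch of the math-expression support documented above, reusing the `n_cell` example from that comment (assumes AMReX has been initialized and the inputs file has been read):

    #include <AMReX_ParmParse.H>
    #include <vector>

    // inputs:
    //   n_cell = 128
    //   amrex.n_cell = n_cell*2 8 16**2
    void read_n_cell ()
    {
        amrex::ParmParse pp("amrex");
        // An optional second argument, e.g. ParmParse pp("amrex", "amrex"), sets the
        // prefix tried when resolving symbols inside expressions.
        std::vector<int> n_cell;
        pp.queryarr("n_cell", n_cell);  // -> {256, 8, 256}; "n_cell" inside the
                                        //    expression is resolved from the table
    }
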
- static void dumpTable (std::ostream& os, bool prettyPrint = false); - /** * \brief Get the ival'th value of kth occurrence of the requested name. * If successful, the value is converted to a bool and stored * in reference ref. If the kth occurrence does not exist or @@ -345,7 +345,7 @@ public: int ival = FIRST) const; //! Same as querykth() but searches for the last occurrence of name. int query (const char* name, - bool& ref, + bool& ref, int ival = FIRST) const; //! Add a key 'name'with value 'ref' to the end of the PP table. void add (const char* name, bool val); @@ -383,8 +383,7 @@ public: int& ref, int ival = FIRST) const; //! Add a key 'name'with value 'ref' to the end of the PP table. - void add (const char* name, - int val); + void add (const char* name, int val); /** * \brief Get the ival'th value of kth occurrence of the requested name. * If successful, the value is converted to an int and stored @@ -418,8 +417,7 @@ public: long& ref, int ival = FIRST) const; //! Add a key 'name'with value 'ref' to the end of the PP table. - void add (const char* name, - long val); + void add (const char* name, long val); /** * \brief Get the ival'th value of kth occurrence of the requested name. * If successful, the value is converted to an int and stored @@ -453,8 +451,7 @@ public: long long& ref, int ival = FIRST) const; //! Add a key 'name'with value 'ref' to the end of the PP table. - void add (const char* name, - long long val); + void add (const char* name, long long val); /** * \brief Get the ival'th value of kth occurrence of the requested name. * If successful, the value is converted to a float and stored @@ -488,8 +485,7 @@ public: float& ref, int ival = FIRST) const; //! Add a key 'name'with value 'ref' to the end of the PP table. - void add (const char* name, - float val); + void add (const char* name, float val); /** * \brief Get the ival'th value of kth occurrence of the requested name. * If successful, the value is converted to a double and stored @@ -523,8 +519,7 @@ public: double& ref, int ival = FIRST) const; //! Add a key 'name'with value 'ref' to the end of the PP table. - void add (const char* name, - double val); + void add (const char* name, double val); /** * \brief Get the ival'th value of kth occurrence of the requested name. * If successful, the value is converted to a std::string and stored @@ -534,15 +529,15 @@ public: * is output and the program halts. Note that ival = 0 is the * first value in the list. */ - void getkth (const char* name, - int k, - std::string& ref, - int ival = FIRST) const; + void getkth (const char* name, + int k, + std::string& ref, + int ival = FIRST) const; //! Same as getkth() but searches for the last occurrence of name. - void get (const char* name, - std::string& ref, - int ival = FIRST) const; + void get (const char* name, + std::string& ref, + int ival = FIRST) const; /** * \brief Similar to getkth() but returns 0 if there is no kth occurrence * of name. If successful, it returns 1 and stores the value in @@ -550,23 +545,16 @@ public: * occurrence does not, or if there is a type mismatch, then the * program signals an error and halts. */ - int querykth (const char* name, - int k, - std::string& ref, - int ival = FIRST) const; + int querykth (const char* name, + int k, + std::string& ref, + int ival = FIRST) const; //! Same as querykth() but searches for the last occurrence of name. 
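
For orientation, a short sketch of the scalar get/query accessors declared here (the `prob` prefix and key names are invented for the example):

    amrex::ParmParse pp("prob");
    amrex::Real cfl = 0.7;          // keeps its default if prob.cfl is absent
    pp.query("cfl", cfl);           // optional: returns 0 or 1, never aborts
    int max_step;
    pp.get("max_step", max_step);   // required: aborts if prob.max_step is missing
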
- int query (const char* name, - std::string& ref, - int ival = FIRST) const; + int query (const char* name, + std::string& ref, + int ival = FIRST) const; //! Add a key 'name'with value 'ref' to the end of the PP table. - void add (const char* name, - const std::string& val); - - //! keyword for files to load - static std::string const FileKeyword; - - //! Add keys and values from a file to the end of the PP table. - static void addfile (std::string const& filename); + void add (const char* name, const std::string& val); /** * \brief Get the ival'th value of kth occurrence of the requested name. @@ -594,15 +582,14 @@ public: */ int querykth (const char* name, int k, - IntVect& ref, + IntVect& ref, int ival = FIRST) const; //! Same as querykth() but searches for the last occurrence of name. int query (const char* name, IntVect& ref, int ival = FIRST) const; //! Add a key 'name'with value 'ref' to the end of the PP table. - void add (const char* name, - const IntVect& val); + void add (const char* name, const IntVect& val); /** * \brief Get the ival'th value of kth occurrence of the requested name. * If successful, the value is converted to a Box and stored @@ -636,8 +623,7 @@ public: Box& ref, int ival = FIRST) const; //! Add a key 'name'with value 'ref' to the end of the PP table. - void add (const char* name, - const Box& val); + void add (const char* name, const Box& val); /** * \brief Gets an std::vector\ of num_val values from kth occurrence of * given name. If successful, the values are converted to an int @@ -650,27 +636,27 @@ public: * converted to an int, an error message is reported and the * program halts. */ - void getktharr (const char* name, - int k, + void getktharr (const char* name, + int k, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Same as getktharr() but searches for last occurrence of name. - void getarr (const char* name, + void getarr (const char* name, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! queryktharr() is to querykth() as getktharr() is to getkth(). - int queryktharr (const char* name, - int k, + int queryktharr (const char* name, + int k, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Same as queryktharr() but searches for last occurrence of name. - int queryarr (const char* name, + int queryarr (const char* name, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Add a key 'name' with vector of values 'ref' to the end of the PP table. void addarr (const char* name, const std::vector& ref); @@ -686,30 +672,29 @@ public: * converted to a long, an error message is reported and the * program halts. */ - void getktharr (const char* name, - int k, + void getktharr (const char* name, + int k, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Same as getktharr() but searches for last occurrence of name. - void getarr (const char* name, + void getarr (const char* name, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! queryktharr() is to querykth() as getktharr() is to getkth(). 
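
The array accessors work the same way; a sketch with invented key names:

    amrex::ParmParse pp("prob");
    std::vector<amrex::Real> prob_lo;
    pp.getarr("prob_lo", prob_lo);        // all values of prob.prob_lo; aborts if absent
    std::vector<int> n_cell{64, 64, 64};
    pp.queryarr("n_cell", n_cell);        // optional: the default is kept on a miss
    pp.addarr("n_cell_used", n_cell);     // appends a new entry to the table
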
- int queryktharr (const char* name, - int k, + int queryktharr (const char* name, + int k, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Same as queryktharr() but searches for last occurrence of name. - int queryarr (const char* name, + int queryarr (const char* name, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Add a key 'name' with vector of values 'ref' to the end of the PP table. - void addarr (const char* name, - const std::vector& ref); + void addarr (const char* name, const std::vector& ref); /** * \brief Gets an std::vector\ of num_val values from kth occurrence of @@ -723,27 +708,27 @@ public: * converted to a long long, an error message is reported and the * program halts. */ - void getktharr (const char* name, - int k, + void getktharr (const char* name, + int k, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Same as getktharr() but searches for last occurrence of name. - void getarr (const char* name, + void getarr (const char* name, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! queryktharr() is to querykth() as getktharr() is to getkth(). - int queryktharr (const char* name, - int k, + int queryktharr (const char* name, + int k, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Same as queryktharr() but searches for last occurrence of name. - int queryarr (const char* name, + int queryarr (const char* name, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Add a key 'name' with vector of values 'ref' to the end of the PP table. void addarr (const char* name, const std::vector& ref); @@ -759,27 +744,27 @@ public: * values cannot be converted to a float, an error message is * reported and the program halts. */ - void getktharr (const char* name, - int k, + void getktharr (const char* name, + int k, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Same as getktharr() but searches for last occurrence of name. - void getarr (const char* name, + void getarr (const char* name, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! queryktharr() is to querykth() as getktharr() is to getkth(). - int queryktharr (const char* name, - int k, + int queryktharr (const char* name, + int k, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Same as queryktharr() but searches for last occurrence of name. - int queryarr (const char* name, + int queryarr (const char* name, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Add a key 'name' with vector of values 'ref' to the end of the PP table. void addarr (const char* name, const std::vector& ref); /** @@ -794,27 +779,27 @@ public: * values cannot be converted to a double, an error message is * reported and the program halts. 
*/ - void getktharr (const char* name, - int k, + void getktharr (const char* name, + int k, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Same as getktharr() but searches for last occurrence of name. - void getarr (const char* name, + void getarr (const char* name, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! queryktharr() is to querykth() as getktharr() is to getkth(). - int queryktharr (const char* name, - int k, + int queryktharr (const char* name, + int k, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Same as queryktharr() but searches for last occurrence of name. - int queryarr (const char* name, + int queryarr (const char* name, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Add a key 'name' with vector of values 'ref' to the end of the PP table. void addarr (const char* name, const std::vector& ref); /** @@ -829,27 +814,27 @@ public: * values cannot be converted to an std::string, an error message is * reported and the program halts. */ - void getktharr (const char* name, - int k, + void getktharr (const char* name, + int k, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Same as getktharr() but searches for last occurrence of name. - void getarr (const char* name, + void getarr (const char* name, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! queryktharr() is to querykth() as getktharr() is to getkth(). - int queryktharr (const char* name, - int k, + int queryktharr (const char* name, + int k, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Same as queryktharr() but searches for last occurrence of name.2 - int queryarr (const char* name, + int queryarr (const char* name, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Add a key 'name' with vector of values 'ref' to the end of the PP table. void addarr (const char* name, const std::vector& ref); /** @@ -864,27 +849,27 @@ public: * values cannot be converted to an IntVect, an error message is * reported and the program halts. */ - void getktharr (const char* name, - int k, + void getktharr (const char* name, + int k, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Same as getktharr() but searches for last occurrence of name. - void getarr (const char* name, + void getarr (const char* name, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! queryktharr() is to querykth() as getktharr() is to getkth(). - int queryktharr (const char* name, - int k, + int queryktharr (const char* name, + int k, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! 
Same as queryktharr() but searches for last occurrence of name.2 - int queryarr (const char* name, + int queryarr (const char* name, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Add a key 'name' with vector of values 'ref' to the end of the PP table. void addarr (const char* name, const std::vector& ref); /** @@ -899,30 +884,52 @@ public: * values cannot be converted to an Box, an error message is * reported and the program halts. */ - void getktharr (const char* name, - int k, + void getktharr (const char* name, + int k, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Same as getktharr() but searches for last occurrence of name. - void getarr (const char* name, + void getarr (const char* name, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! queryktharr() is to querykth() as getktharr() is to getkth(). - int queryktharr (const char* name, - int k, + int queryktharr (const char* name, + int k, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Same as queryktharr() but searches for last occurrence of name.2 - int queryarr (const char* name, + int queryarr (const char* name, std::vector& ref, - int start_ix = FIRST, - int num_val = ALL) const; + int start_ix = FIRST, + int num_val = ALL) const; //! Add a key 'name' with vector of values 'ref' to the end of the PP table. void addarr (const char* name, const std::vector& refd); + /* + * \brief Query IntVect from array + * + * This reads IntVect from an array (e.g., `8 16 8`), not the format + * using parentheses (e.g., `(8,16,8)`). + */ + int queryarr (const char* name, IntVect& ref) const; + + /* + * \brief Get IntVect from array + * + * This reads IntVect from an array (e.g., `8 16 8`), not the format + * using parentheses (e.g., `(8,16,8)`). + */ + void getarr (const char* name, IntVect& ref) const; + + //! Query RealVect from array + int queryarr (const char* name, RealVect& ref) const; + + //! Get RealVect from array + void getarr (const char* name, RealVect& ref) const; + template void get (const char* name, std::array& ref) const { std::vector v; @@ -1031,9 +1038,297 @@ public: return exist; } + /** + * \brief Query with Parser. If `name` is found, this uses amrex::Parser + * to parse the entire list of empty space separated values as a single + * scalar. The return value indicates whether it's found. Note that + * queryWithParser will be used recursively for unresolved symbols. + */ + int queryWithParser (const char* name, int& ref) const; + int queryWithParser (const char* name, long& ref) const; + int queryWithParser (const char* name, long long& ref) const; + int queryWithParser (const char* name, float& ref) const; + int queryWithParser (const char* name, double& ref) const; + + /** + * \brief Query with Parser. The return value indicates whether it's + * found. Note that queryWithParser will be used for unresolved + * symbols. If the number of elements in the input does not equal to + * `nvals`, it's a runtime error. 
+ */ + int queryarrWithParser (const char* name, int nvals, int* ref) const; + int queryarrWithParser (const char* name, int nvals, long* ref) const; + int queryarrWithParser (const char* name, int nvals, long long* ref) const; + int queryarrWithParser (const char* name, int nvals, float* ref) const; + int queryarrWithParser (const char* name, int nvals, double* ref) const; + template || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v,int> = 0> + int queryarrWithParser (const char* name, int nvals, std::vector& ref) const + { + if (this->contains(name)) { + if (int(ref.size()) < nvals) { ref.resize(nvals); } + return this->queryarrWithParser(name, nvals, ref.data()); + } else { + return 0; + } + } + + /** + * \brief Query with Parser. If `name` is found, this uses amrex::Parser + * to parse the entire list of empty space separated values as a single + * scalar. If not, the value in `ref` will be added to the ParmParse + * database. The return value indicates whether it's found. + */ + template || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v,int> = 0> + int queryAddWithParser (const char* name, T& ref) const + { + int exist = this->queryWithParser(name, ref); + if (!exist) { + this->add(name, ref); + } + return exist; + } + + /** + * \brief Get with Parser. If `name` is found, this uses amrex::Parser + * to parse the entire list of empty space separated values as a single + * scalar. If not, it's a runtime error. + */ + template || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v,int> = 0> + void getWithParser (const char* name, T& ref) const + { + int exist = this->queryWithParser(name, ref); + if (!exist) { + amrex::Error(std::string("ParmParse::getWithParser: failed to get ")+name); + } + } + + /** + * \brief Get with Parser. If `name` is not found, it's a runtime + * error. If the number of elements does not equal to `nvals`, it's also + * a runtime error. + */ + template || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v,int> = 0> + void getarrWithParser (const char* name, int nvals, T* ref) const + { + int exist = this->queryarrWithParser(name, nvals, ref); + if (!exist) { + amrex::Error(std::string("ParmParse::getarrWithParser: failed to get ")+name); + } + } + + /** + * \brief Get with Parser. If `name` is not found, it's a runtime + * error. If the number of elements does not equal to `nvals`, it's also + * a runtime error. + */ + template || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v,int> = 0> + void getarrWithParser (const char* name, int nvals, std::vector& ref) const + { + int exist = this->queryarrWithParser(name, nvals, ref); + if (!exist) { + amrex::Error(std::string("ParmParse::getarrWithParser: failed to get ")+name); + } + } + + /* + * \brief Query two names. + * + * This function queries with `new_name` first. If it's not found, it + * will try again with `old_name`. + */ + template + int query (const char* new_name, const char* old_name, T& ref) + { + return (this->query(new_name, ref) || + this->query(old_name, ref)); + } + + /** + * \brief Get using two names. + * + * This function queries with `new_name` first, If it's not found, it + * will try again with `old_name`. It's an error if neither name is + * found. 
+ */ + template + void get (const char* new_name, const char* old_name, T& ref) + { + auto exist = this->query(new_name, old_name, ref); + if (!exist) { + amrex::ErrorStream() << "ParmParse::get failed to find " + << new_name << " and " << old_name << '\n'; + ParmParse::dumpTable(amrex::ErrorStream()); + amrex::Abort(); + } + } + + /** + * \brief. Query enum value using given name. + * + * Here T is an enum class defined by AMREX_ENUM. The return value + * indicates if `name` is found. An exception is thrown, if the found + * string associated with the name cannot be converted to an enumerator + * (i.e., the string does not match any names in the definition of T). + */ + template , + std::enable_if_t = 0> + int query (const char* name, T& ref) + { + std::string s; + int exist = this->query(name, s); + if (exist) { + try { + ref = amrex::getEnum(s); + } catch (...) { + throw; + } + } + return exist; + } + + /** + * \brief. Get enum value using given name. + * + * Here T is an enum class defined by AMREX_ENUM. It's a runtime error, + * if `name` is not found. An exception is thrown, if the found string + * associated with the name cannot be converted to an enumerator (i.e., + * the string does not match any names in the definition of T). + */ + template , + std::enable_if_t = 0> + void get (const char* name, T& ref) + { + std::string s; + this->get(name, s); + try { + ref = amrex::getEnum(s); + } catch (...) { + throw; + } + } + + //! Query an array of enum values using given name. + template , + std::enable_if_t = 0> + int queryarr (const char* name, std::vector& ref) + { + std::vector s; + int exist = this->queryarr(name, s); + if (exist) { + ref.resize(s.size()); + for (std::size_t i = 0; i < s.size(); ++i) { + ref[i] = amrex::getEnum(s[i]); + } + } + return exist; + } + + //! Get an array of enum values using given name. + template , + std::enable_if_t = 0> + void getarr (const char* name, std::vector& ref) + { + std::vector s; + this->getarr(name, s); + ref.resize(s.size()); + for (std::size_t i = 0; i < s.size(); ++i) { + ref[i] = amrex::getEnum(s[i]); + } + } + + /** + * \brief. Query enum value using given name. + * + * Here T is an enum class defined by AMREX_ENUM. The return value + * indicates if `name` is found. An exception is thrown, if the found + * string associated with the name cannot be case-insensitively + * converted to an enumerator (i.e., the found string, not `name`, does + * not case-insensitively match any names in the definition of T). If + * there are multiple matches, the first one is used. + */ + template , + std::enable_if_t = 0> + int query_enum_case_insensitive (const char* name, T& ref) + { + std::string s; + int exist = this->query(name, s); + if (exist) { + s = amrex::toLower(s); + auto const& enum_names = amrex::getEnumNameStrings(); + auto found = std::find_if(enum_names.begin(), enum_names.end(), + [&] (std::string const& ename) { + return amrex::toLower(ename) == s; + }); + if (found != enum_names.end()) { + ref = static_cast(std::distance(enum_names.begin(), found)); + } else { + std::string msg("query_enum_case_insensitive(\""); + msg.append(name).append("\",").append(amrex::getEnumClassName()) + .append("&) failed."); + throw std::runtime_error(msg); + } + } + return exist; + } + + /** + * \brief. Get enum value using given name. + * + * Here T is an enum class defined by AMREX_ENUM. It's a runtime error, + * if `name` is not found. 
An exception is thrown, if the found string + * associated with the name cannot be case-insensitively converted to an + * enumerator (i.e., the found string, not `name`, does not + * case-insensitively match any names in the definition of T). If there + * are multiple matches, the first one is used. + */ + template , + std::enable_if_t = 0> + void get_enum_case_insensitive (const char* name, T& ref) + { + int exist = this->query_enum_case_insensitive(name, ref); + if (!exist) { + std::string msg("get_enum_case_insensitive(\""); + msg.append(name).append("\",").append(amrex::getEnumClassName()) + .append("&) failed."); + amrex::Abort(msg); + } + } + //! Remove given name from the table. int remove (const char* name); + //! Make Parser using given string `func` as function body and `vars` as + //! variable names. Constants known to ParmParse will be set. It's a + //! runtime error, if there are unknown symbols in `func`. + [[nodiscard]] Parser makeParser (std::string const& func, + Vector const& vars) const; + + //! Make IParser using given string `func` as function body and `vars` + //! as variable names. Constants known to ParmParse will be set. It's a + //! runtime error, if there are unknown symbols in `func`. + [[nodiscard]] IParser makeIParser (std::string const& func, + Vector const& vars) const; + /** * \brief Construct an initial ParmParse object from the argc and argv * passed in to main(). An error will be signalled if another @@ -1041,14 +1336,28 @@ public: * read the parameters in from that file first and then append * those derived from argv to the table. */ - static void Initialize(int argc, - char** argv, - const char* parfile); + static void Initialize (int argc, char** argv, const char* parfile); /** * \brief The destructor. The internal static table will only be deleted * if there are no other ParmParse objects in existence. */ - static void Finalize(); + static void Finalize (); + + //! Set prefix used by math expression Parser + static void SetParserPrefix (std::string a_prefix); + + static int Verbose (); + static void SetVerbose (int v); + + //! Write the contents of the table in ASCII to the ostream. + static void dumpTable (std::ostream& os, bool prettyPrint = false); + + //! Write the table in a pretty way to the ostream. If there are + //! duplicates, only the last one is printed. + static void prettyPrintTable (std::ostream& os); + + //! Add keys and values from a file to the end of the PP table. + static void addfile (std::string const& filename); static bool QueryUnusedInputs (); @@ -1061,79 +1370,32 @@ public: //! Returns [prefix.]* parameters. [[nodiscard]] static std::set getEntries (const std::string& prefix = std::string()); - struct PP_entry; - using Table = std::list; - static void appendTable(ParmParse::Table& tab); - [[nodiscard]] const Table& table() const {return *m_table;} - -protected: + struct PP_entry { + // There can be multiple occurrences for a given name (e.g., + // multiple lines starting with `foo =` in inputs. For each + // occurrence, there can be multiple values. Thus, the use of + // vector>. + std::vector> m_vals; + mutable Long m_count = 0; + }; + using Table = std::unordered_map; - friend class Frame; - friend class Record; + [[nodiscard]] const Table& table() const {return *m_table;} - explicit ParmParse (Table& a_table); - // - //! Set/Get the prefix. 
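
A sketch of the enum accessors documented above; it assumes an enum class declared with AMREX_ENUM (the type, enumerators, and key name are invented):

    // Assumed declaration at namespace scope:
    //   AMREX_ENUM(Scheme, godunov, mol, weno);
    amrex::ParmParse pp("adv");
    Scheme scheme = Scheme::godunov;
    pp.query("scheme", scheme);                        // exact match: "godunov", "mol" or "weno"
    pp.query_enum_case_insensitive("scheme", scheme);  // also accepts "Godunov", "MOL", ...
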
- [[nodiscard]] std::string getPrefix() const; - std::string setPrefix(const std::string& str); - void pushPrefix(const std::string& str); - void popPrefix(); - [[nodiscard]] std::string prefixedName (const std::string& str) const; - // - //! Prefix used in keyword search. - std::stack m_pstack; - Table* m_table; -}; + //! keyword for files to load + static std::string const FileKeyword; -struct ParmParse::PP_entry -{ - PP_entry (std::string name, const std::list& vals); - PP_entry (std::string name, const std::string& vals); - PP_entry (std::string name, const std::list& table); - PP_entry (const PP_entry& pe); - PP_entry& operator= (const PP_entry& pe); - PP_entry (PP_entry&&) = delete; - PP_entry& operator= (PP_entry&&) = delete; - ~PP_entry (); - [[nodiscard]] std::string print() const; - - std::string m_name; - std::vector m_vals; - Table* m_table; - mutable bool m_queried; -}; + static std::string ParserPrefix; +protected: -class ParmParse::Frame -{ -public: - Frame (ParmParse& pp, const std::string& pfix); - ~Frame (); - Frame (Frame const&) = default; - Frame (Frame&&) = delete; - Frame& operator= (Frame const&) = delete; - Frame& operator= (Frame &&) = delete; - void push(const std::string& str); - void pop(); - [[nodiscard]] std::string getPrefix() const; -private: - ParmParse* m_pp; - int m_np{0}; -}; + [[nodiscard]] std::string prefixedName (const std::string_view& str) const; -class ParmParse::Record -{ -public: - [[nodiscard]] const ParmParse* operator->() const; - [[nodiscard]] const ParmParse& operator* () const; -private: - friend class ParmParse; - explicit Record (const ParmParse& pp); - ParmParse m_pp; + std::string m_prefix; // Prefix used in keyword search + std::string m_parser_prefix; // Prefix used by Parser + Table* m_table; }; -std::ostream& operator<< (std::ostream& os, const ParmParse::PP_entry& pp); - } -#endif /*BL_PARMPARSE_H*/ +#endif /* AMREX_PARMPARSE_H_ */ diff --git a/Src/Base/AMReX_ParmParse.cpp b/Src/Base/AMReX_ParmParse.cpp index a2b42b64a15..df1e18e9b08 100644 --- a/Src/Base/AMReX_ParmParse.cpp +++ b/Src/Base/AMReX_ParmParse.cpp @@ -1,28 +1,23 @@ -#include #include -#include +#include #include -#include -#include +#include +#include #include +#include +#include #include #include -#include -#include -#include -#include #include -#include -#include +#include #include +#include #include -#include -#include #include -#include +#include #include -#include +#include extern "C" void amrex_init_namelist (const char*); extern "C" void amrex_finalize_namelist (); @@ -30,117 +25,55 @@ extern "C" void amrex_finalize_namelist (); namespace amrex { namespace { -#ifdef AMREX_XSDK - bool finalize_verbose = false; -#else - bool finalize_verbose = true; -#endif + bool initialized = false; + ParmParse::Table g_table; + std::vector> g_parser_recursive_symbols; + namespace pp_detail { + int verbose = -1; + } } std::string const ParmParse::FileKeyword = "FILE"; +std::string ParmParse::ParserPrefix; -// -// Used by constructor to build table. 
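
A sketch of makeParser, declared above in AMReX_ParmParse.H (the function string and the constants `a` and `b` are invented; the constants must be resolvable from the ParmParse table):

    // inputs:
    //   a = 2.0
    //   b = 0.5
    amrex::ParmParse pp;
    auto parser = pp.makeParser("a*x + b*sin(x)", {"x"});
    auto f = parser.compileHost<1>();   // host executor taking the one registered variable
    double y = f(1.5);                  // 2.0*1.5 + 0.5*sin(1.5)
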
-// -ParmParse::PP_entry::PP_entry (std::string name, const std::list& vals) - : - m_name(std::move(name)), - m_vals(vals.begin(), vals.end()), - m_table(nullptr), - m_queried(false) -{ -} - -ParmParse::PP_entry::PP_entry (std::string name, const std::string& val) - : - m_name(std::move(name)), - m_vals({val}), - m_table(nullptr), - m_queried(false) -{ -} +ParmParse::ParmParse (std::string prefix, std::string parser_prefix) + : m_prefix(std::move(prefix)), + m_parser_prefix(std::move(parser_prefix)), + m_table(&g_table) +{} -ParmParse::PP_entry::PP_entry (std::string name, const std::list& table) - : - m_name(std::move(name)), - m_table(new Table(table)), - m_queried(false) -{ -} - -ParmParse::PP_entry::PP_entry (const PP_entry& pe) - : m_name(pe.m_name), - m_vals(pe.m_vals), - m_table(nullptr), - m_queried(pe.m_queried) -{ - if ( pe.m_table ) - { - m_table = new Table(*pe.m_table); - } -} - -ParmParse::PP_entry::~PP_entry () +namespace { - delete m_table; -} -ParmParse::PP_entry& -ParmParse::PP_entry::operator= (const PP_entry& pe) +std::string pp_to_pretty_string (std::string const& name, + std::vector const& vals) { - if ( &pe == this ) { return *this; } - m_name = pe.m_name; - m_vals = pe.m_vals; - m_table = nullptr; - m_queried = pe.m_queried; - if ( pe.m_table ) - { - m_table = new Table(*pe.m_table); + std::stringstream ss; + ss << name << " ="; + for (auto const& v : vals) { + ss << " " << v; } - return *this; + return ss.str(); } -std::string -ParmParse::PP_entry::print () const { - std::stringstream t; - t << m_name << " = "; - int n = static_cast(m_vals.size()); - for ( int i = 0; i < n; i++) - { - t << m_vals[i]; - if ( i < n-1 ) { t << " "; } - } - return t.str(); -} - -std::ostream& -operator<< (std::ostream& os, const ParmParse::PP_entry& pp) +std::string pp_to_string (std::string const& name, + std::vector const& vals) { - os << pp.m_name << "(nvals = " << pp.m_vals.size() << ") " << " :: ["; - int n = static_cast(pp.m_vals.size()); - for ( int i = 0; i < n; i++ ) - { - os << pp.m_vals[i]; - if ( i < n-1 ) { os << ", "; } + std::stringstream ss; + ss << name << "(nvals = " << vals.size() << ") " << " :: ["; + for (std::size_t i = 0; i < vals.size(); ++i) { + ss << vals[i]; + if ( i < vals.size()-1 ) { ss << ", "; } } - os << "]"; - - if ( !os ) - { - amrex::Error("write on ostream failed"); - } - return os; + ss << "]"; + return ss.str(); } -namespace -{ enum PType { pDefn, - pValue, pEQ_sign, - pOpenBracket, - pCloseBracket, + pValue, pEOF }; @@ -208,12 +141,13 @@ template <> bool is (const std::string& str, bool& val) { - if ( str == "true" || str == "t" ) + auto const lo_str = amrex::toLower(str); + if ( lo_str == "true" || lo_str == "t" ) { val = true; return true; } - if ( str == "false" || str == "f" ) + if ( lo_str == "false" || lo_str == "f" ) { val = false; return true; @@ -233,8 +167,6 @@ is (const std::string& str, bool& val) return false; } -ParmParse::Table g_table; - template const char* tok_name(const T&) { return typeid(T).name(); } template const char* tok_name(std::vector&) { return tok_name(T());} @@ -348,16 +280,6 @@ getToken (const char*& str, std::string& ostr, int& num_linefeeds) ostr += ch; str++; pcnt = 1; state = LIST; } - else if ( ch == '{' ) - { - str++; - return pOpenBracket; - } - else if ( ch == '}' ) - { - str++; - return pCloseBracket; - } else if ( std::isalpha(ch) ) { ostr += ch; str++; @@ -433,107 +355,52 @@ getToken (const char*& str, std::string& ostr, int& num_linefeeds) } } - -// -// Keyword aware string comparison. 
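
A small illustration of the boolean parsing above, which now lower-cases the value before comparing (key name invented):

    // inputs:
    //   prob.do_plot = TRUE      (now equivalent to true, True, T, t)
    amrex::ParmParse pp("prob");
    bool do_plot = false;
    pp.query("do_plot", do_plot);   // do_plot == true
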
-// - - -bool -ppfound (const std::string& keyword, - const ParmParse::PP_entry& pe, - bool recordQ) -{ - return (recordQ == (pe.m_table!=nullptr)) && (keyword == pe.m_name); -} - // // Return the index of the n'th occurrence of a parameter name, // except if n==-1, return the index of the last occurrence. // Return 0 if the specified occurrence does not exist. // -const ParmParse::PP_entry* -ppindex (const ParmParse::Table& table, - int n, - const std::string& name, - bool recordQ) +std::vector const* +ppindex (const ParmParse::Table& table, int n, const std::string& name) { - const ParmParse::PP_entry* fnd = nullptr; + auto found = table.find(name); + if (found == table.cend()) { return nullptr; } - if ( n == ParmParse::LAST ) - { - // - // Search from back of list. - // - for (auto li = table.crbegin(), REnd = table.crend(); li != REnd; ++li) - { - if ( ppfound(name, *li, recordQ) ) - { - fnd = &*li; - break; - } - } - } - else - { - for (auto const& li : table) - { - if ( ppfound(name, li, recordQ) ) - { - fnd = &li; - if ( --n < 0 ) - { - break; - } - } - } - if ( n >= 0) - { - fnd = nullptr; - } - } +#ifdef AMREX_USE_OMP +#pragma omp atomic update +#endif + ++(found->second.m_count); - if ( fnd ) - { - // - // Found an entry; mark all occurrences of name as used. - // - for (const auto& li : table) - { - if ( ppfound(name, li, recordQ) ) - { - li.m_queried = true; - } - } + if (n == ParmParse::LAST) { + return &(found->second.m_vals.back()); + } else { + return &(found->second.m_vals[n]); } - return fnd; } -void -bldTable (const char*& str, std::list& tab); +void bldTable (const char*& str, ParmParse::Table& tab); -namespace { - bool isTrue(std::smatch const& sm) { - const std::string op = sm[1].str(); - const int dim = std::stoi(sm[2].str()); - if (op == "<") { - return AMREX_SPACEDIM < dim; - } else if (op == ">") { - return AMREX_SPACEDIM > dim; - } else if (op == "==") { - return AMREX_SPACEDIM == dim; - } else if (op == "<=") { - return AMREX_SPACEDIM <= dim; - } else if (op == ">=") { - return AMREX_SPACEDIM >= dim; - } else { - return false; - } +bool isTrue(std::smatch const& sm) +{ + const std::string op = sm[1].str(); + const int dim = std::stoi(sm[2].str()); + if (op == "<") { + return AMREX_SPACEDIM < dim; + } else if (op == ">") { + return AMREX_SPACEDIM > dim; + } else if (op == "==") { + return AMREX_SPACEDIM == dim; + } else if (op == "<=") { + return AMREX_SPACEDIM <= dim; + } else if (op == ">=") { + return AMREX_SPACEDIM >= dim; + } else { + return false; } } void -read_file (const char* fname, std::list& tab) +read_file (const char* fname, ParmParse::Table& tab) { // // Space for input file if it exists. @@ -629,9 +496,7 @@ read_file (const char* fname, std::list& tab) } void -addDefn (std::string& def, - std::list& val, - std::list& tab) +addDefn (std::string& def, std::vector& val, ParmParse::Table& tab) { // // Check that defn exists. @@ -662,7 +527,7 @@ addDefn (std::string& def, } else { - tab.emplace_back(def,val); + tab[def].m_vals.push_back(val); } val.clear(); if ( def != ParmParse::FileKeyword ) { @@ -671,54 +536,23 @@ addDefn (std::string& def, } void -addTable (std::string& def, - ParmParse::Table& val, - std::list& tab) -{ - if ( def.empty() ) - { - val.clear(); - return; - } - // - // Check that it has values. 
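
To make the occurrence bookkeeping in ppindex above concrete, a sketch (key name invented):

    amrex::ParmParse pp;
    pp.add("ncell", 32);            // first occurrence
    pp.add("ncell", 64);            // second occurrence of the same key
    int n = 0;
    pp.query("ncell", n);           // n == 64: query() uses the last occurrence
    pp.querykth("ncell", 0, n);     // n == 32: occurrence 0, i.e. the first one
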
- // - if ( val.empty() ) - { - amrex::ErrorStream() << "ParmParse::addTable(): no values for Table " << def << "\n"; - amrex::Abort(); - } - tab.emplace_back(def, val); - val.clear(); - def = std::string(); -} - -void -bldTable (const char*& str, - std::list& tab) +bldTable (const char*& str, ParmParse::Table& tab) { - std::string cur_name; - std::list cur_list; - ParmParse::Table cur_table; - std::string tmp_str; - std::vector cur_linefeeds; + std::string cur_name; + std::vector cur_list; + std::vector cur_linefeeds; for (;;) { std::string tokname; int num_linefeeds; - PType token = getToken(str,tokname, num_linefeeds); + PType token = getToken(str, tokname, num_linefeeds); switch (token) { - case pCloseBracket: - if ( !cur_name.empty() && cur_list.empty() ) - { - amrex::Abort("ParmParse::bldTable() defn with no list"); - } - AMREX_FALLTHROUGH; case pEOF: + { if (std::accumulate(cur_linefeeds.begin(), cur_linefeeds.end(), int(0)) > 0) { std::string error_message("ParmParse: Multiple lines in "); @@ -731,33 +565,9 @@ bldTable (const char*& str, } addDefn(cur_name,cur_list,tab); return; - case pOpenBracket: - if ( cur_name.empty() ) - { - amrex::Abort("ParmParse::bldTabe() '{' with no blocknamne"); - } - if ( !cur_list.empty() ) - { - tmp_str = cur_list.back(); - cur_list.pop_back(); - cur_linefeeds.pop_back(); - if (std::accumulate(cur_linefeeds.begin(), cur_linefeeds.end(), int(0)) > 0) - { - std::string error_message("ParmParse: Multiple lines in "); - error_message.append(cur_name).append(" ="); - for (auto const& x : cur_list) { - error_message.append(" ").append(x); - } - error_message.append(". Must use \\ for line continuation."); - amrex::Abort(error_message); - } - addDefn(cur_name, cur_list, tab); - cur_name = tmp_str; - } - bldTable(str, cur_table); - addTable(cur_name, cur_table, tab); - break; + } case pEQ_sign: + { if ( cur_name.empty() ) { amrex::Abort("ParmParse::bldTable() EQ with no current defn"); @@ -767,7 +577,7 @@ bldTable (const char*& str, // // Read one too far, remove last name on list. // - tmp_str = cur_list.back(); + auto tmp_str = cur_list.back(); cur_list.pop_back(); cur_linefeeds.pop_back(); if (std::accumulate(cur_linefeeds.begin(), cur_linefeeds.end(), int(0)) > 0) @@ -781,11 +591,13 @@ bldTable (const char*& str, amrex::Abort(error_message); } addDefn(cur_name,cur_list,tab); - cur_name = tmp_str; + cur_name = std::move(tmp_str); } cur_linefeeds.clear(); break; + } case pDefn: + { if ( cur_name.empty() ) { cur_name = tokname; @@ -795,7 +607,9 @@ bldTable (const char*& str, // Otherwise, fall through, this may be a string. // AMREX_FALLTHROUGH; + } case pValue: + { if ( cur_name.empty() ) { std::string msg("ParmParse::bldTable(): value with no defn: "); @@ -806,23 +620,28 @@ bldTable (const char*& str, cur_linefeeds.push_back(num_linefeeds); break; } + } // switch (token) } } -namespace -{ +template +bool pp_parser (const ParmParse::Table& table, const std::string& parser_prefix, + const std::string& name, const std::string& val, T& ref, + bool use_querywithparser); + template bool squeryval (const ParmParse::Table& table, - const std::string& name, - T& ref, - int ival, - int occurrence) + const std::string& parser_prefix, + const std::string& name, + T& ref, + int ival, + int occurrence) { // // Get last occurrence of name in table. 
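
Related to the FILE keyword handled by addDefn above, a sketch of pulling in an extra parameter file programmatically and dumping the merged table (file name invented; addfile requires AMReX to be initialized):

    amrex::ParmParse::addfile("inputs.overrides");          // same effect as `FILE = inputs.overrides`
    amrex::ParmParse::dumpTable(amrex::OutStream(), true);  // pretty-printed dump of the table
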
// - const ParmParse::PP_entry* def = ppindex(table, occurrence, name, false); + auto const* def = ppindex(table, occurrence, name); if ( def == nullptr ) { return false; @@ -830,10 +649,10 @@ squeryval (const ParmParse::Table& table, // // Does it have ival values? // - if ( ival >= static_cast(def->m_vals.size()) ) + if ( ival >= static_cast(def->size()) ) { amrex::ErrorStream() << "ParmParse::queryval no value number" - << ival << " for "; + << ival << " for "; if ( occurrence == ParmParse::LAST ) { amrex::ErrorStream() << "last occurrence of "; @@ -842,17 +661,30 @@ squeryval (const ParmParse::Table& table, { amrex::ErrorStream() << " occurrence " << occurrence << " of "; } - amrex::ErrorStream() << def->m_name << '\n' << *def << '\n'; + amrex::ErrorStream() << name << '\n' << pp_to_string(name,*def) << '\n'; amrex::Abort(); } - const std::string& valname = def->m_vals[ival]; + const std::string& valname = (*def)[ival]; bool ok = is(valname, ref); if ( !ok ) { + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) + { + if (pp_parser(table, parser_prefix, name, valname, ref, false)) { + return true; + } + } else { + amrex::ignore_unused(parser_prefix); + } + amrex::ErrorStream() << "ParmParse::queryval type mismatch on value number " - << ival << " of " << '\n'; + << ival << " of " << '\n'; if ( occurrence == ParmParse::LAST ) { amrex::ErrorStream() << " last occurrence of "; @@ -861,12 +693,12 @@ squeryval (const ParmParse::Table& table, { amrex::ErrorStream() << " occurrence number " << occurrence << " of "; } - amrex::ErrorStream() << def->m_name << '\n'; + amrex::ErrorStream() << name << '\n'; amrex::ErrorStream() << " Expected an \"" - << tok_name(ref) - << "\" type which can't be parsed from the string \"" - << valname << "\"\n" - << *def << '\n'; + << tok_name(ref) + << "\" type which can't be parsed from the string \"" + << valname << "\"\n" + << pp_to_string(name,*def) << '\n'; amrex::Abort(); } return true; @@ -875,25 +707,26 @@ squeryval (const ParmParse::Table& table, template void sgetval (const ParmParse::Table& table, - const std::string& name, - T& ref, - int ival, - int occurrence) + const std::string& parser_prefix, + const std::string& name, + T& ref, + int ival, + int occurrence) { - if ( squeryval(table, name,ref,ival,occurrence) == 0 ) + if ( squeryval(table, parser_prefix, name,ref,ival,occurrence) == 0 ) { amrex::ErrorStream() << "ParmParse::getval "; if ( occurrence >= 0 ) { amrex::ErrorStream() << "occurrence number " - << occurrence - << " of "; + << occurrence + << " of "; } amrex::ErrorStream() << "ParmParse::getval(): " - << name - << " not found in table" - << '\n'; + << name + << " not found in table" + << '\n'; ParmParse::dumpTable(amrex::ErrorStream()); amrex::Abort(); } @@ -902,16 +735,17 @@ sgetval (const ParmParse::Table& table, template bool squeryarr (const ParmParse::Table& table, - const std::string& name, - std::vector& ref, - int start_ix, - int num_val, - int occurrence) + const std::string& parser_prefix, + const std::string& name, + std::vector& ref, + int start_ix, + int num_val, + int occurrence) { // // Get last occurrence of name in table. 
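
A caller-side sketch of the parser fallback in squeryval above (key names invented; the symbol referenced in the expression must be resolvable from the table):

    // inputs:
    //   nx = 64
    //   my_prob.ny = nx*2
    amrex::ParmParse pp("my_prob");
    int ny = 0;
    pp.query("ny", ny);             // "nx*2" is not an int literal, so it is evaluated
                                    // with the parser: ny == 128
    pp.queryWithParser("ny", ny);   // same value; parses the whole value list as one scalar
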
// - const ParmParse::PP_entry *def = ppindex(table,occurrence, name, false); + auto const* def = ppindex(table,occurrence, name); if ( def == nullptr ) { return false; @@ -922,7 +756,7 @@ squeryarr (const ParmParse::Table& table, // if ( num_val == ParmParse::ALL ) { - num_val = static_cast(def->m_vals.size()); + num_val = static_cast(def->size()); } if ( num_val == 0 ) { return true; } @@ -932,7 +766,7 @@ squeryarr (const ParmParse::Table& table, { ref.resize(stop_ix + 1); } - if ( stop_ix >= static_cast(def->m_vals.size()) ) + if ( stop_ix >= static_cast(def->size()) ) { amrex::ErrorStream() << "ParmParse::queryarr too many values requested for"; if ( occurrence == ParmParse::LAST ) @@ -943,17 +777,30 @@ squeryarr (const ParmParse::Table& table, { amrex::ErrorStream() << " occurrence " << occurrence << " of "; } - amrex::ErrorStream() << def->m_name << '\n' << *def << '\n'; + amrex::ErrorStream() << name << '\n' << pp_to_string(name,*def) << '\n'; amrex::Abort(); } for ( int n = start_ix; n <= stop_ix; n++ ) { - const std::string& valname = def->m_vals[n]; + const std::string& valname = (*def)[n]; bool ok = is(valname, ref[n]); if ( !ok ) { + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) + { + if (pp_parser(table, parser_prefix, name, valname, ref[n], false)) { + continue; + } + } else { + amrex::ignore_unused(parser_prefix); + } + amrex::ErrorStream() << "ParmParse::queryarr type mismatch on value number " - << n << " of "; + << n << " of "; if ( occurrence == ParmParse::LAST ) { amrex::ErrorStream() << " last occurrence of "; @@ -962,12 +809,12 @@ squeryarr (const ParmParse::Table& table, { amrex::ErrorStream() << " occurrence number " << occurrence << " of "; } - amrex::ErrorStream() << def->m_name << '\n'; + amrex::ErrorStream() << name << '\n'; amrex::ErrorStream() << " Expected an \"" - << tok_name(ref) - << "\" type which can't be parsed from the string \"" - << valname << "\"\n" - << *def << '\n'; + << tok_name(ref) + << "\" type which can't be parsed from the string \"" + << valname << "\"\n" + << pp_to_string(name,*def) << '\n'; amrex::Abort(); } } @@ -976,10 +823,15 @@ squeryarr (const ParmParse::Table& table, template void -sgetarr (const ParmParse::Table& table, const std::string& name, - std::vector& ref, int start_ix, int num_val, int occurrence) +sgetarr (const ParmParse::Table& table, + const std::string& parser_prefix, + const std::string& name, + std::vector& ref, + int start_ix, + int num_val, + int occurrence) { - if ( squeryarr(table,name,ref,start_ix,num_val,occurrence) == 0 ) + if ( squeryarr(table,parser_prefix,name,ref,start_ix,num_val,occurrence) == 0 ) { amrex::ErrorStream() << "ParmParse::sgetarr "; if ( occurrence >= 0 ) @@ -987,9 +839,9 @@ sgetarr (const ParmParse::Table& table, const std::string& name, amrex::ErrorStream() << "occurrence number " << occurrence << " of "; } amrex::ErrorStream() << "ParmParse::sgetarr(): " - << name - << " not found in table" - << '\n'; + << name + << " not found in table" + << '\n'; ParmParse::dumpTable(amrex::ErrorStream()); amrex::Abort(); } @@ -1001,37 +853,30 @@ saddval (const std::string& name, const T& ref) { std::stringstream val; val << std::setprecision(17) << ref; - ParmParse::PP_entry entry(name,val.str()); - entry.m_queried=true; - g_table.push_back(entry); -} + auto& entry = g_table[name]; + entry.m_vals.emplace_back(std::vector{val.str()}); + ++entry.m_count; +} template void saddarr (const std::string& name, const std::vector& ref) { - std::list 
arr; + std::vector arr; + arr.reserve(ref.size()); for (auto const& item : ref) { std::stringstream val; val << std::setprecision(17) << item; arr.push_back(val.str()); } - ParmParse::PP_entry entry(name,arr); - entry.m_queried=true; - g_table.push_back(entry); -} + auto& entry = g_table[name]; + entry.m_vals.emplace_back(std::move(arr)); + ++entry.m_count; } -// // Initialize ParmParse. -// - -namespace { - bool initialized = false; -} - void ppinit (int argc, char** argv, const char* parfile, ParmParse::Table& table) { @@ -1049,202 +894,163 @@ ppinit (int argc, char** argv, const char* parfile, ParmParse::Table& table) argstr += argv[i]; argstr += SPACE; } - std::list arg_table; + ParmParse::Table arg_table; const char* b = argstr.c_str(); bldTable(b, arg_table); // // Append arg_table to end of existing table. // - table.splice(table.end(), arg_table); + for (auto& [name, arg_entry] : arg_table) { + auto& src = arg_entry.m_vals; + auto& dst = table[name].m_vals; + std::move(std::begin(src), std::end(src), std::back_inserter(dst)); + } } initialized = true; } -} // End of unnamed namespace. - -std::string -ParmParse::prefixedName (const std::string& str) const +bool unused_table_entries_q (const ParmParse::Table& table, + const std::string& prefix = std::string()) { - if ( str.empty() ) - { - amrex::Error("ParmParse::prefixedName: has empty name"); - } - if ( !m_pstack.top().empty()) - { - return m_pstack.top() + '.' + str; + if (prefix.empty()) { + return std::any_of(table.begin(), table.end(), + [] (auto const& x) -> bool { + return x.second.m_count == 0; + }); + } else { + auto s = prefix + '.'; + return std::any_of(table.begin(), table.end(), + [&] (auto const& x) -> bool { + return x.second.m_count == 0 + && x.first.substr(0,s.size()) == s; + }); } - return str; } -void -ParmParse::addfile (std::string const& filename) { -#ifdef AMREX_USE_MPI - if (ParallelDescriptor::Communicator() == MPI_COMM_NULL) - { - throw std::runtime_error("ParmParse::addfile: AMReX must be initialized"); +void pp_print_unused (const std::string& pfx, const ParmParse::Table& table) +{ + std::vector sorted_names; + sorted_names.reserve(table.size()); + for (auto const& [name, entry] : table) { + if (entry.m_count == 0) { + sorted_names.push_back(name); + } } -#endif - - auto l = std::list{filename}; - auto file = FileKeyword; - addDefn(file, - l, - g_table); -} + std::sort(sorted_names.begin(), sorted_names.end()); -void -ParmParse::pushPrefix (const std::string& str) -{ - std::string s(str); - if ( !s.empty() ) - { - if ( !m_pstack.top().empty() ) - { - s = m_pstack.top() + "." + s; + for (auto const& name : sorted_names) { + auto const& entry = table.at(name); + for (auto const& vals : entry.m_vals) { + amrex::AllPrint() << pfx << "::" << pp_to_string(name, vals) << '\n'; } - m_pstack.push(s); } } -void -ParmParse::popPrefix () -{ - if ( m_pstack.size() <= 1 ) - { - amrex::Error("ParmParse::popPrefix: stack underflow"); +template +bool squeryWithParser (const ParmParse::Table& table, + const std::string& parser_prefix, + const std::string& name, + T& ref); + +template , + IParser, Parser>> +PARSER_t +pp_make_parser (std::string const& func, Vector const& vars, + ParmParse::Table const& table, std::string const& parser_prefix, + bool use_querywithparser) +{ + using value_t = std::conditional_t, long long, double>; + + std::vector prefixes; + prefixes.reserve(3); + prefixes.emplace_back(); + if (! parser_prefix.empty()) { + prefixes.emplace_back(parser_prefix+"."); + } + if (! 
ParmParse::ParserPrefix.empty()) { + prefixes.emplace_back(ParmParse::ParserPrefix+"."); } - m_pstack.pop(); -} -std::string -ParmParse::getPrefix() const -{ - return m_pstack.top(); -} + PARSER_t parser(func); -ParmParse::ParmParse (const std::string& prefix) - : - m_table(&g_table) -{ - m_pstack.push(prefix); -} + auto symbols = parser.symbols(); + for (auto const& var : vars) { + symbols.erase(var); + } -ParmParse::ParmParse (Table& a_table) - : m_table(&a_table) -{ - m_pstack.emplace(""); -} + for (auto const& s : symbols) { + value_t v = 0; + bool r = false; + for (auto const& pf : prefixes) { + if (use_querywithparser) { + r = squeryWithParser(table, parser_prefix, pf+s, v); + } else { + r = squeryval(table, parser_prefix, pf+s, v, + ParmParse::FIRST, ParmParse::LAST); + } + if (r) { break; } + } + if (r == false) { + amrex::Error("ParmParse: failed to parse " + func); + } + parser.setConstant(s, v); + } + if (!vars.empty()) { + parser.registerVariables(vars); + } -ParmParse::Frame::Frame (ParmParse& pp, const std::string& pfix) - : - m_pp(&pp) -{ - push(pfix); - BL_ASSERT( m_np == 1 ); + return parser; } -ParmParse::Frame::~Frame () +template +bool pp_parser (const ParmParse::Table& table, const std::string& parser_prefix, + const std::string& name, const std::string& val, T& ref, + bool use_querywithparser) { - BL_ASSERT( m_np > 0 ); - while ( m_np ) - { - pop(); + auto& recursive_symbols = g_parser_recursive_symbols[OpenMP::get_thread_num()]; + if (auto found = recursive_symbols.find(name); found != recursive_symbols.end()) { + amrex::Error("ParmParse: recursive reference to "+name+" is not allowed"); + return false; + } else { + recursive_symbols.insert(name); } - BL_ASSERT( m_np == 0 ); -} -void -ParmParse::Frame::push (const std::string& str) -{ - m_pp->pushPrefix(str); - m_np++; -} + auto parser = pp_make_parser(val, {}, table, parser_prefix, use_querywithparser); + auto exe = parser.template compileHost<0>(); + ref = static_cast(exe()); -void -ParmParse::Frame::pop () -{ - BL_ASSERT( m_np > 0); - m_pp->popPrefix(); - m_np--; + recursive_symbols.erase(name); + return true; } -std::string -ParmParse::Frame::getPrefix () const -{ - return m_pp->getPrefix(); -} +} // End of unnamed namespace. -void -ParmParse::appendTable(ParmParse::Table& tab) +std::string +ParmParse::prefixedName (const std::string_view& str) const { - g_table.splice(g_table.end(), tab); -} + AMREX_ASSERT( ! 
str.empty() ); -namespace { - -bool -unused_table_entries_q (const ParmParse::Table& table, const std::string& prefix = std::string()) -{ - for (auto const& li : table) // NOLINT(readability-use-anyofallof) - { - if ( li.m_table ) - { - if ( !li.m_queried ) - { - if (prefix.empty()) { - return true; - } else { - if (li.m_name.substr(0,prefix.size()+1) == prefix+".") { - return true; - } - } - } - else - { - if (unused_table_entries_q(*li.m_table, prefix)) { return true; } - } - } - else if ( !li.m_queried ) - { - if (prefix.empty()) { - return true; - } else { - if (li.m_name.substr(0,prefix.size()+1) == prefix+".") { - return true; - } - } - } + if (m_prefix.empty()) { + return std::string(str); + } else { + std::string r = m_prefix + '.'; + r.append(str); + return r; } - return false; } void -finalize_table (const std::string& pfx, const ParmParse::Table& table) -{ - for (auto const& li : table) +ParmParse::addfile (std::string const& filename) { +#ifdef AMREX_USE_MPI + if (ParallelDescriptor::Communicator() == MPI_COMM_NULL) { - if ( li.m_table ) - { - if ( !li.m_queried ) - { - if (finalize_verbose) { - amrex::AllPrint() << "Record " << li.m_name << '\n'; - } - } - else - { - finalize_table(pfx + "::" + li.m_name, *li.m_table); - } - } - else if ( !li.m_queried ) - { - if (finalize_verbose) { - amrex::AllPrint() << pfx << "::" << li << '\n'; - } - } + throw std::runtime_error("ParmParse::addfile: AMReX must be initialized"); } -} +#endif + auto file = FileKeyword; + std::vector val{{filename}}; + addDefn(file, val, g_table); } void @@ -1257,6 +1063,8 @@ ParmParse::Initialize (int argc, amrex::Error("ParmParse::Initialize(): already initialized!"); } + g_parser_recursive_symbols.resize(OpenMP::get_max_threads()); + ppinit(argc, argv, parfile, g_table); amrex::ExecOnFinalize(ParmParse::Finalize); @@ -1267,10 +1075,11 @@ ParmParse::QueryUnusedInputs () { if ( ParallelDescriptor::IOProcessor() && unused_table_entries_q(g_table)) { - finalize_verbose = amrex::system::verbose; - if (finalize_verbose) { amrex::OutStream() << "Unused ParmParse Variables:\n"; } - finalize_table(" [TOP]", g_table); - if (finalize_verbose) { amrex::OutStream() << '\n'; } + if (ParmParse::Verbose()) { + amrex::OutStream() << "Unused ParmParse Variables:\n"; + pp_print_unused(" [TOP]", g_table); + amrex::OutStream() << '\n'; + } return true; } return false; @@ -1282,51 +1091,66 @@ ParmParse::hasUnusedInputs (const std::string& prefix) return unused_table_entries_q(g_table, prefix); } -namespace { -void -get_entries_under_prefix (std::vector& found_entries, - const ParmParse::Table& table, - const std::string& prefix, - const bool only_unused = false, - const bool add_values = false) +std::vector +ParmParse::getUnusedInputs (const std::string& prefix) { + std::vector sorted_names; const std::string prefixdot = prefix.empty() ? std::string() : prefix+"."; - for (auto const& entry : table) { - if ((! only_unused) || (only_unused && ! 
entry.m_queried)) { - if (entry.m_name.substr(0,prefixdot.size()) == prefixdot) { - std::string tmp(entry.m_name); - if (add_values) { - tmp.append(" ="); - for (auto const& v : entry.m_vals) { - tmp += " " + v; - } - } - found_entries.emplace_back(std::move(tmp)); - } + for (auto const& [name, entry] : g_table) { + if (entry.m_count == 0 && + name.substr(0,prefixdot.size()) == prefixdot) + { + sorted_names.push_back(name); } + } + std::sort(sorted_names.begin(), sorted_names.end()); - if (entry.m_table) { - get_entries_under_prefix(found_entries, table, prefix, - only_unused, add_values); + std::vector r; + for (auto const& name : sorted_names) { + auto const& entry = g_table[name]; + for (auto const& vals : entry.m_vals) { + std::string tmp(name); + tmp.append(" ="); + for (auto const& v : vals) { + tmp += " " + v; + } + r.emplace_back(std::move(tmp)); } } -} -} -std::vector -ParmParse::getUnusedInputs (const std::string& prefix) -{ - std::vector r; - get_entries_under_prefix(r, g_table, prefix, true, true); return r; } std::set ParmParse::getEntries (const std::string& prefix) { - std::vector r; - get_entries_under_prefix(r, g_table, prefix, false, false); - return std::set(r.begin(), r.end()); + std::set r; + const std::string prefixdot = prefix.empty() ? std::string() : prefix+"."; + for (auto const& [name, entry] : g_table) { + if (name.substr(0,prefixdot.size()) == prefixdot) { + r.insert(name); + } + } + return r; +} + +int +ParmParse::Verbose () +{ + if (pp_detail::verbose < 0) { + pp_detail::verbose = std::max(amrex::Verbose(),0); + ParmParse pp("amrex.parmparse"); + if (! pp.query("verbose", "v", pp_detail::verbose)) { + pp.add("verbose", pp_detail::verbose); + } + } + return pp_detail::verbose; +} + +void +ParmParse::SetVerbose (int v) +{ + pp_detail::verbose = v; } void @@ -1334,13 +1158,11 @@ ParmParse::Finalize () { if ( ParallelDescriptor::IOProcessor() && unused_table_entries_q(g_table)) { - finalize_verbose = amrex::system::verbose; - if (finalize_verbose) { amrex::OutStream() << "Unused ParmParse Variables:\n"; } - finalize_table(" [TOP]", g_table); - if (finalize_verbose) { amrex::OutStream() << '\n'; } - // - // First loop through and delete all queried entries. 
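// Illustrative sketch (not part of this patch): with the reworked verbosity
// control above, the unused-inputs report can be silenced from the inputs file
// (amrex.parmparse.verbose = 0) or from code, and unused entries can still be
// inspected explicitly. The prefix "prob" below is hypothetical.
#include <AMReX_ParmParse.H>
#include <AMReX_Print.H>
void report_unused_prob_inputs_example ()
{
    amrex::ParmParse::SetVerbose(0);   // suppress the automatic report at Finalize
    if (amrex::ParmParse::hasUnusedInputs("prob")) {
        for (auto const& s : amrex::ParmParse::getUnusedInputs("prob")) {
            amrex::Print() << "unused: " << s << '\n';
        }
    }
}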
- // + if (ParmParse::Verbose()) { + amrex::OutStream() << "Unused ParmParse Variables:\n"; + pp_print_unused(" [TOP]", g_table); + amrex::OutStream() << '\n'; + } if (amrex::system::abort_on_unused_inputs) { amrex::Abort("ERROR: unused ParmParse variables."); } @@ -1351,19 +1173,65 @@ ParmParse::Finalize () amrex_finalize_namelist(); #endif + g_parser_recursive_symbols.clear(); + + pp_detail::verbose = -1; initialized = false; } +void +ParmParse::SetParserPrefix (std::string a_prefix) +{ + ParmParse::ParserPrefix = std::move(a_prefix); +} + void ParmParse::dumpTable (std::ostream& os, bool prettyPrint) { - for (auto const& li : g_table) - { - if(prettyPrint && li.m_queried) { - os << li.print() << '\n'; + std::vector sorted_names; + sorted_names.reserve(g_table.size()); + for (auto const& [name, entry] : g_table) { + sorted_names.push_back(name); + } + std::sort(sorted_names.begin(), sorted_names.end()); + + for (auto const& name : sorted_names) { + auto const& entry = g_table[name]; + if (prettyPrint && entry.m_count > 0) { + for (auto const& vals : entry.m_vals) { + os << pp_to_pretty_string(name, vals) << '\n'; + } } else { - os << li << '\n'; + for (auto const& vals : entry.m_vals) { + os << pp_to_string(name, vals) << '\n'; + } + } + } +} + +void +ParmParse::prettyPrintTable (std::ostream& os) +{ + std::vector sorted_names; + sorted_names.reserve(g_table.size()); + for (auto const& [name, entry] : g_table) { + sorted_names.push_back(name); + } + std::sort(sorted_names.begin(), sorted_names.end()); + + for (auto const& name : sorted_names) { + auto const& entry = g_table[name]; + std::vector value_string; + std::unordered_map count; + for (auto const& vals : entry.m_vals) { + value_string.emplace_back(pp_to_pretty_string(name, vals)); + ++count[value_string.back()]; + } + for (auto const& s : value_string) { + if (--count[s] == 0) { + os << s << '\n'; + } } } } @@ -1375,8 +1243,8 @@ ParmParse::countval (const char* name, // // First find n'th occurrence of name in table. // - const PP_entry* def = ppindex(*m_table, n, prefixedName(name), false); - return def == nullptr ? 0 : static_cast(def->m_vals.size()); + auto const* def = ppindex(*m_table, n, prefixedName(name)); + return def == nullptr ? 
0 : static_cast(def->size()); } // BOOL @@ -1386,7 +1254,7 @@ ParmParse::getkth (const char* name, bool& ref, int ival) const { - sgetval(*m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } void @@ -1394,7 +1262,7 @@ ParmParse::get (const char* name, bool& ref, int ival) const { - sgetval(*m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } int @@ -1403,7 +1271,7 @@ ParmParse::querykth (const char* name, bool& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } int @@ -1411,7 +1279,7 @@ ParmParse::query (const char* name, bool& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } void @@ -1425,25 +1293,25 @@ ParmParse::add (const char* name, void ParmParse::getkth (const char* name, int k, int& ref, int ival) const { - sgetval(*m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, int& ref, int ival) const { - sgetval(*m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, int& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, int& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } void @@ -1456,28 +1324,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); + return squeryarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1491,25 +1359,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, long& ref, int ival) const { - sgetval(*m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, long& ref, int ival) const { - sgetval(*m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, long& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival,k); + return 
squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, long& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } void @@ -1523,28 +1391,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); + return squeryarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1557,25 +1425,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, long long& ref, int ival) const { - sgetval(*m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, long long& ref, int ival) const { - sgetval(*m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, long long& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, long long& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } void @@ -1588,28 +1456,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); + return squeryarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1622,25 +1490,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, float& ref, int ival) const { - sgetval(*m_table, 
prefixedName(name),ref,ival,k); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, float& ref, int ival) const { - sgetval(*m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, float& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, float& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } void @@ -1653,28 +1521,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(*m_table, prefixedName(name),ref,start_ix, num_val,k); + return squeryarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix, num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1689,25 +1557,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, double& ref, int ival) const { - sgetval(*m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, double& ref, int ival) const { - sgetval(*m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, double& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, double& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } void @@ -1720,28 +1588,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(*m_table, prefixedName(name),ref,start_ix, num_val,k); + return squeryarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix, num_val,k); } int 
ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1756,25 +1624,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, std::string& ref, int ival) const { - sgetval(*m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, std::string& ref, int ival) const { - sgetval(*m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, std::string& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, std::string& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } void @@ -1787,28 +1655,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(*m_table, prefixedName(name),ref,start_ix, num_val,k); + return squeryarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix, num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1823,25 +1691,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, IntVect& ref, int ival) const { - sgetval(*m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, IntVect& ref, int ival) const { - sgetval(*m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, IntVect& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, IntVect& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } void @@ -1854,28 +1722,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, 
std::vector& ref, int start_ix, int num_val) const { - sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(*m_table, prefixedName(name),ref,start_ix, num_val,k); + return squeryarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix, num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1888,25 +1756,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, Box& ref, int ival) const { - sgetval(*m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, Box& ref, int ival) const { - sgetval(*m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, Box& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, Box& ref, int ival) const { - return squeryval(*m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table,m_parser_prefix, prefixedName(name),ref,ival, LAST); } void @@ -1919,28 +1787,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(*m_table, prefixedName(name),ref,start_ix, num_val,k); + return squeryarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix, num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table,m_parser_prefix, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1950,36 +1818,62 @@ ParmParse::addarr (const char* name, const std::vector& ref) } -// -// Return number of occurrences of parameter name. 
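// Illustrative sketch (not part of this patch): because the getters above now
// carry a parser prefix, a numeric entry may be written as an arithmetic
// expression referencing other entries; on a type mismatch the string should be
// handed to the expression parser (as in squeryarr above). Hypothetical inputs:
//
//   geom.n_cell = 128
//   geom.dx     = 1.0/geom.n_cell
//
#include <AMReX_ParmParse.H>
void read_dx_example ()
{
    amrex::ParmParse pp("geom");
    double dx = 0.0;
    pp.query("dx", dx);             // "1.0/geom.n_cell" is evaluated via the parser
    pp.queryWithParser("dx", dx);   // the explicit parser interface added below
}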
-// +int +ParmParse::queryarr (const char* name, IntVect& ref) const +{ + std::vector v; + int exist = this->queryarr(name, v); + if (exist) { + AMREX_ALWAYS_ASSERT(v.size() == AMREX_SPACEDIM); + for (int i = 0; i < AMREX_SPACEDIM; ++i) { ref[i] = v[i]; } + } + return exist; +} + +void +ParmParse::getarr (const char* name, IntVect& ref) const +{ + std::vector v; + this->getarr(name, v); + AMREX_ALWAYS_ASSERT(v.size() == AMREX_SPACEDIM); + for (int i = 0; i < AMREX_SPACEDIM; ++i) { ref[i] = v[i]; } +} int -ParmParse::countname (const std::string& name) const +ParmParse::queryarr (const char* name, RealVect& ref) const { - int cnt = 0; - for (auto const& li : *m_table) - { - if ( ppfound(prefixedName(name), li, false) ) - { - cnt++; - } + std::vector v; + int exist = this->queryarr(name, v); + if (exist) { + AMREX_ALWAYS_ASSERT(v.size() == AMREX_SPACEDIM); + for (int i = 0; i < AMREX_SPACEDIM; ++i) { ref[i] = v[i]; } } - return cnt; + return exist; +} + +void +ParmParse::getarr (const char* name, RealVect& ref) const +{ + std::vector v; + this->getarr(name, v); + AMREX_ALWAYS_ASSERT(v.size() == AMREX_SPACEDIM); + for (int i = 0; i < AMREX_SPACEDIM; ++i) { ref[i] = v[i]; } } +// +// Return number of occurrences of parameter name. +// + int -ParmParse::countRecords (const std::string& name) const +ParmParse::countname (const std::string& name) const { - int cnt = 0; - for (auto const& li : *m_table) - { - if ( ppfound(prefixedName(name), li, true) ) - { - cnt++; - } + auto pname = prefixedName(name); + auto found = m_table->find(pname); + if (found != m_table->cend()) { + return static_cast(found->second.m_vals.size()); + } else { + return 0; } - return cnt; } // @@ -1989,74 +1883,139 @@ ParmParse::countRecords (const std::string& name) const bool ParmParse::contains (const char* name) const { - for (auto const& li : *m_table) - { - if ( ppfound(prefixedName(name), li, false)) - { - // - // Found an entry; mark all occurrences of name as used. 
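// Illustrative sketch (not part of this patch): the IntVect/RealVect overloads
// added above read exactly AMREX_SPACEDIM values and abort otherwise. The
// prefix and entry names are hypothetical.
#include <AMReX_ParmParse.H>
#include <AMReX_IntVect.H>
#include <AMReX_RealVect.H>
void read_vector_inputs_example ()
{
    amrex::ParmParse pp("geom");
    amrex::IntVect  n_cell;            // e.g. geom.n_cell  = 64 64 64
    amrex::RealVect prob_lo;           // e.g. geom.prob_lo = 0. 0. 0.
    pp.getarr("n_cell", n_cell);       // required entry: aborts if absent
    pp.queryarr("prob_lo", prob_lo);   // optional entry: returns 0 if absent
}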
- // - for (auto& lli : *m_table) - { - if ( ppfound(prefixedName(name), lli, false) ) - { - lli.m_queried = true; - } - } - return true; - } + auto pname = prefixedName(name); + auto found = m_table->find(pname); + if (found != m_table->cend()) { +#ifdef AMREX_USE_OMP +#pragma omp atomic update +#endif + ++(found->second.m_count); + return true; + } else { + return false; } - return false; } int ParmParse::remove (const char* name) { - int r = 0; - for (auto it = m_table->begin(); it != m_table->end(); ) { - if (ppfound(prefixedName(name), *it, false)) { - it = m_table->erase(it); - ++r; - } else { - ++it; - } + auto const pname = prefixedName(name); + auto n = m_table->erase(pname); + return static_cast(n); +} + +namespace { +template +bool squeryWithParser (const ParmParse::Table& table, + const std::string& parser_prefix, + const std::string& name, + T& ref) +{ + std::vector vals; + bool exist = squeryarr(table, parser_prefix, name, vals, + ParmParse::FIRST, ParmParse::ALL, ParmParse::LAST); + if (!exist) { return false; } + + std::string combined_string; + for (auto const& v : vals) { + combined_string.append(v); } - return r; + return pp_parser(table, parser_prefix, name, combined_string, ref, true); } -ParmParse::Record -ParmParse::getRecord (const std::string& name, int n) const -{ - const PP_entry* pe = ppindex(*m_table, n, prefixedName(name), true); - if ( pe == nullptr ) - { - amrex::ErrorStream() << "ParmParse::getRecord: record " << name << " not found" << '\n'; - amrex::Abort(); - return Record(ParmParse()); - } else { - return Record(ParmParse(*pe->m_table)); +template +bool squeryarrWithParser (const ParmParse::Table& table, + const std::string& parser_prefix, + const std::string& name, + int nvals, + T* ref) +{ + std::vector vals; + bool exist = squeryarr(table, parser_prefix, name, vals, + ParmParse::FIRST, ParmParse::ALL, ParmParse::LAST); + if (!exist) { return false; } + + AMREX_ALWAYS_ASSERT(int(vals.size()) == nvals); + for (int ival = 0; ival < nvals; ++ival) { + bool r = pp_parser(table, parser_prefix, name, vals[ival], ref[ival], true); + if (!r) { return false; } } + return true; +} } -// -// -// +int +ParmParse::queryWithParser (const char* name, int& ref) const +{ + return squeryWithParser(*m_table,m_parser_prefix,prefixedName(name),ref); +} + +int +ParmParse::queryWithParser (const char* name, long& ref) const +{ + return squeryWithParser(*m_table,m_parser_prefix,prefixedName(name),ref); +} + +int +ParmParse::queryWithParser (const char* name, long long& ref) const +{ + return squeryWithParser(*m_table,m_parser_prefix,prefixedName(name),ref); +} -ParmParse::Record::Record ( const ParmParse& pp ) - : m_pp(pp) +int +ParmParse::queryWithParser (const char* name, float& ref) const +{ + return squeryWithParser(*m_table,m_parser_prefix,prefixedName(name),ref); +} + +int +ParmParse::queryWithParser (const char* name, double& ref) const +{ + return squeryWithParser(*m_table,m_parser_prefix,prefixedName(name),ref); +} + +int +ParmParse::queryarrWithParser (const char* name, int nvals, int* ref) const +{ + return squeryarrWithParser(*m_table,m_parser_prefix,prefixedName(name),nvals,ref); +} + +int +ParmParse::queryarrWithParser (const char* name, int nvals, long* ref) const +{ + return squeryarrWithParser(*m_table,m_parser_prefix,prefixedName(name),nvals,ref); +} + +int +ParmParse::queryarrWithParser (const char* name, int nvals, long long* ref) const +{ + return squeryarrWithParser(*m_table,m_parser_prefix,prefixedName(name),nvals,ref); +} + +int 
+ParmParse::queryarrWithParser (const char* name, int nvals, float* ref) const +{ + return squeryarrWithParser(*m_table,m_parser_prefix,prefixedName(name),nvals,ref); +} + +int +ParmParse::queryarrWithParser (const char* name, int nvals, double* ref) const { + return squeryarrWithParser(*m_table,m_parser_prefix,prefixedName(name),nvals,ref); } -const ParmParse* -ParmParse::Record::operator-> () const +Parser +ParmParse::makeParser (std::string const& func, + Vector const& vars) const { - return &m_pp; + return pp_make_parser(func, vars, *m_table, m_parser_prefix, true); } -const ParmParse& -ParmParse::Record::operator* () const +IParser +ParmParse::makeIParser (std::string const& func, + Vector const& vars) const { - return m_pp; + return pp_make_parser(func, vars, *m_table, m_parser_prefix, true); } } diff --git a/Src/Base/AMReX_Periodicity.H b/Src/Base/AMReX_Periodicity.H index 959e1bb34af..d16e175a4fb 100644 --- a/Src/Base/AMReX_Periodicity.H +++ b/Src/Base/AMReX_Periodicity.H @@ -32,7 +32,7 @@ public: //! Cell-centered domain Box "infinitely" long in non-periodic directions. [[nodiscard]] Box Domain () const noexcept; - [[nodiscard]] std::vector shiftIntVect () const; + [[nodiscard]] std::vector shiftIntVect (IntVect const& nghost = IntVect(0)) const; static const Periodicity& NonPeriodic () noexcept; diff --git a/Src/Base/AMReX_Periodicity.cpp b/Src/Base/AMReX_Periodicity.cpp index 20dfba1c326..797dfb79ab3 100644 --- a/Src/Base/AMReX_Periodicity.cpp +++ b/Src/Base/AMReX_Periodicity.cpp @@ -5,7 +5,7 @@ namespace amrex { std::vector -Periodicity::shiftIntVect () const +Periodicity::shiftIntVect (IntVect const& nghost) const { std::vector r; @@ -15,6 +15,9 @@ Periodicity::shiftIntVect () const for (int i = 0; i < AMREX_SPACEDIM; ++i) { if (isPeriodic(i)) { per[i] = jmp[i] = period[i]; + while (per[i] < nghost[i]) { + per[i] += period[i]; + } } } diff --git a/Src/Base/AMReX_PlotFileDataImpl.cpp b/Src/Base/AMReX_PlotFileDataImpl.cpp index b85c17ad93c..28cc38e27f4 100644 --- a/Src/Base/AMReX_PlotFileDataImpl.cpp +++ b/Src/Base/AMReX_PlotFileDataImpl.cpp @@ -31,7 +31,7 @@ PlotFileDataImpl::PlotFileDataImpl (std::string const& plotfile_name) for (int i = 0; i < m_ncomp; ++i) { std::string tmp; std::getline(is, tmp); - m_var_names[i] = amrex::trim(tmp); + m_var_names[i] = amrex::trim(std::move(tmp)); } is >> m_spacedim >> m_time >> m_finest_level; diff --git a/Src/Base/AMReX_RKIntegrator.H b/Src/Base/AMReX_RKIntegrator.H index f72890c808e..a6efd028532 100644 --- a/Src/Base/AMReX_RKIntegrator.H +++ b/Src/Base/AMReX_RKIntegrator.H @@ -217,9 +217,6 @@ public: BaseT::post_stage_action(S_new, stage_time); } - // Call the update hook for the stage state value - BaseT::pre_rhs_action(S_new, stage_time); - // Fill F[i], the RHS at the current stage // F[i] = RHS(y, t) at y = stage_value, t = stage_time BaseT::Rhs(*F_nodes[i], S_new, stage_time); diff --git a/Src/Base/AMReX_Random.cpp b/Src/Base/AMReX_Random.cpp index a0b72155c50..891a69e140a 100644 --- a/Src/Base/AMReX_Random.cpp +++ b/Src/Base/AMReX_Random.cpp @@ -187,15 +187,14 @@ UniqueRandomSubset (Vector &uSet, int setSize, int poolSize, Abort("**** Error in UniqueRandomSubset: setSize > poolSize."); } std::set copySet; - Vector uSetTemp; + uSet.clear(); while(static_cast(copySet.size()) < setSize) { int r = static_cast(Random_int(poolSize)); if(copySet.find(r) == copySet.end()) { copySet.insert(r); - uSetTemp.push_back(r); + uSet.push_back(r); } } - uSet = uSetTemp; if(printSet) { for(int i(0); i < uSet.size(); ++i) { AllPrint() << "uSet[" << i 
<< "] = " << uSet[i] << '\n'; diff --git a/Src/Base/AMReX_String.H b/Src/Base/AMReX_String.H new file mode 100644 index 00000000000..147b7ab1870 --- /dev/null +++ b/Src/Base/AMReX_String.H @@ -0,0 +1,30 @@ +#ifndef AMREX_STRING_H_ +#define AMREX_STRING_H_ +#include + +#include +#include + +namespace amrex { + + //! Converts all characters of the string into lower case based on std::locale + std::string toLower (std::string s); + + //! Converts all characters of the string into uppercase based on std::locale + std::string toUpper (std::string s); + + //! Trim leading and trailing characters in the optional `space` + //! argument. + std::string trim (std::string s, std::string const& space = " \t"); + + //! Returns rootNNNN where NNNN == num. + std::string Concatenate (const std::string& root, + int num, + int mindigits = 5); + + //! Split a string using given tokens in `sep`. + std::vector split (std::string const& s, + std::string const& sep = " \t"); +} + +#endif diff --git a/Src/Base/AMReX_String.cpp b/Src/Base/AMReX_String.cpp new file mode 100644 index 00000000000..24dbce4532f --- /dev/null +++ b/Src/Base/AMReX_String.cpp @@ -0,0 +1,54 @@ +#include +#include + +#include +#include +#include +#include + +namespace amrex { + +std::string toLower (std::string s) +{ + std::transform(s.begin(), s.end(), s.begin(), + [](unsigned char c) { return std::tolower(c); }); + return s; +} + +std::string toUpper (std::string s) +{ + std::transform(s.begin(), s.end(), s.begin(), + [](unsigned char c) { return std::toupper(c); }); + return s; +} + +std::string trim(std::string s, std::string const& space) +{ + const auto sbegin = s.find_first_not_of(space); + if (sbegin == std::string::npos) { return std::string{}; } + const auto send = s.find_last_not_of(space); + s = s.substr(sbegin, send-sbegin+1); + return s; +} + +std::string Concatenate (const std::string& root, int num, int mindigits) +{ + BL_ASSERT(mindigits >= 0); + std::stringstream result; + result << root << std::setfill('0') << std::setw(mindigits) << num; + return result.str(); +} + +std::vector split (std::string const& s, std::string const& sep) +{ + std::vector result; + std::size_t pos_begin, pos_end = 0; + while ((pos_begin = s.find_first_not_of(sep,pos_end)) != std::string::npos) { + pos_end = s.find_first_of(sep,pos_begin); + result.push_back(s.substr(pos_begin,pos_end-pos_begin)); + if (pos_end == std::string::npos) { break; } + } + return result; +} + +} diff --git a/Src/Base/AMReX_TimeIntegrator.H b/Src/Base/AMReX_TimeIntegrator.H index 10443361533..316ad0ff318 100644 --- a/Src/Base/AMReX_TimeIntegrator.H +++ b/Src/Base/AMReX_TimeIntegrator.H @@ -65,17 +65,14 @@ private: void set_default_functions () { // By default, do nothing in the RHS - set_rhs([](T& /* S_rhs */, const T& /* S_data */, const amrex::Real /* time */){}); - set_imex_rhs([](T& /* S_rhs */, const T& /* S_data */, const amrex::Real /* time */){}, - [](T& /* S_rhs */, const T& /* S_data */, const amrex::Real /* time */){}); - set_fast_rhs([](T& /* S_rhs */, const T& /* S_data */, const amrex::Real /* time */){}); + set_rhs([](T& /* S_rhs */, T& /* S_data */, const amrex::Real /* time */){}); + set_imex_rhs([](T& /* S_rhs */, T& /* S_data */, const amrex::Real /* time */){}, + [](T& /* S_rhs */, T& /* S_data */, const amrex::Real /* time */){}); + set_fast_rhs([](T& /* S_rhs */, T& /* S_data */, const amrex::Real /* time */){}); // In general, the following functions can be used to fill BCs. 
Which // function to set will depend on the method type and intended use case - // By default, do nothing before calling the RHS - set_pre_rhs_action([](T& /* S_data */, amrex::Real /* time */){}); - // By default, do nothing after a stage or step set_post_stage_action([](T& /* S_data */, const amrex::Real /* time */){}); set_post_step_action([](T& /* S_data */, const amrex::Real /* time */){}); @@ -134,27 +131,22 @@ public: } } - void set_rhs (std::function F) + void set_rhs (std::function F) { integrator_ptr->set_rhs(F); } - void set_imex_rhs (std::function Fi, - std::function Fe) + void set_imex_rhs (std::function Fi, + std::function Fe) { integrator_ptr->set_imex_rhs(Fi, Fe); } - void set_fast_rhs (std::function F) + void set_fast_rhs (std::function F) { integrator_ptr->set_fast_rhs(F); } - void set_pre_rhs_action (std::function A) - { - integrator_ptr->set_pre_rhs_action(A); - } - void set_post_stage_action (std::function A) { integrator_ptr->set_post_stage_action(A); @@ -175,11 +167,6 @@ public: integrator_ptr->set_post_fast_step_action(A); } - void set_post_update (std::function A) - { - integrator_ptr->set_post_update(A); - } - amrex::Real get_time_step () { return integrator_ptr->get_time_step(); diff --git a/Src/Base/AMReX_TinyProfiler.H b/Src/Base/AMReX_TinyProfiler.H index 4ddbb05ea7e..0228949beb1 100644 --- a/Src/Base/AMReX_TinyProfiler.H +++ b/Src/Base/AMReX_TinyProfiler.H @@ -5,14 +5,6 @@ #include #include -#ifdef AMREX_USE_CUDA -#include -#endif - -#if defined(AMREX_USE_HIP) && defined(AMREX_USE_ROCTX) -#include -#endif - #include #include #include @@ -143,15 +135,19 @@ private: static std::deque > ttstack; static std::map > statsmap; static double t_init; - static int device_synchronize_around_region; + static bool device_synchronize_around_region; static int n_print_tabs; static int verbose; static double print_threshold; + static bool enabled; + static bool memprof_enabled; + static std::string output_file; - static void PrintStats (std::map& regstats, double dt_max); + static void PrintStats (std::map& regstats, double dt_max, + std::ostream* os); static void PrintMemStats (std::map& memstats, std::string const& memname, double dt_max, - double t_final); + double t_final, std::ostream* os); }; class TinyProfileRegion diff --git a/Src/Base/AMReX_TinyProfiler.cpp b/Src/Base/AMReX_TinyProfiler.cpp index 1cffe78e75a..db922745784 100644 --- a/Src/Base/AMReX_TinyProfiler.cpp +++ b/Src/Base/AMReX_TinyProfiler.cpp @@ -15,6 +15,18 @@ #include #endif +#ifdef AMREX_USE_CUDA +#if __has_include() +# include +#else +# include +#endif +#endif + +#if defined(AMREX_USE_HIP) && defined(AMREX_USE_ROCTX) +#include +#endif + #include #include #include @@ -34,10 +46,13 @@ std::vector TinyProfiler::regionstack; std::deque > TinyProfiler::ttstack; std::map > TinyProfiler::statsmap; double TinyProfiler::t_init = std::numeric_limits::max(); -int TinyProfiler::device_synchronize_around_region = 0; +bool TinyProfiler::device_synchronize_around_region = false; int TinyProfiler::n_print_tabs = 0; int TinyProfiler::verbose = 0; double TinyProfiler::print_threshold = 1.; +bool TinyProfiler::enabled = true; +bool TinyProfiler::memprof_enabled = true; +std::string TinyProfiler::output_file; namespace { constexpr char mainregion[] = "main"; @@ -75,6 +90,8 @@ TinyProfiler::~TinyProfiler () void TinyProfiler::start () noexcept { + if (!enabled) { return; } + memory_start(); #ifdef AMREX_USE_OMP @@ -124,6 +141,8 @@ TinyProfiler::start () noexcept for (int itab = 0; itab < n_print_tabs; ++itab) { whitespace 
+= " "; } + // If we try to print to output_file here, it may not be thread + // safe. Also note that this is controlled by verbose already. amrex::Print() << whitespace << "TP: Entering " << fname << '\n'; } } @@ -132,6 +151,8 @@ TinyProfiler::start () noexcept void TinyProfiler::stop () noexcept { + if (!enabled) { return; } + memory_stop(); #ifdef AMREX_USE_OMP @@ -193,13 +214,18 @@ TinyProfiler::stop () noexcept whitespace += " "; } --n_print_tabs; + // If we try to print to output_file here, it may not be thread + // safe. Also note that this is controlled by verbose already. amrex::Print() << whitespace << "TP: Leaving " << fname << '\n'; } } } void -TinyProfiler::memory_start () const noexcept { +TinyProfiler::memory_start () const noexcept +{ + if (!memprof_enabled) { return; } + // multiple omp threads may share the same TinyProfiler object so this function must be const // it is NOT allowed to double start a section #ifdef AMREX_USE_OMP @@ -213,7 +239,10 @@ TinyProfiler::memory_start () const noexcept { } void -TinyProfiler::memory_stop () const noexcept { +TinyProfiler::memory_stop () const noexcept +{ + if (!memprof_enabled) { return; } + // multiple omp threads may share the same TinyProfiler object so this function must be const // it IS allowed to double stop a section #ifdef AMREX_USE_OMP @@ -232,7 +261,10 @@ TinyProfiler::memory_stop () const noexcept { } MemStat* -TinyProfiler::memory_alloc (std::size_t nbytes, std::map& memstats) noexcept { +TinyProfiler::memory_alloc (std::size_t nbytes, std::map& memstats) noexcept +{ + if (!memprof_enabled) { return nullptr; } + // this function is not thread safe for the same memstats // the caller of this function (CArena::alloc) has a mutex MemStat* stat = nullptr; @@ -258,7 +290,10 @@ TinyProfiler::memory_alloc (std::size_t nbytes, std::map& } void -TinyProfiler::memory_free (std::size_t nbytes, MemStat* stat) noexcept { +TinyProfiler::memory_free (std::size_t nbytes, MemStat* stat) noexcept +{ + if (!memprof_enabled) { return; } + // this function is not thread safe for the same stat // the caller of this function (CArena::free) has a mutex if (stat) { @@ -272,22 +307,48 @@ TinyProfiler::memory_free (std::size_t nbytes, MemStat* stat) noexcept { void TinyProfiler::Initialize () noexcept { - regionstack.emplace_back(mainregion); - t_init = amrex::second(); { amrex::ParmParse pp("tiny_profiler"); pp.queryAdd("device_synchronize_around_region", device_synchronize_around_region); - pp.queryAdd("verbose", verbose); - pp.queryAdd("v", verbose); + if (! 
pp.query("verbose", "v", verbose)) { + pp.add("verbose", verbose); + } // Specify the maximum percentage of inclusive time // that the "Other" section in the output can have (default 1%) pp.queryAdd("print_threshold", print_threshold); + + pp.queryAdd("enabled", enabled); + pp.queryAdd("output_file", output_file); } + + if (!enabled) { return; } + + if (ParallelDescriptor::IOProcessor()) { + static bool first = true; + if (first && !output_file.empty() && output_file != "/dev/null") { + if (FileSystem::Exists(output_file)) { + FileSystem::Remove(output_file); + } + first = false; + } + } + + regionstack.emplace_back(mainregion); + t_init = amrex::second(); } void TinyProfiler::MemoryInitialize () noexcept { + { + amrex::ParmParse pp("tiny_profiler"); + pp.queryAdd("enabled", enabled); + pp.queryAdd("memprof_enabled", memprof_enabled); + memprof_enabled = memprof_enabled && enabled; + } + + if (!memprof_enabled) { return; } + #ifdef AMREX_USE_OMP mem_stack_thread_private.resize(omp_get_max_threads()); #endif @@ -296,6 +357,8 @@ TinyProfiler::MemoryInitialize () noexcept void TinyProfiler::Finalize (bool bFlushing) noexcept { + if (!enabled) { return; } + static bool finalized = false; if (!bFlushing) { // If flushing, don't make this the last time! if (finalized) { @@ -321,11 +384,25 @@ TinyProfiler::Finalize (bool bFlushing) noexcept ParallelReduce::Sum(dt_avg, ioproc, ParallelDescriptor::Communicator()); dt_avg /= double(nprocs); - if (ParallelDescriptor::IOProcessor()) + std::ofstream ofs; + std::ostream* os = nullptr; + std::streamsize oldprec = 0; + if (ParallelDescriptor::IOProcessor()) { + if (output_file.empty()) { + os = &(amrex::OutStream()); + } else if (output_file != "/dev/null") { + ofs.open(output_file, std::ios_base::app); + if (!ofs.is_open()) { + amrex::Error("TinyProfiler failed to open "+output_file); + } + os = static_cast(&ofs); + } + } + + if (os) { - amrex::Print() << "\n\n"; - amrex::Print().SetPrecision(4) - <<"TinyProfiler total time across processes [min...avg...max]: " + os->precision(4); + *os << "\n\nTinyProfiler total time across processes [min...avg...max]: " << dt_min << " ... " << dt_avg << " ... 
" << dt_max << "\n"; } @@ -349,19 +426,27 @@ TinyProfiler::Finalize (bool bFlushing) noexcept } } - PrintStats(lstatsmap[mainregion], dt_max); + PrintStats(lstatsmap[mainregion], dt_max, os); for (auto& kv : lstatsmap) { if (kv.first != mainregion) { - amrex::Print() << "\n\nBEGIN REGION " << kv.first << "\n"; - PrintStats(kv.second, dt_max); - amrex::Print() << "END REGION " << kv.first << "\n"; + if (os) { + *os << "\n\nBEGIN REGION " << kv.first << "\n"; + } + PrintStats(kv.second, dt_max, os); + if (os) { + *os << "END REGION " << kv.first << "\n"; + } } } + + if(os) { os->precision(oldprec); } } void TinyProfiler::MemoryFinalize (bool bFlushing) noexcept { + if (!memprof_enabled) { return; } + // This function must be called BEFORE the profiled arenas are deleted static bool finalized = false; @@ -378,20 +463,39 @@ TinyProfiler::MemoryFinalize (bool bFlushing) noexcept int ioproc = ParallelDescriptor::IOProcessorNumber(); ParallelReduce::Max(dt_max, ioproc, ParallelDescriptor::Communicator()); + std::ofstream ofs; + std::ostream* os = nullptr; + std::streamsize oldprec = 0; + if (ParallelDescriptor::IOProcessor()) { + if (output_file.empty()) { + os = &(amrex::OutStream()); + } else if (output_file != "/dev/null") { + ofs.open(output_file, std::ios_base::app); + if (!ofs.is_open()) { + amrex::Error("TinyProfiler failed to open "+output_file); + } + os = static_cast(&ofs); + } + } + for (std::size_t i = 0; i < all_memstats.size(); ++i) { - PrintMemStats(*(all_memstats[i]), all_memnames[i], dt_max, t_final); + PrintMemStats(*(all_memstats[i]), all_memnames[i], dt_max, t_final, os); } if (!bFlushing) { all_memstats.clear(); all_memnames.clear(); } + + if(os) { os->precision(oldprec); } } void TinyProfiler::RegisterArena (const std::string& memory_name, std::map& memstats) noexcept { + if (!memprof_enabled) { return; } + all_memstats.push_back(&memstats); all_memnames.push_back(memory_name); } @@ -399,6 +503,8 @@ TinyProfiler::RegisterArena (const std::string& memory_name, void TinyProfiler::DeregisterArena (std::map& memstats) noexcept { + if (!memprof_enabled) { return; } + for (std::size_t i = 0; i < all_memstats.size();) { if (all_memstats[i] == &memstats) { all_memstats.erase(all_memstats.begin() + i); // NOLINT @@ -410,7 +516,8 @@ TinyProfiler::DeregisterArena (std::map& memstats) noexcep } void -TinyProfiler::PrintStats (std::map& regstats, double dt_max) +TinyProfiler::PrintStats (std::map& regstats, double dt_max, + std::ostream* os) { // make sure the set of profiled functions is the same on all processes { @@ -484,9 +591,9 @@ TinyProfiler::PrintStats (std::map& regstats, double dt_max) } } - if (ParallelDescriptor::IOProcessor()) + if (ParallelDescriptor::IOProcessor() && os) { - amrex::OutStream() << std::setfill(' ') << std::setprecision(4); + *os << std::setfill(' ') << std::setprecision(4); int wt = 9; int wnc = (int) std::log10 ((double) maxncalls) + 1; @@ -551,34 +658,34 @@ TinyProfiler::PrintStats (std::map& regstats, double dt_max) // make sure "Other" is printed at the end of the list allprocstats.push_back(other_procstat); } - amrex::OutStream() << "\n" << hline << "\n"; - amrex::OutStream() << std::left - << std::setw(maxfnamelen) << "Name" - << std::right - << std::setw(wnc+2) << "NCalls" - << std::setw(wt+2) << "Excl. Min" - << std::setw(wt+2) << "Excl. Avg" - << std::setw(wt+2) << "Excl. 
Max" - << std::setw(wp+2) << "Max %" - << "\n" << hline << "\n"; + *os << "\n" << hline << "\n"; + *os << std::left + << std::setw(maxfnamelen) << "Name" + << std::right + << std::setw(wnc+2) << "NCalls" + << std::setw(wt+2) << "Excl. Min" + << std::setw(wt+2) << "Excl. Avg" + << std::setw(wt+2) << "Excl. Max" + << std::setw(wp+2) << "Max %" + << "\n" << hline << "\n"; for (const auto & allprocstat : allprocstats) { if (!allprocstat.do_print) { continue; } - amrex::OutStream() << std::setprecision(4) << std::left - << std::setw(maxfnamelen) << allprocstat.fname - << std::right - << std::setw(wnc+2) << allprocstat.navg - << std::setw(wt+2) << allprocstat.dtexmin - << std::setw(wt+2) << allprocstat.dtexavg - << std::setw(wt+2) << allprocstat.dtexmax - << std::setprecision(2) << std::setw(wp+1) << std::fixed - << allprocstat.dtexmax*(100.0/dt_max) << "%"; - amrex::OutStream().unsetf(std::ios_base::fixed); - amrex::OutStream() << "\n"; - } - amrex::OutStream() << hline << "\n"; + *os << std::setprecision(4) << std::left + << std::setw(maxfnamelen) << allprocstat.fname + << std::right + << std::setw(wnc+2) << allprocstat.navg + << std::setw(wt+2) << allprocstat.dtexmin + << std::setw(wt+2) << allprocstat.dtexavg + << std::setw(wt+2) << allprocstat.dtexmax + << std::setprecision(2) << std::setw(wp+1) << std::fixed + << allprocstat.dtexmax*(100.0/dt_max) << "%"; + os->unsetf(std::ios_base::fixed); + *os << "\n"; + } + *os << hline << "\n"; if (print_other_procstat) { allprocstats.pop_back(); } @@ -589,41 +696,41 @@ TinyProfiler::PrintStats (std::map& regstats, double dt_max) // make sure "Other" is printed at the end of the list allprocstats.push_back(other_procstat); } - amrex::OutStream() << "\n" << hline << "\n"; - amrex::OutStream() << std::left - << std::setw(maxfnamelen) << "Name" - << std::right - << std::setw(wnc+2) << "NCalls" - << std::setw(wt+2) << "Incl. Min" - << std::setw(wt+2) << "Incl. Avg" - << std::setw(wt+2) << "Incl. Max" - << std::setw(wp+2) << "Max %" - << "\n" << hline << "\n"; + *os << "\n" << hline << "\n"; + *os << std::left + << std::setw(maxfnamelen) << "Name" + << std::right + << std::setw(wnc+2) << "NCalls" + << std::setw(wt+2) << "Incl. Min" + << std::setw(wt+2) << "Incl. Avg" + << std::setw(wt+2) << "Incl. 
Max" + << std::setw(wp+2) << "Max %" + << "\n" << hline << "\n"; for (const auto & allprocstat : allprocstats) { if (!allprocstat.do_print) { continue; } - amrex::OutStream() << std::setprecision(4) << std::left - << std::setw(maxfnamelen) << allprocstat.fname - << std::right - << std::setw(wnc+2) << allprocstat.navg - << std::setw(wt+2) << allprocstat.dtinmin - << std::setw(wt+2) << allprocstat.dtinavg - << std::setw(wt+2) << allprocstat.dtinmax - << std::setprecision(2) << std::setw(wp+1) << std::fixed - << allprocstat.dtinmax*(100.0/dt_max) << "%"; - amrex::OutStream().unsetf(std::ios_base::fixed); - amrex::OutStream() << "\n"; - } - amrex::OutStream() << hline << "\n\n"; + *os << std::setprecision(4) << std::left + << std::setw(maxfnamelen) << allprocstat.fname + << std::right + << std::setw(wnc+2) << allprocstat.navg + << std::setw(wt+2) << allprocstat.dtinmin + << std::setw(wt+2) << allprocstat.dtinavg + << std::setw(wt+2) << allprocstat.dtinmax + << std::setprecision(2) << std::setw(wp+1) << std::fixed + << allprocstat.dtinmax*(100.0/dt_max) << "%"; + os->unsetf(std::ios_base::fixed); + *os << "\n"; + } + *os << hline << "\n\n"; } } void -TinyProfiler::PrintMemStats(std::map& memstats, - std::string const& memname, double dt_max, - double t_final) +TinyProfiler::PrintMemStats (std::map& memstats, + std::string const& memname, double dt_max, + double t_final, std::ostream* os) { // make sure the set of profiled functions is the same on all processes { @@ -767,7 +874,7 @@ TinyProfiler::PrintMemStats(std::map& memstats, maxlen[i] += 2; } - if (allstatsstr.size() == 1) { return; } + if (allstatsstr.size() == 1 || !os) { return; } int lenhline = 0; for (auto i : maxlen) { @@ -775,24 +882,26 @@ TinyProfiler::PrintMemStats(std::map& memstats, } const std::string hline(lenhline, '-'); - amrex::OutStream() << memname << " Usage:\n"; - amrex::OutStream() << hline << "\n"; + *os << memname << " Usage:\n"; + *os << hline << "\n"; for (std::size_t i=0; i(x)) << "\n"; diff --git a/Src/Base/AMReX_Utility.H b/Src/Base/AMReX_Utility.H index 016b8adb0e2..6bec276dbf2 100644 --- a/Src/Base/AMReX_Utility.H +++ b/Src/Base/AMReX_Utility.H @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -44,17 +45,6 @@ namespace amrex const std::vector& Tokenize (const std::string& instr, const std::string& separators); - //! Converts all characters of the string into lower or uppercase based on std::locale - std::string toLower (std::string s); - std::string toUpper (std::string s); - - //! Trim leading and trailing white space - std::string trim (std::string s, std::string const& space = " \t"); - - //! Returns rootNNNN where NNNN == num. - std::string Concatenate (const std::string& root, - int num, - int mindigits = 5); /** * \brief Creates the specified directories. path may be either a full pathname * or a relative pathname. 
It will create all the directories in the diff --git a/Src/Base/AMReX_Utility.cpp b/Src/Base/AMReX_Utility.cpp index 1c79dfba92f..aa3d8a2d165 100644 --- a/Src/Base/AMReX_Utility.cpp +++ b/Src/Base/AMReX_Utility.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -113,44 +112,6 @@ amrex::Tokenize (const std::string& instr, return tokens; } -std::string -amrex::toLower (std::string s) -{ - std::transform(s.begin(), s.end(), s.begin(), - [](unsigned char c) { return std::tolower(c); }); - return s; -} - -std::string -amrex::toUpper (std::string s) -{ - std::transform(s.begin(), s.end(), s.begin(), - [](unsigned char c) { return std::toupper(c); }); - return s; -} - -std::string -amrex::trim(std::string s, std::string const& space) -{ - const auto sbegin = s.find_first_not_of(space); - if (sbegin == std::string::npos) { return std::string{}; } - const auto send = s.find_last_not_of(space); - s = s.substr(sbegin, send-sbegin+1); - return s; -} - -std::string -amrex::Concatenate (const std::string& root, - int num, - int mindigits) -{ - BL_ASSERT(mindigits >= 0); - std::stringstream result; - result << root << std::setfill('0') << std::setw(mindigits) << num; - return result.str(); -} - - bool amrex::UtilCreateDirectory (const std::string& path, mode_t mode, bool verbose) diff --git a/Src/Base/AMReX_VisMF.cpp b/Src/Base/AMReX_VisMF.cpp index 4c8b4b43fb2..2b0ac6716d0 100644 --- a/Src/Base/AMReX_VisMF.cpp +++ b/Src/Base/AMReX_VisMF.cpp @@ -77,24 +77,24 @@ VisMF::Initialize () amrex::ExecOnFinalize(VisMF::Finalize); ParmParse pp("vismf"); - pp.queryAdd("v",verbose); + pp.query("verbose", "v",verbose); int headerVersion(currentVersion); - pp.queryAdd("headerversion", headerVersion); + pp.query("headerversion", headerVersion); if(headerVersion != currentVersion) { currentVersion = static_cast (headerVersion); } - pp.queryAdd("groupsets", groupSets); - pp.queryAdd("setbuf", setBuf); - pp.queryAdd("usesingleread", useSingleRead); - pp.queryAdd("usesinglewrite", useSingleWrite); - pp.queryAdd("checkfilepositions", checkFilePositions); - pp.queryAdd("usepersistentifstreams", usePersistentIFStreams); - pp.queryAdd("usesynchronousreads", useSynchronousReads); - pp.queryAdd("usedynamicsetselection", useDynamicSetSelection); - pp.queryAdd("iobuffersize", ioBufferSize); - pp.queryAdd("allowsparsewrites", allowSparseWrites); + pp.query("groupsets", groupSets); + pp.query("setbuf", setBuf); + pp.query("usesingleread", useSingleRead); + pp.query("usesinglewrite", useSingleWrite); + pp.query("checkfilepositions", checkFilePositions); + pp.query("usepersistentifstreams", usePersistentIFStreams); + pp.query("usesynchronousreads", useSynchronousReads); + pp.query("usedynamicsetselection", useDynamicSetSelection); + pp.query("iobuffersize", ioBufferSize); + pp.query("allowsparsewrites", allowSparseWrites); initialized = true; } diff --git a/Src/Base/AMReX_iMultiFab.cpp b/Src/Base/AMReX_iMultiFab.cpp index d79e1d71444..090f1ca96d6 100644 --- a/Src/Base/AMReX_iMultiFab.cpp +++ b/Src/Base/AMReX_iMultiFab.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index cebd1f9bce1..0436ad032e4 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -12,6 +12,7 @@ foreach(D IN LISTS AMReX_SPACEDIM) AMReX_Array.H AMReX_BlockMutex.H AMReX_BlockMutex.cpp + AMReX_Enum.H AMReX_GpuComplex.H AMReX_Vector.H AMReX_TableData.H @@ -30,6 +31,8 @@ foreach(D IN LISTS AMReX_SPACEDIM) AMReX_parmparse_fi.cpp 
AMReX_ParmParse.H AMReX_Functional.H + AMReX_String.H + AMReX_String.cpp AMReX_Utility.H AMReX_Utility.cpp AMReX_FileSystem.H diff --git a/Src/Base/Make.package b/Src/Base/Make.package index dfbfb4f03a1..b009ebf7d65 100644 --- a/Src/Base/Make.package +++ b/Src/Base/Make.package @@ -2,6 +2,7 @@ AMREX_BASE=EXE C$(AMREX_BASE)_headers += AMReX_ccse-mpi.H AMReX_Algorithm.H AMReX_Any.H AMReX_Array.H +C$(AMREX_BASE)_headers += AMReX_Enum.H C$(AMREX_BASE)_headers += AMReX_Vector.H AMReX_TableData.H AMReX_Tuple.H AMReX_Math.H C$(AMREX_BASE)_headers += AMReX_TypeList.H @@ -22,6 +23,9 @@ C$(AMREX_BASE)_sources += AMReX_PODVector.cpp C$(AMREX_BASE)_headers += AMReX_BlockMutex.H C$(AMREX_BASE)_sources += AMReX_BlockMutex.cpp +C$(AMREX_BASE)_headers += AMReX_String.H +C$(AMREX_BASE)_sources += AMReX_String.cpp + C$(AMREX_BASE)_sources += AMReX_ParmParse.cpp AMReX_parmparse_fi.cpp AMReX_Utility.cpp C$(AMREX_BASE)_headers += AMReX_ParmParse.H AMReX_Utility.H AMReX_BLassert.H AMReX_ArrayLim.H C$(AMREX_BASE)_headers += AMReX_Functional.H AMReX_Reduce.H AMReX_Scan.H AMReX_Partition.H diff --git a/Src/Base/Parser/AMReX_IParser.H b/Src/Base/Parser/AMReX_IParser.H index 6cf1e5b2b0f..9b3f8af6f34 100644 --- a/Src/Base/Parser/AMReX_IParser.H +++ b/Src/Base/Parser/AMReX_IParser.H @@ -18,7 +18,7 @@ struct IParserExecutor { template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - int operator() () const noexcept + long long operator() () const noexcept { AMREX_IF_ON_DEVICE((return iparser_exe_eval(m_device_executor, nullptr);)) AMREX_IF_ON_HOST((return iparser_exe_eval(m_host_executor, nullptr);)) @@ -26,16 +26,17 @@ struct IParserExecutor template [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - std::enable_if_t + std::enable_if_t...>, + long long> operator() (Ts... 
var) const noexcept { - amrex::GpuArray l_var{var...}; + amrex::GpuArray l_var{var...}; AMREX_IF_ON_DEVICE((return iparser_exe_eval(m_device_executor, l_var.data());)) AMREX_IF_ON_HOST((return iparser_exe_eval(m_host_executor, l_var.data());)) } [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - int operator() (GpuArray const& var) const noexcept + long long operator() (GpuArray const& var) const noexcept { AMREX_IF_ON_DEVICE((return iparser_exe_eval(m_device_executor, var.data());)) AMREX_IF_ON_HOST((return iparser_exe_eval(m_host_executor, var.data());)) @@ -62,7 +63,7 @@ public: explicit operator bool () const; - void setConstant (std::string const& name, int c); + void setConstant (std::string const& name, long long c); void registerVariables (Vector const& vars); diff --git a/Src/Base/Parser/AMReX_IParser.cpp b/Src/Base/Parser/AMReX_IParser.cpp index 0845d47b842..ecda330f198 100644 --- a/Src/Base/Parser/AMReX_IParser.cpp +++ b/Src/Base/Parser/AMReX_IParser.cpp @@ -54,7 +54,7 @@ IParser::operator bool () const } void -IParser::setConstant (std::string const& name, int c) +IParser::setConstant (std::string const& name, long long c) { if (m_data && m_data->m_iparser) { iparser_setconst(m_data->m_iparser, name.c_str(), c); diff --git a/Src/Base/Parser/AMReX_IParser_Exe.H b/Src/Base/Parser/AMReX_IParser_Exe.H index 67756a93599..e7e41c44d48 100644 --- a/Src/Base/Parser/AMReX_IParser_Exe.H +++ b/Src/Base/Parser/AMReX_IParser_Exe.H @@ -62,7 +62,7 @@ struct alignas(8) IParserExeNull { struct alignas(8) IParserExeNumber { enum iparser_exe_t type = IPARSER_EXE_NUMBER; - int v; + long long v; }; struct alignas(8) IParserExeSymbol { @@ -76,7 +76,7 @@ struct alignas(8) IParserExeADD { struct alignas(8) IParserExeSUB { enum iparser_exe_t type = IPARSER_EXE_SUB; - int sign; + long long sign; }; struct alignas(8) IParserExeMUL { @@ -113,31 +113,31 @@ struct alignas(8) IParserExeF2_B { struct alignas(8) IParserExeADD_VP { enum iparser_exe_t type = IPARSER_EXE_ADD_VP; int i; - int v; + long long v; }; struct alignas(8) IParserExeSUB_VP { enum iparser_exe_t type = IPARSER_EXE_SUB_VP; int i; - int v; + long long v; }; struct alignas(8) IParserExeMUL_VP { enum iparser_exe_t type = IPARSER_EXE_MUL_VP; int i; - int v; + long long v; }; struct alignas(8) IParserExeDIV_VP { enum iparser_exe_t type = IPARSER_EXE_DIV_VP; int i; - int v; + long long v; }; struct alignas(8) IParserExeDIV_PV { enum iparser_exe_t type = IPARSER_EXE_DIV_PV; int i; - int v; + long long v; }; struct alignas(8) IParserExeADD_PP { @@ -171,27 +171,27 @@ struct alignas(8) IParserExeNEG_P { struct alignas(8) IParserExeADD_VN { enum iparser_exe_t type = IPARSER_EXE_ADD_VN; - int v; + long long v; }; struct alignas(8) IParserExeSUB_VN { enum iparser_exe_t type = IPARSER_EXE_SUB_VN; - int v; + long long v; }; struct alignas(8) IParserExeMUL_VN { enum iparser_exe_t type = IPARSER_EXE_MUL_VN; - int v; + long long v; }; struct alignas(8) IParserExeDIV_VN { enum iparser_exe_t type = IPARSER_EXE_DIV_VN; - int v; + long long v; }; struct alignas(8) IParserExeDIV_NV { enum iparser_exe_t type = IPARSER_EXE_DIV_NV; - int v; + long long v; }; struct alignas(8) IParserExeADD_PN { @@ -202,7 +202,7 @@ struct alignas(8) IParserExeADD_PN { struct alignas(8) IParserExeSUB_PN { enum iparser_exe_t type = IPARSER_EXE_SUB_PN; int i; - int sign; + long long sign; }; struct alignas(8) IParserExeMUL_PN { @@ -229,19 +229,19 @@ struct alignas(8) IParserExeJUMP { template struct IParserStack { - int m_data[N]; + long long m_data[N]; int m_size = 0; - constexpr 
void push (int v) { m_data[m_size++] = v; } + constexpr void push (long long v) { m_data[m_size++] = v; } constexpr void pop () { --m_size; } - [[nodiscard]] constexpr int const& top () const { return m_data[m_size-1]; } - [[nodiscard]] constexpr int & top () { return m_data[m_size-1]; } - [[nodiscard]] constexpr int operator[] (int i) const { return m_data[i]; } + [[nodiscard]] constexpr long long const& top () const { return m_data[m_size-1]; } + [[nodiscard]] constexpr long long & top () { return m_data[m_size-1]; } + [[nodiscard]] constexpr long long operator[] (int i) const { return m_data[i]; } }; AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int iparser_exe_eval (const char* p, int const* x) +long long iparser_exe_eval (const char* p, long long const* x) { - if (p == nullptr) { return std::numeric_limits::max(); } + if (p == nullptr) { return std::numeric_limits::max(); } IParserStack pstack; while (*((iparser_exe_t*)p) != IPARSER_EXE_NULL) { @@ -256,14 +256,14 @@ int iparser_exe_eval (const char* p, int const* x) case IPARSER_EXE_SYMBOL: { int i = ((IParserExeSymbol*)p)->i; - int d = AMREX_IPARSER_GET_DATA(i); + auto d = AMREX_IPARSER_GET_DATA(i); pstack.push(d); p += sizeof(IParserExeSymbol); break; } case IPARSER_EXE_ADD: { - int b = pstack.top(); + auto b = pstack.top(); pstack.pop(); pstack.top() += b; p += sizeof(IParserExeADD); @@ -271,7 +271,7 @@ int iparser_exe_eval (const char* p, int const* x) } case IPARSER_EXE_SUB: { - int b = pstack.top(); + auto b = pstack.top(); pstack.pop(); pstack.top() = (pstack.top() - b) * (((IParserExeSUB*)p)->sign); p += sizeof(IParserExeSUB); @@ -279,7 +279,7 @@ int iparser_exe_eval (const char* p, int const* x) } case IPARSER_EXE_MUL: { - int b = pstack.top(); + auto b = pstack.top(); pstack.pop(); pstack.top() *= b; p += sizeof(IParserExeMUL); @@ -287,7 +287,7 @@ int iparser_exe_eval (const char* p, int const* x) } case IPARSER_EXE_DIV_F: { - int v = pstack.top(); + auto v = pstack.top(); pstack.pop(); pstack.top() /= v; p += sizeof(IParserExeDIV_F); @@ -295,7 +295,7 @@ int iparser_exe_eval (const char* p, int const* x) } case IPARSER_EXE_DIV_B: { - int v = pstack.top(); + auto v = pstack.top(); pstack.pop(); pstack.top() = v / pstack.top(); p += sizeof(IParserExeDIV_B); @@ -315,7 +315,7 @@ int iparser_exe_eval (const char* p, int const* x) } case IPARSER_EXE_F2_F: { - int v = pstack.top(); + auto v = pstack.top(); pstack.pop(); pstack.top() = iparser_call_f2(((IParserExeF2_F*)p)->ftype, pstack.top(), v); p += sizeof(IParserExeF2_F); @@ -323,7 +323,7 @@ int iparser_exe_eval (const char* p, int const* x) } case IPARSER_EXE_F2_B: { - int v = pstack.top(); + auto v = pstack.top(); pstack.pop(); pstack.top() = iparser_call_f2(((IParserExeF2_B*)p)->ftype, v, pstack.top()); p += sizeof(IParserExeF2_B); @@ -332,7 +332,7 @@ int iparser_exe_eval (const char* p, int const* x) case IPARSER_EXE_ADD_VP: { int i = ((IParserExeADD_VP*)p)->i; - int d = AMREX_IPARSER_GET_DATA(i); + auto d = AMREX_IPARSER_GET_DATA(i); pstack.push(((IParserExeADD_VP*)p)->v + d); p += sizeof(IParserExeADD_VP); break; @@ -340,7 +340,7 @@ int iparser_exe_eval (const char* p, int const* x) case IPARSER_EXE_SUB_VP: { int i = ((IParserExeSUB_VP*)p)->i; - int d = AMREX_IPARSER_GET_DATA(i); + auto d = AMREX_IPARSER_GET_DATA(i); pstack.push(((IParserExeSUB_VP*)p)->v - d); p += sizeof(IParserExeSUB_VP); break; @@ -348,7 +348,7 @@ int iparser_exe_eval (const char* p, int const* x) case IPARSER_EXE_MUL_VP: { int i = ((IParserExeMUL_VP*)p)->i; - int d = AMREX_IPARSER_GET_DATA(i); + auto d = 
AMREX_IPARSER_GET_DATA(i); pstack.push(((IParserExeMUL_VP*)p)->v * d); p += sizeof(IParserExeMUL_VP); break; @@ -356,7 +356,7 @@ int iparser_exe_eval (const char* p, int const* x) case IPARSER_EXE_DIV_VP: { int i = ((IParserExeDIV_VP*)p)->i; - int d = AMREX_IPARSER_GET_DATA(i); + auto d = AMREX_IPARSER_GET_DATA(i); pstack.push(((IParserExeDIV_VP*)p)->v / d); p += sizeof(IParserExeDIV_VP); break; @@ -364,7 +364,7 @@ int iparser_exe_eval (const char* p, int const* x) case IPARSER_EXE_DIV_PV: { int i = ((IParserExeDIV_PV*)p)->i; - int d = AMREX_IPARSER_GET_DATA(i); + auto d = AMREX_IPARSER_GET_DATA(i); pstack.push(d / ((IParserExeDIV_PV*)p)->v); p += sizeof(IParserExeDIV_PV); break; @@ -372,9 +372,9 @@ int iparser_exe_eval (const char* p, int const* x) case IPARSER_EXE_ADD_PP: { int i = ((IParserExeADD_PP*)p)->i1; - int d1 = AMREX_IPARSER_GET_DATA(i); + auto d1 = AMREX_IPARSER_GET_DATA(i); i = ((IParserExeADD_PP*)p)->i2; - int d2 = AMREX_IPARSER_GET_DATA(i); + auto d2 = AMREX_IPARSER_GET_DATA(i); pstack.push(d1+d2); p += sizeof(IParserExeADD_PP); break; @@ -382,9 +382,9 @@ int iparser_exe_eval (const char* p, int const* x) case IPARSER_EXE_SUB_PP: { int i = ((IParserExeSUB_PP*)p)->i1; - int d1 = AMREX_IPARSER_GET_DATA(i); + auto d1 = AMREX_IPARSER_GET_DATA(i); i = ((IParserExeSUB_PP*)p)->i2; - int d2 = AMREX_IPARSER_GET_DATA(i); + auto d2 = AMREX_IPARSER_GET_DATA(i); pstack.push(d1-d2); p += sizeof(IParserExeSUB_PP); break; @@ -392,9 +392,9 @@ int iparser_exe_eval (const char* p, int const* x) case IPARSER_EXE_MUL_PP: { int i = ((IParserExeMUL_PP*)p)->i1; - int d1 = AMREX_IPARSER_GET_DATA(i); + auto d1 = AMREX_IPARSER_GET_DATA(i); i = ((IParserExeMUL_PP*)p)->i2; - int d2 = AMREX_IPARSER_GET_DATA(i); + auto d2 = AMREX_IPARSER_GET_DATA(i); pstack.push(d1*d2); p += sizeof(IParserExeMUL_PP); break; @@ -402,9 +402,9 @@ int iparser_exe_eval (const char* p, int const* x) case IPARSER_EXE_DIV_PP: { int i = ((IParserExeDIV_PP*)p)->i1; - int d1 = AMREX_IPARSER_GET_DATA(i); + auto d1 = AMREX_IPARSER_GET_DATA(i); i = ((IParserExeDIV_PP*)p)->i2; - int d2 = AMREX_IPARSER_GET_DATA(i); + auto d2 = AMREX_IPARSER_GET_DATA(i); pstack.push(d1/d2); p += sizeof(IParserExeDIV_PP); break; @@ -412,7 +412,7 @@ int iparser_exe_eval (const char* p, int const* x) case IPARSER_EXE_NEG_P: { int i = ((IParserExeNEG_P*)p)->i; - int d = AMREX_IPARSER_GET_DATA(i); + auto d = AMREX_IPARSER_GET_DATA(i); pstack.push(-d); p += sizeof(IParserExeNEG_P); break; @@ -450,7 +450,7 @@ int iparser_exe_eval (const char* p, int const* x) case IPARSER_EXE_ADD_PN: { int i = ((IParserExeADD_PN*)p)->i; - int d = AMREX_IPARSER_GET_DATA(i); + auto d = AMREX_IPARSER_GET_DATA(i); pstack.top() += d; p += sizeof(IParserExeADD_PN); break; @@ -458,7 +458,7 @@ int iparser_exe_eval (const char* p, int const* x) case IPARSER_EXE_SUB_PN: { int i = ((IParserExeSUB_PN*)p)->i; - int d = AMREX_IPARSER_GET_DATA(i); + auto d = AMREX_IPARSER_GET_DATA(i); pstack.top() = (d - pstack.top()) * (((IParserExeSUB_PN*)p)->sign); p += sizeof(IParserExeSUB_PN); break; @@ -466,7 +466,7 @@ int iparser_exe_eval (const char* p, int const* x) case IPARSER_EXE_MUL_PN: { int i = ((IParserExeMUL_PN*)p)->i; - int d = AMREX_IPARSER_GET_DATA(i); + auto d = AMREX_IPARSER_GET_DATA(i); pstack.top() *= d; p += sizeof(IParserExeMUL_PN); break; @@ -474,7 +474,7 @@ int iparser_exe_eval (const char* p, int const* x) case IPARSER_EXE_DIV_PN: { int i = ((IParserExeDIV_PN*)p)->i; - int d = AMREX_IPARSER_GET_DATA(i); + auto d = AMREX_IPARSER_GET_DATA(i); if (((IParserExeDIV_PN*)p)->reverse) { 
pstack.top() /= d; } else { @@ -485,9 +485,9 @@ int iparser_exe_eval (const char* p, int const* x) } case IPARSER_EXE_IF: { - int cond = pstack.top(); + auto cond = pstack.top(); pstack.pop(); - if (cond == 0.0) { // false branch + if (cond == 0) { // false branch p += ((IParserExeIF*)p)->offset; } p += sizeof(IParserExeIF); diff --git a/Src/Base/Parser/AMReX_IParser_Y.H b/Src/Base/Parser/AMReX_IParser_Y.H index 50eb5de39ec..79b6cd7551b 100644 --- a/Src/Base/Parser/AMReX_IParser_Y.H +++ b/Src/Base/Parser/AMReX_IParser_Y.H @@ -75,7 +75,7 @@ enum iparser_node_t { union iparser_nvp { struct iparser_node* n; - int v; + long long v; int ip; }; @@ -89,7 +89,7 @@ struct iparser_node { struct iparser_number { enum iparser_node_t type; - int value; + long long value; }; struct iparser_symbol { @@ -134,7 +134,7 @@ void iparser_defexpr (struct iparser_node* body); struct iparser_symbol* iparser_makesymbol (char* name); struct iparser_node* iparser_newnode (enum iparser_node_t type, struct iparser_node* l, struct iparser_node* r); -struct iparser_node* iparser_newnumber (int d); +struct iparser_node* iparser_newnumber (long long d); struct iparser_node* iparser_newsymbol (struct iparser_symbol* sym); struct iparser_node* iparser_newf1 (enum iparser_f1_t ftype, struct iparser_node* l); struct iparser_node* iparser_newf2 (enum iparser_f2_t ftype, struct iparser_node* l, @@ -164,7 +164,7 @@ struct amrex_iparser* iparser_dup (struct amrex_iparser* source); struct iparser_node* iparser_ast_dup (struct amrex_iparser* iparser, struct iparser_node* node, int move); void iparser_regvar (struct amrex_iparser* iparser, char const* name, int i); -void iparser_setconst (struct amrex_iparser* iparser, char const* name, int c); +void iparser_setconst (struct amrex_iparser* iparser, char const* name, long long c); void iparser_print (struct amrex_iparser* iparser); std::set iparser_get_symbols (struct amrex_iparser* iparser); int iparser_depth (struct amrex_iparser* iparser); @@ -174,27 +174,27 @@ void iparser_ast_optimize (struct iparser_node* node); std::size_t iparser_ast_size (struct iparser_node* node); void iparser_ast_print (struct iparser_node* node, std::string const& space, AllPrint& printer); void iparser_ast_regvar (struct iparser_node* node, char const* name, int i); -void iparser_ast_setconst (struct iparser_node* node, char const* name, int c); +void iparser_ast_setconst (struct iparser_node* node, char const* name, long long c); void iparser_ast_get_symbols (struct iparser_node* node, std::set& symbols, std::set& local_symbols); int iparser_ast_depth (struct iparser_node* node); /*******************************************************************/ -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int -iparser_call_f1 (enum iparser_f1_t /*type*/, int a) +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE long long +iparser_call_f1 (enum iparser_f1_t /*type*/, long long a) { /// There is only one type for now return std::abs(a); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int -iparser_call_f2 (enum iparser_f2_t type, int a, int b) +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE long long +iparser_call_f2 (enum iparser_f2_t type, long long a, long long b) { switch (type) { case IPARSER_FLRDIV: { - int r = a/b; + long long r = a/b; if (r*b == a || (a < 0 && b < 0) || (a > 0 && b > 0)) { return r; } else { @@ -206,13 +206,13 @@ iparser_call_f2 (enum iparser_f2_t type, int a, int b) if (b < 0) { return 0; } else { - int r = 1; + long long r = 1; while (b != 0) { if (b & 1) { r *= a; } b >>= 1; - a *= a; + if (b > 0) { a *= a; } // 
to avoid overflow } return r; } @@ -243,13 +243,15 @@ iparser_call_f2 (enum iparser_f2_t type, int a, int b) } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int -iparser_call_f3 (enum iparser_f3_t /*type*/, int a, int b, int c) +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE long long +iparser_call_f3 (enum iparser_f3_t /*type*/, long long a, long long b, long long c) { // There is only one type currently return (a != 0) ? b : c; } +long long iparser_atoll (const char* str); + } #endif diff --git a/Src/Base/Parser/AMReX_IParser_Y.cpp b/Src/Base/Parser/AMReX_IParser_Y.cpp index cfb036de96e..00cca1717f0 100644 --- a/Src/Base/Parser/AMReX_IParser_Y.cpp +++ b/Src/Base/Parser/AMReX_IParser_Y.cpp @@ -51,7 +51,7 @@ iparser_newnode (enum iparser_node_t type, struct iparser_node* l, struct iparse } struct iparser_node* -iparser_newnumber (int d) +iparser_newnumber (long long d) { auto *r = (struct iparser_number*) std::malloc(sizeof(struct iparser_number)); r->type = IPARSER_NUMBER; @@ -398,7 +398,7 @@ iparser_ast_optimize (struct iparser_node* node) if (node->l->type == IPARSER_NUMBER && node->r->type == IPARSER_NUMBER) { - int v = ((struct iparser_number*)(node->l))->value + auto v= ((struct iparser_number*)(node->l))->value + ((struct iparser_number*)(node->r))->value; ((struct iparser_number*)node)->type = IPARSER_NUMBER; ((struct iparser_number*)node)->value = v; @@ -428,28 +428,28 @@ iparser_ast_optimize (struct iparser_node* node) else if (node->l->type == IPARSER_NUMBER && node->r->type == IPARSER_ADD_VP) { - int v = ((struct iparser_number*)(node->l))->value + IPARSER_EVAL_R(node); + auto v = ((struct iparser_number*)(node->l))->value + IPARSER_EVAL_R(node); IPARSER_MOVEUP_R(node, v); node->type = IPARSER_ADD_VP; } else if (node->l->type == IPARSER_NUMBER && node->r->type == IPARSER_SUB_VP) { - int v = ((struct iparser_number*)(node->l))->value + IPARSER_EVAL_R(node); + auto v = ((struct iparser_number*)(node->l))->value + IPARSER_EVAL_R(node); IPARSER_MOVEUP_R(node, v); node->type = IPARSER_SUB_VP; } else if (node->l->type == IPARSER_ADD_VP && node->r->type == IPARSER_NUMBER) { - int v = IPARSER_EVAL_L(node) + ((struct iparser_number*)(node->r))->value; + auto v = IPARSER_EVAL_L(node) + ((struct iparser_number*)(node->r))->value; IPARSER_MOVEUP_L(node, v); node->type = IPARSER_ADD_VP; } else if (node->l->type == IPARSER_SUB_VP && node->r->type == IPARSER_NUMBER) { - int v = IPARSER_EVAL_L(node) + ((struct iparser_number*)(node->r))->value; + auto v = IPARSER_EVAL_L(node) + ((struct iparser_number*)(node->r))->value; IPARSER_MOVEUP_L(node, v); node->type = IPARSER_SUB_VP; } @@ -458,14 +458,14 @@ iparser_ast_optimize (struct iparser_node* node) { if (node->r->l->type == IPARSER_NUMBER) { // #l + (#rl + node_rr) -> (#l + #rl) + node_rr, same type - int v = ((struct iparser_number*)(node->l))->value + auto v= ((struct iparser_number*)(node->l))->value + ((struct iparser_number*)(node->r->l))->value; node->r = node->r->r; ((struct iparser_number*)(node->l))->value = v; } else if (node->r->r->type == IPARSER_NUMBER) { // #l + (node_rl + #rr) -> (#l + #rr) + node_rl, same type - int v = ((struct iparser_number*)(node->l))->value + auto v= ((struct iparser_number*)(node->l))->value + ((struct iparser_number*)(node->r->r))->value; node->r = node->r->l; ((struct iparser_number*)(node->l))->value = v; @@ -476,7 +476,7 @@ iparser_ast_optimize (struct iparser_node* node) { if (node->r->l->type == IPARSER_NUMBER) { // #l + (#rl - node_rr) -> (#l + #rl) - node_rr, type change - int v = ((struct 
iparser_number*)(node->l))->value + auto v= ((struct iparser_number*)(node->l))->value + ((struct iparser_number*)(node->r->l))->value; node->r = node->r->r; ((struct iparser_number*)(node->l))->value = v; @@ -484,7 +484,7 @@ iparser_ast_optimize (struct iparser_node* node) } else if (node->r->r->type == IPARSER_NUMBER) { // #l + (node_rl - #rr) -> (#l - #rr) + node_rl, same type - int v = ((struct iparser_number*)(node->l))->value + auto v= ((struct iparser_number*)(node->l))->value - ((struct iparser_number*)(node->r->r))->value; node->r = node->r->l; ((struct iparser_number*)(node->l))->value = v; @@ -495,14 +495,14 @@ iparser_ast_optimize (struct iparser_node* node) { if (node->l->l->type == IPARSER_NUMBER) { // (#ll + node_lr) + #r -> nodel_lr + (#ll + #r), same type - int v = ((struct iparser_number*)(node->l->l))->value + auto v= ((struct iparser_number*)(node->l->l))->value + ((struct iparser_number*)(node->r))->value; node->l = node->l->r; ((struct iparser_number*)(node->r))->value = v; } else if (node->l->r->type == IPARSER_NUMBER) { // (node_ll + #lr) + #r -> node_ll + (#lr + #r), same type - int v = ((struct iparser_number*)(node->l->r))->value + auto v= ((struct iparser_number*)(node->l->r))->value + ((struct iparser_number*)(node->r))->value; node->l = node->l->l; ((struct iparser_number*)(node->r))->value = v; @@ -513,7 +513,7 @@ iparser_ast_optimize (struct iparser_node* node) { if (node->l->l->type == IPARSER_NUMBER) { // (#ll - node_lr) + #r -> (#ll + #r) - node_lr, type change - int v = ((struct iparser_number*)(node->l->l))->value + auto v= ((struct iparser_number*)(node->l->l))->value + ((struct iparser_number*)(node->r))->value; node->r = node->l->r; ((struct iparser_number*)(node->l))->type = IPARSER_NUMBER; @@ -522,7 +522,7 @@ iparser_ast_optimize (struct iparser_node* node) } else if (node->l->r->type == IPARSER_NUMBER) { // (node_ll - #lr) + #r -> node_ll + (#r - #lr), same type - int v = ((struct iparser_number*)(node->r))->value + auto v= ((struct iparser_number*)(node->r))->value - ((struct iparser_number*)(node->l->r))->value; node->l = node->l->l; ((struct iparser_number*)(node->r))->value = v; @@ -536,7 +536,7 @@ iparser_ast_optimize (struct iparser_node* node) if (node->l->type == IPARSER_NUMBER && node->r->type == IPARSER_NUMBER) { - int v = ((struct iparser_number*)(node->l))->value + auto v= ((struct iparser_number*)(node->l))->value - ((struct iparser_number*)(node->r))->value; ((struct iparser_number*)node)->type = IPARSER_NUMBER; ((struct iparser_number*)node)->value = v; @@ -566,28 +566,28 @@ iparser_ast_optimize (struct iparser_node* node) else if (node->l->type == IPARSER_NUMBER && node->r->type == IPARSER_ADD_VP) { - int v = ((struct iparser_number*)(node->l))->value - IPARSER_EVAL_R(node); + auto v= ((struct iparser_number*)(node->l))->value - IPARSER_EVAL_R(node); IPARSER_MOVEUP_R(node, v); node->type = IPARSER_SUB_VP; } else if (node->l->type == IPARSER_NUMBER && node->r->type == IPARSER_SUB_VP) { - int v = ((struct iparser_number*)(node->l))->value - IPARSER_EVAL_R(node); + auto v= ((struct iparser_number*)(node->l))->value - IPARSER_EVAL_R(node); IPARSER_MOVEUP_R(node, v); node->type = IPARSER_ADD_VP; } else if (node->l->type == IPARSER_ADD_VP && node->r->type == IPARSER_NUMBER) { - int v = IPARSER_EVAL_L(node) - ((struct iparser_number*)(node->r))->value; + auto v= IPARSER_EVAL_L(node) - ((struct iparser_number*)(node->r))->value; IPARSER_MOVEUP_L(node, v); node->type = IPARSER_ADD_VP; } else if (node->l->type == IPARSER_SUB_VP && 
node->r->type == IPARSER_NUMBER) { - int v = IPARSER_EVAL_L(node) - ((struct iparser_number*)(node->r))->value; + auto v= IPARSER_EVAL_L(node) - ((struct iparser_number*)(node->r))->value; IPARSER_MOVEUP_L(node, v); node->type = IPARSER_SUB_VP; } @@ -596,14 +596,14 @@ iparser_ast_optimize (struct iparser_node* node) { if (node->r->l->type == IPARSER_NUMBER) { // #l - (#rl + node_rr) -> (#l - #rl) - node_rr, same type - int v = ((struct iparser_number*)(node->l))->value + auto v= ((struct iparser_number*)(node->l))->value - ((struct iparser_number*)(node->r->l))->value; node->r = node->r->r; ((struct iparser_number*)(node->l))->value = v; } else if (node->r->r->type == IPARSER_NUMBER) { // #l - (node_rl + #rr) -> (#l - #rr) - node_rl, same type - int v = ((struct iparser_number*)(node->l))->value + auto v= ((struct iparser_number*)(node->l))->value - ((struct iparser_number*)(node->r->r))->value; node->r = node->r->l; ((struct iparser_number*)(node->l))->value = v; @@ -614,7 +614,7 @@ iparser_ast_optimize (struct iparser_node* node) { if (node->r->l->type == IPARSER_NUMBER) { // #l - (#rl - node_rr) -> (#l - #rl) + node_rr, type change - int v = ((struct iparser_number*)(node->l))->value + auto v= ((struct iparser_number*)(node->l))->value - ((struct iparser_number*)(node->r->l))->value; node->r = node->r->r; ((struct iparser_number*)(node->l))->value = v; @@ -622,7 +622,7 @@ iparser_ast_optimize (struct iparser_node* node) } else if (node->r->r->type == IPARSER_NUMBER) { // #l - (node_rl - #rr) -> (#l + #rr) - node_rl, same type - int v = ((struct iparser_number*)(node->l))->value + auto v= ((struct iparser_number*)(node->l))->value + ((struct iparser_number*)(node->r->r))->value; node->r = node->r->l; ((struct iparser_number*)(node->l))->value = v; @@ -633,14 +633,14 @@ iparser_ast_optimize (struct iparser_node* node) { if (node->l->l->type == IPARSER_NUMBER) { // (#ll + node_lr) - #r -> node_lr - (#r - #ll), same type - int v = ((struct iparser_number*)(node->r))->value + auto v= ((struct iparser_number*)(node->r))->value - ((struct iparser_number*)(node->l->l))->value; node->l = node->l->r; ((struct iparser_number*)(node->r))->value = v; } else if (node->l->r->type == IPARSER_NUMBER) { // (node_ll + #lr) - #r -> node_ll - (#r - #lr), same type - int v = ((struct iparser_number*)(node->r))->value + auto v= ((struct iparser_number*)(node->r))->value - ((struct iparser_number*)(node->l->r))->value; node->l = node->l->l; ((struct iparser_number*)(node->r))->value = v; @@ -651,7 +651,7 @@ iparser_ast_optimize (struct iparser_node* node) { if (node->l->l->type == IPARSER_NUMBER) { // (#ll - node_lr) - #r -> (#ll - #r) - node_lr, type change - int v = ((struct iparser_number*)(node->l->l))->value + auto v= ((struct iparser_number*)(node->l->l))->value - ((struct iparser_number*)(node->r))->value; node->r = node->l->r; node->l->type = IPARSER_NUMBER; @@ -659,7 +659,7 @@ iparser_ast_optimize (struct iparser_node* node) } else if (node->l->r->type == IPARSER_NUMBER) { // (node_ll - #lr) - #r -> node_ll - (#r + #lr), same type - int v = ((struct iparser_number*)(node->r))->value + auto v= ((struct iparser_number*)(node->r))->value + ((struct iparser_number*)(node->l->r))->value; node->l = node->l->l; ((struct iparser_number*)(node->r))->value = v; @@ -673,7 +673,7 @@ iparser_ast_optimize (struct iparser_node* node) if (node->l->type == IPARSER_NUMBER && node->r->type == IPARSER_NUMBER) { - int v = ((struct iparser_number*)(node->l))->value + auto v= ((struct iparser_number*)(node->l))->value * 
((struct iparser_number*)(node->r))->value; ((struct iparser_number*)node)->type = IPARSER_NUMBER; ((struct iparser_number*)node)->value = v; @@ -703,14 +703,14 @@ iparser_ast_optimize (struct iparser_node* node) else if (node->l->type == IPARSER_NUMBER && node->r->type == IPARSER_MUL_VP) { - int v = ((struct iparser_number*)(node->l))->value * IPARSER_EVAL_R(node); + auto v= ((struct iparser_number*)(node->l))->value * IPARSER_EVAL_R(node); IPARSER_MOVEUP_R(node, v); node->type = IPARSER_MUL_VP; } else if (node->l->type == IPARSER_MUL_VP && node->r->type == IPARSER_NUMBER) { - int v = IPARSER_EVAL_L(node) * ((struct iparser_number*)(node->r))->value; + auto v= IPARSER_EVAL_L(node) * ((struct iparser_number*)(node->r))->value; IPARSER_MOVEUP_L(node, v); node->type = IPARSER_MUL_VP; } @@ -719,14 +719,14 @@ iparser_ast_optimize (struct iparser_node* node) { if (node->r->l->type == IPARSER_NUMBER) { // #l * (#rl * node_rr) -> (#l * #rl) * node_rr, same type - int v = ((struct iparser_number*)(node->l))->value + auto v= ((struct iparser_number*)(node->l))->value * ((struct iparser_number*)(node->r->l))->value; node->r = node->r->r; ((struct iparser_number*)(node->l))->value = v; } else if (node->r->r->type == IPARSER_NUMBER) { // #l * (node_rl * #rr) -> (#l * #rr) * node_rl, same type - int v = ((struct iparser_number*)(node->l))->value + auto v= ((struct iparser_number*)(node->l))->value * ((struct iparser_number*)(node->r->r))->value; node->r = node->r->l; ((struct iparser_number*)(node->l))->value = v; @@ -737,14 +737,14 @@ iparser_ast_optimize (struct iparser_node* node) { if (node->l->l->type == IPARSER_NUMBER) { // (#ll * node_lr) * #r -> nodel_lr * (#ll * #r), same type - int v = ((struct iparser_number*)(node->l->l))->value + auto v= ((struct iparser_number*)(node->l->l))->value * ((struct iparser_number*)(node->r))->value; node->l = node->l->r; ((struct iparser_number*)(node->r))->value = v; } else if (node->l->r->type == IPARSER_NUMBER) { // (node_ll * #lr) * #r -> node_ll + (#lr * #r), same type - int v = ((struct iparser_number*)(node->l->r))->value + auto v= ((struct iparser_number*)(node->l->r))->value * ((struct iparser_number*)(node->r))->value; node->l = node->l->l; ((struct iparser_number*)(node->r))->value = v; @@ -758,7 +758,7 @@ iparser_ast_optimize (struct iparser_node* node) if (node->l->type == IPARSER_NUMBER && node->r->type == IPARSER_NUMBER) { - int v = ((struct iparser_number*)(node->l))->value + auto v= ((struct iparser_number*)(node->l))->value / ((struct iparser_number*)(node->r))->value; ((struct iparser_number*)node)->type = IPARSER_NUMBER; ((struct iparser_number*)node)->value = v; @@ -790,7 +790,7 @@ iparser_ast_optimize (struct iparser_node* node) iparser_ast_optimize(node->l); if (node->l->type == IPARSER_NUMBER) { - int v = -((struct iparser_number*)(node->l))->value; + auto v= -((struct iparser_number*)(node->l))->value; ((struct iparser_number*)node)->type = IPARSER_NUMBER; ((struct iparser_number*)node)->value = v; } @@ -876,7 +876,7 @@ iparser_ast_optimize (struct iparser_node* node) iparser_ast_optimize(node->l); if (node->l->type == IPARSER_NUMBER) { - int v = iparser_call_f1 + auto v= iparser_call_f1 (((struct iparser_f1*)node)->ftype, ((struct iparser_number*)(((struct iparser_f1*)node)->l))->value); ((struct iparser_number*)node)->type = IPARSER_NUMBER; @@ -889,7 +889,7 @@ iparser_ast_optimize (struct iparser_node* node) if (node->l->type == IPARSER_NUMBER && node->r->type == IPARSER_NUMBER) { - int v = iparser_call_f2 + auto v= iparser_call_f2 
(((struct iparser_f2*)node)->ftype, ((struct iparser_number*)(((struct iparser_f2*)node)->l))->value, ((struct iparser_number*)(((struct iparser_f2*)node)->r))->value); @@ -905,7 +905,7 @@ iparser_ast_optimize (struct iparser_node* node) ((struct iparser_f3*)node)->n2->type == IPARSER_NUMBER && ((struct iparser_f3*)node)->n3->type == IPARSER_NUMBER) { - int v = iparser_call_f3 + auto v= iparser_call_f3 (((struct iparser_f3*)node)->ftype, ((struct iparser_number*)(((struct iparser_f3*)node)->n1))->value, ((struct iparser_number*)(((struct iparser_f3*)node)->n2))->value, @@ -918,7 +918,7 @@ iparser_ast_optimize (struct iparser_node* node) iparser_ast_optimize(node->r); if (node->r->type == IPARSER_NUMBER) { - int v = node->lvp.v + ((struct iparser_number*)(node->r))->value; + auto v= node->lvp.v + ((struct iparser_number*)(node->r))->value; ((struct iparser_number*)node)->type = IPARSER_NUMBER; ((struct iparser_number*)node)->value = v; } @@ -927,7 +927,7 @@ iparser_ast_optimize (struct iparser_node* node) iparser_ast_optimize(node->r); if (node->r->type == IPARSER_NUMBER) { - int v = node->lvp.v - ((struct iparser_number*)(node->r))->value; + auto v= node->lvp.v - ((struct iparser_number*)(node->r))->value; ((struct iparser_number*)node)->type = IPARSER_NUMBER; ((struct iparser_number*)node)->value = v; } @@ -936,7 +936,7 @@ iparser_ast_optimize (struct iparser_node* node) iparser_ast_optimize(node->r); if (node->r->type == IPARSER_NUMBER) { - int v = node->lvp.v * ((struct iparser_number*)(node->r))->value; + auto v= node->lvp.v * ((struct iparser_number*)(node->r))->value; ((struct iparser_number*)node)->type = IPARSER_NUMBER; ((struct iparser_number*)node)->value = v; } @@ -945,7 +945,7 @@ iparser_ast_optimize (struct iparser_node* node) iparser_ast_optimize(node->r); if (node->r->type == IPARSER_NUMBER) { - int v = node->lvp.v / ((struct iparser_number*)(node->r))->value; + auto v= node->lvp.v / ((struct iparser_number*)(node->r))->value; ((struct iparser_number*)node)->type = IPARSER_NUMBER; ((struct iparser_number*)node)->value = v; } @@ -954,7 +954,7 @@ iparser_ast_optimize (struct iparser_node* node) iparser_ast_optimize(node->r); if (node->r->type == IPARSER_NUMBER) { - int v = ((struct iparser_number*)(node->r))->value / node->lvp.v; + auto v= ((struct iparser_number*)(node->r))->value / node->lvp.v; ((struct iparser_number*)node)->type = IPARSER_NUMBER; ((struct iparser_number*)node)->value = v; } @@ -963,7 +963,7 @@ iparser_ast_optimize (struct iparser_node* node) iparser_ast_optimize(node->l); if (node->l->type == IPARSER_NUMBER) { - int v = -((struct iparser_number*)(node->l))->value; + auto v= -((struct iparser_number*)(node->l))->value; ((struct iparser_number*)node)->type = IPARSER_NUMBER; ((struct iparser_number*)node)->value = v; } @@ -1280,7 +1280,7 @@ iparser_ast_regvar (struct iparser_node* node, char const* name, int i) } } -void iparser_ast_setconst (struct iparser_node* node, char const* name, int c) +void iparser_ast_setconst (struct iparser_node* node, char const* name, long long c) { switch (node->type) { @@ -1396,7 +1396,7 @@ iparser_regvar (struct amrex_iparser* iparser, char const* name, int i) } void -iparser_setconst (struct amrex_iparser* iparser, char const* name, int c) +iparser_setconst (struct amrex_iparser* iparser, char const* name, long long c) { iparser_ast_setconst(iparser->ast, name, c); iparser_ast_optimize(iparser->ast); @@ -1427,4 +1427,39 @@ iparser_depth (struct amrex_iparser* iparser) return iparser_ast_depth(iparser->ast); } +long long 
+iparser_atoll (const char* str) +{ + std::string s(str); + s.erase(std::remove(s.begin(), s.end(), '\''), s.end()); + + auto pos_E = s.find('E'); + if (pos_E != std::string::npos) { + s[pos_E] = 'e'; + } + auto pos_e = s.find('e'); + if (pos_e != std::string::npos) { + std::string part_1 = s.substr(0, pos_e); + int ex = std::atoi(s.c_str()+pos_e+1); + auto pos_dot = part_1.find('.'); + if (pos_dot != std::string::npos) { + // iparser'number does not have more than one dot. + ex -= static_cast(part_1.size()-(pos_dot+1)); + part_1.erase(pos_dot,1); + } + if (ex < 0) { + throw std::runtime_error(std::string(str) + " is not an integer"); + } + part_1.resize(part_1.size()+ex,'0'); + return std::atoll(part_1.c_str()); + } else { + auto pos_dot = s.find('.'); + if (pos_dot != std::string::npos && pos_dot+1 < s.size()) { + throw std::runtime_error(std::string(str) + " is not an integer"); + } + // Note that atoll works as expected for numbers ending with `.` like `123.`. + return std::atoll(s.c_str()); + } +} + } diff --git a/Src/Base/Parser/amrex_iparser.l b/Src/Base/Parser/amrex_iparser.l index 83c29a0ec62..15f19b90525 100644 --- a/Src/Base/Parser/amrex_iparser.l +++ b/Src/Base/Parser/amrex_iparser.l @@ -15,6 +15,9 @@ /* Tokens NUMBER, SYMBOL, F1, POW, F2, etc. are defined in amrex_iparser.y. */ /* Types IPARSER_ABS, IPARSER_MIN etc. are defined in AMReX_IParser_y.H. */ + /* Used later to define NUMBER */ +EXP ([Ee][0-9]+) + %% "+" | @@ -48,10 +51,12 @@ "if" { amrex_iparserlval.f3 = amrex::IPARSER_IF; return F3; } /* We use SYMBOL to hold variables and constants */ -[a-zA-Z_][a-zA-Z0-9_]* { amrex_iparserlval.s = amrex::iparser_makesymbol(amrex_iparsertext); return SYMBOL; } +[a-zA-Z_][a-zA-Z0-9_\.]* { amrex_iparserlval.s = amrex::iparser_makesymbol(amrex_iparsertext); return SYMBOL; } /* Number */ -[0-9]+ { amrex_iparserlval.d = std::atoi(amrex_iparsertext); return NUMBER; } +[0-9][0-9']* | +[0-9]+"."[0-9]*{EXP}? | +"."?[0-9]+{EXP}? { amrex_iparserlval.d = amrex::iparser_atoll(amrex_iparsertext); return NUMBER; } /* Special characters */ [ \t] /* ignore white space */ diff --git a/Src/Base/Parser/amrex_iparser.lex.nolint.H b/Src/Base/Parser/amrex_iparser.lex.nolint.H index 0d062f79b64..411c4210b8a 100644 --- a/Src/Base/Parser/amrex_iparser.lex.nolint.H +++ b/Src/Base/Parser/amrex_iparser.lex.nolint.H @@ -611,8 +611,8 @@ static void yynoreturn yy_fatal_error ( const char* msg ); (yy_hold_char) = *yy_cp; \ *yy_cp = '\0'; \ (yy_c_buf_p) = yy_cp; -#define YY_NUM_RULES 32 -#define YY_END_OF_BUFFER 33 +#define YY_NUM_RULES 34 +#define YY_END_OF_BUFFER 35 /* This struct is not used in this scanner, but its presence is necessary. 
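With iparser_atoll and the new lexer rules above, integer expressions can use digit group separators (1'000'000) and non-negative exponents (2e9), and values are carried as long long instead of int. A hedged sketch of the user-facing effect, assuming IParser keeps its registerVariables/compile<N>() executor interface and that AMReX has been initialized:

    #include <AMReX_IParser.H>

    void iparser_example ()
    {
        amrex::IParser ip("2e9 + n*1'000'000");  // both literals now lex as integers via iparser_atoll
        ip.registerVariables({"n"});
        auto exe = ip.compile<1>();
        long long v = exe(3);                    // 2'000'000'000 + 3'000'000
        (void) v;                                // silence unused-variable warning
    }
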
*/ struct yy_trans_info @@ -620,13 +620,14 @@ struct yy_trans_info flex_int32_t yy_verify; flex_int32_t yy_nxt; }; -static const flex_int16_t yy_accept[50] = +static const flex_int16_t yy_accept[59] = { 0, - 0, 0, 33, 31, 28, 30, 31, 10, 11, 3, - 1, 6, 2, 4, 27, 9, 7, 5, 8, 26, - 31, 16, 26, 26, 26, 26, 20, 15, 14, 27, - 18, 19, 17, 26, 29, 26, 26, 26, 25, 26, - 26, 22, 12, 21, 26, 24, 23, 13, 0 + 0, 0, 35, 33, 30, 32, 33, 10, 11, 3, + 1, 6, 2, 33, 4, 27, 9, 7, 5, 8, + 26, 33, 16, 26, 26, 26, 26, 20, 15, 29, + 14, 27, 28, 27, 0, 18, 19, 17, 26, 31, + 26, 26, 26, 25, 26, 26, 22, 28, 0, 29, + 12, 21, 26, 24, 23, 28, 13, 0 } ; static const YY_CHAR yy_ec[256] = @@ -634,17 +635,17 @@ static const YY_CHAR yy_ec[256] = 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 2, 4, 1, 1, 1, 1, 1, 1, 5, - 6, 7, 8, 9, 10, 1, 11, 12, 12, 12, - 12, 12, 12, 12, 12, 12, 12, 1, 13, 14, - 15, 16, 1, 1, 17, 17, 17, 17, 17, 17, - 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, - 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, - 1, 18, 1, 19, 17, 1, 20, 21, 17, 22, - - 17, 23, 17, 17, 24, 17, 17, 17, 25, 26, - 27, 17, 17, 28, 29, 17, 17, 17, 17, 30, - 17, 17, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 4, 1, 1, 1, 1, 1, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 1, 15, 16, + 17, 18, 1, 1, 19, 19, 19, 19, 20, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 1, 21, 1, 22, 19, 1, 23, 24, 19, 25, + + 20, 26, 19, 19, 27, 19, 19, 19, 28, 29, + 30, 19, 19, 31, 32, 19, 19, 19, 19, 33, + 19, 19, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -661,55 +662,64 @@ static const YY_CHAR yy_ec[256] = 1, 1, 1, 1, 1 } ; -static const YY_CHAR yy_meta[31] = +static const YY_CHAR yy_meta[34] = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + 1, 2, 1, 2, 1, 1, 1, 1, 2, 2, + 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2 } ; -static const flex_int16_t yy_base[51] = +static const flex_int16_t yy_base[60] = { 0, - 0, 0, 54, 55, 55, 55, 38, 55, 55, 45, - 55, 55, 55, 40, 38, 55, 34, 33, 32, 0, - 43, 55, 10, 12, 13, 17, 55, 55, 55, 32, - 55, 55, 55, 0, 55, 14, 20, 20, 0, 10, - 13, 0, 0, 0, 9, 0, 0, 0, 55, 32 + 0, 0, 76, 77, 77, 77, 58, 77, 77, 66, + 77, 77, 77, 59, 59, 29, 77, 54, 53, 52, + 0, 65, 77, 11, 13, 15, 36, 77, 77, 30, + 77, 32, 31, 42, 52, 77, 77, 77, 0, 77, + 33, 39, 39, 0, 28, 31, 0, 38, 45, 43, + 0, 0, 23, 0, 0, 39, 0, 77, 46 } ; -static const flex_int16_t yy_def[51] = +static const flex_int16_t yy_def[60] = { 0, - 49, 1, 49, 49, 49, 49, 49, 49, 49, 49, - 49, 49, 49, 49, 49, 49, 49, 49, 49, 50, - 49, 49, 50, 50, 50, 50, 49, 49, 49, 49, - 49, 49, 49, 50, 49, 50, 50, 50, 50, 50, - 50, 50, 50, 50, 50, 50, 50, 50, 0, 49 + 58, 1, 58, 58, 58, 58, 58, 58, 58, 58, + 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, + 59, 58, 58, 59, 59, 59, 59, 58, 58, 58, + 58, 58, 58, 58, 58, 58, 58, 58, 59, 58, + 59, 59, 59, 59, 59, 59, 59, 58, 58, 58, + 59, 59, 59, 59, 59, 58, 59, 0, 58 } ; -static const flex_int16_t yy_nxt[86] = +static const flex_int16_t yy_nxt[111] = { 0, - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 20, 20, 20, 24, 25, 20, 26, 20, 20, 20, - 36, 38, 40, 34, 39, 37, 41, 48, 47, 46, - 45, 44, 43, 30, 42, 35, 33, 32, 31, 30, - 29, 28, 27, 49, 3, 49, 49, 49, 49, 49, - 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, - 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, - 49, 49, 49, 49, 49 + 4, 5, 6, 7, 4, 8, 9, 10, 11, 12, + 13, 
14, 15, 16, 17, 18, 19, 20, 21, 21, + 22, 23, 24, 21, 21, 21, 25, 26, 21, 27, + 21, 21, 21, 32, 41, 43, 32, 45, 44, 42, + 33, 46, 34, 30, 48, 32, 32, 39, 35, 35, + 49, 48, 56, 33, 57, 34, 50, 49, 56, 55, + 54, 35, 53, 52, 51, 50, 47, 40, 38, 37, + 36, 31, 30, 29, 28, 58, 3, 58, 58, 58, + 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, + 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, + + 58, 58, 58, 58, 58, 58, 58, 58, 58, 58 } ; -static const flex_int16_t yy_chk[86] = +static const flex_int16_t yy_chk[111] = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 23, 24, 25, 50, 24, 23, 25, 45, 41, 40, - 38, 37, 36, 30, 26, 21, 19, 18, 17, 15, - 14, 10, 7, 3, 49, 49, 49, 49, 49, 49, - 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, - 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, - 49, 49, 49, 49, 49 + 1, 1, 1, 16, 24, 25, 32, 26, 25, 24, + 16, 26, 16, 30, 33, 32, 34, 59, 16, 30, + 33, 48, 56, 34, 53, 34, 50, 48, 49, 46, + 45, 34, 43, 42, 41, 35, 27, 22, 20, 19, + 18, 15, 14, 10, 7, 3, 58, 58, 58, 58, + 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, + 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, + + 58, 58, 58, 58, 58, 58, 58, 58, 58, 58 } ; static yy_state_type yy_last_accepting_state; @@ -734,6 +744,7 @@ char *yytext; #define YY_NO_INPUT 1 /* Tokens NUMBER, SYMBOL, F1, POW, F2, etc. are defined in amrex_iparser.y. */ /* Types IPARSER_ABS, IPARSER_MIN etc. are defined in AMReX_IParser_y.H. */ + /* Used later to define NUMBER */ #define INITIAL 0 @@ -974,13 +985,13 @@ yy_match: while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) { yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 50 ) + if ( yy_current_state >= 59 ) yy_c = yy_meta[yy_c]; } yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; ++yy_cp; } - while ( yy_current_state != 49 ); + while ( yy_current_state != 58 ); yy_cp = (yy_last_accepting_cpos); yy_current_state = (yy_last_accepting_state); @@ -1079,30 +1090,32 @@ YY_RULE_SETUP YY_BREAK /* Number */ case 27: +case 28: +case 29: YY_RULE_SETUP -{ amrex_iparserlval.d = std::atoi(amrex_iparsertext); return NUMBER; } +{ amrex_iparserlval.d = amrex::iparser_atoll(amrex_iparsertext); return NUMBER; } YY_BREAK /* Special characters */ -case 28: +case 30: YY_RULE_SETUP /* ignore white space */ YY_BREAK -case 29: -/* rule 29 can match eol */ +case 31: +/* rule 31 can match eol */ YY_RULE_SETUP /* ignore line continuation */ YY_BREAK -case 30: -/* rule 30 can match eol */ +case 32: +/* rule 32 can match eol */ YY_RULE_SETUP { return EOL; } YY_BREAK /* everything else */ -case 31: +case 33: YY_RULE_SETUP { amrex_iparsererror("Unknown character %c", *amrex_iparsertext); } YY_BREAK -case 32: +case 34: YY_RULE_SETUP YY_FATAL_ERROR( "flex scanner jammed" ); YY_BREAK @@ -1403,7 +1416,7 @@ static int yy_get_next_buffer (void) while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) { yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 50 ) + if ( yy_current_state >= 59 ) yy_c = yy_meta[yy_c]; } yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; @@ -1431,11 +1444,11 @@ static int yy_get_next_buffer (void) while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) { yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 50 ) + if ( yy_current_state >= 59 ) yy_c = yy_meta[yy_c]; } yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; - yy_is_jam = (yy_current_state == 49); + yy_is_jam = (yy_current_state == 58); return yy_is_jam ? 
0 : yy_current_state; } diff --git a/Src/Base/Parser/amrex_iparser.tab.h b/Src/Base/Parser/amrex_iparser.tab.h index 58d607b3415..a72ba4aeed2 100644 --- a/Src/Base/Parser/amrex_iparser.tab.h +++ b/Src/Base/Parser/amrex_iparser.tab.h @@ -89,7 +89,7 @@ union AMREX_IPARSERSTYPE { struct amrex::iparser_node* n; - int d; + long long d; struct amrex::iparser_symbol* s; enum amrex::iparser_f1_t f1; enum amrex::iparser_f2_t f2; diff --git a/Src/Base/Parser/amrex_iparser.y b/Src/Base/Parser/amrex_iparser.y index a079c55698b..2cfd83d07af 100644 --- a/Src/Base/Parser/amrex_iparser.y +++ b/Src/Base/Parser/amrex_iparser.y @@ -26,7 +26,7 @@ int amrex_iparserlex (void); */ %union { struct amrex::iparser_node* n; - int d; + long long d; struct amrex::iparser_symbol* s; enum amrex::iparser_f1_t f1; enum amrex::iparser_f2_t f2; diff --git a/Src/Base/Parser/amrex_parser.l b/Src/Base/Parser/amrex_parser.l index 1854376ffaf..e619bd077f2 100644 --- a/Src/Base/Parser/amrex_parser.l +++ b/Src/Base/Parser/amrex_parser.l @@ -76,7 +76,7 @@ EXP ([Ee][-+]?[0-9]+) "if" { amrex_parserlval.f3 = amrex::PARSER_IF; return F3; } /* We use SYMBOL to hold variables and constants */ -[a-zA-Z_][a-zA-Z0-9_]* { amrex_parserlval.s = amrex::parser_makesymbol(amrex_parsertext); return SYMBOL; } +[a-zA-Z_][a-zA-Z0-9_\.]* { amrex_parserlval.s = amrex::parser_makesymbol(amrex_parsertext); return SYMBOL; } /* Number */ [0-9]+"."[0-9]*{EXP}? | diff --git a/Src/Base/Parser/amrex_parser.lex.nolint.H b/Src/Base/Parser/amrex_parser.lex.nolint.H index cc4021e53c7..a3cb9ad8830 100644 --- a/Src/Base/Parser/amrex_parser.lex.nolint.H +++ b/Src/Base/Parser/amrex_parser.lex.nolint.H @@ -674,7 +674,7 @@ static const YY_CHAR yy_ec[256] = static const YY_CHAR yy_meta[48] = { 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 3, 3, 3, 3, 1, 1, 1, 1, + 3, 1, 3, 3, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 diff --git a/Src/Boundary/Make.package b/Src/Boundary/Make.package index 7dae7ec913f..009591b2b47 100644 --- a/Src/Boundary/Make.package +++ b/Src/Boundary/Make.package @@ -1,3 +1,5 @@ +ifndef AMREX_BOUNDARY_MAKE + AMREX_BOUNDARY_MAKE := 1 CEXE_sources += AMReX_Mask.cpp AMReX_MultiMask.cpp AMReX_LO_BCTYPES.cpp @@ -27,3 +29,4 @@ endif VPATH_LOCATIONS += $(AMREX_HOME)/Src/Boundary INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/Boundary +endif diff --git a/Src/CMakeLists.txt b/Src/CMakeLists.txt index 56cd426efb9..6e8af043e0d 100644 --- a/Src/CMakeLists.txt +++ b/Src/CMakeLists.txt @@ -43,7 +43,6 @@ include(AMReXParallelBackends) # Add definitions # include(AMReXSetDefines) - # # Find and link third party libraries if needed # @@ -188,7 +187,7 @@ if (AMReX_SENSEI) add_subdirectory(Extern/SENSEI) endif () -if (AMReX_CONDUIT OR AMReX_ASCENT) +if (AMReX_CONDUIT OR AMReX_ASCENT OR AMReX_CATALYST) add_subdirectory(Extern/Conduit) endif () diff --git a/Src/EB/AMReX_EB2.cpp b/Src/EB/AMReX_EB2.cpp index 87ecb9c4392..f99eb504d2f 100644 --- a/Src/EB/AMReX_EB2.cpp +++ b/Src/EB/AMReX_EB2.cpp @@ -214,11 +214,11 @@ Build (const Geometry& geom, int required_coarsening_level, pp.queryAdd("stl_scale", stl_scale); std::vector stl_center{0.0_rt, 0.0_rt, 0.0_rt}; pp.queryAdd("stl_center", stl_center); - int stl_reverse_normal = 0; + bool stl_reverse_normal = false; pp.queryAdd("stl_reverse_normal", stl_reverse_normal); IndexSpace::push(new IndexSpaceSTL(stl_file, stl_scale, // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) {stl_center[0], stl_center[1], stl_center[2]}, - stl_reverse_normal, + int(stl_reverse_normal), geom, 
required_coarsening_level, max_coarsening_level, ngrow, build_coarse_level_by_coarsening, diff --git a/Src/EB/AMReX_EB2_GeometryShop.H b/Src/EB/AMReX_EB2_GeometryShop.H index 1be28d536c4..12789310057 100644 --- a/Src/EB/AMReX_EB2_GeometryShop.H +++ b/Src/EB/AMReX_EB2_GeometryShop.H @@ -241,7 +241,7 @@ public: { const auto& problo = geom.ProbLoArray(); const auto& dx = geom.CellSizeArray(); - auto f = m_f; + auto const& f = m_f; ReduceOps reduce_op; ReduceData reduce_data(reduce_op); using ReduceTuple = typename decltype(reduce_data)::Type; @@ -297,13 +297,13 @@ public: void fillFab (BaseFab& levelset, const Geometry& geom, RunOn run_on, Box const& bounding_box) const noexcept { - const auto problo = geom.ProbLoArray(); - const auto dx = geom.CellSizeArray(); + const auto& problo = geom.ProbLoArray(); + const auto& dx = geom.CellSizeArray(); const Box& bx = levelset.box(); const auto& a = levelset.array(); - const auto blo = amrex::lbound(bounding_box); - const auto bhi = amrex::ubound(bounding_box); - auto f = m_f; + const auto& blo = amrex::lbound(bounding_box); + const auto& bhi = amrex::ubound(bounding_box); + const auto& f = m_f; AMREX_HOST_DEVICE_FOR_3D_FLAG(run_on, bx, i, j, k, { a(i,j,k) = f(AMREX_D_DECL(problo[0]+amrex::Clamp(i,blo.x,bhi.x)*dx[0], @@ -335,11 +335,11 @@ public: void fillFab_Cpu (BaseFab& levelset, const Geometry& geom, Box const& bounding_box) const noexcept { - const auto problo = geom.ProbLoArray(); - const auto dx = geom.CellSizeArray(); + const auto& problo = geom.ProbLoArray(); + const auto& dx = geom.CellSizeArray(); const Box& bx = levelset.box(); - const auto blo = amrex::lbound(bounding_box); - const auto bhi = amrex::ubound(bounding_box); + const auto& blo = amrex::lbound(bounding_box); + const auto& bhi = amrex::ubound(bounding_box); const auto& a = levelset.array(); amrex::LoopOnCpu(bx, [&] (int i, int j, int k) noexcept @@ -359,9 +359,9 @@ public: { auto const& dx = geom.CellSizeArray(); auto const& problo = geom.ProbLoArray(); - const auto blo = amrex::lbound(bounding_box); - const auto bhi = amrex::ubound(bounding_box); - auto f = m_f; + const auto& blo = amrex::lbound(bounding_box); + const auto& bhi = amrex::ubound(bounding_box); + auto const& f = m_f; for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { Array4 const& inter = inter_arr[idim]; Array4 const& type = type_arr[idim]; @@ -430,8 +430,8 @@ public: { auto const& dx = geom.CellSizeArray(); auto const& problo = geom.ProbLoArray(); - const auto blo = amrex::lbound(bounding_box); - const auto bhi = amrex::ubound(bounding_box); + const auto& blo = amrex::lbound(bounding_box); + const auto& bhi = amrex::ubound(bounding_box); const Box bx{inter}; amrex::LoopOnCpu(bx, [&] (int i, int j, int k) noexcept { diff --git a/Src/EB/AMReX_EB2_MultiGFab.cpp b/Src/EB/AMReX_EB2_MultiGFab.cpp index 8d09ca7c069..8461b5618d4 100644 --- a/Src/EB/AMReX_EB2_MultiGFab.cpp +++ b/Src/EB/AMReX_EB2_MultiGFab.cpp @@ -1,7 +1,6 @@ #include #include -#include namespace amrex::EB2 { diff --git a/Src/EB/AMReX_EBDataCollection.cpp b/Src/EB/AMReX_EBDataCollection.cpp index 141f0614629..f9b3d66c79c 100644 --- a/Src/EB/AMReX_EBDataCollection.cpp +++ b/Src/EB/AMReX_EBDataCollection.cpp @@ -208,8 +208,8 @@ void EBDataCollection::extendDataOutsideDomain (IntVect const& level_ng) if (apbx.bigEnd(idim) == nbx.bigEnd(idim)) { apbx.growHi(idim,-1); } - auto lev_apidim_domain = lev_ap_domain[idim]; - Dim3 off = IntVect::TheDimensionVector(idim).dim3(); + auto const& lev_apidim_domain = lev_ap_domain[idim]; + Dim3 const& off = 
IntVect::TheDimensionVector(idim).dim3(); amrex::ParallelFor(apbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) { diff --git a/Src/EB/AMReX_WriteEBSurface.cpp b/Src/EB/AMReX_WriteEBSurface.cpp index f26420cdcc7..2e94ccb7ffb 100644 --- a/Src/EB/AMReX_WriteEBSurface.cpp +++ b/Src/EB/AMReX_WriteEBSurface.cpp @@ -1,5 +1,4 @@ #include -#include #include #include #include diff --git a/Src/EB/AMReX_distFcnElement.H b/Src/EB/AMReX_distFcnElement.H index 1d7c2ef0597..d0c7b867138 100644 --- a/Src/EB/AMReX_distFcnElement.H +++ b/Src/EB/AMReX_distFcnElement.H @@ -18,11 +18,11 @@ class distFcnElement2d { // NOLINT(cppcoreguidelines-special-member-functions) [[nodiscard]] virtual amrex::Real cpdist(amrex::RealVect pt, amrex::RealVect& cp) const = 0; [[nodiscard]] virtual amrex::Real cpside(amrex::RealVect pt, amrex::RealVect& cp) const = 0; - static int solve_thomas(const std::vector &a, - std::vector b, - const std::vector &c, - std::vector d, - std::vector &X); + static int solve_thomas (const std::vector &a, + std::vector &b, + const std::vector &c, + std::vector &d, + std::vector &X); }; diff --git a/Src/EB/AMReX_distFcnElement.cpp b/Src/EB/AMReX_distFcnElement.cpp index b6040d30f53..c72bb6af8f9 100644 --- a/Src/EB/AMReX_distFcnElement.cpp +++ b/Src/EB/AMReX_distFcnElement.cpp @@ -6,11 +6,11 @@ namespace amrex { -int distFcnElement2d::solve_thomas(const std::vector &a, - std::vector b, - const std::vector &c, - std::vector d, - std::vector &x) +int distFcnElement2d::solve_thomas (const std::vector &a, + std::vector &b, + const std::vector &c, + std::vector &d, + std::vector &x) { int n = static_cast(d.size()); x.resize(n); diff --git a/Src/Extern/Conduit/AMReX_Conduit_Blueprint_ParticlesI.H b/Src/Extern/Conduit/AMReX_Conduit_Blueprint_ParticlesI.H index e4186ba247b..67725cba876 100644 --- a/Src/Extern/Conduit/AMReX_Conduit_Blueprint_ParticlesI.H +++ b/Src/Extern/Conduit/AMReX_Conduit_Blueprint_ParticlesI.H @@ -69,14 +69,14 @@ ParticleTileToBlueprint(const ParticleTile(&soa.GetRealData(0)[0]), + n_coords["values/x"].set_external(const_cast(soa.GetRealData(0).data()), num_particles); #if AMREX_SPACEDIM > 1 - n_coords["values/y"].set_external(const_cast(&soa.GetRealData(1)[0]), + n_coords["values/y"].set_external(const_cast(soa.GetRealData(1).data()), num_particles); #endif #if AMREX_SPACEDIM > 2 - n_coords["values/z"].set_external(const_cast(&soa.GetRealData(2)[0]), + n_coords["values/z"].set_external(const_cast(soa.GetRealData(2).data()), num_particles); #endif } else @@ -175,22 +175,12 @@ ParticleTileToBlueprint(const ParticleTile(&soa.GetIntData(0)[0]), - num_particles); - - // cpu is the second int entry - conduit::Node &n_f_cpu = n_fields[topology_name + "_cpu"]; - - n_f_cpu["topology"] = topology_name; - n_f_cpu["association"] = "element"; - n_f_cpu["values"].set_external(const_cast(&soa.GetIntData(0)[0]), - num_particles); + conduit::Node &n_f_idcpu = n_fields[topology_name + "_idcpu"]; + n_f_idcpu["topology"] = topology_name; + n_f_idcpu["association"] = "element"; + n_f_idcpu["values"].set_external(const_cast(soa.GetIdCPUData().data()), + num_particles); } // -------------------------------- @@ -232,7 +222,7 @@ ParticleTileToBlueprint(const ParticleTile(&soa.GetRealData(i)[0]), + n_f["values"].set_external(const_cast(soa.GetRealData(i).data()), num_particles); vname_real_idx++; @@ -244,7 +234,7 @@ ParticleTileToBlueprint(const ParticleTile(&soa.GetIntData(i)[0]), + n_f["values"].set_external(const_cast(soa.GetIntData(i).data()), num_particles); vname_int_idx++; diff --git 
a/Src/Extern/Conduit/CMakeLists.txt b/Src/Extern/Conduit/CMakeLists.txt index 75e84099f1d..07c4722ac4b 100644 --- a/Src/Extern/Conduit/CMakeLists.txt +++ b/Src/Extern/Conduit/CMakeLists.txt @@ -4,6 +4,7 @@ foreach(D IN LISTS AMReX_SPACEDIM) # TODO: Particles PR merges another file target_sources(amrex_${D}d PRIVATE + AMReX_Conduit_Blueprint_ParticlesI.H AMReX_Conduit_Blueprint.H AMReX_Conduit_Blueprint.cpp ) diff --git a/Src/Extern/Conduit/Make.package b/Src/Extern/Conduit/Make.package index 2fc13b7d606..def60ba1a74 100644 --- a/Src/Extern/Conduit/Make.package +++ b/Src/Extern/Conduit/Make.package @@ -3,7 +3,7 @@ # CEXE_sources += AMReX_Conduit_Blueprint.cpp - +CEXE_headers += AMReX_Conduit_Blueprint_ParticlesI.H CEXE_headers += AMReX_Conduit_Blueprint.H VPATH_LOCATIONS += $(AMREX_HOME)/Src/Extern/Conduit diff --git a/Src/Extern/HDF5/AMReX_ParticleHDF5.H b/Src/Extern/HDF5/AMReX_ParticleHDF5.H index 00fdf0c14be..a1b9d8faeb8 100644 --- a/Src/Extern/HDF5/AMReX_ParticleHDF5.H +++ b/Src/Extern/HDF5/AMReX_ParticleHDF5.H @@ -504,7 +504,7 @@ ParticleContainer_impl -1) { + if (chunk_size > total_int_size) { + H5Pset_chunk(dcpl_int, 1, &total_int_size); + } + } + } + int_dset_space = H5Screate_simple(1, &total_int_size, NULL); #ifdef AMREX_USE_HDF5_ASYNC int_dset_id = H5Dcreate_async(grp, "data:datatype=0", H5T_NATIVE_INT, int_dset_space, H5P_DEFAULT, dcpl_int, H5P_DEFAULT, es_par_g); @@ -808,6 +809,14 @@ ParticleContainer_impl -1) { + if (chunk_size > total_real_size) { + H5Pset_chunk(dcpl_real, 1, &total_real_size); + } + } + } + real_dset_space = H5Screate_simple(1, &total_real_size, NULL); if (sizeof(typename ParticleType::RealType) == 4) { #ifdef AMREX_USE_HDF5_ASYNC @@ -837,7 +846,7 @@ ParticleContainer_impl hostfab; + if (fab.arena()->isManaged() || fab.arena()->isDevice()) { + hostfab = std::make_unique(fab.box(), fab.nComp(), + The_Pinned_Arena()); + Gpu::dtoh_memcpy_async(hostfab->dataPtr(), fab.dataPtr(), + fab.size()*sizeof(Real)); + Gpu::streamSynchronize(); + fabdata = hostfab->dataPtr(); + } +#endif if(doConvert) { RealDescriptor::convertFromNativeFormat(static_cast (a_buffer.dataPtr()+writeDataSize), - writeDataItems, fab.dataPtr(), *whichRD); + writeDataItems, fabdata, *whichRD); } else { // ---- copy from the fab memcpy(static_cast (a_buffer.dataPtr()+writeDataSize), - fab.dataPtr(), writeDataItems * whichRDBytes); + fabdata, writeDataItems * whichRDBytes); } writeDataSize += writeDataItems; } @@ -720,6 +731,16 @@ void WriteMultiLevelPlotfileHDF5SingleDset (const std::string& plotfilename, } #endif + // Force maximum chunk size to be size of write + if (H5Pget_layout(lev_dcpl_id) == H5D_CHUNKED) { + hsize_t chunk_size; + if (H5Pget_chunk(lev_dcpl_id, 1, &chunk_size) > -1) { + if (chunk_size > hs_allprocsize[0]) { + H5Pset_chunk(lev_dcpl_id, 1, hs_allprocsize); + } + } + } + #ifdef AMREX_USE_HDF5_ASYNC hid_t dataset = H5Dcreate_async(grp, dataname.c_str(), data_type, dataspace, H5P_DEFAULT, lev_dcpl_id, H5P_DEFAULT, es_id_g); #else @@ -897,18 +918,8 @@ void WriteMultiLevelPlotfileHDF5MultiDset (const std::string& plotfilename, H5Pset_fill_time(dcpl_id, H5D_FILL_TIME_NEVER); H5Pset_alloc_time(dcpl_id, H5D_ALLOC_TIME_INCR); -#if (defined AMREX_USE_HDF5_ZFP) || (defined AMREX_USE_HDF5_SZ) - const char *chunk_env = NULL; std::string mode_env, value_env; double comp_value = -1.0; - hsize_t chunk_dim = 1024; - - chunk_env = getenv("HDF5_CHUNK_SIZE"); - if (chunk_env != NULL) - chunk_dim = atoi(chunk_env); - - H5Pset_chunk(dcpl_id, 1, &chunk_dim); - std::string::size_type pos = 
compression.find('@'); if (pos != std::string::npos) { mode_env = compression.substr(0, pos); @@ -918,42 +929,56 @@ void WriteMultiLevelPlotfileHDF5MultiDset (const std::string& plotfilename, } } + if (!mode_env.empty() && mode_env != "None") { + const char *chunk_env = NULL; + hsize_t chunk_dim = 1024; + chunk_env = getenv("HDF5_CHUNK_SIZE"); + if (chunk_env != NULL) { + chunk_dim = atoi(chunk_env); + } + + H5Pset_chunk(dcpl_id, 1, &chunk_dim); + #ifdef AMREX_USE_HDF5_ZFP - pos = compression.find("ZFP"); - if (pos != std::string::npos) { - ret = H5Z_zfp_initialize(); - if (ret < 0) { amrex::Abort("ZFP initialize failed!"); } - } + pos = mode_env.find("ZFP"); + if (pos != std::string::npos) { + ret = H5Z_zfp_initialize(); + if (ret < 0) { amrex::Abort("ZFP initialize failed!"); } + } #endif + if (mode_env == "ZLIB") { + H5Pset_shuffle(dcpl_id); + H5Pset_deflate(dcpl_id, (int)comp_value); + } #ifdef AMREX_USE_HDF5_SZ - pos = compression.find("SZ"); - if (pos != std::string::npos) { - ret = H5Z_SZ_Init((char*)value_env.c_str()); - if (ret < 0) { amrex::Abort("ZFP initialize failed, check SZ config file!"); } - } + else if (mode_env == "SZ") { + ret = H5Z_SZ_Init((char*)value_env.c_str()); + if (ret < 0) { + std::cout << "SZ config file:" << value_env.c_str() << std::endl; + amrex::Abort("SZ initialize failed, check SZ config file!"); + } + } #endif - - if (!mode_env.empty() && mode_env != "None") { - if (mode_env == "ZLIB") - H5Pset_deflate(dcpl_id, (int)comp_value); #ifdef AMREX_USE_HDF5_ZFP - else if (mode_env == "ZFP_RATE") + else if (mode_env == "ZFP_RATE") { H5Pset_zfp_rate(dcpl_id, comp_value); - else if (mode_env == "ZFP_PRECISION") + } + else if (mode_env == "ZFP_PRECISION") { H5Pset_zfp_precision(dcpl_id, (unsigned int)comp_value); - else if (mode_env == "ZFP_ACCURACY") + } + else if (mode_env == "ZFP_ACCURACY") { H5Pset_zfp_accuracy(dcpl_id, comp_value); - else if (mode_env == "ZFP_REVERSIBLE") + } + else if (mode_env == "ZFP_REVERSIBLE") { H5Pset_zfp_reversible(dcpl_id); + } #endif - if (ParallelDescriptor::MyProc() == 0) { std::cout << "\nHDF5 checkpoint using " << mode_env << ", " << value_env << ", " << chunk_dim << std::endl; } } -#endif BL_PROFILE_VAR("H5writeAllLevel", h5dwd); @@ -1178,6 +1203,15 @@ void WriteMultiLevelPlotfileHDF5MultiDset (const std::string& plotfilename, hid_t dataspace = H5Screate_simple(1, hs_allprocsize, NULL); snprintf(dataname, sizeof dataname, "data:datatype=%d", jj); + // Force maximum chunk size to be size of write + if (H5Pget_layout(lev_dcpl_id) == H5D_CHUNKED) { + hsize_t chunk_size; + if (H5Pget_chunk(lev_dcpl_id, 1, &chunk_size) > -1) { + if (chunk_size > hs_allprocsize[0]) { + H5Pset_chunk(lev_dcpl_id, 1, hs_allprocsize); + } + } + } #ifdef AMREX_USE_HDF5_ASYNC dataset = H5Dcreate_async(grp, dataname, data_type, dataspace, H5P_DEFAULT, lev_dcpl_id, H5P_DEFAULT, es_id_g); if(dataset < 0) { std::cout << ParallelDescriptor::MyProc() << "create data failed! 
ret = " << dataset << std::endl; } diff --git a/Src/Extern/HDF5/AMReX_WriteBinaryParticleDataHDF5.H b/Src/Extern/HDF5/AMReX_WriteBinaryParticleDataHDF5.H index 271f2e0e4cd..462716ae2a3 100644 --- a/Src/Extern/HDF5/AMReX_WriteBinaryParticleDataHDF5.H +++ b/Src/Extern/HDF5/AMReX_WriteBinaryParticleDataHDF5.H @@ -385,7 +385,7 @@ void WriteHDF5ParticleDataSync (PC const& pc, int nOutFiles(-1); ParmParse pp("particles"); - pp.queryAdd("particles_nfiles",nOutFiles); + pp.query("particles_nfiles",nOutFiles); if(nOutFiles == -1) nOutFiles = NProcs; /* nOutFiles = std::max(1, std::min(nOutFiles,NProcs)); */ pc.nOutFilesPrePost = nOutFiles; diff --git a/Src/Extern/HYPRE/AMReX_HypreABecLap.H b/Src/Extern/HYPRE/AMReX_HypreABecLap.H index 1d57d942a14..9b0685de434 100644 --- a/Src/Extern/HYPRE/AMReX_HypreABecLap.H +++ b/Src/Extern/HYPRE/AMReX_HypreABecLap.H @@ -7,7 +7,6 @@ #include #include #include -#include #include #include diff --git a/Src/Extern/HYPRE/AMReX_HypreABecLap.cpp b/Src/Extern/HYPRE/AMReX_HypreABecLap.cpp index 4607fcc1fca..2d4c313c086 100644 --- a/Src/Extern/HYPRE/AMReX_HypreABecLap.cpp +++ b/Src/Extern/HYPRE/AMReX_HypreABecLap.cpp @@ -115,7 +115,7 @@ HypreABecLap::getSolution (MultiFab& a_soln) auto reghi = Hypre::hiV(reg); HYPRE_StructVectorGetBoxValues(x, reglo.data(), reghi.data(), (*soln)[mfi].dataPtr()); } - Gpu::synchronize(); + Gpu::hypreSynchronize(); if (a_soln.nGrowVect() != 0) { MultiFab::Copy(a_soln, tmp, 0, 0, 1, 0); @@ -235,7 +235,7 @@ HypreABecLap::prepareSolver () HYPRE_StructMatrixSetBoxValues(A, reglo.data(), reghi.data(), regular_stencil_size, stencil_indices.data(), mat); - Gpu::synchronize(); + Gpu::hypreSynchronize(); } HYPRE_StructMatrixAssemble(A); @@ -299,7 +299,7 @@ HypreABecLap::loadVectors (MultiFab& soln, const MultiFab& rhs) HYPRE_StructVectorSetBoxValues(x, reglo.data(), reghi.data(), soln[mfi].dataPtr()); HYPRE_StructVectorSetBoxValues(b, reglo.data(), reghi.data(), rhs_diag[mfi].dataPtr()); } - Gpu::synchronize(); + Gpu::hypreSynchronize(); } } diff --git a/Src/Extern/HYPRE/AMReX_HypreABecLap2.cpp b/Src/Extern/HYPRE/AMReX_HypreABecLap2.cpp index 1b7ba8e895b..a44d2d517e3 100644 --- a/Src/Extern/HYPRE/AMReX_HypreABecLap2.cpp +++ b/Src/Extern/HYPRE/AMReX_HypreABecLap2.cpp @@ -137,7 +137,7 @@ HypreABecLap2::getSolution (MultiFab& a_soln) HYPRE_SStructVectorGetBoxValues(x, part, reglo.data(), reghi.data(), 0, (*soln)[mfi].dataPtr()); } - Gpu::synchronize(); + Gpu::hypreSynchronize(); if (a_soln.nGrowVect() != 0) { MultiFab::Copy(a_soln, tmp, 0, 0, 1, 0); @@ -262,7 +262,7 @@ HypreABecLap2::prepareSolver () HYPRE_SStructMatrixSetBoxValues(A, part, reglo.data(), reghi.data(), 0, regular_stencil_size, stencil_indices.data(), mat); - Gpu::synchronize(); + Gpu::hypreSynchronize(); } HYPRE_SStructMatrixAssemble(A); @@ -335,7 +335,7 @@ HypreABecLap2::loadVectors (MultiFab& soln, const MultiFab& rhs) HYPRE_SStructVectorSetBoxValues(b, part, reglo.data(), reghi.data(), 0, rhs_diag[mfi].dataPtr()); } - Gpu::synchronize(); + Gpu::hypreSynchronize(); } } diff --git a/Src/Extern/HYPRE/AMReX_HypreABecLap3.cpp b/Src/Extern/HYPRE/AMReX_HypreABecLap3.cpp index d04b85c314b..216cf368f5a 100644 --- a/Src/Extern/HYPRE/AMReX_HypreABecLap3.cpp +++ b/Src/Extern/HYPRE/AMReX_HypreABecLap3.cpp @@ -77,7 +77,7 @@ HypreABecLap3::getSolution (MultiFab& a_soln) (*l_soln)[mfi].setVal(0.0); } } - Gpu::synchronize(); + Gpu::hypreSynchronize(); if (use_tmp_mf) { MultiFab::Copy(a_soln, tmp, 0, 0, 1, 0); @@ -499,9 +499,9 @@ HypreABecLap3::prepareSolver () }); } - Gpu::synchronize(); + 
Gpu::streamSynchronize(); HYPRE_IJMatrixSetValues(A,nrows,ncols,rows,cols,mat); - Gpu::synchronize(); + Gpu::hypreSynchronize(); } } HYPRE_IJMatrixAssemble(A); @@ -681,7 +681,7 @@ HypreABecLap3::loadVectors (MultiFab& soln, const MultiFab& rhs) HYPRE_IJVectorSetValues(b, nrows, cell_id_vec[mfi].dataPtr(), rhs_diag[mfi].dataPtr()); } } - Gpu::synchronize(); + Gpu::hypreSynchronize(); } } // namespace amrex diff --git a/Src/Extern/HYPRE/AMReX_HypreMLABecLap.H b/Src/Extern/HYPRE/AMReX_HypreMLABecLap.H index 04147207d36..6f687766b86 100644 --- a/Src/Extern/HYPRE/AMReX_HypreMLABecLap.H +++ b/Src/Extern/HYPRE/AMReX_HypreMLABecLap.H @@ -39,6 +39,7 @@ public: void setVerbose (int v) { m_verbose = v; } void setMaxIter (int v) { m_maxiter = v; } + void setIsSingular (bool v) { m_is_singular = v; } void setup (Real a_ascalar, Real a_bscalar, Vector const& a_acoefs, @@ -65,6 +66,7 @@ private: int m_verbose = 0; int m_maxiter = 200; + bool m_is_singular = false; Vector m_geom; Vector m_grids; @@ -87,6 +89,7 @@ private: Vector> m_bndry; Vector> m_bndry_rhs; Vector m_fine_masks; + Vector m_crse_masks; // For coarse cells at coarse/fine interface. The vector is for AMR // levels. diff --git a/Src/Extern/HYPRE/AMReX_HypreMLABecLap.cpp b/Src/Extern/HYPRE/AMReX_HypreMLABecLap.cpp index d7621c6bd0e..6f5718f826b 100644 --- a/Src/Extern/HYPRE/AMReX_HypreMLABecLap.cpp +++ b/Src/Extern/HYPRE/AMReX_HypreMLABecLap.cpp @@ -57,11 +57,16 @@ HypreMLABecLap::HypreMLABecLap (Vector a_geom, } m_fine_masks.resize(m_nlevels-1); + m_crse_masks.resize(m_nlevels-1); for (int ilev = 0; ilev < m_nlevels-1; ++ilev) { m_fine_masks[ilev] = amrex::makeFineMask(m_grids[ilev], m_dmap[ilev], IntVect(1), m_grids[ilev+1], m_ref_ratio[ilev], m_geom[ilev].periodicity(), 0, 1); + m_crse_masks[ilev].define(m_grids[ilev], m_dmap[ilev], 1, 1); + m_crse_masks[ilev].BuildMask(m_geom[ilev].Domain(), + m_geom[ilev].periodicity(), + 1, 0, 0, 1); } m_c2f_offset_from.resize(m_nlevels-1); @@ -406,6 +411,15 @@ void HypreMLABecLap::addNonStencilEntriesToGraph () m_f2c_offset.resize(m_nlevels-1); m_f2c_values.resize(m_nlevels-1); + Vector period(m_nlevels); + Vector smallend(m_nlevels); + Vector bigend(m_nlevels); + for (int ilev = 0; ilev (entry); auto const to_level = std::get<3>(entry); @@ -419,6 +433,15 @@ void HypreMLABecLap::addNonStencilEntriesToGraph () GpuArray to_index{AMREX_D_DECL(to_iv[0], to_iv[1], to_iv[2])}; + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + if (m_geom[0].isPeriodic(idim)) { + if (to_index[idim] < smallend[to_level][idim]) { + to_index[idim] += period[to_level][idim]; + } else if (to_index[idim] > bigend[to_level][idim]) { + to_index[idim] -= period[to_level][idim]; + } + } + } constexpr int ivar = 0; HYPRE_SStructGraphAddEntries(m_ss_graph, from_level, from_index.data(), ivar, @@ -588,12 +611,19 @@ void HypreMLABecLap::setup (Real a_ascalar, Real a_bscalar, const auto boxlo = amrex::lbound(vbx); const auto boxhi = amrex::ubound(vbx); // Set up stencil part of the matrix + auto fixed_pt = IntVect::TheMaxVector(); + if (m_is_singular && m_nlevels-1 == ilev) { + auto const& box0 = m_grids.back()[0]; + fixed_pt = box0.smallEnd() + 1; + // This cell does not have any non-stencil entries. So it's + // a good point for fixing singularity. 
+ } amrex::fill(matfab, [=] AMREX_GPU_HOST_DEVICE (GpuArray& sten, int i, int j, int k) { hypmlabeclap_mat(sten, i, j, k, boxlo, boxhi, sa, afab, sb, dx, bfabs, - bctype, bcl, bcmsk, bcval, bcrhs, ilev); + bctype, bcl, bcmsk, bcval, bcrhs, ilev, fixed_pt); }); bool need_sync = true; @@ -636,6 +666,7 @@ void HypreMLABecLap::setup (Real a_ascalar, Real a_bscalar, auto const& c2f_offset_to_a = m_c2f_offset_to[ilev].const_array(mfi); auto const& mat_a = matfab.array(); auto const& fine_mask = m_fine_masks[ilev].const_array(mfi); + auto const& crse_mask = m_crse_masks[ilev].const_array(mfi); AMREX_D_TERM(auto offset_bx_a = m_offset_cf_bcoefs[ilev][0].isDefined() ? m_offset_cf_bcoefs[ilev][0].const_array(mfi) : Array4{};, @@ -664,7 +695,7 @@ void HypreMLABecLap::setup (Real a_ascalar, Real a_bscalar, c2f_offset_to_a, dx, sb, AMREX_D_DECL(offset_bx_a,offset_by_a,offset_bz_a), AMREX_D_DECL(p_bx, p_by, p_bz), - fine_mask,rr); + fine_mask,rr, crse_mask); }); if (c2f_total_from > 0) { #ifdef AMREX_USE_GPU @@ -838,8 +869,8 @@ void HypreMLABecLap::setup (Real a_ascalar, Real a_bscalar, HYPRE_SStructSSAMGSetNumPostRelax(m_ss_solver, 4); HYPRE_SStructSSAMGSetNumCoarseRelax(m_ss_solver, 4); - HYPRE_SStructSSAMGSetLogging(m_ss_solver, m_verbose); - HYPRE_SStructSSAMGSetPrintLevel(m_ss_solver, m_verbose); + HYPRE_SStructSSAMGSetLogging(m_ss_solver, 1); + // HYPRE_SStructSSAMGSetPrintLevel(m_ss_solver, 1); /* 0: no, 1: setup, 2: solve, 3:both // HYPRE_SStructSSAMGSetup(m_ss_solver, A, b, x); @@ -854,15 +885,15 @@ void HypreMLABecLap::setup (Real a_ascalar, Real a_bscalar, HYPRE_BoomerAMGCreate(&m_solver); HYPRE_BoomerAMGSetOldDefault(m_solver); // Falgout coarsening with modified classical interpolation - HYPRE_BoomerAMGSetStrongThreshold(m_solver, (AMREX_SPACEDIM == 3) ? 0.6 : 0.25); // default is 0.25 + HYPRE_BoomerAMGSetStrongThreshold(m_solver, (AMREX_SPACEDIM == 3) ? 0.4 : 0.25); // default is 0.25 HYPRE_BoomerAMGSetRelaxOrder(m_solver, 1); /* 0: default, natural order, 1: C/F relaxation order */ HYPRE_BoomerAMGSetNumSweeps(m_solver, 2); /* Sweeps on fine levels */ // HYPRE_BoomerAMGSetFCycle(m_solver, 1); // default is 0 // HYPRE_BoomerAMGSetCoarsenType(m_solver, 6); // HYPRE_BoomerAMGSetRelaxType(m_solver, 6); /* G-S/Jacobi hybrid relaxation */ - HYPRE_BoomerAMGSetLogging(m_solver, m_verbose); - HYPRE_BoomerAMGSetPrintLevel(m_solver, m_verbose); + HYPRE_BoomerAMGSetLogging(m_solver, 1); + // HYPRE_BoomerAMGSetPrintLevel(m_solver, 1); /* 0: no, 1: setup, 2: solve, 3:both HYPRE_ParCSRMatrix par_A; HYPRE_SStructMatrixGetObject(m_ss_A, (void**) &par_A); @@ -956,6 +987,9 @@ void HypreMLABecLap::solve (Vector const& a_sol, Vector 0"); } + HYPRE_Int num_iterations; + Real final_res; + #ifdef AMREX_FEATURE_HYPRE_SSAMG if (m_hypre_solver_id == HypreSolverID::SSAMG) { @@ -965,15 +999,13 @@ void HypreMLABecLap::solve (Vector const& a_sol, Vector const& a_sol, Vector reltol) { + amrex::Abort("Hypre failed to converge after "+std::to_string(num_iterations)+ + " iterations. Final relative residual is "+std::to_string(final_res)); + } } // Get solution @@ -1044,8 +1079,6 @@ void HypreMLABecLap::solve (Vector const& a_sol, Vector= 0; --ilev) { amrex::average_down(*a_sol[ilev+1], *a_sol[ilev], 0, ncomp, m_ref_ratio[ilev]); } - - // xxxxx abort if convergence is not reached. 
} #ifdef AMREX_USE_GPU diff --git a/Src/Extern/HYPRE/AMReX_HypreMLABecLap_2D_K.H b/Src/Extern/HYPRE/AMReX_HypreMLABecLap_2D_K.H index 57a37f19bf3..7d083e7d98c 100644 --- a/Src/Extern/HYPRE/AMReX_HypreMLABecLap_2D_K.H +++ b/Src/Extern/HYPRE/AMReX_HypreMLABecLap_2D_K.H @@ -109,7 +109,7 @@ void hypmlabeclap_c2f (int i, int j, int k, Array4 const& offset_by, Real const* bx, Real const* by, Array4 const& fine_mask, - IntVect const& rr) + IntVect const& rr, Array4 const& crse_mask) { if (fine_mask(i,j,k)) { // Let's set off-diagonal elements to zero @@ -159,9 +159,13 @@ void hypmlabeclap_c2f (int i, int j, int k, Real xInt = Real(-0.5) + (irx+Real(0.5))/Real(rr[0]); Real xc[3] = {Real(-1.0), Real(0.0), Real(1.0)}; Real ct[3] = {Real(0.0), Real(0.0), Real(0.0)}; - if (fine_mask(i-1,j,k)) { + if ( fine_mask(i-1,j,k) || + !crse_mask(i-1,j,k)) + { poly_interp_coeff<2>(xInt, &(xc[1]), &(ct[1])); - } else if (fine_mask(i+1,j,k)) { + } else if ( fine_mask(i+1,j,k) || + !crse_mask(i+1,j,k)) + { poly_interp_coeff<2>(xInt, xc, ct); } else { poly_interp_coeff<3>(xInt, xc, ct); @@ -202,9 +206,13 @@ void hypmlabeclap_c2f (int i, int j, int k, Real yInt = Real(-0.5) + (iry+Real(0.5))/Real(rr[1]); Real yc[3] = {Real(-1.0), Real(0.0), Real(1.0)}; Real ct[3] = {Real(0.0), Real(0.0), Real(0.0)}; - if (fine_mask(i,j-1,k)) { + if ( fine_mask(i,j-1,k) || + !crse_mask(i,j-1,k)) + { poly_interp_coeff<2>(yInt, &(yc[1]), &(ct[1])); - } else if (fine_mask(i,j+1,k)) { + } else if ( fine_mask(i,j+1,k) || + !crse_mask(i,j+1,k)) + { poly_interp_coeff<2>(yInt, yc, ct); } else { poly_interp_coeff<3>(yInt, yc, ct); @@ -244,9 +252,13 @@ void hypmlabeclap_c2f (int i, int j, int k, Real yInt = Real(-0.5) + (iry+Real(0.5))/Real(rr[1]); Real yc[3] = {Real(-1.0), Real(0.0), Real(1.0)}; Real ct[3] = {Real(0.0), Real(0.0), Real(0.0)}; - if (fine_mask(i,j-1,k)) { + if ( fine_mask(i,j-1,k) || + !crse_mask(i,j-1,k)) + { poly_interp_coeff<2>(yInt, &(yc[1]), &(ct[1])); - } else if (fine_mask(i,j+1,k)) { + } else if ( fine_mask(i,j+1,k) || + !crse_mask(i,j+1,k)) + { poly_interp_coeff<2>(yInt, yc, ct); } else { poly_interp_coeff<3>(yInt, yc, ct); @@ -286,9 +298,13 @@ void hypmlabeclap_c2f (int i, int j, int k, Real xInt = Real(-0.5) + (irx+Real(0.5))/Real(rr[0]); Real xc[3] = {Real(-1.0), Real(0.0), Real(1.0)}; Real ct[3] = {Real(0.0), Real(0.0), Real(0.0)}; - if (fine_mask(i-1,j,k)) { + if ( fine_mask(i-1,j,k) || + !crse_mask(i-1,j,k)) + { poly_interp_coeff<2>(xInt, &(xc[1]), &(ct[1])); - } else if (fine_mask(i+1,j,k)) { + } else if ( fine_mask(i+1,j,k) || + !crse_mask(i+1,j,k)) + { poly_interp_coeff<2>(xInt, xc, ct); } else { poly_interp_coeff<3>(xInt, xc, ct); diff --git a/Src/Extern/HYPRE/AMReX_HypreMLABecLap_3D_K.H b/Src/Extern/HYPRE/AMReX_HypreMLABecLap_3D_K.H index 8e6e1a39b14..431650236f7 100644 --- a/Src/Extern/HYPRE/AMReX_HypreMLABecLap_3D_K.H +++ b/Src/Extern/HYPRE/AMReX_HypreMLABecLap_3D_K.H @@ -166,7 +166,7 @@ void hypmlabeclap_c2f (int i, int j, int k, Array4 const& offset_bz, Real const* bx, Real const* by, Real const* bz, Array4 const& fine_mask, - IntVect const& rr) + IntVect const& rr, Array4 const& crse_mask) { if (fine_mask(i,j,k)) { // Let's set off-diagonal elements to zero @@ -191,7 +191,11 @@ void hypmlabeclap_c2f (int i, int j, int k, (! fine_mask(i,j-1,k-1)) && (! fine_mask(i,j+1,k-1)) && (! fine_mask(i,j-1,k+1)) && - (! fine_mask(i,j+1,k+1))) + (! 
fine_mask(i,j+1,k+1)) && + ( crse_mask(i,j-1,k-1)) && + ( crse_mask(i,j+1,k-1)) && + ( crse_mask(i,j-1,k+1)) && + ( crse_mask(i,j+1,k+1))) { corner[0] = true; } @@ -199,7 +203,11 @@ void hypmlabeclap_c2f (int i, int j, int k, (! fine_mask(i-1,j,k-1)) && (! fine_mask(i+1,j,k-1)) && (! fine_mask(i-1,j,k+1)) && - (! fine_mask(i+1,j,k+1))) + (! fine_mask(i+1,j,k+1)) && + ( crse_mask(i-1,j,k-1)) && + ( crse_mask(i+1,j,k-1)) && + ( crse_mask(i-1,j,k+1)) && + ( crse_mask(i+1,j,k+1))) { corner[1] = true; } @@ -207,7 +215,11 @@ void hypmlabeclap_c2f (int i, int j, int k, (! fine_mask(i-1,j-1,k)) && (! fine_mask(i+1,j-1,k)) && (! fine_mask(i-1,j+1,k)) && - (! fine_mask(i+1,j+1,k))) + (! fine_mask(i+1,j+1,k)) && + ( crse_mask(i-1,j-1,k)) && + ( crse_mask(i+1,j-1,k)) && + ( crse_mask(i-1,j+1,k)) && + ( crse_mask(i+1,j+1,k))) { corner[2] = true; } @@ -253,28 +265,34 @@ void hypmlabeclap_c2f (int i, int j, int k, Real fac0 = fac*cc[0]; Real s0 = Real(1.0); - if (!fine_mask(i-1,j,k) && !fine_mask(i+1,j,k)) { - s0 -= x*x; - stencil(i,j,k)[1] += fac0*Real(0.5)*x*(x-Real(1.0)); - stencil(i,j,k)[2] += fac0*Real(0.5)*x*(x+Real(1.0)); - } else if (!fine_mask(i-1,j,k)) { + if ( fine_mask(i-1,j,k) || + !crse_mask(i-1,j,k)) + { + s0 += Real(-0.5)*x; + stencil(i,j,k)[2] += fac0*Real(0.5)*x; + } else if ( fine_mask(i+1,j,k) || + !crse_mask(i+1,j,k)) { s0 += Real(0.5)*x; stencil(i,j,k)[1] += fac0*Real(-0.5)*x; } else { - s0 += Real(-0.5)*x; - stencil(i,j,k)[2] += fac0*Real(0.5)*x; + s0 -= x*x; + stencil(i,j,k)[1] += fac0*Real(0.5)*x*(x-Real(1.0)); + stencil(i,j,k)[2] += fac0*Real(0.5)*x*(x+Real(1.0)); } - if (!fine_mask(i,j-1,k) && !fine_mask(i,j+1,k)) { - s0 -= y*y; - stencil(i,j,k)[3] += fac0*Real(0.5)*y*(y-Real(1.0)); - stencil(i,j,k)[4] += fac0*Real(0.5)*y*(y+Real(1.0)); - } else if (!fine_mask(i,j-1,k)) { + if ( fine_mask(i,j-1,k) || + !crse_mask(i,j-1,k)) + { + s0 += Real(-0.5)*y; + stencil(i,j,k)[4] += fac0*Real(0.5)*y; + } else if ( fine_mask(i,j+1,k) || + !crse_mask(i,j+1,k)) { s0 += Real(0.5)*y; stencil(i,j,k)[3] += fac0*Real(-0.5)*y; } else { - s0 += Real(-0.5)*y; - stencil(i,j,k)[4] += fac0*Real(0.5)*y; + s0 -= y*y; + stencil(i,j,k)[3] += fac0*Real(0.5)*y*(y-Real(1.0)); + stencil(i,j,k)[4] += fac0*Real(0.5)*y*(y+Real(1.0)); } stencil(i,j,k)[0] += fac0*s0; @@ -322,28 +340,34 @@ void hypmlabeclap_c2f (int i, int j, int k, Real fac0 = fac*cc[0]; Real s0 = Real(1.0); - if (!fine_mask(i-1,j,k) && !fine_mask(i+1,j,k)) { - s0 -= x*x; - stencil(i,j,k)[1] += fac0*Real(0.5)*x*(x-Real(1.0)); - stencil(i,j,k)[2] += fac0*Real(0.5)*x*(x+Real(1.0)); - } else if (!fine_mask(i-1,j,k)) { + if ( fine_mask(i-1,j,k) || + !crse_mask(i-1,j,k)) + { + s0 += Real(-0.5)*x; + stencil(i,j,k)[2] += fac0*Real(0.5)*x; + } else if ( fine_mask(i+1,j,k) || + !crse_mask(i+1,j,k)) { s0 += Real(0.5)*x; stencil(i,j,k)[1] += fac0*Real(-0.5)*x; } else { - s0 += Real(-0.5)*x; - stencil(i,j,k)[2] += fac0*Real(0.5)*x; + s0 -= x*x; + stencil(i,j,k)[1] += fac0*Real(0.5)*x*(x-Real(1.0)); + stencil(i,j,k)[2] += fac0*Real(0.5)*x*(x+Real(1.0)); } - if (!fine_mask(i,j,k-1) && !fine_mask(i,j,k+1)) { - s0 -= z*z; - stencil(i,j,k)[5] += fac0*Real(0.5)*z*(z-Real(1.0)); - stencil(i,j,k)[6] += fac0*Real(0.5)*z*(z+Real(1.0)); - } else if (!fine_mask(i,j,k-1)) { + if ( fine_mask(i,j,k-1) || + !crse_mask(i,j,k-1)) + { + s0 += Real(-0.5)*z; + stencil(i,j,k)[6] += fac0*Real(0.5)*z; + } else if ( fine_mask(i,j,k+1) || + !crse_mask(i,j,k+1)) { s0 += Real(0.5)*z; stencil(i,j,k)[5] += fac0*Real(-0.5)*z; } else { - s0 += Real(-0.5)*z; - stencil(i,j,k)[6] += 
fac0*Real(0.5)*z; + s0 -= z*z; + stencil(i,j,k)[5] += fac0*Real(0.5)*z*(z-Real(1.0)); + stencil(i,j,k)[6] += fac0*Real(0.5)*z*(z+Real(1.0)); } stencil(i,j,k)[0] += fac0*s0; @@ -393,28 +417,34 @@ void hypmlabeclap_c2f (int i, int j, int k, Real fac0 = fac*cc[0]; Real s0 = Real(1.0); - if (!fine_mask(i,j-1,k) && !fine_mask(i,j+1,k)) { - s0 -= y*y; - stencil(i,j,k)[3] += fac0*Real(0.5)*y*(y-Real(1.0)); - stencil(i,j,k)[4] += fac0*Real(0.5)*y*(y+Real(1.0)); - } else if (!fine_mask(i,j-1,k)) { + if ( fine_mask(i,j-1,k) || + !crse_mask(i,j-1,k)) + { + s0 += Real(-0.5)*y; + stencil(i,j,k)[4] += fac0*Real(0.5)*y; + } else if ( fine_mask(i,j+1,k) || + !crse_mask(i,j+1,k)) { s0 += Real(0.5)*y; stencil(i,j,k)[3] += fac0*Real(-0.5)*y; } else { - s0 += Real(-0.5)*y; - stencil(i,j,k)[4] += fac0*Real(0.5)*y; + s0 -= y*y; + stencil(i,j,k)[3] += fac0*Real(0.5)*y*(y-Real(1.0)); + stencil(i,j,k)[4] += fac0*Real(0.5)*y*(y+Real(1.0)); } - if (!fine_mask(i,j,k-1) && !fine_mask(i,j,k+1)) { - s0 -= z*z; - stencil(i,j,k)[5] += fac0*Real(0.5)*z*(z-Real(1.0)); - stencil(i,j,k)[6] += fac0*Real(0.5)*z*(z+Real(1.0)); - } else if (!fine_mask(i,j,k-1)) { + if ( fine_mask(i,j,k-1) || + !crse_mask(i,j,k-1)) + { + s0 += Real(-0.5)*z; + stencil(i,j,k)[6] += fac0*Real(0.5)*z; + } else if ( fine_mask(i,j,k+1) || + !crse_mask(i,j,k+1)) { s0 += Real(0.5)*z; stencil(i,j,k)[5] += fac0*Real(-0.5)*z; } else { - s0 += Real(-0.5)*z; - stencil(i,j,k)[6] += fac0*Real(0.5)*z; + s0 -= z*z; + stencil(i,j,k)[5] += fac0*Real(0.5)*z*(z-Real(1.0)); + stencil(i,j,k)[6] += fac0*Real(0.5)*z*(z+Real(1.0)); } stencil(i,j,k)[0] += fac0*s0; @@ -463,28 +493,34 @@ void hypmlabeclap_c2f (int i, int j, int k, Real fac0 = fac*cc[0]; Real s0 = Real(1.0); - if (!fine_mask(i,j-1,k) && !fine_mask(i,j+1,k)) { - s0 -= y*y; - stencil(i,j,k)[3] += fac0*Real(0.5)*y*(y-Real(1.0)); - stencil(i,j,k)[4] += fac0*Real(0.5)*y*(y+Real(1.0)); - } else if (!fine_mask(i,j-1,k)) { + if ( fine_mask(i,j-1,k) || + !crse_mask(i,j-1,k)) + { + s0 += Real(-0.5)*y; + stencil(i,j,k)[4] += fac0*Real(0.5)*y; + } else if ( fine_mask(i,j+1,k) || + !crse_mask(i,j+1,k)) { s0 += Real(0.5)*y; stencil(i,j,k)[3] += fac0*Real(-0.5)*y; } else { - s0 += Real(-0.5)*y; - stencil(i,j,k)[4] += fac0*Real(0.5)*y; + s0 -= y*y; + stencil(i,j,k)[3] += fac0*Real(0.5)*y*(y-Real(1.0)); + stencil(i,j,k)[4] += fac0*Real(0.5)*y*(y+Real(1.0)); } - if (!fine_mask(i,j,k-1) && !fine_mask(i,j,k+1)) { - s0 -= z*z; - stencil(i,j,k)[5] += fac0*Real(0.5)*z*(z-Real(1.0)); - stencil(i,j,k)[6] += fac0*Real(0.5)*z*(z+Real(1.0)); - } else if (!fine_mask(i,j,k-1)) { + if ( fine_mask(i,j,k-1) || + !crse_mask(i,j,k-1)) + { + s0 += Real(-0.5)*z; + stencil(i,j,k)[6] += fac0*Real(0.5)*z; + } else if ( fine_mask(i,j,k+1) || + !crse_mask(i,j,k+1)) { s0 += Real(0.5)*z; stencil(i,j,k)[5] += fac0*Real(-0.5)*z; } else { - s0 += Real(-0.5)*z; - stencil(i,j,k)[6] += fac0*Real(0.5)*z; + s0 -= z*z; + stencil(i,j,k)[5] += fac0*Real(0.5)*z*(z-Real(1.0)); + stencil(i,j,k)[6] += fac0*Real(0.5)*z*(z+Real(1.0)); } stencil(i,j,k)[0] += fac0*s0; @@ -534,28 +570,34 @@ void hypmlabeclap_c2f (int i, int j, int k, Real fac0 = fac*cc[0]; Real s0 = Real(1.0); - if (!fine_mask(i-1,j,k) && !fine_mask(i+1,j,k)) { - s0 -= x*x; - stencil(i,j,k)[1] += fac0*Real(0.5)*x*(x-Real(1.0)); - stencil(i,j,k)[2] += fac0*Real(0.5)*x*(x+Real(1.0)); - } else if (!fine_mask(i-1,j,k)) { + if ( fine_mask(i-1,j,k) || + !crse_mask(i-1,j,k)) + { + s0 += Real(-0.5)*x; + stencil(i,j,k)[2] += fac0*Real(0.5)*x; + } else if ( fine_mask(i+1,j,k) || + !crse_mask(i+1,j,k)) { s0 += 
Real(0.5)*x; stencil(i,j,k)[1] += fac0*Real(-0.5)*x; } else { - s0 += Real(-0.5)*x; - stencil(i,j,k)[2] += fac0*Real(0.5)*x; + s0 -= x*x; + stencil(i,j,k)[1] += fac0*Real(0.5)*x*(x-Real(1.0)); + stencil(i,j,k)[2] += fac0*Real(0.5)*x*(x+Real(1.0)); } - if (!fine_mask(i,j,k-1) && !fine_mask(i,j,k+1)) { - s0 -= z*z; - stencil(i,j,k)[5] += fac0*Real(0.5)*z*(z-Real(1.0)); - stencil(i,j,k)[6] += fac0*Real(0.5)*z*(z+Real(1.0)); - } else if (!fine_mask(i,j,k-1)) { + if ( fine_mask(i,j,k-1) || + !crse_mask(i,j,k-1)) + { + s0 += Real(-0.5)*z; + stencil(i,j,k)[6] += fac0*Real(0.5)*z; + } else if ( fine_mask(i,j,k+1) || + !crse_mask(i,j,k+1)) { s0 += Real(0.5)*z; stencil(i,j,k)[5] += fac0*Real(-0.5)*z; } else { - s0 += Real(-0.5)*z; - stencil(i,j,k)[6] += fac0*Real(0.5)*z; + s0 -= z*z; + stencil(i,j,k)[5] += fac0*Real(0.5)*z*(z-Real(1.0)); + stencil(i,j,k)[6] += fac0*Real(0.5)*z*(z+Real(1.0)); } stencil(i,j,k)[0] += fac0*s0; @@ -605,28 +647,34 @@ void hypmlabeclap_c2f (int i, int j, int k, Real fac0 = fac*cc[0]; Real s0 = Real(1.0); - if (!fine_mask(i-1,j,k) && !fine_mask(i+1,j,k)) { - s0 -= x*x; - stencil(i,j,k)[1] += fac0*Real(0.5)*x*(x-Real(1.0)); - stencil(i,j,k)[2] += fac0*Real(0.5)*x*(x+Real(1.0)); - } else if (!fine_mask(i-1,j,k)) { + if ( fine_mask(i-1,j,k) || + !crse_mask(i-1,j,k)) + { + s0 += Real(-0.5)*x; + stencil(i,j,k)[2] += fac0*Real(0.5)*x; + } else if ( fine_mask(i+1,j,k) || + !crse_mask(i+1,j,k)) { s0 += Real(0.5)*x; stencil(i,j,k)[1] += fac0*Real(-0.5)*x; } else { - s0 += Real(-0.5)*x; - stencil(i,j,k)[2] += fac0*Real(0.5)*x; + s0 -= x*x; + stencil(i,j,k)[1] += fac0*Real(0.5)*x*(x-Real(1.0)); + stencil(i,j,k)[2] += fac0*Real(0.5)*x*(x+Real(1.0)); } - if (!fine_mask(i,j-1,k) && !fine_mask(i,j+1,k)) { - s0 -= y*y; - stencil(i,j,k)[3] += fac0*Real(0.5)*y*(y-Real(1.0)); - stencil(i,j,k)[4] += fac0*Real(0.5)*y*(y+Real(1.0)); - } else if (!fine_mask(i,j-1,k)) { + if ( fine_mask(i,j-1,k) || + !crse_mask(i,j-1,k)) + { + s0 += Real(-0.5)*y; + stencil(i,j,k)[4] += fac0*Real(0.5)*y; + } else if ( fine_mask(i,j+1,k) || + !crse_mask(i,j+1,k)) { s0 += Real(0.5)*y; stencil(i,j,k)[3] += fac0*Real(-0.5)*y; } else { - s0 += Real(-0.5)*y; - stencil(i,j,k)[4] += fac0*Real(0.5)*y; + s0 -= y*y; + stencil(i,j,k)[3] += fac0*Real(0.5)*y*(y-Real(1.0)); + stencil(i,j,k)[4] += fac0*Real(0.5)*y*(y+Real(1.0)); } stencil(i,j,k)[0] += fac0*s0; diff --git a/Src/Extern/HYPRE/AMReX_HypreMLABecLap_K.H b/Src/Extern/HYPRE/AMReX_HypreMLABecLap_K.H index ea38bf5037f..129a6a989a8 100644 --- a/Src/Extern/HYPRE/AMReX_HypreMLABecLap_K.H +++ b/Src/Extern/HYPRE/AMReX_HypreMLABecLap_K.H @@ -22,7 +22,7 @@ void hypmlabeclap_mat (GpuArray& sten, int i, int j, in GpuArray, AMREX_SPACEDIM*2> const& bcmsk, GpuArray, AMREX_SPACEDIM*2> const& bcval, GpuArray, AMREX_SPACEDIM*2> const& bcrhs, - int level) + int level, IntVect const& fixed_pt) { Real bxm = b[0] ? b[0](i ,j ,k ) : Real(1.0); Real bxp = b[0] ? 
b[0](i+1,j ,k ) : Real(1.0); @@ -223,6 +223,12 @@ void hypmlabeclap_mat (GpuArray& sten, int i, int j, in } #endif + + if (fixed_pt == IntVect(AMREX_D_DECL(i,j,k))) { + for (int n = 1; n < 2*AMREX_SPACEDIM+1; ++n) { + sten[n] = Real(0.0); + } + } } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE diff --git a/Src/Extern/HYPRE/AMReX_HypreNodeLap.cpp b/Src/Extern/HYPRE/AMReX_HypreNodeLap.cpp index bb4bc557ca7..33e45ca0667 100644 --- a/Src/Extern/HYPRE/AMReX_HypreNodeLap.cpp +++ b/Src/Extern/HYPRE/AMReX_HypreNodeLap.cpp @@ -123,9 +123,9 @@ HypreNodeLap::HypreNodeLap (const BoxArray& grids_, const DistributionMapping& d adjust_singular_matrix(ncols, cols, rows, mat); } - Gpu::synchronize(); + Gpu::streamSynchronize(); HYPRE_IJMatrixSetValues(A, nrows, ncols, rows, cols, mat); - Gpu::synchronize(); + Gpu::hypreSynchronize(); } } HYPRE_IJMatrixAssemble(A); @@ -324,9 +324,9 @@ HypreNodeLap::loadVectors (MultiFab& soln, const MultiFab& rhs) }); } - Gpu::synchronize(); + Gpu::streamSynchronize(); HYPRE_IJVectorSetValues(b, nrows, rows_vec.data(), bvec.data()); - Gpu::synchronize(); + Gpu::hypreSynchronize(); } } } @@ -347,7 +347,7 @@ HypreNodeLap::getSolution (MultiFab& soln) xvec.resize(nrows); Real* xp = xvec.data(); HYPRE_IJVectorGetValues(x, nrows, rows_vec.data(), xp); - Gpu::synchronize(); + Gpu::hypreSynchronize(); const Box& bx = mfi.validbox(); const auto& xfab = tmpsoln.array(mfi); @@ -359,7 +359,7 @@ HypreNodeLap::getSolution (MultiFab& soln) } }); - Gpu::synchronize(); + Gpu::streamSynchronize(); } } diff --git a/Src/Extern/HYPRE/AMReX_HypreSolver.H b/Src/Extern/HYPRE/AMReX_HypreSolver.H index 8f34b72f2df..fb70c90bbbe 100644 --- a/Src/Extern/HYPRE/AMReX_HypreSolver.H +++ b/Src/Extern/HYPRE/AMReX_HypreSolver.H @@ -569,7 +569,7 @@ HypreSolver::fill_matrix (Filler const& filler) Gpu::streamSynchronize(); HYPRE_IJMatrixSetValues(m_A, nrows, ncols_vec.data(), rows, cols_vec.data(), mat_vec.data()); - Gpu::synchronize(); + Gpu::hypreSynchronize(); } } HYPRE_IJMatrixAssemble(m_A); @@ -653,7 +653,7 @@ HypreSolver::load_vectors (Vector const& a_soln, Gpu::streamSynchronize(); HYPRE_IJVectorSetValues(m_x, nrows, rows, xp); HYPRE_IJVectorSetValues(m_b, nrows, rows, bp); - Gpu::synchronize(); + Gpu::hypreSynchronize(); } } } @@ -683,7 +683,7 @@ HypreSolver::get_solution (Vector const& a_soln) HYPRE_Int const* rows = m_global_id_vec[mfi].data(); HYPRE_IJVectorGetValues(m_x, nrows, rows, xp); - Gpu::synchronize(); + Gpu::hypreSynchronize(); HYPRE_Int offset = 0; for (int ivar = 0; ivar < m_nvars; ++ivar) { diff --git a/Src/Extern/SUNDIALS/AMReX_SundialsIntegrator.H b/Src/Extern/SUNDIALS/AMReX_SundialsIntegrator.H index 30ff30a499b..47a028f852d 100644 --- a/Src/Extern/SUNDIALS/AMReX_SundialsIntegrator.H +++ b/Src/Extern/SUNDIALS/AMReX_SundialsIntegrator.H @@ -428,7 +428,6 @@ public: T S_rhs; unpack_vector(y_rhs, S_rhs); - BaseT::pre_rhs_action(S_data, rhs_time); BaseT::Rhs(S_rhs, S_data, rhs_time); return 0; @@ -443,7 +442,6 @@ public: T S_rhs; unpack_vector(y_rhs, S_rhs); - BaseT::pre_rhs_action(S_data, rhs_time); BaseT::RhsIm(S_rhs, S_data, rhs_time); return 0; @@ -458,7 +456,6 @@ public: T S_rhs; unpack_vector(y_rhs, S_rhs); - BaseT::pre_rhs_action(S_data, rhs_time); BaseT::RhsEx(S_rhs, S_data, rhs_time); return 0; @@ -473,7 +470,6 @@ public: T S_rhs; unpack_vector(y_rhs, S_rhs); - BaseT::pre_rhs_action(S_data, rhs_time); BaseT::RhsFast(S_rhs, S_data, rhs_time); return 0; diff --git a/Src/Extern/amrdata/AMReX_AmrData.H b/Src/Extern/amrdata/AMReX_AmrData.H index e62ce6b1549..dcb9253fdc5 100644 --- 
a/Src/Extern/amrdata/AMReX_AmrData.H +++ b/Src/Extern/amrdata/AMReX_AmrData.H @@ -16,8 +16,6 @@ namespace amrex { -class Interpolater; - class AmrData { protected: @@ -182,8 +180,6 @@ class AmrData { // fill on interior by piecewise constant interpolation void FillInterior(FArrayBox &dest, int level, const Box &subbox); - void Interp(FArrayBox &fine, FArrayBox &crse, - const Box &fine_box, int lrat); void PcInterp(FArrayBox &fine, const FArrayBox &crse, const Box &subbox, int lrat); FArrayBox *ReadGrid(std::istream &is, int numVar); diff --git a/Src/Extern/amrdata/AMReX_AmrData.cpp b/Src/Extern/amrdata/AMReX_AmrData.cpp index 9717f36a90c..a71d1a6b827 100644 --- a/Src/Extern/amrdata/AMReX_AmrData.cpp +++ b/Src/Extern/amrdata/AMReX_AmrData.cpp @@ -34,73 +34,6 @@ using std::ifstream; #define VSHOWVAL(verbose, val) { if(verbose) { \ cout << #val << " = " << val << endl; } } - -#if defined( BL_FORT_USE_UPPERCASE ) -# if (BL_SPACEDIM == 1) -# define FORT_PCINTERP PCINTERP1D -# elif (BL_SPACEDIM == 2) -# define FORT_CINTERP CINTERP2D -# define FORT_PCINTERP PCINTERP2D -# define FORT_CARTGRIDMINMAX CARTGRIDMINMAX2D -# elif (BL_SPACEDIM == 3) -# define FORT_CINTERP CINTERP3D -# define FORT_PCINTERP PCINTERP3D -# define FORT_CARTGRIDMINMAX CARTGRIDMINMAX3D -# endif -#elif defined( BL_FORT_USE_LOWERCASE ) -# if (BL_SPACEDIM == 1) -# define FORT_PCINTERP pcinterp1d -# elif (BL_SPACEDIM == 2) -# define FORT_CINTERP cinterp2d -# define FORT_PCINTERP pcinterp2d -# define FORT_CARTGRIDMINMAX cartgridminmax2d -# elif (BL_SPACEDIM == 3) -# define FORT_CINTERP cinterp3d -# define FORT_PCINTERP pcinterp3d -# define FORT_CARTGRIDMINMAX cartgridminmax3d -# endif -#else -# if (BL_SPACEDIM == 1) -# define FORT_PCINTERP pcinterp1d_ -# elif (BL_SPACEDIM == 2) -# define FORT_CINTERP cinterp2d_ -# define FORT_PCINTERP pcinterp2d_ -# define FORT_CARTGRIDMINMAX cartgridminmax2d_ -# elif (BL_SPACEDIM == 3) -# define FORT_CINTERP cinterp3d_ -# define FORT_PCINTERP pcinterp3d_ -# define FORT_CARTGRIDMINMAX cartgridminmax3d_ -# endif -#endif - - -extern "C" { -#if (BL_SPACEDIM != 1) - void FORT_CINTERP(amrex::Real *fine, AMREX_ARLIM_P(flo), AMREX_ARLIM_P(fhi), - const int *fblo, const int *fbhi, - const int &nvar, const int &lratio, - const amrex::Real *crse, const int &clo, const int &chi, - const int *cslo, const int *cshi, - const int *fslo, const int *fshi, - amrex::Real *cslope, const int &c_len, - amrex::Real *fslope, amrex::Real *fdat, const int &f_len, - amrex::Real *foff); -#endif - - void FORT_PCINTERP(amrex::Real *fine, AMREX_ARLIM_P(flo), AMREX_ARLIM_P(fhi), - const int *fblo, const int *fbhi, - const int &lrat, const int &nvar, - const amrex::Real *crse, AMREX_ARLIM_P(clo), AMREX_ARLIM_P(chi), - const int *cblo, const int *cbhi, - amrex::Real *temp, const int &tlo, const int &thi); - -#if (BL_SPACEDIM != 1) - void FORT_CARTGRIDMINMAX (amrex::Real *data, AMREX_ARLIM_P(dlo), AMREX_ARLIM_P(dhi), - const amrex::Real *vfrac, const amrex::Real &vfeps, - amrex::Real &dmin, amrex::Real &dmax); -#endif -} - namespace amrex { bool AmrData::verbose = false; @@ -1775,7 +1708,7 @@ bool AmrData::MinMax(const Box &onBox, const string &derived, int level, bool valid(false); // does onBox intersect any grids (are minmax valid) Real minVal, maxVal; dataMin = std::numeric_limits::max(); - dataMax = -std::numeric_limits::max(); + dataMax = std::numeric_limits::lowest(); Box overlap; // our strategy here is to use the VisMF min and maxes if possible @@ -1833,22 +1766,21 @@ bool AmrData::MinMax(const Box &onBox, const string 
&derived, int level, if(visMFMin < dataMin || visMFMax > dataMax) { // do it the hard way DefineFab(level, compIndex, gdx); DefineFab(level, vfIndex, gdx); - Real *ddat = (*dataGrids[level][compIndex])[gpli].dataPtr(); - Real *vdat = (*dataGrids[level][vfIndex])[gpli].dataPtr(); - const int *dlo = (*dataGrids[level][compIndex])[gpli].loVect(); - const int *dhi = (*dataGrids[level][compIndex])[gpli].hiVect(); - overlap = onBox; overlap &= gpli.validbox(); Real vfMaxVal = (*dataGrids[level][vfIndex])[gpli].max(overlap, 0); if(vfMaxVal >= vfEps[level]) { ++cCountMixedFort; valid = true; - - FORT_CARTGRIDMINMAX(ddat, AMREX_ARLIM(dlo), AMREX_ARLIM(dhi), vdat, vfEps[level], - minVal, maxVal); - dataMin = std::min(dataMin, minVal); - dataMax = std::max(dataMax, maxVal); + auto const& da = (*dataGrids[level][compIndex])[gpli].const_array(); + auto const& va = (*dataGrids[level][vfIndex])[gpli].const_array(); + amrex::LoopOnCpu((*dataGrids[level][compIndex])[gpli].box(), [&] (int i, int j, int k) + { + if (va(i,j,k) >= vfEps[level]) { + dataMin = std::min(dataMin, da(i,j,k)); + dataMax = std::max(dataMax, da(i,j,k)); + } + }); } } else { ++cCountMixedSkipped; @@ -1861,22 +1793,21 @@ bool AmrData::MinMax(const Box &onBox, const string &derived, int level, if(visMFMin < dataMin || visMFMax > dataMax) { // do it the hard way DefineFab(level, compIndex, gdx); DefineFab(level, vfIndex, gdx); - Real *ddat = (*dataGrids[level][compIndex])[gpli].dataPtr(); - Real *vdat = (*dataGrids[level][vfIndex])[gpli].dataPtr(); - const int *dlo = (*dataGrids[level][compIndex])[gpli].loVect(); - const int *dhi = (*dataGrids[level][compIndex])[gpli].hiVect(); - overlap = onBox; overlap &= gpli.validbox(); Real vfMaxVal = (*dataGrids[level][vfIndex])[gpli].max(overlap, 0); if(vfMaxVal >= vfEps[level]) { ++iCountMixedFort; valid = true; - - FORT_CARTGRIDMINMAX(ddat, AMREX_ARLIM(dlo), AMREX_ARLIM(dhi), vdat, vfEps[level], - minVal, maxVal); - dataMin = std::min(dataMin, minVal); - dataMax = std::max(dataMax, maxVal); + auto const& da = (*dataGrids[level][compIndex])[gpli].const_array(); + auto const& va = (*dataGrids[level][vfIndex])[gpli].const_array(); + amrex::LoopOnCpu((*dataGrids[level][compIndex])[gpli].box(), [&] (int i, int j, int k) + { + if (va(i,j,k) >= vfEps[level]) { + dataMin = std::min(dataMin, da(i,j,k)); + dataMax = std::max(dataMax, da(i,j,k)); + } + }); } else { ++iCountAllBody; } @@ -1963,58 +1894,6 @@ int AmrData::StateNumber(const string &statename) const { } -// --------------------------------------------------------------- -void AmrData::Interp(FArrayBox &fine, FArrayBox &crse, - const Box &fine_box, int lrat) -{ -#if (BL_SPACEDIM == 1) - amrex::ignore_unused(fine, crse, fine_box, lrat); - amrex::Abort("AmrData::MinMax: should not be here for 1d."); -#else - BL_ASSERT(fine.box().contains(fine_box)); - Box crse_bx(amrex::coarsen(fine_box,lrat)); - Box fslope_bx(amrex::refine(crse_bx,lrat)); - Box cslope_bx(crse_bx); - cslope_bx.grow(1); - BL_ASSERT(crse.box() == cslope_bx); - - // alloc temp space for coarse grid slopes - Long cLen = cslope_bx.numPts(); - Real *cslope = new Real[BL_SPACEDIM*cLen]; - Long loslp = cslope_bx.index(crse_bx.smallEnd()); - Long hislp = cslope_bx.index(crse_bx.bigEnd()); - Long cslope_vol = cslope_bx.numPts(); - Long clo = 1 - loslp; - Long chi = clo + cslope_vol - 1; - cLen = hislp - loslp + 1; - - // alloc temp space for one strip of fine grid slopes - int dir; - int fLen = fslope_bx.longside(dir); - Real *fdat = new Real[(BL_SPACEDIM+2)*fLen]; - Real *foff = fdat + 
fLen; - Real *fslope = foff + fLen; - - - // alloc tmp space for slope calc and to allow for vectorization - const int *fblo = fine_box.loVect(); - const int *fbhi = fine_box.hiVect(); - const int *cblo = crse_bx.loVect(); - const int *cbhi = crse_bx.hiVect(); - const int *fslo = fslope_bx.loVect(); - const int *fshi = fslope_bx.hiVect(); - - FORT_CINTERP(fine.dataPtr(0),AMREX_ARLIM(fine.loVect()),AMREX_ARLIM(fine.hiVect()), - fblo,fbhi,fine.nComp(),lrat, - crse.dataPtr(0),clo,chi,cblo,cbhi,fslo,fshi, - cslope,cLen,fslope,fdat,fLen,foff); - - delete [] fdat; - delete [] cslope; -#endif -} - - // --------------------------------------------------------------- void AmrData::PcInterp(FArrayBox &fine, const FArrayBox &crse, const Box &subbox, int lrat) @@ -2026,27 +1905,15 @@ void AmrData::PcInterp(FArrayBox &fine, const FArrayBox &crse, Box fine_ovlp(subbox); fine_ovlp &= cfine; if(fine_ovlp.ok()) { - const int *fblo = fine_ovlp.smallEnd().getVect(); - const int *fbhi = fine_ovlp.bigEnd().getVect(); - Box crse_ovlp(fine_ovlp); - crse_ovlp.coarsen(lrat); - const int *cblo = crse_ovlp.smallEnd().getVect(); - const int *cbhi = crse_ovlp.bigEnd().getVect(); - Box fine_temp(crse_ovlp); - fine_temp.refine(lrat); - int tlo = fine_temp.smallEnd()[0]; - int thi = fine_temp.bigEnd()[0]; - int inextra(0); - if(fine_temp.ixType().test(0) == true) { // node type - inextra = 1; - } - Real *tempSpace = new Real[thi-tlo+1+inextra]; - FORT_PCINTERP(fine.dataPtr(0),AMREX_ARLIM(fine.loVect()),AMREX_ARLIM(fine.hiVect()), - fblo,fbhi, lrat,fine.nComp(), - crse.dataPtr(),AMREX_ARLIM(crse.loVect()),AMREX_ARLIM(crse.hiVect()), - cblo,cbhi, tempSpace,tlo,thi); - - delete [] tempSpace; + auto const& fa = fine.array(); + auto const& ca = crse.const_array(); + amrex::LoopOnCpu(fine_ovlp, fine.nComp(), [&] (int i, int j, int k, int n) + { + int ic = amrex::coarsen(i,lrat); + int jc = amrex::coarsen(j,lrat); + int kc = amrex::coarsen(k,lrat); + fa(i,j,k,n) = ca(ic,jc,kc,n); + }); } } diff --git a/Src/Extern/amrdata/AMReX_FABUTIL_1D.F b/Src/Extern/amrdata/AMReX_FABUTIL_1D.F deleted file mode 100644 index 42824b08963..00000000000 --- a/Src/Extern/amrdata/AMReX_FABUTIL_1D.F +++ /dev/null @@ -1,297 +0,0 @@ -c ::: SCCS stuff "@(#)FABUTIL_2D.F 3.1\t6/25/93" - -#define FORT_CINTERP cinterp1d -#define FORT_PCINTERP pcinterp1d - -#include "AMReX_REAL.H" - -#ifdef BL_USE_FLOAT -#define REAL_T real -#define bigreal 1.0e30 -#define zero 0.0e0 -#define one 1.0e0 -#define half 0.5e0 -#else -#define REAL_T double precision -#define bigreal 1.0d30 -#define zero 0.0d0 -#define one 1.0d0 -#define half 0.5d0 -#endif - -c ::: -------------------------------------------------------------- - subroutine FORT_CINTERP (fine,floi1,floi2,fhii1,fhii2, fblo,fbhi, - $ nvar,lratio, crse,clo,chi,cblo,cbhi,fslo,fshi, cslope,clen, - $ fslope,fdat,flen,voff) - - implicit none - - integer floi1,floi2, fhii1,fhii2 - - integer fblo(2), fbhi(2) - integer cblo(2), cbhi(2) - integer fslo(2), fshi(2) - integer lratio, nvar, clen, flen, clo, chi - REAL_T fine(floi1 :fhii1 ,floi2 :fhii2, nvar) - REAL_T crse(clo:chi, nvar) - REAL_T cslope(clo:chi, 2) - REAL_T fslope(flen, 2) - REAL_T fdat(flen) - REAL_T voff(flen) - -c ::: NOTE: data must be sent in so that -c ::: cslope(1,*) and crse(1,*) are associated with -c ::: the same cell - -c ::: :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: -c ::: cinterp: conservative interpolation from coarse grid to -c ::: subregion of fine grid defined by (fblo,fbhi) -c ::: -c ::: Inputs/Outputs -c ::: fine <=> 
(modify) fine grid array -c ::: flo,fhi => (const) index limits of fine grid -c ::: fblo,fbhi => (const) subregion of fine grid to get values -c ::: nvar => (const) number of variables in state vector -c ::: lratio => (const) refinement ratio between levels -c ::: -c ::: crse => (const) coarse grid data widended by 1 zone -c ::: and unrolled -c ::: clo,chi => (const) one dimensional limits of crse grid -c ::: cslo,cshi => (const) coarse grid index limits where -c ::: slopes are to be defined. This is -c ::: the projection of (fblo,fbhi) down -c ::: to the coarse level -c ::: fslo,fshi => (const) fine grid index limits where -c ::: slopes are needed. This is the -c ::: refinement of (cslo,cshi) and -c ::: contains but may not be identical -c ::: to (fblo,fbhi). -c ::: cslope => (modify) temp array coarse grid slopes -c ::: clen => (const) length of coarse gtid slopes -c ::: fslope => (modify) temp array for fine grid slope -c ::: flen => (const) length of fine grid slope array -c ::: fdat => (const) temp array for fine grid data -c ::: :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: - -c ::: local var - integer n, fn - integer i, ic, ioff - integer j, jc, joff - integer ist, jst - REAL_T hafrat, volratio - REAL_T cen, forw, back, slp - REAL_T xoff, yoff - integer ncbx, ncby - integer ncsx, ncsy - integer islo, jslo - integer icc, istart, iend - integer lenx, leny, maxlen - - call bl_abort("fix FORT_CINTERP for 1d.") - - hafrat = half*float(lratio-1) - volratio = one/float(lratio) - - ncbx = cbhi(1)-cblo(1)+1 - ncby = cbhi(2)-cblo(2)+1 - ncsx = ncbx+2 - ncsy = ncby+2 - ist = 1 - jst = ncsx - islo = cblo(1)-1 - jslo = cblo(2)-1 - lenx = fbhi(1)-fblo(1)+1 - leny = fbhi(2)-fblo(2)+1 - maxlen = max(lenx,leny) - if (maxlen .eq. lenx) then - do 100 i = fblo(1), fbhi(1) - fn = i-fslo(1)+1 - ioff = mod(fn-1,lratio) - voff(fn) = float(ioff)-hafrat -100 continue - else - do 110 j = fblo(2), fbhi(2) - fn = j-fslo(2)+1 - joff = mod(fn-1,lratio) - voff(fn) = float(joff)-hafrat -110 continue - end if - do 120 n = 1, nvar - -c ::: ::::: compute slopes in x direction - do 130 i = 1, clen - cen = half*(crse(i+ist,n)-crse(i-ist,n)) - forw = crse(i+ist,n)-crse(i,n) - back = crse(i,n)-crse(i-ist,n) - slp = sign(one,cen)*min(abs(cen),abs(forw),abs(back)) - cslope(i,1)=merge(slp,zero,forw*back>=0.0d0) -130 continue -c ::: ::::: compute slopes in y direction - do 140 i = 1, clen - cen = half*(crse(i+jst,n)-crse(i-jst,n)) - forw = crse(i+jst,n)-crse(i,n) - back = crse(i,n)-crse(i-jst,n) - slp = sign(one,cen)*min(abs(cen),abs(forw),abs(back)) - cslope(i,2)=merge(slp,zero,forw*back>=0.0d0) -140 continue - if (maxlen .eq. lenx) then - do 150 jc = cblo(2), cbhi(2) - -c ::: ..,.......::::: strip out a fine grid slope vector - do 160 ioff = 1, lratio - icc = clo + ist + jst*(jc-jslo) - istart = ioff - iend = ioff + (ncbx-1)*lratio - do 170 fn = istart, iend, lratio - fslope(fn,1) = cslope(icc,1) - fslope(fn,2) = cslope(icc,2) - fdat(fn) = crse(icc,n) - icc = icc + ist -170 continue -160 continue - - do 180 joff = 0, lratio-1 - j = lratio*jc + joff - if (j .lt. fblo(2)) then - goto 180 -c --- next --- - end if - if (j .gt. 
fbhi(2)) then - goto 181 -c --- break --- - end if - yoff = float(joff)-hafrat - - do 190 i = fblo(1), fbhi(1) - fn = i-fslo(1)+1 - fine(i,j,n) = fdat(fn) + volratio* (voff(fn)* - $ fslope(fn,1)+yoff*fslope(fn,2)) -190 continue -180 continue -181 continue -150 continue - else - do 200 ic = cblo(1), cbhi(1) - -c ::: ..,.......::::: strip out a fine grid slope vector - do 210 joff = 1, lratio - icc = clo + ist*(ic-islo) + jst - istart = joff - iend = joff + (ncby-1)*lratio - do 220 fn = istart, iend, lratio - fslope(fn,1) = cslope(icc,1) - fslope(fn,2) = cslope(icc,2) - fdat(fn) = crse(icc,n) - icc = icc + jst -220 continue -210 continue - - do 230 ioff = 0, lratio-1 - i = lratio*ic + ioff - if (i .lt. fblo(1)) then - goto 230 -c --- next --- - end if - if (i .gt. fbhi(1)) then - goto 231 -c --- break --- - end if - xoff = float(ioff)-hafrat - - do 240 j = fblo(2), fbhi(2) - fn = j-fslo(2)+1 - fine(i,j,n) = fdat(fn) + volratio* (xoff* - $ fslope(fn,1)+voff(fn)*fslope(fn,2)) -240 continue -230 continue -231 continue -200 continue - end if -120 continue - - return - end - - -c ::: -------------------------------------------------------------- - subroutine FORT_PCINTERP (fine,floi1,fhii1,fblo, fbhi,lrat, - $ nvar, crse,cloi1,chii1,cblo, cbhi,temp,tloi,thii) - - implicit none - - integer floi1 - integer fhii1 - integer cloi1 - integer chii1 - - integer fblo(1), fbhi(1) - integer cblo(1), cbhi(1) - integer lrat, nvar, tloi, thii - REAL_T fine(floi1 :fhii1, nvar) - REAL_T crse(cloi1 :chii1, nvar) - REAL_T temp(tloi:thii + 1) -c ::: :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: -c ::: pcinterp: use piecewise constant interpolation to define -c ::: values on the subregion of the fine FAB defined -c ::: by (fblo,fbhi). -c ::: -c ::: Inputs/Outputs -c ::: fine <=> (modify) fab to get interpolated values -c ::: flo,fhi => (const) index limits of fine -c ::: fblo,fbhi => (const) subregion of fine grid to get values -c ::: crse => (const) fab holding coarse grid values -c ::: clo,chi => (const) index limits of src -c ::: cblo,cbhi => (const) subregion of coarse grid holding values -c ::: temp => (modify) temporary space for vectorization -c ::: tlo,thi => (const) index limits of temp space -c ::: :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: -c ::: local var - integer i,ic,ioff,n - integer ixproj,ii,ll - ixproj(ii,ll) = (ii + ll*iabs(ii))/ll - iabs(ii) - - do 130 n = 1, nvar - do 140 ioff = 0, lrat-1 - do 150 ic = cblo(1),cbhi(1) - i = lrat*ic + ioff - temp(i) = crse(ic,n) -150 continue -140 continue - do 160 i = fblo(1), fbhi(1) - fine(i,n) = temp(i) -160 continue -130 continue - - return - end - - -c ::: -------------------------------------------------------------- - subroutine cartgridminmax1d (data, lo1, lo2, hi1, hi2, - $ vfracdata, vfeps, dmin, dmax) - implicit none - - integer lo1, lo2, hi1, hi2 - REAL_T data(lo1:hi1 ,lo2:hi2) - REAL_T vfracdata(lo1:hi1 ,lo2:hi2) - REAL_T vfeps, dmin, dmax - - integer i, j - - call bl_abort("fix cartgridminmax1d for 1d.") - - dmax = -bigreal - dmin = bigreal - do 420 j = lo2, hi2 - do 430 i = lo1, hi1 -c print *, "i j vfracdata(i,j) = ",i,j,vfracdata(i,j) - if ( .not. 
(vfracdata(i,j).lt.vfeps)) then - dmax = max(dmax,data(i,j)) - dmin = min(dmin,data(i,j)) - endif -430 continue -420 continue - - return - end - diff --git a/Src/Extern/amrdata/AMReX_FABUTIL_2D.F b/Src/Extern/amrdata/AMReX_FABUTIL_2D.F deleted file mode 100644 index 8748760ff35..00000000000 --- a/Src/Extern/amrdata/AMReX_FABUTIL_2D.F +++ /dev/null @@ -1,297 +0,0 @@ -c ::: SCCS stuff "@(#)FABUTIL_2D.F 3.1\t6/25/93" - -#define FORT_CINTERP cinterp2d -#define FORT_PCINTERP pcinterp2d - -#include "AMReX_REAL.H" - -#ifdef BL_USE_FLOAT -#define REAL_T real -#define bigreal 1.0e30 -#define zero 0.0e0 -#define one 1.0e0 -#define half 0.5e0 -#else -#define REAL_T double precision -#define bigreal 1.0d30 -#define zero 0.0d0 -#define one 1.0d0 -#define half 0.5d0 -#endif - -c ::: -------------------------------------------------------------- - subroutine FORT_CINTERP (fine,floi1,floi2,fhii1,fhii2, fblo,fbhi, - $ nvar,lratio, crse,clo,chi,cblo,cbhi,fslo,fshi, cslope,clen, - $ fslope,fdat,flen,voff) - - implicit none - - integer floi1,floi2, fhii1,fhii2 - - integer fblo(2), fbhi(2) - integer cblo(2), cbhi(2) - integer fslo(2), fshi(2) - integer lratio, nvar, clen, flen, clo, chi - REAL_T fine(floi1 :fhii1 ,floi2 :fhii2, nvar) - REAL_T crse(clo:chi, nvar) - REAL_T cslope(clo:chi, 2) - REAL_T fslope(flen, 2) - REAL_T fdat(flen) - REAL_T voff(flen) - -c ::: NOTE: data must be sent in so that -c ::: cslope(1,*) and crse(1,*) are associated with -c ::: the same cell - -c ::: :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: -c ::: cinterp: conservative interpolation from coarse grid to -c ::: subregion of fine grid defined by (fblo,fbhi) -c ::: -c ::: Inputs/Outputs -c ::: fine <=> (modify) fine grid array -c ::: flo,fhi => (const) index limits of fine grid -c ::: fblo,fbhi => (const) subregion of fine grid to get values -c ::: nvar => (const) number of variables in state vector -c ::: lratio => (const) refinement ratio between levels -c ::: -c ::: crse => (const) coarse grid data widended by 1 zone -c ::: and unrolled -c ::: clo,chi => (const) one dimensional limits of crse grid -c ::: cslo,cshi => (const) coarse grid index limits where -c ::: slopes are to be defined. This is -c ::: the projection of (fblo,fbhi) down -c ::: to the coarse level -c ::: fslo,fshi => (const) fine grid index limits where -c ::: slopes are needed. This is the -c ::: refinement of (cslo,cshi) and -c ::: contains but may not be identical -c ::: to (fblo,fbhi). -c ::: cslope => (modify) temp array coarse grid slopes -c ::: clen => (const) length of coarse gtid slopes -c ::: fslope => (modify) temp array for fine grid slope -c ::: flen => (const) length of fine grid slope array -c ::: fdat => (const) temp array for fine grid data -c ::: :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: - -c ::: local var - integer n, fn - integer i, ic, ioff - integer j, jc, joff - integer ist, jst - REAL_T hafrat, volratio - REAL_T cen, forw, back, slp - REAL_T xoff, yoff - integer ncbx, ncby - integer ncsx, ncsy - integer islo, jslo - integer icc, istart, iend - integer lenx, leny, maxlen - - hafrat = half*float(lratio-1) - volratio = one/float(lratio) - - ncbx = cbhi(1)-cblo(1)+1 - ncby = cbhi(2)-cblo(2)+1 - ncsx = ncbx+2 - ncsy = ncby+2 - ist = 1 - jst = ncsx - islo = cblo(1)-1 - jslo = cblo(2)-1 - lenx = fbhi(1)-fblo(1)+1 - leny = fbhi(2)-fblo(2)+1 - maxlen = max(lenx,leny) - if (maxlen .eq. 
lenx) then - do 100 i = fblo(1), fbhi(1) - fn = i-fslo(1)+1 - ioff = mod(fn-1,lratio) - voff(fn) = float(ioff)-hafrat -100 continue - else - do 110 j = fblo(2), fbhi(2) - fn = j-fslo(2)+1 - joff = mod(fn-1,lratio) - voff(fn) = float(joff)-hafrat -110 continue - end if - do 120 n = 1, nvar - -c ::: ::::: compute slopes in x direction - do 130 i = 1, clen - cen = half*(crse(i+ist,n)-crse(i-ist,n)) - forw = crse(i+ist,n)-crse(i,n) - back = crse(i,n)-crse(i-ist,n) - slp = sign(one,cen)*min(abs(cen),abs(forw),abs(back)) - cslope(i,1)=merge(slp,zero,forw*back>=0.0d0) -130 continue -c ::: ::::: compute slopes in y direction - do 140 i = 1, clen - cen = half*(crse(i+jst,n)-crse(i-jst,n)) - forw = crse(i+jst,n)-crse(i,n) - back = crse(i,n)-crse(i-jst,n) - slp = sign(one,cen)*min(abs(cen),abs(forw),abs(back)) - cslope(i,2)=merge(slp,zero,forw*back>=0.0d0) -140 continue - if (maxlen .eq. lenx) then - do 150 jc = cblo(2), cbhi(2) - -c ::: ..,.......::::: strip out a fine grid slope vector - do 160 ioff = 1, lratio - icc = clo + ist + jst*(jc-jslo) - istart = ioff - iend = ioff + (ncbx-1)*lratio - do 170 fn = istart, iend, lratio - fslope(fn,1) = cslope(icc,1) - fslope(fn,2) = cslope(icc,2) - fdat(fn) = crse(icc,n) - icc = icc + ist -170 continue -160 continue - - do 180 joff = 0, lratio-1 - j = lratio*jc + joff - if (j .lt. fblo(2)) then - goto 180 -c --- next --- - end if - if (j .gt. fbhi(2)) then - goto 181 -c --- break --- - end if - yoff = float(joff)-hafrat - - do 190 i = fblo(1), fbhi(1) - fn = i-fslo(1)+1 - fine(i,j,n) = fdat(fn) + volratio* (voff(fn)* - $ fslope(fn,1)+yoff*fslope(fn,2)) -190 continue -180 continue -181 continue -150 continue - else - do 200 ic = cblo(1), cbhi(1) - -c ::: ..,.......::::: strip out a fine grid slope vector - do 210 joff = 1, lratio - icc = clo + ist*(ic-islo) + jst - istart = joff - iend = joff + (ncby-1)*lratio - do 220 fn = istart, iend, lratio - fslope(fn,1) = cslope(icc,1) - fslope(fn,2) = cslope(icc,2) - fdat(fn) = crse(icc,n) - icc = icc + jst -220 continue -210 continue - - do 230 ioff = 0, lratio-1 - i = lratio*ic + ioff - if (i .lt. fblo(1)) then - goto 230 -c --- next --- - end if - if (i .gt. fbhi(1)) then - goto 231 -c --- break --- - end if - xoff = float(ioff)-hafrat - - do 240 j = fblo(2), fbhi(2) - fn = j-fslo(2)+1 - fine(i,j,n) = fdat(fn) + volratio* (xoff* - $ fslope(fn,1)+voff(fn)*fslope(fn,2)) -240 continue -230 continue -231 continue -200 continue - end if -120 continue - - return - end - - -c ::: -------------------------------------------------------------- - subroutine FORT_PCINTERP (fine,floi1,floi2,fhii1,fhii2,fblo, - $ fbhi,lrat,nvar,crse,cloi1,cloi2,chii1,chii2,cblo, - $ cbhi,temp,tloi,thii) - - implicit none - - integer floi1,floi2 - integer fhii1,fhii2 - integer cloi1,cloi2 - integer chii1,chii2 - - integer fblo(2), fbhi(2) - integer cblo(2), cbhi(2) - integer lrat, nvar, tloi, thii - REAL_T fine(floi1 :fhii1 ,floi2 :fhii2, nvar) - REAL_T crse(cloi1 :chii1 ,cloi2 :chii2, nvar) - REAL_T temp(tloi:thii + 1) -c ::: :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: -c ::: pcinterp: use piecewise constant interpolation to define -c ::: values on the subregion of the fine FAB defined -c ::: by (fblo,fbhi). 
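
For orientation only, a minimal C++ sketch (hypothetical names, not part of this patch) of the piecewise-constant prolongation that pcinterp describes: each fine cell copies the value of the coarse cell that contains it, with the coarse index obtained by the same floor-style projection as the Fortran statement function ixproj. It assumes lrat > 0 and that crse covers the projected index range.

    #include <cstdlib>

    // Floor-style projection of a fine index onto the coarse grid for a
    // refinement ratio lrat; mirrors the Fortran statement function
    //   ixproj(ii,ll) = (ii + ll*iabs(ii))/ll - iabs(ii)
    inline int coarsen_index (int i, int lrat) {
        return (i + lrat*std::abs(i)) / lrat - std::abs(i);
    }

    // Piecewise-constant interpolation over the fine range [flo, fhi]:
    // fine cell i takes the value of coarse cell coarsen_index(i, lrat).
    inline void pc_interp (double* fine, int flo, int fhi,
                           const double* crse, int clo, int lrat) {
        for (int i = flo; i <= fhi; ++i) {
            fine[i - flo] = crse[coarsen_index(i, lrat) - clo];
        }
    }
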
-c ::: -c ::: Inputs/Outputs -c ::: fine <=> (modify) fab to get interpolated values -c ::: flo,fhi => (const) index limits of fine -c ::: fblo,fbhi => (const) subregion of fine grid to get values -c ::: crse => (const) fab holding coarse grid values -c ::: clo,chi => (const) index limits of src -c ::: cblo,cbhi => (const) subregion of coarse grid holding values -c ::: temp => (modify) temporary space for vectorization -c ::: tlo,thi => (const) index limits of temp space -c ::: :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: -c ::: local var - integer i,j,k,ic,jc,kc,ioff,n - integer ixproj,ii,ll - ixproj(ii,ll) = (ii + ll*iabs(ii))/ll - iabs(ii) - - do 120 j = fblo(2), fbhi(2) - jc = ixproj(j,lrat) - do 130 n = 1, nvar - do 140 ioff = 0, lrat-1 - do 150 ic = cblo(1),cbhi(1) - i = lrat*ic + ioff - temp(i) = crse(ic,jc,n) -150 continue -140 continue - do 160 i = fblo(1), fbhi(1) - fine(i,j,n) = temp(i) -160 continue -130 continue -120 continue - - return - end - - -c ::: -------------------------------------------------------------- - subroutine cartgridminmax2d (data, lo1, lo2, hi1, hi2, - $ vfracdata, vfeps, dmin, dmax) - implicit none - - integer lo1, lo2, hi1, hi2 - REAL_T data(lo1:hi1 ,lo2:hi2) - REAL_T vfracdata(lo1:hi1 ,lo2:hi2) - REAL_T vfeps, dmin, dmax - - integer i, j - - dmax = -bigreal - dmin = bigreal - do 420 j = lo2, hi2 - do 430 i = lo1, hi1 -c print *, "i j vfracdata(i,j) = ",i,j,vfracdata(i,j) - if ( .not. (vfracdata(i,j).lt.vfeps)) then - dmax = max(dmax,data(i,j)) - dmin = min(dmin,data(i,j)) - endif -430 continue -420 continue - - return - end - diff --git a/Src/Extern/amrdata/AMReX_FABUTIL_3D.F b/Src/Extern/amrdata/AMReX_FABUTIL_3D.F deleted file mode 100644 index 8a6ff561577..00000000000 --- a/Src/Extern/amrdata/AMReX_FABUTIL_3D.F +++ /dev/null @@ -1,758 +0,0 @@ -c ::: SCCS stuff "@(#)FABUTIL_3D.F 3.1\t6/25/93" - -#define FORT_CINTERP cinterp3d -#define FORT_PCINTERP pcinterp3d -#define FORT_VCINTERP vcinterp3d -#define FORT_CARTGRIDMINMAX cartgridminmax3d - -#include "AMReX_REAL.H" - -#ifdef BL_USE_FLOAT -#define REAL_T real -#define bigreal 1.0e30 -#define zero 0.0e0 -#define one 1.0e0 -#define half 0.5e0 -#else -#define REAL_T double precision -#define bigreal 1.0d30 -#define zero 0.0d0 -#define one 1.0d0 -#define half 0.5d0 -#endif - -#define IX_PROJ(A,B) (A+B*iabs(A))/B-iabs(A) - - -c ::: -------------------------------------------------------------- - subroutine FORT_CARTGRIDMINMAX (data, dlo1, dlo2, dlo3, dhi1, - $ dhi2, dhi3, vfracdata, vfeps, - $ dmin, dmax) - implicit none - - integer dlo1, dlo2, dlo3, dhi1, dhi2, dhi3 - REAL_T data(dlo1:dhi1 ,dlo2:dhi2, dlo3:dhi3) - REAL_T vfracdata(dlo1:dhi1 ,dlo2:dhi2, dlo3:dhi3) - REAL_T vfeps, dmin, dmax - - integer i, j, k - - dmax = -bigreal - dmin = bigreal - do 410 k = dlo3, dhi3 - do 420 j = dlo2, dhi2 - do 430 i = dlo1, dhi1 - if ( (vfracdata(i,j,k).ge.vfeps)) then - dmax = max(dmax,data(i,j,k)) - dmin = min(dmin,data(i,j,k)) - endif -430 continue -420 continue -410 continue - - return - end - - -c ::: -------------------------------------------------------------- - subroutine FORT_CINTERP (fine,floi1,floi2,floi3,fhii1,fhii2,fhii3, - $ fblo,fbhi,nvar,lratio, crse,clo,chi,cblo,cbhi,fslo,fshi, cslope, - $ clen,fslope,fdat,flen,voff) - - implicit none - - integer floi1,floi2,floi3, fhii1,fhii2,fhii3 - integer fblo(3), fbhi(3) - integer cblo(3), cbhi(3) - integer fslo(3), fshi(3) - integer lratio, nvar, clen, flen, clo, chi - REAL_T fine(floi1:fhii1,floi2:fhii2,floi3:fhii3,nvar) - REAL_T crse(clo:chi, 
nvar) - REAL_T cslope(clo:chi, 3) - REAL_T fslope(flen, 3) - REAL_T fdat(flen) - REAL_T voff(flen) - -c ::: NOTE: data must be sent in so that -c ::: cslope(1,*) and crse(1,*) are associated with -c ::: the same cell - -c ::: :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: -c ::: cinterp: conservative interpolation from coarse grid to -c ::: subregion of fine grid defined by (fblo,fbhi) -c ::: -c ::: Inputs/Outputs -c ::: fine <=> (modify) fine grid array -c ::: flo,fhi => (const) index limits of fine grid -c ::: fblo,fbhi => (const) subregion of fine grid to get values -c ::: nvar => (const) number of variables in state vector -c ::: lratio => (const) refinement ratio between levels -c ::: -c ::: crse => (const) coarse grid data widended by 1 zone -c ::: and unrolled -c ::: clo,chi => (const) one dimensional limits of crse grid -c ::: cslo,cshi => (const) coarse grid index limits where -c ::: slopes are to be defined. This is -c ::: the projection of (fblo,fbhi) down -c ::: to the coarse level -c ::: fslo,fshi => (const) fine grid index limits where -c ::: slopes are needed. This is the -c ::: refinement of (cslo,cshi) and -c ::: contains but may not be identical -c ::: to (fblo,fbhi). -c ::: cslope => (modify) temp array coarse grid slopes -c ::: clen => (const) length of coarse gtid slopes -c ::: fslope => (modify) temp array for fine grid slope -c ::: flen => (const) length of fine grid slope array -c ::: fdat => (const) temp array for fine grid data -c ::: :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: - -c ::: local var - integer n, fn - integer i, ic, ioff - integer j, jc, joff - integer k, kc, koff - integer ist, jst, kst - REAL_T hafrat, volratio - REAL_T cen, forw, back, slp - REAL_T xoff, yoff, zoff - integer ncbx, ncby, ncbz - integer ncsx, ncsy, ncsz - integer islo, jslo, kslo - integer icc, istart, iend - integer lenx, leny, lenz, maxlen - - hafrat = half*float(lratio-1) - volratio = one/float(lratio) - - ncbx = cbhi(1)-cblo(1)+1 - ncby = cbhi(2)-cblo(2)+1 - ncbz = cbhi(3)-cblo(3)+1 - ncsx = ncbx+2 - ncsy = ncby+2 - ncsz = ncbz+2 - ist = 1 - jst = ncsx - kst = ncsx*ncsy - islo = cblo(1)-1 - jslo = cblo(2)-1 - kslo = cblo(3)-1 - lenx = fbhi(1)-fblo(1)+1 - leny = fbhi(2)-fblo(2)+1 - lenz = fbhi(3)-fblo(3)+1 - maxlen = max(lenx,leny,lenz) - if (maxlen .eq. lenx) then - do 100 i = fblo(1), fbhi(1) - fn = i-fslo(1)+1 - ioff = mod(fn-1,lratio) - voff(fn) = float(ioff)-hafrat -100 continue - else if (maxlen .eq. 
leny) then - do 110 j = fblo(2), fbhi(2) - fn = j-fslo(2)+1 - joff = mod(fn-1,lratio) - voff(fn) = float(joff)-hafrat -110 continue - else - do 120 k = fblo(3), fbhi(3) - fn = k-fslo(3)+1 - koff = mod(fn-1,lratio) - voff(fn) = float(koff)-hafrat -120 continue - end if - do 130 n = 1, nvar - -c ::: ::::: compute slopes in x direction - do 140 i = 1, clen - cen = half*(crse(i+ist,n)-crse(i-ist,n)) - forw = crse(i+ist,n)-crse(i,n) - back = crse(i,n)-crse(i-ist,n) - slp = sign(one,cen)*min(abs(cen),abs(forw),abs(back)) - cslope(i,1)=merge(slp,zero,forw*back>=0.0d0) -140 continue -c ::: ::::: compute slopes in y direction - do 150 i = 1, clen - cen = half*(crse(i+jst,n)-crse(i-jst,n)) - forw = crse(i+jst,n)-crse(i,n) - back = crse(i,n)-crse(i-jst,n) - slp = sign(one,cen)*min(abs(cen),abs(forw),abs(back)) - cslope(i,2)=merge(slp,zero,forw*back>=0.0d0) -150 continue -c ::: ::::: compute slopes in z direction - do 160 i = 1, clen - cen = half*(crse(i+kst,n)-crse(i-kst,n)) - forw = crse(i+kst,n)-crse(i,n) - back = crse(i,n)-crse(i-kst,n) - slp = sign(one,cen)*min(abs(cen),abs(forw),abs(back)) - cslope(i,3)=merge(slp,zero,forw*back>=0.0d0) -160 continue - - if (maxlen .eq. lenx) then - do 170 kc = cblo(3), cbhi(3) - do 180 jc = cblo(2), cbhi(2) - -c ::: ..,.......::::: strip out a fine grid slope vector - do 190 ioff = 1, lratio - icc = clo + ist + jst*(jc-jslo) + kst*(kc- - $ kslo) - istart = ioff - iend = ioff + (ncbx-1)*lratio - do 200 fn = istart, iend, lratio - fslope(fn,1) = cslope(icc,1) - fslope(fn,2) = cslope(icc,2) - fslope(fn,3) = cslope(icc,3) - fdat(fn) = crse(icc,n) - icc = icc + ist -200 continue -190 continue - - do 210 koff = 0, lratio-1 - k = lratio*kc + koff - if (k .lt. fblo(3)) then - goto 210 -c --- next --- - end if - if (k .gt. fbhi(3)) then - goto 211 -c --- break --- - end if - zoff = float(koff)-hafrat - do 220 joff = 0, lratio-1 - j = lratio*jc + joff - if (j .lt. fblo(2)) then - goto 220 -c --- next --- - end if - if (j .gt. fbhi(2)) then - goto 221 -c --- break --- - end if - yoff = float(joff)-hafrat - - do 230 i = fblo(1), fbhi(1) - fn = i-fslo(1)+1 - fine(i,j,k,n) = fdat(fn) + volratio* - $ (voff(fn)*fslope(fn,1)+yoff*fslope(fn, - $ 2)+ zoff*fslope(fn,3)) -230 continue -220 continue -221 continue -210 continue -211 continue -180 continue -170 continue - else if (maxlen .eq. leny) then - do 240 kc = cblo(3), cbhi(3) - do 250 ic = cblo(1), cbhi(1) - -c ::: ..,.......::::: strip out a fine grid slope vector - do 260 joff = 1, lratio - icc = clo + ist*(ic-islo) + jst + kst*(kc- - $ kslo) - istart = joff - iend = joff + (ncby-1)*lratio - do 270 fn = istart, iend, lratio - fslope(fn,1) = cslope(icc,1) - fslope(fn,2) = cslope(icc,2) - fslope(fn,3) = cslope(icc,3) - fdat(fn) = crse(icc,n) - icc = icc + jst -270 continue -260 continue - - do 280 koff = 0, lratio-1 - k = lratio*kc + koff - if (k .lt. fblo(3)) then - goto 280 -c --- next --- - end if - if (k .gt. fbhi(3)) then - goto 281 -c --- break --- - end if - zoff = float(koff)-hafrat - do 290 ioff = 0, lratio-1 - i = lratio*ic + ioff - if (i .lt. fblo(1)) then - goto 290 -c --- next --- - end if - if (i .gt. 
fbhi(1)) then - goto 291 -c --- break --- - end if - xoff = float(ioff)-hafrat - - do 300 j = fblo(2), fbhi(2) - fn = j-fslo(2)+1 - fine(i,j,k,n) = fdat(fn) + volratio* - $ (xoff*fslope(fn,1)+voff(fn)*fslope(fn, - $ 2)+ zoff*fslope(fn,3)) -300 continue -290 continue -291 continue -280 continue -281 continue -250 continue -240 continue - else - do 310 jc = cblo(2), cbhi(2) - do 320 ic = cblo(1), cbhi(1) - -c ::: ..,.......::::: strip out a fine grid slope vector - do 330 koff = 1, lratio - icc = clo + ist*(ic-islo) + jst*(jc-jslo) + - $ kst - istart = koff - iend = koff + (ncbz-1)*lratio - do 340 fn = istart, iend, lratio - fslope(fn,1) = cslope(icc,1) - fslope(fn,2) = cslope(icc,2) - fslope(fn,3) = cslope(icc,3) - fdat(fn) = crse(icc,n) - icc = icc + kst -340 continue -330 continue - - do 350 joff = 0, lratio-1 - j = lratio*jc + joff - if (j .lt. fblo(2)) then - goto 350 -c --- next --- - end if - if (j .gt. fbhi(2)) then - goto 351 -c --- break --- - end if - yoff = float(joff)-hafrat - do 360 ioff = 0, lratio-1 - i = lratio*ic + ioff - if (i .lt. fblo(1)) then - goto 360 -c --- next --- - end if - if (i .gt. fbhi(1)) then - goto 361 -c --- break --- - end if - xoff = float(ioff)-hafrat - - do 370 k = fblo(3), fbhi(3) - fn = k-fslo(3)+1 - fine(i,j,k,n) = fdat(fn) + volratio* - $ (xoff*fslope(fn,1)+yoff*fslope(fn,2)+ - $ voff(fn)*fslope(fn,3)) -370 continue -360 continue -361 continue -350 continue -351 continue -320 continue -310 continue - end if -130 continue - - return - end - - -c ::: -------------------------------------------------------------- - subroutine FORT_PCINTERP (fine,floi1,floi2,floi3,fhii1,fhii2, - $ fhii3,fblo,fbhi,lrat,nvar,crse,cloi1,cloi2,cloi3,chii1,chii2, - $ chii3,cblo,cbhi,temp,tloi,thii) - - implicit none - - integer floi1,floi2,floi3 - integer fhii1,fhii2,fhii3 - integer cloi1,cloi2,cloi3 - integer chii1,chii2,chii3 - - integer fblo(3), fbhi(3) - integer cblo(3), cbhi(3) - integer lrat, nvar, tloi, thii - REAL_T fine(floi1:fhii1,floi2:fhii2,floi3:fhii3, nvar) - REAL_T crse(cloi1:chii1,cloi2:chii2,cloi3:chii3, nvar) - REAL_T temp(tloi:thii + 1) -c ::: :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: -c ::: pcinterp: use piecewise constant interpolation to define -c ::: values on the subregion of the fine FAB defined -c ::: by (fblo,fbhi). 
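
As a companion note, the limited slopes computed by the cinterp and vcinterp kernels in this file follow a standard monotonized pattern: the central difference is clipped to the smaller of the two one-sided differences and set to zero at local extrema. A minimal C++ sketch (hypothetical name, not part of this patch) of that per-cell computation:

    #include <algorithm>
    #include <cmath>

    // Slope for one cell from its left/center/right neighbors: the central
    // difference, limited by both one-sided differences, and zeroed where
    // the one-sided differences disagree in sign (a local extremum).
    inline double limited_slope (double v_minus, double v_center, double v_plus) {
        double cen  = 0.5 * (v_plus - v_minus);
        double forw = v_plus   - v_center;
        double back = v_center - v_minus;
        double slp  = std::copysign(1.0, cen)
                    * std::min({std::abs(cen), std::abs(forw), std::abs(back)});
        return (forw * back >= 0.0) ? slp : 0.0;
    }
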
-c ::: -c ::: Inputs/Outputs -c ::: fine <=> (modify) fab to get interpolated values -c ::: flo,fhi => (const) index limits of fine -c ::: fblo,fbhi => (const) subregion of fine grid to get values -c ::: crse => (const) fab holding coarse grid values -c ::: clo,chi => (const) index limits of src -c ::: cblo,cbhi => (const) subregion of coarse grid holding values -c ::: temp => (modify) temporary space for vectorization -c ::: tlo,thi => (const) index limits of temp space -c ::: :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: -c ::: local var - integer i,j,k,ic,jc,kc,ioff,n - integer ixproj,ii,ll - ixproj(ii,ll) = (ii + ll*iabs(ii))/ll - iabs(ii) - - do 130 k = fblo(3), fbhi(3) - kc = ixproj(k,lrat) - do 140 j = fblo(2), fbhi(2) - jc = ixproj(j,lrat) - do 150 n = 1, nvar - do 160 ioff = 0, lrat-1 - do 170 ic = cblo(1),cbhi(1) - i = lrat*ic + ioff - temp(i) = crse(ic,jc,kc,n) -170 continue -160 continue - do 180 i = fblo(1), fbhi(1) - fine(i,j,k,n) = temp(i) -180 continue -150 continue -140 continue -130 continue - - return - end - - -c ::: -------------------------------------------------------------- - subroutine FORT_VCINTERP (fine,floi1,floi2,floi3,fhii1,fhii2, - $ fhii3,fbloi1,fbloi2,fbloi3, fbhii1,fbhii2,fbhii3,nvar,lratio, - $ crse,clo,chi,cblo,cbhi, fslo,fshi,cvloi1,cvloi2,cvloi3,cvhii1, - $ cvhii2,cvhii3, cslope,clen,fslope,fdat,flen,voff, cvc1,fvc1,cvc2, - $ fvc2,cvc3,fvc3) - - implicit none - - integer floi1,floi2,floi3 - integer fhii1,fhii2,fhii3 - integer fbloi1,fbloi2,fbloi3 - integer fbhii1,fbhii2,fbhii3 - integer cvloi1,cvloi2,cvloi3 - integer cvhii1,cvhii2,cvhii3 - - integer cblo(3), cbhi(3) - integer fslo(3), fshi(3) - integer lratio, nvar, clen, flen, clo, chi - REAL_T fine(floi1:fhii1,floi2:fhii2,floi3:fhii3,nvar) - REAL_T crse(clo:chi, nvar) - REAL_T cslope(clo:chi, 3) - REAL_T fslope(flen, 3) - REAL_T fdat(flen) - REAL_T voff(flen) - - REAL_T cvc1(cvloi1 :cvhii1+1) - REAL_T cvc2(cvloi2 :cvhii2+1) - REAL_T cvc3(cvloi3 :cvhii3+1) - REAL_T fvc1(fbloi1 :fbhii1+1) - REAL_T fvc2(fbloi2 :fbhii2+1) - REAL_T fvc3(fbloi3 :fbhii3+1) - -c ::: NOTE: data must be sent in so that -c ::: cslope(1,*) and crse(1,*) are associated with -c ::: the same cell - -c ::: :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: -c ::: vcinterp: conservative interpolation in volume coordinates -c ::: from coarse grid to -c ::: subregion of fine grid defined by (fblo,fbhi) -c ::: -c ::: Inputs/Outputs -c ::: fine <=> (modify) fine grid array -c ::: flo,fhi => (const) index limits of fine grid -c ::: fblo,fbhi => (const) subregion of fine grid to get values -c ::: nvar => (const) number of variables in state vector -c ::: lratio => (const) refinement ratio between levels -c ::: -c ::: crse => (const) coarse grid data widended by 1 zone -c ::: and unrolled -c ::: clo,chi => (const) one dimensional limits of crse grid -c ::: cslo,cshi => (const) coarse grid index limits where -c ::: slopes are to be defined. This is -c ::: the projection of (fblo,fbhi) down -c ::: to the coarse level -c ::: fslo,fshi => (const) fine grid index limits where -c ::: slopes are needed. This is the -c ::: refinement of (cslo,cshi) and -c ::: contains but may not be identical -c ::: to (fblo,fbhi). 
-c ::: cslope => (modify) temp array coarse grid slopes -c ::: clen => (const) length of coarse gtid slopes -c ::: fslope => (modify) temp array for fine grid slope -c ::: flen => (const) length of fine grid slope array -c ::: fdat => (const) temp array for fine grid data -c ::: cvlo,cvhi => (const) coarse grid index limits where -c ::: volume coordinates are defined -c ::: cvc1 => (const) coarse grid volume coords, x-dir -c ::: cvc2 => (const) coarse grid volume coords, y_dir -c ::: cvc3 => (const) coarse grid volume coords, z-dir -c ::: fvc1 => (const) fine grid volume coords, x-dir -c ::: fvc2 => (const) fine grid volume coords, y-dir -c ::: fvc3 => (const) fine grid volume coords, z-dir -c ::: :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: - -c ::: local var - integer n, fn - integer i, ic, ioff - integer j, jc, joff - integer k, kc, koff - integer ist, jst, kst - REAL_T cen, forw, back, slp - REAL_T xoff, yoff, zoff - REAL_T fcen, ccen - integer ncbx, ncby, ncbz - integer ncsx, ncsy, ncsz - integer islo, jslo, kslo - integer icc, istart, iend - integer lenx, leny, lenz, maxlen - - ncbx = cbhi(1)-cblo(1)+1 - ncby = cbhi(2)-cblo(2)+1 - ncbz = cbhi(3)-cblo(3)+1 - ncsx = ncbx+2 - ncsy = ncby+2 - ncsz = ncbz+2 - ist = 1 - jst = ncsx - kst = ncsx*ncsy - islo = cblo(1)-1 - jslo = cblo(2)-1 - kslo = cblo(3)-1 - lenx = fbhii1-fbloi1+1 - leny = fbhii2-fbloi2+1 - lenz = fbhii3-fbloi3+1 - maxlen = max(lenx,leny,lenz) - if (maxlen .eq. lenx) then - do 380 i = fbloi1, fbhii1 - fn = i-fslo(1)+1 - ic = IX_PROJ(i,lratio) - fcen = half*(fvc1(i)+fvc1(i+1)) - ccen = half*(cvc1(ic)+cvc1(ic+1)) - voff(fn) = (fcen-ccen)/(cvc1(ic+1)-cvc1(ic)) -380 continue - else if (maxlen .eq. leny) then - do 390 j = fbloi2, fbhii2 - fn = j-fslo(2)+1 - jc = IX_PROJ(j,lratio) - fcen = half*(fvc2(j)+fvc2(j+1)) - ccen = half*(cvc2(jc)+cvc2(jc+1)) - voff(fn) = (fcen-ccen)/(cvc2(jc+1)-cvc2(jc)) -390 continue - else - do 400 k = fbloi3, fbhii3 - fn = k-fslo(3)+1 - kc = IX_PROJ(k,lratio) - fcen = half*(fvc3(k)+fvc3(k+1)) - ccen = half*(cvc3(kc)+cvc3(kc+1)) - voff(fn) = (fcen-ccen)/(cvc3(kc+1)-cvc3(kc)) -400 continue - end if - do 410 n = 1, nvar - -c ::: ::::: compute slopes in x direction - do 420 i = 1, clen - cen = half*(crse(i+ist,n)-crse(i-ist,n)) - forw = crse(i+ist,n)-crse(i,n) - back = crse(i,n)-crse(i-ist,n) - slp = sign(one,cen)*min(abs(cen),abs(forw),abs(back)) - cslope(i,1)=merge(slp,zero,forw*back>=0.0d0) -420 continue -c ::: ::::: compute slopes in y direction - do 430 i = 1, clen - cen = half*(crse(i+jst,n)-crse(i-jst,n)) - forw = crse(i+jst,n)-crse(i,n) - back = crse(i,n)-crse(i-jst,n) - slp = sign(one,cen)*min(abs(cen),abs(forw),abs(back)) - cslope(i,2)=merge(slp,zero,forw*back>=0.0d0) -430 continue -c ::: ::::: compute slopes in z direction - do 440 i = 1, clen - cen = half*(crse(i+kst,n)-crse(i-kst,n)) - forw = crse(i+kst,n)-crse(i,n) - back = crse(i,n)-crse(i-kst,n) - slp = sign(one,cen)*min(abs(cen),abs(forw),abs(back)) - cslope(i,3)=merge(slp,zero,forw*back>=0.0d0) -440 continue - - if (maxlen .eq. 
lenx) then - do 450 kc = cblo(3), cbhi(3) - do 460 jc = cblo(2), cbhi(2) - -c ::: ..,.......::::: strip out a fine grid slope vector - do 470 ioff = 1, lratio - icc = clo + ist + jst*(jc-jslo) + kst*(kc- - $ kslo) - istart = ioff - iend = ioff + (ncbx-1)*lratio - do 480 fn = istart, iend, lratio - fslope(fn,1) = cslope(icc,1) - fslope(fn,2) = cslope(icc,2) - fslope(fn,3) = cslope(icc,3) - fdat(fn) = crse(icc,n) - icc = icc + ist -480 continue -470 continue - - do 490 koff = 0, lratio-1 - k = lratio*kc + koff - if (k .lt. fbloi3) then - goto 490 -c --- next --- - end if - if (k .gt. fbhii3) then - goto 491 -c --- break --- - end if - fcen = half*(fvc3(k)+fvc3(k+1)) - ccen = half*(cvc3(kc)+cvc3(kc+1)) - zoff = (fcen-ccen)/(cvc3(kc+1)-cvc3(kc)) - do 500 joff = 0, lratio-1 - j = lratio*jc + joff - if (j .lt. fbloi2) then - goto 500 -c --- next --- - end if - if (j .gt. fbhii2) then - goto 501 -c --- break --- - end if - fcen = half*(fvc2(j)+fvc2(j+1)) - ccen = half*(cvc2(jc)+cvc2(jc+1)) - yoff = (fcen-ccen)/(cvc2(jc+1)-cvc2(jc)) - - do 510 i = fbloi1, fbhii1 - fn = i-fslo(1)+1 - fine(i,j,k,n) = fdat(fn) + voff(fn)* - $ fslope(fn,1)+yoff*fslope(fn,2)+ zoff* - $ fslope(fn,3) -510 continue -500 continue -501 continue -490 continue -491 continue -460 continue -450 continue - else if (maxlen .eq. leny) then - do 520 kc = cblo(3), cbhi(3) - do 530 ic = cblo(1), cbhi(1) - -c ::: ..,.......::::: strip out a fine grid slope vector - do 540 joff = 1, lratio - icc = clo + ist*(ic-islo) + jst + kst*(kc- - $ kslo) - istart = joff - iend = joff + (ncby-1)*lratio - do 550 fn = istart, iend, lratio - fslope(fn,1) = cslope(icc,1) - fslope(fn,2) = cslope(icc,2) - fslope(fn,3) = cslope(icc,3) - fdat(fn) = crse(icc,n) - icc = icc + jst -550 continue -540 continue - - do 560 koff = 0, lratio-1 - k = lratio*kc + koff - if (k .lt. fbloi3) then - goto 560 -c --- next --- - end if - if (k .gt. fbhii3) then - goto 561 -c --- break --- - end if - fcen = half*(fvc3(k)+fvc3(k+1)) - ccen = half*(cvc3(kc)+cvc3(kc+1)) - zoff = (fcen-ccen)/(cvc3(kc+1)-cvc3(kc)) - do 570 ioff = 0, lratio-1 - i = lratio*ic + ioff - if (i .lt. fbloi1) then - goto 570 -c --- next --- - end if - if (i .gt. fbhii1) then - goto 571 -c --- break --- - end if - fcen = half*(fvc1(i)+fvc1(i+1)) - ccen = half*(cvc1(ic)+cvc1(ic+1)) - xoff = (fcen-ccen)/(cvc1(ic+1)-cvc1(ic)) - - do 580 j = fbloi2, fbhii2 - fn = j-fslo(2)+1 - fine(i,j,k,n) = fdat(fn) + xoff* - $ fslope(fn,1)+voff(fn)*fslope(fn,2)+ - $ zoff*fslope(fn,3) -580 continue -570 continue -571 continue -560 continue -561 continue -530 continue -520 continue - else - do 590 jc = cblo(2), cbhi(2) - do 600 ic = cblo(1), cbhi(1) - -c ::: ..,.......::::: strip out a fine grid slope vector - do 610 koff = 1, lratio - icc = clo + ist*(ic-islo) + jst*(jc-jslo) + - $ kst - istart = koff - iend = koff + (ncbz-1)*lratio - do 620 fn = istart, iend, lratio - fslope(fn,1) = cslope(icc,1) - fslope(fn,2) = cslope(icc,2) - fslope(fn,3) = cslope(icc,3) - fdat(fn) = crse(icc,n) - icc = icc + kst -620 continue -610 continue - - do 630 joff = 0, lratio-1 - j = lratio*jc + joff - if (j .lt. fbloi2) then - goto 630 -c --- next --- - end if - if (j .gt. fbhii2) then - goto 631 -c --- break --- - end if - fcen = half*(fvc2(j)+fvc2(j+1)) - ccen = half*(cvc2(jc)+cvc2(jc+1)) - yoff = (fcen-ccen)/(cvc2(jc+1)-cvc2(jc)) - do 640 ioff = 0, lratio-1 - i = lratio*ic + ioff - if (i .lt. fbloi1) then - goto 640 -c --- next --- - end if - if (i .gt. 
fbhii1) then - goto 641 -c --- break --- - end if - fcen = half*(fvc1(i)+fvc1(i+1)) - ccen = half*(cvc1(ic)+cvc1(ic+1)) - xoff = (fcen-ccen)/(cvc1(ic+1)-cvc1(ic)) - - do 650 k = fbloi3, fbhii3 - fn = k-fslo(3)+1 - fine(i,j,k,n) = fdat(fn) + xoff* - $ fslope(fn,1)+yoff*fslope(fn,2)+ - $ voff(fn)*fslope(fn,3) -650 continue -640 continue -641 continue -630 continue -631 continue -600 continue -590 continue - end if -410 continue - - return - end - - diff --git a/Src/Extern/amrdata/CMakeLists.txt b/Src/Extern/amrdata/CMakeLists.txt index 10761f6a641..d17fcddec19 100644 --- a/Src/Extern/amrdata/CMakeLists.txt +++ b/Src/Extern/amrdata/CMakeLists.txt @@ -12,7 +12,6 @@ foreach(D IN LISTS AMReX_SPACEDIM) AMReX_WritePlotFile.H AMReX_WritePlotFile.cpp AMReX_AmrvisConstants.H - AMReX_FABUTIL_${D}D.F ) if (AMReX_PROFPARSER) diff --git a/Src/Extern/amrdata/Make.package b/Src/Extern/amrdata/Make.package index 3f3769ad174..9368e0669b3 100644 --- a/Src/Extern/amrdata/Make.package +++ b/Src/Extern/amrdata/Make.package @@ -1,6 +1,5 @@ CEXE_sources += AMReX_AmrData.cpp AMReX_XYPlotDataList.cpp AMReX_DataServices.cpp AMReX_WritePlotFile.cpp CEXE_headers += AMReX_AmrData.H AMReX_AmrvisConstants.H AMReX_XYPlotDataList.H AMReX_DataServices.H AMReX_WritePlotFile.H -FEXE_sources += AMReX_FABUTIL_${DIM}D.F VPATH_LOCATIONS += $(AMREX_HOME)/Src/Extern/amrdata INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/Extern/amrdata diff --git a/Src/F_Interfaces/Base/AMReX_boxarray_fi.cpp b/Src/F_Interfaces/Base/AMReX_boxarray_fi.cpp index 248ae076b6f..dd7916a9adf 100644 --- a/Src/F_Interfaces/Base/AMReX_boxarray_fi.cpp +++ b/Src/F_Interfaces/Base/AMReX_boxarray_fi.cpp @@ -47,6 +47,11 @@ extern "C" { ba->maxSize(iv); } + Long amrex_fi_boxarray_nboxes (const BoxArray* ba) + { + return ba->size(); + } + void amrex_fi_boxarray_get_box (const BoxArray* ba, int i, int lo[3], int hi[3]) { const Box& bx = (*ba)[i]; diff --git a/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90 b/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90 index 902f2925425..b156a8ae996 100644 --- a/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90 +++ b/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90 @@ -20,6 +20,7 @@ module amrex_boxarray_module procedure :: move => amrex_boxarray_move ! transfer ownership generic :: maxSize => amrex_boxarray_maxsize_int, & ! make the boxes smaller & amrex_boxarray_maxsize_int3, amrex_boxarray_maxsize_iv + procedure :: nboxes => amrex_boxarray_nboxes procedure :: get_box => amrex_boxarray_get_box procedure :: nodal_type => amrex_boxarray_nodal_type ! 
get index type procedure :: num_pts => amrex_boxarray_num_pts @@ -82,6 +83,13 @@ subroutine amrex_fi_boxarray_maxsize (ba,s) bind(c) integer(c_int), intent(in) :: s(3) end subroutine amrex_fi_boxarray_maxsize + pure function amrex_fi_boxarray_nboxes (ba) bind(c) + import + implicit none + type(c_ptr), value, intent(in) :: ba + integer(amrex_long) :: amrex_fi_boxarray_nboxes + end function amrex_fi_boxarray_nboxes + subroutine amrex_fi_boxarray_get_box (ba,i,lo,hi) bind(c) import implicit none @@ -194,6 +202,16 @@ subroutine amrex_boxarray_maxsize_iv (this, s) call amrex_fi_boxarray_maxsize(this%p, s) end subroutine amrex_boxarray_maxsize_iv + pure function amrex_boxarray_nboxes (this) result(n) + class(amrex_boxarray), intent(in) :: this + integer(amrex_long) :: n + if (c_associated(this%p)) then + n = amrex_fi_boxarray_nboxes(this%p) + else + n = 0 + end if + end function amrex_boxarray_nboxes + function amrex_boxarray_get_box (this, i) result(bx) class(amrex_boxarray) :: this integer, intent(in) :: i @@ -220,7 +238,11 @@ end function amrex_boxarray_nodal_type pure function amrex_boxarray_num_pts (this) result(n) class(amrex_boxarray), intent(in) :: this integer(amrex_long) :: n - n = amrex_fi_boxarray_numpts(this%p) + if (c_associated(this%p)) then + n = amrex_fi_boxarray_numpts(this%p) + else + n = 0 + end if end function amrex_boxarray_num_pts pure function amrex_boxarray_intersects_box (this, bx) result(r) diff --git a/Src/LinearSolvers/CMakeLists.txt b/Src/LinearSolvers/CMakeLists.txt index cae0b2028f0..6287ef4b422 100644 --- a/Src/LinearSolvers/CMakeLists.txt +++ b/Src/LinearSolvers/CMakeLists.txt @@ -21,6 +21,7 @@ foreach(D IN LISTS AMReX_SPACEDIM) MLMG/AMReX_MLCellABecLap_K.H MLMG/AMReX_MLCellABecLap_${D}D_K.H MLMG/AMReX_MLCGSolver.H + MLMG/AMReX_PCGSolver.H MLMG/AMReX_MLABecLaplacian.H MLMG/AMReX_MLABecLap_K.H MLMG/AMReX_MLABecLap_${D}D_K.H diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLap_1D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLABecLap_1D_K.H index fbf324d6c98..29f8fd9f7e9 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLABecLap_1D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLap_1D_K.H @@ -157,6 +157,69 @@ void abec_gsrb_os (int i, int, int, int n, Array4 const& phi, Array4 template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void abec_jacobi (int i, int, int, int n, Array4 const& phi, + Array4 const& rhs, Array4 const& Ax, + T alpha, Array4 const& a, + T dhx, + Array4 const& bX, + Array4 const& m0, + Array4 const& m1, + Array4 const& f0, + Array4 const& f1, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T cf0 = (i == vlo.x && m0(vlo.x-1,0,0) > 0) + ? f0(vlo.x,0,0,n) : T(0.0); + T cf1 = (i == vhi.x && m1(vhi.x+1,0,0) > 0) + ? f1(vhi.x,0,0,n) : T(0.0); + + T delta = dhx*(bX(i,0,0,n)*cf0 + bX(i+1,0,0,n)*cf1); + + T gamma = alpha*a(i,0,0) + + dhx*( bX(i,0,0,n) + bX(i+1,0,0,n) ); + + phi(i,0,0,n) += T(2.0/3.0) * (rhs(i,0,0,n) - Ax(i,0,0,n)) / (gamma - delta); +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void abec_jacobi_os (int i, int, int, int n, Array4 const& phi, + Array4 const& rhs, Array4 const& Ax, + T alpha, Array4 const& a, + T dhx, + Array4 const& bX, + Array4 const& m0, + Array4 const& m1, + Array4 const& f0, + Array4 const& f1, + Array4 const& osm, + Box const& vbox) noexcept +{ + if (osm(i,0,0) == 0) { + phi(i,0,0) = T(0.0); + } else { + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T cf0 = (i == vlo.x && m0(vlo.x-1,0,0) > 0) + ? 
f0(vlo.x,0,0,n) : T(0.0); + T cf1 = (i == vhi.x && m1(vhi.x+1,0,0) > 0) + ? f1(vhi.x,0,0,n) : T(0.0); + + T delta = dhx*(bX(i,0,0,n)*cf0 + bX(i+1,0,0,n)*cf1); + + T gamma = alpha*a(i,0,0) + + dhx*( bX(i,0,0,n) + bX(i+1,0,0,n) ); + + phi(i,0,0,n) += T(2.0/3.0) * (rhs(i,0,0,n) - Ax(i,0,0,n)) / (gamma - delta); + } +} + +template +AMREX_FORCE_INLINE void abec_gsrb_with_line_solve ( Box const& /*box*/, Array4 const& /*phi*/, Array4 const& /*rhs*/, T /*alpha*/, Array4 const& /*a*/, diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLap_2D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLABecLap_2D_K.H index 9184a755151..2beecd9d422 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLABecLap_2D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLap_2D_K.H @@ -230,6 +230,81 @@ void abec_gsrb_os (int i, int j, int, int n, Array4 const& phi, Array4 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void abec_jacobi (int i, int j, int, int n, Array4 const& phi, + Array4 const& rhs, Array4< T const> const& Ax, + T alpha, Array4 const& a, + T dhx, T dhy, + Array4 const& bX, Array4 const& bY, + Array4 const& m0, Array4 const& m2, + Array4 const& m1, Array4 const& m3, + Array4 const& f0, Array4 const& f2, + Array4 const& f1, Array4 const& f3, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T cf0 = (i == vlo.x && m0(vlo.x-1,j,0) > 0) + ? f0(vlo.x,j,0,n) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,0) > 0) + ? f1(i,vlo.y,0,n) : T(0.0); + T cf2 = (i == vhi.x && m2(vhi.x+1,j,0) > 0) + ? f2(vhi.x,j,0,n) : T(0.0); + T cf3 = (j == vhi.y && m3(i,vhi.y+1,0) > 0) + ? f3(i,vhi.y,0,n) : T(0.0); + + T delta = dhx*(bX(i,j,0,n)*cf0 + bX(i+1,j,0,n)*cf2) + + dhy*(bY(i,j,0,n)*cf1 + bY(i,j+1,0,n)*cf3); + + T gamma = alpha*a(i,j,0) + + dhx*( bX(i,j,0,n) + bX(i+1,j,0,n) ) + + dhy*( bY(i,j,0,n) + bY(i,j+1,0,n) ); + + phi(i,j,0,n) += T(2.0/3.0) * (rhs(i,j,0,n) - Ax(i,j,0,n)) / (gamma - delta); +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void abec_jacobi_os (int i, int j, int, int n, Array4 const& phi, + Array4 const& rhs, Array4 const& Ax, + T alpha, Array4 const& a, + T dhx, T dhy, + Array4 const& bX, Array4 const& bY, + Array4 const& m0, Array4 const& m2, + Array4 const& m1, Array4 const& m3, + Array4 const& f0, Array4 const& f2, + Array4 const& f1, Array4 const& f3, + Array4 const& osm, + Box const& vbox) noexcept +{ + if (osm(i,j,0) == 0) { + phi(i,j,0,n) = T(0.0); + } else { + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T cf0 = (i == vlo.x && m0(vlo.x-1,j,0) > 0) + ? f0(vlo.x,j,0,n) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,0) > 0) + ? f1(i,vlo.y,0,n) : T(0.0); + T cf2 = (i == vhi.x && m2(vhi.x+1,j,0) > 0) + ? f2(vhi.x,j,0,n) : T(0.0); + T cf3 = (j == vhi.y && m3(i,vhi.y+1,0) > 0) + ? 
f3(i,vhi.y,0,n) : T(0.0); + + T delta = dhx*(bX(i,j,0,n)*cf0 + bX(i+1,j,0,n)*cf2) + + dhy*(bY(i,j,0,n)*cf1 + bY(i,j+1,0,n)*cf3); + + T gamma = alpha*a(i,j,0) + + dhx*( bX(i,j,0,n) + bX(i+1,j,0,n) ) + + dhy*( bY(i,j,0,n) + bY(i,j+1,0,n) ); + + phi(i,j,0,n) += T(2.0/3.0) * (rhs(i,j,0,n) - Ax(i,j,0,n)) / (gamma - delta); + } +} + +template +AMREX_FORCE_INLINE void abec_gsrb_with_line_solve ( Box const& box, Array4 const& phi, Array4 const& rhs, T alpha, Array4 const& a, diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLap_3D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLABecLap_3D_K.H index 7d6cca59b49..bb5172396cf 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLABecLap_3D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLap_3D_K.H @@ -329,6 +329,106 @@ void abec_gsrb_os (int i, int j, int k, int n, template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void abec_jacobi (int i, int j, int k, int n, Array4 const& phi, + Array4 const& rhs, Array4 const& Ax, + T alpha, Array4 const& a, + T dhx, T dhy, T dhz, + Array4 const& bX, Array4 const& bY, + Array4 const& bZ, + Array4 const& m0, Array4 const& m2, + Array4 const& m4, + Array4 const& m1, Array4 const& m3, + Array4 const& m5, + Array4 const& f0, Array4 const& f2, + Array4 const& f4, + Array4 const& f1, Array4 const& f3, + Array4 const& f5, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T cf0 = (i == vlo.x && m0(vlo.x-1,j,k) > 0) + ? f0(vlo.x,j,k,n) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,k) > 0) + ? f1(i,vlo.y,k,n) : T(0.0); + T cf2 = (k == vlo.z && m2(i,j,vlo.z-1) > 0) + ? f2(i,j,vlo.z,n) : T(0.0); + T cf3 = (i == vhi.x && m3(vhi.x+1,j,k) > 0) + ? f3(vhi.x,j,k,n) : T(0.0); + T cf4 = (j == vhi.y && m4(i,vhi.y+1,k) > 0) + ? f4(i,vhi.y,k,n) : T(0.0); + T cf5 = (k == vhi.z && m5(i,j,vhi.z+1) > 0) + ? f5(i,j,vhi.z,n) : T(0.0); + + T gamma = alpha*a(i,j,k) + + dhx*(bX(i,j,k,n)+bX(i+1,j,k,n)) + + dhy*(bY(i,j,k,n)+bY(i,j+1,k,n)) + + dhz*(bZ(i,j,k,n)+bZ(i,j,k+1,n)); + + T g_m_d = gamma + - (dhx*(bX(i,j,k,n)*cf0 + bX(i+1,j,k,n)*cf3) + + dhy*(bY(i,j,k,n)*cf1 + bY(i,j+1,k,n)*cf4) + + dhz*(bZ(i,j,k,n)*cf2 + bZ(i,j,k+1,n)*cf5)); + + phi(i,j,k,n) += T(2.0/3.0) * (rhs(i,j,k,n) - Ax(i,j,k,n)) / g_m_d; +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void abec_jacobi_os (int i, int j, int k, int n, + Array4 const& phi, Array4 const& rhs, + Array4 const& Ax, + T alpha, Array4 const& a, + T dhx, T dhy, T dhz, + Array4 const& bX, Array4 const& bY, + Array4 const& bZ, + Array4 const& m0, Array4 const& m2, + Array4 const& m4, + Array4 const& m1, Array4 const& m3, + Array4 const& m5, + Array4 const& f0, Array4 const& f2, + Array4 const& f4, + Array4 const& f1, Array4 const& f3, + Array4 const& f5, + Array4 const& osm, + Box const& vbox) noexcept +{ + if (osm(i,j,k) == 0) { + phi(i,j,k,n) = T(0.0); + } else { + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T cf0 = (i == vlo.x && m0(vlo.x-1,j,k) > 0) + ? f0(vlo.x,j,k,n) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,k) > 0) + ? f1(i,vlo.y,k,n) : T(0.0); + T cf2 = (k == vlo.z && m2(i,j,vlo.z-1) > 0) + ? f2(i,j,vlo.z,n) : T(0.0); + T cf3 = (i == vhi.x && m3(vhi.x+1,j,k) > 0) + ? f3(vhi.x,j,k,n) : T(0.0); + T cf4 = (j == vhi.y && m4(i,vhi.y+1,k) > 0) + ? f4(i,vhi.y,k,n) : T(0.0); + T cf5 = (k == vhi.z && m5(i,j,vhi.z+1) > 0) + ? 
f5(i,j,vhi.z,n) : T(0.0); + + T gamma = alpha*a(i,j,k) + + dhx*(bX(i,j,k,n)+bX(i+1,j,k,n)) + + dhy*(bY(i,j,k,n)+bY(i,j+1,k,n)) + + dhz*(bZ(i,j,k,n)+bZ(i,j,k+1,n)); + + T g_m_d = gamma + - (dhx*(bX(i,j,k,n)*cf0 + bX(i+1,j,k,n)*cf3) + + dhy*(bY(i,j,k,n)*cf1 + bY(i,j+1,k,n)*cf4) + + dhz*(bZ(i,j,k,n)*cf2 + bZ(i,j,k+1,n)*cf5)); + + phi(i,j,k,n) += T(2.0/3.0) * (rhs(i,j,k,n) - Ax(i,j,k,n)) / g_m_d; + } +} + +template +AMREX_FORCE_INLINE void tridiagonal_solve (Array1D& a_ls, Array1D& b_ls, Array1D& c_ls, Array1D& r_ls, Array1D& u_ls, Array1D& gam, int ilen ) noexcept @@ -348,7 +448,7 @@ void tridiagonal_solve (Array1D& a_ls, Array1D& b_ls, Array1D -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +AMREX_FORCE_INLINE void abec_gsrb_with_line_solve ( Box const& box, Array4 const& phi, Array4 const& rhs, T alpha, Array4 const& a, diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H index 0f90a6ead17..9b56b8049df 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H @@ -864,6 +864,12 @@ MLABecLaplacianT::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, in regular_coarsening = this->mg_coarsen_ratio_vec[mglev-1] == this->mg_coarsen_ratio; } + MF Ax; + if (! this->m_use_gauss_seidel && regular_coarsening) { // jacobi + Ax.define(sol.boxArray(), sol.DistributionMap(), sol.nComp(), 0); + Fapply(amrlev, mglev, Ax, sol); + } + const MF& acoef = m_a_coeffs[amrlev][mglev]; AMREX_ALWAYS_ASSERT(acoef.nGrowVect() == 0); AMREX_D_TERM(const MF& bxcoef = m_b_coeffs[amrlev][mglev][0];, @@ -939,40 +945,76 @@ MLABecLaplacianT::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, in if (this->m_overset_mask[amrlev][mglev]) { const auto& osmma = this->m_overset_mask[amrlev][mglev]->const_arrays(); - ParallelFor(sol, IntVect(0), nc, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) noexcept - { - Box vbx(ama[box_no]); - abec_gsrb_os(i,j,k,n, solnma[box_no], rhsma[box_no], alpha, ama[box_no], - AMREX_D_DECL(dhx, dhy, dhz), - AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]), - AMREX_D_DECL(m0ma[box_no],m2ma[box_no],m4ma[box_no]), - AMREX_D_DECL(m1ma[box_no],m3ma[box_no],m5ma[box_no]), - AMREX_D_DECL(f0ma[box_no],f2ma[box_no],f4ma[box_no]), - AMREX_D_DECL(f1ma[box_no],f3ma[box_no],f5ma[box_no]), - osmma[box_no], vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + ParallelFor(sol, IntVect(0), nc, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) noexcept + { + Box vbx(ama[box_no]); + abec_gsrb_os(i,j,k,n, solnma[box_no], rhsma[box_no], alpha, ama[box_no], + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]), + AMREX_D_DECL(m0ma[box_no],m2ma[box_no],m4ma[box_no]), + AMREX_D_DECL(m1ma[box_no],m3ma[box_no],m5ma[box_no]), + AMREX_D_DECL(f0ma[box_no],f2ma[box_no],f4ma[box_no]), + AMREX_D_DECL(f1ma[box_no],f3ma[box_no],f5ma[box_no]), + osmma[box_no], vbx, redblack); + }); + } else { + const auto& axma = Ax.const_arrays(); + ParallelFor(sol, IntVect(0), nc, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) noexcept + { + Box vbx(ama[box_no]); + abec_jacobi_os(i,j,k,n, solnma[box_no], rhsma[box_no], axma[box_no], + alpha, ama[box_no], + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]), + AMREX_D_DECL(m0ma[box_no],m2ma[box_no],m4ma[box_no]), + AMREX_D_DECL(m1ma[box_no],m3ma[box_no],m5ma[box_no]), + AMREX_D_DECL(f0ma[box_no],f2ma[box_no],f4ma[box_no]), + 
AMREX_D_DECL(f1ma[box_no],f3ma[box_no],f5ma[box_no]), + osmma[box_no], vbx); + }); + } } else if (regular_coarsening) { - ParallelFor(sol, IntVect(0), nc, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) noexcept - { - Box vbx(ama[box_no]); - abec_gsrb(i,j,k,n, solnma[box_no], rhsma[box_no], alpha, ama[box_no], - AMREX_D_DECL(dhx, dhy, dhz), - AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]), - AMREX_D_DECL(m0ma[box_no],m2ma[box_no],m4ma[box_no]), - AMREX_D_DECL(m1ma[box_no],m3ma[box_no],m5ma[box_no]), - AMREX_D_DECL(f0ma[box_no],f2ma[box_no],f4ma[box_no]), - AMREX_D_DECL(f1ma[box_no],f3ma[box_no],f5ma[box_no]), - vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + ParallelFor(sol, IntVect(0), nc, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) noexcept + { + Box vbx(ama[box_no]); + abec_gsrb(i,j,k,n, solnma[box_no], rhsma[box_no], alpha, ama[box_no], + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]), + AMREX_D_DECL(m0ma[box_no],m2ma[box_no],m4ma[box_no]), + AMREX_D_DECL(m1ma[box_no],m3ma[box_no],m5ma[box_no]), + AMREX_D_DECL(f0ma[box_no],f2ma[box_no],f4ma[box_no]), + AMREX_D_DECL(f1ma[box_no],f3ma[box_no],f5ma[box_no]), + vbx, redblack); + }); + } else { + const auto& axma = Ax.const_arrays(); + ParallelFor(sol, IntVect(0), nc, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) noexcept + { + Box vbx(ama[box_no]); + abec_jacobi(i,j,k,n, solnma[box_no], rhsma[box_no], axma[box_no], + alpha, ama[box_no], + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]), + AMREX_D_DECL(m0ma[box_no],m2ma[box_no],m4ma[box_no]), + AMREX_D_DECL(m1ma[box_no],m3ma[box_no],m5ma[box_no]), + AMREX_D_DECL(f0ma[box_no],f2ma[box_no],f4ma[box_no]), + AMREX_D_DECL(f1ma[box_no],f3ma[box_no],f5ma[box_no]), + vbx); + }); + } } Gpu::streamSynchronize(); } else #endif { MFItInfo mfi_info; - if (Gpu::notInLaunchRegion()) { mfi_info.EnableTiling().SetDynamic(true); } + mfi_info.EnableTiling().SetDynamic(true); #ifdef AMREX_USE_OMP #pragma omp parallel if (Gpu::notInLaunchRegion()) @@ -1013,43 +1055,71 @@ MLABecLaplacianT::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, in if (this->m_overset_mask[amrlev][mglev]) { const auto& osm = this->m_overset_mask[amrlev][mglev]->const_array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D(tbx, nc, i, j, k, n, - { - abec_gsrb_os(i,j,k,n, solnfab, rhsfab, alpha, afab, - AMREX_D_DECL(dhx, dhy, dhz), - AMREX_D_DECL(bxfab, byfab, bzfab), - AMREX_D_DECL(m0,m2,m4), - AMREX_D_DECL(m1,m3,m5), - AMREX_D_DECL(f0fab,f2fab,f4fab), - AMREX_D_DECL(f1fab,f3fab,f5fab), - osm, vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_LOOP_4D(tbx, nc, i, j, k, n, + { + abec_gsrb_os(i,j,k,n, solnfab, rhsfab, alpha, afab, + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxfab, byfab, bzfab), + AMREX_D_DECL(m0,m2,m4), + AMREX_D_DECL(m1,m3,m5), + AMREX_D_DECL(f0fab,f2fab,f4fab), + AMREX_D_DECL(f1fab,f3fab,f5fab), + osm, vbx, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_LOOP_4D(tbx, nc, i, j, k, n, + { + abec_jacobi_os(i,j,k,n, solnfab, rhsfab, axfab, + alpha, afab, + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxfab, byfab, bzfab), + AMREX_D_DECL(m0,m2,m4), + AMREX_D_DECL(m1,m3,m5), + AMREX_D_DECL(f0fab,f2fab,f4fab), + AMREX_D_DECL(f1fab,f3fab,f5fab), + osm, vbx); + }); + } } else if (regular_coarsening) { - AMREX_HOST_DEVICE_PARALLEL_FOR_4D(tbx, nc, i, j, k, n, - { - abec_gsrb(i,j,k,n, solnfab, rhsfab, alpha, afab, - AMREX_D_DECL(dhx, dhy, dhz), - 
AMREX_D_DECL(bxfab, byfab, bzfab), - AMREX_D_DECL(m0,m2,m4), - AMREX_D_DECL(m1,m3,m5), - AMREX_D_DECL(f0fab,f2fab,f4fab), - AMREX_D_DECL(f1fab,f3fab,f5fab), - vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_LOOP_4D(tbx, nc, i, j, k, n, + { + abec_gsrb(i,j,k,n, solnfab, rhsfab, alpha, afab, + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxfab, byfab, bzfab), + AMREX_D_DECL(m0,m2,m4), + AMREX_D_DECL(m1,m3,m5), + AMREX_D_DECL(f0fab,f2fab,f4fab), + AMREX_D_DECL(f1fab,f3fab,f5fab), + vbx, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_LOOP_4D(tbx, nc, i, j, k, n, + { + abec_jacobi(i,j,k,n, solnfab, rhsfab, axfab, + alpha, afab, + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxfab, byfab, bzfab), + AMREX_D_DECL(m0,m2,m4), + AMREX_D_DECL(m1,m3,m5), + AMREX_D_DECL(f0fab,f2fab,f4fab), + AMREX_D_DECL(f1fab,f3fab,f5fab), + vbx); + }); + } } else { - Gpu::LaunchSafeGuard lsg(false); // xxxxx gpu todo // line solve does not with with GPU - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, thread_box, - { - abec_gsrb_with_line_solve(thread_box, solnfab, rhsfab, alpha, afab, - AMREX_D_DECL(dhx, dhy, dhz), - AMREX_D_DECL(bxfab, byfab, bzfab), - AMREX_D_DECL(m0,m2,m4), - AMREX_D_DECL(m1,m3,m5), - AMREX_D_DECL(f0fab,f2fab,f4fab), - AMREX_D_DECL(f1fab,f3fab,f5fab), - vbx, redblack, nc); - }); + abec_gsrb_with_line_solve(tbx, solnfab, rhsfab, alpha, afab, + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxfab, byfab, bzfab), + AMREX_D_DECL(m0,m2,m4), + AMREX_D_DECL(m1,m3,m5), + AMREX_D_DECL(f0fab,f2fab,f4fab), + AMREX_D_DECL(f1fab,f3fab,f5fab), + vbx, redblack, nc); } } } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H index db8a10f2080..b613a4f3a83 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H @@ -396,6 +396,7 @@ MLCGSolverT::solve_cg (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) if ( !initial_vec_zeroed ) { LocalAdd(sol, sorig, 0, 0, ncomp, nghost); } + if (ret == 8) { ret = 9; } } else { diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellABecLap.H b/Src/LinearSolvers/MLMG/AMReX_MLCellABecLap.H index 02f7adeb115..970cf48fc20 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCellABecLap.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLCellABecLap.H @@ -562,8 +562,8 @@ MLCellABecLapT::addInhomogNeumannFlux ( if (! domain.contains(ccb)) { for (int icomp = 0; icomp < ncomp; ++icomp) { auto const& phi = sol.const_array(mfi,icomp); - auto const bv = bndry.bndryValues(ori).multiFab().const_array(mfi,icomp); - auto const bc = bcoef[idim] ? bcoef[idim]->const_array(mfi,icomp) + auto const& bv = bndry.bndryValues(ori).multiFab().const_array(mfi,icomp); + auto const& bc = bcoef[idim] ? 
bcoef[idim]->const_array(mfi,icomp) : Array4{}; auto const& f = grad[idim]->array(mfi,icomp); if (ori.isLow()) { diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H index 536d4c82b04..e04e16f8bd6 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H @@ -55,6 +55,8 @@ public: } void update () override; + void setGaussSeidel (bool flag) noexcept { m_use_gauss_seidel = flag; } + virtual bool isCrossStencil () const { return true; } virtual bool isTensorOp () const { return false; } @@ -86,6 +88,8 @@ public: void solutionResidual (int amrlev, MF& resid, MF& x, const MF& b, const MF* crse_bcdata=nullptr) override; + void prepareForFluxes (int amrlev, const MF* crse_bcdata = nullptr) override; + void correctionResidual (int amrlev, int mglev, MF& resid, MF& x, const MF& b, BCMode bc_mode, const MF* crse_bcdata=nullptr) final; @@ -133,6 +137,10 @@ public: Vector > m_robin_bcval; +#ifdef AMREX_USE_HYPRE + void setInterpBndryHalfWidth (int w) { m_interpbndry_halfwidth = w; } +#endif + protected: bool m_has_metric_term = false; @@ -195,6 +203,8 @@ protected: mutable Vector> m_fluxreg; + bool m_use_gauss_seidel = true; // use red-black Gauss-Seidel by default + private: void defineAuxData (); @@ -202,6 +212,8 @@ private: void computeVolInv () const; mutable Vector > m_volinv; // used by solvability fix + + int m_interpbndry_halfwidth = 2; }; template @@ -472,7 +484,9 @@ MLCellLinOpT::defineBC () bc_data.setVal(0.0); m_bndry_cor[amrlev]->setBndryValues(*m_crse_cor_br[amrlev], 0, bc_data, 0, 0, ncomp, - IntVect(this->m_amr_ref_ratio[amrlev-1])); + IntVect(this->m_amr_ref_ratio[amrlev-1]), + InterpBndryDataT::IBD_max_order_DEF, + m_interpbndry_halfwidth); Vector > bclohi (ncomp,Array{{AMREX_D_DECL(BCType::Dirichlet, @@ -544,7 +558,9 @@ MLCellLinOpT::setLevelBC (int amrlev, const MF* a_levelbcdata, const MF* rob m_crse_sol_br[amrlev]->setVal(RT(0.0)); } m_bndry_sol[amrlev]->setBndryValues(*m_crse_sol_br[amrlev], 0, - bcdata, 0, 0, ncomp, br_ref_ratio); + bcdata, 0, 0, ncomp, br_ref_ratio, + InterpBndryDataT::IBD_max_order_DEF, + m_interpbndry_halfwidth); br_ref_ratio = this->m_coarse_data_crse_ratio; } else @@ -639,7 +655,9 @@ MLCellLinOpT::updateSolBC (int amrlev, const MF& crse_bcdata) const m_crse_sol_br[amrlev]->copyFrom(crse_bcdata, 0, 0, 0, ncomp, this->m_geom[amrlev-1][0].periodicity()); m_bndry_sol[amrlev]->updateBndryValues(*m_crse_sol_br[amrlev], 0, 0, ncomp, - IntVect(this->m_amr_ref_ratio[amrlev-1])); + IntVect(this->m_amr_ref_ratio[amrlev-1]), + InterpBndryDataT::IBD_max_order_DEF, + m_interpbndry_halfwidth); } template @@ -652,7 +670,9 @@ MLCellLinOpT::updateCorBC (int amrlev, const MF& crse_bcdata) const m_crse_cor_br[amrlev]->copyFrom(crse_bcdata, 0, 0, 0, ncomp, this->m_geom[amrlev-1][0].periodicity()); m_bndry_cor[amrlev]->updateBndryValues(*m_crse_cor_br[amrlev], 0, 0, ncomp, - IntVect(this->m_amr_ref_ratio[amrlev-1])); + IntVect(this->m_amr_ref_ratio[amrlev-1]), + InterpBndryDataT::IBD_max_order_DEF, + m_interpbndry_halfwidth); } template @@ -1210,6 +1230,15 @@ MLCellLinOpT::solutionResidual (int amrlev, MF& resid, MF& x, const MF& b, MF::Xpay(resid, RT(-1.0), b, 0, 0, ncomp, IntVect(0)); } +template +void +MLCellLinOpT::prepareForFluxes (int amrlev, const MF* crse_bcdata) +{ + if (crse_bcdata != nullptr) { + updateSolBC(amrlev, *crse_bcdata); + } +} + template void MLCellLinOpT::correctionResidual (int amrlev, int mglev, MF& resid, MF& x, const MF& b, diff --git 
a/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.H b/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.H index 8d461d3bb04..ce8859eae11 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.H @@ -58,6 +58,8 @@ public: return std::string("curl of curl"); } + bool setUsePCG (bool flag) { return std::exchange(m_use_pcg, flag); } + void setLevelBC (int amrlev, const MF* levelbcdata, const MF* robinbc_a = nullptr, const MF* robinbc_b = nullptr, @@ -137,6 +139,7 @@ private: Vector>>>> m_lusolver; Vector,3>>> m_bcoefs; + bool m_use_pcg = false; }; } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.cpp b/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.cpp index 87853eb37bb..5e797e9a5b3 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.cpp @@ -353,22 +353,36 @@ void MLCurlCurl::smooth4 (int amrlev, int mglev, MF& sol, MF const& rhs, auto* plusolver = m_lusolver[amrlev][mglev]->dataPtr(); ParallelFor(nmf, [=] AMREX_GPU_DEVICE (int bno, int i, int j, int k) { - mlcurlcurl_gs4(i,j,k,ex[bno],ey[bno],ez[bno],rhsx[bno],rhsy[bno],rhsz[bno], + mlcurlcurl_gs4_lu(i,j,k,ex[bno],ey[bno],ez[bno], + rhsx[bno],rhsy[bno],rhsz[bno], #if (AMREX_SPACEDIM == 2) - b, + b, #endif - adxinv,color,*plusolver,dinfo,sinfo); + adxinv,color,*plusolver,dinfo,sinfo); }); } else { auto const& bcx = m_bcoefs[amrlev][mglev][0]->const_arrays(); auto const& bcy = m_bcoefs[amrlev][mglev][1]->const_arrays(); auto const& bcz = m_bcoefs[amrlev][mglev][2]->const_arrays(); - ParallelFor(nmf, [=] AMREX_GPU_DEVICE (int bno, int i, int j, int k) - { + if (m_use_pcg) { + ParallelFor(nmf, [=] AMREX_GPU_DEVICE (int bno, int i, int j, int k) + { - mlcurlcurl_gs4(i,j,k,ex[bno],ey[bno],ez[bno],rhsx[bno],rhsy[bno],rhsz[bno], - adxinv,color,bcx[bno],bcy[bno],bcz[bno],dinfo,sinfo); - }); + mlcurlcurl_gs4(i,j,k,ex[bno],ey[bno],ez[bno], + rhsx[bno],rhsy[bno],rhsz[bno], + adxinv,color,bcx[bno],bcy[bno],bcz[bno], + dinfo,sinfo); + }); + } else { + ParallelFor(nmf, [=] AMREX_GPU_DEVICE (int bno, int i, int j, int k) + { + + mlcurlcurl_gs4(i,j,k,ex[bno],ey[bno],ez[bno], + rhsx[bno],rhsy[bno],rhsz[bno], + adxinv,color,bcx[bno],bcy[bno],bcz[bno], + dinfo,sinfo); + }); + } } Gpu::streamSynchronize(); } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl_K.H b/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl_K.H index 0c1118f7dd3..e243b245f51 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl_K.H @@ -4,6 +4,7 @@ #include #include +#include namespace amrex { @@ -427,20 +428,20 @@ void mlcurlcurl_adotx_z (int i, int j, int k, Array4 const& Az, } AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void mlcurlcurl_gs4 (int i, int j, int k, - Array4 const& ex, - Array4 const& ey, - Array4 const& ez, - Array4 const& rhsx, - Array4 const& rhsy, - Array4 const& rhsz, +void mlcurlcurl_gs4_lu (int i, int j, int k, + Array4 const& ex, + Array4 const& ey, + Array4 const& ez, + Array4 const& rhsx, + Array4 const& rhsy, + Array4 const& rhsz, #if (AMREX_SPACEDIM == 2) - Real beta, + Real beta, #endif - GpuArray const& adxinv, - int color, LUSolver const& lusolver, - CurlCurlDirichletInfo const& dinfo, - CurlCurlSymmetryInfo const& sinfo) + GpuArray const& adxinv, + int color, LUSolver const& lusolver, + CurlCurlDirichletInfo const& dinfo, + CurlCurlSymmetryInfo const& sinfo) { if (dinfo.is_dirichlet_node(i,j,k)) { return; } @@ -598,6 +599,7 @@ void mlcurlcurl_gs4 (int i, int j, int k, #endif } +template AMREX_GPU_DEVICE AMREX_FORCE_INLINE void mlcurlcurl_gs4 (int i, int 
j, int k, Array4 const& ex, @@ -661,55 +663,81 @@ void mlcurlcurl_gs4 (int i, int j, int k, + dxy * (-ex(i-1,j+1,k ) +ex(i ,j+1,k )))}; - GpuArray x; + GpuArray beta; if (sinfo.xlo_is_symmetric(i)) { b[0] = -b[1]; - x[0] = x[1] = betax(i,j,k); + beta[0] = beta[1] = betax(i,j,k); } else if (sinfo.xhi_is_symmetric(i)) { b[1] = -b[0]; - x[0] = x[1] = betax(i-1,j,k); + beta[0] = beta[1] = betax(i-1,j,k); } else { - x[0] = betax(i-1,j,k); - x[1] = betax(i ,j,k); + beta[0] = betax(i-1,j,k); + beta[1] = betax(i ,j,k); } if (sinfo.ylo_is_symmetric(j)) { b[2] = -b[3]; - x[2] = x[3] = betay(i,j,k); + beta[2] = beta[3] = betay(i,j,k); } else if (sinfo.yhi_is_symmetric(j)) { b[3] = -b[2]; - x[2] = x[3] = betay(i,j-1,k); + beta[2] = beta[3] = betay(i,j-1,k); } else { - x[2] = betay(i,j-1,k); - x[3] = betay(i,j ,k); + beta[2] = betay(i,j-1,k); + beta[3] = betay(i,j ,k); } - LUSolver<4,Real> lusolver - ({dyy*Real(2.0) + x[0], - Real(0.0), - -dxy, - dxy, - // - Real(0.0), - dyy*Real(2.0) + x[1], - dxy, - -dxy, - // - -dxy, - dxy, - dxx*Real(2.0) + x[2], - Real(0.0), - // - dxy, - -dxy, - Real(0.0), - dxx*Real(2.0) + x[3]}); - lusolver(x.data(), b.data()); - ex(i-1,j ,k ) = x[0]; - ex(i ,j ,k ) = x[1]; - ey(i ,j-1,k ) = x[2]; - ey(i ,j ,k ) = x[3]; + if constexpr (PCG) { + Real diagInv[4] = {Real(1.0) / (dyy*Real(2.0) + beta[0]), + Real(1.0) / (dyy*Real(2.0) + beta[1]), + Real(1.0) / (dxx*Real(2.0) + beta[2]), + Real(1.0) / (dxx*Real(2.0) + beta[3])}; + auto precond = [&] (Real * AMREX_RESTRICT z, + Real const* AMREX_RESTRICT r) + { + for (int m = 0; m < 4; ++m) { z[m] = r[m] * diagInv[m]; } + }; + auto mat = [&] (Real * AMREX_RESTRICT Av, + Real const* AMREX_RESTRICT v) + { + Av[0] = (dyy*Real(2.0) + beta[0]) * v[0] - dxy * v[2] + dxy * v[3]; + Av[1] = (dyy*Real(2.0) + beta[1]) * v[1] + dxy * v[2] - dxy * v[3]; + Av[2] = -dxy * v[0] + dxy * v[1] + (dxx*Real(2.0) + beta[2]) * v[2]; + Av[3] = dxy * v[0] - dxy * v[1] + (dxx*Real(2.0) + beta[3]) * v[3]; + }; + Real sol[4] = {0, 0, 0, 0}; + pcg_solve<4>(sol, b.data(), mat, precond, 8, Real(1.e-8)); + ex(i-1,j ,k ) = sol[0]; + ex(i ,j ,k ) = sol[1]; + ey(i ,j-1,k ) = sol[2]; + ey(i ,j ,k ) = sol[3]; + } else { + LUSolver<4,Real> lusolver + ({dyy*Real(2.0) + beta[0], + Real(0.0), + -dxy, + dxy, + // + Real(0.0), + dyy*Real(2.0) + beta[1], + dxy, + -dxy, + // + -dxy, + dxy, + dxx*Real(2.0) + beta[2], + Real(0.0), + // + dxy, + -dxy, + Real(0.0), + dxx*Real(2.0) + beta[3]}); + lusolver(beta.data(), b.data()); + ex(i-1,j ,k ) = beta[0]; + ex(i ,j ,k ) = beta[1]; + ey(i ,j-1,k ) = beta[2]; + ey(i ,j ,k ) = beta[3]; + } #else @@ -772,90 +800,128 @@ void mlcurlcurl_gs4 (int i, int j, int k, + dyz * (-ey(i ,j-1,k+1) +ey(i ,j ,k+1)))}; - GpuArray x; + GpuArray beta; if (sinfo.xlo_is_symmetric(i)) { b[0] = -b[1]; - x[0] = x[1] = betax(i,j,k); + beta[0] = beta[1] = betax(i,j,k); } else if (sinfo.xhi_is_symmetric(i)) { b[1] = -b[0]; - x[0] = x[1] = betax(i-1,j,k); + beta[0] = beta[1] = betax(i-1,j,k); } else { - x[0] = betax(i-1,j,k); - x[1] = betax(i ,j,k); + beta[0] = betax(i-1,j,k); + beta[1] = betax(i ,j,k); } if (sinfo.ylo_is_symmetric(j)) { b[2] = -b[3]; - x[2] = x[3] = betay(i,j,k); + beta[2] = beta[3] = betay(i,j,k); } else if (sinfo.yhi_is_symmetric(j)) { b[3] = -b[2]; - x[2] = x[3] = betay(i,j-1,k); + beta[2] = beta[3] = betay(i,j-1,k); } else { - x[2] = betay(i,j-1,k); - x[3] = betay(i,j ,k); + beta[2] = betay(i,j-1,k); + beta[3] = betay(i,j ,k); } if (sinfo.zlo_is_symmetric(k)) { b[4] = -b[5]; - x[4] = x[5] = betaz(i,j,k); + beta[4] = beta[5] = betaz(i,j,k); } 
else if (sinfo.zhi_is_symmetric(k)) { b[5] = -b[4]; - x[4] = x[5] = betaz(i,j,k-1); + beta[4] = beta[5] = betaz(i,j,k-1); } else { - x[4] = betaz(i,j,k-1); - x[5] = betaz(i,j,k ); + beta[4] = betaz(i,j,k-1); + beta[5] = betaz(i,j,k ); } - LUSolver<6,Real> lusolver - ({(dyy+dzz)*Real(2.0) + x[0], - Real(0.0), - -dxy, - dxy, - -dxz, - dxz, - // - Real(0.0), - (dyy+dzz)*Real(2.0) + x[1], - dxy, - -dxy, - dxz, - -dxz, - // - -dxy, - dxy, - (dxx+dzz)*Real(2.0) + x[2], - Real(0.0), - -dyz, - dyz, - // - dxy, - -dxy, - Real(0.0), - (dxx+dzz)*Real(2.0) + x[3], - dyz, - -dyz, - // - -dxz, - dxz, - -dyz, - dyz, - (dxx+dyy)*Real(2.0) + x[4], - Real(0.0), - // - dxz, - -dxz, - dyz, - -dyz, - Real(0.0), - (dxx+dyy)*Real(2.0) + x[5]}); - lusolver(x.data(), b.data()); - ex(i-1,j ,k ) = x[0]; - ex(i ,j ,k ) = x[1]; - ey(i ,j-1,k ) = x[2]; - ey(i ,j ,k ) = x[3]; - ez(i ,j ,k-1) = x[4]; - ez(i ,j ,k ) = x[5]; + if constexpr (PCG) { + Real diagInv[6] = {Real(1.0) / ((dyy+dzz)*Real(2.0) + beta[0]), + Real(1.0) / ((dyy+dzz)*Real(2.0) + beta[1]), + Real(1.0) / ((dxx+dzz)*Real(2.0) + beta[2]), + Real(1.0) / ((dxx+dzz)*Real(2.0) + beta[3]), + Real(1.0) / ((dxx+dyy)*Real(2.0) + beta[4]), + Real(1.0) / ((dxx+dyy)*Real(2.0) + beta[5])}; + auto precond = [&] (Real * AMREX_RESTRICT z, + Real const* AMREX_RESTRICT r) + { + for (int m = 0; m < 6; ++m) { z[m] = r[m] * diagInv[m]; } + }; + auto mat = [&] (Real * AMREX_RESTRICT Av, + Real const* AMREX_RESTRICT v) + { + Av[0] = ((dyy+dzz)*Real(2.0) + beta[0]) * v[0] - dxy * v[2] + + dxy * v[3] - dxz * v[4] + dxz * v[5]; + Av[1] = ((dyy+dzz)*Real(2.0) + beta[1]) * v[1] + dxy * v[2] + - dxy * v[3] + dxz * v[4] - dxz * v[5]; + Av[2] = -dxy * v[0] + dxy * v[1] + ((dxx+dzz)*Real(2.0) + beta[2]) * v[2] + - dyz * v[4] + dyz * v[5]; + Av[3] = dxy * v[0] - dxy * v[1] + ((dxx+dzz)*Real(2.0) + beta[3]) * v[3] + + dyz * v[4] - dyz * v[5]; + Av[4] = -dxz * v[0] + dxz * v[1] - dyz * v[2] + dyz * v[3] + + ((dxx+dyy)*Real(2.0) + beta[4]) * v[4]; + Av[5] = dxz * v[0] - dxz * v[1] + dyz * v[2] - dyz * v[3] + + ((dxx+dyy)*Real(2.0) + beta[5]) * v[5]; + }; + Real sol[6] = {0, 0, 0, 0, 0, 0}; + pcg_solve<6>(sol, b.data(), mat, precond, 8, Real(1.e-8)); + ex(i-1,j ,k ) = sol[0]; + ex(i ,j ,k ) = sol[1]; + ey(i ,j-1,k ) = sol[2]; + ey(i ,j ,k ) = sol[3]; + ez(i ,j ,k-1) = sol[4]; + ez(i ,j ,k ) = sol[5]; + } else { + LUSolver<6,Real> lusolver + ({(dyy+dzz)*Real(2.0) + beta[0], + Real(0.0), + -dxy, + dxy, + -dxz, + dxz, + // + Real(0.0), + (dyy+dzz)*Real(2.0) + beta[1], + dxy, + -dxy, + dxz, + -dxz, + // + -dxy, + dxy, + (dxx+dzz)*Real(2.0) + beta[2], + Real(0.0), + -dyz, + dyz, + // + dxy, + -dxy, + Real(0.0), + (dxx+dzz)*Real(2.0) + beta[3], + dyz, + -dyz, + // + -dxz, + dxz, + -dyz, + dyz, + (dxx+dyy)*Real(2.0) + beta[4], + Real(0.0), + // + dxz, + -dxz, + dyz, + -dyz, + Real(0.0), + (dxx+dyy)*Real(2.0) + beta[5]}); + lusolver(beta.data(), b.data()); + ex(i-1,j ,k ) = beta[0]; + ex(i ,j ,k ) = beta[1]; + ey(i ,j-1,k ) = beta[2]; + ey(i ,j ,k ) = beta[3]; + ez(i ,j ,k-1) = beta[4]; + ez(i ,j ,k ) = beta[5]; + } #endif } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLap_1D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLap_1D_K.H index b9e9984f8d1..8f8c9dabfc4 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLap_1D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLap_1D_K.H @@ -17,6 +17,21 @@ void mlebndfdlap_gsrb (int /*i*/, int /*j*/, int /*k*/, Array4 const& /*x* { } +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebndfdlap_sig_adotx (int /*i*/, int /*j*/, int /*k*/, Array4 const& 
/*y*/, + Array4 const& /*x*/, Array4 const& /*dmsk*/, + Array4 const& /* sig */, Real /*bx*/) noexcept +{ +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebndfdlap_sig_gsrb (int /*i*/, int /*j*/, int /*k*/, Array4 const& /*x*/, + Array4 const& /*rhs*/, Array4 const& /*dmsk*/, + Array4 const& /* sig */, + Real /*bx*/, int /*redblack*/) noexcept +{ +} + } #endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLap_2D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLap_2D_K.H index c23797f8f7b..57bf89bba29 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLap_2D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLap_2D_K.H @@ -422,6 +422,214 @@ void mlebndfdlap_gsrb_rz (int i, int j, int k, Array4 const& x, } } +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebndfdlap_sig_adotx (int i, int j, int k, Array4 const& y, + Array4 const& x, + Array4 const& dmsk, + Array4 const& sig, + Real bx, Real by) noexcept +{ + if (dmsk(i,j,k)) { + y(i,j,k) = Real(0.0); + } else { + Real sigxm = Real(0.5)*(sig(i-1,j-1,k)+sig(i-1,j ,k)); + Real sigxp = Real(0.5)*(sig(i ,j-1,k)+sig(i ,j ,k)); + Real sigym = Real(0.5)*(sig(i-1,j-1,k)+sig(i ,j-1,k)); + Real sigyp = Real(0.5)*(sig(i-1,j ,k)+sig(i ,j ,k)); + y(i,j,k) = bx * (sigxm*x(i-1,j,k) + sigxp*x(i+1,j,k)) + + by * (sigym*x(i,j-1,k) + sigyp*x(i,j+1,k)) + - (bx*(sigxm+sigxp) + by*(sigym+sigyp)) * x(i,j,k); + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebndfdlap_sig_gsrb (int i, int j, int k, Array4 const& x, + Array4 const& rhs, + Array4 const& dmsk, + Array4 const& sig, + Real bx, Real by, int redblack) noexcept +{ + if ((i+j+k+redblack)%2 == 0) { + if (dmsk(i,j,k)) { + x(i,j,k) = Real(0.); + } else { + Real sigxm = Real(0.5)*(sig(i-1,j-1,k)+sig(i-1,j ,k)); + Real sigxp = Real(0.5)*(sig(i ,j-1,k)+sig(i ,j ,k)); + Real sigym = Real(0.5)*(sig(i-1,j-1,k)+sig(i ,j-1,k)); + Real sigyp = Real(0.5)*(sig(i-1,j ,k)+sig(i ,j ,k)); + Real gamma = -(bx*(sigxm+sigxp) + by*(sigym+sigyp)); + Real Ax = bx * (sigxm*x(i-1,j,k) + sigxp*x(i+1,j,k)) + + by * (sigym*x(i,j-1,k) + sigyp*x(i,j+1,k)) + + gamma * x(i,j,k); + constexpr Real omega = Real(1.25); + x(i,j,k) += (rhs(i,j,k) - Ax) * (omega / gamma); + } + } +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebndfdlap_sig_adotx_eb_doit (int i, int j, int k, Array4 const& y, + Array4 const& x, Array4 const& levset, + Array4 const& dmsk, + Array4 const& ecx, Array4 const& ecy, + Array4 const& sig, Array4 const& vfrc, + F const& xeb, Real bx, Real by) noexcept +{ + if (dmsk(i,j,k)) { + y(i,j,k) = Real(0.0); + } else { + Real tmp, sigma; + Real hp, hm, scale, out; + + sigma = (sig(i,j-1,k)*vfrc(i,j-1,k) + sig(i,j,k)*vfrc(i,j,k)) + / (vfrc(i,j-1,k) + vfrc(i,j,k)); + hp = (ecx(i,j,k) == Real(1.0)) ? Real(1.0) : (Real(1.0)+Real(2.)*ecx(i,j,k)); + if (levset(i+1,j,k) < Real(0.0)) { + tmp = sigma*(x(i+1,j,k) - x(i,j,k)); + } else { + tmp = sigma*((xeb(i+1,j,k) - x(i,j,k)) / hp); + } + + sigma = (sig(i-1,j-1,k)*vfrc(i-1,j-1,k) + sig(i-1,j,k)*vfrc(i-1,j,k)) + / (vfrc(i-1,j-1,k) + vfrc(i-1,j,k)); + hm = (ecx(i-1,j,k) == Real(1.0)) ? Real(1.0) : (Real(1.0)-Real(2.)*ecx(i-1,j,k)); + if (levset(i-1,j,k) < Real(0.0)) { + tmp += sigma*(x(i-1,j,k) - x(i,j,k)); + } else { + tmp += sigma*((xeb(i-1,j,k) - x(i,j,k)) / hm); + } + + out = tmp * bx * Real(2.0) / (hp+hm); + scale = amrex::min(hm, hp); + + sigma = (sig(i-1,j,k)*vfrc(i-1,j,k) + sig(i,j,k)*vfrc(i,j,k)) + / (vfrc(i-1,j,k) + vfrc(i,j,k)); + hp = (ecy(i,j,k) == Real(1.0)) ? 
Real(1.0) : (Real(1.0)+Real(2.)*ecy(i,j,k)); + if (levset(i,j+1,k) < Real(0.0)) { + tmp = sigma*(x(i,j+1,k) - x(i,j,k)); + } else { + tmp = sigma*((xeb(i,j+1,k) - x(i,j,k)) / hp); + } + + sigma = (sig(i-1,j-1,k)*vfrc(i-1,j-1,k) + sig(i,j-1,k)*vfrc(i,j-1,k)) + / (vfrc(i-1,j-1,k) + vfrc(i,j-1,k)); + hm = (ecy(i,j-1,k) == Real(1.0)) ? Real(1.0) : (Real(1.0)-Real(2.)*ecy(i,j-1,k)); + if (levset(i,j-1,k) < Real(0.0)) { + tmp += sigma*(x(i,j-1,k) - x(i,j,k)); + } else { + tmp += sigma*((xeb(i,j-1,k) - x(i,j,k)) / hm); + } + + out += tmp * by * Real(2.0) / (hp+hm); + scale = amrex::min(scale, hm, hp); + + y(i,j,k) = out*scale; + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebndfdlap_sig_adotx_eb (int i, int j, int k, Array4 const& y, + Array4 const& x, Array4 const& levset, + Array4 const& dmsk, + Array4 const& ecx, Array4 const& ecy, + Array4 const& sig, Array4 const& vfrc, + Real xeb, Real bx, Real by) noexcept +{ + mlebndfdlap_sig_adotx_eb_doit(i, j, k, y, x, levset, dmsk, ecx, ecy, sig, vfrc, + [=] (int, int, int) -> Real { return xeb; }, + bx, by); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebndfdlap_sig_adotx_eb (int i, int j, int k, Array4 const& y, + Array4 const& x, Array4 const& levset, + Array4 const& dmsk, + Array4 const& ecx, Array4 const& ecy, + Array4 const& sig, Array4 const& vfrc, + Array4 const& xeb, Real bx, Real by) noexcept +{ + mlebndfdlap_sig_adotx_eb_doit(i, j, k, y, x, levset, dmsk, ecx, ecy, sig, vfrc, + [=] (int i1, int i2, int i3) -> Real { + return xeb(i1,i2,i3); }, + bx, by); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebndfdlap_sig_gsrb_eb (int i, int j, int k, Array4 const& x, + Array4 const& rhs, + Array4 const& levset, + Array4 const& dmsk, + Array4 const& ecx, + Array4 const& ecy, + Array4 const& sig, + Array4 const& vfrc, + Real bx, Real by, int redblack) noexcept +{ + if ((i+j+k+redblack)%2 == 0) { + if (dmsk(i,j,k)) { + x(i,j,k) = Real(0.); + } else { + Real tmp0, tmp1, sigma; + Real hp, hm, scale; + + sigma = (sig(i,j-1,k)*vfrc(i,j-1,k) + sig(i,j,k)*vfrc(i,j,k)) + / (vfrc(i,j-1,k) + vfrc(i,j,k)); + hp = (ecx(i,j,k) == Real(1.0)) ? Real(1.0) : (Real(1.0)+Real(2.)*ecx(i,j,k)); + if (levset(i+1,j,k) < Real(0.0)) { // regular + tmp0 = sigma*Real(-1.0); + tmp1 = sigma*x(i+1,j,k); + } else { + tmp0 = sigma*Real(-1.0) / hp; + tmp1 = Real(0.0); + } + + sigma = (sig(i-1,j-1,k)*vfrc(i-1,j-1,k) + sig(i-1,j,k)*vfrc(i-1,j,k)) + / (vfrc(i-1,j-1,k) + vfrc(i-1,j,k)); + hm = (ecx(i-1,j,k) == Real(1.0)) ? Real(1.0) : (Real(1.0)-Real(2.)*ecx(i-1,j,k)); + if (levset(i-1,j,k) < Real(0.0)) { + tmp0 += sigma*Real(-1.0); + tmp1 += sigma*x(i-1,j,k); + } else { + tmp0 += sigma*Real(-1.0) / hm; + } + + Real gamma = tmp0 * (bx * Real(2.0) / (hp+hm)); + Real rho = tmp1 * (bx * Real(2.0) / (hp+hm)); + scale = amrex::min(hm, hp); + + sigma = (sig(i-1,j,k)*vfrc(i-1,j,k) + sig(i,j,k)*vfrc(i,j,k)) + / (vfrc(i-1,j,k) + vfrc(i,j,k)); + hp = (ecy(i,j,k) == Real(1.0)) ? Real(1.0) : (Real(1.0)+Real(2.)*ecy(i,j,k)); + if (levset(i,j+1,k) < Real(0.0)) { + tmp0 = sigma*Real(-1.0); + tmp1 = sigma*x(i,j+1,k); + } else { + tmp0 = sigma*Real(-1.0) / hp; + tmp1 = Real(0.0); + } + + sigma = (sig(i-1,j-1,k)*vfrc(i-1,j-1,k) + sig(i,j-1,k)*vfrc(i,j-1,k)) + / (vfrc(i-1,j-1,k) + vfrc(i,j-1,k)); + hm = (ecy(i,j-1,k) == Real(1.0)) ? 
Real(1.0) : (Real(1.0)-Real(2.)*ecy(i,j-1,k)); + if (levset(i,j-1,k) < Real(0.0)) { + tmp0 += sigma*Real(-1.0); + tmp1 += sigma*x(i,j-1,k); + } else { + tmp0 += sigma*Real(-1.0) / hm; + } + + gamma += tmp0 * (by * Real(2.0) / (hp+hm)); + rho += tmp1 * (by * Real(2.0) / (hp+hm)); + scale = amrex::min(scale, hm, hp); + + Real Ax = rho + gamma*x(i,j,k); + constexpr Real omega = Real(1.25); + x(i,j,k) += (rhs(i,j,k) - Ax*scale) * (omega / (gamma*scale)); + } + } +} + } #endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLap_3D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLap_3D_K.H index bebaa33bbc2..9b7fc0fc2bd 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLap_3D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLap_3D_K.H @@ -236,6 +236,389 @@ void mlebndfdlap_gsrb (int i, int j, int k, Array4 const& x, } } +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebndfdlap_sig_adotx (int i, int j, int k, Array4 const& y, + Array4 const& x, + Array4 const& dmsk, + Array4 const& sig, + Real bx, Real by, Real bz) noexcept +{ + if (dmsk(i,j,k)) { + y(i,j,k) = Real(0.0); + } else { + Real sigxm = Real(0.25)*(sig(i-1,j-1,k-1) + + sig(i-1,j ,k-1) + + sig(i-1,j-1,k ) + + sig(i-1,j ,k )); + Real sigxp = Real(0.25)*(sig(i ,j-1,k-1) + + sig(i ,j ,k-1) + + sig(i ,j-1,k ) + + sig(i ,j ,k )); + Real sigym = Real(0.25)*(sig(i-1,j-1,k-1) + + sig(i ,j-1,k-1) + + sig(i-1,j-1,k ) + + sig(i ,j-1,k )); + Real sigyp = Real(0.25)*(sig(i-1,j ,k-1) + + sig(i ,j ,k-1) + + sig(i-1,j ,k ) + + sig(i ,j ,k )); + Real sigzm = Real(0.25)*(sig(i-1,j-1,k-1) + + sig(i ,j-1,k-1) + + sig(i-1,j ,k-1) + + sig(i ,j ,k-1)); + Real sigzp = Real(0.25)*(sig(i-1,j-1,k ) + + sig(i ,j-1,k ) + + sig(i-1,j ,k ) + + sig(i ,j ,k )); + y(i,j,k) = bx * (sigxm*x(i-1,j,k) + sigxp*x(i+1,j,k)) + + by * (sigym*x(i,j-1,k) + sigyp*x(i,j+1,k)) + + bz * (sigzm*x(i,j,k-1) + sigzp*x(i,j,k+1)) + - (bx*(sigxm+sigxp) + by*(sigym+sigyp) + bz*(sigzm+sigzp)) * x(i,j,k); + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebndfdlap_sig_gsrb (int i, int j, int k, Array4 const& x, + Array4 const& rhs, + Array4 const& dmsk, + Array4 const& sig, + Real bx, Real by, Real bz, int redblack) noexcept +{ + if ((i+j+k+redblack)%2 == 0) { + if (dmsk(i,j,k)) { + x(i,j,k) = Real(0.); + } else { + Real sigxm = Real(0.25)*(sig(i-1,j-1,k-1) + + sig(i-1,j ,k-1) + + sig(i-1,j-1,k ) + + sig(i-1,j ,k )); + Real sigxp = Real(0.25)*(sig(i ,j-1,k-1) + + sig(i ,j ,k-1) + + sig(i ,j-1,k ) + + sig(i ,j ,k )); + Real sigym = Real(0.25)*(sig(i-1,j-1,k-1) + + sig(i ,j-1,k-1) + + sig(i-1,j-1,k ) + + sig(i ,j-1,k )); + Real sigyp = Real(0.25)*(sig(i-1,j ,k-1) + + sig(i ,j ,k-1) + + sig(i-1,j ,k ) + + sig(i ,j ,k )); + Real sigzm = Real(0.25)*(sig(i-1,j-1,k-1) + + sig(i ,j-1,k-1) + + sig(i-1,j ,k-1) + + sig(i ,j ,k-1)); + Real sigzp = Real(0.25)*(sig(i-1,j-1,k ) + + sig(i ,j-1,k ) + + sig(i-1,j ,k ) + + sig(i ,j ,k )); + Real gamma = -(bx*(sigxm+sigxp) + by*(sigym+sigyp) + bz*(sigzm+sigzp)); + Real Ax = bx * (sigxm*x(i-1,j,k) + sigxp*x(i+1,j,k)) + + by * (sigym*x(i,j-1,k) + sigyp*x(i,j+1,k)) + + bz * (sigzm*x(i,j,k-1) + sigzp*x(i,j,k+1)) + + gamma * x(i,j,k); + constexpr Real omega = Real(1.25); + x(i,j,k) += (rhs(i,j,k) - Ax) * (omega / gamma); + } + } +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebndfdlap_sig_adotx_eb_doit (int i, int j, int k, Array4 const& y, + Array4 const& x, + Array4 const& levset, + Array4 const& dmsk, + Array4 const& ecx, + Array4 const& ecy, + Array4 const& ecz, + Array4 const& sig, + Array4 const& vfrc, + F const& xeb, + Real bx, 
Real by, Real bz) noexcept +{ + if (dmsk(i,j,k)) { + y(i,j,k) = Real(0.0); + } else { + Real tmp, sigma; + Real hp, hm, scale, out; + + sigma = (sig(i ,j-1,k-1) * vfrc(i ,j-1,k-1) + + sig(i ,j ,k-1) * vfrc(i ,j ,k-1) + + sig(i ,j-1,k ) * vfrc(i ,j-1,k ) + + sig(i ,j ,k ) * vfrc(i ,j ,k )) + / ( vfrc(i ,j-1,k-1) + + vfrc(i ,j ,k-1) + + vfrc(i ,j-1,k ) + + vfrc(i ,j ,k )); + hp = (ecx(i,j,k) == Real(1.0)) ? Real(1.0) : (Real(1.0)+Real(2.)*ecx(i,j,k)); + if (levset(i+1,j,k) < Real(0.0)) { // regular + tmp = sigma*(x(i+1,j,k) - x(i,j,k)); + } else { + tmp = sigma*((xeb(i+1,j,k) - x(i,j,k)) / hp); + } + + sigma = (sig(i-1,j-1,k-1) * vfrc(i-1,j-1,k-1) + + sig(i-1,j ,k-1) * vfrc(i-1,j ,k-1) + + sig(i-1,j-1,k ) * vfrc(i-1,j-1,k ) + + sig(i-1,j ,k ) * vfrc(i-1,j ,k )) + / ( vfrc(i-1,j-1,k-1) + + vfrc(i-1,j ,k-1) + + vfrc(i-1,j-1,k ) + + vfrc(i-1,j ,k )); + hm = (ecx(i-1,j,k) == Real(1.0)) ? Real(1.0) : (Real(1.0)-Real(2.)*ecx(i-1,j,k)); + if (levset(i-1,j,k) < Real(0.0)) { + tmp += sigma*(x(i-1,j,k) - x(i,j,k)); + } else { + tmp += sigma*((xeb(i-1,j,k) - x(i,j,k)) / hm); + } + + out = tmp * bx * Real(2.0) / (hp+hm); + scale = amrex::min(hm, hp); + + sigma = (sig(i-1,j ,k-1) * vfrc(i-1,j ,k-1) + + sig(i ,j ,k-1) * vfrc(i ,j ,k-1) + + sig(i-1,j ,k ) * vfrc(i-1,j ,k ) + + sig(i ,j ,k ) * vfrc(i ,j ,k )) + / ( vfrc(i-1,j ,k-1) + + vfrc(i ,j ,k-1) + + vfrc(i-1,j ,k ) + + vfrc(i ,j ,k )); + hp = (ecy(i,j,k) == Real(1.0)) ? Real(1.0) : (Real(1.0)+Real(2.)*ecy(i,j,k)); + if (levset(i,j+1,k) < Real(0.0)) { + tmp = sigma*(x(i,j+1,k) - x(i,j,k)); + } else { + tmp = sigma*((xeb(i,j+1,k) - x(i,j,k)) / hp); + } + + sigma = (sig(i-1,j-1,k-1) * vfrc(i-1,j-1,k-1) + + sig(i ,j-1,k-1) * vfrc(i ,j-1,k-1) + + sig(i-1,j-1,k ) * vfrc(i-1,j-1,k ) + + sig(i ,j-1,k ) * vfrc(i ,j-1,k )) + / ( vfrc(i-1,j-1,k-1) + + vfrc(i ,j-1,k-1) + + vfrc(i-1,j-1,k ) + + vfrc(i ,j-1,k )); + hm = (ecy(i,j-1,k) == Real(1.0)) ? Real(1.0) : (Real(1.0)-Real(2.)*ecy(i,j-1,k)); + if (levset(i,j-1,k) < Real(0.0)) { + tmp += sigma*(x(i,j-1,k) - x(i,j,k)); + } else { + tmp += sigma*((xeb(i,j-1,k) - x(i,j,k)) / hm); + } + + out += tmp * by * Real(2.0) / (hp+hm); + scale = amrex::min(scale, hm, hp); + + sigma = (sig(i-1,j-1,k ) * vfrc(i-1,j-1,k ) + + sig(i ,j-1,k ) * vfrc(i ,j-1,k ) + + sig(i-1,j ,k ) * vfrc(i-1,j ,k ) + + sig(i ,j ,k ) * vfrc(i ,j ,k )) + / ( vfrc(i-1,j-1,k ) + + vfrc(i ,j-1,k ) + + vfrc(i-1,j ,k ) + + vfrc(i ,j ,k )); + hp = (ecz(i,j,k) == Real(1.0)) ? Real(1.0) : (Real(1.0)+Real(2.0)*ecz(i,j,k)); + if (levset(i,j,k+1) < Real(0.0)) { + tmp = sigma*(x(i,j,k+1) - x(i,j,k)); + } else { + tmp = sigma*((xeb(i,j,k+1) - x(i,j,k)) / hp); + } + + sigma = (sig(i-1,j-1,k-1) * vfrc(i-1,j-1,k-1) + + sig(i ,j-1,k-1) * vfrc(i ,j-1,k-1) + + sig(i-1,j ,k-1) * vfrc(i-1,j ,k-1) + + sig(i ,j ,k-1) * vfrc(i ,j ,k-1)) + / ( vfrc(i-1,j-1,k-1) + + vfrc(i ,j-1,k-1) + + vfrc(i-1,j ,k-1) + + vfrc(i ,j ,k-1)); + hm = (ecz(i,j,k-1) == Real(1.0)) ? 
Real(1.0) : (Real(1.0)-Real(2.)*ecz(i,j,k-1)); + if (levset(i,j,k-1) < Real(0.0)) { + tmp += sigma*(x(i,j,k-1) - x(i,j,k)); + } else { + tmp += sigma*((xeb(i,j,k-1) - x(i,j,k)) / hm); + } + + out += tmp * bz * Real(2.0) / (hp+hm); + scale = amrex::min(scale, hm, hp); + + y(i,j,k) = out*scale; + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebndfdlap_sig_adotx_eb (int i, int j, int k, Array4 const& y, + Array4 const& x, + Array4 const& levset, + Array4 const& dmsk, + Array4 const& ecx, + Array4 const& ecy, + Array4 const& ecz, + Array4 const& sig, + Array4 const& vfrc, + Real xeb, Real bx, Real by, Real bz) noexcept +{ + mlebndfdlap_sig_adotx_eb_doit(i, j, k, y, x, levset, dmsk, ecx, ecy, ecz, sig, vfrc, + [=] (int, int, int) -> Real { return xeb; }, + bx, by, bz); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebndfdlap_sig_adotx_eb (int i, int j, int k, Array4 const& y, + Array4 const& x, + Array4 const& levset, + Array4 const& dmsk, + Array4 const& ecx, + Array4 const& ecy, + Array4 const& ecz, + Array4 const& sig, + Array4 const& vfrc, + Array4 const& xeb, + Real bx, Real by, Real bz) noexcept +{ + mlebndfdlap_sig_adotx_eb_doit(i, j, k, y, x, levset, dmsk, ecx, ecy, ecz, sig, vfrc, + [=] (int i1, int i2, int i3) -> Real { + return xeb(i1,i2,i3); }, + bx, by, bz); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebndfdlap_sig_gsrb_eb (int i, int j, int k, Array4 const& x, + Array4 const& rhs, + Array4 const& levset, + Array4 const& dmsk, + Array4 const& ecx, + Array4 const& ecy, + Array4 const& ecz, + Array4 const& sig, + Array4 const& vfrc, + Real bx, Real by, Real bz, int redblack) noexcept +{ + if ((i+j+k+redblack)%2 == 0) { + if (dmsk(i,j,k)) { + x(i,j,k) = Real(0.); + } else { + Real tmp0, tmp1, sigma; + Real hp, hm, scale; + + sigma = (sig(i ,j-1,k-1) * vfrc(i ,j-1,k-1) + + sig(i ,j ,k-1) * vfrc(i ,j ,k-1) + + sig(i ,j-1,k ) * vfrc(i ,j-1,k ) + + sig(i ,j ,k ) * vfrc(i ,j ,k )) + / ( vfrc(i ,j-1,k-1) + + vfrc(i ,j ,k-1) + + vfrc(i ,j-1,k ) + + vfrc(i ,j ,k )); + hp = (ecx(i,j,k) == Real(1.0)) ? Real(1.0) : (Real(1.0)+Real(2.)*ecx(i,j,k)); + if (levset(i+1,j,k) < Real(0.0)) { // regular + tmp0 = sigma*Real(-1.0); + tmp1 = sigma*x(i+1,j,k); + } else { + tmp0 = sigma*Real(-1.0) / hp; + tmp1 = Real(0.0); + } + + sigma = (sig(i-1,j-1,k-1) * vfrc(i-1,j-1,k-1) + + sig(i-1,j ,k-1) * vfrc(i-1,j ,k-1) + + sig(i-1,j-1,k ) * vfrc(i-1,j-1,k ) + + sig(i-1,j ,k ) * vfrc(i-1,j ,k )) + / ( vfrc(i-1,j-1,k-1) + + vfrc(i-1,j ,k-1) + + vfrc(i-1,j-1,k ) + + vfrc(i-1,j ,k )); + hm = (ecx(i-1,j,k) == Real(1.0)) ? Real(1.0) : (Real(1.0)-Real(2.)*ecx(i-1,j,k)); + if (levset(i-1,j,k) < Real(0.0)) { + tmp0 += sigma*Real(-1.0); + tmp1 += sigma*x(i-1,j,k); + } else { + tmp0 += sigma*Real(-1.0) / hm; + } + + Real gamma = tmp0 * (bx * Real(2.0) / (hp+hm)); + Real rho = tmp1 * (bx * Real(2.0) / (hp+hm)); + scale = amrex::min(hm, hp); + + sigma = (sig(i-1,j ,k-1) * vfrc(i-1,j ,k-1) + + sig(i ,j ,k-1) * vfrc(i ,j ,k-1) + + sig(i-1,j ,k ) * vfrc(i-1,j ,k ) + + sig(i ,j ,k ) * vfrc(i ,j ,k )) + / ( vfrc(i-1,j ,k-1) + + vfrc(i ,j ,k-1) + + vfrc(i-1,j ,k ) + + vfrc(i ,j ,k )); + hp = (ecy(i,j,k) == Real(1.0)) ? 
Real(1.0) : (Real(1.0)+Real(2.)*ecy(i,j,k)); + if (levset(i,j+1,k) < Real(0.0)) { + tmp0 = sigma*Real(-1.0); + tmp1 = sigma*x(i,j+1,k); + } else { + tmp0 = sigma*Real(-1.0) / hp; + tmp1 = Real(0.0); + } + + sigma = (sig(i-1,j-1,k-1) * vfrc(i-1,j-1,k-1) + + sig(i ,j-1,k-1) * vfrc(i ,j-1,k-1) + + sig(i-1,j-1,k ) * vfrc(i-1,j-1,k ) + + sig(i ,j-1,k ) * vfrc(i ,j-1,k )) + / ( vfrc(i-1,j-1,k-1) + + vfrc(i ,j-1,k-1) + + vfrc(i-1,j-1,k ) + + vfrc(i ,j-1,k )); + hm = (ecy(i,j-1,k) == Real(1.0)) ? Real(1.0) : (Real(1.0)-Real(2.)*ecy(i,j-1,k)); + if (levset(i,j-1,k) < Real(0.0)) { + tmp0 += sigma*Real(-1.0); + tmp1 += sigma*x(i,j-1,k); + } else { + tmp0 += sigma*Real(-1.0) / hm; + } + + gamma += tmp0 * (by * Real(2.0) / (hp+hm)); + rho += tmp1 * (by * Real(2.0) / (hp+hm)); + scale = amrex::min(scale, hm, hp); + + sigma = (sig(i-1,j-1,k ) * vfrc(i-1,j-1,k ) + + sig(i ,j-1,k ) * vfrc(i ,j-1,k ) + + sig(i-1,j ,k ) * vfrc(i-1,j ,k ) + + sig(i ,j ,k ) * vfrc(i ,j ,k )) + / ( vfrc(i-1,j-1,k ) + + vfrc(i ,j-1,k ) + + vfrc(i-1,j ,k ) + + vfrc(i ,j ,k )); + hp = (ecz(i,j,k) == Real(1.0)) ? Real(1.0) : (Real(1.0)+Real(2.0)*ecz(i,j,k)); + if (levset(i,j,k+1) < Real(0.0)) { + tmp0 = sigma*Real(-1.0); + tmp1 = sigma*x(i,j,k+1); + } else { + tmp0 = sigma*Real(-1.0) / hp; + tmp1 = Real(0.0); + } + + sigma = (sig(i-1,j-1,k-1) * vfrc(i-1,j-1,k-1) + + sig(i ,j-1,k-1) * vfrc(i ,j-1,k-1) + + sig(i-1,j ,k-1) * vfrc(i-1,j ,k-1) + + sig(i ,j ,k-1) * vfrc(i ,j ,k-1)) + / ( vfrc(i-1,j-1,k-1) + + vfrc(i ,j-1,k-1) + + vfrc(i-1,j ,k-1) + + vfrc(i ,j ,k-1)); + hm = (ecz(i,j,k-1) == Real(1.0)) ? Real(1.0) : (Real(1.0)-Real(2.)*ecz(i,j,k-1)); + if (levset(i,j,k-1) < Real(0.0)) { + tmp0 += sigma*Real(-1.0); + tmp1 += sigma*x(i,j,k-1); + } else { + tmp0 += sigma*Real(-1.0) / hm; + } + + gamma += tmp0 * (bz * Real(2.0) / (hp+hm)); + rho += tmp1 * (bz * Real(2.0) / (hp+hm)); + scale = amrex::min(scale, hm, hp); + + Real Ax = rho + gamma*x(i,j,k); + constexpr Real omega = Real(1.25); + x(i,j,k) += (rhs(i,j,k) - Ax*scale) * (omega / (gamma*scale)); + } + } +} + } #endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.H b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.H index 6e7559d21d9..6ebbd2c65e7 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.H @@ -20,6 +20,9 @@ namespace amrex { // // del dot (sigma grad phi) - alpha/r^2 phi = rhs, for RZ where alpha is a // scalar constant that is zero by default. +// +// New feature: for non-RZ, sigma can also be a single-component +// cell-centered multifab. 
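For reference, a minimal sketch of how the new cell-centered sigma overload might be called from application code. Everything other than the setSigma(0, sigma) call itself (the operator and MLMG setup, the Dirichlet boundary choice, and the helper name solve_with_cc_sigma) is illustrative boilerplate assumed for the example, and the overload currently supports a single AMR level only:

#include <AMReX_MLEBNodeFDLaplacian.H>
#include <AMReX_MLMG.H>

using namespace amrex;

// Sketch only: geom/grids/dmap and the node-centered phi/rhs are assumed to have been
// built already (after amrex::Initialize); the one call introduced by this patch is
// linop.setSigma(0, sigma).
void solve_with_cc_sigma (Geometry const& geom, BoxArray const& grids,
                          DistributionMapping const& dmap,
                          MultiFab& phi, MultiFab const& rhs)
{
    MLEBNodeFDLaplacian linop({geom}, {grids}, {dmap}, LPInfo{});
    linop.setDomainBC({AMREX_D_DECL(LinOpBCType::Dirichlet,
                                    LinOpBCType::Dirichlet,
                                    LinOpBCType::Dirichlet)},
                      {AMREX_D_DECL(LinOpBCType::Dirichlet,
                                    LinOpBCType::Dirichlet,
                                    LinOpBCType::Dirichlet)});

    // Single-component, cell-centered coefficient; a spatially varying field
    // would be filled here instead of a constant.
    MultiFab sigma(grids, dmap, 1, 0);
    sigma.setVal(1.0);
    linop.setSigma(0, sigma);   // new overload; single AMR level only

    MLMG mlmg(linop);
    mlmg.solve({&phi}, {&rhs}, Real(1.e-11), Real(0.0));
}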
class MLEBNodeFDLaplacian : public MLNodeLinOp @@ -50,6 +53,8 @@ public: void setSigma (Array const& a_sigma) noexcept; + void setSigma (int amrlev, MultiFab const& a_sigma); + void setRZ (bool flag); void setAlpha (Real a_alpha); @@ -116,6 +121,8 @@ public: private: GpuArray m_sigma{{AMREX_D_DECL(1_rt,1_rt,1_rt)}}; + Vector>> m_sigma_mf; + bool m_has_sigma_mf = false; Real m_s_phi_eb = std::numeric_limits::lowest(); Vector m_phi_eb; int m_rz = false; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp index 5559f133821..af4a6a6d742 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp @@ -4,6 +4,11 @@ #include #include +#ifdef AMREX_USE_EB +#include +#include +#endif + namespace amrex { #ifdef AMREX_USE_EB @@ -35,6 +40,19 @@ MLEBNodeFDLaplacian::setSigma (Array const& a_sigma) noexce } } +void +MLEBNodeFDLaplacian::setSigma (int amrlev, MultiFab const& a_sigma) +{ + m_has_sigma_mf = true; + m_sigma_mf[amrlev][0] = std::make_unique + (this->m_grids[amrlev][0], this->m_dmap[amrlev][0], 1, 1, MFInfo{}, + *(this->m_factory[amrlev][0])); + MultiFab::Copy(*m_sigma_mf[amrlev][0], a_sigma, 0, 0, 1, 0); +#ifdef AMREX_USE_EB + amrex::EB_set_covered(*m_sigma_mf[amrlev][0], Real(0.0)); +#endif +} + void MLEBNodeFDLaplacian::setRZ (bool flag) // NOLINT { @@ -92,6 +110,11 @@ MLEBNodeFDLaplacian::define (const Vector& a_geom, int eb_limit_coarsening = true; m_coarsening_strategy = CoarseningStrategy::Sigma; // This will fill nodes outside Neumann BC MLNodeLinOp::define(a_geom, cc_grids, a_dmap, a_info, _factory, eb_limit_coarsening); + + m_sigma_mf.resize(this->m_num_amr_levels); + for (int ilev = 0; ilev < this->m_num_amr_levels; ++ilev) { + m_sigma_mf[ilev].resize(this->m_num_mg_levels[ilev]); + } } #endif @@ -118,16 +141,25 @@ MLEBNodeFDLaplacian::define (const Vector& a_geom, m_coarsening_strategy = CoarseningStrategy::Sigma; // This will fill nodes outside Neumann BC MLNodeLinOp::define(a_geom, cc_grids, a_dmap, a_info); + + m_sigma_mf.resize(this->m_num_amr_levels); + for (int ilev = 0; ilev < this->m_num_amr_levels; ++ilev) { + m_sigma_mf[ilev].resize(this->m_num_mg_levels[ilev]); + } } #ifdef AMREX_USE_EB std::unique_ptr > MLEBNodeFDLaplacian::makeFactory (int amrlev, int mglev) const { - return makeEBFabFactory(m_geom[amrlev][mglev], - m_grids[amrlev][mglev], - m_dmap[amrlev][mglev], - {1,1,1}, EBSupport::full); + if (EB2::TopIndexSpaceIfPresent()) { + return makeEBFabFactory(m_geom[amrlev][mglev], + m_grids[amrlev][mglev], + m_dmap[amrlev][mglev], + {1,1,1}, EBSupport::full); + } else { + return MLNodeLinOp::makeFactory(amrlev, mglev); + } } #endif @@ -138,7 +170,7 @@ MLEBNodeFDLaplacian::restriction (int amrlev, int cmglev, MultiFab& crse, MultiF applyBC(amrlev, cmglev-1, fine, BCMode::Homogeneous, StateMode::Solution); - IntVect const ratio = mg_coarsen_ratio_vec[cmglev-1]; + IntVect const ratio = (amrlev > 0) ? IntVect(2) : mg_coarsen_ratio_vec[cmglev-1]; int semicoarsening_dir = info.semicoarsening_direction; bool need_parallel_copy = !amrex::isMFIterSafe(crse, fine); @@ -184,7 +216,7 @@ MLEBNodeFDLaplacian::interpolation (int amrlev, int fmglev, MultiFab& fine, { BL_PROFILE("MLEBNodeFDLaplacian::interpolation()"); - IntVect const ratio = mg_coarsen_ratio_vec[fmglev]; + IntVect const ratio = (amrlev > 0) ? 
IntVect(2) : mg_coarsen_ratio_vec[fmglev]; int semicoarsening_dir = info.semicoarsening_direction; bool need_parallel_copy = !amrex::isMFIterSafe(crse, fine); @@ -237,17 +269,19 @@ MLEBNodeFDLaplacian::prepareForSolve () for (int amrlev = 0; amrlev < m_num_amr_levels; ++amrlev) { for (int mglev = 0; mglev < m_num_mg_levels[amrlev]; ++mglev) { const auto *factory = dynamic_cast(m_factory[amrlev][mglev].get()); - auto const& levset_mf = factory->getLevelSet(); - auto const& levset_ar = levset_mf.const_arrays(); - auto& dmask_mf = *m_dirichlet_mask[amrlev][mglev]; - auto const& dmask_ar = dmask_mf.arrays(); - amrex::ParallelFor(dmask_mf, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - if (levset_ar[box_no](i,j,k) >= Real(0.0)) { - dmask_ar[box_no](i,j,k) = -1; - } - }); + if (factory) { + auto const& levset_mf = factory->getLevelSet(); + auto const& levset_ar = levset_mf.const_arrays(); + auto& dmask_mf = *m_dirichlet_mask[amrlev][mglev]; + auto const& dmask_ar = dmask_mf.arrays(); + amrex::ParallelFor(dmask_mf, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + if (levset_ar[box_no](i,j,k) >= Real(0.0)) { + dmask_ar[box_no](i,j,k) = -1; + } + }); + } } } #endif @@ -279,16 +313,63 @@ MLEBNodeFDLaplacian::prepareForSolve () if (m_sigma[0] == 0._rt) { m_sigma[0] = 1._rt; // For backward compatibility } + AMREX_ASSERT(!m_has_sigma_mf); } #endif + + if (m_has_sigma_mf) { + AMREX_D_TERM(m_sigma[0] = Real(1.0);, + m_sigma[1] = Real(1.0);, + m_sigma[2] = Real(1.0)); + AMREX_ALWAYS_ASSERT(this->m_num_amr_levels == 1); + for (int amrlev = 0; amrlev < this->m_num_amr_levels; ++amrlev) { + for (int mglev = 1; mglev < this->m_num_mg_levels[amrlev]; ++mglev) { + m_sigma_mf[amrlev][mglev] = std::make_unique + (this->m_grids[amrlev][mglev], this->m_dmap[amrlev][mglev], 1, 1, + MFInfo{}, *(this->m_factory[amrlev][mglev])); + IntVect const ratio = (amrlev > 0) ? IntVect (2) + : this->mg_coarsen_ratio_vec[mglev-1]; +#ifdef AMREX_USE_EB + amrex::EB_average_down +#else + amrex::average_down +#endif + (*m_sigma_mf[amrlev][mglev-1], + *m_sigma_mf[amrlev][mglev], 0, 1, ratio); + } + + for (int mglev = 0; mglev < this->m_num_mg_levels[amrlev]; ++mglev) { + auto const& geom = this->m_geom[amrlev][mglev]; + auto& sigma = *m_sigma_mf[amrlev][mglev]; + sigma.FillBoundary(geom.periodicity()); + + const Box& domain = geom.Domain(); + const auto lobc = LoBC(); + const auto hibc = HiBC(); + + MFItInfo mfi_info; + if (Gpu::notInLaunchRegion()) { mfi_info.SetDynamic(true); } +#ifdef AMREX_USE_OMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for (MFIter mfi(sigma, mfi_info); mfi.isValid(); ++mfi) + { + Array4 const& sfab = sigma.array(mfi); + mlndlap_fillbc_cc(mfi.validbox(),sfab,domain,lobc,hibc); + } + } + } + } } #ifdef AMREX_USE_EB void MLEBNodeFDLaplacian::scaleRHS (int amrlev, MultiFab& rhs) const { - auto const& dmask = *m_dirichlet_mask[amrlev][0]; const auto *factory = dynamic_cast(m_factory[amrlev][0].get()); + if (!factory) { return; } + + auto const& dmask = *m_dirichlet_mask[amrlev][0]; auto const& edgecent = factory->getEdgeCent(); #ifdef AMREX_USE_OMP @@ -335,8 +416,10 @@ MLEBNodeFDLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFa #ifdef AMREX_USE_EB const auto phieb = (m_in_solution_mode) ? 
m_s_phi_eb : Real(0.0); const auto *factory = dynamic_cast(m_factory[amrlev][mglev].get()); - auto const& edgecent = factory->getEdgeCent(); - auto const& levset_mf = factory->getLevelSet(); + Array edgecent {AMREX_D_DECL(nullptr,nullptr,nullptr)}; + if (factory) { + edgecent = factory->getEdgeCent(); + } #endif #ifdef AMREX_USE_OMP @@ -349,12 +432,12 @@ MLEBNodeFDLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFa Array4 const& yarr = out.array(mfi); Array4 const& dmarr = dmask.const_array(mfi); #ifdef AMREX_USE_EB - bool cutfab = edgecent[0]->ok(mfi); - if (cutfab) { + bool cutfab = edgecent[0] && edgecent[0]->ok(mfi); + if (cutfab && factory) { // clang-tidy is not that smart AMREX_D_TERM(Array4 const& ecx = edgecent[0]->const_array(mfi);, Array4 const& ecy = edgecent[1]->const_array(mfi);, Array4 const& ecz = edgecent[2]->const_array(mfi)); - auto const& levset = levset_mf.const_array(mfi); + auto const& levset = factory->getLevelSet().const_array(mfi); if (phieb == std::numeric_limits::lowest()) { auto const& phiebarr = m_phi_eb[amrlev].const_array(mfi); #if (AMREX_SPACEDIM == 2) @@ -366,7 +449,15 @@ MLEBNodeFDLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFa }); } else #endif - { + if (m_has_sigma_mf) { + auto const& sigarr = m_sigma_mf[amrlev][mglev]->const_array(mfi); + auto const& vfrc = factory->getVolFrac().const_array(mfi); + AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, + { + mlebndfdlap_sig_adotx_eb(i,j,k,yarr,xarr,levset,dmarr,AMREX_D_DECL(ecx,ecy,ecz), + sigarr, vfrc, phiebarr, AMREX_D_DECL(bx,by,bz)); + }); + } else { AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, { mlebndfdlap_adotx_eb(i,j,k,yarr,xarr,levset,dmarr,AMREX_D_DECL(ecx,ecy,ecz), @@ -383,7 +474,15 @@ MLEBNodeFDLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFa }); } else #endif - { + if (m_has_sigma_mf) { + auto const& sigarr = m_sigma_mf[amrlev][mglev]->const_array(mfi); + auto const& vfrc = factory->getVolFrac().const_array(mfi); + AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, + { + mlebndfdlap_sig_adotx_eb(i,j,k,yarr,xarr,levset,dmarr,AMREX_D_DECL(ecx,ecy,ecz), + sigarr, vfrc, phieb, AMREX_D_DECL(bx,by,bz)); + }); + } else { AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, { mlebndfdlap_adotx_eb(i,j,k,yarr,xarr,levset,dmarr,AMREX_D_DECL(ecx,ecy,ecz), @@ -402,7 +501,13 @@ MLEBNodeFDLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFa }); } else #endif - { + if (m_has_sigma_mf) { + auto const& sigarr = m_sigma_mf[amrlev][mglev]->const_array(mfi); + AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, + { + mlebndfdlap_sig_adotx(i,j,k,yarr,xarr,dmarr,sigarr,AMREX_D_DECL(bx,by,bz)); + }); + } else { AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, { mlebndfdlap_adotx(i,j,k,yarr,xarr,dmarr,AMREX_D_DECL(bx,by,bz)); @@ -438,8 +543,10 @@ MLEBNodeFDLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiF #ifdef AMREX_USE_EB const auto *factory = dynamic_cast(m_factory[amrlev][mglev].get()); - auto const& edgecent = factory->getEdgeCent(); - auto const& levset_mf = factory->getLevelSet(); + Array edgecent {AMREX_D_DECL(nullptr,nullptr,nullptr)}; + if (factory) { + edgecent = factory->getEdgeCent(); + } #endif #ifdef AMREX_USE_OMP @@ -452,12 +559,12 @@ MLEBNodeFDLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiF Array4 const& rhsarr = rhs.const_array(mfi); Array4 const& dmskarr = dmask.const_array(mfi); #ifdef AMREX_USE_EB - bool cutfab = edgecent[0]->ok(mfi); - if (cutfab) { + bool cutfab = edgecent[0] && edgecent[0]->ok(mfi); + if (cutfab && factory) { // 
clang-tidy is not that smart AMREX_D_TERM(Array4 const& ecx = edgecent[0]->const_array(mfi);, Array4 const& ecy = edgecent[1]->const_array(mfi);, Array4 const& ecz = edgecent[2]->const_array(mfi)); - auto const& levset = levset_mf.const_array(mfi); + auto const& levset = factory->getLevelSet().const_array(mfi); #if (AMREX_SPACEDIM == 2) if (m_rz) { AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, @@ -467,7 +574,15 @@ MLEBNodeFDLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiF }); } else #endif - { + if (m_has_sigma_mf) { + auto const& sigarr = m_sigma_mf[amrlev][mglev]->const_array(mfi); + auto const& vfrc = factory->getVolFrac().const_array(mfi); + AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, + { + mlebndfdlap_sig_gsrb_eb(i,j,k,solarr,rhsarr,levset,dmskarr,AMREX_D_DECL(ecx,ecy,ecz), + sigarr, vfrc, AMREX_D_DECL(bx,by,bz), redblack); + }); + } else { AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, { mlebndfdlap_gsrb_eb(i,j,k,solarr,rhsarr,levset,dmskarr,AMREX_D_DECL(ecx,ecy,ecz), @@ -486,7 +601,14 @@ MLEBNodeFDLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiF }); } else #endif - { + if (m_has_sigma_mf) { + auto const& sigarr = m_sigma_mf[amrlev][mglev]->const_array(mfi); + AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, + { + mlebndfdlap_sig_gsrb(i,j,k,solarr,rhsarr,dmskarr,sigarr, + AMREX_D_DECL(bx,by,bz), redblack); + }); + } else { AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, { mlebndfdlap_gsrb(i,j,k,solarr,rhsarr,dmskarr, @@ -530,8 +652,10 @@ MLEBNodeFDLaplacian::compGrad (int amrlev, const Array auto const& dmask = *m_dirichlet_mask[amrlev][mglev]; const auto phieb = m_s_phi_eb; const auto *factory = dynamic_cast(m_factory[amrlev][mglev].get()); - AMREX_ASSERT(factory); - auto const& edgecent = factory->getEdgeCent(); + Array edgecent {AMREX_D_DECL(nullptr,nullptr,nullptr)}; + if (factory) { + edgecent = factory->getEdgeCent(); + } #endif #ifdef AMREX_USE_OMP @@ -548,7 +672,7 @@ MLEBNodeFDLaplacian::compGrad (int amrlev, const Array Array4 const& gpz = grad[2]->array(mfi);) #ifdef AMREX_USE_EB Array4 const& dmarr = dmask.const_array(mfi); - bool cutfab = edgecent[0]->ok(mfi); + bool cutfab = edgecent[0] && edgecent[0]->ok(mfi); AMREX_D_TERM(Array4 const& ecx = cutfab ? 
edgecent[0]->const_array(mfi) : Array4{};, Array4 const& ecy @@ -630,6 +754,7 @@ MLEBNodeFDLaplacian::postSolve (Vector& sol) const for (int amrlev = 0; amrlev < m_num_amr_levels; ++amrlev) { const auto phieb = m_s_phi_eb; const auto *factory = dynamic_cast(m_factory[amrlev][0].get()); + if (!factory) { return; } auto const& levset_mf = factory->getLevelSet(); auto const& levset_ar = levset_mf.const_arrays(); MultiFab& mf = sol[amrlev]; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H index 03da0874e79..3cc623b761e 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H @@ -376,6 +376,8 @@ public: virtual void solutionResidual (int amrlev, MF& resid, MF& x, const MF& b, const MF* crse_bcdata=nullptr) = 0; + virtual void prepareForFluxes (int /*amrlev*/, const MF* /*crse_bcdata*/ = nullptr) {} + /** * \brief Compute residual for the residual-correction form, resid = b - L(x) * diff --git a/Src/LinearSolvers/MLMG/AMReX_MLMG.H b/Src/LinearSolvers/MLMG/AMReX_MLMG.H index 77f9abe409e..78b2ffdd3df 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLMG.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLMG.H @@ -164,6 +164,8 @@ public: void setHypreStrongThreshold (Real t) noexcept {hypre_strong_threshold = t;} #endif + void prepareForFluxes (Vector const& a_sol); + template void prepareForSolve (Vector const& a_sol, Vector const& a_rhs); @@ -538,6 +540,16 @@ MLMGT::solve (const Vector& a_sol, const Vector& a_rhs, return composite_norminf; } +template +void +MLMGT::prepareForFluxes (Vector const& a_sol) +{ + for (int alev = finest_amr_lev; alev >= 0; --alev) { + const MF* crse_bcdata = (alev > 0) ? a_sol[alev-1] : nullptr; + linop.prepareForFluxes(alev, crse_bcdata); + } +} + template template void @@ -971,7 +983,6 @@ MLMGT::prepareForSolve (Vector const& a_sol, Vector const& if (nGrowVect(*a_sol[alev]) == ng_sol) { if constexpr (std::is_same()) { sol[alev] = linop.makeAlias(*a_sol[alev]); - setBndry(sol[alev], RT(0.0), 0, ncomp); sol_is_alias[alev] = true; } } @@ -1558,29 +1569,32 @@ MLMGT::actualBottomSolve () } else { cg_type = MLCGSolverT::Type::BiCGStab; } + int ret = bottomSolveWithCG(x, *bottom_b, cg_type); - // If the MLMG solve failed then set the correction to zero - if (ret != 0 && ret != 9) { + + if (ret != 0 && (bottom_solver == BottomSolver::cgbicg || + bottom_solver == BottomSolver::bicgcg)) + { + if (bottom_solver == BottomSolver::cgbicg) { + cg_type = MLCGSolverT::Type::BiCGStab; // switch to bicg + } else { + cg_type = MLCGSolverT::Type::CG; // switch to cg + } setVal(cor[amrlev][mglev], RT(0.0)); - if (bottom_solver == BottomSolver::cgbicg || - bottom_solver == BottomSolver::bicgcg) { - if (bottom_solver == BottomSolver::cgbicg) { - cg_type = MLCGSolverT::Type::BiCGStab; // switch to bicg + ret = bottomSolveWithCG(x, *bottom_b, cg_type); + if (ret == 0) { // switch permanently + if (cg_type == MLCGSolverT::Type::CG) { + bottom_solver = BottomSolver::cg; } else { - cg_type = MLCGSolverT::Type::CG; // switch to cg - } - ret = bottomSolveWithCG(x, *bottom_b, cg_type); - if (ret != 0) { - setVal(cor[amrlev][mglev], RT(0.0)); - } else { // switch permanently - if (cg_type == MLCGSolverT::Type::CG) { - bottom_solver = BottomSolver::cg; - } else { - bottom_solver = BottomSolver::bicgstab; - } + bottom_solver = BottomSolver::bicgstab; } } } + + // If the bottom solve failed then set the correction to zero + if (ret != 0 && ret != 9) { + setVal(cor[amrlev][mglev], RT(0.0)); + } const int n = (ret==0) ? 
nub : nuf; for (int i = 0; i < n; ++i) { linop.smooth(amrlev, mglev, x, b); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_1D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_1D_K.H index 4f982e07752..91d02257396 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_1D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_1D_K.H @@ -194,7 +194,7 @@ void mlndlap_normalize_aa (int i, int j, int k, Array4 const& x, mlndlap_normalize_ha(i,j,k,x,sx,msk,dxinv); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +AMREX_GPU_DEVICE AMREX_FORCE_INLINE void mlndlap_jacobi_ha (int i, int, int, Array4 const& sol, Real Ax, Array4 const& rhs, Array4 const& sx, Array4 const& msk, @@ -208,7 +208,7 @@ void mlndlap_jacobi_ha (int i, int, int, Array4 const& sol, Real Ax, } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_jacobi_ha (Box const& bx, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Array4 const& sx, Array4 const& msk, @@ -216,7 +216,7 @@ void mlndlap_jacobi_ha (Box const& bx, Array4 const& sol, Array4 const& sol, Array4 const& sol, Real Ax, Array4 const& rhs, Array4 const& sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -235,7 +235,7 @@ void mlndlap_jacobi_aa (int i, int j, int k, Array4 const& sol, Real Ax, mlndlap_jacobi_ha(i,j,k,sol,Ax,rhs,sig,msk,dxinv); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_jacobi_aa (Box const& bx, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Array4 const& sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -243,7 +243,7 @@ void mlndlap_jacobi_aa (Box const& bx, Array4 const& sol, Array4 const& sol, Real Ax, Array4 const& rhs, Real sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -256,12 +256,12 @@ void mlndlap_jacobi_c (int i, int, int, Array4 const& sol, Real Ax, } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_jacobi_c (Box const& bx, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Real sig, Array4 const& msk, GpuArray const& dxinv) noexcept { - amrex::LoopConcurrent(bx, [=] (int i, int, int) noexcept + amrex::LoopConcurrentOnCpu(bx, [&] (int i, int, int) noexcept { if (msk(i,0,0)) { sol(i,0,0) = Real(0.0); @@ -272,7 +272,7 @@ void mlndlap_jacobi_c (Box const& bx, Array4 const& sol, Array4 const& sol, Array4 const& rhs, Array4 const& sx, @@ -281,7 +281,7 @@ void mlndlap_gauss_seidel_ha (Box const& bx, Array4 const& sol, { Real fac = dxinv[0]*dxinv[0]; - amrex::Loop(bx, [=] (int i, int, int) noexcept + amrex::LoopOnCpu(bx, [&] (int i, int, int) noexcept { if (msk(i,0,0)) { sol(i,0,0) = Real(0.0); @@ -295,7 +295,7 @@ void mlndlap_gauss_seidel_ha (Box const& bx, Array4 const& sol, }); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_aa (Box const& bx, Array4 const& sol, Array4 const& rhs, Array4 const& sx, @@ -305,7 +305,7 @@ void mlndlap_gauss_seidel_aa (Box const& bx, Array4 const& sol, mlndlap_gauss_seidel_ha(bx,sol,rhs,sx,msk,dxinv); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, Array4 const& rhs, Real sig, Array4 const& msk, @@ -313,7 +313,7 @@ void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, { Real fac = dxinv[0]*dxinv[0]; - amrex::Loop(bx, [=] (int i, int, int) noexcept + amrex::LoopOnCpu(bx, [&] (int i, int, int) noexcept { if (msk(i,0,0)) { sol(i,0,0) = Real(0.0); @@ -327,7 +327,7 @@ void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, }); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_with_line_solve_aa(Box 
const&, Array4 const&, Array4 const&, Array4 const&, Array4 const&, GpuArray const&) noexcept @@ -556,7 +556,7 @@ Real mlndlap_adotx_sten (int /*i*/, int /*j*/, int /*k*/, Array4 con Array4 const&, Array4 const&) noexcept { return Real(0.0); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_sten (Box const&, Array4 const&, Array4 const&, Array4 const&, @@ -575,6 +575,72 @@ void mlndlap_restriction_rap (int /*i*/, int /*j*/, int /*k*/, Array4 cons Array4 const&) noexcept {} +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +int mlndlap_color (int i, int, int) +{ + return i%2; +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_ha (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, + Array4 const& sx, + Array4 const& msk, + GpuArray const& dxinv, int color) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + if (msk(i,0,0)) { + sol(i,0,0) = Real(0.0); + } else { + Real fac = dxinv[0]*dxinv[0]; + + Real s0 = Real(-1.0) * fac * (sx(i-1,0,0)+sx(i,0,0)); + Real Ax = sol(i-1,0,0)*fac*sx(i-1,0,0) + + sol(i+1,0,0)*fac*sx(i ,0,0) + + sol(i ,0,0)*s0; + sol(i,0,0) += (rhs(i,0,0) - Ax) / s0; + } + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_aa (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, + Array4 const& sx, + Array4 const& msk, + GpuArray const& dxinv, int color) noexcept +{ + mlndlap_gscolor_ha(i,j,k,sol,rhs,sx,msk,dxinv,color); +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_c (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, Real sig, + Array4 const& msk, + GpuArray const& dxinv, int color) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + if (msk(i,0,0)) { + sol(i,0,0) = Real(0.0); + } else { + Real fac = dxinv[0]*dxinv[0]; + + Real s0 = Real(-2.0) * fac * sig; + Real Ax = sol(i-1,0,0)*fac*sig + + sol(i+1,0,0)*fac*sig + + sol(i ,0,0)*s0; + sol(i,0,0) += (rhs(i,0,0) - Ax) / s0; + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_sten (int, int, int, Array4 const&, + Array4 const&, + Array4 const&, + Array4 const&, int) noexcept +{} + } #endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_2D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_2D_K.H index 372215f5d73..05f02aaa927 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_2D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_2D_K.H @@ -418,7 +418,7 @@ void mlndlap_normalize_aa (int i, int j, int k, Array4 const& x, Array4 const& sol, Real Ax, Array4 const& rhs, Array4 const& sx, Array4 const& sy, Array4 const& msk, @@ -436,7 +436,7 @@ void mlndlap_jacobi_ha (int i, int j, int k, Array4 const& sol, Real Ax, } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_jacobi_ha (Box const& bx, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Array4 const& sx, Array4 const& sy, Array4 const& msk, @@ -445,7 +445,7 @@ void mlndlap_jacobi_ha (Box const& bx, Array4 const& sol, Array4 const& sol, Array4 const& sol, Real Ax, Array4 const& rhs, Array4 const& sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -472,7 +472,7 @@ void mlndlap_jacobi_aa (int i, int j, int k, Array4 const& sol, Real Ax, } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +AMREX_GPU_DEVICE AMREX_FORCE_INLINE void mlndlap_jacobi_c (int i, int j, int k, Array4 const& sol, Real Ax, Array4 const& rhs, Real sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -487,14 +487,14 @@ void mlndlap_jacobi_c (int i, int j, int k, Array4 const& sol, Real Ax, } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_jacobi_aa (Box 
const& bx, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Array4 const& sig, Array4 const& msk, GpuArray const& dxinv) noexcept { Real fac = -Real(2.0/6.0)*(dxinv[0]*dxinv[0] + dxinv[1]*dxinv[1]); - amrex::LoopConcurrent(bx, [=] (int i, int j, int k) noexcept + amrex::LoopConcurrentOnCpu(bx, [&] (int i, int j, int k) noexcept { if (msk(i,j,k)) { sol(i,j,k) = Real(0.0); @@ -505,14 +505,14 @@ void mlndlap_jacobi_aa (Box const& bx, Array4 const& sol, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Real sig, Array4 const& msk, GpuArray const& dxinv) noexcept { Real fac = -Real(2.0/6.0)*(dxinv[0]*dxinv[0] + dxinv[1]*dxinv[1]); - amrex::LoopConcurrent(bx, [=] (int i, int j, int k) noexcept + amrex::LoopConcurrentOnCpu(bx, [&] (int i, int j, int k) noexcept { if (msk(i,j,k)) { sol(i,j,k) = Real(0.0); @@ -523,7 +523,7 @@ void mlndlap_jacobi_c (Box const& bx, Array4 const& sol, Array4 const& sol, Array4 const& rhs, Array4 const& sx, Array4 const& sy, Array4 const& msk, @@ -533,7 +533,7 @@ void mlndlap_gauss_seidel_ha (Box const& bx, Array4 const& sol, Real facx = Real(1.0/6.0)*dxinv[0]*dxinv[0]; Real facy = Real(1.0/6.0)*dxinv[1]*dxinv[1]; - amrex::Loop(bx, [=] (int i, int j, int k) noexcept + amrex::LoopOnCpu(bx, [&] (int i, int j, int k) noexcept { if (msk(i,j,k)) { sol(i,j,k) = Real(0.0); @@ -570,7 +570,7 @@ void mlndlap_gauss_seidel_ha (Box const& bx, Array4 const& sol, }); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_aa (Box const& bx, Array4 const& sol, Array4 const& rhs, Array4 const& sig, Array4 const& msk, @@ -583,7 +583,7 @@ void mlndlap_gauss_seidel_aa (Box const& bx, Array4 const& sol, Real f2xmy = Real(2.0)*facx - facy; Real fmx2y = Real(2.0)*facy - facx; - amrex::Loop(bx, [=] (int i, int j, int k) noexcept + amrex::LoopOnCpu(bx, [&] (int i, int j, int k) noexcept { if (msk(i,j,k)) { sol(i,j,k) = Real(0.0); @@ -614,7 +614,7 @@ void mlndlap_gauss_seidel_aa (Box const& bx, Array4 const& sol, }); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, Array4 const& rhs, Real sig, Array4 const& msk, @@ -627,7 +627,7 @@ void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, Real f2xmy = Real(2.0)*facx - facy; Real fmx2y = Real(2.0)*facy - facx; - amrex::Loop(bx, [=] (int i, int j, int k) noexcept + amrex::LoopOnCpu(bx, [&] (int i, int j, int k) noexcept { if (msk(i,j,k)) { sol(i,j,k) = Real(0.0); @@ -658,7 +658,7 @@ void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, }); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +AMREX_FORCE_INLINE void tridiagonal_solve (Array1D& a_ls, Array1D& b_ls, Array1D& c_ls, Array1D& r_ls, Array1D& u_ls, Array1D& gam, int ilen ) noexcept @@ -677,7 +677,7 @@ void tridiagonal_solve (Array1D& a_ls, Array1D& b_ls, Arra } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_with_line_solve_aa (Box const& bx, Array4 const& sol, Array4 const& rhs, Array4 const& sig, Array4 const& msk, @@ -1819,6 +1819,21 @@ void mlndlap_stencil_rap (int i, int j, int, Array4 const& csten, csten(i,j,k,3) = Real(0.5)*(cross1+cross2); } +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mlndlap_adotx_sten_doit (int i, int j, int k, Array4 const& x, + Array4 const& sten) noexcept +{ + return x(i-1,j-1,k)*sten(i-1,j-1,k,3) + + x(i ,j-1,k)*sten(i ,j-1,k,2) + + x(i+1,j-1,k)*sten(i ,j-1,k,3) + + x(i-1,j ,k)*sten(i-1,j ,k,1) + + x(i ,j ,k)*sten(i ,j ,k,0) + + x(i+1,j ,k)*sten(i ,j ,k,1) + + x(i-1,j+1,k)*sten(i-1,j ,k,3) + + x(i ,j+1,k)*sten(i ,j 
,k,2) + + x(i+1,j+1,k)*sten(i ,j ,k,3); +} + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Real mlndlap_adotx_sten (int i, int j, int k, Array4 const& x, Array4 const& sten, Array4 const& msk) noexcept @@ -1826,40 +1841,33 @@ Real mlndlap_adotx_sten (int i, int j, int k, Array4 const& x, if (msk(i,j,k)) { return Real(0.0); } else { - return x(i-1,j-1,k)*sten(i-1,j-1,k,3) - + x(i ,j-1,k)*sten(i ,j-1,k,2) - + x(i+1,j-1,k)*sten(i ,j-1,k,3) - + x(i-1,j ,k)*sten(i-1,j ,k,1) - + x(i ,j ,k)*sten(i ,j ,k,0) - + x(i+1,j ,k)*sten(i ,j ,k,1) - + x(i-1,j+1,k)*sten(i-1,j ,k,3) - + x(i ,j+1,k)*sten(i ,j ,k,2) - + x(i+1,j+1,k)*sten(i ,j ,k,3); + return mlndlap_adotx_sten_doit(i,j,k,x,sten); } } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_gauss_seidel_sten (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, + Array4 const& sten, + Array4 const& msk) noexcept +{ + if (msk(i,j,k)) { + sol(i,j,k) = Real(0.0); + } else if (sten(i,j,k,0) != Real(0.0)) { + Real Ax = mlndlap_adotx_sten_doit(i,j,k,sol,sten); + sol(i,j,k) += (rhs(i,j,k) - Ax) / sten(i,j,k,0); + } +} + +inline void mlndlap_gauss_seidel_sten (Box const& bx, Array4 const& sol, Array4 const& rhs, Array4 const& sten, Array4 const& msk) noexcept { - amrex::LoopConcurrent(bx, [=] (int i, int j, int k) noexcept + AMREX_LOOP_3D(bx, i, j, k, { - if (msk(i,j,k)) { - sol(i,j,k) = Real(0.0); - } else if (sten(i,j,k,0) != Real(0.0)) { - Real Ax = sol(i-1,j-1,k)*sten(i-1,j-1,k,3) - + sol(i ,j-1,k)*sten(i ,j-1,k,2) - + sol(i+1,j-1,k)*sten(i ,j-1,k,3) - + sol(i-1,j ,k)*sten(i-1,j ,k,1) - + sol(i ,j ,k)*sten(i ,j ,k,0) - + sol(i+1,j ,k)*sten(i ,j ,k,1) - + sol(i-1,j+1,k)*sten(i-1,j ,k,3) - + sol(i ,j+1,k)*sten(i ,j ,k,2) - + sol(i+1,j+1,k)*sten(i ,j ,k,3); - sol(i,j,k) += (rhs(i,j,k) - Ax) / sten(i,j,k,0); - } + mlndlap_gauss_seidel_sten(i,j,k,sol,rhs,sten,msk); }); } @@ -3536,5 +3544,154 @@ void mlndlap_fillijmat_cs_gpu (const int ps, const int i, const int j, const int #endif +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +int mlndlap_color (int i, int j, int) +{ + return (i%2) + (j%2)*2; +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_ha (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, Array4 const& sx, + Array4 const& sy, Array4 const& msk, + GpuArray const& dxinv, int color, + bool is_rz) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + if (msk(i,j,k)) { + sol(i,j,k) = Real(0.0); + } else { + Real facx = Real(1.0/6.0)*dxinv[0]*dxinv[0]; + Real facy = Real(1.0/6.0)*dxinv[1]*dxinv[1]; + + Real s0 = Real(-2.0)*(facx*(sx(i-1,j-1,k)+sx(i,j-1,k)+sx(i-1,j,k)+sx(i,j,k)) + +facy*(sy(i-1,j-1,k)+sy(i,j-1,k)+sy(i-1,j,k)+sy(i,j,k))); + + Real Ax = sol(i-1,j-1,k)*(facx*sx(i-1,j-1,k)+facy*sy(i-1,j-1,k)) + + sol(i+1,j-1,k)*(facx*sx(i ,j-1,k)+facy*sy(i ,j-1,k)) + + sol(i-1,j+1,k)*(facx*sx(i-1,j ,k)+facy*sy(i-1,j ,k)) + + sol(i+1,j+1,k)*(facx*sx(i ,j ,k)+facy*sy(i ,j ,k)) + + sol(i-1,j,k)*(Real(2.0)*facx*(sx(i-1,j-1,k)+sx(i-1,j,k)) + - facy*(sy(i-1,j-1,k)+sy(i-1,j,k))) + + sol(i+1,j,k)*(Real(2.0)*facx*(sx(i ,j-1,k)+sx(i ,j,k)) + - facy*(sy(i ,j-1,k)+sy(i ,j,k))) + + sol(i,j-1,k)*( -facx*(sx(i-1,j-1,k)+sx(i,j-1,k)) + +Real(2.0)*facy*(sy(i-1,j-1,k)+sy(i,j-1,k))) + + sol(i,j+1,k)*( -facx*(sx(i-1,j ,k)+sx(i,j ,k)) + +Real(2.0)*facy*(sy(i-1,j ,k)+sy(i,j ,k))) + + sol(i,j,k)*s0; + + if (is_rz) { + Real fp = facy / static_cast(2*i+1); + Real fm = facy / static_cast(2*i-1); + Real frzlo = fm*sy(i-1,j-1,k)-fp*sy(i,j-1,k); + Real frzhi = fm*sy(i-1,j ,k)-fp*sy(i,j ,k); + s0 += - frzhi - frzlo; + Ax += frzhi*(sol(i,j+1,k)-sol(i,j,k)) + + 
frzlo*(sol(i,j-1,k)-sol(i,j,k)); + } + + sol(i,j,k) += (rhs(i,j,k) - Ax) / s0; + } + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_aa (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, Array4 const& sig, + Array4 const& msk, + GpuArray const& dxinv, int color, + bool is_rz) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + if (msk(i,j,k)) { + sol(i,j,k) = Real(0.0); + } else { + Real facx = Real(1.0/6.0)*dxinv[0]*dxinv[0]; + Real facy = Real(1.0/6.0)*dxinv[1]*dxinv[1]; + Real fxy = facx + facy; + Real f2xmy = Real(2.0)*facx - facy; + Real fmx2y = Real(2.0)*facy - facx; + + Real s0 = (-Real(2.0))*fxy*(sig(i-1,j-1,k)+sig(i,j-1,k)+sig(i-1,j,k)+sig(i,j,k)); + Real Ax = sol(i-1,j-1,k)*fxy*sig(i-1,j-1,k) + + sol(i+1,j-1,k)*fxy*sig(i ,j-1,k) + + sol(i-1,j+1,k)*fxy*sig(i-1,j ,k) + + sol(i+1,j+1,k)*fxy*sig(i ,j ,k) + + sol(i-1,j,k)*f2xmy*(sig(i-1,j-1,k)+sig(i-1,j,k)) + + sol(i+1,j,k)*f2xmy*(sig(i ,j-1,k)+sig(i ,j,k)) + + sol(i,j-1,k)*fmx2y*(sig(i-1,j-1,k)+sig(i,j-1,k)) + + sol(i,j+1,k)*fmx2y*(sig(i-1,j ,k)+sig(i,j ,k)) + + sol(i,j,k)*s0; + + if (is_rz) { + Real fp = facy / static_cast(2*i+1); + Real fm = facy / static_cast(2*i-1); + Real frzlo = fm*sig(i-1,j-1,k)-fp*sig(i,j-1,k); + Real frzhi = fm*sig(i-1,j ,k)-fp*sig(i,j ,k); + s0 += - frzhi - frzlo; + Ax += frzhi*(sol(i,j+1,k)-sol(i,j,k)) + + frzlo*(sol(i,j-1,k)-sol(i,j,k)); + } + + sol(i,j,k) += (rhs(i,j,k) - Ax) / s0; + } + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_c (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, Real sig, + Array4 const& msk, + GpuArray const& dxinv, int color, + bool is_rz) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + if (msk(i,j,k)) { + sol(i,j,k) = Real(0.0); + } else { + Real facx = Real(1.0/6.0)*dxinv[0]*dxinv[0]; + Real facy = Real(1.0/6.0)*dxinv[1]*dxinv[1]; + Real fxy = facx + facy; + Real f2xmy = Real(2.0)*facx - facy; + Real fmx2y = Real(2.0)*facy - facx; + + Real s0 = (-Real(2.0))*fxy*Real(4.); + Real Ax = sol(i-1,j-1,k)*fxy + + sol(i+1,j-1,k)*fxy + + sol(i-1,j+1,k)*fxy + + sol(i+1,j+1,k)*fxy + + sol(i-1,j,k)*f2xmy*Real(2.) + + sol(i+1,j,k)*f2xmy*Real(2.) + + sol(i,j-1,k)*fmx2y*Real(2.) + + sol(i,j+1,k)*fmx2y*Real(2.) 
+ + sol(i,j,k)*s0; + + if (is_rz) { + Real fp = facy / static_cast(2*i+1); + Real fm = facy / static_cast(2*i-1); + Real frzlo = fm-fp; + Real frzhi = fm-fp; + s0 += - frzhi - frzlo; + Ax += frzhi*(sol(i,j+1,k)-sol(i,j,k)) + + frzlo*(sol(i,j-1,k)-sol(i,j,k)); + } + + sol(i,j,k) += (rhs(i,j,k) - Ax*sig) / (s0*sig); + } + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_sten (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, + Array4 const& sten, + Array4 const& msk, int color) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + mlndlap_gauss_seidel_sten(i,j,k,sol,rhs,sten,msk); + } +} + } #endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_3D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_3D_K.H index f6b94e7c526..5d31de02711 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_3D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_3D_K.H @@ -988,7 +988,7 @@ void mlndlap_normalize_aa (int i, int j, int k, Array4 const& x, Array4 const& sol, Real Ax, Array4 const& rhs, Array4 const& sx, Array4 const& sy, Array4 const& sz, @@ -1011,7 +1011,7 @@ void mlndlap_jacobi_ha (int i, int j, int k, Array4 const& sol, Real Ax, } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_jacobi_ha (Box const& bx, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Array4 const& sx, Array4 const& sy, Array4 const& sz, @@ -1021,7 +1021,7 @@ void mlndlap_jacobi_ha (Box const& bx, Array4 const& sol, Array4 const& sol, Array4 const& sol, Real Ax, Array4 const& rhs, Array4 const& sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -1055,7 +1055,7 @@ void mlndlap_jacobi_aa (int i, int j, int k, Array4 const& sol, Real Ax, } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +AMREX_GPU_DEVICE AMREX_FORCE_INLINE void mlndlap_jacobi_c (int i, int j, int k, Array4 const& sol, Real Ax, Array4 const& rhs, Real sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -1072,7 +1072,7 @@ void mlndlap_jacobi_c (int i, int j, int k, Array4 const& sol, Real Ax, } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_jacobi_aa (Box const& bx, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Array4 const& sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -1081,7 +1081,7 @@ void mlndlap_jacobi_aa (Box const& bx, Array4 const& sol, Array4 const& sol, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Real sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -1102,7 +1102,7 @@ void mlndlap_jacobi_c (Box const& bx, Array4 const& sol, Array4 const& sol, Array4 const& sol, Array4 const& rhs, Array4 const& sx, Array4 const& sy, Array4 const& sz, @@ -1124,7 +1124,7 @@ void mlndlap_gauss_seidel_ha (Box const& bx, Array4 const& sol, Real facy = Real(1.0/36.0)*dxinv[1]*dxinv[1]; Real facz = Real(1.0/36.0)*dxinv[2]*dxinv[2]; - amrex::Loop(bx, [=] (int i, int j, int k) noexcept + amrex::LoopOnCpu(bx, [&] (int i, int j, int k) noexcept { if (msk(i,j,k)) { sol(i,j,k) = Real(0.0); @@ -1220,7 +1220,7 @@ void mlndlap_gauss_seidel_ha (Box const& bx, Array4 const& sol, }); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_aa (Box const& bx, Array4 const& sol, Array4 const& rhs, Array4 const& sig, Array4 const& msk, @@ -1237,7 +1237,7 @@ void mlndlap_gauss_seidel_aa (Box const& bx, Array4 const& sol, Real fm2x4ym2z = -Real(2.0)*facx + Real(4.0)*facy - Real(2.0)*facz; Real fm2xm2y4z = -Real(2.0)*facx - Real(2.0)*facy + Real(4.0)*facz; - amrex::Loop(bx, [=] (int i, int j, int k) noexcept + amrex::LoopOnCpu(bx, [&] (int i, int j, int k) 
noexcept { if (msk(i,j,k)) { sol(i,j,k) = Real(0.0); @@ -1277,7 +1277,7 @@ void mlndlap_gauss_seidel_aa (Box const& bx, Array4 const& sol, }); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, Array4 const& rhs, Real sig, Array4 const& msk, @@ -1294,7 +1294,7 @@ void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, Real fm2x4ym2z = -Real(2.0)*facx + Real(4.0)*facy - Real(2.0)*facz; Real fm2xm2y4z = -Real(2.0)*facx - Real(2.0)*facy + Real(4.0)*facz; - amrex::Loop(bx, [=] (int i, int j, int k) noexcept + amrex::LoopOnCpu(bx, [&] (int i, int j, int k) noexcept { if (msk(i,j,k)) { sol(i,j,k) = Real(0.0); @@ -1333,7 +1333,7 @@ void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, }); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +AMREX_FORCE_INLINE void tridiagonal_solve (Array1D& a_ls, Array1D& b_ls, Array1D& c_ls, Array1D& r_ls, Array1D& u_ls, Array1D& gam, int ilen ) noexcept @@ -1352,7 +1352,7 @@ void tridiagonal_solve (Array1D& a_ls, Array1D& b_ls, Arra } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_with_line_solve_aa (Box const& bx, Array4 const& sol, Array4 const& rhs, Array4 const& sig, Array4 const& msk, @@ -5460,101 +5460,83 @@ void mlndlap_stencil_rap (int i, int j, int k, Array4 const& csten, } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Real mlndlap_adotx_sten (int i, int j, int k, Array4 const& x, - Array4 const& sten, Array4 const& msk) noexcept +Real mlndlap_adotx_sten_doit (int i, int j, int k, Array4 const& x, + Array4 const& sten) noexcept { using namespace nodelap_detail; + return x(i ,j ,k ) * sten(i ,j ,k ,ist_000) + // + + x(i-1,j ,k ) * sten(i-1,j ,k ,ist_p00) + + x(i+1,j ,k ) * sten(i ,j ,k ,ist_p00) + // + + x(i ,j-1,k ) * sten(i ,j-1,k ,ist_0p0) + + x(i ,j+1,k ) * sten(i ,j ,k ,ist_0p0) + // + + x(i ,j ,k-1) * sten(i ,j ,k-1,ist_00p) + + x(i ,j ,k+1) * sten(i ,j ,k ,ist_00p) + // + + x(i-1,j-1,k ) * sten(i-1,j-1,k ,ist_pp0) + + x(i+1,j-1,k ) * sten(i ,j-1,k ,ist_pp0) + + x(i-1,j+1,k ) * sten(i-1,j ,k ,ist_pp0) + + x(i+1,j+1,k ) * sten(i ,j ,k ,ist_pp0) + // + + x(i-1,j ,k-1) * sten(i-1,j ,k-1,ist_p0p) + + x(i+1,j ,k-1) * sten(i ,j ,k-1,ist_p0p) + + x(i-1,j ,k+1) * sten(i-1,j ,k ,ist_p0p) + + x(i+1,j ,k+1) * sten(i ,j ,k ,ist_p0p) + // + + x(i ,j-1,k-1) * sten(i ,j-1,k-1,ist_0pp) + + x(i ,j+1,k-1) * sten(i ,j ,k-1,ist_0pp) + + x(i ,j-1,k+1) * sten(i ,j-1,k ,ist_0pp) + + x(i ,j+1,k+1) * sten(i ,j ,k ,ist_0pp) + // + + x(i-1,j-1,k-1) * sten(i-1,j-1,k-1,ist_ppp) + + x(i+1,j-1,k-1) * sten(i ,j-1,k-1,ist_ppp) + + x(i-1,j+1,k-1) * sten(i-1,j ,k-1,ist_ppp) + + x(i+1,j+1,k-1) * sten(i ,j ,k-1,ist_ppp) + + x(i-1,j-1,k+1) * sten(i-1,j-1,k ,ist_ppp) + + x(i+1,j-1,k+1) * sten(i ,j-1,k ,ist_ppp) + + x(i-1,j+1,k+1) * sten(i-1,j ,k ,ist_ppp) + + x(i+1,j+1,k+1) * sten(i ,j ,k ,ist_ppp); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mlndlap_adotx_sten (int i, int j, int k, Array4 const& x, + Array4 const& sten, Array4 const& msk) noexcept +{ if (msk(i,j,k)) { return Real(0.0); } else { - return x(i ,j ,k ) * sten(i ,j ,k ,ist_000) - // - + x(i-1,j ,k ) * sten(i-1,j ,k ,ist_p00) - + x(i+1,j ,k ) * sten(i ,j ,k ,ist_p00) - // - + x(i ,j-1,k ) * sten(i ,j-1,k ,ist_0p0) - + x(i ,j+1,k ) * sten(i ,j ,k ,ist_0p0) - // - + x(i ,j ,k-1) * sten(i ,j ,k-1,ist_00p) - + x(i ,j ,k+1) * sten(i ,j ,k ,ist_00p) - // - + x(i-1,j-1,k ) * sten(i-1,j-1,k ,ist_pp0) - + x(i+1,j-1,k ) * sten(i ,j-1,k ,ist_pp0) - + x(i-1,j+1,k ) * sten(i-1,j ,k ,ist_pp0) - + x(i+1,j+1,k ) * sten(i ,j ,k 
,ist_pp0) - // - + x(i-1,j ,k-1) * sten(i-1,j ,k-1,ist_p0p) - + x(i+1,j ,k-1) * sten(i ,j ,k-1,ist_p0p) - + x(i-1,j ,k+1) * sten(i-1,j ,k ,ist_p0p) - + x(i+1,j ,k+1) * sten(i ,j ,k ,ist_p0p) - // - + x(i ,j-1,k-1) * sten(i ,j-1,k-1,ist_0pp) - + x(i ,j+1,k-1) * sten(i ,j ,k-1,ist_0pp) - + x(i ,j-1,k+1) * sten(i ,j-1,k ,ist_0pp) - + x(i ,j+1,k+1) * sten(i ,j ,k ,ist_0pp) - // - + x(i-1,j-1,k-1) * sten(i-1,j-1,k-1,ist_ppp) - + x(i+1,j-1,k-1) * sten(i ,j-1,k-1,ist_ppp) - + x(i-1,j+1,k-1) * sten(i-1,j ,k-1,ist_ppp) - + x(i+1,j+1,k-1) * sten(i ,j ,k-1,ist_ppp) - + x(i-1,j-1,k+1) * sten(i-1,j-1,k ,ist_ppp) - + x(i+1,j-1,k+1) * sten(i ,j-1,k ,ist_ppp) - + x(i-1,j+1,k+1) * sten(i-1,j ,k ,ist_ppp) - + x(i+1,j+1,k+1) * sten(i ,j ,k ,ist_ppp); + return mlndlap_adotx_sten_doit(i,j,k,x,sten); } } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_gauss_seidel_sten (Box const& bx, Array4 const& sol, +void mlndlap_gauss_seidel_sten (int i, int j, int k, Array4 const& sol, Array4 const& rhs, Array4 const& sten, Array4 const& msk) noexcept { using namespace nodelap_detail; - amrex::LoopConcurrent(bx, [=] (int i, int j, int k) noexcept + if (msk(i,j,k)) { + sol(i,j,k) = Real(0.0); + } else if (sten(i,j,k,ist_000) != Real(0.0)) { + Real Ax = mlndlap_adotx_sten_doit(i,j,k,sol,sten); + sol(i,j,k) += (rhs(i,j,k) - Ax) / sten(i,j,k,ist_000); + } +} + +inline +void mlndlap_gauss_seidel_sten (Box const& bx, Array4 const& sol, + Array4 const& rhs, + Array4 const& sten, + Array4 const& msk) noexcept +{ + AMREX_LOOP_3D(bx, i, j, k, { - if (msk(i,j,k)) { - sol(i,j,k) = Real(0.0); - } else if (sten(i,j,k,ist_000) != Real(0.0)) { - Real Ax = sol(i ,j ,k ) * sten(i ,j ,k ,ist_000) - // - + sol(i-1,j ,k ) * sten(i-1,j ,k ,ist_p00) - + sol(i+1,j ,k ) * sten(i ,j ,k ,ist_p00) - // - + sol(i ,j-1,k ) * sten(i ,j-1,k ,ist_0p0) - + sol(i ,j+1,k ) * sten(i ,j ,k ,ist_0p0) - // - + sol(i ,j ,k-1) * sten(i ,j ,k-1,ist_00p) - + sol(i ,j ,k+1) * sten(i ,j ,k ,ist_00p) - // - + sol(i-1,j-1,k ) * sten(i-1,j-1,k ,ist_pp0) - + sol(i+1,j-1,k ) * sten(i ,j-1,k ,ist_pp0) - + sol(i-1,j+1,k ) * sten(i-1,j ,k ,ist_pp0) - + sol(i+1,j+1,k ) * sten(i ,j ,k ,ist_pp0) - // - + sol(i-1,j ,k-1) * sten(i-1,j ,k-1,ist_p0p) - + sol(i+1,j ,k-1) * sten(i ,j ,k-1,ist_p0p) - + sol(i-1,j ,k+1) * sten(i-1,j ,k ,ist_p0p) - + sol(i+1,j ,k+1) * sten(i ,j ,k ,ist_p0p) - // - + sol(i ,j-1,k-1) * sten(i ,j-1,k-1,ist_0pp) - + sol(i ,j+1,k-1) * sten(i ,j ,k-1,ist_0pp) - + sol(i ,j-1,k+1) * sten(i ,j-1,k ,ist_0pp) - + sol(i ,j+1,k+1) * sten(i ,j ,k ,ist_0pp) - // - + sol(i-1,j-1,k-1) * sten(i-1,j-1,k-1,ist_ppp) - + sol(i+1,j-1,k-1) * sten(i ,j-1,k-1,ist_ppp) - + sol(i-1,j+1,k-1) * sten(i-1,j ,k-1,ist_ppp) - + sol(i+1,j+1,k-1) * sten(i ,j ,k-1,ist_ppp) - + sol(i-1,j-1,k+1) * sten(i-1,j-1,k ,ist_ppp) - + sol(i+1,j-1,k+1) * sten(i ,j-1,k ,ist_ppp) - + sol(i-1,j+1,k+1) * sten(i-1,j ,k ,ist_ppp) - + sol(i+1,j+1,k+1) * sten(i ,j ,k ,ist_ppp); - - sol(i,j,k) += (rhs(i,j,k) - Ax) / sten(i,j,k,ist_000); - } + mlndlap_gauss_seidel_sten(i,j,k,sol,rhs,sten,msk); }); } @@ -10887,5 +10869,239 @@ void mlndlap_fillijmat_cs_gpu (const int ps, const int i, const int j, const int #endif +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +int mlndlap_color (int i, int j, int k) +{ + return (i%2) + (j%2)*2 + (k%2)*4; +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_ha (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, Array4 const& sx, + Array4 const& sy, Array4 const& sz, + Array4 const& msk, + GpuArray const& dxinv, int color) noexcept +{ + if (mlndlap_color(i,j,k) == 
color) { + if (msk(i,j,k)) { + sol(i,j,k) = Real(0.0); + } else { + Real facx = Real(1.0/36.0)*dxinv[0]*dxinv[0]; + Real facy = Real(1.0/36.0)*dxinv[1]*dxinv[1]; + Real facz = Real(1.0/36.0)*dxinv[2]*dxinv[2]; + + Real s0 = Real(-4.0)*(facx*(sx(i-1,j-1,k-1)+sx(i,j-1,k-1)+sx(i-1,j,k-1)+sx(i,j,k-1) + +sx(i-1,j-1,k )+sx(i,j-1,k )+sx(i-1,j,k )+sx(i,j,k )) + +facy*(sy(i-1,j-1,k-1)+sy(i,j-1,k-1)+sy(i-1,j,k-1)+sy(i,j,k-1) + +sy(i-1,j-1,k )+sy(i,j-1,k )+sy(i-1,j,k )+sy(i,j,k )) + +facz*(sz(i-1,j-1,k-1)+sz(i,j-1,k-1)+sz(i-1,j,k-1)+sz(i,j,k-1) + +sz(i-1,j-1,k )+sz(i,j-1,k )+sz(i-1,j,k )+sz(i,j,k ))); + Real Ax = sol(i,j,k)*s0 + + sol(i-1,j-1,k-1)*(facx*sx(i-1,j-1,k-1) + +facy*sy(i-1,j-1,k-1) + +facz*sz(i-1,j-1,k-1)) + + sol(i+1,j-1,k-1)*(facx*sx(i ,j-1,k-1) + +facy*sy(i ,j-1,k-1) + +facz*sz(i ,j-1,k-1)) + + sol(i-1,j+1,k-1)*(facx*sx(i-1,j ,k-1) + +facy*sy(i-1,j ,k-1) + +facz*sz(i-1,j ,k-1)) + + sol(i+1,j+1,k-1)*(facx*sx(i ,j ,k-1) + +facy*sy(i ,j ,k-1) + +facz*sz(i ,j ,k-1)) + + sol(i-1,j-1,k+1)*(facx*sx(i-1,j-1,k ) + +facy*sy(i-1,j-1,k ) + +facz*sz(i-1,j-1,k )) + + sol(i+1,j-1,k+1)*(facx*sx(i ,j-1,k ) + +facy*sy(i ,j-1,k ) + +facz*sz(i ,j-1,k )) + + sol(i-1,j+1,k+1)*(facx*sx(i-1,j ,k ) + +facy*sy(i-1,j ,k ) + +facz*sz(i-1,j ,k )) + + sol(i+1,j+1,k+1)*(facx*sx(i ,j ,k ) + +facy*sy(i ,j ,k ) + +facz*sz(i ,j ,k )) + +sol(i ,j-1,k-1)*( -facx*(sx(i-1,j-1,k-1)+sx(i,j-1,k-1)) + +Real(2.0)*facy*(sy(i-1,j-1,k-1)+sy(i,j-1,k-1)) + +Real(2.0)*facz*(sz(i-1,j-1,k-1)+sz(i,j-1,k-1))) + +sol(i ,j+1,k-1)*( -facx*(sx(i-1,j ,k-1)+sx(i,j ,k-1)) + +Real(2.0)*facy*(sy(i-1,j ,k-1)+sy(i,j ,k-1)) + +Real(2.0)*facz*(sz(i-1,j ,k-1)+sz(i,j ,k-1))) + +sol(i ,j-1,k+1)*( -facx*(sx(i-1,j-1,k )+sx(i,j-1,k )) + +Real(2.0)*facy*(sy(i-1,j-1,k )+sy(i,j-1,k )) + +Real(2.0)*facz*(sz(i-1,j-1,k )+sz(i,j-1,k ))) + +sol(i ,j+1,k+1)*( -facx*(sx(i-1,j ,k )+sx(i,j ,k )) + +Real(2.0)*facy*(sy(i-1,j ,k )+sy(i,j ,k )) + +Real(2.0)*facz*(sz(i-1,j ,k )+sz(i,j ,k ))) + +sol(i-1,j ,k-1)*( Real(2.0)*facx*(sx(i-1,j-1,k-1)+sx(i-1,j,k-1)) + -facy*(sy(i-1,j-1,k-1)+sy(i-1,j,k-1)) + +Real(2.0)*facz*(sz(i-1,j-1,k-1)+sz(i-1,j,k-1))) + +sol(i+1,j ,k-1)*( Real(2.0)*facx*(sx(i ,j-1,k-1)+sx(i ,j,k-1)) + -facy*(sy(i ,j-1,k-1)+sy(i ,j,k-1)) + +Real(2.0)*facz*(sz(i ,j-1,k-1)+sz(i ,j,k-1))) + +sol(i-1,j ,k+1)*( Real(2.0)*facx*(sx(i-1,j-1,k )+sx(i-1,j,k )) + -facy*(sy(i-1,j-1,k )+sy(i-1,j,k )) + +Real(2.0)*facz*(sz(i-1,j-1,k )+sz(i-1,j,k ))) + +sol(i+1,j ,k+1)*( Real(2.0)*facx*(sx(i ,j-1,k )+sx(i ,j,k )) + -facy*(sy(i ,j-1,k )+sy(i ,j,k )) + +Real(2.0)*facz*(sz(i ,j-1,k )+sz(i ,j,k ))) + +sol(i-1,j-1,k )*( Real(2.0)*facx*(sx(i-1,j-1,k-1)+sx(i-1,j-1,k)) + +Real(2.0)*facy*(sy(i-1,j-1,k-1)+sy(i-1,j-1,k)) + -facz*(sz(i-1,j-1,k-1)+sz(i-1,j-1,k))) + +sol(i+1,j-1,k )*( Real(2.0)*facx*(sx(i ,j-1,k-1)+sx(i ,j-1,k)) + +Real(2.0)*facy*(sy(i ,j-1,k-1)+sy(i ,j-1,k)) + -facz*(sz(i ,j-1,k-1)+sz(i ,j-1,k))) + +sol(i-1,j+1,k )*( Real(2.0)*facx*(sx(i-1,j ,k-1)+sx(i-1,j ,k)) + +Real(2.0)*facy*(sy(i-1,j ,k-1)+sy(i-1,j ,k)) + -facz*(sz(i-1,j ,k-1)+sz(i-1,j ,k))) + +sol(i+1,j+1,k )*( Real(2.0)*facx*(sx(i ,j ,k-1)+sx(i ,j ,k)) + +Real(2.0)*facy*(sy(i ,j ,k-1)+sy(i ,j ,k)) + -facz*(sz(i ,j ,k-1)+sz(i ,j ,k))) + + Real(2.0)*sol(i-1,j,k)*(Real(2.0)*facx*(sx(i-1,j-1,k-1)+sx(i-1,j,k-1)+sx(i-1,j-1,k)+sx(i-1,j,k)) + -facy*(sy(i-1,j-1,k-1)+sy(i-1,j,k-1)+sy(i-1,j-1,k)+sy(i-1,j,k)) + -facz*(sz(i-1,j-1,k-1)+sz(i-1,j,k-1)+sz(i-1,j-1,k)+sz(i-1,j,k))) + + Real(2.0)*sol(i+1,j,k)*(Real(2.0)*facx*(sx(i ,j-1,k-1)+sx(i ,j,k-1)+sx(i ,j-1,k)+sx(i ,j,k)) + -facy*(sy(i ,j-1,k-1)+sy(i ,j,k-1)+sy(i ,j-1,k)+sy(i 
,j,k)) + -facz*(sz(i ,j-1,k-1)+sz(i ,j,k-1)+sz(i ,j-1,k)+sz(i ,j,k))) + + Real(2.0)*sol(i,j-1,k)*( -facx*(sx(i-1,j-1,k-1)+sx(i,j-1,k-1)+sx(i-1,j-1,k)+sx(i,j-1,k)) + +Real(2.0)*facy*(sy(i-1,j-1,k-1)+sy(i,j-1,k-1)+sy(i-1,j-1,k)+sy(i,j-1,k)) + -facz*(sz(i-1,j-1,k-1)+sz(i,j-1,k-1)+sz(i-1,j-1,k)+sz(i,j-1,k))) + + Real(2.0)*sol(i,j+1,k)*( -facx*(sx(i-1,j ,k-1)+sx(i,j ,k-1)+sx(i-1,j ,k)+sx(i,j ,k)) + +Real(2.0)*facy*(sy(i-1,j ,k-1)+sy(i,j ,k-1)+sy(i-1,j ,k)+sy(i,j ,k)) + -facz*(sz(i-1,j ,k-1)+sz(i,j ,k-1)+sz(i-1,j ,k)+sz(i,j ,k))) + + Real(2.0)*sol(i,j,k-1)*( -facx*(sx(i-1,j-1,k-1)+sx(i,j-1,k-1)+sx(i-1,j,k-1)+sx(i,j,k-1)) + -facy*(sy(i-1,j-1,k-1)+sy(i,j-1,k-1)+sy(i-1,j,k-1)+sy(i,j,k-1)) + +Real(2.0)*facz*(sz(i-1,j-1,k-1)+sz(i,j-1,k-1)+sz(i-1,j,k-1)+sz(i,j,k-1))) + + Real(2.0)*sol(i,j,k+1)*( -facx*(sx(i-1,j-1,k )+sx(i,j-1,k )+sx(i-1,j,k )+sx(i,j,k )) + -facy*(sy(i-1,j-1,k )+sy(i,j-1,k )+sy(i-1,j,k )+sy(i,j,k )) + +Real(2.0)*facz*(sz(i-1,j-1,k )+sz(i,j-1,k )+sz(i-1,j,k )+sz(i,j,k ))); + + sol(i,j,k) += (rhs(i,j,k) - Ax) / s0; + } + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_aa (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, Array4 const& sig, + Array4 const& msk, + GpuArray const& dxinv, int color) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + if (msk(i,j,k)) { + sol(i,j,k) = Real(0.0); + } else { + Real facx = Real(1.0/36.0)*dxinv[0]*dxinv[0]; + Real facy = Real(1.0/36.0)*dxinv[1]*dxinv[1]; + Real facz = Real(1.0/36.0)*dxinv[2]*dxinv[2]; + Real fxyz = facx + facy + facz; + Real fmx2y2z = -facx + Real(2.0)*facy + Real(2.0)*facz; + Real f2xmy2z = Real(2.0)*facx - facy + Real(2.0)*facz; + Real f2x2ymz = Real(2.0)*facx + Real(2.0)*facy - facz; + Real f4xm2ym2z = Real(4.0)*facx - Real(2.0)*facy - Real(2.0)*facz; + Real fm2x4ym2z = -Real(2.0)*facx + Real(4.0)*facy - Real(2.0)*facz; + Real fm2xm2y4z = -Real(2.0)*facx - Real(2.0)*facy + Real(4.0)*facz; + + Real s0 = Real(-4.0)*fxyz*(sig(i-1,j-1,k-1)+sig(i,j-1,k-1)+sig(i-1,j,k-1)+sig(i,j,k-1) + +sig(i-1,j-1,k )+sig(i,j-1,k )+sig(i-1,j,k )+sig(i,j,k )); + Real Ax = sol(i,j,k)*s0 + + fxyz*(sol(i-1,j-1,k-1)*sig(i-1,j-1,k-1) + + sol(i+1,j-1,k-1)*sig(i ,j-1,k-1) + + sol(i-1,j+1,k-1)*sig(i-1,j ,k-1) + + sol(i+1,j+1,k-1)*sig(i ,j ,k-1) + + sol(i-1,j-1,k+1)*sig(i-1,j-1,k ) + + sol(i+1,j-1,k+1)*sig(i ,j-1,k ) + + sol(i-1,j+1,k+1)*sig(i-1,j ,k ) + + sol(i+1,j+1,k+1)*sig(i ,j ,k )) + + fmx2y2z*(sol(i ,j-1,k-1)*(sig(i-1,j-1,k-1)+sig(i,j-1,k-1)) + + sol(i ,j+1,k-1)*(sig(i-1,j ,k-1)+sig(i,j ,k-1)) + + sol(i ,j-1,k+1)*(sig(i-1,j-1,k )+sig(i,j-1,k )) + + sol(i ,j+1,k+1)*(sig(i-1,j ,k )+sig(i,j ,k ))) + + f2xmy2z*(sol(i-1,j ,k-1)*(sig(i-1,j-1,k-1)+sig(i-1,j,k-1)) + + sol(i+1,j ,k-1)*(sig(i ,j-1,k-1)+sig(i ,j,k-1)) + + sol(i-1,j ,k+1)*(sig(i-1,j-1,k )+sig(i-1,j,k )) + + sol(i+1,j ,k+1)*(sig(i ,j-1,k )+sig(i ,j,k ))) + + f2x2ymz*(sol(i-1,j-1,k )*(sig(i-1,j-1,k-1)+sig(i-1,j-1,k)) + + sol(i+1,j-1,k )*(sig(i ,j-1,k-1)+sig(i ,j-1,k)) + + sol(i-1,j+1,k )*(sig(i-1,j ,k-1)+sig(i-1,j ,k)) + + sol(i+1,j+1,k )*(sig(i ,j ,k-1)+sig(i ,j ,k))) + + f4xm2ym2z*(sol(i-1,j,k)*(sig(i-1,j-1,k-1)+sig(i-1,j,k-1)+sig(i-1,j-1,k)+sig(i-1,j,k)) + + sol(i+1,j,k)*(sig(i ,j-1,k-1)+sig(i ,j,k-1)+sig(i ,j-1,k)+sig(i ,j,k))) + + fm2x4ym2z*(sol(i,j-1,k)*(sig(i-1,j-1,k-1)+sig(i,j-1,k-1)+sig(i-1,j-1,k)+sig(i,j-1,k)) + + sol(i,j+1,k)*(sig(i-1,j ,k-1)+sig(i,j ,k-1)+sig(i-1,j ,k)+sig(i,j ,k))) + + fm2xm2y4z*(sol(i,j,k-1)*(sig(i-1,j-1,k-1)+sig(i,j-1,k-1)+sig(i-1,j,k-1)+sig(i,j,k-1)) + + sol(i,j,k+1)*(sig(i-1,j-1,k )+sig(i,j-1,k )+sig(i-1,j,k )+sig(i,j,k ))); + + 
sol(i,j,k) += (rhs(i,j,k) - Ax) / s0; + } + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_c (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, Real sig, + Array4 const& msk, + GpuArray const& dxinv, int color) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + if (msk(i,j,k)) { + sol(i,j,k) = Real(0.0); + } else { + Real facx = Real(1.0/36.0)*dxinv[0]*dxinv[0]; + Real facy = Real(1.0/36.0)*dxinv[1]*dxinv[1]; + Real facz = Real(1.0/36.0)*dxinv[2]*dxinv[2]; + Real fxyz = facx + facy + facz; + Real fmx2y2z = -facx + Real(2.0)*facy + Real(2.0)*facz; + Real f2xmy2z = Real(2.0)*facx - facy + Real(2.0)*facz; + Real f2x2ymz = Real(2.0)*facx + Real(2.0)*facy - facz; + Real f4xm2ym2z = Real(4.0)*facx - Real(2.0)*facy - Real(2.0)*facz; + Real fm2x4ym2z = -Real(2.0)*facx + Real(4.0)*facy - Real(2.0)*facz; + Real fm2xm2y4z = -Real(2.0)*facx - Real(2.0)*facy + Real(4.0)*facz; + + Real s0 = Real(-4.0)*fxyz*Real(8.); + Real Ax = sol(i,j,k)*s0 + + fxyz*(sol(i-1,j-1,k-1) + + sol(i+1,j-1,k-1) + + sol(i-1,j+1,k-1) + + sol(i+1,j+1,k-1) + + sol(i-1,j-1,k+1) + + sol(i+1,j-1,k+1) + + sol(i-1,j+1,k+1) + + sol(i+1,j+1,k+1)) + + fmx2y2z*(sol(i ,j-1,k-1)*Real(2.) + + sol(i ,j+1,k-1)*Real(2.) + + sol(i ,j-1,k+1)*Real(2.) + + sol(i ,j+1,k+1)*Real(2.)) + + f2xmy2z*(sol(i-1,j ,k-1)*Real(2.) + + sol(i+1,j ,k-1)*Real(2.) + + sol(i-1,j ,k+1)*Real(2.) + + sol(i+1,j ,k+1)*Real(2.)) + + f2x2ymz*(sol(i-1,j-1,k )*Real(2.) + + sol(i+1,j-1,k )*Real(2.) + + sol(i-1,j+1,k )*Real(2.) + + sol(i+1,j+1,k )*Real(2.)) + + f4xm2ym2z*(sol(i-1,j,k)*Real(4.) + + sol(i+1,j,k)*Real(4.)) + + fm2x4ym2z*(sol(i,j-1,k)*Real(4.) + + sol(i,j+1,k)*Real(4.)) + + fm2xm2y4z*(sol(i,j,k-1)*Real(4.) + + sol(i,j,k+1)*Real(4.)); + + sol(i,j,k) += (rhs(i,j,k) - Ax*sig) / (s0*sig); + } + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_sten (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, + Array4 const& sten, + Array4 const& msk, int color) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + mlndlap_gauss_seidel_sten(i,j,k,sol,rhs,sten,msk); + } +} + } #endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp index da90f84e829..8e490f30348 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp @@ -356,95 +356,38 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& const iMultiFab& dmsk = *m_dirichlet_mask[amrlev][mglev]; #ifdef AMREX_USE_GPU - if (Gpu::inLaunchRegion()) + auto const& solarr_ma = sol.arrays(); + auto const& rhsarr_ma = rhs.const_arrays(); + auto const& dmskarr_ma = dmsk.const_arrays(); +#else + bool regular_coarsening = true; + if (amrlev == 0 && mglev > 0) { - auto solarr_ma = sol.arrays(); - auto rhsarr_ma = rhs.const_arrays(); - auto dmskarr_ma = dmsk.const_arrays(); - if (m_coarsening_strategy == CoarseningStrategy::RAP) - { - auto starr_ma = stencil->const_arrays(); - for (int ns = 0; ns < m_smooth_num_sweeps; ++ns) - { - ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Real Ax = mlndlap_adotx_sten(i,j,k,solarr_ma[box_no],starr_ma[box_no],dmskarr_ma[box_no]); - mlndlap_jacobi_sten(i,j,k,solarr_ma[box_no],Ax,rhsarr_ma[box_no],starr_ma[box_no],dmskarr_ma[box_no]); - }); - } - } - else if (sigma[0] == nullptr) - { - for (int ns = 0; ns < m_smooth_num_sweeps; ++ns) - { - Real const_sigma = m_const_sigma; - ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int 
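The mlndlap_gscolor_* kernels above implement a multi-color Gauss-Seidel sweep: nodes are grouped by the parity of (i, j, k) into eight classes in 3D (four in 2D), and two nodes of the same color never appear in each other's 27-point nodal stencil, so each color pass can be updated in parallel without read/write conflicts. Below is a minimal illustrative sketch of that sweep, not part of the patch, written for plain arrays rather than AMReX containers.

// Editorial sketch (not part of the patch): an eight-color Gauss-Seidel sweep
// over a 3D nodal grid.  Nodes sharing the same (i%2, j%2, k%2) parity are
// never stencil neighbors, so each pass over one color is conflict-free and
// can be parallelized, which is what the ParallelFor-over-colors loop in
// Fsmooth does on the GPU.
inline int node_color (int i, int j, int k) { return (i%2) + (j%2)*2 + (k%2)*4; }

template <typename F>
void gauss_seidel_colored (int nx, int ny, int nz, F const& update_node)
{
    for (int color = 0; color < 8; ++color) {
        for (int k = 0; k < nz; ++k) {
        for (int j = 0; j < ny; ++j) {
        for (int i = 0; i < nx; ++i) {
            if (node_color(i,j,k) == color) {
                update_node(i,j,k);   // e.g. the update in mlndlap_gscolor_aa
            }
        }}}
    }
}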
k) noexcept - { - Real Ax = mlndlap_adotx_c(i,j,k,solarr_ma[box_no],const_sigma,dmskarr_ma[box_no], -#if (AMREX_SPACEDIM == 2) - is_rz, -#endif - dxinvarr); - mlndlap_jacobi_c(i,j,k, solarr_ma[box_no], Ax, rhsarr_ma[box_no], const_sigma, - dmskarr_ma[box_no], dxinvarr); - }); - } - } - else if ((m_use_harmonic_average && mglev > 0) || m_use_mapped) - { - AMREX_D_TERM(MultiArray4 const& sxarr_ma = sigma[0]->const_arrays();, - MultiArray4 const& syarr_ma = sigma[1]->const_arrays();, - MultiArray4 const& szarr_ma = sigma[2]->const_arrays();); - for (int ns = 0; ns < m_smooth_num_sweeps; ++ns) - { - ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Real Ax = mlndlap_adotx_ha(i,j,k,solarr_ma[box_no],AMREX_D_DECL(sxarr_ma[box_no],syarr_ma[box_no],szarr_ma[box_no]), dmskarr_ma[box_no], -#if (AMREX_SPACEDIM == 2) - is_rz, + regular_coarsening = mg_coarsen_ratio_vec[mglev-1] == mg_coarsen_ratio; + } + if (sigma[0] == nullptr) { + AMREX_ALWAYS_ASSERT(regular_coarsening); + } #endif - dxinvarr); - mlndlap_jacobi_ha(i,j,k, solarr_ma[box_no], Ax, rhsarr_ma[box_no], AMREX_D_DECL(sxarr_ma[box_no],syarr_ma[box_no],szarr_ma[box_no]), - dmskarr_ma[box_no], dxinvarr); - }); - } - } - else + + if (m_use_gauss_seidel) + { + if (m_coarsening_strategy == CoarseningStrategy::RAP) { - auto sarr_ma = sigma[0]->const_arrays(); - for (int ns = 0; ns < m_smooth_num_sweeps; ++ns) +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { - ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + auto const& starr_ma = stencil->const_arrays(); + for (int color = 0; color < AMREX_D_TERM(2,*2,*2); ++color) { - Real Ax = mlndlap_adotx_aa(i,j,k,solarr_ma[box_no],sarr_ma[box_no],dmskarr_ma[box_no], -#if (AMREX_SPACEDIM == 2) - is_rz, -#endif - dxinvarr); - mlndlap_jacobi_aa(i,j,k, solarr_ma[box_no], Ax, rhsarr_ma[box_no], sarr_ma[box_no], - dmskarr_ma[box_no], dxinvarr); - }); - } - } - - Gpu::streamSynchronize(); - if (m_smooth_num_sweeps > 1) { nodalSync(amrlev, mglev, sol); } - } - else // cpu + ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + mlndlap_gscolor_sten(i,j,k,solarr_ma[box_no],rhsarr_ma[box_no], + starr_ma[box_no],dmskarr_ma[box_no],color); + }); + } + } else #endif - { - bool regular_coarsening = true; - if (amrlev == 0 && mglev > 0) - { - regular_coarsening = mg_coarsen_ratio_vec[mglev-1] == mg_coarsen_ratio; - } - if (sigma[0] == nullptr) { - AMREX_ALWAYS_ASSERT(regular_coarsening); - } - - if (m_use_gauss_seidel) - { - if (m_coarsening_strategy == CoarseningStrategy::RAP) { #ifdef AMREX_USE_OMP #pragma omp parallel @@ -462,9 +405,27 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& } } } - else if (sigma[0] == nullptr) + } + else if (sigma[0] == nullptr) + { + Real const_sigma = m_const_sigma; +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + for (int color = 0; color < AMREX_D_TERM(2,*2,*2); ++color) + { + ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + mlndlap_gscolor_c(i,j,k, solarr_ma[box_no], rhsarr_ma[box_no], + const_sigma, dmskarr_ma[box_no], dxinvarr, color +#if (AMREX_SPACEDIM == 2) + ,is_rz +#endif + ); + }); + } + } else +#endif { - Real const_sigma = m_const_sigma; #ifdef AMREX_USE_OMP #pragma omp parallel #endif @@ -485,8 +446,32 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& } } } - else if ( (m_use_harmonic_average && mglev > 0) || m_use_mapped ) + } + else if ( (m_use_harmonic_average 
&& mglev > 0) || m_use_mapped ) + { +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + AMREX_D_TERM(MultiArray4 const& sxarr_ma = sigma[0]->const_arrays();, + MultiArray4 const& syarr_ma = sigma[1]->const_arrays();, + MultiArray4 const& szarr_ma = sigma[2]->const_arrays();); + for (int color = 0; color < AMREX_D_TERM(2,*2,*2); ++color) + { + ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + mlndlap_gscolor_ha(i,j,k, solarr_ma[box_no], rhsarr_ma[box_no], + AMREX_D_DECL(sxarr_ma[box_no],syarr_ma[box_no],szarr_ma[box_no]), + dmskarr_ma[box_no], dxinvarr, color +#if (AMREX_SPACEDIM == 2) + ,is_rz +#endif + ); + }); + } + } else +#endif + { + #ifdef AMREX_USE_OMP #pragma omp parallel #endif @@ -511,51 +496,94 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& } } } - else + } + else + { +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) + { + auto const& sarr_ma = sigma[0]->const_arrays(); + for (int color = 0; color < AMREX_D_TERM(2,*2,*2); ++color) + { + ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + mlndlap_gscolor_aa(i,j,k, solarr_ma[box_no], rhsarr_ma[box_no], + sarr_ma[box_no], dmskarr_ma[box_no], dxinvarr, color +#if (AMREX_SPACEDIM == 2) + ,is_rz +#endif + ); + }); + } + } else +#endif { #ifdef AMREX_USE_OMP #pragma omp parallel #endif for (MFIter mfi(sol); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); Array4 const& sarr = sigma[0]->const_array(mfi); Array4 const& solarr = sol.array(mfi); Array4 const& rhsarr = rhs.const_array(mfi); Array4 const& dmskarr = dmsk.const_array(mfi); +#ifndef AMREX_USE_GPU if ( regular_coarsening ) +#endif { for (int ns = 0; ns < m_smooth_num_sweeps; ++ns) { mlndlap_gauss_seidel_aa(bx, solarr, rhsarr, sarr, dmskarr, dxinvarr #if (AMREX_SPACEDIM == 2) - ,is_rz + ,is_rz #endif - ); + ); } - } else { + } +#ifndef AMREX_USE_GPU + else { for (int ns = 0; ns < m_smooth_num_sweeps; ++ns) { mlndlap_gauss_seidel_with_line_solve_aa(bx, solarr, rhsarr, sarr, dmskarr, dxinvarr #if (AMREX_SPACEDIM == 2) - ,is_rz + ,is_rz #endif ); } } +#endif } } - - nodalSync(amrlev, mglev, sol); } - else - { - MultiFab Ax(sol.boxArray(), sol.DistributionMap(), 1, 0); - Fapply(amrlev, mglev, Ax, sol); - if (m_coarsening_strategy == CoarseningStrategy::RAP) + Gpu::streamSynchronize(); + nodalSync(amrlev, mglev, sol); + } + else + { + MultiFab Ax(sol.boxArray(), sol.DistributionMap(), 1, 0); + Fapply(amrlev, mglev, Ax, sol); + +#ifdef AMREX_USE_GPU + auto const& Axarr_ma = Ax.const_arrays(); +#endif + + if (m_coarsening_strategy == CoarseningStrategy::RAP) + { +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) + { + auto const& starr_ma = stencil->const_arrays(); + ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + mlndlap_jacobi_sten(i,j,k,solarr_ma[box_no],Axarr_ma[box_no](i,j,k), + rhsarr_ma[box_no],starr_ma[box_no], + dmskarr_ma[box_no]); + }); + } else +#endif { #ifdef AMREX_USE_OMP #pragma omp parallel @@ -572,9 +600,22 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& mlndlap_jacobi_sten(bx,solarr,Axarr,rhsarr,stenarr,dmskarr); } } - else if (sigma[0] == nullptr) + } + else if (sigma[0] == nullptr) + { + Real const_sigma = m_const_sigma; +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) + { + ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + mlndlap_jacobi_c(i,j,k,solarr_ma[box_no],Axarr_ma[box_no](i,j,k), + rhsarr_ma[box_no],const_sigma, + 
dmskarr_ma[box_no], dxinvarr); + }); + } else +#endif { - Real const_sigma = m_const_sigma; #ifdef AMREX_USE_OMP #pragma omp parallel #endif @@ -590,7 +631,23 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& dmskarr, dxinvarr); } } - else if ( (m_use_harmonic_average && mglev > 0) || m_use_mapped ) + } + else if ( (m_use_harmonic_average && mglev > 0) || m_use_mapped ) + { +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) + { + AMREX_D_TERM(MultiArray4 const& sxarr_ma = sigma[0]->const_arrays();, + MultiArray4 const& syarr_ma = sigma[1]->const_arrays();, + MultiArray4 const& szarr_ma = sigma[2]->const_arrays();); + ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + mlndlap_jacobi_ha(i,j,k,solarr_ma[box_no],Axarr_ma[box_no](i,j,k),rhsarr_ma[box_no], + AMREX_D_DECL(sxarr_ma[box_no],syarr_ma[box_no],szarr_ma[box_no]), + dmskarr_ma[box_no], dxinvarr); + }); + } else +#endif { #ifdef AMREX_USE_OMP #pragma omp parallel @@ -610,7 +667,21 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& dmskarr, dxinvarr); } } - else + } + else + { +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) + { + auto const& sarr_ma = sigma[0]->const_arrays(); + ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + mlndlap_jacobi_aa(i,j,k,solarr_ma[box_no],Axarr_ma[box_no](i,j,k), + rhsarr_ma[box_no],sarr_ma[box_no], + dmskarr_ma[box_no], dxinvarr); + }); + } else +#endif { #ifdef AMREX_USE_OMP #pragma omp parallel @@ -629,6 +700,8 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& } } } + + Gpu::streamSynchronize(); } } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H index 0a5b90a7bab..6fecf13eafb 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H @@ -367,6 +367,12 @@ MLPoissonT::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, int redb { BL_PROFILE("MLPoisson::Fsmooth()"); + MF Ax; + if (! 
this->m_use_gauss_seidel) { // jacobi + Ax.define(sol.boxArray(), sol.DistributionMap(), sol.nComp(), 0); + Fapply(amrlev, mglev, Ax, sol); + } + const auto& undrrelxr = this->m_undrrelxr[amrlev][mglev]; const auto& maskvals = this->m_maskvals [amrlev][mglev]; @@ -443,122 +449,130 @@ MLPoissonT::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, int redb #endif #endif -#if (AMREX_SPACEDIM == 1) if (this->m_overset_mask[amrlev][mglev]) { AMREX_ASSERT(!this->m_has_metric_term); const auto& osmma = this->m_overset_mask[amrlev][mglev]->const_arrays(); - ParallelFor(sol, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Box vbx(rhsma[box_no]); - mlpoisson_gsrb_os(i, j, k, solnma[box_no], rhsma[box_no], - osmma[box_no], dhx, - f0ma[box_no], m0ma[box_no], - f1ma[box_no], m1ma[box_no], - vbx, redblack); - }); - } else if (this->m_has_metric_term) { - ParallelFor(sol, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Box vbx(rhsma[box_no]); - mlpoisson_gsrb_m(i, j, k, solnma[box_no], rhsma[box_no], dhx, - f0ma[box_no], m0ma[box_no], - f1ma[box_no], m1ma[box_no], - vbx, redblack, - dx, probxlo); - }); - } else { - ParallelFor(sol, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Box vbx(rhsma[box_no]); - mlpoisson_gsrb(i, j, k, solnma[box_no], rhsma[box_no], dhx, - f0ma[box_no], m0ma[box_no], - f1ma[box_no], m1ma[box_no], - vbx, redblack); - }); - } + if (this->m_use_gauss_seidel) { + ParallelFor(sol, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + Box vbx(rhsma[box_no]); + mlpoisson_gsrb_os(i, j, k, solnma[box_no], rhsma[box_no], + osmma[box_no], AMREX_D_DECL(dhx, dhy, dhz), + f0ma[box_no], m0ma[box_no], + f1ma[box_no], m1ma[box_no], +#if (AMREX_SPACEDIM > 1) + f2ma[box_no], m2ma[box_no], + f3ma[box_no], m3ma[box_no], +#if (AMREX_SPACEDIM > 2) + f4ma[box_no], m4ma[box_no], + f5ma[box_no], m5ma[box_no], #endif - -#if (AMREX_SPACEDIM == 2) - if (this->m_overset_mask[amrlev][mglev]) { - AMREX_ASSERT(!this->m_has_metric_term); - const auto& osmma = this->m_overset_mask[amrlev][mglev]->const_arrays(); - ParallelFor(sol, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Box vbx(rhsma[box_no]); - mlpoisson_gsrb_os(i, j, k, solnma[box_no], rhsma[box_no], - osmma[box_no], dhx, dhy, - f0ma[box_no], m0ma[box_no], - f1ma[box_no], m1ma[box_no], - f2ma[box_no], m2ma[box_no], - f3ma[box_no], m3ma[box_no], - vbx, redblack); - }); - } else if (this->m_has_metric_term) { - ParallelFor(sol, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Box vbx(rhsma[box_no]); - mlpoisson_gsrb_m(i, j, k, solnma[box_no], rhsma[box_no], dhx, dhy, - f0ma[box_no], m0ma[box_no], - f1ma[box_no], m1ma[box_no], - f2ma[box_no], m2ma[box_no], - f3ma[box_no], m3ma[box_no], - vbx, redblack, - dx, probxlo); - }); - } else { - ParallelFor(sol, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Box vbx(rhsma[box_no]); - mlpoisson_gsrb(i, j, k, solnma[box_no], rhsma[box_no], dhx, dhy, - f0ma[box_no], m0ma[box_no], - f1ma[box_no], m1ma[box_no], - f2ma[box_no], m2ma[box_no], - f3ma[box_no], m3ma[box_no], - vbx, redblack); - }); +#endif + vbx, redblack); + }); + } else { + const auto& axma = Ax.const_arrays(); + ParallelFor(sol, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + Box vbx(rhsma[box_no]); + mlpoisson_jacobi_os(i, j, k, solnma[box_no], rhsma[box_no], + axma[box_no], osmma[box_no], + AMREX_D_DECL(dhx, dhy, dhz), + f0ma[box_no], m0ma[box_no], + f1ma[box_no], 
m1ma[box_no], +#if (AMREX_SPACEDIM > 1) + f2ma[box_no], m2ma[box_no], + f3ma[box_no], m3ma[box_no], +#if (AMREX_SPACEDIM > 2) + f4ma[box_no], m4ma[box_no], + f5ma[box_no], m5ma[box_no], +#endif +#endif + vbx); + }); + } } +#if (AMREX_SPACEDIM < 3) + else if (this->m_has_metric_term) { + if (this->m_use_gauss_seidel) { + ParallelFor(sol, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + Box vbx(rhsma[box_no]); + mlpoisson_gsrb_m(i, j, k, solnma[box_no], rhsma[box_no], + AMREX_D_DECL(dhx, dhy, dhz), + f0ma[box_no], m0ma[box_no], + f1ma[box_no], m1ma[box_no], +#if (AMREX_SPACEDIM > 1) + f2ma[box_no], m2ma[box_no], + f3ma[box_no], m3ma[box_no], #endif - -#if (AMREX_SPACEDIM == 3) - if (this->m_overset_mask[amrlev][mglev]) { - AMREX_ASSERT(!this->m_has_metric_term); - const auto& osmma = this->m_overset_mask[amrlev][mglev]->const_arrays(); - ParallelFor(sol, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Box vbx(rhsma[box_no]); - mlpoisson_gsrb_os(i, j, k, solnma[box_no], rhsma[box_no], - osmma[box_no], dhx, dhy, dhz, - f0ma[box_no], m0ma[box_no], - f1ma[box_no], m1ma[box_no], - f2ma[box_no], m2ma[box_no], - f3ma[box_no], m3ma[box_no], - f4ma[box_no], m4ma[box_no], - f5ma[box_no], m5ma[box_no], - vbx, redblack); - }); - } else { - ParallelFor(sol, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Box vbx(rhsma[box_no]); - mlpoisson_gsrb(i, j, k, solnma[box_no], rhsma[box_no], dhx, dhy, dhz, - f0ma[box_no], m0ma[box_no], - f1ma[box_no], m1ma[box_no], - f2ma[box_no], m2ma[box_no], - f3ma[box_no], m3ma[box_no], - f4ma[box_no], m4ma[box_no], - f5ma[box_no], m5ma[box_no], - vbx, redblack); - }); + vbx, redblack, + dx, probxlo); + }); + } else { + const auto& axma = Ax.const_arrays(); + ParallelFor(sol, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + Box vbx(rhsma[box_no]); + mlpoisson_jacobi_m(i, j, k, solnma[box_no], rhsma[box_no], + axma[box_no], AMREX_D_DECL(dhx, dhy, dhz), + f0ma[box_no], m0ma[box_no], + f1ma[box_no], m1ma[box_no], +#if (AMREX_SPACEDIM > 1) + f2ma[box_no], m2ma[box_no], + f3ma[box_no], m3ma[box_no], +#endif + vbx, dx, probxlo); + }); + } } #endif + else { + if (this->m_use_gauss_seidel) { + ParallelFor(sol, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + Box vbx(rhsma[box_no]); + mlpoisson_gsrb(i, j, k, solnma[box_no], rhsma[box_no], + AMREX_D_DECL(dhx, dhy, dhz), + f0ma[box_no], m0ma[box_no], + f1ma[box_no], m1ma[box_no], +#if (AMREX_SPACEDIM > 1) + f2ma[box_no], m2ma[box_no], + f3ma[box_no], m3ma[box_no], +#if (AMREX_SPACEDIM > 2) + f4ma[box_no], m4ma[box_no], + f5ma[box_no], m5ma[box_no], +#endif +#endif + vbx, redblack); + }); + } else { + const auto& axma = Ax.const_arrays(); + ParallelFor(sol, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + Box vbx(rhsma[box_no]); + mlpoisson_jacobi(i, j, k, solnma[box_no], rhsma[box_no], + axma[box_no], AMREX_D_DECL(dhx, dhy, dhz), + f0ma[box_no], m0ma[box_no], + f1ma[box_no], m1ma[box_no], +#if (AMREX_SPACEDIM > 1) + f2ma[box_no], m2ma[box_no], + f3ma[box_no], m3ma[box_no], +#if (AMREX_SPACEDIM > 2) + f4ma[box_no], m4ma[box_no], + f5ma[box_no], m5ma[box_no], +#endif +#endif + vbx); + }); + } + } } else #endif { @@ -598,30 +612,64 @@ MLPoissonT::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, int redb if (this->m_overset_mask[amrlev][mglev]) { AMREX_ASSERT(!this->m_has_metric_term); const auto& osm = this->m_overset_mask[amrlev][mglev]->const_array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( 
tbx, i, j, k, - { - mlpoisson_gsrb_os(i, j, k, solnfab, rhsfab, osm, dhx, - f0fab, m0, - f1fab, m1, - vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_gsrb_os(i, j, k, solnfab, rhsfab, osm, dhx, + f0fab, m0, + f1fab, m1, + vbx, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_jacobi_os(i, j, k, solnfab, rhsfab, axfab, + osm, dhx, + f0fab, m0, + f1fab, m1, + vbx); + }); + } } else if (this->m_has_metric_term) { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, - { - mlpoisson_gsrb_m(i, j, k, solnfab, rhsfab, dhx, - f0fab, m0, - f1fab, m1, - vbx, redblack, - dx, probxlo); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_gsrb_m(i, j, k, solnfab, rhsfab, dhx, + f0fab, m0, + f1fab, m1, + vbx, redblack, + dx, probxlo); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_jacobi_m(i, j, k, solnfab, rhsfab, axfab, dhx, + f0fab, m0, + f1fab, m1, + vbx, dx, probxlo); + }); + } } else { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, - { - mlpoisson_gsrb(i, j, k, solnfab, rhsfab, dhx, - f0fab, m0, - f1fab, m1, - vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_gsrb(i, j, k, solnfab, rhsfab, dhx, + f0fab, m0, + f1fab, m1, + vbx, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_jacobi(i, j, k, solnfab, rhsfab, axfab, dhx, + f0fab, m0, + f1fab, m1, + vbx); + }); + } } #endif @@ -629,55 +677,110 @@ MLPoissonT::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, int redb if (this->m_overset_mask[amrlev][mglev]) { AMREX_ASSERT(!this->m_has_metric_term); const auto& osm = this->m_overset_mask[amrlev][mglev]->const_array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, - { - mlpoisson_gsrb_os(i, j, k, solnfab, rhsfab, osm, dhx, dhy, - f0fab, m0, - f1fab, m1, - f2fab, m2, - f3fab, m3, - vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_gsrb_os(i, j, k, solnfab, rhsfab, osm, dhx, dhy, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + vbx, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_jacobi_os(i, j, k, solnfab, rhsfab, axfab, + osm, dhx, dhy, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + vbx); + }); + } } else if (this->m_has_metric_term) { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, - { - mlpoisson_gsrb_m(i, j, k, solnfab, rhsfab, dhx, dhy, - f0fab, m0, - f1fab, m1, - f2fab, m2, - f3fab, m3, - vbx, redblack, - dx, probxlo); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_gsrb_m(i, j, k, solnfab, rhsfab, dhx, dhy, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + vbx, redblack, + dx, probxlo); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_jacobi_m(i, j, k, solnfab, rhsfab, axfab, dhx, dhy, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + vbx, dx, probxlo); + }); + } } else { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, - { - mlpoisson_gsrb(i, j, k, solnfab, rhsfab, dhx, dhy, - f0fab, m0, - f1fab, 
m1, - f2fab, m2, - f3fab, m3, - vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_gsrb(i, j, k, solnfab, rhsfab, dhx, dhy, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + vbx, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_jacobi(i, j, k, solnfab, rhsfab, axfab, dhx, dhy, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + vbx); + }); + } } - #endif #if (AMREX_SPACEDIM == 3) if (this->m_overset_mask[amrlev][mglev]) { AMREX_ASSERT(!this->m_has_metric_term); const auto& osm = this->m_overset_mask[amrlev][mglev]->const_array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, - { - mlpoisson_gsrb_os(i, j, k, solnfab, rhsfab, osm, dhx, dhy, dhz, - f0fab, m0, - f1fab, m1, - f2fab, m2, - f3fab, m3, - f4fab, m4, - f5fab, m5, - vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_gsrb_os(i, j, k, solnfab, rhsfab, osm, dhx, dhy, dhz, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + f4fab, m4, + f5fab, m5, + vbx, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_jacobi_os(i, j, k, solnfab, rhsfab, axfab, + osm, dhx, dhy, dhz, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + f4fab, m4, + f5fab, m5, + vbx); + }); + } } else if (this->hasHiddenDimension()) { Box const& tbx_2d = this->compactify(tbx); Box const& vbx_2d = this->compactify(vbx); @@ -691,27 +794,58 @@ MLPoissonT::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, int redb const auto& m1_2d = this->compactify(this->get_d1(m0,m1,m2)); const auto& m2_2d = this->compactify(this->get_d0(m3,m4,m5)); const auto& m3_2d = this->compactify(this->get_d1(m3,m4,m5)); - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx_2d, i, j, k, - { - TwoD::mlpoisson_gsrb(i, j, k, solnfab_2d, rhsfab_2d, dh0, dh1, - f0fab_2d, m0_2d, - f1fab_2d, m1_2d, - f2fab_2d, m2_2d, - f3fab_2d, m3_2d, - vbx_2d, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx_2d, i, j, k, + { + TwoD::mlpoisson_gsrb(i, j, k, solnfab_2d, rhsfab_2d, dh0, dh1, + f0fab_2d, m0_2d, + f1fab_2d, m1_2d, + f2fab_2d, m2_2d, + f3fab_2d, m3_2d, + vbx_2d, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + const auto& axfab_2d = this->compactify(axfab); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx_2d, i, j, k, + { + TwoD::mlpoisson_jacobi(i, j, k, solnfab_2d, rhsfab_2d, + axfab_2d, dh0, dh1, + f0fab_2d, m0_2d, + f1fab_2d, m1_2d, + f2fab_2d, m2_2d, + f3fab_2d, m3_2d, + vbx_2d); + }); + } } else { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, - { - mlpoisson_gsrb(i, j, k, solnfab, rhsfab, dhx, dhy, dhz, - f0fab, m0, - f1fab, m1, - f2fab, m2, - f3fab, m3, - f4fab, m4, - f5fab, m5, - vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_gsrb(i, j, k, solnfab, rhsfab, dhx, dhy, dhz, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + f4fab, m4, + f5fab, m5, + vbx, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_jacobi(i, j, k, solnfab, rhsfab, axfab, + dhx, dhy, dhz, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + f4fab, m4, + f5fab, m5, + vbx); + }); + } } #endif } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson_1D_K.H 
b/Src/LinearSolvers/MLMG/AMReX_MLPoisson_1D_K.H index 071e97b4ea7..59257cf8b1c 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson_1D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson_1D_K.H @@ -188,6 +188,81 @@ void mlpoisson_gsrb_m (int i, int, int, Array4 const& phi, Array4 co } } +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlpoisson_jacobi (int i, int, int, Array4 const& phi, Array4 const& rhs, + Array4 const& Ax, T dhx, + Array4 const& f0, Array4 const& m0, + Array4 const& f1, Array4 const& m1, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T gamma = -dhx*T(2.0); + + T cf0 = (i == vlo.x && m0(vlo.x-1,0,0) > 0) + ? f0(vlo.x,0,0) : T(0.0); + T cf1 = (i == vhi.x && m1(vhi.x+1,0,0) > 0) + ? f1(vhi.x,0,0) : T(0.0); + + T g_m_d = gamma + dhx*(cf0+cf1); + + phi(i,0,0) += T(2.0/3.0) * (rhs(i,0,0) - Ax(i,0,0)) / g_m_d; +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlpoisson_jacobi_os (int i, int, int, Array4 const& phi, Array4 const& rhs, + Array4 const& Ax, Array4 const& osm, T dhx, + Array4 const& f0, Array4 const& m0, + Array4 const& f1, Array4 const& m1, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + if (osm(i,0,0) == 0) { + phi(i,0,0) = T(0.0); + } else { + T cf0 = (i == vlo.x && m0(vlo.x-1,0,0) > 0) + ? f0(vlo.x,0,0) : T(0.0); + T cf1 = (i == vhi.x && m1(vhi.x+1,0,0) > 0) + ? f1(vhi.x,0,0) : T(0.0); + + T gamma = -dhx*T(2.0); + T g_m_d = gamma + dhx*(cf0+cf1); + + phi(i,0,0) += T(2.0/3.0) * (rhs(i,0,0) - Ax(i,0,0)) / g_m_d; + } +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlpoisson_jacobi_m (int i, int, int, Array4 const& phi, Array4 const& rhs, + Array4 const& Ax, T dhx, + Array4 const& f0, Array4 const& m0, + Array4 const& f1, Array4 const& m1, + Box const& vbox, T dx, T probxlo) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T cf0 = (i == vlo.x && m0(vlo.x-1,0,0) > 0) + ? f0(vlo.x,0,0) : T(0.0); + T cf1 = (i == vhi.x && m1(vhi.x+1,0,0) > 0) + ? f1(vhi.x,0,0) : T(0.0); + + T rel = (probxlo + i *dx) * (probxlo + i *dx); + T rer = (probxlo +(i+1)*dx) * (probxlo +(i+1)*dx); + + T gamma = -dhx*(rel+rer); + + T g_m_d = gamma + dhx*(rel*cf0+rer*cf1); + + phi(i,0,0) += T(2.0/3.0) * (rhs(i,0,0) - Ax(i,0,0)) / g_m_d; +} + template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void mlpoisson_normalize (int i, int, int, Array4 const& x, diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson_2D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLPoisson_2D_K.H index 9604de38feb..5feba2a0066 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson_2D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson_2D_K.H @@ -305,6 +305,102 @@ void mlpoisson_gsrb_m (int i, int j, int, Array4 const& phi, Array4 } } +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlpoisson_jacobi (int i, int j, int, Array4 const& phi, Array4 const& rhs, + Array4 const& Ax, T dhx, T dhy, + Array4 const& f0, Array4 const& m0, + Array4 const& f1, Array4 const& m1, + Array4 const& f2, Array4 const& m2, + Array4 const& f3, Array4 const& m3, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T gamma = T(-2.0)*(dhx+dhy); + + T cf0 = (i == vlo.x && m0(vlo.x-1,j,0) > 0) + ? f0(vlo.x,j,0) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,0) > 0) + ? f1(i,vlo.y,0) : T(0.0); + T cf2 = (i == vhi.x && m2(vhi.x+1,j,0) > 0) + ? 
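The new mlpoisson_jacobi, mlpoisson_jacobi_os, and mlpoisson_jacobi_m kernels above (and their 2D/3D counterparts that follow) all perform the same damped Jacobi update, phi += (2/3) * (rhs - A*phi) / diag, with A*phi precomputed once per sweep by Fapply and diag being the operator diagonal adjusted by the boundary factors f*/m*. A standalone illustrative sketch of that update for a 1D constant-coefficient operator, not part of the patch:

// Editorial sketch (not part of the patch): the damped Jacobi relaxation the
// new mlpoisson_jacobi kernels apply, spelled out for a 1D constant-coefficient
// Poisson operator on plain arrays.  Ax must hold A*phi for the current iterate
// (Fapply provides it in the patch); 2/3 is the standard damping weight for
// Jacobi smoothing.
void damped_jacobi_1d (int n, double* phi, double const* rhs,
                       double const* Ax, double dhx)
{
    double diag = -2.0*dhx;   // diagonal entry of the 1D Laplacian stencil
    for (int i = 0; i < n; ++i) {
        phi[i] += (2.0/3.0) * (rhs[i] - Ax[i]) / diag;
    }
}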
f2(vhi.x,j,0) : T(0.0); + T cf3 = (j == vhi.y && m3(i,vhi.y+1,0) > 0) + ? f3(i,vhi.y,0) : T(0.0); + + T g_m_d = gamma + dhx*(cf0+cf2) + dhy*(cf1+cf3); + + phi(i,j,0) += T(2.0/3.0) * (rhs(i,j,0) - Ax(i,j,0)) / g_m_d; +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlpoisson_jacobi_os (int i, int j, int, Array4 const& phi, Array4 const& rhs, + Array4 const& Ax, Array4 const& osm, + T dhx, T dhy, + Array4 const& f0, Array4 const& m0, + Array4 const& f1, Array4 const& m1, + Array4 const& f2, Array4 const& m2, + Array4 const& f3, Array4 const& m3, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T gamma = T(-2.0)*(dhx+dhy); + + if (osm(i,j,0) == 0) { + phi(i,j,0) = T(0.0); + } else { + T cf0 = (i == vlo.x && m0(vlo.x-1,j,0) > 0) + ? f0(vlo.x,j,0) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,0) > 0) + ? f1(i,vlo.y,0) : T(0.0); + T cf2 = (i == vhi.x && m2(vhi.x+1,j,0) > 0) + ? f2(vhi.x,j,0) : T(0.0); + T cf3 = (j == vhi.y && m3(i,vhi.y+1,0) > 0) + ? f3(i,vhi.y,0) : T(0.0); + + T g_m_d = gamma + dhx*(cf0+cf2) + dhy*(cf1+cf3); + + phi(i,j,0) += T(2.0/3.0) * (rhs(i,j,0) - Ax(i,j,0)) / g_m_d; + } +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlpoisson_jacobi_m (int i, int j, int, Array4 const& phi, Array4 const& rhs, + Array4 const& Ax, T dhx, T dhy, + Array4 const& f0, Array4 const& m0, + Array4 const& f1, Array4 const& m1, + Array4 const& f2, Array4 const& m2, + Array4 const& f3, Array4 const& m3, + Box const& vbox, T dx, T probxlo) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T cf0 = (i == vlo.x && m0(vlo.x-1,j,0) > 0) + ? f0(vlo.x,j,0) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,0) > 0) + ? f1(i,vlo.y,0) : T(0.0); + T cf2 = (i == vhi.x && m2(vhi.x+1,j,0) > 0) + ? f2(vhi.x,j,0) : T(0.0); + T cf3 = (j == vhi.y && m3(i,vhi.y+1,0) > 0) + ? f3(i,vhi.y,0) : T(0.0); + + T rel = probxlo + i*dx; + T rer = probxlo +(i+1)*dx; + T rc = probxlo + (i+T(0.5))*dx; + + T gamma = -dhx*(rel+rer) - T(2.0)*dhy*rc; + + T g_m_d = gamma + dhx*(rel*cf0+rer*cf2) + dhy*rc*(cf1+cf3); + + phi(i,j,0) += T(2.0/3.0) * (rhs(i,j,0) - Ax(i,j,0)) / g_m_d; +} + template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void mlpoisson_normalize (int i, int j, int, Array4 const& x, diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson_3D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLPoisson_3D_K.H index e0823294d03..fa23bc4b6dd 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson_3D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson_3D_K.H @@ -245,6 +245,84 @@ void mlpoisson_gsrb_os (int i, int j, int k, Array4 const& phi, } } +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlpoisson_jacobi (int i, int j, int k, Array4 const& phi, + Array4 const& rhs, Array4 const& Ax, + T dhx, T dhy, T dhz, + Array4 const& f0, Array4 const& m0, + Array4 const& f1, Array4 const& m1, + Array4 const& f2, Array4 const& m2, + Array4 const& f3, Array4 const& m3, + Array4 const& f4, Array4 const& m4, + Array4 const& f5, Array4 const& m5, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + const T gamma = T(-2.)*(dhx+dhy+dhz); + + T cf0 = (i == vlo.x && m0(vlo.x-1,j,k) > 0) + ? f0(vlo.x,j,k) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,k) > 0) + ? f1(i,vlo.y,k) : T(0.0); + T cf2 = (k == vlo.z && m2(i,j,vlo.z-1) > 0) + ? f2(i,j,vlo.z) : T(0.0); + T cf3 = (i == vhi.x && m3(vhi.x+1,j,k) > 0) + ? 
f3(vhi.x,j,k) : T(0.0); + T cf4 = (j == vhi.y && m4(i,vhi.y+1,k) > 0) + ? f4(i,vhi.y,k) : T(0.0); + T cf5 = (k == vhi.z && m5(i,j,vhi.z+1) > 0) + ? f5(i,j,vhi.z) : T(0.0); + + T g_m_d = gamma + dhx*(cf0+cf3) + dhy*(cf1+cf4) + dhz*(cf2+cf5); + + phi(i,j,k) += T(2.0/3.0) * (rhs(i,j,k) - Ax(i,j,k)) / g_m_d; +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlpoisson_jacobi_os (int i, int j, int k, Array4 const& phi, + Array4 const& rhs, + Array4 const& Ax, + Array4 const& osm, + T dhx, T dhy, T dhz, + Array4 const& f0, Array4 const& m0, + Array4 const& f1, Array4 const& m1, + Array4 const& f2, Array4 const& m2, + Array4 const& f3, Array4 const& m3, + Array4 const& f4, Array4 const& m4, + Array4 const& f5, Array4 const& m5, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + const T gamma = T(-2.)*(dhx+dhy+dhz); + + if (osm(i,j,k) == 0) { + phi(i,j,k) = T(0.0); + } else { + T cf0 = (i == vlo.x && m0(vlo.x-1,j,k) > 0) + ? f0(vlo.x,j,k) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,k) > 0) + ? f1(i,vlo.y,k) : T(0.0); + T cf2 = (k == vlo.z && m2(i,j,vlo.z-1) > 0) + ? f2(i,j,vlo.z) : T(0.0); + T cf3 = (i == vhi.x && m3(vhi.x+1,j,k) > 0) + ? f3(vhi.x,j,k) : T(0.0); + T cf4 = (j == vhi.y && m4(i,vhi.y+1,k) > 0) + ? f4(i,vhi.y,k) : T(0.0); + T cf5 = (k == vhi.z && m5(i,j,vhi.z+1) > 0) + ? f5(i,j,vhi.z) : T(0.0); + + T g_m_d = gamma + dhx*(cf0+cf3) + dhy*(cf1+cf4) + dhz*(cf2+cf5); + + phi(i,j,k) += T(2.0/3.0) * (rhs(i,j,k) - Ax(i,j,k)) / g_m_d; + } +} + } #endif diff --git a/Src/LinearSolvers/MLMG/AMReX_PCGSolver.H b/Src/LinearSolvers/MLMG/AMReX_PCGSolver.H new file mode 100644 index 00000000000..1b9aa55426a --- /dev/null +++ b/Src/LinearSolvers/MLMG/AMReX_PCGSolver.H @@ -0,0 +1,72 @@ +#ifndef AMREX_PCG_SOLVER_H_ +#define AMREX_PCG_SOLVER_H_ +#include + +#include +#include +#include +#include + +namespace amrex { + +/** + * \brief Preconditioned conjugate gradient solver + * + * \param x initial guess + * \param r initial residual + * \param mat matrix + * \param precond preconditioner + * \param maxiter max number of iterations + * \param rel_tol relative tolerance + */ +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +int pcg_solve (T* AMREX_RESTRICT x, T* AMREX_RESTRICT r, + M const& mat, P const& precond, int maxiter, T rel_tol) +{ + static_assert(std::is_floating_point_v); + + T rnorm0 = 0; + for (int i = 0; i < N; ++i) { + rnorm0 = std::max(rnorm0, std::abs(r[i])); + } + if (rnorm0 == 0) { return 0; } + + int iter = 0; + T rho_prev = T(1.0); // initialized to quiet gcc warning + T p[N]; + for (iter = 1; iter <= maxiter; ++iter) { + T z[N]; + precond(z, r); + T rho = 0; + for (int i = 0; i < N; ++i) { rho += r[i]*z[i]; } + if (rho == 0) { break; } + if (iter == 1) { + for (int i = 0; i < N; ++i) { p[i] = z[i]; } + } else { + auto rr = rho * (T(1.0)/rho_prev); + for (int i = 0; i < N; ++i) { + p[i] = z[i] + rr * p[i]; + } + } + T q[N]; + mat(q, p); + T pq = 0; + for (int i = 0; i < N; ++i) { pq += p[i]*q[i]; } + if (pq == 0) { break; } + T alpha = rho * (T(1.0)/pq); + T rnorm = 0; + for (int i = 0; i < N; ++i) { + x[i] += alpha * p[i]; + r[i] -= alpha * q[i]; + rnorm = std::max(rnorm, std::abs(r[i])); + } + if (rnorm <= rnorm0*rel_tol) { break; } + rho_prev = rho; + } + return iter; +} + +} + +#endif diff --git a/Src/LinearSolvers/MLMG/Make.package b/Src/LinearSolvers/MLMG/Make.package index a8f267d4c26..3609164c919 100644 --- a/Src/LinearSolvers/MLMG/Make.package +++ b/Src/LinearSolvers/MLMG/Make.package @@ -22,7 
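The new AMReX_PCGSolver.H above adds pcg_solve, a fixed-size preconditioned conjugate gradient kernel that can run inside device code. A hedged usage sketch follows; the 3x3 system, the function name, and the Jacobi preconditioner are all hypothetical, and the array length N is assumed to be the leading template parameter, as the fixed-size loops in the header suggest.

// Editorial usage sketch (not part of the patch): solve a small SPD system
// A x = b with pcg_solve, preconditioning with the inverse diagonal of A.
#include <AMReX_REAL.H>
#include <AMReX_PCGSolver.H>

int pcg_example ()
{
    constexpr int N = 3;
    amrex::Real A[N][N] = {{4., 1., 0.},
                           {1., 3., 1.},
                           {0., 1., 2.}};
    amrex::Real b[N] = {1., 2., 3.};
    amrex::Real x[N] = {0., 0., 0.};        // initial guess
    amrex::Real r[N] = {b[0], b[1], b[2]};  // initial residual b - A*x for x = 0

    auto mat = [&] (amrex::Real* q, amrex::Real const* p) {   // q = A*p
        for (int i = 0; i < N; ++i) {
            q[i] = 0.;
            for (int j = 0; j < N; ++j) { q[i] += A[i][j]*p[j]; }
        }
    };
    auto precond = [&] (amrex::Real* z, amrex::Real const* res) {  // z = D^{-1} res
        for (int i = 0; i < N; ++i) { z[i] = res[i]/A[i][i]; }
    };

    int nit = amrex::pcg_solve<N>(x, r, mat, precond, 50, amrex::Real(1.e-8));
    return nit;   // x now holds the solution; nit is the iteration count
}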
+22,7 @@ CEXE_sources += AMReX_MLNodeLinOp.cpp CEXE_headers += AMReX_MLCellABecLap.H CEXE_headers += AMReX_MLCellABecLap_K.H AMReX_MLCellABecLap_$(DIM)D_K.H -CEXE_headers += AMReX_MLCGSolver.H +CEXE_headers += AMReX_MLCGSolver.H AMReX_PCGSolver.H CEXE_headers += AMReX_MLABecLaplacian.H CEXE_headers += AMReX_MLABecLap_K.H AMReX_MLABecLap_$(DIM)D_K.H @@ -96,4 +96,6 @@ endif VPATH_LOCATIONS += $(AMREX_HOME)/Src/LinearSolvers/MLMG INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/LinearSolvers/MLMG +include $(AMREX_HOME)/Src/Boundary/Make.package + endif diff --git a/Src/Particle/AMReX_BinIterator.H b/Src/Particle/AMReX_BinIterator.H index f57b61a0371..a5422a48fac 100644 --- a/Src/Particle/AMReX_BinIterator.H +++ b/Src/Particle/AMReX_BinIterator.H @@ -22,7 +22,7 @@ constexpr bool IsParticleTileData (Args...) { template struct BinIterator { - using index_type = unsigned int; + using index_type = int; using const_pointer_type = std::conditional_t(), T, diff --git a/Src/Particle/AMReX_DenseBins.H b/Src/Particle/AMReX_DenseBins.H index 24c9cc293cb..ec5aebd6a20 100644 --- a/Src/Particle/AMReX_DenseBins.H +++ b/Src/Particle/AMReX_DenseBins.H @@ -30,7 +30,7 @@ namespace BinPolicy template struct DenseBinIteratorFactory { - using index_type = unsigned int; + using index_type = int; using const_pointer_type = std::conditional_t(), T, @@ -78,7 +78,7 @@ class DenseBins public: using BinIteratorFactory = DenseBinIteratorFactory; - using index_type = unsigned int; + using index_type = int; using const_pointer_type = std::conditional_t(), T, diff --git a/Src/Particle/AMReX_NeighborList.H b/Src/Particle/AMReX_NeighborList.H index 53434ec7c3c..6233a41002a 100644 --- a/Src/Particle/AMReX_NeighborList.H +++ b/Src/Particle/AMReX_NeighborList.H @@ -310,8 +310,8 @@ public: m_pstruct = aos().dataPtr(); auto* const pstruct_ptr = aos().dataPtr(); - const size_t np_total = aos.size(); - const size_t np_real = src_tile.numRealParticles(); + const int np_total = aos.size(); + const int np_real = src_tile.numRealParticles(); auto const* off_bins_p = off_bins_v.data(); auto const* dxi_p = dxi_v.data(); @@ -333,7 +333,7 @@ public: // First pass: count the number of neighbors for each particle //--------------------------------------------------------------------------------------------------------- - const size_t np_size = (num_bin_types > 1) ? np_total : np_real; + const int np_size = (num_bin_types > 1) ? 
np_total : np_real; m_nbor_counts.resize( np_size+1, 0); m_nbor_offsets.resize(np_size+1); diff --git a/Src/Particle/AMReX_NeighborParticlesI.H b/Src/Particle/AMReX_NeighborParticlesI.H index ca1747b6b8b..8e5bb2b2563 100644 --- a/Src/Particle/AMReX_NeighborParticlesI.H +++ b/Src/Particle/AMReX_NeighborParticlesI.H @@ -1065,7 +1065,7 @@ selectActualNeighbors (CheckPair const& check_pair, int num_cells) if (isActualNeighbor) { break; } int nbr_cell_id = (ii * ny + jj) * nz + kk; for (auto p = poffset[nbr_cell_id]; p < poffset[nbr_cell_id+1]; ++p) { - if (pperm[p] == i) { continue; } + if (pperm[p] == int(i)) { continue; } if (detail::call_check_pair(check_pair, ptile_data, ptile_data, i, pperm[p])) { IntVect cell_ijk = getParticleCell(pstruct[pperm[p]], plo, dxi, domain); if (!box.contains(cell_ijk)) { diff --git a/Src/Particle/AMReX_ParticleContainerBase.cpp b/Src/Particle/AMReX_ParticleContainerBase.cpp index 7b405681e4d..4f0ec83f4cb 100644 --- a/Src/Particle/AMReX_ParticleContainerBase.cpp +++ b/Src/Particle/AMReX_ParticleContainerBase.cpp @@ -198,7 +198,7 @@ int ParticleContainerBase::MaxReaders () first = false; ParmParse pp("particles"); Max_Readers = Max_Readers_def; - pp.queryAdd("nreaders", Max_Readers); + pp.query("nreaders", Max_Readers); Max_Readers = std::min(ParallelDescriptor::NProcs(),Max_Readers); if (Max_Readers <= 0) { @@ -224,7 +224,7 @@ Long ParticleContainerBase::MaxParticlesPerRead () first = false; ParmParse pp("particles"); Max_Particles_Per_Read = Max_Particles_Per_Read_def; - pp.queryAdd("nparts_per_read", Max_Particles_Per_Read); + pp.query("nparts_per_read", Max_Particles_Per_Read); if (Max_Particles_Per_Read <= 0) { amrex::Abort("particles.nparts_per_read must be positive"); @@ -244,7 +244,7 @@ const std::string& ParticleContainerBase::AggregationType () first = false; aggregation_type = "None"; ParmParse pp("particles"); - pp.queryAdd("aggregation_type", aggregation_type); + pp.query("aggregation_type", aggregation_type); if (!(aggregation_type == "None" || aggregation_type == "Cell")) { amrex::Abort("particles.aggregation_type not implemented."); @@ -264,7 +264,7 @@ int ParticleContainerBase::AggregationBuffer () first = false; aggregation_buffer = 2; ParmParse pp("particles"); - pp.queryAdd("aggregation_buffer", aggregation_buffer); + pp.query("aggregation_buffer", aggregation_buffer); if (aggregation_buffer <= 0) { amrex::Abort("particles.aggregation_buffer must be positive"); diff --git a/Src/Particle/AMReX_ParticleContainerI.H b/Src/Particle/AMReX_ParticleContainerI.H index cd2d030e55c..74e65b792f0 100644 --- a/Src/Particle/AMReX_ParticleContainerI.H +++ b/Src/Particle/AMReX_ParticleContainerI.H @@ -56,8 +56,8 @@ ParticleContainer_impl::value, // "Particle type must be standard layout and trivial."); - pp.queryAdd("use_prepost", usePrePost); - pp.queryAdd("do_unlink", doUnlink); + pp.query("use_prepost", usePrePost); + pp.query("do_unlink", doUnlink); pp.queryAdd("do_mem_efficient_sort", memEfficientSort); initialized = true; @@ -71,9 +71,9 @@ IntVect ParticleContainer_impl::Index (const P& p, int lev) const { const Geometry& geom = Geom(lev); - const auto domain = geom.Domain(); - const auto plo = geom.ProbLoArray(); - const auto dxi = geom.InvCellSizeArray(); + const auto& domain = geom.Domain(); + const auto& plo = geom.ProbLoArray(); + const auto& dxi = geom.InvCellSizeArray(); return Assignor{}(p, plo, dxi, domain); } diff --git a/Src/Particle/AMReX_ParticleIO.H b/Src/Particle/AMReX_ParticleIO.H index ca40256620e..01ab0ded869 100644 --- 
a/Src/Particle/AMReX_ParticleIO.H +++ b/Src/Particle/AMReX_ParticleIO.H @@ -651,7 +651,7 @@ ParticleContainer_impl AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE IntVect getParticleCell (P const& p, amrex::GpuArray const& plo, - amrex::GpuArray const& dxi, - const Box& domain) noexcept + amrex::GpuArray const& dxi) noexcept { IntVect iv( AMREX_D_DECL(int(amrex::Math::floor((p.pos(0)-plo[0])*dxi[0])), int(amrex::Math::floor((p.pos(1)-plo[1])*dxi[1])), int(amrex::Math::floor((p.pos(2)-plo[2])*dxi[2])))); + return iv; +} + +/** + * \brief Returns the cell index for a given particle using the + * provided lower bounds, cell sizes and global domain offset. + * + * This version indexes cells starting from 0 at the lower left corner of + * the simulation geometry, i.e., it returns a global index. + * + * \tparam P a type of AMReX particle. + * + * \param p the particle for which the cell index is calculated + * \param plo the low end of the domain + * \param dxi cell sizes in each dimension + * \param domain AMReX box in which the given particle resides + */ +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +IntVect getParticleCell (P const& p, + amrex::GpuArray const& plo, + amrex::GpuArray const& dxi, + const Box& domain) noexcept +{ + IntVect iv = getParticleCell(p, plo, dxi); iv += domain.smallEnd(); return iv; } diff --git a/Src/Particle/AMReX_SparseBins.H b/Src/Particle/AMReX_SparseBins.H index fb026a8ca88..1629a997597 100644 --- a/Src/Particle/AMReX_SparseBins.H +++ b/Src/Particle/AMReX_SparseBins.H @@ -14,7 +14,7 @@ template struct SparseBinIteratorFactory { - using index_type = unsigned int; + using index_type = int; using const_pointer_type = std::conditional_t(), T, @@ -33,7 +33,7 @@ struct SparseBinIteratorFactory : m_bins_ptr(bins.dataPtr()), m_offsets_ptr(offsets.dataPtr()), m_permutation_ptr(permutation.dataPtr()), - m_items(items), m_num_bins(bins.size()) + m_items(items), m_num_bins(int(bins.size())) {} [[nodiscard]] AMREX_GPU_HOST_DEVICE @@ -97,7 +97,7 @@ public: using BinIteratorFactory = SparseBinIteratorFactory; using bin_type = IntVect; - using index_type = unsigned int; + using index_type = int; using const_pointer_type = std::conditional_t(), T, diff --git a/Src/Particle/AMReX_TracerParticle_mod_K.H b/Src/Particle/AMReX_TracerParticle_mod_K.H index ee85f61fac4..808c2b333b8 100644 --- a/Src/Particle/AMReX_TracerParticle_mod_K.H +++ b/Src/Particle/AMReX_TracerParticle_mod_K.H @@ -134,7 +134,7 @@ void linear_interpolate_to_particle (const P& p, amrex::Real sz[] = {amrex::Real(1.0) - zint, zint}; #endif - for (int comp = start_comp; comp < ncomp; ++comp) { + for (int comp = start_comp; comp < start_comp + ncomp; ++comp) { val[ctr] = ParticleReal(0.0); #if (AMREX_SPACEDIM > 2) for (int kk = 0; kk <=1; ++kk) { @@ -378,7 +378,7 @@ void linear_interpolate_to_particle_z (const P& p, hint_ilojhi, hint_ihijhi}; #endif - for (int comp = start_comp; comp < ncomp; ++comp) { + for (int comp = start_comp; comp < start_comp + ncomp; ++comp) { val[ctr] = amrex::ParticleReal(0.); #if (AMREX_SPACEDIM == 2) int k0 = 0; @@ -550,7 +550,7 @@ void linear_interpolate_to_particle_mapped (const P& p, int i = p.idata(0); int j = p.idata(1); - for (int comp = start_comp; comp < ncomp; ++comp) { + for (int comp = start_comp; comp < start_comp + ncomp; ++comp) { #if (AMREX_SPACEDIM == 2) // Value of data at surrounding nodes diff --git a/Src/Particle/AMReX_TracerParticles.cpp b/Src/Particle/AMReX_TracerParticles.cpp index 339cb58cfa8..849fb086a87 100644 --- a/Src/Particle/AMReX_TracerParticles.cpp +++ 
b/Src/Particle/AMReX_TracerParticles.cpp @@ -223,7 +223,7 @@ TracerParticleContainer::Timestamp (const std::string& basename, // We'll spread the output over this many files. int nOutFiles(64); ParmParse pp("particles"); - pp.queryAdd("particles_nfiles",nOutFiles); + pp.query("particles_nfiles",nOutFiles); if(nOutFiles == -1) { nOutFiles = NProcs; } diff --git a/Src/Particle/AMReX_WriteBinaryParticleData.H b/Src/Particle/AMReX_WriteBinaryParticleData.H index 10fd9e2febd..1bea38256dc 100644 --- a/Src/Particle/AMReX_WriteBinaryParticleData.H +++ b/Src/Particle/AMReX_WriteBinaryParticleData.H @@ -675,7 +675,7 @@ void WriteBinaryParticleDataSync (PC const& pc, HdrFile.close(); if ( ! HdrFile.good()) { - amrex::Abort("ParticleContainer::Checkpoint(): problem writing HdrFile"); + amrex::Abort("amrex::WriteBinaryParticleDataSync(): problem writing HdrFile"); } } } @@ -969,7 +969,7 @@ void WriteBinaryParticleDataAsync (PC const& pc, HdrFile.close(); if ( ! HdrFile.good()) { - amrex::Abort("ParticleContainer::Checkpoint(): problem writing HdrFile"); + amrex::Abort("amrex::WriteBinaryParticleDataAsync(): problem writing HdrFile"); } } diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt index 99d72633b65..29489e8d4fe 100644 --- a/Tests/CMakeLists.txt +++ b/Tests/CMakeLists.txt @@ -121,7 +121,9 @@ else() # # List of subdirectories to search for CMakeLists. # - set( AMREX_TESTS_SUBDIRS AsyncOut MultiBlock Reinit Amr CLZ Parser Parser2 CTOParFor RoundoffDomain) + set( AMREX_TESTS_SUBDIRS Amr AsyncOut CLZ CTOParFor DeviceGlobal Enum + MultiBlock MultiPeriod Parser Parser2 Reinit + RoundoffDomain) if (AMReX_PARTICLES) list(APPEND AMREX_TESTS_SUBDIRS Particles) diff --git a/Tests/DeviceGlobal/CMakeLists.txt b/Tests/DeviceGlobal/CMakeLists.txt new file mode 100644 index 00000000000..990662d406b --- /dev/null +++ b/Tests/DeviceGlobal/CMakeLists.txt @@ -0,0 +1,15 @@ +if (( (AMReX_GPU_BACKEND STREQUAL "CUDA") OR + (AMReX_GPU_BACKEND STREQUAL "HIP" ) ) AND + (NOT AMReX_GPU_RDC)) + return() +endif () + +foreach(D IN LISTS AMReX_SPACEDIM) + set(_sources main.cpp global_vars.cpp init.cpp work.cpp) + set(_input_files) + + setup_test(${D} _sources _input_files) + + unset(_sources) + unset(_input_files) +endforeach() diff --git a/Tests/DeviceGlobal/GNUmakefile b/Tests/DeviceGlobal/GNUmakefile new file mode 100644 index 00000000000..fd5fbd8f2c0 --- /dev/null +++ b/Tests/DeviceGlobal/GNUmakefile @@ -0,0 +1,23 @@ +AMREX_HOME ?= ../../ + +DEBUG = FALSE + +DIM = 3 + +COMP = gcc + +USE_CUDA = TRUE +USE_HIP = FALSE +USE_SYCL = FALSE + +USE_MPI = FALSE +USE_OMP = FALSE + +BL_NO_FORT = TRUE + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/DeviceGlobal/Make.package b/Tests/DeviceGlobal/Make.package new file mode 100644 index 00000000000..8df45d1f81b --- /dev/null +++ b/Tests/DeviceGlobal/Make.package @@ -0,0 +1,2 @@ +CEXE_sources += main.cpp init.cpp work.cpp global_vars.cpp + diff --git a/Tests/DeviceGlobal/global_vars.H b/Tests/DeviceGlobal/global_vars.H new file mode 100644 index 00000000000..88ce1f0c4fc --- /dev/null +++ b/Tests/DeviceGlobal/global_vars.H @@ -0,0 +1,7 @@ +#pragma once + +#include +#include + +extern AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Long, dg_x); +extern AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Long, 4, dg_y); diff --git a/Tests/DeviceGlobal/global_vars.cpp b/Tests/DeviceGlobal/global_vars.cpp new file mode 100644 index 00000000000..485f41f164c --- /dev/null +++ 
b/Tests/DeviceGlobal/global_vars.cpp @@ -0,0 +1,7 @@ + +#include "global_vars.H" + +// definitions of global variables + +AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Long, dg_x); +AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Long, 4, dg_y); diff --git a/Tests/DeviceGlobal/init.cpp b/Tests/DeviceGlobal/init.cpp new file mode 100644 index 00000000000..2850941f1ee --- /dev/null +++ b/Tests/DeviceGlobal/init.cpp @@ -0,0 +1,27 @@ + +#include "global_vars.H" + +void init () +{ + amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int) + { + dg_x = 1; + for (int n = 0; n < 4; ++n) { + dg_y[n] = 100 + n; + } + }); + + amrex::Gpu::streamSynchronize(); +} + +void init2 () +{ + amrex::Gpu::PinnedVector pv{2,200,201,202,203}; + amrex::Gpu::memcpy_from_host_to_device_global_async + (dg_x, pv.data(), sizeof(amrex::Long)); + amrex::Gpu::memcpy_from_host_to_device_global_async + (dg_y, pv.data()+1, sizeof(amrex::Long)); + amrex::Gpu::memcpy_from_host_to_device_global_async + (dg_y, pv.data()+2, sizeof(amrex::Long)*3, sizeof(amrex::Long)); + amrex::Gpu::streamSynchronize(); +} diff --git a/Tests/DeviceGlobal/main.cpp b/Tests/DeviceGlobal/main.cpp new file mode 100644 index 00000000000..b3b67784722 --- /dev/null +++ b/Tests/DeviceGlobal/main.cpp @@ -0,0 +1,20 @@ +#include +#include + +void init(); +void work(); +void init2(); +void work2(); + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc,argv); + { + init(); + work(); + + init2(); + work2(); + } + amrex::Finalize(); +} diff --git a/Tests/DeviceGlobal/work.cpp b/Tests/DeviceGlobal/work.cpp new file mode 100644 index 00000000000..8350dad0661 --- /dev/null +++ b/Tests/DeviceGlobal/work.cpp @@ -0,0 +1,40 @@ + +#include "global_vars.H" + +void work () +{ + amrex::Gpu::PinnedVector pv; + pv.resize(5,0); + auto* p = pv.data(); + amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int) + { + p[0] = dg_x; + for (int n = 0; n < 4; ++n) { + p[1+n] = dg_y[n]; + } + }); + amrex::Gpu::streamSynchronize(); + AMREX_ALWAYS_ASSERT(pv[0] == 1 && + pv[1] == 100 && + pv[2] == 101 && + pv[3] == 102 && + pv[4] == 103); +} + +void work2 () +{ + amrex::Gpu::PinnedVector pv; + pv.resize(5,0); + amrex::Gpu::memcpy_from_device_global_to_host_async + (pv.data(), dg_x, sizeof(amrex::Long)); + amrex::Gpu::memcpy_from_device_global_to_host_async + (pv.data()+1, dg_y, sizeof(amrex::Long)); + amrex::Gpu::memcpy_from_device_global_to_host_async + (pv.data()+2, dg_y, sizeof(amrex::Long)*3, sizeof(amrex::Long)); + amrex::Gpu::streamSynchronize(); + AMREX_ALWAYS_ASSERT(pv[0] == 2 && + pv[1] == 200 && + pv[2] == 201 && + pv[3] == 202 && + pv[4] == 203); +} diff --git a/Tests/DivFreePatch/main.cpp b/Tests/DivFreePatch/main.cpp index 7358946c434..857461a1f64 100644 --- a/Tests/DivFreePatch/main.cpp +++ b/Tests/DivFreePatch/main.cpp @@ -94,11 +94,14 @@ void CoarsenToFine(MultiFab& div_refined_coarse, Real MFdiff(const MultiFab& lhs, const MultiFab& rhs, - int strt_comp, int num_comp, int nghost, const std::string name = "") + int strt_comp, int num_comp, int nghost, const std::string name = "", + bool relative = false) { MultiFab temp(lhs.boxArray(), lhs.DistributionMap(), lhs.nComp(), nghost); Copy(temp, lhs, strt_comp, strt_comp, num_comp, nghost); temp.minus(rhs, strt_comp, num_comp, nghost); + if (relative) { + temp.divide(rhs, strt_comp, num_comp, nghost); } if (name != "") { amrex::VisMF::Write(temp, std::string("pltfiles/" + name)); } @@ -301,7 +304,7 @@ void main_main () // Setup initial value on the coarse faces. for (int i=0; i max_i) ? 
max_diff : max_i; + amrex::Print() << " Fine valid region maximum change, comp "< +#include +#include + +using namespace amrex; + +AMREX_ENUM(MyColor, red, green, blue ); + +namespace my_namespace { + AMREX_ENUM(MyColor, orange, yellow,cyan ); +} + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc, argv); + { + auto const& names = amrex::getEnumNameStrings<MyColor>(); + auto const& names2 = amrex::getEnumNameStrings<my_namespace::MyColor>(); + amrex::Print() << "colors:"; + for (auto const& name : names) { + amrex::Print() << " " << name; + } + amrex::Print() << "\n"; + amrex::Print() << "colors:"; + for (auto const& name : names2) { + amrex::Print() << " " << name; + } + amrex::Print() << "\n"; + + ParmParse pp; + { + auto color = static_cast<MyColor>(999); + pp.query("color1", color); + amrex::Print() << "color = " << amrex::getEnumNameString(color) << '\n'; + AMREX_ALWAYS_ASSERT(color == MyColor::red); + } + { + auto color = static_cast<MyColor>(999); + pp.get("color2", color); + amrex::Print() << "color = " << amrex::getEnumNameString(color) << '\n'; + AMREX_ALWAYS_ASSERT(color == MyColor::green); + } + { + auto color = static_cast<MyColor>(999); + pp.get("color3", color); + amrex::Print() << "color = " << amrex::getEnumNameString(color) << '\n'; + AMREX_ALWAYS_ASSERT(color == MyColor::blue); + } + { + auto color = static_cast<MyColor>(999); + try { + pp.query("color4", color); + } catch (std::runtime_error const& e) { + amrex::Print() << "As expected, " << e.what() << '\n'; + } + AMREX_ALWAYS_ASSERT(color == static_cast<MyColor>(999)); + try { + pp.get_enum_case_insensitive("color4", color); + } catch (std::runtime_error const& e) { + amrex::Print() << "As expected, " << e.what() << '\n'; + } + AMREX_ALWAYS_ASSERT(color == static_cast<MyColor>(999)); + } + { + auto color = static_cast<MyColor>(999); + try { + pp.query("color5", color); + } catch (std::runtime_error const& e) { + amrex::Print() << "As expected, " << e.what() << '\n'; + } + AMREX_ALWAYS_ASSERT(color == static_cast<MyColor>(999)); + pp.query_enum_case_insensitive("color5", color); + amrex::Print() << "color = " << amrex::getEnumNameString(color) << '\n'; + AMREX_ALWAYS_ASSERT(color == MyColor::blue); + } + { + std::vector<my_namespace::MyColor> color; + pp.getarr("colors", color); + AMREX_ALWAYS_ASSERT(color.size() == 3 && + color[0] == my_namespace::MyColor::cyan && + color[1] == my_namespace::MyColor::yellow && + color[2] == my_namespace::MyColor::orange); + std::vector<my_namespace::MyColor> color2; + pp.queryarr("colors", color2); + AMREX_ALWAYS_ASSERT(color.size() == 3 && + color == color2 && + color[0] == my_namespace::MyColor::cyan && + color[1] == my_namespace::MyColor::yellow && + color[2] == my_namespace::MyColor::orange); + amrex::Print() << "colors:"; + for (auto const& c : color) { + amrex::Print() << " " << amrex::getEnumNameString(c); + } + amrex::Print() << "\n"; + } + } + + amrex::Finalize(); +} diff --git a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.H b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.H index 5834e9dc624..eb93bf2836c 100644 --- a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.H +++ b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.H @@ -61,6 +61,7 @@ private: bool semicoarsening = false; int max_coarsening_level = 30; int max_semicoarsening_level = 0; + bool use_gauss_seidel = true; // true: red-black, false: jacobi bool use_hypre = false; bool use_petsc = false; diff --git a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp index 27cd0b7a4b4..7a7647ce93a 100644 --- a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp +++ b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp @@ -62,9 +62,10 @@ 
MyTest::solvePoisson () if (composite_solve) { - MLPoisson mlpoisson(geom, grids, dmap, info); + mlpoisson.setGaussSeidel(use_gauss_seidel); + mlpoisson.setMaxOrder(linop_maxorder); // This is a 3d problem with Dirichlet BC @@ -105,6 +106,8 @@ MyTest::solvePoisson () { MLPoisson mlpoisson({geom[ilev]}, {grids[ilev]}, {dmap[ilev]}, info); + mlpoisson.setGaussSeidel(use_gauss_seidel); + mlpoisson.setMaxOrder(linop_maxorder); // This is a 3d problem with Dirichlet BC @@ -163,6 +166,8 @@ MyTest::solveABecLaplacian () MLABecLaplacian mlabec(geom, grids, dmap, info); + mlabec.setGaussSeidel(use_gauss_seidel); + mlabec.setMaxOrder(linop_maxorder); mlabec.setDomainBC({AMREX_D_DECL(LinOpBCType::Dirichlet, @@ -220,6 +225,8 @@ MyTest::solveABecLaplacian () { MLABecLaplacian mlabec({geom[ilev]}, {grids[ilev]}, {dmap[ilev]}, info); + mlabec.setGaussSeidel(use_gauss_seidel); + mlabec.setMaxOrder(linop_maxorder); mlabec.setDomainBC({AMREX_D_DECL(LinOpBCType::Dirichlet, @@ -290,6 +297,8 @@ MyTest::solveABecLaplacianInhomNeumann () MLABecLaplacian mlabec(geom, grids, dmap, info); + mlabec.setGaussSeidel(use_gauss_seidel); + mlabec.setMaxOrder(linop_maxorder); // This is a 3d problem with inhomogeneous Neumann BC @@ -349,6 +358,8 @@ MyTest::solveABecLaplacianInhomNeumann () { MLABecLaplacian mlabec({geom[ilev]}, {grids[ilev]}, {dmap[ilev]}, info); + mlabec.setGaussSeidel(use_gauss_seidel); + mlabec.setMaxOrder(linop_maxorder); // This is a 3d problem with inhomogeneous Neumann BC @@ -547,6 +558,8 @@ MyTest::readParameters () pp.query("max_coarsening_level", max_coarsening_level); pp.query("max_semicoarsening_level", max_semicoarsening_level); + pp.query("use_gauss_seidel", use_gauss_seidel); + pp.query("use_gmres", use_gmres); AMREX_ALWAYS_ASSERT(use_gmres == false || prob_type == 2); diff --git a/Tests/LinearSolvers/CurlCurl/MyTest.H b/Tests/LinearSolvers/CurlCurl/MyTest.H index e665b59850c..036db35e05c 100644 --- a/Tests/LinearSolvers/CurlCurl/MyTest.H +++ b/Tests/LinearSolvers/CurlCurl/MyTest.H @@ -30,6 +30,7 @@ private: bool consolidation = true; int max_coarsening_level = 30; + bool use_pcg = false; bool use_gmres = false; bool gmres_use_precond = true; int gmres_precond_niters = 1; diff --git a/Tests/LinearSolvers/CurlCurl/MyTest.cpp b/Tests/LinearSolvers/CurlCurl/MyTest.cpp index aba1536b1ae..1093e8c8deb 100644 --- a/Tests/LinearSolvers/CurlCurl/MyTest.cpp +++ b/Tests/LinearSolvers/CurlCurl/MyTest.cpp @@ -46,6 +46,8 @@ MyTest::solve () } mlcc.prepareRHS({&rhs}); + if (use_pcg) { mlcc.setUsePCG(true); } + using V = Array; MLMGT mlmg(mlcc); mlmg.setMaxIter(max_iter); @@ -105,6 +107,7 @@ MyTest::readParameters () pp.query("consolidation", consolidation); pp.query("max_coarsening_level", max_coarsening_level); + pp.query("use_pcg", use_pcg); pp.query("use_gmres", use_gmres); pp.query("gmres_use_precond", gmres_use_precond); pp.query("gmres_precond_niters", gmres_precond_niters); diff --git a/Tests/MultiPeriod/CMakeLists.txt b/Tests/MultiPeriod/CMakeLists.txt new file mode 100644 index 00000000000..d5760b11dce --- /dev/null +++ b/Tests/MultiPeriod/CMakeLists.txt @@ -0,0 +1,9 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + set(_sources main.cpp) + set(_input_files ) + + setup_test(${D} _sources _input_files) + + unset(_sources) + unset(_input_files) +endforeach() diff --git a/Tests/MultiPeriod/GNUmakefile b/Tests/MultiPeriod/GNUmakefile new file mode 100644 index 00000000000..d0d895ff522 --- /dev/null +++ b/Tests/MultiPeriod/GNUmakefile @@ -0,0 +1,24 @@ +AMREX_HOME := ../.. 
+ +DEBUG = FALSE + +DIM = 3 + +COMP = gcc + +USE_MPI = FALSE +USE_OMP = FALSE +USE_CUDA = FALSE +USE_HIP = FALSE +USE_SYCL = FALSE + +BL_NO_FORT = TRUE + +TINY_PROFILE = FALSE + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/MultiPeriod/Make.package b/Tests/MultiPeriod/Make.package new file mode 100644 index 00000000000..6b4b865e8fc --- /dev/null +++ b/Tests/MultiPeriod/Make.package @@ -0,0 +1 @@ +CEXE_sources += main.cpp diff --git a/Tests/MultiPeriod/main.cpp b/Tests/MultiPeriod/main.cpp new file mode 100644 index 00000000000..89075c96b79 --- /dev/null +++ b/Tests/MultiPeriod/main.cpp @@ -0,0 +1,73 @@ +#include +#include +#include +#include +#include + +using namespace amrex; + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc, argv); + { + // Domain size: 2 x 128 x 4 + Box box(IntVect(0), IntVect(AMREX_D_DECL(1, 127, 3))); + Array is_periodic{AMREX_D_DECL(1,1,1)}; + Geometry geom(box, RealBox(AMREX_D_DECL(Real(0),Real(0),Real(0)), + AMREX_D_DECL(Real(1),Real(1),Real(1))), + CoordSys::cartesian, is_periodic); + BoxArray ba(box); + ba.maxSize(32); + ba.convert(IntVect(AMREX_D_DECL(1,0,0))); // nodal in x-direction + DistributionMapping dm(ba); + + FabArray> mf1(ba,dm,1,IntVect(4)); + FabArray> mf2(ba,dm,1,IntVect(5)); + + mf1.setVal(-1); + mf2.setVal(-2); + + auto const& len = geom.Domain().length3d(); + auto expected = [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + while (i < 0 ) { i += len[0]; } + while (i >= len[0]) { i -= len[0]; } + while (j < 0 ) { j += len[1]; } + while (j >= len[1]) { j -= len[1]; } + while (k < 0 ) { k += len[2]; } + while (k >= len[2]) { k -= len[2]; } + return Long(i) + Long(j)*Long(len[0]) + Long(k)*Long(len[0])*Long(len[1]); + }; + + auto const& ma1 = mf1.arrays(); + auto const& ma2 = mf2.arrays(); + + // Initialize valid region + ParallelFor(mf1, IntVect(0), [=] AMREX_GPU_DEVICE (int b, int i, int j, int k) + { + ma1[b](i,j,k) = expected(i,j,k); + }); + + mf1.FillBoundary(geom.periodicity()); + mf2.ParallelCopy(mf1, 0, 0, 1, IntVect(0), mf2.nGrowVect(), geom.periodicity()); + + auto r1 = ParReduce(TypeList{}, TypeList{}, mf1, mf1.nGrowVect(), + [=] AMREX_GPU_DEVICE (int b, int i, int j, int k) -> GpuTuple + { + return { Long(expected(i,j,k) != ma1[b](i,j,k)) }; + }); + auto r2 = ParReduce(TypeList{}, TypeList{}, mf2, mf2.nGrowVect(), + [=] AMREX_GPU_DEVICE (int b, int i, int j, int k) -> GpuTuple + { + return { Long(expected(i,j,k) != ma2[b](i,j,k)) }; + }); + + AMREX_ALWAYS_ASSERT(r1 == 0); + AMREX_ALWAYS_ASSERT(r2 == 0); + + if (r1 == 0 && r2 == 0) { + amrex::Print() << "SUCCESS\n"; + } + } + amrex::Finalize(); +} diff --git a/Tests/Parser/main.cpp b/Tests/Parser/main.cpp index 167c0a54a31..df82ba2f3f0 100644 --- a/Tests/Parser/main.cpp +++ b/Tests/Parser/main.cpp @@ -373,7 +373,7 @@ int main (int argc, char* argv[]) int count = 0; int x = 11; { - auto f = [&] (std::string const& s) -> int + auto f = [&] (std::string const& s) { amrex::Print() << count++ << ". Testing \"" << s << "\"\n"; IParser iparser(s); @@ -390,7 +390,7 @@ int main (int argc, char* argv[]) AMREX_ALWAYS_ASSERT(f("x/13/5") == ((x/13)/5)); AMREX_ALWAYS_ASSERT(f("13/x/5") == ((13/x)/5)); - auto g = [&] (std::string const& s, std::string const& c, int cv) -> int + auto g = [&] (std::string const& s, std::string const& c, int cv) { amrex::Print() << count++ << ". 
Testing \"" << s << "\"\n"; IParser iparser(s); @@ -408,7 +408,7 @@ int main (int argc, char* argv[]) AMREX_ALWAYS_ASSERT(g("x/b/5", "b", 13) == ((x/13)/5)); AMREX_ALWAYS_ASSERT(g("b/x/5", "b", 13) == ((13/x)/5)); - auto h = [&] (std::string const& s) -> int + auto h = [&] (std::string const& s) { amrex::Print() << count++ << ". Testing \"" << s << "\"\n"; IParser iparser(s); @@ -435,6 +435,31 @@ int main (int argc, char* argv[]) } } } + + AMREX_ALWAYS_ASSERT(h("123456789012345") == 123456789012345LL); + AMREX_ALWAYS_ASSERT(h("123456789012345.") == 123456789012345LL); + AMREX_ALWAYS_ASSERT(h("123'456'789'012'345") == 123456789012345LL); + AMREX_ALWAYS_ASSERT(h("1.23456789012345e14") == 123456789012345LL); + AMREX_ALWAYS_ASSERT(h("1.0E3") == 1000); + AMREX_ALWAYS_ASSERT(h("2**40") == 1024LL*1024LL*1024LL*1024LL); + + auto test_bad_number = [&] (std::string const& s) + { + amrex::Print() << count++ << ". Testing \"" << s << "\"\n"; + try { + IParser iparser(s); + auto exe = iparser.compileHost<0>(); + auto r = exe(); + amrex::ignore_unused(r); + return false; + } catch (std::runtime_error const& e) { + amrex::Print() << " Expected error: " << e.what() << '\n'; + return true; + } + }; + AMREX_ALWAYS_ASSERT(test_bad_number("1000000e-4")); + AMREX_ALWAYS_ASSERT(test_bad_number("1.234e2")); + AMREX_ALWAYS_ASSERT(test_bad_number("3.14")); } amrex::Print() << "\nAll IParser tests passed\n\n"; } diff --git a/Tests/Particles/NeighborParticles/Constants.H b/Tests/Particles/NeighborParticles/Constants.H index 18c94dbe255..78cfa3ca7d8 100644 --- a/Tests/Particles/NeighborParticles/Constants.H +++ b/Tests/Particles/NeighborParticles/Constants.H @@ -8,8 +8,8 @@ namespace Params // This is designed to represent MFiX-like conditions where the grid spacing is // roughly 2.5 times the particle diameter. 
In main.cpp we set grid spacing to 1 // so here we set cutoff to diameter = 1/2.5 --> cutoff = 0.2 - static constexpr amrex::Real cutoff = 0.2 ; - static constexpr amrex::Real min_r = 1.e-4; + static constexpr amrex::ParticleReal cutoff = 0.2 ; + static constexpr amrex::ParticleReal min_r = 1.e-4; } #endif diff --git a/Tests/Particles/NeighborParticles/MDParticleContainer.cpp b/Tests/Particles/NeighborParticles/MDParticleContainer.cpp index 21543c7aed8..0a2570f798c 100644 --- a/Tests/Particles/NeighborParticles/MDParticleContainer.cpp +++ b/Tests/Particles/NeighborParticles/MDParticleContainer.cpp @@ -1,8 +1,9 @@ +#include +#include + #include "MDParticleContainer.H" #include "Constants.H" - #include "CheckPair.H" -#include using namespace amrex; @@ -148,8 +149,9 @@ std::pair<Real, Real> MDParticleContainer::minAndMaxDistance() const int lev = 0; auto& plev = GetParticles(lev); - Real min_d = std::numeric_limits<Real>::max(); - Real max_d = std::numeric_limits<Real>::min(); + ReduceOps<ReduceOpMin, ReduceOpMax> reduce_op; + ReduceData<ParticleReal, ParticleReal> reduce_data(reduce_op); + using ReduceTuple = typename decltype(reduce_data)::Type; for(MFIter mfi = MakeMFIter(lev); mfi.isValid(); ++mfi) { @@ -159,41 +161,40 @@ std::pair<Real, Real> MDParticleContainer::minAndMaxDistance() auto& ptile = plev[index]; auto& aos = ptile.GetArrayOfStructs(); - const size_t np = aos.numParticles(); auto nbor_data = m_neighbor_list[lev][index].data(); ParticleType* pstruct = aos().dataPtr(); - Gpu::DeviceScalar<Real> min_d_gpu(min_d); - Gpu::DeviceScalar<Real> max_d_gpu(max_d); - - Real* pmin_d = min_d_gpu.dataPtr(); - Real* pmax_d = max_d_gpu.dataPtr(); - - AMREX_FOR_1D ( np, i, - { - ParticleType& p1 = pstruct[i]; + ParticleReal min_start = std::numeric_limits<ParticleReal>::max(); + ParticleReal max_start = std::numeric_limits<ParticleReal>::lowest(); - for (const auto& p2 : nbor_data.getNeighbors(i)) - { - AMREX_D_TERM(Real dx = p1.pos(0) - p2.pos(0);, - Real dy = p1.pos(1) - p2.pos(1);, - Real dz = p1.pos(2) - p2.pos(2);) + reduce_op.eval(aos.numParticles(), reduce_data, + [=] AMREX_GPU_DEVICE (int i) -> ReduceTuple + { + ParticleType& p1 = pstruct[i]; - Real r2 = AMREX_D_TERM(dx*dx, + dy*dy, + dz*dz); - r2 = amrex::max(r2, Params::min_r*Params::min_r); - Real r = sqrt(r2); + ParticleReal min_d = min_start; + ParticleReal max_d = max_start; - Gpu::Atomic::Min(pmin_d, r); - Gpu::Atomic::Max(pmax_d, r); - } - }); + for (const auto& p2 : nbor_data.getNeighbors(i)) + { + AMREX_D_TERM(ParticleReal dx = p1.pos(0) - p2.pos(0);, + ParticleReal dy = p1.pos(1) - p2.pos(1);, + ParticleReal dz = p1.pos(2) - p2.pos(2);) - Gpu::Device::streamSynchronize(); + ParticleReal r2 = AMREX_D_TERM(dx*dx, + dy*dy, + dz*dz); + r2 = amrex::max(r2, Params::min_r*Params::min_r); + auto r = ParticleReal(std::sqrt(r2)); - min_d = std::min(min_d, min_d_gpu.dataValue()); - max_d = std::max(max_d, max_d_gpu.dataValue()); + min_d = std::min(min_d, r); + max_d = std::max(max_d, r); + } + return {min_d, max_d}; + }); } + + ParticleReal min_d = amrex::get<0>(reduce_data.value(reduce_op)); + ParticleReal max_d = amrex::get<1>(reduce_data.value(reduce_op)); ParallelDescriptor::ReduceRealMin(min_d, ParallelDescriptor::IOProcessorNumber()); ParallelDescriptor::ReduceRealMax(max_d, ParallelDescriptor::IOProcessorNumber()); diff --git a/Tools/CMake/AMReXConfig.cmake.in b/Tools/CMake/AMReXConfig.cmake.in index 7731e988e68..f5045b715cb 100644 --- a/Tools/CMake/AMReXConfig.cmake.in +++ b/Tools/CMake/AMReXConfig.cmake.in @@ -79,6 +79,7 @@ set(AMReX_PARTICLES_FOUND @AMReX_PARTICLES@) set(AMReX_P@AMReX_PARTICLES_PRECISION@_FOUND ON) set(AMReX_SENSEI_FOUND @AMReX_SENSEI@) 
set(AMReX_CONDUIT_FOUND @AMReX_CONDUIT@) +set(AMReX_CATALYST_FOUND @AMReX_CATALYST@) set(AMReX_ASCENT_FOUND @AMReX_ASCENT@) set(AMReX_HYPRE_FOUND @AMReX_HYPRE@) set(AMReX_PETSC_FOUND @AMReX_PETSC@) @@ -134,6 +135,7 @@ set(AMReX_PARTICLES_PRECISION @AMReX_PARTICLES_PRECISION@) set(AMReX_SENSEI @AMReX_SENSEI@) set(AMReX_NO_SENSEI_AMR_INST @AMReX_NO_SENSEI_AMR_INST@) set(AMReX_CONDUIT @AMReX_CONDUIT@) +set(AMReX_CATALYST @AMReX_CATALYST@) set(AMReX_ASCENT @AMReX_ASCENT@) set(AMReX_HYPRE @AMReX_HYPRE@) set(AMReX_PETSC @AMReX_PETSC@) @@ -202,6 +204,10 @@ if (@AMReX_ASCENT@) find_dependency(Ascent REQUIRED) endif () +if (@AMReX_CATALYST@) + find_dependency(Catalyst REQUIRED) +endif () + if (@AMReX_CONDUIT@) find_dependency(Conduit REQUIRED) endif () diff --git a/Tools/CMake/AMReXOptions.cmake b/Tools/CMake/AMReXOptions.cmake index c99c3a63791..3e5d4c8bdb4 100644 --- a/Tools/CMake/AMReXOptions.cmake +++ b/Tools/CMake/AMReXOptions.cmake @@ -284,8 +284,7 @@ print_option(AMReX_FORTRAN_INTERFACES) option( AMReX_LINEAR_SOLVERS "Build AMReX Linear solvers" ON ) print_option( AMReX_LINEAR_SOLVERS ) -cmake_dependent_option( AMReX_AMRDATA "Build data services" OFF - "AMReX_FORTRAN" OFF ) +option( AMReX_AMRDATA "Build data services" OFF ) print_option( AMReX_AMRDATA ) option( AMReX_PARTICLES "Build particle classes" ON) @@ -321,6 +320,11 @@ print_option( AMReX_NO_SENSEI_AMR_INST ) option( AMReX_CONDUIT "Enable Conduit support" OFF ) print_option( AMReX_CONDUIT ) +# Catalyst +cmake_dependent_option( AMReX_CATALYST "Enable Catalyst support" OFF + "AMReX_CONDUIT" OFF ) +print_option( AMReX_CATALYST ) + # Ascent cmake_dependent_option( AMReX_ASCENT "Enable Ascent support" OFF "AMReX_CONDUIT" OFF ) diff --git a/Tools/CMake/AMReXSetDefines.cmake b/Tools/CMake/AMReXSetDefines.cmake index 783b1ab83d6..d271321bde4 100644 --- a/Tools/CMake/AMReXSetDefines.cmake +++ b/Tools/CMake/AMReXSetDefines.cmake @@ -148,6 +148,9 @@ add_amrex_define( AMREX_NO_SENSEI_AMR_INST NO_LEGACY IF AMReX_NO_SENSEI_AMR_INST # Conduit Support add_amrex_define( AMREX_USE_CONDUIT NO_LEGACY IF AMReX_CONDUIT ) +# Catalyst Support +add_amrex_define( AMREX_USE_CATALYST NO_LEGACY IF AMReX_CATALYST ) + # Ascent Support add_amrex_define( AMREX_USE_ASCENT NO_LEGACY IF AMReX_ASCENT ) diff --git a/Tools/CMake/AMReXThirdPartyLibraries.cmake b/Tools/CMake/AMReXThirdPartyLibraries.cmake index f8f49e9c478..abe62a2ebc9 100644 --- a/Tools/CMake/AMReXThirdPartyLibraries.cmake +++ b/Tools/CMake/AMReXThirdPartyLibraries.cmake @@ -90,6 +90,15 @@ if (AMReX_ASCENT) # Ascent will find conduit, so check for Ascent first endforeach() endif () +# +# Catalyst +# +if (AMReX_CATALYST) + find_package(Catalyst REQUIRED PATHS "$ENV{CATALYST_IMPLEMENTATION_PATHS}") + foreach(D IN LISTS AMReX_SPACEDIM) + target_link_libraries(amrex_${D}d PUBLIC catalyst::catalyst) + endforeach() +endif () # # Conduit diff --git a/Tools/CMake/AMReX_Config_ND.H.in b/Tools/CMake/AMReX_Config_ND.H.in index 07df0fb53cb..3296a403ff0 100644 --- a/Tools/CMake/AMReX_Config_ND.H.in +++ b/Tools/CMake/AMReX_Config_ND.H.in @@ -42,6 +42,7 @@ #cmakedefine AMREX_USE_SENSEI_INSITU #cmakedefine AMREX_NO_SENSEI_AMR_INST #cmakedefine AMREX_USE_CONDUIT +#cmakedefine AMREX_USE_CATALYST #cmakedefine AMREX_USE_ASCENT #cmakedefine AMREX_USE_EB #cmakedefine AMREX_USE_CUDA diff --git a/Tools/C_util/Convergence/Make.package b/Tools/C_util/Convergence/Make.package index 06127427ca0..95fa155816a 100644 --- a/Tools/C_util/Convergence/Make.package +++ b/Tools/C_util/Convergence/Make.package @@ -4,7 +4,7 @@ CEXE_sources += 
AMReX_DataServices.cpp AMReX_AmrData.cpp CEXE_sources += ComputeAmrDataNorms.cpp CEXE_headers += ComputeAmrDataNorms.H -FEXE_sources += AMReX_FABUTIL_$(DIM)D.F AMReX_AVGDOWN_$(DIM)D.F +FEXE_sources += AMReX_AVGDOWN_$(DIM)D.F #ifeq ($(USE_ARRAYVIEW),TRUE) # CEXE_sources += DatasetClient.cpp diff --git a/Tools/GNUMake/Make.defs b/Tools/GNUMake/Make.defs index a1a2aa105b0..6a1c51c9292 100644 --- a/Tools/GNUMake/Make.defs +++ b/Tools/GNUMake/Make.defs @@ -216,6 +216,12 @@ else USE_CONDUIT := FALSE endif +ifdef USE_CATALYST + USE_CATALYST := $(strip $(USE_CATALYST)) +else + USE_CATALYST := FALSE +endif + ifdef USE_ASCENT USE_ASCENT := $(strip $(USE_ASCENT)) else @@ -1057,6 +1063,11 @@ ifeq ($(USE_CONDUIT),TRUE) include $(AMREX_HOME)/Tools/GNUMake/packages/Make.conduit endif +ifeq ($(USE_CATALYST),TRUE) + $(info Loading $(AMREX_HOME)/Tools/GNUMake/packages/Make.catalyst...) + include $(AMREX_HOME)/Tools/GNUMake/packages/Make.catalyst +endif + ifeq ($(USE_ASCENT),TRUE) $(info Loading $(AMREX_HOME)/Tools/GNUMake/packages/Make.ascent...) include $(AMREX_HOME)/Tools/GNUMake/packages/Make.ascent diff --git a/Tools/GNUMake/comps/dpcpp.mak b/Tools/GNUMake/comps/dpcpp.mak index 3bcf5cb4372..4e9a7e4652e 100644 --- a/Tools/GNUMake/comps/dpcpp.mak +++ b/Tools/GNUMake/comps/dpcpp.mak @@ -45,6 +45,14 @@ ifeq ($(WARN_ALL),TRUE) warning_flags += -Wpedantic + # /tmp/icpx-2d34de0e47/global_vars-header-4390fb.h:25:36: error: zero size arrays are an extension [-Werror,-Wzero-length-array] + # 25 | const char* const kernel_names[] = { + # | ^ + # 1 error generated. + # + # Seen in oneapi 2024.2.0 after adding Test/DeviceGlobal + warning_flags += -Wno-zero-length-array + ifneq ($(WARN_SHADOW),FALSE) warning_flags += -Wshadow endif diff --git a/Tools/GNUMake/packages/Make.catalyst b/Tools/GNUMake/packages/Make.catalyst new file mode 100644 index 00000000000..810daa906f2 --- /dev/null +++ b/Tools/GNUMake/packages/Make.catalyst @@ -0,0 +1,16 @@ +######################################################### +# Catalyst (https://gitlab.kitware.com/paraview/catalyst) Support +######################################################### + +CPPFLAGS += -DAMREX_USE_CATALYST + +ifdef CATALYST_DIR + INCLUDE_LOCATIONS += $(CATALYST_DIR)/include + VPATH_LOCATIONS += $(CATALYST_DIR)/include + LIBRARY_LOCATIONS += $(CATALYST_DIR)/lib + LIBRARIES += -Wl,-rpath,$(CATALYST_DIR)/lib + + LIBRARIES += -lcatalyst + +endif + diff --git a/Tools/Plotfile/fboxinfo.cpp b/Tools/Plotfile/fboxinfo.cpp index 6d84ab519c7..85886d6c0c5 100644 --- a/Tools/Plotfile/fboxinfo.cpp +++ b/Tools/Plotfile/fboxinfo.cpp @@ -6,18 +6,18 @@ using namespace amrex; namespace { -class BoxND +class BoxNDDynamic { public: - friend std::ostream& operator<< (std::ostream& os, const BoxND& b); - BoxND (Box const& b, int dim) : m_box(b), m_dim(dim) {} + friend std::ostream& operator<< (std::ostream& os, const BoxNDDynamic& b); + BoxNDDynamic (Box const& b, int dim) : m_box(b), m_dim(dim) {} private: Box m_box; int m_dim; }; std::ostream& -operator<< (std::ostream& os, const BoxND& b) +operator<< (std::ostream& os, const BoxNDDynamic& b) { if (b.m_dim == 1) { os << "(" @@ -175,10 +175,10 @@ void main_main() const BoxArray& ba = plotfile.boxArray(ilev); const Long nboxes = ba.size(); const Box prob_domain = plotfile.probDomain(ilev); - amrex::Print() << " " << BoxND(prob_domain,dim) + amrex::Print() << " " << BoxNDDynamic(prob_domain,dim) << " " << nboxes << "\n"; for (int ibox = 0; ibox < nboxes; ++ibox) { - amrex::Print() << " " << BoxND(ba[ibox],dim) << "\n"; + amrex::Print() 
<< " " << BoxNDDynamic(ba[ibox],dim) << "\n"; } } }