diff --git a/.codespell-ignore-words b/.codespell-ignore-words index 4e265268e63..4069bf5a026 100644 --- a/.codespell-ignore-words +++ b/.codespell-ignore-words @@ -27,6 +27,7 @@ parms pres ptd recuse +shft siz structed te diff --git a/.github/workflows/apps.yml b/.github/workflows/apps.yml index 45afcab4d77..0e7f3641984 100644 --- a/.github/workflows/apps.yml +++ b/.github/workflows/apps.yml @@ -11,7 +11,7 @@ jobs: name: Castro runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Get Latest Release Tag run: | CASTRO_TAG=$(wget https://github.com/AMReX-Astro/Castro/releases/latest 2>&1 | grep Location: | awk '{print $2}' | awk -F/ '{print $NF}') @@ -19,13 +19,13 @@ jobs: MICROPHYSICS_TAG=$(wget https://github.com/AMReX-Astro/Microphysics/releases/latest 2>&1 | grep Location: | awk '{print $2}' | awk -F/ '{print $NF}') echo "MICROPHYSICS_TAG=$MICROPHYSICS_TAG" >> $GITHUB_ENV - name: Download Castro - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: 'AMReX-Astro/Castro' ref: ${{env.CASTRO_TAG}} path: 'Castro' - name: Download Microphysics - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: 'AMReX-Astro/Microphysics' ref: ${{env.MICROPHYSICS_TAG}} @@ -35,7 +35,7 @@ jobs: .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -52,7 +52,7 @@ jobs: export AMREX_HOME=${PWD} export MICROPHYSICS_HOME=${PWD}/Microphysics cd Castro/Exec/hydro_tests/Sedov/ - make -j2 CCACHE=ccache USE_MPI=FALSE + make -j4 CCACHE=ccache USE_MPI=FALSE ccache -s du -hs ~/.cache/ccache @@ -61,13 +61,13 @@ jobs: name: WarpX runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Get Latest Release Tag run: | WARPX_TAG=$(wget https://github.com/ECP-WarpX/WarpX/releases/latest 2>&1 | grep Location: | awk '{print $2}' | awk -F/ '{print $NF}') echo "WARPX_TAG=$WARPX_TAG" >> $GITHUB_ENV - name: Download WarpX - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: 'ECP-WarpX/WarpX' ref: ${{env.WARPX_TAG}} @@ -77,7 +77,7 @@ jobs: .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -97,7 +97,7 @@ jobs: -DWarpX_OPENPMD=OFF \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build WarpX/build -j 2 + cmake --build WarpX/build -j 4 ccache -s du -hs ~/.cache/ccache @@ -111,7 +111,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/ascent.yml b/.github/workflows/ascent.yml index 83d2f7ebac3..74421cebac3 100644 --- a/.github/workflows/ascent.yml +++ b/.github/workflows/ascent.yml @@ -18,7 +18,7 @@ jobs: container: image: alpinedav/ascent:0.9.2 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Configure run: | . 
/ascent_docker_setup_env.sh @@ -26,9 +26,10 @@ jobs: -DCMAKE_BUILD_TYPE=Debug \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ - -DAMReX_ASCENT=ON + -DAMReX_ASCENT=ON \ + -DAMReX_CONDUIT=ON - name: Build run: | . /ascent_docker_setup_env.sh - cmake --build build -j 2 + cmake --build build -j 4 diff --git a/.github/workflows/bittree.yml b/.github/workflows/bittree.yml new file mode 100644 index 00000000000..cf50e16631c --- /dev/null +++ b/.github/workflows/bittree.yml @@ -0,0 +1,127 @@ +name: bittree + +on: [push, pull_request] + +concurrency: + group: ${{ github.ref }}-${{ github.head_ref }}-bittree + cancel-in-progress: true + +jobs: + bittree-2d: + name: Bittree 2D + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Dependencies + run: | + .github/workflows/dependencies/dependencies.sh + .github/workflows/dependencies/dependencies_clang-tidy.sh 15 + .github/workflows/dependencies/dependencies_ccache.sh + - name: Set Up Cache + uses: actions/cache@v4 + with: + path: ~/.cache/ccache + key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} + restore-keys: | + ccache-${{ github.workflow }}-${{ github.job }}-git- + - name: Check out Bittree + uses: actions/checkout@v4 + with: + repository: Flash-X/Bittree + path: bittree + - name: Build Bittree + run: | + cd ${{ github.workspace }}/bittree + python setup.py library --dim 2 --prefix ${{ github.workspace }}/libbittree + cd build + make -j4 + make install + - name: Build and Run Test + run: | + export CCACHE_COMPRESS=1 + export CCACHE_COMPRESSLEVEL=10 + export CCACHE_MAXSIZE=80M + export CCACHE_EXTRAFILES=${{ github.workspace }}/.clang-tidy + export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt + ccache -z + + export AMREX_BITTREE_HOME=${{ github.workspace }}/libbittree + cd ${{ github.workspace }}/Tests/Amr/Advection_AmrCore/Exec + make -j4 USE_MPI=TRUE USE_BITTREE=TRUE DIM=2 TEST=TRUE \ + CCACHE=ccache + mpiexec -n 2 ./main2d.gnu.TEST.MPI.ex inputs_bittree amr.plot_int=1000 + + ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt + make -j4 -k -f clang-tidy-ccache-misses.mak \ + CLANG_TIDY=clang-tidy-15 \ + CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" + + ccache -s + du -hs ~/.cache/ccache + + bittree-3d: + name: Bittree 3D + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Dependencies + run: | + .github/workflows/dependencies/dependencies.sh + .github/workflows/dependencies/dependencies_clang-tidy.sh 15 + .github/workflows/dependencies/dependencies_ccache.sh + - name: Set Up Cache + uses: actions/cache@v4 + with: + path: ~/.cache/ccache + key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} + restore-keys: | + ccache-${{ github.workflow }}-${{ github.job }}-git- + - name: Check out Bittree + uses: actions/checkout@v4 + with: + repository: Flash-X/Bittree + path: bittree + - name: Build Bittree + run: | + cd ${{ github.workspace }}/bittree + python setup.py library --dim 3 --prefix ${{ github.workspace }}/libbittree + cd build + make -j4 + make install + - name: Build and Run Test + run: | + export CCACHE_COMPRESS=1 + export CCACHE_COMPRESSLEVEL=10 + export CCACHE_MAXSIZE=80M + export CCACHE_EXTRAFILES=${{ github.workspace }}/.clang-tidy + export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt + ccache -z + + export AMREX_BITTREE_HOME=${{ github.workspace }}/libbittree + cd ${{ github.workspace }}/Tests/Amr/Advection_AmrCore/Exec + make -j4 USE_MPI=TRUE 
USE_BITTREE=TRUE DIM=3 TEST=TRUE BL_NO_FORT=TRUE\ + CCACHE=ccache + mpiexec -n 2 ./main3d.gnu.TEST.MPI.ex inputs_bittree max_step=10 + + ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt + make -j4 -k -f clang-tidy-ccache-misses.mak \ + CLANG_TIDY=clang-tidy-15 \ + CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" + + ccache -s + du -hs ~/.cache/ccache + + save_pr_number: + if: github.event_name == 'pull_request' + runs-on: ubuntu-latest + steps: + - name: Save PR number + env: + PR_NUMBER: ${{ github.event.number }} + run: | + echo $PR_NUMBER > pr_number.txt + - uses: actions/upload-artifact@v4 + with: + name: pr_number + path: pr_number.txt + retention-days: 1 diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml index eea7e576af6..b4886ff9623 100644 --- a/.github/workflows/clang.yml +++ b/.github/workflows/clang.yml @@ -15,14 +15,14 @@ jobs: name: Clang@7.0 C++17 SP NOMPI Debug [lib] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_clang.sh 7 .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -54,17 +54,17 @@ jobs: -DCMAKE_CXX_COMPILER=$(which clang++-7) \ -DCMAKE_Fortran_COMPILER=$(which gfortran) \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 make install make test_install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" export PATH=/tmp/my-amrex/bin:$PATH - which fcompare + which amrex_fcompare ctest --output-on-failure @@ -75,14 +75,14 @@ jobs: name: Clang@14.0 C++17 SP Particles DP Mesh Debug [tests] runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_clang.sh 14 .github/workflows/dependencies/dependencies_clang-tidy.sh 14 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -114,10 +114,10 @@ jobs: -DCMAKE_CXX_COMPILER=$(which clang++-14) \ -DCMAKE_Fortran_COMPILER=$(which gfortran) \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -131,14 +131,14 @@ jobs: name: Clang NOMPI Release [configure 2D] runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_clang.sh 14 .github/workflows/dependencies/dependencies_clang-tidy.sh 14 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: 
ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -154,12 +154,12 @@ jobs: ccache -z ./configure --dim 2 --with-fortran no --comp llvm --with-mpi no - make -j2 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS="-fno-operator-names" \ + make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS="-fno-operator-names" \ CCACHE=ccache make install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -175,7 +175,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/cleanup-cache-postpr.yml b/.github/workflows/cleanup-cache-postpr.yml index 73d6eaf0903..9a2ffb0f61a 100644 --- a/.github/workflows/cleanup-cache-postpr.yml +++ b/.github/workflows/cleanup-cache-postpr.yml @@ -16,7 +16,7 @@ jobs: env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Clean up ccache run: | gh extension install actions/gh-actions-cache @@ -31,7 +31,10 @@ jobs: set +e keys=$(gh actions-cache list -L 100 -R $REPO -B $BRANCH | cut -f 1) + # $keys might contain spaces. Thus we set IFS to \n. + IFS=$'\n' for k in $keys do - gh actions-cache delete $k -R $REPO -B $BRANCH --confirm + gh actions-cache delete "$k" -R $REPO -B $BRANCH --confirm done + unset IFS diff --git a/.github/workflows/cleanup-cache.yml b/.github/workflows/cleanup-cache.yml index 2a1a2f254a3..d18acbaa788 100644 --- a/.github/workflows/cleanup-cache.yml +++ b/.github/workflows/cleanup-cache.yml @@ -2,7 +2,7 @@ name: CleanUpCache on: workflow_run: - workflows: [LinuxClang, cuda, LinuxGcc, hip, Hypre, intel, macos, PETSc, SUNDIALS, windows, CodeQL, smoke, apps] + workflows: [bittree, LinuxClang, cuda, LinuxGcc, hip, Hypre, intel, macos, PETSc, SUNDIALS, CodeQL, smoke, apps] types: - completed @@ -16,7 +16,7 @@ jobs: env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Clean up ccache run: | gh extension install actions/gh-actions-cache @@ -27,7 +27,7 @@ jobs: EVENT=${{ github.event.workflow_run.event }} # Triggering workflow run name (e.g., LinuxClang) - WORKFLOW_NAME=${{ github.event.workflow_run.name }} + WORKFLOW_NAME="${{ github.event.workflow_run.name }}" if [[ $EVENT == "pull_request" ]]; then gh run download ${{ github.event.workflow_run.id }} -n pr_number @@ -45,16 +45,19 @@ jobs: # The goal is to keep the last used key of each job and delete all others. # something like ccache-LinuxClang- - keyprefix=ccache-${WORKFLOW_NAME}- + keyprefix="ccache-${WORKFLOW_NAME}-" - cached_jobs=$(gh actions-cache list -L 100 -R $REPO -B $BRANCH --key $keyprefix | awk -F '-git-' '{print $1}' | sort | uniq) + cached_jobs=$(gh actions-cache list -L 100 -R $REPO -B $BRANCH --key "$keyprefix" | awk -F '-git-' '{print $1}' | sort | uniq) # cached_jobs is something like "ccache-LinuxClang-configure-1d ccache-LinuxClang-configure-2d". + # It might also contain spaces. Thus we set IFS to \n. 
+ IFS=$'\n' for j in $cached_jobs do - old_keys=$(gh actions-cache list -L 100 -R $REPO -B $BRANCH --key ${j}-git- --sort last-used | cut -f 1 | tail -n +2) + old_keys=$(gh actions-cache list -L 100 -R $REPO -B $BRANCH --key "${j}-git-" --sort last-used | cut -f 1 | tail -n +2) for k in $old_keys do - gh actions-cache delete $k -R $REPO -B $BRANCH --confirm + gh actions-cache delete "$k" -R $REPO -B $BRANCH --confirm done done + unset IFS diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 0e03e384633..c7340ee449b 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -14,6 +14,7 @@ concurrency: jobs: analyze: + if: ${{ github.repository == 'AMReX-Codes/amrex' || github.event_name != 'schedule' }} name: Analyze runs-on: ubuntu-latest permissions: @@ -28,7 +29,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Packages (C++) if: ${{ matrix.language == 'cpp' }} @@ -40,7 +41,7 @@ jobs: - name: Set Up Cache if: ${{ matrix.language == 'cpp' }} - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -58,14 +59,14 @@ jobs: -DCMAKE_CXX_COMPILER="/usr/local/bin/g++" - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} queries: +security-and-quality config-file: ./.github/workflows/codeql/codeql-config.yml - name: Build (py) - uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 if: ${{ matrix.language == 'python' }} - name: Build (C++) @@ -76,7 +77,7 @@ jobs: export CCACHE_MAXSIZE=30M ccache -z - cmake --build build -j 2 + cmake --build build -j 4 ccache -s du -hs ~/.cache/ccache @@ -85,10 +86,10 @@ jobs: touch Src/Base/AMReX.cpp export CCACHE_DISABLE=1 cd build - make -j 2 + make -j 4 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + uses: github/codeql-action/analyze@v3 with: category: "/language:${{ matrix.language }}" @@ -101,7 +102,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index 491b839c00e..bec24c5c7ca 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install codespell run: | diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index b4abc1c0a05..1ceed52ba63 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -12,13 +12,13 @@ jobs: name: CUDA@11.2 GNU@9.3.0 C++17 Release [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_nvcc.sh 11.2 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -52,7 +52,7 @@ jobs: -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ccache -s du -hs ~/.cache/ccache @@ -71,13 +71,13 @@ jobs: CUDAARCHS: "70" AMReX_CUDA_ARCH: "7.0" 
steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_llvm_cuda11_clang15.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -104,7 +104,7 @@ jobs: -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ccache -s du -hs ~/.cache/ccache @@ -114,13 +114,14 @@ jobs: name: NVHPC NVCC/NVC++ C++17 Release [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | + .github/workflows/dependencies/ubuntu_free_disk_space.sh .github/workflows/dependencies/dependencies_nvhpc.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -164,7 +165,7 @@ jobs: -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ccache -s du -hs ~/.cache/ccache @@ -174,13 +175,13 @@ jobs: name: CUDA@11.2 GNU@9.3.0 [configure 3D] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_nvcc.sh 11.2 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -199,7 +200,7 @@ jobs: # /home/runner/work/amrex/amrex/Src/Base/AMReX_GpuLaunchGlobal.H:16:41: error: unused parameter ‘f0’ [-Werror=unused-parameter] # 16 | AMREX_GPU_GLOBAL void launch_global (L f0) { f0(); } # - make -j2 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS="-fno-operator-names -Wno-unused-parameter" CCACHE=ccache CUDA_ARCH="7.0 7.2" + make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS="-fno-operator-names -Wno-unused-parameter" CCACHE=ccache CUDA_ARCH="7.0 7.2" make install ccache -s @@ -214,7 +215,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/dependencies/dependencies_codeplay.sh b/.github/workflows/dependencies/dependencies_codeplay.sh index dfd2e5a03f7..0f723a0bfc0 100755 --- a/.github/workflows/dependencies/dependencies_codeplay.sh +++ b/.github/workflows/dependencies/dependencies_codeplay.sh @@ -6,10 +6,25 @@ set -eu -o pipefail -curl -o oneapi_nvidia.sh -L "https://developer.codeplay.com/api/v1/products/download?product=oneapi&variant=nvidia&filters[]=linux&aat=$1" -chmod +x oneapi_nvidia.sh -sudo ./oneapi_nvidia.sh --yes +# `man apt.conf`: +# Number of retries to perform. If this is non-zero APT will retry +# failed files the given number of times. 
+echo 'Acquire::Retries "3";' | sudo tee /etc/apt/apt.conf.d/80-retries -curl -o oneapi_amd.sh -L "https://developer.codeplay.com/api/v1/products/download?product=oneapi&variant=amd&filters[]=linux&aat=$1" -chmod +x oneapi_amd.sh -sudo ./oneapi_amd.sh --yes +# https://developer.codeplay.com/apt/index.html +sudo wget -qO - https://developer.codeplay.com/apt/public.key | gpg --dearmor | sudo tee /usr/share/keyrings/codeplay-keyring.gpg > /dev/null +echo "deb [signed-by=/usr/share/keyrings/codeplay-keyring.gpg] https://developer.codeplay.com/apt all main" | sudo tee /etc/apt/sources.list.d/codeplay.list + +sudo apt-get clean +sudo apt-get update + +# try apt-get install up to five times, to tolerate transient connection failures +status=1 +for itry in {1..5} +do + sudo apt-get install -y --no-install-recommends \ $1 \ && { sudo apt-get clean; sudo apt-get update; status=0; break; } \ || { sleep 10; } +done +if [[ ${status} -ne 0 ]]; then exit 1; fi diff --git a/.github/workflows/dependencies/dependencies_hip.sh b/.github/workflows/dependencies/dependencies_hip.sh index 852342e4ac1..4673a7caed5 100755 --- a/.github/workflows/dependencies/dependencies_hip.sh +++ b/.github/workflows/dependencies/dependencies_hip.sh @@ -45,6 +45,9 @@ sudo apt-get install -y --no-install-recommends \ rocrand-dev \ rocprim-dev +# hiprand-dev is a new package that does not exist in old versions +sudo apt-get install -y --no-install-recommends hiprand-dev || true + # activate # source /etc/profile.d/rocm.sh diff --git a/.github/workflows/dependencies/ubuntu_free_disk_space.sh b/.github/workflows/dependencies/ubuntu_free_disk_space.sh new file mode 100755 index 00000000000..6b3e5b2f55e --- /dev/null +++ b/.github/workflows/dependencies/ubuntu_free_disk_space.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# +# Copyright 2023 The AMReX Community +# +# License: BSD-3-Clause-LBNL + +# Don't want to use the following line because apt-get remove may fail if +# the package specified does not exist.
+# set -eu -o pipefail + +# Large packages +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 + +echo 'Removing some packages we do not need' + +df -h + +apt list --installed + +sudo apt-get remove -y '^apache.*' +sudo apt-get remove -y '^aspnetcore.*' +sudo apt-get remove -y '^azure.*' +sudo apt-get remove -y '^dotnet.*' +sudo apt-get remove -y '^firebird.*' +sudo apt-get remove -y '^firefox.*' +sudo apt-get remove -y '^google.*' +sudo apt-get remove -y '^hhvm.*' +sudo apt-get remove -y '^microsoft.*' +sudo apt-get remove -y '^mongodb.*' +sudo apt-get remove -y '^mono-.*' +sudo apt-get remove -y '^monodoc-.*' +sudo apt-get remove -y '^mysql.*' +sudo apt-get remove -y '^php.*' +sudo apt-get remove -y '^powershell.*' +sudo apt-get remove -y '^snapd.*' +sudo apt-get remove -y '^temurin.*' + +sudo apt-get autoremove -y + +df -h diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 82e387cbff4..49d564b36b3 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: persist-credentials: false @@ -27,12 +27,11 @@ jobs: - name: Deploy if: github.event_name == 'push' && github.repository == 'AMReX-Codes/amrex' && github.ref == 'refs/heads/development' - uses: JamesIves/github-pages-deploy-action@3.7.1 + uses: JamesIves/github-pages-deploy-action@v4.5.0 with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - ACCESS_TOKEN: ${{ secrets.DEPLOY_DOCS }} - REPOSITORY_NAME: AMReX-Codes/AMReX-Codes.github.io - BRANCH: main # The branch the action should deploy to. - FOLDER: build # The folder the action should deploy. - TARGET_FOLDER: amrex # The folder the action should deploy to. - CLEAN: false # Do not remove existing files from the deploy target. + ssh-key: ${{ secrets.AMREX_BUILD_DOCS }} + repository-name: AMReX-Codes/AMReX-Codes.github.io + branch: main # The branch the action should deploy to. + folder: build # The folder the action should deploy. + target-folder: amrex # The folder the action should deploy to. + clean: false # Do not remove existing files from the deploy target. 
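Aside on the retry loop added in .github/workflows/dependencies/dependencies_codeplay.sh above: retrying apt-get a fixed number of times with a sleep between attempts is a reusable pattern for flaky CI mirrors. A minimal standalone sketch of the same idea, in the same spirit as the patch; the helper name apt_install_with_retries is illustrative only and is not part of this patch:

#!/usr/bin/env bash
# Sketch: retry `apt-get install` up to 5 times before giving up.
# (apt_install_with_retries is a hypothetical helper, not part of this diff.)
apt_install_with_retries () {
    local status=1
    for itry in {1..5}
    do
        sudo apt-get install -y --no-install-recommends "$@" \
            && { status=0; break; } \
            || { sleep 10; sudo apt-get update; }   # back off, refresh indexes, retry
    done
    return ${status}
}

apt_install_with_retries ccache cmake   # example invocation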
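Likewise, the IFS=$'\n' changes in cleanup-cache.yml and cleanup-cache-postpr.yml above exist because cache keys can contain spaces, so an unquoted $keys would otherwise split one key into several words. A small sketch of the behavior, using hard-coded sample keys rather than real `gh actions-cache` output:

#!/usr/bin/env bash
# Two sample keys; the first contains a space (hypothetical values).
keys=$'ccache-Linux Clang-job-git-abc\nccache-gcc-job-git-def'

IFS=$'\n'        # split on newlines only, so spaces inside a key survive
for k in $keys
do
    echo "would delete: $k"   # "$k" stays quoted to keep the key intact
done
unset IFS        # restore default word splitting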
diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index 6915a246018..8c2e8403f5d 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -16,14 +16,14 @@ jobs: name: GNU@8.4 C++17 Release [lib] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_gcc.sh 8 .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -50,17 +50,17 @@ jobs: -DCMAKE_CXX_COMPILER=$(which g++-8) \ -DCMAKE_Fortran_COMPILER=$(which gfortran-8) \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 make install make test_install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" export PATH=/tmp/my-amrex/bin:$PATH - which fcompare + which amrex_fcompare ctest --output-on-failure @@ -72,14 +72,14 @@ jobs: name: GNU@9.3 C++17 3D Debug Fortran [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -104,10 +104,10 @@ jobs: -DAMReX_FORTRAN=ON \ -DAMReX_SPACEDIM=3 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -120,14 +120,14 @@ jobs: name: GNU@9.3 C++17 2D Debug Fortran [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -152,10 +152,10 @@ jobs: -DAMReX_FORTRAN=ON \ -DAMReX_SPACEDIM=2 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -168,14 +168,14 @@ jobs: name: GNU@9.3 C++17 1D Debug Fortran [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh 
.github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -201,10 +201,10 @@ jobs: -DAMReX_FORTRAN=ON \ -DAMReX_SPACEDIM=1 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -218,14 +218,14 @@ jobs: name: GNU@10.1 C++20 OMP [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_gcc.sh 10 .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -259,11 +259,11 @@ jobs: -DCMAKE_CXX_COMPILER=$(which g++-10) \ -DCMAKE_Fortran_COMPILER=$(which gfortran-10) \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 # Let's not use clang-tidy for this test because it wants to use C++20. # ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - # make -j2 -f clang-tidy-ccache-misses.mak \ + # make -j4 -k -f clang-tidy-ccache-misses.mak \ # CLANG_TIDY=clang-tidy-12 \ # CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -277,14 +277,14 @@ jobs: name: GNU@8.4 C++17 NOMPI [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_gcc.sh 8 .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -317,10 +317,10 @@ jobs: -DCMAKE_CXX_COMPILER=$(which g++-8) \ -DCMAKE_Fortran_COMPILER=$(which gfortran-8) \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -334,14 +334,14 @@ jobs: name: GNU@12 C++17 w/o Fortran [tests] runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_gcc.sh 12 .github/workflows/dependencies/dependencies_clang-tidy.sh 14 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -381,10 +381,10 @@ jobs: -DCMAKE_CXX_COMPILER=$(which g++-12) \ -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 ${{github.workspace}}/Tools/C_scripts/mmclt.py 
--input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -398,14 +398,14 @@ jobs: name: GNU@9.3 Release [configure 1D] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -421,12 +421,12 @@ jobs: ccache -z ./configure --dim 1 - make -j2 XTRA_CXXFLAGS=-fno-operator-names \ + make -j4 XTRA_CXXFLAGS=-fno-operator-names \ CCACHE=ccache make install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -438,14 +438,14 @@ jobs: name: GNU@11.2 Release [configure 3D] runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 15 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -461,12 +461,12 @@ jobs: ccache -z ./configure --dim 3 --enable-eb yes --enable-xsdk-defaults yes - make -j2 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ + make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ CCACHE=ccache make install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-15 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -478,14 +478,14 @@ jobs: name: GNU@9.3 Release [configure 3D] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -501,12 +501,12 @@ jobs: ccache -z ./configure --dim 3 --enable-eb no --enable-xsdk-defaults no --single-precision yes --single-precision-particles yes --enable-tiny-profile yes - make -j2 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ + make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ CCACHE=ccache make install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -518,14 +518,14 
@@ jobs: name: GNU@9.3 OMP Debug [configure 3D] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -541,12 +541,12 @@ jobs: ccache -z ./configure --dim 3 --enable-eb yes --enable-xsdk-defaults yes --with-omp yes --debug yes - make -j2 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ + make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ CCACHE=ccache make install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -558,14 +558,14 @@ jobs: name: GNU Plotfile Tools [tools] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -581,11 +581,11 @@ jobs: ccache -z cd Tools/Plotfile - make -j2 USE_MPI=FALSE USE_OMP=FALSE WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ + make -j4 USE_MPI=FALSE USE_OMP=FALSE WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ CCACHE=ccache ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -597,14 +597,14 @@ jobs: name: GNU@9.3 C++17 [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -627,10 +627,10 @@ jobs: -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_ENABLE_TESTS=ON \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -649,7 +649,7 @@ jobs: CXX: h5pcc CC: h5cc steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh @@ -659,7 +659,7 @@ jobs: run: | cd Tests/HDF5Benchmark export OLCF_HDF5_ROOT=/usr/lib/x86_64-linux-gnu/hdf5/openmpi - make -j 2 + make -j 4 mpirun -np 2 ./main3d.gnu.TPROF.MPI.ex ./inputs h5dump -d 
"level_0/data:offsets=0" -s "1" -c "1" ./plt00000.h5 h5dump -d "level_0/data:datatype=1" -s "1" -c "1" ./plt00000/particle0/particle0.h5 @@ -673,7 +673,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/hip.yml b/.github/workflows/hip.yml index 0c0e59a96c4..0a66bac0ab3 100644 --- a/.github/workflows/hip.yml +++ b/.github/workflows/hip.yml @@ -11,13 +11,13 @@ jobs: name: HIP ROCm Flang C++17 [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_hip.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -66,7 +66,7 @@ jobs: -DCMAKE_Fortran_COMPILER=$(which flang) \ -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ccache -s du -hs ~/.cache/ccache @@ -75,13 +75,13 @@ jobs: name: HIP ROCm GFortran@9.3 C++17 [tests-hipcc] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_hip.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -128,7 +128,7 @@ jobs: -DCMAKE_Fortran_COMPILER=$(which gfortran) \ -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build_full_legacywrapper -j 2 + cmake --build build_full_legacywrapper -j 4 ccache -s du -hs ~/.cache/ccache @@ -138,13 +138,13 @@ jobs: name: HIP EB [configure 2D] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_hip.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -158,7 +158,7 @@ jobs: ccache -z ./configure --dim 2 --with-hip yes --enable-eb yes --enable-xsdk-defaults yes --with-mpi no --with-omp no --single-precision yes --single-precision-particles yes - make -j2 WARN_ALL=TRUE XTRA_CXXFLAGS="-fno-operator-names" AMD_ARCH=gfx90a CCACHE=ccache + make -j4 WARN_ALL=TRUE XTRA_CXXFLAGS="-fno-operator-names" AMD_ARCH=gfx90a CCACHE=ccache make install ccache -s @@ -169,13 +169,13 @@ jobs: name: HIP EB 3D GMake runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_hip.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -189,7 +189,7 @@ jobs: ccache -z cd Tests/LinearSolvers/NodeEB - make -j2 USE_HIP=TRUE USE_MPI=FALSE BL_NO_FORT=TRUE WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names AMD_ARCH=gfx90a CCACHE=ccache + make -j4 USE_HIP=TRUE USE_MPI=FALSE BL_NO_FORT=TRUE WARN_ALL=TRUE WARN_ERROR=TRUE 
XTRA_CXXFLAGS=-fno-operator-names AMD_ARCH=gfx90a CCACHE=ccache ccache -s du -hs ~/.cache/ccache @@ -203,7 +203,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/hypre.yml b/.github/workflows/hypre.yml index 50423f3942a..b3916176f79 100644 --- a/.github/workflows/hypre.yml +++ b/.github/workflows/hypre.yml @@ -13,14 +13,14 @@ jobs: env: AMREX_HYPRE_HOME: ${HOME}/.cache/hypre-2.26.0-cuda steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_nvcc.sh 11.2 sudo apt-get install -y libcublas-dev-11-2 libcusparse-dev-11-2 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: | ~/.cache/ccache @@ -38,7 +38,7 @@ jobs: ./configure --with-cxxstandard=17 --with-cuda --enable-unified-memory \ --with-cuda-home=/usr/local/cuda --with-gpu-arch="80" \ --prefix=${{ env.AMREX_HYPRE_HOME }} - make -j 2 + make -j 4 make install cd ../../ fi @@ -53,7 +53,7 @@ jobs: export CUDA_PATH=/usr/local/cuda export PATH=${PATH}:/usr/local/cuda/bin cd Tests/LinearSolvers/CellEB - make -j2 USE_MPI=TRUE USE_HYPRE=TRUE DIM=2 USE_CUDA=TRUE CCACHE=ccache + make -j4 USE_MPI=TRUE USE_HYPRE=TRUE DIM=2 USE_CUDA=TRUE CCACHE=ccache ccache -s du -h -d1 ~/.cache @@ -62,14 +62,14 @@ jobs: name: GCC 3D Hypre@2.21.0 runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 14 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -81,7 +81,7 @@ jobs: tar xfz v2.21.0.tar.gz cd hypre-2.21.0/src ./configure --with-cxxstandard=17 --enable-bigint - make -j 2 + make -j 4 make install cd ../../ - name: Build and Run Test @@ -95,12 +95,12 @@ jobs: export AMREX_HYPRE_HOME=${PWD}/hypre-2.21.0/src/hypre cd Tests/LinearSolvers/ABecLaplacian_C - make -j2 USE_MPI=TRUE USE_HYPRE=TRUE DIM=3 \ + make -j4 USE_MPI=TRUE USE_HYPRE=TRUE DIM=3 \ CCACHE=ccache mpiexec -n 2 ./main3d.gnu.MPI.ex inputs.hypre ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -111,14 +111,14 @@ jobs: name: GCC EB 2D Hypre@2.28.0 runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 14 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -130,7 +130,7 @@ jobs: tar xfz v2.28.0.tar.gz cd hypre-2.28.0/src ./configure --with-cxxstandard=17 - make -j 2 + make -j 4 make install cd ../../ - name: Build and Run Test @@ -144,11 +144,11 @@ jobs: export AMREX_HYPRE_HOME=${PWD}/hypre-2.28.0/src/hypre cd 
Tests/LinearSolvers/Hypre - make -j2 USE_MPI=TRUE USE_HYPRE=TRUE DIM=2 CCACHE=ccache + make -j4 USE_MPI=TRUE USE_HYPRE=TRUE DIM=2 CCACHE=ccache mpiexec -n 2 ./main2d.gnu.MPI.ex inputs.2d ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -164,7 +164,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index 031f7d331bf..e251612b599 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -11,13 +11,13 @@ jobs: name: oneAPI SYCL [tests] runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_dpcpp.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -44,8 +44,9 @@ jobs: -DCMAKE_C_COMPILER=$(which icx) \ -DCMAKE_CXX_COMPILER=$(which icpx) \ -DCMAKE_Fortran_COMPILER=$(which ifx) \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build --parallel 2 + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DAMReX_PARALLEL_LINK_JOBS=4 + cmake --build build --parallel 4 ccache -s du -hs ~/.cache/ccache @@ -54,13 +55,13 @@ jobs: name: oneAPI SYCL [tests w/ EB] runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_dpcpp.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -86,8 +87,9 @@ jobs: -DAMReX_GPU_BACKEND=SYCL \ -DCMAKE_C_COMPILER=$(which icx) \ -DCMAKE_CXX_COMPILER=$(which icpx) \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build --parallel 2 + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DAMReX_PARALLEL_LINK_JOBS=4 + cmake --build build --parallel 4 ccache -s du -hs ~/.cache/ccache @@ -95,29 +97,24 @@ jobs: tests-oneapi-sycl-eb-nvidia: name: oneAPI SYCL for Nvidia GPUs [tests w/ EB] runs-on: ubuntu-latest - env: - CODEPLAYTOKEN: ${{ secrets.CODEPLAYTOKEN }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies - if: ${{ env.CODEPLAYTOKEN != '' }} run: | - .github/workflows/dependencies/dependencies_nvcc.sh + .github/workflows/dependencies/ubuntu_free_disk_space.sh .github/workflows/dependencies/dependencies_dpcpp.sh - .github/workflows/dependencies/dependencies_codeplay.sh ${{ env.CODEPLAYTOKEN }} + .github/workflows/dependencies/dependencies_nvcc.sh 12.0 + .github/workflows/dependencies/dependencies_codeplay.sh oneapi-nvidia-12.0 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - if: ${{ env.CODEPLAYTOKEN != '' }} - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} restore-keys: | ccache-${{ github.workflow }}-${{ github.job }}-git- - name: Build & Install - if: ${{ 
env.CODEPLAYTOKEN != '' }} - # clang currently supports CUDA up to version 11.5 and a warning is issued with newer versions - env: {CXXFLAGS: "-fsycl -fsycl-targets=nvptx64-nvidia-cuda -fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor -Wno-unknown-cuda-version"} + env: {CXXFLAGS: "-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --offload-arch=sm_80 -fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor"} run: | export CCACHE_COMPRESS=1 export CCACHE_COMPRESSLEVEL=10 @@ -136,8 +133,9 @@ jobs: -DAMReX_GPU_BACKEND=SYCL \ -DCMAKE_C_COMPILER=$(which icx) \ -DCMAKE_CXX_COMPILER=$(which clang++) \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build --parallel 2 + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DAMReX_PARALLEL_LINK_JOBS=4 + cmake --build build --parallel 4 ccache -s du -hs ~/.cache/ccache @@ -145,27 +143,23 @@ jobs: no-tests-oneapi-sycl-amd: name: oneAPI SYCL for AMD GPUs runs-on: ubuntu-20.04 - env: - CODEPLAYTOKEN: ${{ secrets.CODEPLAYTOKEN }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies - if: ${{ env.CODEPLAYTOKEN != '' }} run: | - .github/workflows/dependencies/dependencies_hip.sh 5.4.6 + .github/workflows/dependencies/ubuntu_free_disk_space.sh .github/workflows/dependencies/dependencies_dpcpp.sh - .github/workflows/dependencies/dependencies_codeplay.sh ${{ env.CODEPLAYTOKEN }} + .github/workflows/dependencies/dependencies_hip.sh 5.4.3 + .github/workflows/dependencies/dependencies_codeplay.sh oneapi-amd-5.4.3 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - if: ${{ env.CODEPLAYTOKEN != '' }} - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} restore-keys: | ccache-${{ github.workflow }}-${{ github.job }}-git- - name: Build & Install - if: ${{ env.CODEPLAYTOKEN != '' }} env: {CXXFLAGS: "-fsycl -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx90a -fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor"} run: | export CCACHE_COMPRESS=1 @@ -185,8 +179,9 @@ jobs: -DAMReX_SYCL_SUB_GROUP_SIZE=64 \ -DCMAKE_C_COMPILER=$(which icx) \ -DCMAKE_CXX_COMPILER=$(which clang++) \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build --parallel 2 + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DAMReX_PARALLEL_LINK_JOBS=4 + cmake --build build --parallel 4 ccache -s du -hs ~/.cache/ccache @@ -196,14 +191,15 @@ jobs: name: ICC [tests] runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | + .github/workflows/dependencies/ubuntu_free_disk_space.sh .github/workflows/dependencies/dependencies_dpcpp.sh sudo apt-get install -y intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -224,12 +220,11 @@ jobs: -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ - -DAMReX_FORTRAN=ON \ + -DAMReX_FORTRAN=OFF \ -DCMAKE_C_COMPILER=$(which icc) \ 
-DCMAKE_CXX_COMPILER=$(which icpc) \ - -DCMAKE_Fortran_COMPILER=$(which ifort) \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build --parallel 2 + cmake --build build --parallel 4 cmake --build build --target install cmake --build build --target test_install @@ -253,7 +248,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 1c405c2d66d..007ac29cb3a 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -12,11 +12,11 @@ jobs: name: AppleClang Universal w/o MPI [tests-universal] runs-on: macos-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: .github/workflows/dependencies/dependencies_mac.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: /Users/runner/Library/Caches/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -53,11 +53,11 @@ jobs: name: AppleClang@11.0 GFortran@9.3 [tests] runs-on: macos-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: .github/workflows/dependencies/dependencies_mac.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: /Users/runner/Library/Caches/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -95,7 +95,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/petsc.yml b/.github/workflows/petsc.yml index 6d0b92b1343..0e79fddd020 100644 --- a/.github/workflows/petsc.yml +++ b/.github/workflows/petsc.yml @@ -11,14 +11,14 @@ jobs: name: GCC 2D EB PETSc@3.18.1 runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 14 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -31,7 +31,7 @@ jobs: cd petsc-3.18.1 export PETSC_DIR=${PWD} ./configure --prefix=${PWD}/petsc - make -j 2 + make -j 4 make install cd ../ - name: Build and Run Test @@ -45,12 +45,12 @@ jobs: export AMREX_PETSC_HOME=${PWD}/petsc-3.18.1/petsc cd Tests/LinearSolvers/CellEB - make -j2 USE_MPI=TRUE USE_PETSC=TRUE DIM=2 TEST=TRUE \ + make -j4 USE_MPI=TRUE USE_PETSC=TRUE DIM=2 TEST=TRUE \ CCACHE=ccache mpiexec -n 2 ./main2d.gnu.TEST.MPI.ex inputs.rt.2d.petsc ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -66,7 +66,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/post-pr.yml b/.github/workflows/post-pr.yml index f5b914033b7..2768ef376cc 100644 --- 
a/.github/workflows/post-pr.yml +++ b/.github/workflows/post-pr.yml @@ -13,7 +13,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/sensei.yml b/.github/workflows/sensei.yml index fc5a0db3059..52f8e418a6b 100644 --- a/.github/workflows/sensei.yml +++ b/.github/workflows/sensei.yml @@ -21,7 +21,7 @@ jobs: container: image: senseiinsitu/ci:fedora35-amrex-20220613 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup run: mkdir build - name: Configure @@ -35,4 +35,4 @@ jobs: - name: Build run: | cd build - cmake --build . -j 2 + cmake --build . -j 4 diff --git a/.github/workflows/smoke.yml b/.github/workflows/smoke.yml index 080a17fd984..f3d0b899df9 100644 --- a/.github/workflows/smoke.yml +++ b/.github/workflows/smoke.yml @@ -11,14 +11,14 @@ jobs: name: GNU Multi-D runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 15 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -42,12 +42,12 @@ jobs: -DAMReX_EB=ON \ -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 make install make test_install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-15 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -58,7 +58,7 @@ jobs: -DAMReX_ROOT=../../../installdir \ -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 mpiexec -n 2 ./install_test ../../Amr/Advection_AmrCore/Exec/inputs-ci ccache -s @@ -73,7 +73,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index 8cd87d66e87..00a86b00ac4 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -10,21 +10,21 @@ jobs: tabs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Tabs run: .github/workflows/style/check_tabs.sh trailing_whitespaces: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Trailing Whitespaces run: .github/workflows/style/check_trailing_whitespaces.sh doxygen: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Doxygen run: | sudo apt-get install -y --no-install-recommends doxygen diff --git a/.github/workflows/sundials.yml b/.github/workflows/sundials.yml index 12dfd42c159..c4735479105 100644 --- a/.github/workflows/sundials.yml +++ b/.github/workflows/sundials.yml @@ -15,13 +15,13 @@ jobs: CCACHE_COMPRESSLEVEL: 10 CCACHE_MAXSIZE: 20M steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache 
- uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -41,7 +41,7 @@ jobs: -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_C_COMPILER_LAUNCHER=ccache - make -j2 + make -j4 make install cd ../.. - name: Compile Test @@ -57,10 +57,10 @@ jobs: -DSUNDIALS_ROOT=${PWD}/sundials-6.5.0/instdir \ -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -75,13 +75,13 @@ jobs: CCACHE_COMPRESSLEVEL: 10 CCACHE_MAXSIZE: 75M steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_nvcc.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -110,7 +110,7 @@ jobs: -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache - make -j2 + make -j4 make install cd ../.. - name: Compile Test @@ -129,7 +129,7 @@ jobs: -DSUNDIALS_ROOT=${PWD}/sundials-6.5.0/instdir \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ccache -s du -hs ~/.cache/ccache @@ -143,7 +143,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 985e143108f..ab3db558649 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -12,101 +12,104 @@ jobs: name: MSVC C++17 w/o Fortran w/o MPI runs-on: windows-latest steps: - - uses: actions/checkout@v3 - - uses: seanmiddleditch/gha-setup-ninja@master - - name: Set Up Cache - uses: actions/cache@v3 - with: - path: ~/.ccache - key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} - restore-keys: | - ccache-${{ github.workflow }}-${{ github.job }}-git- - - name: Install Ccache - run: | - Invoke-WebRequest https://github.com/ccache/ccache/releases/download/v4.8/ccache-4.8-windows-x86_64.zip -OutFile ccache-4.8-windows-x86_64.zip - Expand-Archive ccache-4.8-windows-x86_64.zip + - uses: actions/checkout@v4 + #- name: Set Up Cache + # uses: actions/cache@v3 + # with: + # path: ~/.ccache + # key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} + # restore-keys: | + # ccache-${{ github.workflow }}-${{ github.job }}-git- + #- name: Install Ccache + # run: | + # Invoke-WebRequest https://github.com/ccache/ccache/releases/download/v4.8/ccache-4.8-windows-x86_64.zip -OutFile ccache-4.8-windows-x86_64.zip + # Expand-Archive ccache-4.8-windows-x86_64.zip - name: Build & Install run: | - $ccachepath = Join-Path $pwd "ccache-4.8-windows-x86_64" - $Env:PATH += ";$ccachepath" - $ccachecachedir = Join-Path $HOME ".ccache" - $Env:CCACHE_DIR="$ccachecachedir" - $Env:CCACHE_DIR - $Env:CCACHE_COMPRESS='1' - $Env:CCACHE_COMPRESSLEVEL='10' - 
$Env:CCACHE_MAXSIZE='105M' - ccache -z + #$ccachepath = Join-Path $pwd "ccache-4.8-windows-x86_64" + #$Env:PATH += ";$ccachepath" + #$ccachecachedir = Join-Path $HOME ".ccache" + #$Env:CCACHE_DIR="$ccachecachedir" + #$Env:CCACHE_DIR + #$Env:CCACHE_COMPRESS='1' + #$Env:CCACHE_COMPRESSLEVEL='10' + #$Env:CCACHE_MAXSIZE='105M' + #ccache -z cmake -S . -B build ` - -G "Ninja" ` - -DCMAKE_BUILD_TYPE=Debug ` -DBUILD_SHARED_LIBS=ON ` -DCMAKE_VERBOSE_MAKEFILE=ON ` -DAMReX_EB=OFF ` -DAMReX_ENABLE_TESTS=ON ` -DAMReX_FORTRAN=OFF ` - -DAMReX_MPI=OFF ` - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build --config Debug -j 2 + -DAMReX_MPI=OFF + #-DCMAKE_CXX_COMPILER_LAUNCHER=ccache + cmake --build build --config Debug -j 4 + + cmake --build build --config Debug --target install - ccache -s + $Env:PATH += ";D:\\a\amrex\amrex\installdir\bin" + cmake --build build --config Debug --target test_install + + #ccache -s # Build libamrex and all test (static) test_msvc_static: name: MSVC C++17 w/o Fortran w/o MPI static runs-on: windows-latest steps: - - uses: actions/checkout@v3 - - uses: seanmiddleditch/gha-setup-ninja@master - - name: Set Up Cache - uses: actions/cache@v3 - with: - path: ~/.ccache - key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} - restore-keys: | - ccache-${{ github.workflow }}-${{ github.job }}-git- - - name: Install Ccache - run: | - Invoke-WebRequest https://github.com/ccache/ccache/releases/download/v4.8/ccache-4.8-windows-x86_64.zip -OutFile ccache-4.8-windows-x86_64.zip - Expand-Archive ccache-4.8-windows-x86_64.zip + - uses: actions/checkout@v4 + #- name: Set Up Cache + # uses: actions/cache@v3 + # with: + # path: ~/.ccache + # key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} + # restore-keys: | + # ccache-${{ github.workflow }}-${{ github.job }}-git- + #- name: Install Ccache + # run: | + # Invoke-WebRequest https://github.com/ccache/ccache/releases/download/v4.8/ccache-4.8-windows-x86_64.zip -OutFile ccache-4.8-windows-x86_64.zip + # Expand-Archive ccache-4.8-windows-x86_64.zip - name: Build & Install run: | - $ccachepath = Join-Path $pwd "ccache-4.8-windows-x86_64" - $Env:PATH += ";$ccachepath" - $ccachecachedir = Join-Path $HOME ".ccache" - $Env:CCACHE_DIR="$ccachecachedir" - $Env:CCACHE_COMPRESS='1' - $Env:CCACHE_COMPRESSLEVEL='10' - $Env:CCACHE_MAXSIZE='135M' - ccache -z + #$ccachepath = Join-Path $pwd "ccache-4.8-windows-x86_64" + #$Env:PATH += ";$ccachepath" + #$ccachecachedir = Join-Path $HOME ".ccache" + #$Env:CCACHE_DIR="$ccachecachedir" + #$Env:CCACHE_COMPRESS='1' + #$Env:CCACHE_COMPRESSLEVEL='10' + #$Env:CCACHE_MAXSIZE='135M' + #ccache -z cmake -S . 
-B build ` - -G "Ninja" ` - -DCMAKE_BUILD_TYPE=RelWithDebInfo ` -DCMAKE_VERBOSE_MAKEFILE=ON ` -DAMReX_EB=ON ` -DAMReX_ENABLE_TESTS=ON ` -DAMReX_FORTRAN=OFF ` - -DAMReX_MPI=OFF ` - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build --config RelWithDebInfo -j 2 + -DAMReX_MPI=OFF + #-DCMAKE_CXX_COMPILER_LAUNCHER=ccache + cmake --build build --config RelWithDebInfo -j 4 + + cmake --build build --config RelWithDebInfo --target install + cmake --build build --config RelWithDebInfo --target test_install - ccache -s + #ccache -s # Build libamrex and all tests tests_clang: name: Clang C++17 w/o Fortran w/o MPI runs-on: windows-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: seanmiddleditch/gha-setup-ninja@master - name: Build & Install shell: cmd + env: + CMAKE_GENERATOR_TOOLSET: "ClangCl" + CMAKE_GENERATOR: "Visual Studio 17 2022" run: | call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\vc\Auxiliary\build\vcvarsall.bat" x64 cmake -S . -B build ^ - -T "ClangCl" ^ - -DCMAKE_BUILD_TYPE=Release ^ -DBUILD_SHARED_LIBS=ON ^ -DCMAKE_VERBOSE_MAKEFILE=ON ^ -DAMReX_EB=ON ^ @@ -114,19 +117,25 @@ jobs: -DAMReX_FORTRAN=OFF ^ -DAMReX_MPI=OFF ^ -DAMReX_OMP=ON - cmake --build build --config Release -j 2 + cmake --build build --config Release -j 4 - save_pr_number: - if: github.event_name == 'pull_request' - runs-on: ubuntu-latest - steps: - - name: Save PR number - env: - PR_NUMBER: ${{ github.event.number }} - run: | - echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 - with: - name: pr_number - path: pr_number.txt - retention-days: 1 + cmake --build build --config Release --target install + + set "PATH=%PATH%;D:\\a\amrex\amrex\installdir\bin" + cmake --build build --config Release --target test_install + + # If we add ccache back, don't forget to update cleanup-cache.yml + #save_pr_number: + # if: github.event_name == 'pull_request' + # runs-on: ubuntu-latest + # steps: + # - name: Save PR number + # env: + # PR_NUMBER: ${{ github.event.number }} + # run: | + # echo $PR_NUMBER > pr_number.txt + # - uses: actions/upload-artifact@v3 + # with: + # name: pr_number + # path: pr_number.txt + # retention-days: 1 diff --git a/CHANGES b/CHANGES index e27552e7cb4..a946f87cf23 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,306 @@ +# 24.03 + + -- Fix GPU restart for pure SoA particles (#3783) + + -- fix for ref_ratio=1 (#3786) + + -- Update GMRES/MLMG interface (#3779) + + -- Ref ratio 3 (#3781) + + -- Curl Curl solver: 4-color Gauss-Seidel smoother (#3778) + + -- assert wavefront size (#3777) + + -- Curl of Curl solver: Tweak restriction (#3765) + + -- Adjust debug info argument for HIP compiler (#3761) + + -- fixed bug in MLCurlCurL::xdoty() to prevent doing MPI sum twice. 
(#3774) + + -- Implement portable assumptions with AMREX_ASSUME (#3770) + Fix bug in AMREX_ASSUME (#3773) + + -- Pure SoA: `NextID` as `Long` (#3772) + + -- GNU Make: set COMP_VERSION for hipcc and nvcc (#3771) + + -- Make MFParallelFor safer from int overflow (#3768) + + -- getParticleTileData: HostVector must be initialized during resize (#3769) + + -- add AMREX_LIKELY and AMREX_UNLIKELY (#3767) + + -- Only do a htod memcpy in getParticleTileData when necessary (#3760) + + -- Sync GPU stream before getting the time in TinyProfiler (#3763) + + -- Add a logspace-like function in AMReX_Algorithm.H (#3754) + + -- Fix offset in send buffer of single precision particle communication (#3758) + + -- EdgeFluxRegister for MHD (#3633) + + -- Fix a typo in SYCL version of scan (#3757) + + -- Interpolation from node-centered general mapped coordinates to tracers (#3750) + + -- Remove various deprecated stuff not prefixed by amrex or bl (#3713) + + -- GNU Make: Link flags (#3711) + + -- Remove UB from is_aligned (#3751) + + -- Add partitionParticles Function (#3743) + + -- Minor new functions in AMReX_MPMD to provide flexibility for python binding (#3748) + + -- Box::numPts() returns 0 for empty boxes (#3747) + + -- New Linear Solver: Curl of Curl (#3682) + + -- Use long integer in GPU kernels (#3742) + + -- Add `ParticleIDWrapper::make_invalid()` (#3735) + +# 24.02 + + -- Disable m_aos for SoA Particle (#3736) + Update AoS Restrict (#3738) + + -- Add helper function for setting id and cpu simultaneously. (#3733) + + -- Disable SYCL on Nvidia and AMD CIs (#3726) + + -- ParticleCopyPlan for SoA Particles (#3732) + + -- Do not use std::forward twice on the same object in AmrParticleLocator (#3734) + + -- Particle Copy Plan: Default Vals (#3729) + + -- Fix circular header file dependency (#3725) + + -- Provide portable Gpu::Atomic::Multiply and Gpu::Atomic::Divide implemented with CAS. (#3724) + + -- Plotfile Tools: Add missing option to fcompare usage print (#3722) + + -- Update documentation for STL files (#3723) + + -- Add TypeMultiplier, MakeZeroTuple and IdentityTuple (#3718) + + -- Interpolation routines for tracers with mapped_z. 
(#3714) + + -- relax constraint that real_comp_names.size() == pc.NumRealComps() + NStructReal for pure SoA plotfiles (#3717) + + -- Fix warnings in DenseBins::build with serial bin policy (#3716) + + -- lockAdd: case of 2D plane in 3D (#3700) + + -- Clean up interpolation routines in AMReX_TracerParticle_mod_K.H and AMReX_Particle_mod_K.H (#3679) + + -- Add macro for loop unrolling across compilers (#3701) + + -- Add a linspace-like function in AMReX_Algorithm.H (#3698) + + -- use amrex::Gpu::memcpy for packParticleIDs (#3699) + + -- BaseFab::lockAdd: Faster version of BaseFab::atomicAdd for OpenMP (#3696) + + -- GMRES (#3648) + + -- Add special named flag for invalid particles (#3688) + + -- Fix BL_PROFILE_TINY_FLUSH (#3695) + + -- Align GpuComplex to its size (#3691) + + -- Fix Advection_AmrCore test (#3690) + +# 24.01 + + -- MLMG: Use free functions instead of MF member functions (#3681) + + -- Add a few free functions for MLMG (#3680) + + -- Eliminating Matrix operations in MLMG CG bottom solver if initial vector is zero (#3668) + + -- Add a for loop that is unrolled at compile time (#3674) + + -- Add PTD version of getParticleCell (#3675) + + -- Improve ParIter docs (#3676) + + -- Fix CI for ROCm 6.0 (#3673) + + -- PureSoA IdCpu fixes (#3671) + + -- CMake: AMReX_PARALLEL_LINK_JOBS (#3628) + + -- Clang-Tidy in CI: Keep Going after Errors (#3667) + + -- Delete empty lines below comments on classes and functions (#3669) + + -- Documentation for Profiling: Hot Spots and Load Balance (#3622) + + -- Fix warnings in SortParticlesForDeposition (#3664) + + -- Fix Resize Issue of Fab with the Async Arena (#3663) + + -- Fix SuperParticle `push_back` (#3661) + + -- Pure SoA Particle: Separate Array for IdCPU (#3585) + + -- Limit the scope of gpu_rand_generator (#3659) + + -- Fix a typo in doxygen for NonLocalBC::FillBoundary (#3658) + + -- GNU Make: Fix name collision for aurora (#3656) + + -- two separate fixes -- particle_compare and ref_ratio=1 (#3655) + + -- Clarify documentation on setEBDirichlet() and fix link to AMReX-Hydro (#3652) + + -- Robustify the Cache Cleanup Scripts (#3650) + + -- Disable CodeQL scheduled jobs on forks (#3649) + + -- Work around compiler bug in nvcc 12.2 by using functor instead of lambda (#3653) + +# 23.12 + + -- solve_cg: avoid use of MF `z` (#3637) + + -- Fix: nosmt OMP Threads Default (#3647) + `amrex.omp_threads`: Can Avoid SMT (#3607) + + -- When checking for periodic outs on GPU, copy full particle data (#3646) + + -- MLEBABecLap: Support Robin BC at Domain Boundaries (#3617) + + -- Ascent: SoA Particle Support (#3350) + + -- solve_bicgstab: use fewer MFs (#3635) + + -- solve_bicgstab: cut use of `s` (#3629) + + -- Bug fix for amrex::Subtract when called with integer nghost (#3634) + + -- Fix typo in `MLMGT<MF>::getGradSolution` when `MF` is different from `AMF` (#3631) + + -- SUNDIALS: Use sunrealtype instead of realtype (#3632) + + -- SYCL: Use get_multi_ptr instead of get_pointer (#3630) + + -- Plotfile Tools: GPU support (#3626) + + -- solve_cg: use linop.make instead of MF constructor (#3627) + + -- CArena: shrink_in_place and operator<< (#3621) + + -- solve_bicgstab: use linop.make instead of MF constructor (#3619) + + -- replace AMREX_DEVICE_COMPILE with AMREX_IF_ON_DEVICE and AMREX_IF_ON_HOST (#3591) + + -- [Breaking] Prefix `amrex_` to each plotfile Tool (#3600) + + -- FillRandom: Use MKL host API (#3536) + + -- use hipPointerAttribute_t.type as HIP is removing hipPointerAttribute_t.memoryType (#3610) + +# 23.11 + + -- Give FlashFluxRegisters ways to
accumulate data in registers (#3597) + + -- `AMReXBuildInfo.cmake`: AMReX_DIR (#3609) + + -- update doc for amrex::Abort on GPU (#3605) + + -- Add runtime particle components to HDF5 wrapper (#3596) + + -- Windows: Fix Installed AMReXBuildInfo.cmake (#3606) + + -- Print AMReX version at the beginning of Initialize (#3604) + + -- Install Move Tools to `shared/amrex` (#3599) + + -- Revert "Add ability for GCC 8 in CMake to build fgradient which uses std::filesystem" (#3601) + + -- Avoid std::filesystem (#3602) + + -- Fix Assertion in MLEBNodeFDLaplacian (#3594) + + -- Fix a memory "leak" in VisMF's persistent streams (#3592) + + -- RealVect Static: Export (#3589) + + -- change MaxCnt from 4 to max(4,max_level+1) for how many iterations we… (#3588) + … allow in creation of the initial grid hierarchy + + -- Add Bittree CI (#3577) + + -- BCType::ext_dir_cc (#3581) + + -- Disable CCache in Windows CIs (#3566) + + -- Fix ICC CI by Freeing up Disk Space (#3583) + + -- Docs: Link pyAMReX (#3582) + + -- NodeABecLaplacian: Reuse (#3579) + + -- simplify how 2d surface integrals are computed (#3571) + + -- Adding bittree interface to improve regridding performance in octree mode (#3555) + + -- MLNodeABecLaplacian (#3559) + + -- Fix Boundary Centroid in a Corner Case in 2D (#3568) + +# 23.10 + + -- Bugfix typo in AMReX_SundialsIntegrator.H Nvar vs NVar, the + declared/used variable is NVar (#3573) + + -- Code Spell (#3563) + + -- Add Fortran interface for average_down_faces (#3553) + + -- PureSoA: Disable AoS Access (#3290) + + -- Another terrain fix for MPI (#3557) + Fix Increment, OK, and EnforcePeriodicWhere for terrain-fitted particles. (#3556) + + -- Added cvode functionality to SUNDIALS integrator (#3436) + + -- ParmParse::addfile needs Init (#3440) + + -- Make the same changes to ApplyInitialRedistribution as to ApplyMLRedistribution (#3554) + + -- Reset EB Fab Type (#3552) + EB Data outside domain (#3549) + + -- We weren't defining cent_hat out far enough (#3548) + + -- Add Fortran interface for FillCoarsePatch for face variables (#3542) + + -- print_state/printCell: Make it work without managed memory (#3543) + + -- FillPatch Fortran Interface: Fix incorrect size of Vector (#3546) + + -- ReduceOps: reset result readiness flag (#3545) + + -- Fix Fortran interface for FillPatch for face variables (#3541) + + -- Support multiple CUDA architectures at compilation (#3535) + + -- Add Kestrel machine and remove Rhodes machine from Make.nrel (#3533) + + -- Explicitly flush when writing to the terse run log (#3532) + + -- Missing header in AMReX_GpuComplex.H (#3531) + + -- Add global domain id offset to conduit wrapper (#3524) + # 23.09 -- Fix InitRandomPerBox for 1D & 2D (#3527) @@ -20,7 +323,7 @@ -- Simplify filterParticles Kernel (#3510) - -- Generatize particle-to-cell assignment function (#3499) + -- Generalize particle-to-cell assignment function (#3499) Follow-on to 3499 (#3514) ParticleLocator: Make Assignor optional template parameter (#3515) @@ -208,7 +511,7 @@ # 23.07 - -- Allow users to change the default vector growth stategy (#3389) + -- Allow users to change the default vector growth strategy (#3389) -- Communications arena implementation (#3388) diff --git a/Docs/sphinx_documentation/source/AMReX_Profiling_Tools.rst b/Docs/sphinx_documentation/source/AMReX_Profiling_Tools.rst index 8726f51a2ba..cdd774488e4 100644 --- a/Docs/sphinx_documentation/source/AMReX_Profiling_Tools.rst +++ b/Docs/sphinx_documentation/source/AMReX_Profiling_Tools.rst @@ -93,6 +93,47 @@ it is also recommended to 
wrap any ``BL_PROFILE_TINY_FLUSH();`` calls in informative ``amrex::Print()`` lines to ensure accurate identification of each set of timers. +Hot Spots and Load Balance +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The output of TinyProfiler can help us to identify hot spots. For example, +the following output shows the top three hot spots of a linear solver test +running on 4 MPI processes. + +.. highlight:: console + +:: + + -------------------------------------------------------------------------------------------- + Name NCalls Excl. Min Excl. Avg Excl. Max Max % + -------------------------------------------------------------------------------------------- + MLPoisson::Fsmooth() 560 0.4775 0.4793 0.4815 34.97% + MLPoisson::Fapply() 114 0.1103 0.113 0.1167 8.48% + FabArray::Xpay() 109 0.1 0.1013 0.1038 7.54% + +In this test, there are 16 boxes evenly distributed among 4 MPI processes. The +output above shows that the load is perfectly balanced. However, if the load +is not balanced, the results can be very different and sometimes +misleading. For example, if we put 2, 2, 6 and 6 boxes on processes 0, 1, 2 +and 3, respectively, the top three hot spots now include two MPI +communication functions, ``FillBoundary`` and ``ParallelCopy``. + +.. highlight:: console + +:: + + -------------------------------------------------------------------------------------------- + Name NCalls Excl. Min Excl. Avg Excl. Max Max % + -------------------------------------------------------------------------------------------- + FillBoundary_finish() 607 0.01568 0.3367 0.6574 41.97% + MLPoisson::Fsmooth() 560 0.2133 0.4047 0.5973 38.13% + FabArray::ParallelCopy_finish() 231 0.002977 0.09748 0.1895 12.10% + +The reason that the MPI communication appears slow is that the lightly +loaded processes have to wait for messages sent by the heavily loaded +processes. See also :ref:`sec:profopts` for a diagnostic option that may +provide more insight on the load imbalance. + .. 
_sec:full:profiling: Full Profiling diff --git a/Docs/sphinx_documentation/source/AmrCore.rst b/Docs/sphinx_documentation/source/AmrCore.rst index 6aadd62250e..0e4e6932126 100644 --- a/Docs/sphinx_documentation/source/AmrCore.rst +++ b/Docs/sphinx_documentation/source/AmrCore.rst @@ -648,11 +648,11 @@ interface to a Fortran routine that tags cells (in this case, :fortran:`state_er const int* thi = tilebox.hiVect(); // tag cells for refinement - state_error(tptr, ARLIM_3D(tlo), ARLIM_3D(thi), + state_error(tptr, AMREX_ARLIM_3D(tlo), AMREX_ARLIM_3D(thi), BL_TO_FORTRAN_3D(state[mfi]), &tagval, &clearval, - ARLIM_3D(tilebox.loVect()), ARLIM_3D(tilebox.hiVect()), - ZFILL(dx), ZFILL(prob_lo), &time, &phierr[lev]); + AMREX_ARLIM_3D(tilebox.loVect()), AMREX_ARLIM_3D(tilebox.hiVect()), + AMREX_ZFILL(dx), AMREX_ZFILL(prob_lo), &time, &phierr[lev]); // // Now update the tags in the TagBox in the tilebox region // to be equal to itags diff --git a/Docs/sphinx_documentation/source/AmrLevel.rst b/Docs/sphinx_documentation/source/AmrLevel.rst index 6f0a1f6fb18..dfbae5cde25 100644 --- a/Docs/sphinx_documentation/source/AmrLevel.rst +++ b/Docs/sphinx_documentation/source/AmrLevel.rst @@ -110,7 +110,7 @@ the :cpp:`AmrLevelAdv` class, e.g., int lo_bc[BL_SPACEDIM]; int hi_bc[BL_SPACEDIM]; for (int i = 0; i < BL_SPACEDIM; ++i) { - lo_bc[i] = hi_bc[i] = INT_DIR; // periodic boundaries + lo_bc[i] = hi_bc[i] = amrex::BCType::int_dir; // periodic boundaries } BCRec bc(lo_bc, hi_bc); diff --git a/Docs/sphinx_documentation/source/Basics.rst b/Docs/sphinx_documentation/source/Basics.rst index fd2d12ee0b8..8c5fe3dbba5 100644 --- a/Docs/sphinx_documentation/source/Basics.rst +++ b/Docs/sphinx_documentation/source/Basics.rst @@ -1942,7 +1942,8 @@ tiling flag is on. One can change the default size using :cpp:`ParmParse` | | FArrayBoxes. | | +-----------------------------------------------------+------------------------------------------------------+ -Dynamic tiling, which runs one box per OpenMP thread, is also available. +Dynamic tiling, which runs one box per OpenMP thread, either with or without +tiling the box, is also available. This is useful when the underlying work cannot benefit from thread parallelization. Dynamic tiling is implemented using the :cpp:`MFItInfo` object and requires the :cpp:`MFIter` loop to be defined in an OpenMP @@ -1981,9 +1982,13 @@ Dynamic tiling also allows explicit definition of a tile size: ... } -Usually :cpp:`MFIter` is used for accessing multiple MultiFabs like the second -example, in which two MultiFabs, :cpp:`U` and :cpp:`F`, use :cpp:`MFIter` via -:cpp:`operator[]`. These different MultiFabs may have different BoxArrays. For +Note that :cpp:`EnableTiling()`, with no argument, will use the default tile size. + +Usually :cpp:`MFIter` is used for accessing multiple MultiFabs, like +the second example in the previous section on :ref:`sec:basics:mfiter:notiling` +in which two MultiFabs, :cpp:`U` and :cpp:`F`, use :cpp:`MFIter` via +:cpp:`array()` and :cpp:`const_array()` functions. These different MultiFabs +may have different BoxArrays. For example, :cpp:`U` might be cell-centered, whereas :cpp:`F` might be nodal in :math:`x`-direction and cell in other directions. The :cpp:`MFIter::validbox` and :cpp:`tilebox` functions return Boxes of the same type as the @@ -2537,12 +2542,27 @@ The basic idea behind physical boundary conditions is as follows: ext_dir "External Dirichlet". It is the user's responsibility to write a routine - to fill ghost cells (more details below). 
+ to fill ghost cells (more details below). The boundary location + is on the domain face even when the data inside the domain are + cell-centered. + + ext_dir_cc + "External Dirichlet". It is the user's responsibility to write a routine + to fill ghost cells (more details below). The boundary location + is at the cell center of ghost cells outside the domain. foextrap "First Order Extrapolation" First order extrapolation from last cell in interior. + hoextrap + "High Order Extrapolation". The boundary location is on the domain + face even when the data inside the domain are cell-centered. + + hoextrapcc + "High Order Extrapolation". The boundary location is at the cell + center of ghost cells outside the domain. + reflect_even Reflection from interior cells with sign unchanged, :math:`q(-i) = q(i)`. @@ -2797,3 +2817,6 @@ Backtrace files are produced by AMReX signal handler by default when segfault occurs or ``Abort`` is called. If the application does not want AMReX to handle this, ``ParmParse`` parameter `amrex.signal_handling=0` can be used to disable it. + +See :ref:`sec:gpu:assertion` for considerations on using these functions in +GPU-enabled code. diff --git a/Docs/sphinx_documentation/source/Debugging.rst b/Docs/sphinx_documentation/source/Debugging.rst index 89eee31c2bd..aa5a9dcc9a3 100644 --- a/Docs/sphinx_documentation/source/Debugging.rst +++ b/Docs/sphinx_documentation/source/Debugging.rst @@ -24,7 +24,7 @@ handling of floating point exceptions: ``amrex.fpe_trap_invalid`` for NaNs, ``amrex.fpe_trap_zero`` for division by zero and ``amrex.fpe_trap_overflow`` for overflow. To more effectively trap the use of uninitialized values, AMReX also initializes ``FArrayBox``\ s in -``MulitFab``\ s and arrays allocated by ``bl_allocate`` to signaling NaNs when it is compiled +``MultiFab``\ s and arrays allocated by ``bl_allocate`` to signaling NaNs when it is compiled with ``TEST=TRUE`` or ``DEBUG=TRUE`` in GNU make, or with ``-DCMAKE_BUILD_TYPE=Debug`` in CMake. One can also control the setting for ``FArrayBox`` using the runtime parameter, ``fab.init_snan``. Note for Macs, M1 and M2 chips using Arm64 architecture are not able to trap division by zero. diff --git a/Docs/sphinx_documentation/source/EB.rst b/Docs/sphinx_documentation/source/EB.rst index 446e7305e50..9b847a3259c 100644 --- a/Docs/sphinx_documentation/source/EB.rst +++ b/Docs/sphinx_documentation/source/EB.rst @@ -48,7 +48,7 @@ Here is a simple example of initializing the database for an embedded sphere. EB2::Build(shop, geom, 0, 0); Alternatively, the EB information can be initialized from an STL file -specified by a :cpp:`ParmParse` parameter ``eb2.stl_file``. The +specified by a :cpp:`ParmParse` parameter ``eb2.stl_file``. (This also requires setting ``eb2.geom_type = stl``.) The initialization is done by calling .. highlight:: c++ :: @@ -386,6 +386,9 @@ testing cell types and getting neighbor information. For example end do end do + +..
_sec:EB:redistribution: + Small Cell Problem and Redistribution ===================================== diff --git a/Docs/sphinx_documentation/source/GPU.rst b/Docs/sphinx_documentation/source/GPU.rst index 1391015f31e..c75440a27fd 100644 --- a/Docs/sphinx_documentation/source/GPU.rst +++ b/Docs/sphinx_documentation/source/GPU.rst @@ -217,7 +217,7 @@ variables to configure the build +------------------------------+-------------------------------------------------+-------------+-----------------+ | SYCL_SUB_GROUP_SIZE | Specify subgroup size | 32 | 64, 32, 16 | +------------------------------+-------------------------------------------------+-------------+-----------------+ - | SYCL_MAX_PARALLEL_LINK_JOBS | Number of parallel jobs in device link | 1 | 1, 2, 3, etc. | + | SYCL_PARALLEL_LINK_JOBS | Number of parallel jobs in device link | 1 | 1, 2, 3, etc. | +------------------------------+-------------------------------------------------+-------------+-----------------+ .. raw:: latex @@ -307,7 +307,7 @@ If autodetection fails, a list of "common" architectures is assumed. Building for multiple CUDA architectures will generally result in a larger library and longer build times. **Note that AMReX supports NVIDIA GPU architectures with compute capability 6.0 or higher and -CUDA Toolkit version 9.0 or higher.** +CUDA Toolkit version 11.0 or higher.** In order to import the CUDA-enabled AMReX library into your CMake project, you need to include the following code into the appropriate CMakeLists.txt file: @@ -428,22 +428,24 @@ Below is an example configuration for SYCL: .. table:: AMReX SYCL-specific build options - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | Variable Name | Description | Default | Possible values | - +==============================+=================================================+=============+=================+ - | AMReX_SYCL_AOT | Enable SYCL ahead-of-time compilation | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | AMReX_SYCL_AOT_GRF_MODE | Specify AOT register file mode | Default | Default, Large, | - | | | | AutoLarge | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | AMREX_INTEL_ARCH | Specify target if AOT is enabled | None | pvc, etc. 
| - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | AMReX_SYCL_SPLIT_KERNEL | Enable SYCL kernel splitting | YES | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | AMReX_SYCL_ONEDPL | Enable SYCL's oneDPL algorithms | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | AMReX_SYCL_SUB_GROUP_SIZE | Specify subgroup size | 32 | 64, 32, 16 | - +------------------------------+-------------------------------------------------+-------------+-----------------+ + +-------------------------------+----------------------------------------------+-------------+------------------+ + | Variable Name | Description | Default | Possible values | + +===============================+==============================================+=============+==================+ + | AMReX_SYCL_AOT | Enable SYCL ahead-of-time compilation | NO | YES, NO | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMReX_SYCL_AOT_GRF_MODE | Specify AOT register file mode | Default | Default, Large, | + | | | | AutoLarge | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMREX_INTEL_ARCH | Specify target if AOT is enabled | None | pvc, etc. | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMReX_SYCL_SPLIT_KERNEL | Enable SYCL kernel splitting | YES | YES, NO | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMReX_SYCL_ONEDPL | Enable SYCL's oneDPL algorithms | NO | YES, NO | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMReX_SYCL_SUB_GROUP_SIZE | Specify subgroup size | 32 | 64, 32, 16 | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMReX_PARALLEL_LINK_JOBS | Specify number of parallel link jobs | 1 | positive integer | + +-------------------------------+----------------------------------------------+-------------+------------------+ .. raw:: latex \end{center} @@ -489,11 +491,10 @@ GPU support. When AMReX is compiled with ``USE_OMP_OFFLOAD=TRUE``, ``AMREX_USE_OMP_OFFLOAD`` is defined. -In addition to AMReX's preprocessor macros, CUDA provides the -``__CUDA_ARCH__`` macro which is only defined when in device code. -HIP and Sycl provide similar macros. -``AMREX_DEVICE_COMPILE`` should be used when a ``__host__ __device__`` -function requires separate code for the CPU and GPU implementations. +The macros ``AMREX_IF_ON_DEVICE((code_for_device))`` and +``AMREX_IF_ON_HOST((code_for_host))`` should be used when a +``__host__ __device__`` function requires separate code for the +CPU and GPU implementations. .. =================================================================== @@ -795,7 +796,7 @@ As another example, the following function computes the max- and 1-norm of a :: GpuTuple compute_norms (MultiFab const& mf, - iMulitiFab const& mask) + iMultiFab const& mask) { auto const& data_ma = mf.const_arrays(); auto const& mask_ma = mask.const_arrays(); @@ -1553,9 +1554,13 @@ Assertions and Error Checking To help debugging, we often use :cpp:`amrex::Assert` and :cpp:`amrex::Abort`. 
These functions are GPU safe and can be used in GPU kernels. However, implementing these functions requires additional -GPU registers, which will reduce overall performance. Therefore, it -is preferred to implement such calls in debug mode only by wrapping the -calls using ``#ifdef AMREX_DEBUG``. +GPU registers, which will reduce overall performance. Therefore, by +default these functions and the macro ``AMREX_ALWAYS_ASSERT`` are no-ops +for optimized builds (e.g., ``DEBUG=FALSE`` using the GNU Make build +system) when called from kernels run on GPU. Calls to these functions from +GPU kernels are active for debug builds and can optionally be activated +at compile time for optimized builds (e.g., ``DEBUG=FALSE`` and +``USE_ASSERTION=TRUE`` using the GNU Make build system). In CPU code, :cpp:`AMREX_GPU_ERROR_CHECK()` can be called to check the health of previous GPU launches. This call @@ -1738,14 +1743,14 @@ by "amrex" in your :cpp:`inputs` file. +----------------------------+-----------------------------------------------------------------------+-------------+----------+ | | Description | Type | Default | +============================+=======================================================================+=============+==========+ -| use_gpu_aware_mpi | Whether to use GPU memory for communication buffers during MPI calls. | Bool | False | -| | If true, the buffers will use device memory. If false, they will use | | | -| | pinned memory. In practice, we find it is usually not worth it to use | | | -| | GPU aware MPI. | | | +| use_gpu_aware_mpi | Whether to use GPU memory for communication buffers during MPI calls. | Bool | 0 | +| | If true, the buffers will use device memory. If false (i.e., 0), they | | | +| | will use pinned memory. In practice, we find it is not always worth | | | +| | it to use GPU aware MPI. | | | +----------------------------+-----------------------------------------------------------------------+-------------+----------+ -| abort_on_out_of_gpu_memory | If the size of free memory on the GPU is less than the size of a | Bool | False | +| abort_on_out_of_gpu_memory | If the size of free memory on the GPU is less than the size of a | Bool | 0 | | | requested allocation, AMReX will call AMReX::Abort() with an error | | | | | describing how much free memory there is and what was requested. | | | +----------------------------+-----------------------------------------------------------------------+-------------+----------+ -| the_arena_is_managed | Whether :cpp:`The_Arena()` allocates managed memory. | Bool | False | +| the_arena_is_managed | Whether :cpp:`The_Arena()` allocates managed memory. | Bool | 0 | +----------------------------+-----------------------------------------------------------------------+-------------+----------+ diff --git a/Docs/sphinx_documentation/source/InputsComputeBackends.rst b/Docs/sphinx_documentation/source/InputsComputeBackends.rst new file mode 100644 index 00000000000..26e5d527508 --- /dev/null +++ b/Docs/sphinx_documentation/source/InputsComputeBackends.rst @@ -0,0 +1,21 @@ +.. _Chap:InputsComputeBackends: + +Compute Backends +================ + +The following inputs must be preceded by ``amrex.`` and determine runtime options of CPU or GPU compute implementations. 
+ ++------------------------+-----------------------------------------------------------------------+-------------+------------+ +| Parameter | Description | Type | Default | ++========================+=======================================================================+=============+============+ +| ``omp_threads`` | If OpenMP is enabled, this can be used to set the default number of | String | ``system`` | +| | threads. The special value ``nosmt`` can be used to avoid using | or Int | | +| | threads for virtual cores (aka Hyperthreading or SMT), as is default | | | +| | in OpenMP, and instead only spawns threads equal to the number of | | | +| | physical cores in the system. | | | +| | For the values ``system`` and ``nosmt``, the environment variable | | | +| | ``OMP_NUM_THREADS`` takes precedence. For Integer values, | | | +| | ``OMP_NUM_THREADS`` is ignored. | | | ++------------------------+-----------------------------------------------------------------------+-------------+------------+ + +For GPU-specific parameters, see also the :ref:`GPU chapter <Chap:GPU>`. diff --git a/Docs/sphinx_documentation/source/InputsPlotFiles.rst b/Docs/sphinx_documentation/source/InputsPlotFiles.rst index 64b202ebf5c..9e8789a90ac 100644 --- a/Docs/sphinx_documentation/source/InputsPlotFiles.rst +++ b/Docs/sphinx_documentation/source/InputsPlotFiles.rst @@ -12,7 +12,7 @@ as whether a plotfile should be written out immediately after restarting a simul | plot_int | Frequency of plotfile output; | Int | -1 | | | if -1 then no plotfiles will be written | | | +---------------------+-----------------------------------------------------------------------+-------------+-----------+ -| plotfile_on_restart | Should we write a plotfile when we restart (only used if plot_int>0) | Bool | False | +| plotfile_on_restart | Should we write a plotfile when we restart (only used if plot_int>0) | Bool | 0 (false) | +---------------------+-----------------------------------------------------------------------+-------------+-----------+ | plot_file | Prefix to use for plotfile output | String | plt | +---------------------+-----------------------------------------------------------------------+-------------+-----------+ diff --git a/Docs/sphinx_documentation/source/Inputs_Chapter.rst b/Docs/sphinx_documentation/source/Inputs_Chapter.rst index 0a64aeb492c..43ead40b3c6 100644 --- a/Docs/sphinx_documentation/source/Inputs_Chapter.rst +++ b/Docs/sphinx_documentation/source/Inputs_Chapter.rst @@ -9,6 +9,7 @@ Run-time Inputs InputsProblemDefinition InputsTimeStepping InputsLoadBalancing + InputsComputeBackends InputsPlotFiles InputsCheckpoint diff --git a/Docs/sphinx_documentation/source/LinearSolvers.rst b/Docs/sphinx_documentation/source/LinearSolvers.rst index 87048bc8195..ab0ba3506a9 100644 --- a/Docs/sphinx_documentation/source/LinearSolvers.rst +++ b/Docs/sphinx_documentation/source/LinearSolvers.rst @@ -293,6 +293,18 @@ For Robin boundary conditions, the ghost cells in store the numerical values in the condition, :math:`a\phi + b\frac{\partial\phi}{\partial n} = f`. +4) The nodal solver provides the option to use an overset mask: + +.. highlight:: c++ + +:: + + // omask is either 0 or 1. 1 means the node is an unknown. 0 means it's known. + void setOversetMask (int amrlev, const iMultiFab& a_dmask); + +Note that this is an integer (not bool) MultiFab, so the values must be either 0 or 1. + + ..
_sec:linearsolver:pars: Parameters @@ -483,7 +495,9 @@ To set homogeneous Dirichlet boundary conditions, call ml_ebabeclap->setEBHomogDirichlet(lev, coeff); where coeff can be a real number (i.e. the value is the same at every cell) -or is the MultiFab holding the coefficient of the gradient at each cell with an EB face. +or a MultiFab holding the coefficient of the gradient at each cell with an EB face. +In other words, coeff is :math:`\beta` in the canonical form given in equation :eq:`eqn::abeclap` +located at the EB surface centroid. To set inhomogeneous Dirichlet boundary conditions, call @@ -494,8 +508,9 @@ To set inhomogeneous Dirichlet boundary conditions, call ml_ebabeclap->setEBDirichlet(lev, phi_on_eb, coeff); where phi_on_eb is the MultiFab holding the Dirichlet values in every cut cell, -and coeff again is a real number (i.e. the value is the same at every cell) -or a MultiFab holding the coefficient of the gradient at each cell with an EB face. +and coeff again is a real number +or a MultiFab holding the coefficient of the gradient at each cell with an EB face, +i.e. :math:`\beta` in equation :eq:`eqn::abeclap` located at the EB surface centroid. Currently there are options to define the face-based coefficients on face centers vs face centroids, and to interpret the solution variable @@ -565,7 +580,7 @@ The following parameter should be set to True if the problem to be solved has a In this case, the solution is only defined to within a constant. Setting this parameter to True replaces one row in the matrix sent to hypre from AMReX by a row that sets the value at one cell to 0. -- :cpp:`hypre.adjust_singular_matrix`: Default is False. +- :cpp:`hypre.adjust_singular_matrix`: Default is false. The following parameters can be set in the inputs file to control the choice of preconditioner and smoother: diff --git a/Docs/sphinx_documentation/source/Particle.rst b/Docs/sphinx_documentation/source/Particle.rst index da5fabb02b2..e3a28591a72 100644 --- a/Docs/sphinx_documentation/source/Particle.rst +++ b/Docs/sphinx_documentation/source/Particle.rst @@ -86,7 +86,8 @@ tracked as the particle positions change. To do this, we provide the :: - ParticleContainer<3, 2, 4, 4> mypc; + using MyParticleContainer = ParticleContainer<3, 2, 4, 4>; + MyParticleContainer mypc; Like the :cpp:`Particle` class itself, the :cpp:`ParticleContainer` class is templated. The first two template parameters have the same meaning as @@ -375,8 +376,8 @@ example, to iterate over all the AoS data: :: - using MyParIter = ConstParIter<2*BL_SPACEDIM>; - for (MyParIter pti(pc, lev); pti.isValid(); ++pti) { + using MyParConstIter = MyParticleContainer::ParConstIterType; + for (MyParConstIter pti(pc, lev); pti.isValid(); ++pti) { const auto& particles = pti.GetArrayOfStructs(); for (const auto& p : particles) { // do stuff with p... @@ -392,7 +393,7 @@ skipped. 
You can also access the SoA data using the :cpp:`ParIter` as follows: :: - using MyParIter = ParIter<0, 0, 2, 2>; + using MyParIter = MyParticleContainer::ParIterType; for (MyParIter pti(pc, lev); pti.isValid(); ++pti) { auto& particle_attributes = pti.GetStructOfArrays(); RealVector& real_comp0 = particle_attributes.GetRealData(0); @@ -713,7 +714,7 @@ with OpenMP, the first thing to look at is whether there are enough tiles availa +-------------------+-----------------------------------------------------------------------+-------------+-------------+ | | Description | Type | Default | +===================+=======================================================================+=============+=============+ -| do_tiling | Whether to use tiling for particles. Should be on when using OpenMP, | Bool | False | +| do_tiling | Whether to use tiling for particles. Should be on when using OpenMP, | Bool | false | | | and off when running on GPUs. | | | +-------------------+-----------------------------------------------------------------------+-------------+-------------+ | tile_size | If tiling is on, the maximum tile_size to use in each direction | Ints | 1024000,8,8 | @@ -739,7 +740,7 @@ problems with particle IO, you could try varying some / all of these parameters. | datadigits_read | This is for backwards compatibility; don't use unless you need to read | Int | 5 | | | an old (pre mid 2017) AMReX dataset. | | | +-------------------+-----------------------------------------------------------------------+-------------+-------------+ -| use_prepost | This is an optimization for large particle datasets that groups MPI | Bool | False | +| use_prepost | This is an optimization for large particle datasets that groups MPI | Bool | false | | | calls needed during the IO together. Try it if seeing poor IO speeds | | | | | on large problems. | | | +-------------------+-----------------------------------------------------------------------+-------------+-------------+ diff --git a/Docs/sphinx_documentation/source/Python_Chapter.rst b/Docs/sphinx_documentation/source/Python_Chapter.rst new file mode 100644 index 00000000000..76061ea2ebe --- /dev/null +++ b/Docs/sphinx_documentation/source/Python_Chapter.rst @@ -0,0 +1,13 @@ +.. role:: cpp(code) + :language: c++ + +.. _Chap:Python: + +Python Interface +================ + + +The core of AMReX is written in C++. +For users who want to write all of their programs in Python, or C++ application developers who would like to add Python interfaces to their applications for scripting, rapid prototyping, code coupling and/or AI/ML workflows, many AMReX classes, functions and all data containers are now also available. + +Please see `pyAMReX <https://github.com/AMReX-Codes/pyamrex>`__ (`manual <https://pyamrex.readthedocs.io>`__) for further details. diff --git a/Docs/sphinx_documentation/source/conf.py b/Docs/sphinx_documentation/source/conf.py index dc29ab6e041..8cb17c78e1f 100644 --- a/Docs/sphinx_documentation/source/conf.py +++ b/Docs/sphinx_documentation/source/conf.py @@ -42,7 +42,7 @@ def get_amrex_version(): intersphinx_mapping = { 'amrex_tutorials': ('https://amrex-codes.github.io/amrex/tutorials_html/', None), - 'amrex_hydro':('https://amrex-codes.github.io/amrex/hydro_html/', None) + 'amrex_hydro':('https://amrex-fluids.github.io/amrex-hydro/docs_html/', None) } # Add any paths that contain templates here, relative to this directory.
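A usage sketch for the container-derived iterator aliases documented in the Particle.rst hunks above: the container parameters <3, 2, 4, 4> are taken from the documentation example, while the function name and the update it applies are illustrative assumptions, not part of this patch.

::

    #include <AMReX_Particles.H>

    using MyParticleContainer = amrex::ParticleContainer<3, 2, 4, 4>;

    // Hypothetical helper: double the first SoA real component on one level.
    void double_first_real_comp (MyParticleContainer& pc, int lev)
    {
        // ParIterType is derived from the container type itself, so the
        // iterator can never drift out of sync with the container's
        // template parameters, which is the point of the change above.
        using MyParIter = MyParticleContainer::ParIterType;
        for (MyParIter pti(pc, lev); pti.isValid(); ++pti) {
            auto& particle_attributes = pti.GetStructOfArrays();
            auto& real_comp0 = particle_attributes.GetRealData(0);
            for (auto& x : real_comp0) { x *= 2.0; } // host-side loop
        }
    }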
diff --git a/Docs/sphinx_documentation/source/index.rst b/Docs/sphinx_documentation/source/index.rst index 8cdced26a19..b748a6394a5 100644 --- a/Docs/sphinx_documentation/source/index.rst +++ b/Docs/sphinx_documentation/source/index.rst @@ -51,6 +51,7 @@ Documentation on migration from BoxLib is available in the AMReX repository at D LinearSolvers_Chapter Particle_Chapter Fortran_Chapter + Python_Chapter EB_Chapter TimeIntegration_Chapter GPU_Chapter diff --git a/GNUmakefile.in b/GNUmakefile.in index ab31f4e192b..dda52f90d04 100644 --- a/GNUmakefile.in +++ b/GNUmakefile.in @@ -49,6 +49,9 @@ endif ifeq ($(USE_SUNDIALS),TRUE) Pdirs += Extern/SUNDIALS endif +ifeq ($(USE_BITTREE),TRUE) + Pdirs += Extern/Bittree +endif Ppack := $(foreach dir, $(Pdirs), $(AMREX_HOME)/Src/$(dir)/Make.package) include $(Ppack) diff --git a/LICENSE b/LICENSE index 60027deecd0..a9a04e67eac 100644 --- a/LICENSE +++ b/LICENSE @@ -1,33 +1,31 @@ -Copyright (c) 2017, The Regents of the University of California, -through Lawrence Berkeley National Laboratory and the Alliance for -Sustainable Energy, LLC., through National Renewable Energy Laboratory -(subject to receipt of any required approvals from the U.S. Dept. of -Energy). All rights reserved. +AMReX Copyright (c) 2024, The Regents of the University of California, +through Lawrence Berkeley National Laboratory (subject to receipt of any +required approvals from the U.S. Dept. of Energy). All rights reserved. Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: +modification, are permitted provided that the following conditions are met: -(1) Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. +(1) Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. (2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -(3) Neither the name of the University of California, Lawrence -Berkeley National Laboratory, Alliance for Sustainable Energy, LLC., -National Renewable Energy Laboratory, U.S. Dept. of Energy nor the -names of its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. +(3) Neither the name of the University of California, Lawrence Berkeley +National Laboratory, U.S. Dept. of Energy nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md index 786fa11babc..c7efc128376 100644 --- a/README.md +++ b/README.md @@ -93,25 +93,24 @@ Any level of changes are welcomed: documentation, bug fixes, new test problems, new solvers, etc. For more details on how to contribute to AMReX, please see [CONTRIBUTING.md](CONTRIBUTING.md). -## License +## Copyright Notice + +AMReX Copyright (c) 2024, The Regents of the University of California, +through Lawrence Berkeley National Laboratory (subject to receipt of any +required approvals from the U.S. Dept. of Energy). All rights reserved. + +If you have questions about your rights to use or distribute this software, +please contact Berkeley Lab's Intellectual Property Office at IPO@lbl.gov. -AMReX Copyright (c) 2017, The Regents of the University of California, -through Lawrence Berkeley National Laboratory and the Alliance for -Sustainable Energy, LLC., through National Renewable Energy Laboratory -(subject to receipt of any required approvals from the U.S. Dept. of -Energy). All rights reserved. - -If you have questions about your rights to use or distribute this -software, please contact Berkeley Lab's Innovation & Partnerships -Office at IPO@lbl.gov. - -NOTICE. This Software was developed under funding from the -U.S. Department of Energy and the U.S. Government consequently retains -certain rights. As such, the U.S. Government has been granted for -itself and others acting on its behalf a paid-up, nonexclusive, -irrevocable, worldwide license in the Software to reproduce, -distribute copies to the public, prepare derivative works, and perform -publicly and display publicly, and to permit other to do so. +NOTICE. This Software was developed under funding from the U.S. Department +of Energy and the U.S. Government consequently retains certain rights. As +such, the U.S. Government has been granted for itself and others acting on +its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the +Software to reproduce, distribute copies to the public, prepare derivative +works, and perform publicly and display publicly, and to permit others to do +so. + +## License License for AMReX can be found at [LICENSE](LICENSE). 
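The Src/Amr hunks below replace ``std::endl`` with ``'\n'`` plus an explicit ``flush()``. A minimal sketch of the pattern being adopted (the function and variable names here are made up for illustration, not taken from the patch):

::

    #include <fstream>

    // std::endl inserts '\n' AND flushes on every call; writing '\n' and
    // flushing explicitly preserves the flush where it is wanted, while
    // making it easy to omit on hot paths that do not need it.
    void append_step (std::ofstream& runlog, long step, double time)
    {
        runlog << step << " " << time << '\n';
        runlog.flush();
    }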
diff --git a/Src/Amr/AMReX_Amr.H b/Src/Amr/AMReX_Amr.H index a7173fd105a..bb18ec9d160 100644 --- a/Src/Amr/AMReX_Amr.H +++ b/Src/Amr/AMReX_Amr.H @@ -30,7 +30,6 @@ class AmrInSituBridge; * not belong on a single level, like establishing and updating the hierarchy * of levels, global timestepping, and managing the different AmrLevels */ - class Amr : public AmrCore { diff --git a/Src/Amr/AMReX_Amr.cpp b/Src/Amr/AMReX_Amr.cpp index 0b1dfb6f886..30ca8f8dc00 100644 --- a/Src/Amr/AMReX_Amr.cpp +++ b/Src/Amr/AMReX_Amr.cpp @@ -1830,8 +1830,8 @@ Amr::checkPoint () amrex::FileOpenFailed(FAHeaderFilesName); } - for(int i(0); i < FAHeaderNames.size(); ++i) { - FAHeaderFile << FAHeaderNames[i] << '\n'; + for(const auto & FAHeaderName : FAHeaderNames) { + FAHeaderFile << FAHeaderName << '\n'; } } } @@ -2206,7 +2206,8 @@ Amr::coarseTimeStep (Real stop_time) } if (record_run_info_terse && ParallelDescriptor::IOProcessor()) { runlog_terse << level_steps[0] << " " << cumtime << " " << dt_level[0]; - runlog_terse << std::endl; // Make sure we flush! + runlog_terse << '\n'; + runlog_terse.flush(); } int check_test = 0; @@ -2346,11 +2347,11 @@ Amr::coarseTimeStep (Real stop_time) if(ParallelDescriptor::IOProcessor()) { if (to_checkpoint) { - amrex::ErrorStream() << "Stopped by user w/ checkpoint" << std::endl; + amrex::ErrorStream() << "Stopped by user w/ checkpoint" << '\n'; } else { - amrex::ErrorStream() << "Stopped by user w/o checkpoint" << std::endl; + amrex::ErrorStream() << "Stopped by user w/o checkpoint" << '\n'; } } } @@ -2735,7 +2736,7 @@ Amr::regrid (int lbase, << time << " : REGRID with lbase = " << lbase - << std::endl; + << '\n'; if (verbose > 1) { @@ -2916,7 +2917,8 @@ Amr::printGridInfo (std::ostream& os, } } - os << std::endl; // Make sure we flush! + os << '\n'; + os.flush(); } @@ -3085,7 +3087,7 @@ Amr::bldFineLevels (Real strt_time) { bool grids_the_same; - const int MaxCnt = 4; + const int MaxCnt = std::max(4, max_level+1); int count = 0; diff --git a/Src/Amr/AMReX_AmrLevel.H b/Src/Amr/AMReX_AmrLevel.H index d4ac6c7c70d..37d0bdb9200 100644 --- a/Src/Amr/AMReX_AmrLevel.H +++ b/Src/Amr/AMReX_AmrLevel.H @@ -34,7 +34,6 @@ class TagBoxArray; * AmrLevel functions both as a container for state data on a level * and also manages the advancement of data in time. */ - class AmrLevel { friend class Amr; @@ -554,8 +553,8 @@ private: // // The data. // - AmrLevel& m_amrlevel; - MultiFab& m_leveldata; + AmrLevel* m_amrlevel; + MultiFab* m_leveldata; std::vector< std::pair<int,int> > m_range; MultiFab m_fabs; int m_ncomp; @@ -592,8 +591,8 @@ private: // // The data.
// - AmrLevel& m_amrlevel; - MultiFab& m_leveldata; + AmrLevel* m_amrlevel; + MultiFab* m_leveldata; MultiFabCopyDescriptor m_mfcd; Vector< Vector<MultiFabId> > m_mfid; // [level][oldnew] Interpolater* m_map = nullptr; diff --git a/Src/Amr/AMReX_AmrLevel.cpp b/Src/Amr/AMReX_AmrLevel.cpp index 7cdd83e1cae..dff9c88b61c 100644 --- a/Src/Amr/AMReX_AmrLevel.cpp +++ b/Src/Amr/AMReX_AmrLevel.cpp @@ -663,17 +663,17 @@ AmrLevel::setPhysBoundaryValues (FArrayBox& dest, FillPatchIteratorHelper::FillPatchIteratorHelper (AmrLevel& amrlevel, MultiFab& leveldata) : - m_amrlevel(amrlevel), - m_leveldata(leveldata), - m_mfid(m_amrlevel.level+1) + m_amrlevel(&amrlevel), + m_leveldata(&leveldata), + m_mfid(m_amrlevel->level+1) {} FillPatchIterator::FillPatchIterator (AmrLevel& amrlevel, MultiFab& leveldata) : MFIter(leveldata), - m_amrlevel(amrlevel), - m_leveldata(leveldata), + m_amrlevel(&amrlevel), + m_leveldata(&leveldata), m_ncomp(0) { MFIter::depth = 0; @@ -688,9 +688,9 @@ FillPatchIteratorHelper::FillPatchIteratorHelper (AmrLevel& amrlevel, int ncomp, InterpBase* mapper) : - m_amrlevel(amrlevel), - m_leveldata(leveldata), - m_mfid(m_amrlevel.level+1), + m_amrlevel(&amrlevel), + m_leveldata(&leveldata), + m_mfid(m_amrlevel->level+1), m_time(time), m_growsize(boxGrow), m_index(index), @@ -709,8 +709,8 @@ FillPatchIterator::FillPatchIterator (AmrLevel& amrlevel, int ncomp) : MFIter(leveldata), - m_amrlevel(amrlevel), - m_leveldata(leveldata), + m_amrlevel(&amrlevel), + m_leveldata(&leveldata), m_ncomp(ncomp) { BL_ASSERT(scomp >= 0); @@ -726,12 +726,13 @@ FillPatchIterator::FillPatchIterator (AmrLevel& amrlevel, #endif } -static +namespace { bool NeedToTouchUpPhysCorners (const Geometry& geom) { return geom.isAnyPeriodic() && !geom.isAllPeriodic(); } +} void FillPatchIteratorHelper::Initialize (int boxGrow, @@ -756,24 +757,24 @@ FillPatchIteratorHelper::Initialize (int boxGrow, m_index = idx; m_scomp = scomp; m_ncomp = ncomp; - m_FixUpCorners = NeedToTouchUpPhysCorners(m_amrlevel.geom); + m_FixUpCorners = NeedToTouchUpPhysCorners(m_amrlevel->geom); const int MyProc = ParallelDescriptor::MyProc(); - auto& amrLevels = m_amrlevel.parent->getAmrLevels(); - const AmrLevel& topLevel = *amrLevels[m_amrlevel.level]; + auto& amrLevels = m_amrlevel->parent->getAmrLevels(); + const AmrLevel& topLevel = *amrLevels[m_amrlevel->level]; const Box& topPDomain = topLevel.state[m_index].getDomain(); - const IndexType& boxType = m_leveldata.boxArray().ixType(); + const IndexType& boxType = m_leveldata->boxArray().ixType(); const bool extrap = AmrLevel::desc_lst[m_index].extrap(); // // Check that the interpolaters are identical. // BL_ASSERT(AmrLevel::desc_lst[m_index].identicalInterps(scomp,ncomp)); - for (int l = 0; l <= m_amrlevel.level; ++l) + for (int l = 0; l <= m_amrlevel->level; ++l) { amrLevels[l]->state[m_index].RegisterData(m_mfcd, m_mfid[l]); } - for (int i = 0, N = static_cast<int>(m_leveldata.boxArray().size()); i < N; ++i) + for (int i = 0, N = static_cast<int>(m_leveldata->boxArray().size()); i < N; ++i) { // // A couple typedefs we'll use in the next code segment. @@ -782,20 +783,20 @@ FillPatchIteratorHelper::Initialize (int boxGrow, using IntAAAFBIDMapValType = std::map<int,Vector<Vector<Vector<FillBoxId> > > >::value_type; - if (m_leveldata.DistributionMap()[i] != MyProc) { continue; } + if (m_leveldata->DistributionMap()[i] != MyProc) { continue; } // // Insert with a hint since the indices are ordered lowest to highest.
// IntAAAFBIDMapValType v1(i,Vector > >()); - m_fbid.insert(m_fbid.end(),v1)->second.resize(m_amrlevel.level+1); + m_fbid.insert(m_fbid.end(),v1)->second.resize(m_amrlevel->level+1); IntAABoxMapValType v2(i,Vector >()); - m_fbox.insert(m_fbox.end(),v2)->second.resize(m_amrlevel.level+1); - m_cbox.insert(m_cbox.end(),v2)->second.resize(m_amrlevel.level+1); + m_fbox.insert(m_fbox.end(),v2)->second.resize(m_amrlevel->level+1); + m_cbox.insert(m_cbox.end(),v2)->second.resize(m_amrlevel->level+1); - m_ba.insert(m_ba.end(),std::map::value_type(i,amrex::grow(m_leveldata.boxArray()[i],m_growsize))); + m_ba.insert(m_ba.end(),std::map::value_type(i,amrex::grow(m_leveldata->boxArray()[i],m_growsize))); } BoxList tempUnfillable(boxType); @@ -862,7 +863,7 @@ FillPatchIteratorHelper::Initialize (int boxGrow, Vector< Vector >& TheFineBoxes = m_fbox[bxidx]; Vector< Vector< Vector > >& TheFBIDs = m_fbid[bxidx]; - for (int l = m_amrlevel.level; l >= 0 && !Done; --l) + for (int l = m_amrlevel->level; l >= 0 && !Done; --l) { unfillableThisLevel.clear(); @@ -892,7 +893,7 @@ FillPatchIteratorHelper::Initialize (int boxGrow, { crse_boxes.push_back(fbx); - if (l != m_amrlevel.level) + if (l != m_amrlevel->level) { const Box& cbox = m_map->CoarseBox(fbx,fine_ratio); @@ -999,15 +1000,15 @@ FillPatchIterator::Initialize (int boxGrow, m_ncomp = ncomp; m_range = desc.sameInterps(scomp,ncomp); - m_fabs.define(m_leveldata.boxArray(),m_leveldata.DistributionMap(), - m_ncomp,boxGrow,MFInfo(),m_leveldata.Factory()); + m_fabs.define(m_leveldata->boxArray(),m_leveldata->DistributionMap(), + m_ncomp,boxGrow,MFInfo(),m_leveldata->Factory()); - const Geometry& geom = m_amrlevel.Geom(); + const Geometry& geom = m_amrlevel->Geom(); m_fabs.setDomainBndry(std::numeric_limits::quiet_NaN(), geom); - const IndexType& boxType = m_leveldata.boxArray().ixType(); - const int level = m_amrlevel.level; + const IndexType& boxType = m_leveldata->boxArray().ixType(); + const int level = m_amrlevel->level; for (int i = 0, DComp = 0; i < static_cast(m_range.size()); i++) { @@ -1021,8 +1022,8 @@ FillPatchIterator::Initialize (int boxGrow, else { if (level == 1 || - amrex::ProperlyNested(m_amrlevel.crse_ratio, - m_amrlevel.parent->blockingFactor(m_amrlevel.level), + amrex::ProperlyNested(m_amrlevel->crse_ratio, + m_amrlevel->parent->blockingFactor(m_amrlevel->level), boxGrow, boxType, desc.interp(SComp))) { FillFromTwoLevels(time, idx, SComp, DComp, NComp); @@ -1034,7 +1035,7 @@ FillPatchIterator::Initialize (int boxGrow, # endif { IntVect new_blocking_factor = AmrLevel::ProperBlockingFactor - (m_amrlevel, boxGrow, boxType, desc, SComp); + (*m_amrlevel, boxGrow, boxType, desc, SComp); amrex::Print() << "WARNING: Grids are not properly nested. Consider using amr.blocking_factor = " << AMREX_D_TERM(new_blocking_factor[0], << " " << new_blocking_factor[1], @@ -1049,7 +1050,7 @@ FillPatchIterator::Initialize (int boxGrow, first = false; if (ParallelDescriptor::IOProcessor() && amrex::Verbose()) { IntVect new_blocking_factor = AmrLevel::ProperBlockingFactor - (m_amrlevel, boxGrow, boxType, desc, SComp); + (*m_amrlevel, boxGrow, boxType, desc, SComp); amrex::Print() << "WARNING: Grids are not properly nested. We might have to use\n" << " two coarse levels to do fillpatch. 
Consider using\n"; if (new_blocking_factor < IntVect{AMREX_D_DECL(128,128,128)}) { @@ -1060,8 +1061,8 @@ FillPatchIterator::Initialize (int boxGrow, } } - auto* fph = new FillPatchIteratorHelper(m_amrlevel, - m_leveldata, + auto* fph = new FillPatchIteratorHelper(*m_amrlevel, + *m_leveldata, boxGrow, time, idx, @@ -1086,7 +1087,7 @@ FillPatchIterator::Initialize (int boxGrow, // // Call hack to touch up fillPatched data. // - m_amrlevel.set_preferred_boundary_values(m_fabs, + m_amrlevel->set_preferred_boundary_values(m_fabs, idx, scomp, 0, @@ -1098,15 +1099,15 @@ void FillPatchIterator::FillFromLevel0 (Real time, int idx, int scomp, int dcomp, int ncomp) { BL_PROFILE("FillPatchIterator::FillFromLevel0()"); - BL_ASSERT(m_amrlevel.level == 0); + BL_ASSERT(m_amrlevel->level == 0); - StateData& statedata = m_amrlevel.state[idx]; + StateData& statedata = m_amrlevel->state[idx]; Vector smf; Vector stime; statedata.getData(smf,stime,time); - const Geometry& geom = m_amrlevel.geom; + const Geometry& geom = m_amrlevel->geom; StateDataPhysBCFunct physbcf(statedata,scomp,geom); @@ -1117,13 +1118,13 @@ void FillPatchIterator::FillFromTwoLevels (Real time, int idx, int scomp, int dcomp, int ncomp) { BL_PROFILE("FillPatchIterator::FillFromTwoLevels()"); - int ilev_fine = m_amrlevel.level; + int ilev_fine = m_amrlevel->level; int ilev_crse = ilev_fine-1; BL_ASSERT(ilev_crse >= 0); - AmrLevel& fine_level = m_amrlevel; - AmrLevel& crse_level = m_amrlevel.parent->getLevel(ilev_crse); + AmrLevel& fine_level = *m_amrlevel; + AmrLevel& crse_level = m_amrlevel->parent->getLevel(ilev_crse); const Geometry& geom_fine = fine_level.geom; const Geometry& geom_crse = crse_level.geom; @@ -1154,7 +1155,8 @@ FillPatchIterator::FillFromTwoLevels (Real time, int idx, int scomp, int dcomp, desc.getBCs(),scomp); } -static +namespace { + bool HasPhysBndry (const Box& b, const Box& dmn, @@ -1174,7 +1176,6 @@ HasPhysBndry (const Box& b, return false; } -static void FixUpPhysCorners (FArrayBox& fab, AmrLevel& TheLevel, @@ -1258,6 +1259,8 @@ FixUpPhysCorners (FArrayBox& fab, } } +} + void FillPatchIteratorHelper::fill (FArrayBox& fab, int dcomp, @@ -1268,17 +1271,17 @@ FillPatchIteratorHelper::fill (FArrayBox& fab, BL_ASSERT(fab.box() == m_ba[idx]); BL_ASSERT(fab.nComp() >= dcomp + m_ncomp); - Vector< Vector > > cfab(m_amrlevel.level+1); + Vector< Vector > > cfab(m_amrlevel->level+1); Vector< Vector >& TheCrseBoxes = m_cbox[idx]; Vector< Vector >& TheFineBoxes = m_fbox[idx]; Vector< Vector< Vector > >& TheFBIDs = m_fbid[idx]; const bool extrap = AmrLevel::desc_lst[m_index].extrap(); - auto& amrLevels = m_amrlevel.parent->getAmrLevels(); + auto& amrLevels = m_amrlevel->parent->getAmrLevels(); // // Build all coarse fabs from which we'll interpolate and // fill them with coarse data as best we can. // - for (int l = 0; l <= m_amrlevel.level; l++) + for (int l = 0; l <= m_amrlevel->level; l++) { StateData& TheState = amrLevels[l]->state[m_index]; const Vector& CrseBoxes = TheCrseBoxes[l]; @@ -1323,7 +1326,7 @@ FillPatchIteratorHelper::fill (FArrayBox& fab, // // Now work from the bottom up interpolating to next higher level. // - for (int l = 0; l < m_amrlevel.level; l++) + for (int l = 0; l < m_amrlevel->level; l++) { auto& CrseFabs = cfab[l]; AmrLevel& TheLevel = *amrLevels[l]; @@ -1442,8 +1445,8 @@ FillPatchIteratorHelper::fill (FArrayBox& fab, // // Copy intersect finefab into next level m_cboxes. 
// - for (int j = 0, K = static_cast(FinerCrseFabs.size()); j < K; ++j) { - FinerCrseFabs[j]->copy(finefab); + for (const auto & FinerCrseFab : FinerCrseFabs) { + FinerCrseFab->copy(finefab); } } @@ -1452,15 +1455,15 @@ FillPatchIteratorHelper::fill (FArrayBox& fab, // // Now for the finest level stuff. // - StateData& FineState = m_amrlevel.state[m_index]; + StateData& FineState = m_amrlevel->state[m_index]; const Box& FineDomain = FineState.getDomain(); - const Geometry& FineGeom = m_amrlevel.geom; - auto& FinestCrseFabs = cfab[m_amrlevel.level]; + const Geometry& FineGeom = m_amrlevel->geom; + auto& FinestCrseFabs = cfab[m_amrlevel->level]; // // Copy intersect coarse into destination fab. // - for (int i = 0, N = static_cast(FinestCrseFabs.size()); i < N; ++i) { - fab.copy(*FinestCrseFabs[i],0,dcomp,m_ncomp); + for (const auto & FinestCrseFab : FinestCrseFabs) { + fab.copy(*FinestCrseFab,0,dcomp,m_ncomp); } if (FineGeom.isAnyPeriodic() && !FineDomain.contains(fab.box())) @@ -1469,17 +1472,17 @@ FillPatchIteratorHelper::fill (FArrayBox& fab, FineGeom.periodicShift(FineDomain,fab.box(),pshifts); - for (int i = 0, N = static_cast(FinestCrseFabs.size()); i < N; i++) + for (const auto & FinestCrseFab : FinestCrseFabs) { for (const auto& iv : pshifts) { fab.shift(iv); - Box src_dst = FinestCrseFabs[i]->box() & fab.box(); + Box src_dst = FinestCrseFab->box() & fab.box(); src_dst &= FineDomain; if (src_dst.ok()) { - fab.copy(*FinestCrseFabs[i],src_dst,0,src_dst,dcomp,m_ncomp); + fab.copy(*FinestCrseFab,src_dst,0,src_dst,dcomp,m_ncomp); } fab.shift(-iv); @@ -1495,7 +1498,7 @@ FillPatchIteratorHelper::fill (FArrayBox& fab, // if (! FineState.getDomain().contains(fab.box())) { - m_amrlevel.setPhysBoundaryValues(fab, + m_amrlevel->setPhysBoundaryValues(fab, m_index, m_time, dcomp, @@ -1506,7 +1509,7 @@ FillPatchIteratorHelper::fill (FArrayBox& fab, if (m_FixUpCorners) { - FixUpPhysCorners(fab,m_amrlevel,m_index,m_time,m_scomp,dcomp,m_ncomp); + FixUpPhysCorners(fab,*m_amrlevel,m_index,m_time,m_scomp,dcomp,m_ncomp); Gpu::streamSynchronize(); // In case this runs on GPU } } @@ -1843,10 +1846,10 @@ AmrLevel::UpdateDistributionMaps ( DistributionMapping& update_dmap ) if (dmap.size() == mapsize) { dmap = update_dmap; } - for (int i = 0; i < state.size(); ++i) + for (auto & i : state) { - if (state[i].DistributionMap().size() == mapsize) - { state[i].setDistributionMap(update_dmap); } + if (i.DistributionMap().size() == mapsize) + { i.setDistributionMap(update_dmap); } } } diff --git a/Src/Amr/AMReX_Derive.H b/Src/Amr/AMReX_Derive.H index e1a7310a7b2..1e0cceb7894 100644 --- a/Src/Amr/AMReX_Derive.H +++ b/Src/Amr/AMReX_Derive.H @@ -100,7 +100,6 @@ class DescriptorList; * from the state data contained in AmrLevel and its derivatives. Some * examples might be kinetic energy, vorticity, concentration gradients ... */ - class DeriveRec { friend class DeriveList; @@ -339,7 +338,6 @@ private: * * DeriveList manages and provides access to the list of DeriveRecs. */ - class DeriveList { public: diff --git a/Src/Amr/AMReX_LevelBld.H b/Src/Amr/AMReX_LevelBld.H index 8b421265bfc..bb79184ca5e 100644 --- a/Src/Amr/AMReX_LevelBld.H +++ b/Src/Amr/AMReX_LevelBld.H @@ -18,7 +18,6 @@ namespace amrex { * Abstract base class specifying an interface for building problem-specific * AmrLevels. 
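The loop rewrites above (over `FAHeaderNames`, `FinerCrseFabs`, `FinestCrseFabs`, and `state`) all apply one pattern: an index loop whose index is used only for element access becomes a range-based for, which also removes the `static_cast<int>(...size())` noise. Sketch:

    #include <iostream>
    #include <string>
    #include <vector>

    void print_all (const std::vector<std::string>& names)
    {
        // before: for (int i = 0; i < static_cast<int>(names.size()); ++i) ...
        for (const auto& name : names) {
            std::cout << name << '\n';
        }
    }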
*/ - class LevelBld { public: diff --git a/Src/Amr/AMReX_StateData.H b/Src/Amr/AMReX_StateData.H index 251e6482a45..e030eacb5b1 100644 --- a/Src/Amr/AMReX_StateData.H +++ b/Src/Amr/AMReX_StateData.H @@ -29,7 +29,6 @@ class StateDataPhysBCFunct; * * StateData holds state data on a level for the current and previous time step. */ - class StateData { friend class StateDataPhysBCFunct; @@ -471,7 +470,7 @@ public: private: StateData* statedata; int src_comp; - const Geometry& geom; + const Geometry* geom; }; diff --git a/Src/Amr/AMReX_StateData.cpp b/Src/Amr/AMReX_StateData.cpp index fffa8074e8b..7fd2c0c5745 100644 --- a/Src/Amr/AMReX_StateData.cpp +++ b/Src/Amr/AMReX_StateData.cpp @@ -863,7 +863,7 @@ StateData::printTimeInterval (std::ostream &os) const StateDataPhysBCFunct::StateDataPhysBCFunct (StateData&sd, int sc, const Geometry& geom_) : statedata(&sd), src_comp(sc), - geom(geom_) + geom(&geom_) { } void @@ -876,8 +876,8 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int const Box& domain_mt = amrex::convert(statedata->getDomain(),mf.ixType()); const int* domainlo = domain_mt.loVect(); const int* domainhi = domain_mt.hiVect(); - const Real* dx = geom.CellSize(); - const RealBox& prob_domain = geom.ProbDomain(); + const Real* dx = geom->CellSize(); + const RealBox& prob_domain = geom->ProbDomain(); bool has_bndryfunc_fab = statedata->desc->hasBndryFuncFab(); bool run_on_gpu = statedata->desc->RunOnGPU() && Gpu::inLaunchRegion(); @@ -901,7 +901,7 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int bool is_periodic = false; for (int i = 0; i < AMREX_SPACEDIM; ++i) { bool touch = bx.smallEnd(i) < domainlo[i] || bx.bigEnd(i) > domainhi[i]; - if (geom.isPeriodic(i)) { + if (geom->isPeriodic(i)) { is_periodic = is_periodic || touch; } else { has_phys_bc = has_phys_bc || touch; @@ -911,7 +911,7 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int if (has_phys_bc) { if (has_bndryfunc_fab) { - statedata->FillBoundary(bx, dest, time, geom, dest_comp, src_comp, num_comp); + statedata->FillBoundary(bx, dest, time, *geom, dest_comp, src_comp, num_comp); } else { statedata->FillBoundary(dest, time, dx, prob_domain, dest_comp, src_comp, num_comp); } @@ -922,7 +922,7 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int for (int dir = 0; dir < AMREX_SPACEDIM; dir++) { - if (!geom.isPeriodic(dir)) + if (!(geom->isPeriodic(dir))) { const int lo = domainlo[dir] - bx.smallEnd(dir); const int hi = bx.bigEnd(dir) - domainhi[dir]; @@ -933,12 +933,12 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int for (int dir = 0; dir < AMREX_SPACEDIM; dir++) { - if (!geom.isPeriodic(dir)) { continue; } + if (!(geom->isPeriodic(dir))) { continue; } Box lo_slab = bx; Box hi_slab = bx; - lo_slab.shift(dir, geom.period(dir)); - hi_slab.shift(dir,-geom.period(dir)); + lo_slab.shift(dir, geom->period(dir)); + hi_slab.shift(dir,-geom->period(dir)); lo_slab &= GrownDomain; hi_slab &= GrownDomain; @@ -950,7 +950,7 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int tmp.resize(lo_slab,num_comp); Elixir elitmp = tmp.elixir(); Array4 const& tmpa = tmp.array(); - const int ishift = -geom.period(dir); + const int ishift = -geom->period(dir); amrex::launch(lo_slab, [=] AMREX_GPU_DEVICE (Box const& tbx) noexcept { @@ -971,7 +971,7 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int } }); if (has_bndryfunc_fab) { - 
statedata->FillBoundary(lo_slab, tmp, time, geom, 0, src_comp, num_comp); + statedata->FillBoundary(lo_slab, tmp, time, *geom, 0, src_comp, num_comp); } else { statedata->FillBoundary(tmp, time, dx, prob_domain, 0, src_comp, num_comp); } @@ -999,10 +999,10 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int #endif { tmp.resize(lo_slab,num_comp); - const Box db = amrex::shift(lo_slab, dir, -geom.period(dir)); + const Box db = amrex::shift(lo_slab, dir, -geom->period(dir)); tmp.copy(dest, db, dest_comp, lo_slab, 0, num_comp); if (has_bndryfunc_fab) { - statedata->FillBoundary(lo_slab, tmp, time, geom, 0, src_comp, num_comp); + statedata->FillBoundary(lo_slab, tmp, time, *geom, 0, src_comp, num_comp); } else { statedata->FillBoundary(tmp, time, dx, prob_domain, 0, src_comp, num_comp); } @@ -1018,7 +1018,7 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int tmp.resize(hi_slab,num_comp); Elixir elitmp = tmp.elixir(); Array4 const& tmpa = tmp.array(); - const int ishift = geom.period(dir); + const int ishift = geom->period(dir); amrex::launch(hi_slab, [=] AMREX_GPU_DEVICE (Box const& tbx) noexcept { @@ -1039,7 +1039,7 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int } }); if (has_bndryfunc_fab) { - statedata->FillBoundary(hi_slab, tmp, time, geom, 0, src_comp, num_comp); + statedata->FillBoundary(hi_slab, tmp, time, *geom, 0, src_comp, num_comp); } else { statedata->FillBoundary(tmp, time, dx, prob_domain, 0, src_comp, num_comp); } @@ -1067,10 +1067,10 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int #endif { tmp.resize(hi_slab,num_comp); - const Box db = amrex::shift(hi_slab, dir, geom.period(dir)); + const Box db = amrex::shift(hi_slab, dir, geom->period(dir)); tmp.copy(dest, db, dest_comp, hi_slab, 0, num_comp); if (has_bndryfunc_fab) { - statedata->FillBoundary(hi_slab, tmp, time, geom, 0, src_comp, num_comp); + statedata->FillBoundary(hi_slab, tmp, time, *geom, 0, src_comp, num_comp); } else { statedata->FillBoundary(tmp, time, dx, prob_domain, 0, src_comp, num_comp); } diff --git a/Src/Amr/AMReX_StateDescriptor.H b/Src/Amr/AMReX_StateDescriptor.H index 6cd6c92cdd3..2830b955705 100644 --- a/Src/Amr/AMReX_StateDescriptor.H +++ b/Src/Amr/AMReX_StateDescriptor.H @@ -29,7 +29,6 @@ namespace amrex { /** * \brief Attributes of StateData. */ - class StateDescriptor { friend class DescriptorList; @@ -434,7 +433,6 @@ private: * * A container class for StateDescriptors. */ - class DescriptorList { public: diff --git a/Src/Amr/AMReX_StateDescriptor.cpp b/Src/Amr/AMReX_StateDescriptor.cpp index 161090d1a7f..4f1e3c3fe1b 100644 --- a/Src/Amr/AMReX_StateDescriptor.cpp +++ b/Src/Amr/AMReX_StateDescriptor.cpp @@ -19,7 +19,7 @@ StateDescriptor::bf_thread_safety (const int* /*lo*/,const int* /*hi*/, if (!bf_ext_dir_threadsafe) { bool has_ext_dir = false; for (int i=0; i<2*AMREX_SPACEDIM*ng && !has_ext_dir; ++i) { - has_ext_dir = bc[i]==BCType::ext_dir; + has_ext_dir = ((bc[i]==BCType::ext_dir) || (bc[i]==BCType::ext_dir_cc)); } if (has_ext_dir) { thread_safe = false; } } diff --git a/Src/AmrCore/AMReX_AmrCore.H b/Src/AmrCore/AMReX_AmrCore.H index 20428b40930..2969b986a75 100644 --- a/Src/AmrCore/AMReX_AmrCore.H +++ b/Src/AmrCore/AMReX_AmrCore.H @@ -20,7 +20,6 @@ class AmrParGDB; * virtual functions to allocate, initialize and delete data. It also * requires the derived class to tag cells for refinement. 
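The `bf_thread_safety` hunk above widens the check that forces user boundary-fill functions onto a single thread: any external-Dirichlet boundary type now counts, both the classic `ext_dir` and the cell-centered `ext_dir_cc`. A standalone restatement with a hypothetical helper name:

    #include <AMReX_BC_TYPES.H>

    // True if any BC entry is an external Dirichlet type; user-supplied
    // fill functions for such BCs are not assumed to be thread safe.
    bool has_external_dirichlet (const int* bc, int nentries)
    {
        for (int i = 0; i < nentries; ++i) {
            if (bc[i] == amrex::BCType::ext_dir ||
                bc[i] == amrex::BCType::ext_dir_cc) {
                return true;
            }
        }
        return false;
    }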
*/ - class AmrCore : public AmrMesh { diff --git a/Src/AmrCore/AMReX_AmrCore.cpp b/Src/AmrCore/AMReX_AmrCore.cpp index 502b3f5cb23..1e56c1652d2 100644 --- a/Src/AmrCore/AMReX_AmrCore.cpp +++ b/Src/AmrCore/AMReX_AmrCore.cpp @@ -1,6 +1,5 @@ #include -#include #ifdef AMREX_PARTICLES #include @@ -104,7 +103,7 @@ AmrCore::regrid (int lbase, Real time, bool) DistributionMapping level_dmap = dmap[lev]; if (ba_changed) { level_grids = new_grids[lev]; - level_dmap = DistributionMapping(level_grids); + level_dmap = MakeDistributionMap(lev, level_grids); } const auto old_num_setdm = num_setdm; RemakeLevel(lev, time, level_grids, level_dmap); @@ -117,7 +116,7 @@ AmrCore::regrid (int lbase, Real time, bool) } else // a new level { - DistributionMapping new_dmap(new_grids[lev]); + DistributionMapping new_dmap = MakeDistributionMap(lev, new_grids[lev]); const auto old_num_setdm = num_setdm; MakeNewLevelFromCoarse(lev, time, new_grids[lev], new_dmap); SetBoxArray(lev, new_grids[lev]); @@ -227,7 +226,8 @@ AmrCore::printGridSummary (std::ostream& os, int min_lev, int max_lev) const noe } } - os << std::endl; // Make sure we flush! + os << '\n'; + os.flush(); } } diff --git a/Src/AmrCore/AMReX_AmrMesh.H b/Src/AmrCore/AMReX_AmrMesh.H index a3c6fbc8f62..f5d49f5c5da 100644 --- a/Src/AmrCore/AMReX_AmrMesh.H +++ b/Src/AmrCore/AMReX_AmrMesh.H @@ -11,6 +11,10 @@ #include #include +#ifdef AMREX_USE_BITTREE +#include +#endif + namespace amrex { struct AmrInfo { @@ -166,7 +170,7 @@ public: void SetGridEff (Real eff) noexcept { grid_eff = eff; } void SetNProper (int n) noexcept { n_proper = n; } - //! Set ref_ratio would require rebuiling Geometry objects. + //! Set ref_ratio would require rebuilding Geometry objects. void SetFinestLevel (int new_finest_level) noexcept { finest_level = new_finest_level; } void SetDistributionMap (int lev, const DistributionMapping& dmap_in) noexcept; @@ -253,6 +257,8 @@ public: [[nodiscard]] long CountCells (int lev) noexcept; + [[nodiscard]] virtual DistributionMapping MakeDistributionMap (int lev, BoxArray const& ba); + protected: int finest_level; //!< Current finest level. 
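The new virtual `MakeDistributionMap` gives applications a single hook for how boxes are assigned to ranks whenever a level is created or remade (it is what `AmrCore::regrid` now calls in the hunk above). A hedged sketch of an override; the class name and the round-robin policy are purely illustrative, not part of this PR:

    #include <AMReX_AmrMesh.H>
    #include <AMReX_ParallelDescriptor.H>

    class MyAmrMesh : public amrex::AmrMesh
    {
    public:
        using amrex::AmrMesh::AmrMesh;

        amrex::DistributionMapping
        MakeDistributionMap (int /*lev*/, amrex::BoxArray const& ba) override
        {
            // Toy policy: deal boxes out to MPI ranks in order.
            amrex::Vector<int> pmap(ba.size());
            const int nprocs = amrex::ParallelDescriptor::NProcs();
            for (int i = 0; i < static_cast<int>(ba.size()); ++i) {
                pmap[i] = i % nprocs;
            }
            return amrex::DistributionMapping(std::move(pmap));
        }
    };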
@@ -260,6 +266,11 @@ protected: Vector dmap; Vector grids; +#ifdef AMREX_USE_BITTREE + bool use_bittree = false; + std::unique_ptr btmesh; +#endif + unsigned int num_setdm = 0; unsigned int num_setba = 0; diff --git a/Src/AmrCore/AMReX_AmrMesh.cpp b/Src/AmrCore/AMReX_AmrMesh.cpp index 70a8df1dabf..0ed59002f2e 100644 --- a/Src/AmrCore/AMReX_AmrMesh.cpp +++ b/Src/AmrCore/AMReX_AmrMesh.cpp @@ -5,6 +5,13 @@ #include #include #include +#include + +#ifdef AMREX_USE_BITTREE +#include +#endif + +#include namespace amrex { @@ -376,6 +383,10 @@ AmrMesh::InitAmrMesh (int max_level_in, const Vector& n_cell_in, finest_level = -1; +#ifdef AMREX_USE_BITTREE + pp.queryAdd("use_bittree",use_bittree); +#endif + if (check_input) { checkInput(); } } @@ -437,6 +448,26 @@ AmrMesh::LevelDefined (int lev) noexcept return lev <= max_level && !grids[lev].empty() && !dmap[lev].empty(); } +DistributionMapping +AmrMesh::MakeDistributionMap (int lev, BoxArray const& ba) +{ + + BL_PROFILE("AmrMesh::MakeDistributionMap()"); + + if (verbose) { + amrex::Print() << "Creating new distribution map on level: " << lev << "\n"; + } + +#ifdef AMREX_USE_BITTREE + // if (use_bittree) { + // return DistributionMapping(ba); + // } else +#endif + { + return DistributionMapping(ba); + } +} + void AmrMesh::ChopGrids (int lev, BoxArray& ba, int target_size) const { @@ -514,6 +545,10 @@ AmrMesh::MakeNewGrids (int lbase, Real time, int& new_finest, Vector& if (new_grids.size() < max_crse+2) { new_grids.resize(max_crse+2); } +#ifdef AMREX_USE_BITTREE + if(!use_bittree) { +#endif + // // Construct problem domain at each level. // @@ -728,13 +763,47 @@ AmrMesh::MakeNewGrids (int lbase, Real time, int& new_finest, Vector& } new_bx.Bcast(); // Broadcast the new BoxList to other processes - // - // Refine up to levf. - // - new_bx.refine(ref_ratio[levc]); - BL_ASSERT(new_bx.isDisjoint()); + bool odd_ref_ratio = false; + for (auto const& rr : ref_ratio[levc]) { + if (rr != 1 && (rr%2 != 0)) { + odd_ref_ratio = true; + } + } + + if (odd_ref_ratio) + { + // This approach imposes max_grid_size (suitably scaled) before + // refining so as to ensure fine grids align with coarse grids + + // + // Impose max_grid_size (suitably coarsened) + // + AMREX_ASSERT(max_grid_size[levf].allGE(ref_ratio[levc])); + new_grids[levf] = BoxArray(std::move(new_bx), max_grid_size[levf]/ref_ratio[levc]); + + // + // Refine up to levf. + // + new_grids[levf].refine(ref_ratio[levc]); + } + else + { + // This approach imposes max_grid_size after refining. + // For ref_ratio = 3 this can create fine grids that do not correctly divide by 3, + // but we leave it here so as not to change the gridding in + // existing ref_ratio = 2 or 4 applications - new_grids[levf] = BoxArray(std::move(new_bx), max_grid_size[levf]); + // + // Refine up to levf. + // + new_bx.refine(ref_ratio[levc]); + + // + // Impose max_grid_size + // + new_grids[levf] = BoxArray(std::move(new_bx), max_grid_size[levf]); + } + BL_ASSERT(new_grids[levf].isDisjoint()); } } } @@ -774,6 +843,72 @@ AmrMesh::MakeNewGrids (int lbase, Real time, int& new_finest, Vector& } } } + +#ifdef AMREX_USE_BITTREE + } +#endif + +#ifdef AMREX_USE_BITTREE + // Bittree version + if(use_bittree) { + // Initialize BT refinement + btmesh->refine_init(); + + // ------------------------------------------------------------------- + // Use tagging data to mark BT for refinement, then use the new bitmap + // to calculate the new grids. 
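The odd-ratio gridding branch above is easiest to see with numbers (values illustrative): take `ref_ratio = 3` and `max_grid_size = 64`. Chopping after refinement puts the first interior grid edge at fine index 64, and 64/3 is not an integer, so that edge does not coincide with a coarse cell boundary. Chopping the coarse boxes at `max_grid_size/ref_ratio` first and refining afterwards makes alignment automatic:

    int main ()
    {
        const int ratio = 3, max_grid_size = 64;

        // New path for odd ratios: chop coarse boxes first, then refine.
        const int coarse_chop = max_grid_size / ratio; // 21 coarse cells
        const int fine_chunk  = coarse_chop * ratio;   // 63 fine cells
        return (fine_chunk % ratio == 0) ? 0 : 1;      // aligned by construction
    }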
+ auto tree0 = btmesh->getTree(); + + // [1] Error Estimation and tagging + // btTags is indexed by bitid, Bittree's internal indexing scheme. + // For any id, btTags = 1 if should be parent, -1 if should not be parent (or not exist). + std::vector btTags(tree0->id_upper_bound(),0); + + for (int lev=max_crse; lev>=lbase; --lev) { + + TagBoxArray tags(grids[lev],dmap[lev], n_error_buf[lev]); + ErrorEst(lev, tags, time, 0); + tags.buffer(n_error_buf[lev]); + + for (MFIter mfi(tags); mfi.isValid(); ++mfi) { + auto const& tagbox = tags.const_array(mfi); + bool has_set_tags = amrex::Reduce::AnyOf(mfi.validbox(), + [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + return tagbox(i,j,k)!=TagBox::CLEAR; + }); + + // Set the values of btTags. + int bitid = btUnit::getBitid(btmesh.get(),false,lev,mfi.index()); + // TODO Check lev == tree0->block_level(bitid) + if(has_set_tags) { + btTags[bitid] = 1; + } + else { + btTags[bitid] = -1; + } + } + } + + // [2] btRefine - check for proper octree nesting and update bitmap + MPI_Comm comm = ParallelContext::CommunicatorSub(); + int changed = btUnit::btRefine(btmesh.get(), btTags, max_crse, lbase, grids, dmap, comm); + + // [3] btCalculateGrids - use new bitmap to generate new grids + if (changed>0) { + btUnit::btCalculateGrids(btmesh.get(),lbase,new_finest,new_grids,max_grid_size); + } else { + new_finest = finest_level; + for(int i=0; i<=finest_level; ++i) { + new_grids[i] = grids[i]; + } + } + + // Finalize BT refinement + btmesh->refine_apply(); + } +#endif + } void @@ -783,11 +918,48 @@ AmrMesh::MakeNewGrids (Real time) { finest_level = 0; - const BoxArray& ba = MakeBaseGrids(); - DistributionMapping dm(ba); + BoxArray ba; + DistributionMapping dm; const auto old_num_setdm = num_setdm; const auto old_num_setba = num_setba; +#ifdef AMREX_USE_BITTREE + if(!use_bittree) { +#endif + ba = MakeBaseGrids(); + dm = MakeDistributionMap(0, ba); + +#ifdef AMREX_USE_BITTREE + } + else { + //Initialize Bittree + + // top = number of grids on coarsest level in each direction + std::vector top(AMREX_SPACEDIM,0); + IntVect ncells = geom[0].Domain().length(); + for(int i=0; i includes(ngrids,1); + + btmesh = std::make_unique(top.data(),includes.data()); + + // Set BCs + for(int d=0; d value, AMRErrorTag::TEST test, std::string field, const AMRErrorTagInfo& info = AMRErrorTagInfo()) noexcept - : m_test(test), m_field(std::move(field)), m_info(info) + : m_test(test), m_field(std::move(field)), m_info(info), m_ngrow(SetNGrow()) { AMREX_ASSERT(!value.empty()); m_value.resize(info.m_max_level); @@ -480,7 +474,6 @@ std::ostream& operator << (std::ostream& os, const ErrorList& elst); for (auto i = int(value.size()); i < m_value.size(); ++i) { m_value[i] = value[value.size()-1]; } - m_ngrow = SetNGrow(); } AMRErrorTag (AMRErrorTag::UserFunc* userfunc, diff --git a/Src/AmrCore/AMReX_ErrorList.cpp b/Src/AmrCore/AMReX_ErrorList.cpp index 783a06504be..c9d1a7feef8 100644 --- a/Src/AmrCore/AMReX_ErrorList.cpp +++ b/Src/AmrCore/AMReX_ErrorList.cpp @@ -194,7 +194,9 @@ ErrorList::operator[] (int k) const noexcept return *vec[k]; } -static const char* err_name[] = { "Special", "Standard", "UseAverage" }; +namespace { + const char* err_name[] = { "Special", "Standard", "UseAverage" }; +} std::ostream& operator << (std::ostream& os, diff --git a/Src/AmrCore/AMReX_FillPatchUtil_I.H b/Src/AmrCore/AMReX_FillPatchUtil_I.H index 36d5e9ee853..b3e1ee64aae 100644 --- a/Src/AmrCore/AMReX_FillPatchUtil_I.H +++ b/Src/AmrCore/AMReX_FillPatchUtil_I.H @@ -189,7 +189,7 @@ void FillPatchInterp (MultiFab& 
mf_fine_patch, int fcomp, MultiFab const& mf_crs MFInterpolater* mapper, const Vector& bcs, int bcscomp); template -std::enable_if_t::value && !std::is_same::value> +std::enable_if_t::value && !std::is_same_v> FillPatchInterp (MF& mf_fine_patch, int fcomp, MF const& mf_crse_patch, int ccomp, int ncomp, IntVect const& ng, const Geometry& cgeom, const Geometry& fgeom, Box const& dest_domain, const IntVect& ratio, @@ -240,7 +240,7 @@ FillPatchInterp (MF& mf_fine_patch, int fcomp, MF const& mf_crse_patch, int ccom } template -std::enable_if_t::value && !std::is_same::value> +std::enable_if_t::value && !std::is_same_v> InterpFace (Interp *interp, MF const& mf_crse_patch, int crse_comp, MF& mf_refined_patch, int fine_comp, @@ -298,9 +298,9 @@ namespace { // ======== FArrayBox template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_crse_patch (FabArrayBase::FPinfo const& fpc, int ncomp) { MF mf_crse_patch(fpc.ba_crse_patch, fpc.dm_patch, ncomp, 0, MFInfo(), @@ -309,9 +309,9 @@ namespace { } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_crse_patch (FabArrayBase::FPinfo const& fpc, int ncomp, IndexType idx_type) { MF mf_crse_patch(amrex::convert(fpc.ba_crse_patch, idx_type), fpc.dm_patch, @@ -320,9 +320,9 @@ namespace { } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_fine_patch (FabArrayBase::FPinfo const& fpc, int ncomp) { MF mf_fine_patch(fpc.ba_fine_patch, fpc.dm_patch, ncomp, 0, MFInfo(), @@ -331,9 +331,9 @@ namespace { } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_fine_patch (FabArrayBase::FPinfo const& fpc, int ncomp, IndexType idx_type) { MF mf_fine_patch(amrex::convert(fpc.ba_fine_patch, idx_type), fpc.dm_patch, @@ -342,9 +342,9 @@ namespace { } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_refined_patch (FabArrayBase::FPinfo const& fpc, int ncomp, IndexType idx_type, IntVect ratio) { MF mf_refined_patch(amrex::convert( amrex::refine( amrex::coarsen(fpc.ba_fine_patch, ratio), ratio), idx_type), @@ -353,9 +353,9 @@ namespace { } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_crse_mask (FabArrayBase::FPinfo const& fpc, int ncomp, IndexType idx_type, IntVect ratio) { MF mf_crse_mask(amrex::convert(amrex::coarsen(fpc.ba_fine_patch, ratio), idx_type), @@ -364,9 +364,9 @@ namespace { } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> void mf_set_domain_bndry (MF &mf, Geometry const & geom) { mf.setDomainBndry(std::numeric_limits::quiet_NaN(), geom); @@ -376,63 +376,63 @@ namespace { // ======== Not FArrayBox template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_crse_patch (FabArrayBase::FPinfo const& fpc, int ncomp) { return MF(fpc.ba_crse_patch, fpc.dm_patch, ncomp, 0); } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_crse_patch (FabArrayBase::FPinfo const& fpc, int ncomp, IndexType idx_type) { return MF(amrex::convert(fpc.ba_crse_patch, idx_type), fpc.dm_patch, ncomp, 0); } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_fine_patch (FabArrayBase::FPinfo const& fpc, int ncomp) { return MF(fpc.ba_fine_patch, fpc.dm_patch, ncomp, 0); } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_fine_patch (FabArrayBase::FPinfo const& fpc, int ncomp, IndexType idx_type) { return MF(amrex::convert(fpc.ba_fine_patch, idx_type), fpc.dm_patch, ncomp, 0); } template ::value, - int>::type = 0> + 
std::enable_if_t, + int> = 0> MF make_mf_refined_patch (FabArrayBase::FPinfo const& fpc, int ncomp, IndexType idx_type, IntVect ratio) { return MF(amrex::convert( amrex::refine( amrex::coarsen(fpc.ba_fine_patch, ratio), ratio), idx_type), fpc.dm_patch, ncomp, 0); } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_crse_mask (FabArrayBase::FPinfo const& fpc, int ncomp, IndexType idx_type, IntVect ratio) { return MF(amrex::convert(amrex::coarsen(fpc.ba_fine_patch, ratio), idx_type), fpc.dm_patch, ncomp, 0); } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> void mf_set_domain_bndry (MF &/*mf*/, Geometry const & /*geom*/) { // nothing diff --git a/Src/AmrCore/AMReX_FillPatcher.H b/Src/AmrCore/AMReX_FillPatcher.H index 46d1107dea3..5ff1c9550d3 100644 --- a/Src/AmrCore/AMReX_FillPatcher.H +++ b/Src/AmrCore/AMReX_FillPatcher.H @@ -15,13 +15,13 @@ namespace amrex { * with interpolation of the coarse data. Then it fills the fine ghost * cells overlapping fine level valid cells with the fine level data. If * the valid cells of the destination need to be filled, it will be done as - * well. Finally, it will fill the physical bounbary using the user + * well. Finally, it will fill the physical boundary using the user * provided functor. The `fill` member function can be used to do the * operations just described. Alternatively, one can also use the * `fillCoarseFineBoundary` to fill the ghost cells at the coarse/fine * boundary only. Then one can manually call FillBoundary to fill the other * ghost cells, and use the physical BC functor to handle the physical - * boundeary. + * boundary. * * The communication of the coarse data needed for spatial interpolation is * optimized at the cost of being error-prone. One must follow the @@ -42,7 +42,7 @@ namespace amrex { * * (3) When to destroy? Usually, we do time stepping on a coarse level * first. Then we recursively do time stepping on fine levels. After the - * finer level finishes, we do reflux and averge the fine data down to the + * finer level finishes, we do reflux and average the fine data down to the * coarse level. After that we should destroy the FillPatcher object * associated with these two levels, because the coarse data stored in the * object has become outdated. For AmrCore based codes, you could use @@ -68,7 +68,6 @@ namespace amrex { * See AmrLevel::RK for an example of using the RungeKutta functions and * FillPatcher together. */ - template class FillPatcher { @@ -118,7 +117,7 @@ public: * \param fbc for filling fine level physical BC * \param fbccomp starting component of the fine level BC functor * \param bcs BCRec specifying physical boundary types - * \parame bcscomp starting component of the BCRec Vector. + * \param bcscomp starting component of the BCRec Vector. * \param pre_interp optional pre-interpolation hook for modifying the coarse data * \param post_interp optional post-interpolation hook for modifying the fine data */ diff --git a/Src/AmrCore/AMReX_FluxRegister.H b/Src/AmrCore/AMReX_FluxRegister.H index 4178eb289ba..f5983e18872 100644 --- a/Src/AmrCore/AMReX_FluxRegister.H +++ b/Src/AmrCore/AMReX_FluxRegister.H @@ -14,7 +14,6 @@ namespace amrex { * * Stores and manipulates fluxes at coarse-fine interfaces.
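The `AMRErrorTag` constructor hunk earlier moves `m_ngrow` from an assignment in the body into the member initializer list. The rule it leans on: members are initialized in declaration order, so `m_ngrow(SetNGrow())` is well defined provided `m_ngrow` is declared after the members that `SetNGrow()` reads. A standalone sketch of the rule (illustrative names):

    #include <string>
    #include <utility>

    struct Tagger {
        explicit Tagger (std::string f)
            : m_field(std::move(f)),
              m_ngrow(compute_ngrow()) // safe: m_field is already constructed
        {}

    private:
        int compute_ngrow () const { return m_field.empty() ? 0 : 1; }

        std::string m_field; // declared first, initialized first
        int m_ngrow;         // declared last, initialized last
    };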
*/ - class FluxRegister : public BndryRegister diff --git a/Src/AmrCore/AMReX_InterpFaceReg_3D_C.H b/Src/AmrCore/AMReX_InterpFaceReg_3D_C.H index c55cda3c357..2df7fef055a 100644 --- a/Src/AmrCore/AMReX_InterpFaceReg_3D_C.H +++ b/Src/AmrCore/AMReX_InterpFaceReg_3D_C.H @@ -12,11 +12,10 @@ void interp_face_reg (int i, int j, int k, IntVect const& rr, Array4 const int jc = amrex::coarsen(j,rr[1]); int kc = amrex::coarsen(k,rr[2]); if (idim == 0) { - if (jc == domface.smallEnd(1) || jc == domface.bigEnd(1)) { - for (int n = 0; n < ncomp; ++n) { - fine(i,j,k,n+scomp) = crse(ic,jc,kc,n); - } - } else { + for (int n = 0; n < ncomp; ++n) { + fine(i,j,k,n+scomp) = crse(ic,jc,kc,n); + } + if (jc != domface.smallEnd(1) && jc != domface.bigEnd(1) && rr[1] > 1) { Real sfy = Real(1.0); for (int n = 0; n < ncomp; ++n) { Real dc = Real(0.5) * (crse(ic,jc+1,kc,n) - crse(ic,jc-1,kc,n)); @@ -32,11 +31,11 @@ void interp_face_reg (int i, int j, int k, IntVect const& rr, Array4 const } Real yoff = (static_cast(j - jc*rr[1]) + Real(0.5)) / Real(rr[1]) - Real(0.5); for (int n = 0; n < ncomp; ++n) { - fine(i,j,k,n+scomp) = crse(ic,jc,kc,n) + yoff * slope(i,j,k,n) * sfy; + fine(i,j,k,n+scomp) += yoff * slope(i,j,k,n) * sfy; } } - if (kc != domface.smallEnd(2) && kc != domface.bigEnd(2)) { + if (kc != domface.smallEnd(2) && kc != domface.bigEnd(2) && rr[2] > 1) { Real sfz = Real(1.0); for (int n = 0; n < ncomp; ++n) { Real dc = Real(0.5) * (crse(ic,jc,kc+1,n) - crse(ic,jc,kc-1,n)); @@ -56,11 +55,10 @@ void interp_face_reg (int i, int j, int k, IntVect const& rr, Array4 const } } } else if (idim == 1) { - if (ic == domface.smallEnd(0) || ic == domface.bigEnd(0)) { - for (int n = 0; n < ncomp; ++n) { - fine(i,j,k,n+scomp) = crse(ic,jc,kc,n); - } - } else { + for (int n = 0; n < ncomp; ++n) { + fine(i,j,k,n+scomp) = crse(ic,jc,kc,n); + } + if (ic != domface.smallEnd(0) && ic != domface.bigEnd(0) && rr[0] > 1) { Real sfx = Real(1.0); for (int n = 0; n < ncomp; ++n) { Real dc = Real(0.5) * (crse(ic+1,jc,kc,n) - crse(ic-1,jc,kc,n)); @@ -76,11 +74,11 @@ void interp_face_reg (int i, int j, int k, IntVect const& rr, Array4 const } Real xoff = (static_cast(i - ic*rr[0]) + Real(0.5)) / Real(rr[0]) - Real(0.5); for (int n = 0; n < ncomp; ++n) { - fine(i,j,k,n+scomp) = crse(ic,jc,kc,n) + xoff * slope(i,j,k,n) * sfx; + fine(i,j,k,n+scomp) += xoff * slope(i,j,k,n) * sfx; } } - if (kc != domface.smallEnd(2) && kc != domface.bigEnd(2)) { + if (kc != domface.smallEnd(2) && kc != domface.bigEnd(2) && rr[2] > 1) { Real sfz = Real(1.0); for (int n = 0; n < ncomp; ++n) { Real dc = Real(0.5) * (crse(ic,jc,kc+1,n) - crse(ic,jc,kc-1,n)); @@ -100,11 +98,10 @@ void interp_face_reg (int i, int j, int k, IntVect const& rr, Array4 const } } } else { - if (ic == domface.smallEnd(0) || ic == domface.bigEnd(0)) { - for (int n = 0; n < ncomp; ++n) { - fine(i,j,k,n+scomp) = crse(ic,jc,kc,n); - } - } else { + for (int n = 0; n < ncomp; ++n) { + fine(i,j,k,n+scomp) = crse(ic,jc,kc,n); + } + if (ic != domface.smallEnd(0) && ic != domface.bigEnd(0) && rr[0] > 1) { Real sfx = Real(1.0); for (int n = 0; n < ncomp; ++n) { Real dc = Real(0.5) * (crse(ic+1,jc,kc,n) - crse(ic-1,jc,kc,n)); @@ -120,11 +117,11 @@ void interp_face_reg (int i, int j, int k, IntVect const& rr, Array4 const } Real xoff = (static_cast(i - ic*rr[0]) + Real(0.5)) / Real(rr[0]) - Real(0.5); for (int n = 0; n < ncomp; ++n) { - fine(i,j,k,n+scomp) = crse(ic,jc,kc,n) + xoff * slope(i,j,k,n) * sfx; + fine(i,j,k,n+scomp) += xoff * slope(i,j,k,n) * sfx; } } - if (jc != domface.smallEnd(1) && jc 
!= domface.bigEnd(1)) { + if (jc != domface.smallEnd(1) && jc != domface.bigEnd(1) && rr[1] > 1) { Real sfy = Real(1.0); for (int n = 0; n < ncomp; ++n) { Real dc = Real(0.5) * (crse(ic,jc+1,kc,n) - crse(ic,jc-1,kc,n)); diff --git a/Src/AmrCore/AMReX_InterpFaceRegister.H b/Src/AmrCore/AMReX_InterpFaceRegister.H index 5e9f92784e7..c54879bcaf6 100644 --- a/Src/AmrCore/AMReX_InterpFaceRegister.H +++ b/Src/AmrCore/AMReX_InterpFaceRegister.H @@ -10,9 +10,8 @@ namespace amrex { /** * \brief InterpFaceRegister is a coarse/fine boundary register for - * interpolation of face data at the coarse/fine boundadry. + * interpolation of face data at the coarse/fine boundary. */ - class InterpFaceRegister { public: @@ -31,7 +30,7 @@ public: Geometry const& fgeom, IntVect const& ref_ratio); /** - * \brief Defines an InterpFaceRegister objecct. + * \brief Defines an InterpFaceRegister object. * * \param fba The fine level BoxArray * \param fdm The fine level DistributionMapping diff --git a/Src/AmrCore/AMReX_Interp_2D_C.H b/Src/AmrCore/AMReX_Interp_2D_C.H index 20f8b1c3a0f..a47f265011d 100644 --- a/Src/AmrCore/AMReX_Interp_2D_C.H +++ b/Src/AmrCore/AMReX_Interp_2D_C.H @@ -128,6 +128,7 @@ facediv_face_interp (int ci, int cj, int /*ck*/, break; } + default : { break; } } } diff --git a/Src/AmrCore/AMReX_Interp_3D_C.H b/Src/AmrCore/AMReX_Interp_3D_C.H index daa19f6d5e3..709b44761d0 100644 --- a/Src/AmrCore/AMReX_Interp_3D_C.H +++ b/Src/AmrCore/AMReX_Interp_3D_C.H @@ -204,6 +204,7 @@ facediv_face_interp (int ci, int cj, int ck, break; } + default : { break; } } } diff --git a/Src/AmrCore/AMReX_Interp_C.H b/Src/AmrCore/AMReX_Interp_C.H index 86e7935a436..01e920680c0 100644 --- a/Src/AmrCore/AMReX_Interp_C.H +++ b/Src/AmrCore/AMReX_Interp_C.H @@ -15,8 +15,8 @@ namespace amrex { // // Fill fine values with piecewise-constant interpolation of coarse data. -// Operate only on faces that overlap--ie, only fill the fine faces that make up each -// coarse face, leave the in-between faces alone. +// Operate only on faces that overlap -- i.e., only fill the fine faces that +// make up each coarse face, leave the in-between faces alone. // template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void @@ -84,6 +84,154 @@ face_linear_face_interp_z (int fi, int fj, int fk, int n, Array4 const& fine, } } +// +// Fill fine values with tangential interpolation of coarse data. +// Operate only on faces that overlap -- i.e., only fill the fine faces that +// make up each coarse face, leave the in-between faces alone. +// +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void +face_cons_linear_face_interp (int i, int j, int k, int n, Array4 const& fine, + Array4 const& crse, Array4 const& mask, + IntVect const& ratio, Box const& per_grown_domain, int dim) noexcept +{ + int ci = amrex::coarsen(i, ratio[0]); + +#if (AMREX_SPACEDIM == 1) + amrex::ignore_unused(per_grown_domain); + int cj = 0; +#else + int cj = amrex::coarsen(j, ratio[1]); +#endif + +#if (AMREX_SPACEDIM == 3) + int ck = amrex::coarsen(k, ratio[2]); +#else + int ck = 0; +#endif + + if (dim == 0 && ci*ratio[0] == i) { + // Check solve mask to ensure we don't overwrite valid fine data. 
+ if (!mask || mask(ci, cj, ck, n)) { + fine(i, j, k, n) = crse(ci, cj, ck, n); +#if (AMREX_SPACEDIM >= 2) + if (cj > per_grown_domain.smallEnd(1) && cj < per_grown_domain.bigEnd(1) && ratio[1] > 1) { + Real sfy = Real(1.0); + Real dc = Real(0.5) * (crse(ci,cj+1,ck,n) - crse(ci,cj-1,ck,n)); + Real df = Real(2.0) * (crse(ci,cj+1,ck,n) - crse(ci,cj ,ck,n)); + Real db = Real(2.0) * (crse(ci,cj ,ck,n) - crse(ci,cj-1,ck,n)); + Real sy = (df*db >= Real(0.0)) ? + amrex::min(std::abs(df),std::abs(db)) : Real(0.); + sy = std::copysign(Real(1.),dc)*amrex::min(sy,std::abs(dc)); + if (dc != Real(0.0)) { + sfy = amrex::min(sfy, sy / dc); + } + Real slope = dc; + Real yoff = (static_cast(j - cj*ratio[1]) + Real(0.5)) / Real(ratio[1]) - Real(0.5); + fine(i,j,k,n) += yoff * slope * sfy; + } // jc +#if (AMREX_SPACEDIM == 3) + if (ck > per_grown_domain.smallEnd(2) && ck < per_grown_domain.bigEnd(2) && ratio[2] > 1) { + Real sfz = Real(1.0); + Real dc = Real(0.5) * (crse(ci,cj,ck+1,n) - crse(ci,cj,ck-1,n)); + Real df = Real(2.0) * (crse(ci,cj,ck+1,n) - crse(ci,cj,ck ,n)); + Real db = Real(2.0) * (crse(ci,cj,ck ,n) - crse(ci,cj,ck-1,n)); + Real sz = (df*db >= Real(0.0)) ? + amrex::min(std::abs(df),std::abs(db)) : Real(0.); + sz = std::copysign(Real(1.),dc)*amrex::min(sz,std::abs(dc)); + if (dc != Real(0.0)) { + sfz = amrex::min(sfz, sz / dc); + } + Real slope = dc; + Real zoff = (static_cast(k - ck*ratio[2]) + Real(0.5)) / Real(ratio[2]) - Real(0.5); + fine(i,j,k,n) += zoff * slope * sfz; + } // ck +#endif +#endif + } // mask + } // dim + +#if (AMREX_SPACEDIM >= 2) + if (dim == 1 && cj*ratio[1] == j) { + // Check solve mask to ensure we don't overwrite valid fine data. + if (!mask || mask(ci, cj, ck, n)) { + fine(i, j, k, n) = crse(ci, cj, ck, n); + if (ci > per_grown_domain.smallEnd(0) && ci < per_grown_domain.bigEnd(0) && ratio[0] > 1) { + Real sfx = Real(1.0); + Real dc = Real(0.5) * (crse(ci+1,cj,ck,n) - crse(ci-1,cj,ck,n)); + Real df = Real(2.0) * (crse(ci+1,cj,ck,n) - crse(ci ,cj,ck,n)); + Real db = Real(2.0) * (crse(ci ,cj,ck,n) - crse(ci-1,cj,ck,n)); + Real sx = (df*db >= Real(0.0)) ? + amrex::min(std::abs(df),std::abs(db)) : Real(0.); + sx = std::copysign(Real(1.),dc)*amrex::min(sx,std::abs(dc)); + if (dc != Real(0.0)) { + sfx = amrex::min(sfx, sx / dc); + } + Real slope = dc; + Real xoff = (static_cast(i - ci*ratio[0]) + Real(0.5)) / Real(ratio[0]) - Real(0.5); + fine(i,j,k,n) += xoff * slope * sfx; + } // ci +#if (AMREX_SPACEDIM == 3) + if (ck > per_grown_domain.smallEnd(2) && ck < per_grown_domain.bigEnd(2) && ratio[2] > 1) { + Real sfz = Real(1.0); + Real dc = Real(0.5) * (crse(ci,cj,ck+1,n) - crse(ci,cj,ck-1,n)); + Real df = Real(2.0) * (crse(ci,cj,ck+1,n) - crse(ci,cj,ck ,n)); + Real db = Real(2.0) * (crse(ci,cj,ck ,n) - crse(ci,cj,ck-1,n)); + Real sz = (df*db >= Real(0.0)) ? + amrex::min(std::abs(df),std::abs(db)) : Real(0.); + sz = std::copysign(Real(1.),dc)*amrex::min(sz,std::abs(dc)); + if (dc != Real(0.0)) { + sfz = amrex::min(sfz, sz / dc); + } + Real slope = dc; + Real zoff = (static_cast(k - ck*ratio[2]) + Real(0.5)) / Real(ratio[2]) - Real(0.5); + fine(i,j,k,n) += zoff * slope * sfz; + } // ck +#endif // SPACEDIM >= 3 + } // mask + } // dim == 1 +#endif // SPACEDIM >= 2 + +#if (AMREX_SPACEDIM == 3) + if (dim == 2 && ck*ratio[2] == k) { + // Check solve mask to ensure we don't overwrite valid fine data. 
+ if (!mask || mask(ci, cj, ck, n)) { + fine(i, j, k, n) = crse(ci, cj, ck, n); + if (ci > per_grown_domain.smallEnd(0) && ci < per_grown_domain.bigEnd(0) && ratio[0] > 1) { + Real sfx = Real(1.0); + Real dc = Real(0.5) * (crse(ci+1,cj,ck,n) - crse(ci-1,cj,ck,n)); + Real df = Real(2.0) * (crse(ci+1,cj,ck,n) - crse(ci ,cj,ck,n)); + Real db = Real(2.0) * (crse(ci ,cj,ck,n) - crse(ci-1,cj,ck,n)); + Real sx = (df*db >= Real(0.0)) ? + amrex::min(std::abs(df),std::abs(db)) : Real(0.); + sx = std::copysign(Real(1.),dc)*amrex::min(sx,std::abs(dc)); + if (dc != Real(0.0)) { + sfx = amrex::min(sfx, sx / dc); + } + Real slope = dc; + Real xoff = (static_cast(i - ci*ratio[0]) + Real(0.5)) / Real(ratio[0]) - Real(0.5); + fine(i,j,k,n) += xoff * slope * sfx; + } // ci + if (cj > per_grown_domain.smallEnd(1) && cj < per_grown_domain.bigEnd(1) && ratio[1] > 1) { + Real sfy = Real(1.0); + Real dc = Real(0.5) * (crse(ci,cj+1,ck,n) - crse(ci,cj-1,ck,n)); + Real df = Real(2.0) * (crse(ci,cj+1,ck,n) - crse(ci,cj ,ck,n)); + Real db = Real(2.0) * (crse(ci,cj ,ck,n) - crse(ci,cj-1,ck,n)); + Real sy = (df*db >= Real(0.0)) ? + amrex::min(std::abs(df),std::abs(db)) : Real(0.); + sy = std::copysign(Real(1.),dc)*amrex::min(sy,std::abs(dc)); + if (dc != Real(0.0)) { + sfy = amrex::min(sfy, sy / dc); + } + Real slope = dc; + Real yoff = (static_cast(j - cj*ratio[1]) + Real(0.5)) / Real(ratio[1]) - Real(0.5); + fine(i,j,k,n) += yoff * slope * sfy; + } // cj + } // mask + } // dim == 2 +#endif +} + // // Do linear in dir, pc transverse to dir, leave alone the fine values // lining up with coarse edges--assume these have been set to hold the diff --git a/Src/AmrCore/AMReX_Interpolater.H b/Src/AmrCore/AMReX_Interpolater.H index d2fe66b0cbd..c13fb283a35 100644 --- a/Src/AmrCore/AMReX_Interpolater.H +++ b/Src/AmrCore/AMReX_Interpolater.H @@ -17,7 +17,6 @@ class IArrayBox; * * Specifies interpolater interface for coarse-to-fine interpolation in space. */ - class Interpolater : public InterpBase { @@ -160,7 +159,6 @@ public: * * Bilinear interpolation on node centered data. */ - class NodeBilinear : public Interpolater @@ -219,7 +217,6 @@ public: * * Bilinear interpolation on cell centered data. */ - class CellBilinear : public Interpolater @@ -286,7 +283,6 @@ public: * sum_ivar a(ic,jc,ivar)*fab(if,jf,ivar) = 0 is satisfied * in all fine cells if,jf covering coarse cell ic,jc. */ - class CellConservativeLinear : public Interpolater @@ -344,7 +340,6 @@ protected: * Linear conservative interpolation on cell centered data * but with protection against undershoots or overshoots. */ - class CellConservativeProtected : public CellConservativeLinear @@ -393,7 +388,6 @@ public: * * Quadratic interpolation on cell centered data. */ - class CellQuadratic : public Interpolater @@ -451,7 +445,6 @@ public: /** * \brief Piecewise Constant interpolation on cell centered data. */ - class PCInterp : public Interpolater @@ -512,7 +505,6 @@ public: * in constructing the polynomial, the average of the polynomial inside that * cell is equal to the cell averaged value of the original data. */ - class CellConservativeQuartic : public Interpolater @@ -574,7 +566,6 @@ public: * a given coarse cell will have the same divergence, even when the coarse * grid divergence is spatially varying. */ - class FaceDivFree : public Interpolater @@ -663,11 +654,10 @@ public: /** -* \brief Bilinear interpolation on face data. +* \brief Piecewise constant tangential interpolation / linear normal interpolation of face data. * -* Bilinear interpolation on data. 
+* Piecewise constant tangential interpolation / linear normal interpolation of face data. */ - class FaceLinear : public Interpolater @@ -782,6 +772,127 @@ public: RunOn runon) override; +}; + +/** +* \brief Bilinear tangential interpolation / linear normal interpolation of face data. +* +* Bilinear tangential interpolation / linear normal interpolation of face data. +*/ +class FaceConservativeLinear + : + public Interpolater +{ +public: + /** + * \brief Returns coarsened box given fine box and refinement ratio. + * + * \param fine + * \param ratio + */ + Box CoarseBox (const Box& fine, int ratio) override; + + /** + * \brief Returns coarsened box given fine box and refinement ratio. + * + * \param fine + * \param ratio + */ + Box CoarseBox (const Box& fine, const IntVect& ratio) override; + + /** + * \brief Coarse to fine interpolation in space. + * + * \param crse + * \param crse_comp + * \param fine + * \param fine_comp + * \param ncomp + * \param fine_region + * \param ratio + * \param crse_geom + * \param fine_geom + * \param bcr + * \param actual_comp + * \param actual_state + */ + void interp (const FArrayBox& crse, + int crse_comp, + FArrayBox& fine, + int fine_comp, + int ncomp, + const Box& fine_region, + const IntVect& ratio, + const Geometry& crse_geom, + const Geometry& fine_geom, + Vector const& bcr, + int actual_comp, + int actual_state, + RunOn runon) override; + + /** + * \brief Coarse to fine interpolation in space for face-based data. + * + * \param crse + * \param crse_comp + * \param fine + * \param fine_comp + * \param ncomp + * \param fine_region + * \param ratio + * \param solve_mask + * \param crse_geom + * \param fine_geom + * \param bcr + * \param bccomp + * \param runon + */ + void interp_face (const FArrayBox& crse, + int crse_comp, + FArrayBox& fine, + int fine_comp, + int ncomp, + const Box& fine_region, + const IntVect& ratio, + const IArrayBox& solve_mask, + const Geometry& crse_geom, + const Geometry& fine_geom, + Vector const & bcr, + int bccomp, + RunOn runon) override; + + /** + * \brief Coarse to fine interpolation in space. + * + * \param crse + * \param crse_comp + * \param fine + * \param fine_comp + * \param ncomp + * \param fine_region + * \param ratio + * \param crse_geom + * \param fine_geom + * \param bcr + * \param actual_comp + * \param actual_state + */ + void interp_arr (Array const& crse, + int crse_comp, + Array const& fine, + int fine_comp, + int ncomp, + const Box& fine_region, + const IntVect& ratio, + Array const& solve_mask, + const Geometry& /*crse_geom*/, + const Geometry& /*fine_geom*/, + Vector > const& /*bcr*/, + int /*actual_comp*/, + int /*actual_state*/, + RunOn runon) override; + + }; /** @@ -789,7 +900,6 @@ public: * * Quartic interpolation on cell centered data. 
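Both the reworked `interp_face_reg` and the new `face_cons_linear_face_interp` build their tangential corrections from the same limited slope: a monotonized-central limiter that takes the smaller one-sided difference, caps it by the central difference, and drops to zero at extrema; the fine face then receives `crse + off * slope`, with the offset `off` in (-1/2, 1/2). A one-dimensional sketch of the net effect (not the library routine itself):

    #include <algorithm>
    #include <cmath>

    double limited_slope (double cm, double c0, double cp)
    {
        const double dc = 0.5 * (cp - cm); // central difference
        const double df = 2.0 * (cp - c0); // forward difference, doubled
        const double db = 2.0 * (c0 - cm); // backward difference, doubled
        double s = (df * db >= 0.0) ? std::min(std::abs(df), std::abs(db)) : 0.0;
        return std::copysign(1.0, dc) * std::min(s, std::abs(dc));
    }

    // For a fine face at tangential offset off inside its coarse face:
    //   fine = crse0 + off * limited_slope(crse_minus, crse0, crse_plus);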
*/ - class CellQuartic : public Interpolater @@ -847,6 +957,7 @@ extern AMREX_EXPORT PCInterp pc_interp; extern AMREX_EXPORT NodeBilinear node_bilinear_interp; extern AMREX_EXPORT FaceDivFree face_divfree_interp; extern AMREX_EXPORT FaceLinear face_linear_interp; +extern AMREX_EXPORT FaceConservativeLinear face_cons_linear_interp; extern AMREX_EXPORT CellConservativeLinear lincc_interp; extern AMREX_EXPORT CellConservativeLinear cell_cons_interp; extern AMREX_EXPORT CellBilinear cell_bilinear_interp; diff --git a/Src/AmrCore/AMReX_Interpolater.cpp b/Src/AmrCore/AMReX_Interpolater.cpp index 601b8b4b861..b5e855feb13 100644 --- a/Src/AmrCore/AMReX_Interpolater.cpp +++ b/Src/AmrCore/AMReX_Interpolater.cpp @@ -11,7 +11,7 @@ namespace amrex { /* - * PCInterp, NodeBilinear, FaceLinear, CellConservativeLinear, and + * PCInterp, NodeBilinear, FaceLinear, CellConservativeLinear and + * CellBilinear are supported for all dimensions on cpu and gpu. * * CellConservativeProtected only works in 2D and 3D on cpu and gpu @@ -23,6 +23,8 @@ namespace amrex { * * CellConservativeQuartic only works with ref ratio of 2 on cpu and gpu. * + * FaceConservativeLinear works in 2D and 3D on cpu and gpu. + * * FaceDivFree works in 2D and 3D on cpu and gpu. * The algorithm is restricted to ref ratio of 2. */ @@ -33,6 +35,7 @@ namespace amrex { PCInterp pc_interp; NodeBilinear node_bilinear_interp; FaceLinear face_linear_interp; +FaceConservativeLinear face_cons_linear_interp; FaceDivFree face_divfree_interp; CellConservativeLinear lincc_interp; CellConservativeLinear cell_cons_interp(false); @@ -142,7 +145,14 @@ FaceLinear::interp (const FArrayBox& crse, RunOn runon) { // - // This version is called from InterpFromCoarseLevel + // This version is called from FillPatchInterp which is called by + // InterpFromCoarseLevel in AMReX_FillPatchUtil_I.H + // + // It assumes no existing fine values that need to be preserved (unlike interp_face below) + // + // Inside each call to face_linear_interp_* (in AMReX_Interp_*D_C.H), we do: + // * on fine faces which overlie crse faces, the fine value is set to the crse value (piecewise constant) + // * on fine faces which are between two crse faces, the fine value is set to the average of the crse values (linear) // BL_PROFILE("FaceLinear::interp()"); @@ -193,6 +203,18 @@ FaceLinear::interp_face (const FArrayBox& crse, const int /*bccomp*/, RunOn runon) { + // + // This version is called from InterpFace which is called from the version FillPatchTwoLevels_doit + // that takes a single MF (in AMReX_FillPatchUtil_I.H) + // + // It assumes there are existing fine values which we want to preserve (unlike interp above) + // + // We do the interpolation in two steps: + // 1) face_linear_face_interp_*: on fine faces which overlie crse faces, the fine value is set to the crse value (piecewise constant) ONLY IF + // there is not already fine data there + // 2) face_linear_interp_*: on fine faces which are between two crse faces, the fine value is set to the average of the values + // on the faces overlying -- this uses only the results of step 1, it does not take the crse values + // BL_PROFILE("FaceLinear::interp_face()"); AMREX_ASSERT(AMREX_D_TERM(fine_region.type(0),+fine_region.type(1),+fine_region.type(2)) == 1); @@ -283,6 +305,17 @@ void FaceLinear::interp_arr (Array const& crse, const int /*actual_state*/, const RunOn runon) { + // + // This version is called from FillPatchTwoLevels_doit (that takes an Array of MF*) in AMReX_FillPatchUtil_I.H + // + // It assumes there are existing fine
values which we want to preserve (like face_interp, unlike interp above) + // + // We do the interpolation in two steps: + // 1) face_linear_face_interp_*: on fine faces which overlie crse faces, the fine value is set to the crse value (piecewise constant) ONLY IF + // there is not already fine data there + // 2) face_linear_interp_*: on fine faces which are between two crse faces, the fine value is set to the average of the values + // on the faces overlying -- this uses only the results of step 1, it does not take the crse values + // BL_PROFILE("FaceLinear::interp_arr()"); Array types; @@ -377,6 +410,307 @@ void FaceLinear::interp_arr (Array const& crse, }); } +Box +FaceConservativeLinear::CoarseBox (const Box& fine, int ratio) +{ + return CoarseBox(fine, IntVect(ratio)); +} + +Box +FaceConservativeLinear::CoarseBox (const Box& fine, const IntVect& ratio) +{ + IntVect ng(1); + for (int i = 0; i < AMREX_SPACEDIM; i++) { + if ( (fine.type(i) == IndexType::NODE) || (ratio[i] == 1) ) { + ng[i] = 0; + } + } + Box b = amrex::coarsen(fine,ratio); b.grow(ng); + + for (int i = 0; i < AMREX_SPACEDIM; i++) { + if (b.type(i) == IndexType::NODE) { + if (b.type(i) == IndexType::NODE && b.length(i) < 2) { + // Don't want degenerate boxes in nodal direction. + b.growHi(i,1); + } + } + } + return b; +} + +void +FaceConservativeLinear::interp (const FArrayBox& crse, + int crse_comp, + FArrayBox& fine, + int fine_comp, + int ncomp, + const Box& fine_region, + const IntVect& ratio, + const Geometry& crse_geom, + const Geometry& fine_geom, + Vector const& bcr, + int /*actual_comp*/, + int /*actual_state*/, + RunOn runon) +{ + // + // This version is called from FillPatchInterp which is called by + // InterpFromCoarseLevel in AMReX_FillPatchUtil_I.H + // + // It assumes no existing fine values that need to be preserved thus does not send a mask to interp_face + // + BL_PROFILE("FaceConservativeLinear::interp()"); + + AMREX_ASSERT(AMREX_D_TERM(fine_region.type(0),+fine_region.type(1),+fine_region.type(2)) == 1); + + // We intentionally do not allocate the mask so that all faces are filled from coarse values + IArrayBox dummy_mask; + int bccomp = 0; // This is also a dummy -- it's not used + interp_face(crse,crse_comp,fine,fine_comp,ncomp,fine_region,ratio,dummy_mask, + crse_geom,fine_geom,bcr,bccomp,runon); +} + +void +FaceConservativeLinear::interp_face (const FArrayBox& crse, + const int crse_comp, + FArrayBox& fine, + const int fine_comp, + const int ncomp, + const Box& fine_region, + const IntVect& ratio, + const IArrayBox& solve_mask, + const Geometry& crse_geom, + const Geometry& /*fine_geom */, + Vector const& /*bcr*/, + const int /*bccomp*/, + RunOn runon) +{ + // + // This version is called from InterpFace which is called from the version FillPatchTwoLevels_doit + // that takes a single MF (in AMReX_FillPatchUtil_I.H) + // + // It assumes there are existing fine values which we want to preserve (unlike interp above) + // + // We do the interpolation in two steps: + // 1) face_cons_linear_face_interp: on fine faces which overlie crse faces, slopes are computed (linear in 2d, bilinear in 3d) + // and the fine value is over-written ONLY IF there is not already fine data there (assuming the mask is used) + // 2) face_linear_interp_*: on fine faces which are between two crse faces, the fine value is set to the average of the values + // on the faces overlying -- this uses only the results of step 1 + // NOTE: we use the same routines as used by FaceLinear since this interpolation is only in the normal 
direction + // + BL_PROFILE("FaceConservativeLinear::interp_face()"); + + AMREX_ASSERT(AMREX_D_TERM(fine_region.type(0),+fine_region.type(1),+fine_region.type(2)) == 1); + Array4 const& fine_arr = fine.array(fine_comp); + Array4 const& crse_arr = crse.const_array(crse_comp); + Array4 mask_arr; + if (solve_mask.isAllocated()) { + mask_arr = solve_mask.const_array(); + } + + // We don't need to worry about face-based domain because this is only used in the tangential interpolation + Box per_grown_domain = crse_geom.Domain(); + for (int dim = 0; dim < AMREX_SPACEDIM; dim++) { + if (crse_geom.isPeriodic(dim)) { + per_grown_domain.grow(dim,1); + } + } + + // + // Fill fine ghost faces with interpolation of coarse data that is conservative linear + // in the tangential direction. + // Operate only on faces that overlap--ie, only fill the fine faces that make up each + // coarse face, leave the in-between faces alone. + // The mask ensures we do not overwrite valid fine cells. + // + if (fine_region.type(0) == IndexType::NODE) + { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon,fine_region,ncomp,i,j,k,n, + { + face_cons_linear_face_interp(i,j,k,n,fine_arr,crse_arr,mask_arr,ratio,per_grown_domain,0); + }); + } +#if (AMREX_SPACEDIM >= 2) + else if (fine_region.type(1) == IndexType::NODE) + { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon,fine_region,ncomp,i,j,k,n, + { + face_cons_linear_face_interp(i,j,k,n,fine_arr,crse_arr,mask_arr,ratio,per_grown_domain,1); + }); + } +#if (AMREX_SPACEDIM == 3) + else + { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon,fine_region,ncomp,i,j,k,n, + { + face_cons_linear_face_interp(i,j,k,n,fine_arr,crse_arr,mask_arr,ratio,per_grown_domain,2); + }); + } +#endif +#endif + + // + // Interpolate unfilled grow cells using best data from + // surrounding faces of valid region, and pc-interpd data + // on fine faces overlaying coarse edges. 
+ // + if (fine_region.type(0) == IndexType::NODE) + { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon,fine_region,ncomp,i,j,k,n, + { + face_linear_interp_x(i,j,k,n,fine_arr,ratio); + }); + } +#if (AMREX_SPACEDIM >= 2) + else if (fine_region.type(1) == IndexType::NODE) + { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon,fine_region,ncomp,i,j,k,n, + { + face_linear_interp_y(i,j,k,n,fine_arr,ratio); + }); + } +#if (AMREX_SPACEDIM == 3) + else + { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon,fine_region,ncomp,i,j,k,n, + { + face_linear_interp_z(i,j,k,n,fine_arr,ratio); + }); + } +#endif +#endif +} + +void FaceConservativeLinear::interp_arr (Array const& crse, + const int crse_comp, + Array const& fine, + const int fine_comp, + const int ncomp, + const Box& fine_region, + const IntVect& ratio, + Array const& solve_mask, + const Geometry& crse_geom, + const Geometry& /*fine_geom*/, + Vector > const& /*bcr*/, + const int /*actual_comp*/, + const int /*actual_state*/, + const RunOn runon) +{ + // + // This version is called from FillPatchTwoLevels_doit (that takes an Array of MF*) in AMReX_FillPatchUtil_I.H + // + // It assumes there are existing fine values which we want to preserve (like face_interp, unlike interp above) + // + // We do the interpolation in two steps: + // 1) face_cons_linear_face_interp_*: on fine faces which overlie crse faces, we compute tangential slopes + // to compute the fine values (linear in 2d, bilinear in 3d) ONLY IF there is not already fine data there + // 2) face_cons_linear_interp_*: on fine faces which are between two crse faces, the fine value is set to the average of the values + // on the faces overlying -- this uses only the results of step 1, it does not take the crse values + // NOTE: here we use the same routines as used by FaceLinear since this interpolation is only in the normal direction + // + BL_PROFILE("FaceConservativeLinear::interp_arr()"); + + Array types; + for (int d=0; d, AMREX_SPACEDIM> crse_arr; + GpuArray, AMREX_SPACEDIM> fine_arr; + GpuArray, AMREX_SPACEDIM> mask_arr; + for (int d=0; dconst_array(crse_comp); + fine_arr[d] = fine[d]->array(fine_comp); + if (solve_mask[d] != nullptr) + { mask_arr[d] = solve_mask[d]->const_array(0); } + } + + // We don't need to worry about face-based domain because this is only used in the tangential interpolation + Box per_grown_domain = crse_geom.Domain(); + for (int dim = 0; dim < AMREX_SPACEDIM; dim++) { + if (crse_geom.isPeriodic(dim)) { + per_grown_domain.grow(dim,1); + } + } + + // + // Fill fine ghost faces with interpolation of coarse data that is conservative linear + // in the tangential direction. + // Operate only on faces that overlap--ie, only fill the fine faces that make up each + // coarse face, leave the in-between faces alone. + // The mask ensures we do not overwrite valid fine cells. + // + // Fuse the launches, 1 for each dimension, into a single launch. + AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM_FLAG(runon, + amrex::convert(fine_region,types[0]), bx0, + { + AMREX_LOOP_3D(bx0, i, j, k, + { + for (int n=0; n @@ -145,7 +144,6 @@ public: * * A container class for TagBoxes. 
*/ - class TagBoxArray : public FabArray diff --git a/Src/AmrCore/Make.package b/Src/AmrCore/Make.package index df3c2e83d40..bd0bddd6704 100644 --- a/Src/AmrCore/Make.package +++ b/Src/AmrCore/Make.package @@ -1,5 +1,6 @@ -CEXE_headers += AMReX_AmrCore.H AMReX_Cluster.H AMReX_ErrorList.H AMReX_FillPatchUtil.H AMReX_FillPatchUtil_I.H AMReX_FluxRegister.H \ +CEXE_headers += AMReX_AmrCore.H AMReX_Cluster.H AMReX_ErrorList.H AMReX_FillPatchUtil.H \ + AMReX_FillPatchUtil_I.H AMReX_FluxRegister.H \ AMReX_Interpolater.H AMReX_MFInterpolater.H AMReX_TagBox.H AMReX_AmrMesh.H \ AMReX_InterpBase.H CEXE_sources += AMReX_AmrCore.cpp AMReX_Cluster.cpp AMReX_ErrorList.cpp AMReX_FillPatchUtil.cpp AMReX_FluxRegister.cpp \ diff --git a/Src/Base/AMReX.H b/Src/Base/AMReX.H index c539a1d8e75..2b88553bcdf 100644 --- a/Src/Base/AMReX.H +++ b/Src/Base/AMReX.H @@ -113,16 +113,15 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Error (const char* msg = nullptr) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(msg); + AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);)) #else - if (msg) { AMREX_DEVICE_PRINTF("Error %s\n", msg); } - AMREX_DEVICE_ASSERT(0); -#endif -#else - Error_host("Error", msg); + AMREX_IF_ON_DEVICE(( + if (msg) { AMREX_DEVICE_PRINTF("Error %s\n", msg); } + AMREX_DEVICE_ASSERT(0); + )) #endif + AMREX_IF_ON_HOST((Error_host("Error", msg);)) } //! Print out warning message to cerr. @@ -132,15 +131,12 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Warning (const char * msg) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(msg); -#else - if (msg) { AMREX_DEVICE_PRINTF("Warning %s\n", msg); } -#endif + AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);)) #else - Warning_host(msg); + AMREX_IF_ON_DEVICE((if (msg) { AMREX_DEVICE_PRINTF("Warning %s\n", msg); })) #endif + AMREX_IF_ON_HOST((Warning_host(msg);)) } //! Print out message to cerr and exit via abort(). 
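The AMReX.H hunks here replace the old `#if AMREX_DEVICE_COMPILE` preprocessor split with the AMREX_IF_ON_DEVICE/AMREX_IF_ON_HOST function-style macros, so the NDEBUG conditional no longer has to nest a second preprocessor branch. A minimal sketch of the pattern, assuming an amrex translation unit; the function itself is hypothetical, only the macros are from this patch:

    #include <sstream>

    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void ReportOutOfRange (int i, int n)
    {
        // Device pass: printf-style diagnostics only.
        AMREX_IF_ON_DEVICE((
            AMREX_DEVICE_PRINTF("index %d is out of bound (0:%d)\n", i, n-1);
        ))
        // Host pass: the usual stream-based error path.
        AMREX_IF_ON_HOST((
            std::stringstream ss;
            ss << "index " << i << " is out of bound (0:" << n-1 << ")";
            amrex::Abort(ss.str());
        ))
    }

Both branches are compiled in a single pass; the macro discards the body on the inactive side, and the double parentheses let a body containing commas pass through as one macro argument.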
@@ -148,16 +144,15 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Abort (const char * msg = nullptr) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(msg); + AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);)) #else - if (msg) { AMREX_DEVICE_PRINTF("Abort %s\n", msg); } - AMREX_DEVICE_ASSERT(0); -#endif -#else - Error_host("Abort", msg); + AMREX_IF_ON_DEVICE(( + if (msg) { AMREX_DEVICE_PRINTF("Abort %s\n", msg); } + AMREX_DEVICE_ASSERT(0); + )) #endif + AMREX_IF_ON_HOST((Error_host("Abort", msg);)) } /** @@ -170,22 +165,21 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Assert (const char* EX, const char* file, int line, const char* msg = nullptr) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(EX,file,line,msg); -#else - if (msg) { - AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d, Msg: %s", - EX, file, line, msg); - } else { - AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d", - EX, file, line); - } - AMREX_DEVICE_ASSERT(0); -#endif + AMREX_IF_ON_DEVICE((amrex::ignore_unused(EX,file,line,msg);)) #else - Assert_host(EX,file,line,msg); + AMREX_IF_ON_DEVICE(( + if (msg) { + AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d, Msg: %s", + EX, file, line, msg); + } else { + AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d", + EX, file, line); + } + AMREX_DEVICE_ASSERT(0); + )) #endif + AMREX_IF_ON_HOST((Assert_host(EX,file,line,msg);)) } /** diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp index f0e56952de2..2d6e7626c80 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -52,6 +52,7 @@ #endif #ifdef AMREX_USE_OMP +#include #include #endif @@ -72,7 +73,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -443,6 +446,10 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, pp.queryAdd("verbose", system::verbose); } + if (system::verbose > 0) { + amrex::Print() << "Initializing AMReX (" << amrex::Version() << ")...\n"; + } + #ifdef AMREX_USE_MPI if (system::verbose > 0) { amrex::Print() << "MPI initialized with " @@ -450,20 +457,22 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, << " MPI processes\n"; int provided = -1; MPI_Query_thread(&provided); - amrex::Print() << "MPI initialized with thread support level " << provided << std::endl; + amrex::Print() << "MPI initialized with thread support level " << provided << '\n'; } #endif #ifdef AMREX_USE_OMP + amrex::OpenMP::Initialize(); + + // status output if (system::verbose > 0) { // static_assert(_OPENMP >= 201107, "OpenMP >= 3.1 is required."); amrex::Print() << "OMP initialized with " << omp_get_max_threads() << " OMP threads\n"; } -#endif -#if defined(AMREX_USE_MPI) && defined(AMREX_USE_OMP) + // warn if over-subscription is detected if (system::verbose > 0) { auto ncores = int(std::thread::hardware_concurrency()); if (ncores != 0 && // It might be zero according to the C++ standard. 
@@ -472,8 +481,10 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, amrex::Print(amrex::ErrorStream()) << "AMReX Warning: You might be oversubscribing CPU cores with OMP threads.\n" << " There are " << ncores << " cores per node.\n" - << " There are " << ParallelDescriptor::NProcsPerNode() << " MPI ranks per node.\n" - << " But OMP is initialized with " << omp_get_max_threads() << " threads per rank.\n" +#if defined(AMREX_USE_MPI) + << " There are " << ParallelDescriptor::NProcsPerNode() << " MPI ranks (processes) per node.\n" +#endif + << " But OMP is initialized with " << omp_get_max_threads() << " threads per process.\n" << " You should consider setting OMP_NUM_THREADS=" << ncores/ParallelDescriptor::NProcsPerNode() << " or less in the environment.\n"; } @@ -677,7 +688,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, #endif if (system::verbose > 0) { - amrex::Print() << "AMReX (" << amrex::Version() << ") initialized" << std::endl; + amrex::Print() << "AMReX (" << amrex::Version() << ") initialized" << '\n'; } BL_TINY_PROFILE_INITIALIZE(); @@ -748,7 +759,7 @@ amrex::Finalize (amrex::AMReX* pamrex) << "min used in a thread: " << mp_min << " MB, " << "max used in a thread: " << mp_max << " MB, " #endif - << "tot used: " << mp_tot << " MB." << std::endl; + << "tot used: " << mp_tot << " MB." << '\n'; } } else { int global_max = mp_tot; @@ -806,6 +817,10 @@ amrex::Finalize (amrex::AMReX* pamrex) Gpu::Device::Finalize(); #endif +#ifdef AMREX_USE_OMP + amrex::OpenMP::Finalize(); +#endif + #if defined(AMREX_USE_UPCXX) upcxx::finalize(); #endif @@ -823,7 +838,7 @@ amrex::Finalize (amrex::AMReX* pamrex) #endif if (amrex::system::verbose > 0 && is_ioproc) { - amrex::OutStream() << "AMReX (" << amrex::Version() << ") finalized" << std::endl; + amrex::OutStream() << "AMReX (" << amrex::Version() << ") finalized" << '\n'; } } diff --git a/Src/Base/AMReX_Algorithm.H b/Src/Base/AMReX_Algorithm.H index b418f3cc1c0..666bd04e230 100644 --- a/Src/Base/AMReX_Algorithm.H +++ b/Src/Base/AMReX_Algorithm.H @@ -89,7 +89,7 @@ namespace amrex // Reference: https://en.cppreference.com/w/cpp/types/numeric_limits/epsilon template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - typename std::enable_if::value,bool>::type + std::enable_if_t,bool> almostEqual (T x, T y, int ulp = 2) { // the machine epsilon has to be scaled to the magnitude of the values used @@ -100,7 +100,7 @@ namespace amrex } template ::value,int>::type FOO = 0> + std::enable_if_t,int>FOO = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T bisect (T lo, T hi, F f, T tol=1e-12, int max_iter=100) { @@ -141,7 +141,7 @@ namespace amrex // It is assumed that the input data are sorted and T[lo] <= v < T[hi]. // Note that this is different from std::lower_bound. 
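Since the contract spelled out in the comment above differs from std::lower_bound, a short usage sketch may help (made-up data; assumes amrex headers): for sorted interval edges d, amrex::bisect returns the index i of the interval with d[i] <= v < d[i+1], whereas std::lower_bound would return an iterator to the first element not less than v.

    #include <AMReX_Algorithm.H>
    #include <cassert>

    void bisect_demo ()
    {
        // Five edges bound four intervals; v = 3.0 lies in interval 2
        // because d[2] = 2 <= 3 < 4 = d[3].
        double d[] = {0.0, 1.0, 2.0, 4.0, 8.0};
        int i = amrex::bisect(d, 0, 4, 3.0);
        assert(i == 2);
    }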
template ::value,int>::type = 0> + std::enable_if_t,int> = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE I bisect (T const* d, I lo, I hi, T const& v) { while (lo <= hi) { @@ -161,51 +161,89 @@ namespace amrex AMREX_GPU_HOST_DEVICE ItType upper_bound (ItType first, ItType last, const ValType& val) { -#if AMREX_DEVICE_COMPILE - std::ptrdiff_t count = last-first; - while(count>0){ - auto it = first; - const auto step = count/2; - it += step; - if (!(val < *it)){ - first = ++it; - count -= step + 1; + AMREX_IF_ON_DEVICE(( + std::ptrdiff_t count = last-first; + while(count>0){ + auto it = first; + const auto step = count/2; + it += step; + if (!(val < *it)){ + first = ++it; + count -= step + 1; + } + else{ + count = step; + } } - else{ - count = step; - } - } - - return first; -#else - return std::upper_bound(first, last, val); -#endif + return first; + )) + AMREX_IF_ON_HOST(( + return std::upper_bound(first, last, val); + )) } template AMREX_GPU_HOST_DEVICE ItType lower_bound (ItType first, ItType last, const ValType& val) { -#ifdef AMREX_DEVICE_COMPILE - std::ptrdiff_t count = last-first; - while(count>0) - { - auto it = first; - const auto step = count/2; - it += step; - if (*it < val){ - first = ++it; - count -= step + 1; + AMREX_IF_ON_DEVICE(( + std::ptrdiff_t count = last-first; + while(count>0) + { + auto it = first; + const auto step = count/2; + it += step; + if (*it < val){ + first = ++it; + count -= step + 1; + } + else{ + count = step; + } } - else{ - count = step; + + return first; + )) + AMREX_IF_ON_HOST(( + return std::lower_bound(first, last, val); + )) + } + + template::value_type> && + std::is_floating_point_v, + int> = 0> + AMREX_GPU_HOST_DEVICE + void linspace (ItType first, const ItType& last, const ValType& start, const ValType& stop) + { + const std::ptrdiff_t count = last-first; + if (count >= 2){ + const auto delta = (stop - start)/(count - 1); + for (std::ptrdiff_t i = 0; i < count-1; ++i){ + *(first++) = start + i*delta; } + *first = stop; } + } - return first; -#else - return std::lower_bound(first, last, val); -#endif + template::value_type> && + std::is_floating_point_v, + int> = 0> + AMREX_GPU_HOST_DEVICE + void logspace (ItType first, const ItType& last, + const ValType& start, const ValType& stop, const ValType& base) + { + const std::ptrdiff_t count = last-first; + if (count >= 2){ + const auto delta = (stop - start)/(count - 1); + for (std::ptrdiff_t i = 0; i < count-1; ++i){ + *(first++) = std::pow(base, start + i*delta); + } + *first = std::pow(base, stop); + } } namespace detail { @@ -218,104 +256,125 @@ struct clz_tag : clzl_tag {}; // unsigned long, and unsigned long long inputs. Because the sizes of these data types // vary on different platforms, we work with fixed-width integer types. 
// these tags and overloads select the smallest version of __builtin_clz that will hold the input type -template ::type> +template > AMREX_FORCE_INLINE int builtin_clz_wrapper (clz_tag, T x) noexcept { return static_cast(__builtin_clz(x) - (sizeof(unsigned int) * CHAR_BIT - sizeof(T) * CHAR_BIT)); } -template ::type> +template > AMREX_FORCE_INLINE int builtin_clz_wrapper (clzl_tag, T x) noexcept { return static_cast(__builtin_clzl(x) - (sizeof(unsigned long) * CHAR_BIT - sizeof(T) * CHAR_BIT)); } -template ::type> +template > AMREX_FORCE_INLINE int builtin_clz_wrapper (clzll_tag, T x) noexcept { return static_cast(__builtin_clzll(x) - (sizeof(unsigned long long) * CHAR_BIT - sizeof(T) * CHAR_BIT)); } -#ifdef AMREX_USE_CUDA - -// likewise with CUDA, there are __clz functions that take (signed) int and long long int -template ::type> -AMREX_GPU_DEVICE AMREX_FORCE_INLINE -int clz_wrapper (clz_tag, T x) noexcept -{ - return __clz((int) x) - (sizeof(int) * CHAR_BIT - sizeof(T) * CHAR_BIT); -} - -template ::type> -AMREX_GPU_DEVICE AMREX_FORCE_INLINE -int clz_wrapper (clzll_tag, T x) noexcept -{ - return __clzll((long long int) x) - (sizeof(long long int) * CHAR_BIT - sizeof(T) * CHAR_BIT); } -#endif -} +template ,std::uint8_t> || + std::is_same_v,std::uint16_t> || + std::is_same_v,std::uint32_t> || + std::is_same_v,std::uint64_t>, int> = 0> +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +int clz (T x) noexcept; AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint8_t x) noexcept +int clz_generic (std::uint8_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else +#if !defined(__NVCOMPILER) static constexpr int clz_lookup[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; +#else + constexpr int clz_lookup[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; +#endif auto upper = x >> 4; auto lower = x & 0xF; return upper ? clz_lookup[upper] : 4 + clz_lookup[lower]; -#endif } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint16_t x) noexcept +int clz_generic (std::uint16_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else auto upper = std::uint8_t(x >> 8); auto lower = std::uint8_t(x & 0xFF); return upper ? clz(upper) : 8 + clz(lower); -#endif } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint32_t x) noexcept +int clz_generic (std::uint32_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else auto upper = std::uint16_t(x >> 16); auto lower = std::uint16_t(x & 0xFFFF); return upper ? 
clz(upper) : 16 + clz(lower); -#endif } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint64_t x) noexcept +int clz_generic (std::uint64_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else auto upper = std::uint32_t(x >> 32); auto lower = std::uint32_t(x & 0xFFFFFFFF); return upper ? clz(upper) : 32 + clz(lower); +} + +#if defined AMREX_USE_CUDA + +namespace detail { + // likewise with CUDA, there are __clz functions that take (signed) int and long long int + template > + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + int clz_wrapper (clz_tag, T x) noexcept + { + return __clz((int) x) - (sizeof(int) * CHAR_BIT - sizeof(T) * CHAR_BIT); + } + + template > + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + int clz_wrapper (clzll_tag, T x) noexcept + { + return __clzll((long long int) x) - (sizeof(long long int) * CHAR_BIT - sizeof(T) * CHAR_BIT); + } +} + +template ,std::uint8_t> || + std::is_same_v,std::uint16_t> || + std::is_same_v,std::uint32_t> || + std::is_same_v,std::uint64_t>, int> > +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +int clz (T x) noexcept +{ + AMREX_IF_ON_DEVICE((return detail::clz_wrapper(detail::clz_tag{}, x);)) +#if AMREX_HAS_BUILTIN_CLZ + AMREX_IF_ON_HOST((return detail::builtin_clz_wrapper(detail::clz_tag{}, x);)) +#else + AMREX_IF_ON_HOST((return clz_generic(x);)) #endif } +#else // !defined AMREX_USE_CUDA + +template ,std::uint8_t> || + std::is_same_v,std::uint16_t> || + std::is_same_v,std::uint32_t> || + std::is_same_v,std::uint64_t>, int> > +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +int clz (T x) noexcept +{ +#if (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) + return detail::builtin_clz_wrapper(detail::clz_tag{}, x); +#else + return clz_generic(x); +#endif +} + +#endif // defined AMREX_USE_CUDA + } #endif diff --git a/Src/Base/AMReX_Any.H b/Src/Base/AMReX_Any.H index 2b9edc084b7..87196c9bf4d 100644 --- a/Src/Base/AMReX_Any.H +++ b/Src/Base/AMReX_Any.H @@ -88,7 +88,7 @@ private: struct innards final : innards_base // NOLINT(cppcoreguidelines-special-member-functions) { innards (MF && mf) - : m_mf(std::forward(mf)) + : m_mf(std::move(mf)) {} ~innards () final = default; diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H index e42ebdc9cd2..2a6cbb25a08 100644 --- a/Src/Base/AMReX_Arena.H +++ b/Src/Base/AMReX_Arena.H @@ -22,7 +22,9 @@ inline std::size_t aligned_size (std::size_t align_requirement, std::size_t size inline bool is_aligned (const void* p, std::size_t alignment) noexcept { - return (reinterpret_cast(p) % alignment) == 0; + auto* q = const_cast(p); + auto space = alignment; + return std::align(alignment, alignment, q, space) == p; } class Arena; @@ -82,7 +84,6 @@ struct ArenaInfo * A virtual base class for objects that manage their own dynamic * memory allocation. */ - class Arena { public: @@ -157,6 +158,11 @@ public: */ virtual void registerForProfiling (const std::string& memory_name); +#ifdef AMREX_USE_GPU + //! Is this GPU stream ordered memory allocator? 
+ [[nodiscard]] virtual bool isStreamOrderedArena () const { return false; } +#endif + /** * \brief Given a minimum required arena size of sz bytes, this returns * the next largest arena size that will align to align_size bytes diff --git a/Src/Base/AMReX_Array.H b/Src/Base/AMReX_Array.H index aaf8298ee99..525133cde87 100644 --- a/Src/Base/AMReX_Array.H +++ b/Src/Base/AMReX_Array.H @@ -362,7 +362,7 @@ namespace amrex { * (the index \c i moves the fastest) */ template ::value,int>::type=0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j) const noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); @@ -376,7 +376,7 @@ namespace amrex { * (the index \c i moves the fastest) */ template ::value,int>::type=0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j) noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); @@ -390,7 +390,7 @@ namespace amrex { * is used (the index \c j moves the fastest). */ template ::value,int>::type=0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j) const noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); @@ -404,7 +404,7 @@ namespace amrex { * is used (the index \c j moves the fastest). */ template ::value,int>::type=0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j) noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); @@ -654,7 +654,7 @@ namespace amrex { * (the index \c i moves the fastest) */ template ::value,int>::type=0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j, int k) const noexcept { return arr[i+j*(XHI-XLO+1)+k*((XHI-XLO+1)*(YHI-YLO+1)) @@ -668,7 +668,7 @@ namespace amrex { * (the index \c i moves the fastest) */ template ::value,int>::type=0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j, int k) noexcept { return arr[i+j*(XHI-XLO+1)+k*((XHI-XLO+1)*(YHI-YLO+1)) @@ -682,7 +682,7 @@ namespace amrex { * is used (the index \c k moves the fastest). */ template ::value,int>::type=0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j, int k) const noexcept { return arr[k+j*(ZHI-ZLO+1)+i*((ZHI-ZLO+1)*(YHI-YLO+1)) @@ -696,7 +696,7 @@ namespace amrex { * is used (the index \c k moves the fastest). 
*/ template ::value,int>::type=0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j, int k) noexcept { return arr[k+j*(ZHI-ZLO+1)+i*((ZHI-ZLO+1)*(YHI-YLO+1)) diff --git a/Src/Base/AMReX_Array4.H b/Src/Base/AMReX_Array4.H index b2ff0fcb549..fc27ca77b7a 100644 --- a/Src/Base/AMReX_Array4.H +++ b/Src/Base/AMReX_Array4.H @@ -24,9 +24,9 @@ namespace amrex { {} template ::value,int> = 0> + std::enable_if_t,int> = 0> AMREX_GPU_HOST_DEVICE - constexpr CellData (CellData::type> const& rhs) noexcept + constexpr CellData (CellData> const& rhs) noexcept : p(rhs.p), stride(rhs.stride), ncomp(rhs.ncomp) {} @@ -37,18 +37,19 @@ namespace amrex { int nComp() const noexcept { return ncomp; } template ::value,int> = 0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator[] (int n) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) if (n < 0 || n >= ncomp) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" %d is out of bound (0:%d)", n, ncomp-1); -#else - std::stringstream ss; - ss << " " << n << " is out of bound: (0:" << ncomp-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" %d is out of bound (0:%d)", n, ncomp-1); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " " << n << " is out of bound: (0:" << ncomp-1 << ")"; + amrex::Abort(ss.str()); + )) } #endif return p[n*stride]; @@ -69,9 +70,9 @@ namespace amrex { AMREX_GPU_HOST_DEVICE constexpr Array4 () noexcept : p(nullptr) {} - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE - constexpr Array4 (Array4::type> const& rhs) noexcept + constexpr Array4 (Array4> const& rhs) noexcept : p(rhs.p), jstride(rhs.jstride), kstride(rhs.kstride), @@ -93,9 +94,9 @@ namespace amrex { {} template ::type, - typename std::remove_const::type>::value,int>::type = 0> + std::enable_if_t + , + std::remove_const_t>,int> = 0> AMREX_GPU_HOST_DEVICE constexpr Array4 (Array4 const& rhs, int start_comp) noexcept : p((T*)(rhs.p+start_comp*rhs.nstride)), @@ -108,9 +109,9 @@ namespace amrex { {} template ::type, - typename std::remove_const::type>::value,int>::type = 0> + std::enable_if_t + , + std::remove_const_t>,int> = 0> AMREX_GPU_HOST_DEVICE constexpr Array4 (Array4 const& rhs, int start_comp, int num_comps) noexcept : p((T*)(rhs.p+start_comp*rhs.nstride)), @@ -125,7 +126,7 @@ namespace amrex { AMREX_GPU_HOST_DEVICE explicit operator bool() const noexcept { return p != nullptr; } - template ::value,int>::type = 0> + template ,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (int i, int j, int k) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -134,7 +135,7 @@ namespace amrex { return p[(i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride]; } - template ::value,int>::type = 0> + template ,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (int i, int j, int k, int n) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -143,7 +144,7 @@ namespace amrex { return p[(i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride+n*nstride]; } - template ::value,int>::type = 0> + template ,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T* ptr (int i, int j, int k) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -152,7 +153,7 @@ namespace amrex { return p + ((i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride); } - template ::value,int>::type = 0> + 
template ,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T* ptr (int i, int j, int k, int n) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -161,7 +162,7 @@ namespace amrex { return p + ((i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride+n*nstride); } - template ::value,int>::type = 0> + template ,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (IntVect const& iv) const noexcept { #if (AMREX_SPACEDIM == 1) @@ -173,7 +174,7 @@ namespace amrex { #endif } - template ::value,int>::type = 0> + template ,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (IntVect const& iv, int n) const noexcept { #if (AMREX_SPACEDIM == 1) @@ -185,7 +186,7 @@ namespace amrex { #endif } - template ::value,int>::type = 0> + template ,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T* ptr (IntVect const& iv) const noexcept { #if (AMREX_SPACEDIM == 1) @@ -197,7 +198,7 @@ namespace amrex { #endif } - template ::value,int>::type = 0> + template ,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T* ptr (IntVect const& iv, int n) const noexcept { #if (AMREX_SPACEDIM == 1) @@ -233,21 +234,22 @@ namespace amrex { { if (i=end.x || j=end.y || k=end.z || n < 0 || n >= ncomp) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,0:%d)\n", - i, j, k, n, begin.x, end.x-1, begin.y, end.y-1, - begin.z, end.z-1, ncomp-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << "," << k << "," << n - << ") is out of bound (" - << begin.x << ":" << end.x-1 << "," - << begin.y << ":" << end.y-1 << "," - << begin.z << ":" << end.z-1 << "," - << "0:" << ncomp-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,0:%d)\n", + i, j, k, n, begin.x, end.x-1, begin.y, end.y-1, + begin.z, end.z-1, ncomp-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << "," << k << "," << n + << ") is out of bound (" + << begin.x << ":" << end.x-1 << "," + << begin.y << ":" << end.y-1 << "," + << begin.z << ":" << end.z-1 << "," + << "0:" << ncomp-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -298,7 +300,7 @@ namespace amrex { template struct HasMultiComp : std::false_type {}; // template - struct HasMultiComp= 1>::type> + struct HasMultiComp= 1>> : std::true_type {}; // @@ -319,14 +321,14 @@ namespace amrex { return this->Array4::operator()(i,j,k); } - template ::value,int>::type = 0> + template ::value,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE typename U::reference_type operator() (int i, int j, int k, int n) const noexcept { return this->Array4::operator()(i,j,k,0)[n]; } - template ::value,int>::type = 0> + template ::value,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (int i, int j, int k, int n) const noexcept { return this->Array4::operator()(i,j,k,n); diff --git a/Src/Base/AMReX_ArrayLim.H b/Src/Base/AMReX_ArrayLim.H index 2654f511a9b..7b32363d7fc 100644 --- a/Src/Base/AMReX_ArrayLim.H +++ b/Src/Base/AMReX_ArrayLim.H @@ -8,37 +8,6 @@ #include -/* -** C++ stuff ... 
-*/ - -#ifndef AMREX_XSDK - -#if AMREX_SPACEDIM==1 -#define ARLIM_P(x) const int& -#define ARLIM(x) (x)[0] -#define ARLIM_3D(x) amrex::GpuArray{(x)[0], 0, 0}.data() -#define ARLIM_ANYD(x) amrex::GpuArray{(x)[0], 0, 0}.data() -#define ZFILL(x) amrex::GpuArray{(x)[0], 0., 0.}.data() -#define AMREX_REAL_ANYD(x) AMREX_ZFILL(x) -#elif AMREX_SPACEDIM==2 -#define ARLIM_P(x) const int&,const int& -#define ARLIM(x) (x)[0],(x)[1] -#define ARLIM_3D(x) amrex::GpuArray{(x)[0], (x)[1], 0}.data() -#define ARLIM_ANYD(x) amrex::GpuArray{(x)[0], (x)[1], 0}.data() -#define ZFILL(x) amrex::GpuArray{(x)[0], (x)[1], 0.}.data() -#define AMREX_REAL_ANYD(x) AMREX_ZFILL(x) -#elif AMREX_SPACEDIM==3 -#define ARLIM_P(x) const int&,const int&,const int& -#define ARLIM(x) (x)[0],(x)[1],(x)[2] -#define ARLIM_3D(x) x -#define ARLIM_ANYD(x) x -#define ZFILL(x) x -#define AMREX_REAL_ANYD(x) AMREX_ZFILL(x) -#endif - -#endif /* ndef AMREX_XSDK */ - #if AMREX_SPACEDIM==1 #define AMREX_ARLIM_P(x) const int& #define AMREX_ARLIM(x) (x)[0] @@ -81,133 +50,6 @@ #define BL_TO_FORTRAN_BOX(x) AMREX_ARLIM_3D((x).loVect()), AMREX_ARLIM_3D((x).hiVect()) -#else - -#if !defined(BL_LANG_FORT) - -#ifndef AMREX_XSDK - -/* C stuff */ -#if AMREX_SPACEDIM == 1 -#define ARLIM_P(x) const int* -#elif AMREX_SPACEDIM == 2 -#define ARLIM_P(x) const int*, const int* -#else -#define ARLIM_P(x) const int*, const int*, const int* -#endif - -#endif /* ndef AMREX_XSDK */ - -#if AMREX_SPACEDIM == 1 -#define AMREX_ARLIM_P(x) const int* -#elif AMREX_SPACEDIM == 2 -#define AMREX_ARLIM_P(x) const int*, const int* -#else -#define AMREX_ARLIM_P(x) const int*, const int*, const int* -#endif - -#else - -#ifndef AMREX_XSDK - -/* -** Fortran stuff ... -*/ - -#if __STDC__==1 || defined(__INTEL_COMPILER) - -#if (AMREX_SPACEDIM == 1) -#define DIMS(a) a##_l1, a##_h1 -#define DIMDEC(a) a##_l1, a##_h1 -#define DIMV(a) a##_l1:a##_h1 -#define DIM1(a) a##_l1:a##_h1 -#define ARG_L1(a) a##_l1 -#define ARG_H1(a) a##_h1 -#define DIMARG(a) a##(1) -#endif - -#if (AMREX_SPACEDIM == 2) -#define DIMS(a) a##_l1, a##_l2, a##_h1, a##_h2 -#define DIMDEC(a) a##_l1, a##_l2, a##_h1, a##_h2 -#define DIMV(a) a##_l1:a##_h1, a##_l2:a##_h2 -#define DIM1(a) a##_l1:a##_h1 -#define DIM2(a) a##_l2:a##_h2 -#define ARG_L1(a) a##_l1 -#define ARG_L2(a) a##_l2 -#define ARG_H1(a) a##_h1 -#define ARG_H2(a) a##_h2 -#define DIMARG(a) a##(1),a##(2) -#endif - -#if (AMREX_SPACEDIM == 3) -#define DIMS(a) a##_l1, a##_l2, a##_l3, a##_h1, a##_h2, a##_h3 -#define DIMDEC(a) a##_l1, a##_l2, a##_l3, a##_h1, a##_h2, a##_h3 -#define DIMV(a) a##_l1:a##_h1, a##_l2:a##_h2, a##_l3:a##_h3 -#define DIM1(a) a##_l1:a##_h1 -#define DIM2(a) a##_l2:a##_h2 -#define DIM3(a) a##_l3:a##_h3 -#define DIM12(a) a##_l1:a##_h1, a##_l2:a##_h2 -#define DIM23(a) a##_l2:a##_h2, a##_l3:a##_h3 -#define DIM13(a) a##_l1:a##_h1, a##_l3:a##_h3 -#define ARG_L1(a) a##_l1 -#define ARG_L2(a) a##_l2 -#define ARG_L3(a) a##_l3 -#define ARG_H1(a) a##_h1 -#define ARG_H2(a) a##_h2 -#define ARG_H3(a) a##_h3 -#define DIMARG(a) a##(1),a##(2),a##(3) -#endif - -#else - -#if (AMREX_SPACEDIM == 1) -#define DIMS(a) a/**/_l1, a/**/_h1 -#define DIMDEC(a) a/**/_l1, a/**/_h1 -#define DIMV(a) a/**/_l1:a/**/_h1 -#define DIM1(a) a/**/_l1:a/**/_h1 -#define ARG_L1(a) a/**/_l1 -#define ARG_H1(a) a/**/_h1 -#define DIMARG(a) a/**/(1) -#endif - -#if (AMREX_SPACEDIM == 2) -#define DIMS(a) a/**/_l1, a/**/_l2, a/**/_h1, a/**/_h2 -#define DIMDEC(a) a/**/_l1, a/**/_l2, a/**/_h1, a/**/_h2 -#define DIMV(a) a/**/_l1:a/**/_h1, a/**/_l2:a/**/_h2 -#define DIM1(a) a/**/_l1:a/**/_h1 -#define 
DIM2(a) a/**/_l2:a/**/_h2 -#define ARG_L1(a) a/**/_l1 -#define ARG_L2(a) a/**/_l2 -#define ARG_H1(a) a/**/_h1 -#define ARG_H2(a) a/**/_h2 -#define DIMARG(a) a/**/(1),a/**/(2) -#endif - -#if (AMREX_SPACEDIM == 3) -#define DIMS(a) a/**/_l1, a/**/_l2, a/**/_l3, a/**/_h1, a/**/_h2, a/**/_h3 -#define DIMDEC(a) a/**/_l1, a/**/_l2, a/**/_l3, a/**/_h1, a/**/_h2, a/**/_h3 -#define DIMV(a) a/**/_l1:a/**/_h1, a/**/_l2:a/**/_h2, a/**/_l3:a/**/_h3 -#define DIM1(a) a/**/_l1:a/**/_h1 -#define DIM2(a) a/**/_l2:a/**/_h2 -#define DIM3(a) a/**/_l3:a/**/_h3 -#define DIM12(a) a/**/_l1:a/**/_h1, a/**/_l2:a/**/_h2 -#define DIM23(a) a/**/_l2:a/**/_h2, a/**/_l3:a/**/_h3 -#define DIM13(a) a/**/_l1:a/**/_h1, a/**/_l3:a/**/_h3 -#define ARG_L1(a) a/**/_l1 -#define ARG_L2(a) a/**/_l2 -#define ARG_L3(a) a/**/_l3 -#define ARG_H1(a) a/**/_h1 -#define ARG_H2(a) a/**/_h2 -#define ARG_H3(a) a/**/_h3 -#define DIMARG(a) a/**/(1),a/**/(2),a/**/(3) -#endif - -#endif /*__STDC__==1*/ - -#endif /* ndef AMREX_XSDK */ - -#endif /*else of !defined(BL_LANG_FORT)*/ - -#endif /*else of __cplusplus*/ +#endif /* __cplusplus */ -#endif /*BL_ARRAYLIM_H*/ +#endif /* AMREX_ARRAYLIM_H_ */ diff --git a/Src/Base/AMReX_BArena.H b/Src/Base/AMReX_BArena.H index 9a3b4aa0f1b..d587d100859 100644 --- a/Src/Base/AMReX_BArena.H +++ b/Src/Base/AMReX_BArena.H @@ -11,7 +11,6 @@ namespace amrex { * This is the simplest dynamic memory management class derived from Arena. * Makes calls to std::malloc and std::free. */ - class BArena : public Arena diff --git a/Src/Base/AMReX_BCRec.H b/Src/Base/AMReX_BCRec.H index c39634cfb09..d23da777eda 100644 --- a/Src/Base/AMReX_BCRec.H +++ b/Src/Base/AMReX_BCRec.H @@ -10,10 +10,9 @@ namespace amrex { /** * \brief Boundary Condition Records. * Necessary information and functions for computing boundary conditions. +* +* This class has standard layout. And we should keep it so! */ - -// This class has standard layout. And we should keep it so! - class BCRec { public: @@ -43,7 +42,7 @@ public: {} /* * \brief Yet another constructor. Inherits bndry types from bc_domain - * when bx lies on edge of domain otherwise gets interior Dirchlet. + * when bx lies on edge of domain otherwise gets interior Dirichlet. */ AMREX_GPU_HOST_DEVICE BCRec (const Box& bx, diff --git a/Src/Base/AMReX_BCUtil.H b/Src/Base/AMReX_BCUtil.H index dd4e814610c..3ebc53a0576 100644 --- a/Src/Base/AMReX_BCUtil.H +++ b/Src/Base/AMReX_BCUtil.H @@ -9,11 +9,12 @@ namespace amrex { // This is for filling cell-centered data outside physical domain - // (excluding periodic boundaries). It only fills - // BCType::foextrap, BCType::hoextrap, BCType::reflect_even, and - // BCType::reflect::odd. It does not fill BCType::ext_dir (i.e., - // external Dirichlet). If you have BCType::ext_dir, you can - // copy, paste and rename this function, and then modify it. + // (excluding periodic boundaries). It only fills BCType::foextrap, + // BCType::hoextrap, BCType::hoextrapcc, BCType::reflect_even, and + // BCType::reflect::odd. It does not fill BCType::ext_dir and + // BCType::ext_dir_cc (i.e., external Dirichlet). If you have + // BCType::ext_dir or BCType::ext_dir_cc, you can copy, paste and rename + // this function, and then modify it. 
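A short usage sketch of the function declared below, under the constraint the comment above describes (made-up BC choices; every face here uses foextrap, which FillDomainBoundary handles, whereas ext_dir/ext_dir_cc would require a custom fill):

    #include <AMReX_BCUtil.H>

    void fill_outside_domain (amrex::MultiFab& phi, amrex::Geometry const& geom)
    {
        using namespace amrex;
        Vector<BCRec> bc(phi.nComp());
        for (int n = 0; n < phi.nComp(); ++n) {
            for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
                bc[n].setLo(idim, BCType::foextrap); // first-order extrapolation
                bc[n].setHi(idim, BCType::foextrap);
            }
        }
        FillDomainBoundary(phi, geom, bc); // fills ghost cells outside the domain
    }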
void FillDomainBoundary (MultiFab& phi, const Geometry& geom, const Vector& bc); } diff --git a/Src/Base/AMReX_BCUtil.cpp b/Src/Base/AMReX_BCUtil.cpp index c0a645ba05c..dc5b4f4f10c 100644 --- a/Src/Base/AMReX_BCUtil.cpp +++ b/Src/Base/AMReX_BCUtil.cpp @@ -15,7 +15,7 @@ void dummy_cpu_fill_extdir (Box const& /*bx*/, Array4 const& /*dest*/, const BCRec* /*bcr*/, const int /*bcomp*/, const int /*orig_comp*/) { - // do something for external Dirichlet (BCType::ext_dir) if there are + // do something for external Dirichlet (BCType::ext_dir or BCType::ext_dir_cc) if there are } struct dummy_gpu_fill_extdir @@ -27,7 +27,7 @@ struct dummy_gpu_fill_extdir const BCRec* /*bcr*/, const int /*bcomp*/, const int /*orig_comp*/) const { - // do something for external Dirichlet (BCType::ext_dir) if there are + // do something for external Dirichlet (BCType::ext_dir or BCType::ext_dir_cc) if there are } }; diff --git a/Src/Base/AMReX_BC_TYPES.H b/Src/Base/AMReX_BC_TYPES.H index 872ac1eff5d..f35175be450 100644 --- a/Src/Base/AMReX_BC_TYPES.H +++ b/Src/Base/AMReX_BC_TYPES.H @@ -44,10 +44,8 @@ SYMMETRY | Un REFLECT_ODD | REFLECT_EVEN | REFLECT_EVEN | INT_DIR : data taken from other grids or interpolated -EXT_DIR : data specified on EDGE (FACE) of bndry for the linear solvers - for application codes, the location of the EXT_DIR data will - depend on how they do reconstruction, and may be edge or - cell-centered. +EXT_DIR : data specified on EDGE (FACE) +EXT_DIR_CC : data specified at cell center HOEXTRAP : higher order extrapolation to EDGE of bndry HOEXTRAPCC : linear extrapolation to CELL of bndry FOEXTRAP : first order extrapolation from last cell in interior @@ -74,6 +72,7 @@ enum mathematicalBndryTypes : int { ext_dir = 3, hoextrap = 4, hoextrapcc = 5, + ext_dir_cc = 6, user_1 = 1001, user_2 = 1002, user_3 = 1003 @@ -83,25 +82,4 @@ enum mathematicalBndryTypes : int { } #endif -#ifndef AMREX_XSDK - -#define BOGUS_BC (-666) - -#define REFLECT_ODD (-1) -#define INT_DIR 0 -#define REFLECT_EVEN 1 -#define FOEXTRAP 2 -#define EXT_DIR 3 -#define HOEXTRAP 4 -#define HOEXTRAPCC 5 - -#define Interior 0 -#define Inflow 1 -#define Outflow 2 -#define Symmetry 3 -#define SlipWall 4 -#define NoSlipWall 5 - -#endif - #endif diff --git a/Src/Base/AMReX_BLBackTrace.cpp b/Src/Base/AMReX_BLBackTrace.cpp index 4a4d527dbb1..e5699996933 100644 --- a/Src/Base/AMReX_BLBackTrace.cpp +++ b/Src/Base/AMReX_BLBackTrace.cpp @@ -70,6 +70,7 @@ BLBackTrace::handler(int s) case SIGABRT: amrex::ErrorStream() << "SIGABRT\n"; break; + default: break; } #if defined(AMREX_BACKTRACE_SUPPORTED) || defined(AMREX_TINY_PROFILING) @@ -99,19 +100,19 @@ BLBackTrace::handler(int s) fclose(p); } - amrex::ErrorStream() << "See " << errfilename << " file for details" << std::endl; + amrex::ErrorStream() << "See " << errfilename << " file for details" << '\n'; if (!bt_stack.empty()) { std::ofstream errfile; errfile.open(errfilename.c_str(), std::ofstream::out | std::ofstream::app); if (errfile.is_open()) { - errfile << std::endl; + errfile << '\n'; while (!bt_stack.empty()) { errfile << "== BACKTRACE == " << bt_stack.top().first <<", " << bt_stack.top().second << "\n"; bt_stack.pop(); } - errfile << std::endl; + errfile << '\n'; } } @@ -148,7 +149,7 @@ BLBackTrace::print_backtrace_info (const std::string& filename) { amrex::Print() << "Warning @ BLBackTrace::print_backtrace_info: " << filename << " is not a valid output file." 
- << std::endl; + << '\n'; } } @@ -371,7 +372,7 @@ BLBTer::BLBTer(const std::string& s, const char* file, int line) std::ostringstream ss0; ss0 << "Proc. " << ParallelDescriptor::MyProc() << ": \"" << s << "\""; - BLBackTrace::bt_stack.push(std::make_pair(ss0.str(), line_file)); + BLBackTrace::bt_stack.emplace(ss0.str(), line_file); #endif } diff --git a/Src/Base/AMReX_BLProfiler.H b/Src/Base/AMReX_BLProfiler.H index 3b72a627881..b7f8e6a1c27 100644 --- a/Src/Base/AMReX_BLProfiler.H +++ b/Src/Base/AMReX_BLProfiler.H @@ -514,7 +514,7 @@ inline std::string BLProfiler::CommStats::CFTToString(CommFuncType cft) { #define BL_PROFILE_REGION_VAR(fname, rvname) #define BL_PROFILE_REGION_VAR_START(fname, rvname) #define BL_PROFILE_REGION_VAR_STOP(fname, rvname) -#define BL_PROFILE_TINY_FLUSH() amrex::TinyProfiler::Finalize(true); TinyProfiler::MemoryFinalize(true) +#define BL_PROFILE_TINY_FLUSH() amrex::TinyProfiler::Finalize(true); amrex::TinyProfiler::MemoryFinalize(true) #define BL_PROFILE_FLUSH() #define BL_TRACE_PROFILE_FLUSH() #define BL_TRACE_PROFILE_SETFLUSHSIZE(fsize) diff --git a/Src/Base/AMReX_BaseFab.H b/Src/Base/AMReX_BaseFab.H index 99132038397..c1212fb7a0a 100644 --- a/Src/Base/AMReX_BaseFab.H +++ b/Src/Base/AMReX_BaseFab.H @@ -19,11 +19,10 @@ #include #include #include +#include #include - -#ifdef AMREX_USE_OMP -#include -#endif +#include +#include #include #include @@ -90,62 +89,14 @@ makeArray4 (T* p, Box const& bx, int ncomp) noexcept return Array4{p, amrex::begin(bx), amrex::end(bx), ncomp}; } -/** -* \brief A Fortran Array-like Object -* BaseFab emulates the Fortran array concept. -* Useful operations can be performed upon -* BaseFabs in C++, and they provide a convenient interface to -* Fortran when it is necessary to retreat into that language. - -* BaseFab is a template class. Through use of the -* template, a BaseFab may be based upon any class. So far at least, -* most applications have been based upon simple types like integers, -* real*4s, or real*8s. Most applications do not use BaseFabs -* directly, but utilize specialized classes derived from BaseFab. - -* Classes derived from BaseFab include FArrayBox, IArrayBox, TagBox, -* Mask, EBFArrayBox, EBCellFlag and CutFab. - -* BaseFab objects depend on the dimensionality of space -* (indirectly through the DOMAIN Box member). It is -* typical to define the macro SPACEDIM to be 1, 2, or 3 to indicate -* the dimension of space. See the discussion of class Box for more -* information. A BaseFab contains a Box DOMAIN, which indicates the -* integer indexing space over which the array is defined. A BaseFab -* also has NVAR components. By components, we mean that for each -* point in the rectangular indexing space, there are NVAR values -* associated with that point. A Fortran array corresponding to a -* BaseFab would have (SPACEDIM+1) dimensions. - -* By design, the array layout in a BaseFab mirrors that of a -* Fortran array. The first index (x direction for example) varies -* most rapidly, the next index (y direction), if any, varies next -* fastest. The component index varies last, after all the spatial -* indices. - -* It is sometimes convenient to be able to treat a sub-array within an -* existing BaseFab as a BaseFab in its own right. This is often -* referred to as aliasing the BaseFab. Note that when aliasing is -* used, the BaseFabs domain will not, in general, be the same as the -* parent BaseFabs domain, nor will the number of components. 
-* BaseFab is a dimension dependent class, so SPACEDIM must be -* defined as either 1, 2, or 3 when compiling. - -* This is NOT a polymorphic class. - -* It does NOT provide a copy constructor or assignment operator. - -* T MUST have a default constructor and an assignment operator. -*/ - template -typename std::enable_if::value>::type +std::enable_if_t> placementNew (T* const /*ptr*/, Long /*n*/) {} template -std::enable_if_t::value - && !std::is_arithmetic::value> +std::enable_if_t + && !std::is_arithmetic_v> placementNew (T* const ptr, Long n) { for (Long i = 0; i < n; ++i) { @@ -154,7 +105,7 @@ placementNew (T* const ptr, Long n) } template -std::enable_if_t::value> +std::enable_if_t> placementNew (T* const ptr, Long n) { AMREX_HOST_DEVICE_FOR_1D ( n, i, @@ -164,12 +115,12 @@ placementNew (T* const ptr, Long n) } template -typename std::enable_if::value>::type +std::enable_if_t> placementDelete (T* const /*ptr*/, Long /*n*/) {} template -typename std::enable_if::value>::type +std::enable_if_t> placementDelete (T* const ptr, Long n) { AMREX_HOST_DEVICE_FOR_1D (n, i, @@ -178,6 +129,54 @@ placementDelete (T* const ptr, Long n) }); } +/** + * \brief A FortranArrayBox(FAB)-like object + * + * BaseFab emulates the Fortran array concept. + * Useful operations can be performed upon + * BaseFabs in C++, and they provide a convenient interface to + * Fortran when it is necessary to retreat into that language. + * + * BaseFab is a template class. Through use of the + * template, a BaseFab may be based upon any class. So far at least, + * most applications have been based upon simple types like integers, + * real*4s, or real*8s. Most applications do not use BaseFabs + * directly, but utilize specialized classes derived from BaseFab. + * + * Classes derived from BaseFab include FArrayBox, IArrayBox, TagBox, + * Mask, EBFArrayBox, EBCellFlag and CutFab. + * + * BaseFab objects depend on the dimensionality of space + * (indirectly through the DOMAIN Box member). It is + * typical to define the macro SPACEDIM to be 1, 2, or 3 to indicate + * the dimension of space. See the discussion of class Box for more + * information. A BaseFab contains a Box DOMAIN, which indicates the + * integer indexing space over which the array is defined. A BaseFab + * also has NVAR components. By components, we mean that for each + * point in the rectangular indexing space, there are NVAR values + * associated with that point. A Fortran array corresponding to a + * BaseFab would have (SPACEDIM+1) dimensions. + * + * By design, the array layout in a BaseFab mirrors that of a + * Fortran array. The first index (x direction for example) varies + * most rapidly, the next index (y direction), if any, varies next + * fastest. The component index varies last, after all the spatial + * indices. + * + * It is sometimes convenient to be able to treat a sub-array within an + * existing BaseFab as a BaseFab in its own right. This is often + * referred to as aliasing the BaseFab. Note that when aliasing is + * used, the BaseFabs domain will not, in general, be the same as the + * parent BaseFabs domain, nor will the number of components. + * BaseFab is a dimension dependent class, so SPACEDIM must be + * defined as either 1, 2, or 3 when compiling. + * + * This is NOT a polymorphic class. + * + * It does NOT provide a copy constructor or assignment operator. + * + * \tparam T MUST have a default constructor and an assignment operator. 
+ */ template class BaseFab : public DataAllocator @@ -251,7 +250,7 @@ public: */ void resize (const Box& b, int N = 1, Arena* ar = nullptr); - template ::value,int>::type = 0> + template ,int> = 0> [[nodiscard]] Elixir elixir () noexcept; /** @@ -543,7 +542,7 @@ public: int numcomp = 1) noexcept; /** * \brief As above, except that the destination Box is specified, - * but the source Box is taken to the equal to the source + * but the source Box is taken to the equal to the destination * Box, and all components of the destination BaseFab are * copied. */ @@ -979,6 +978,19 @@ public: BaseFab& atomicAdd (const BaseFab& src, const Box& srcbox, const Box& destbox, int srccomp, int destcomp, int numcomp=1) noexcept; + /** + * \brief Atomically add srcbox region of src FAB to destbox region of this FAB. + * The srcbox and destbox must be same size. When OMP is on, this uses OMP locks + * in the implementation and it's usually faster than atomicAdd. + */ +#if defined(AMREX_USE_GPU) + template +#else + template +#endif + BaseFab& lockAdd (const BaseFab& src, const Box& srcbox, const Box& destbox, + int srccomp, int destcomp, int numcomp) noexcept; + //! FAB SAXPY (y[i] <- y[i] + a * x[i]), in place. #if defined(AMREX_USE_GPU) template @@ -1631,6 +1643,9 @@ protected: Long truesize = 0L; //!< nvar*numpts that was allocated on heap. bool ptr_owner = false; //!< Owner of T*? bool shared_memory = false; //!< Is the memory allocated in shared memory? +#ifdef AMREX_USE_GPU + gpuStream_t alloc_stream{}; +#endif }; template @@ -1902,6 +1917,9 @@ BaseFab::define () this->truesize = this->nvar*this->domain.numPts(); this->ptr_owner = true; this->dptr = static_cast(this->alloc(this->truesize*sizeof(T))); +#ifdef AMREX_USE_GPU + this->alloc_stream = Gpu::gpuStream(); +#endif placementNew(this->dptr, this->truesize); @@ -2003,6 +2021,9 @@ BaseFab::BaseFab (BaseFab&& rhs) noexcept dptr(rhs.dptr), domain(rhs.domain), nvar(rhs.nvar), truesize(rhs.truesize), ptr_owner(rhs.ptr_owner), shared_memory(rhs.shared_memory) +#ifdef AMREX_USE_GPU + , alloc_stream(rhs.alloc_stream) +#endif { rhs.dptr = nullptr; rhs.ptr_owner = false; @@ -2021,6 +2042,9 @@ BaseFab::operator= (BaseFab&& rhs) noexcept truesize = rhs.truesize; ptr_owner = rhs.ptr_owner; shared_memory = rhs.shared_memory; +#ifdef AMREX_USE_GPU + alloc_stream = rhs.alloc_stream; +#endif rhs.dptr = nullptr; rhs.ptr_owner = false; @@ -2062,7 +2086,11 @@ BaseFab::resize (const Box& b, int n, Arena* ar) this->dptr = nullptr; define(); } - else if (this->nvar*this->domain.numPts() > this->truesize) + else if (this->nvar*this->domain.numPts() > this->truesize +#ifdef AMREX_USE_GPU + || (arena()->isStreamOrderedArena() && alloc_stream != Gpu::gpuStream()) +#endif + ) { if (this->shared_memory) { amrex::Abort("BaseFab::resize: BaseFab in shared memory cannot increase size"); @@ -2075,7 +2103,7 @@ BaseFab::resize (const Box& b, int n, Arena* ar) } template -template ::value,int>::type> +template ,int>> Elixir BaseFab::elixir () noexcept { @@ -2114,7 +2142,14 @@ BaseFab::clear () noexcept placementDelete(this->dptr, this->truesize); +#ifdef AMREX_USE_GPU + auto current_stream = Gpu::Device::gpuStream(); + Gpu::Device::setStream(alloc_stream); +#endif this->free(this->dptr); +#ifdef AMREX_USE_GPU + Gpu::Device::setStream(current_stream); +#endif if (this->nvar > 1) { amrex::update_fab_stats(-this->truesize/this->nvar, -this->truesize, sizeof(T)); @@ -3277,6 +3312,110 @@ BaseFab::atomicAdd (const BaseFab& src, const Box& srcbox, const Box& dest return *this; } +template 
+template +BaseFab& +BaseFab::lockAdd (const BaseFab& src, const Box& srcbox, const Box& destbox, + int srccomp, int destcomp, int numcomp) noexcept +{ +#if defined(AMREX_USE_OMP) && (AMREX_SPACEDIM > 1) +#if defined(AMREX_USE_GPU) + if (run_on == RunOn::Host || Gpu::notInLaunchRegion()) { +#endif + BL_ASSERT(destbox.ok()); + BL_ASSERT(src.box().contains(srcbox)); + BL_ASSERT(box().contains(destbox)); + BL_ASSERT(destbox.sameSize(srcbox)); + BL_ASSERT(srccomp >= 0 && srccomp+numcomp <= src.nComp()); + BL_ASSERT(destcomp >= 0 && destcomp+numcomp <= nComp()); + + Array4 const& d = this->array(); + Array4 const& s = src.const_array(); + auto const& dlo = amrex::lbound(destbox); + auto const& dhi = amrex::ubound(destbox); + auto const& len = amrex::length(destbox); + auto const& slo = amrex::lbound(srcbox); + Dim3 const offset{slo.x-dlo.x, slo.y-dlo.y, slo.z-dlo.z}; + + int planedim; + int nplanes; + int plo; + if (len.z == 1) { + planedim = 1; + nplanes = len.y; + plo = dlo.y; + } else { + planedim = 2; + nplanes = len.z; + plo = dlo.z; + } + + auto* mask = (bool*) amrex_mempool_alloc(sizeof(bool)*nplanes); + for (int ip = 0; ip < nplanes; ++ip) { + mask[ip] = false; + } + + int mm = 0; + int planes_left = nplanes; + while (planes_left > 0) { + AMREX_ASSERT(mm < nplanes); + auto const m = mm + plo; + auto* lock = OpenMP::get_lock(m); + if (omp_test_lock(lock)) + { + auto lo = dlo; + auto hi = dhi; + if (planedim == 1) { + lo.y = m; + hi.y = m; + } else { + lo.z = m; + hi.z = m; + } + + for (int n = 0; n < numcomp; ++n) { + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + auto * pdst = d.ptr(dlo.x,j ,k ,n+destcomp); + auto const* psrc = s.ptr(slo.x,j+offset.y,k+offset.z,n+ srccomp); +#pragma omp simd + for (int ii = 0; ii < len.x; ++ii) { + pdst[ii] += psrc[ii]; + } + } + } + } + + mask[mm] = true; + --planes_left; + omp_unset_lock(lock); + if (planes_left == 0) { break; } + } + + ++mm; + for (int ip = 0; ip < nplanes; ++ip) { + int new_mm = (mm+ip) % nplanes; + if ( ! mask[new_mm] ) { + mm = new_mm; + break; + } + } + } + + amrex_mempool_free(mask); + + return *this; + +#if defined(AMREX_USE_GPU) + } else { + return this->template atomicAdd(src, srcbox, destbox, srccomp, destcomp, numcomp); + } +#endif +#else + return this->template atomicAdd(src, srcbox, destbox, srccomp, destcomp, numcomp); +#endif +} + template template BaseFab& @@ -3505,7 +3644,6 @@ BaseFab::protected_divide (const BaseFab& src, const Box& srcbox, const Bo * and stored in component comp of this FAB. * This fab is returned as a reference for chaining. 
*/ - template template BaseFab& diff --git a/Src/Base/AMReX_BaseFabUtility.H b/Src/Base/AMReX_BaseFabUtility.H index 3dafadcd74b..6fef4918c61 100644 --- a/Src/Base/AMReX_BaseFabUtility.H +++ b/Src/Base/AMReX_BaseFabUtility.H @@ -22,11 +22,11 @@ cast (BaseFab& tofab, BaseFab const& fromfab, } template ::value, - int>::type FOO = 0> -void fill (BaseFab& aos_fab, F && f) + std::is_trivially_destructible_v, + int>FOO = 0> +void fill (BaseFab& aos_fab, F const& f) { Box const& box = aos_fab.box(); auto const& aos = aos_fab.array(); @@ -36,37 +36,31 @@ void fill (BaseFab& aos_fab, F && f) "amrex::fill: sizeof(STRUCT) != sizeof(T)*STRUCTSIZE"); #ifdef AMREX_USE_GPU if (Gpu::inLaunchRegion()) { - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - int ntotcells = box.numPts(); + BoxIndexer indexer(box); + const auto ntotcells = std::uint64_t(box.numPts()); int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128; - int nblocks = (ntotcells+nthreads_per_block-1)/nthreads_per_block; + std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block; + AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits::max())); + auto nblocks = int(nblocks_long); std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT); T* p = (T*)aos_fab.dataPtr(); #ifdef AMREX_USE_SYCL amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept { - int icell = handler.globalIdx(); - unsigned int blockDimx = handler.blockDim(); - unsigned int threadIdxx = handler.threadIdx(); - unsigned int blockIdxx = handler.blockIdx(); + auto const icell = std::uint64_t(handler.globalIdx()); + std::uint64_t const blockDimx = handler.blockDim(); + std::uint64_t const threadIdxx = handler.threadIdx(); + std::uint64_t const blockIdxx = handler.blockIdx(); auto const shared = (T*)handler.sharedMemory(); - if (icell < ntotcells) { + if (icell < indexer.numPts()) { auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT; - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + auto [i, j, k] = indexer(icell); f(*ga, i, j, k); } handler.sharedBarrier(); - for (unsigned int m = threadIdxx, - mend = amrex::min(blockDimx, ntotcells-blockDimx*blockIdxx) * STRUCTSIZE; + for (std::uint64_t m = threadIdxx, + mend = amrex::min(blockDimx, indexer.numPts()-blockDimx*blockIdxx) * STRUCTSIZE; m < mend; m += blockDimx) { p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m]; } @@ -75,31 +69,26 @@ void fill (BaseFab& aos_fab, F && f) amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - int icell = blockDim.x*blockIdx.x+threadIdx.x; + std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x; Gpu::SharedMemory gsm; T* const shared = gsm.dataPtr(); - if (icell < ntotcells) { - auto ga = new(shared+threadIdx.x*STRUCTSIZE) STRUCT; - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + if (icell < indexer.numPts()) { + auto ga = new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT; + auto [i, j, k] = indexer(icell); f(*ga, i, j, k); } __syncthreads(); - for (unsigned int m = threadIdx.x, - mend = amrex::min(blockDim.x, ntotcells-blockDim.x*blockIdx.x) * STRUCTSIZE; + for (std::uint64_t m = threadIdx.x, + mend = amrex::min(blockDim.x, 
indexer.numPts()-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE; m < mend; m += blockDim.x) { - p[blockDim.x*blockIdx.x*STRUCTSIZE+m] = shared[m]; + p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m]; } }); #endif } else #endif { - amrex::LoopOnCpu(box, [=] (int i, int j, int k) noexcept + amrex::LoopOnCpu(box, [&] (int i, int j, int k) noexcept { f(aos(i,j,k), i, j, k); }); diff --git a/Src/Base/AMReX_Box.H b/Src/Base/AMReX_Box.H index 0a32d637d4f..82a2ad9cd13 100644 --- a/Src/Base/AMReX_Box.H +++ b/Src/Base/AMReX_Box.H @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -338,9 +339,10 @@ public: */ [[nodiscard]] AMREX_GPU_HOST_DEVICE Long numPts () const noexcept { - return AMREX_D_TERM( static_cast(length(0)), - *static_cast(length(1)), - *static_cast(length(2))); + return ok() ? AMREX_D_TERM( static_cast(length(0)), + *static_cast(length(1)), + *static_cast(length(2))) + : Long(0); } /** @@ -349,8 +351,10 @@ public: */ [[nodiscard]] AMREX_GPU_HOST_DEVICE double d_numPts () const noexcept { - BL_ASSERT(ok()); - return AMREX_D_TERM(double(length(0)), *double(length(1)), *double(length(2))); + return ok() ? AMREX_D_TERM( double(length(0)), + *double(length(1)), + *double(length(2))) + : 0.0; } /** @@ -360,9 +364,10 @@ public: */ [[nodiscard]] AMREX_GPU_HOST_DEVICE Long volume () const noexcept { - return AMREX_D_TERM( static_cast(length(0)-btype[0]), - *static_cast(length(1)-btype[1]), - *static_cast(length(2)-btype[2])); + return ok() ? AMREX_D_TERM( static_cast(length(0)-btype[0]), + *static_cast(length(1)-btype[1]), + *static_cast(length(2)-btype[2])) + : Long(0); } /** @@ -1835,6 +1840,94 @@ Box makeSingleCellBox (int i, int j, int k, IndexType typ = IndexType::TheCellTy return Box(IntVect(AMREX_D_DECL(i,j,k)),IntVect(AMREX_D_DECL(i,j,k)),typ); } +struct BoxIndexer +{ + std::uint64_t npts; + +#if (AMREX_SPACEDIM == 3) + Math::FastDivmodU64 fdxy; + Math::FastDivmodU64 fdx; + IntVect lo; + + BoxIndexer (Box const& box) + : npts(box.numPts()), + fdxy(std::uint64_t(box.length(0))*std::uint64_t(box.length(1))), + fdx (std::uint64_t(box.length(0))), + lo (box.smallEnd()) + {} + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + Dim3 operator() (std::uint64_t icell) const + { + std::uint64_t x, y, z, rem; + fdxy(z, rem, icell); + fdx(y, x, rem); + return {int(x)+lo[0], int(y)+lo[1], int(z)+lo[2]}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + IntVect intVect (std::uint64_t icell) const + { + std::uint64_t x, y, z, rem; + fdxy(z, rem, icell); + fdx(y, x, rem); + return {int(x)+lo[0], int(y)+lo[1], int(z)+lo[2]}; + } + +#elif (AMREX_SPACEDIM == 2) + + Math::FastDivmodU64 fdx; + IntVect lo; + + BoxIndexer (Box const& box) + : npts(box.numPts()), + fdx (std::uint64_t(box.length(0))), + lo (box.smallEnd()) + {} + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + Dim3 operator() (std::uint64_t icell) const + { + std::uint64_t x, y; + fdx(y, x, icell); + return {int(x)+lo[0], int(y)+lo[1], 0}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + IntVect intVect (std::uint64_t icell) const + { + std::uint64_t x, y; + fdx(y, x, icell); + return {int(x)+lo[0], int(y)+lo[1]}; + } + +#elif (AMREX_SPACEDIM == 1) + + int lo; + + BoxIndexer (Box const& box) + : npts(box.numPts()), + lo(box.smallEnd(0)) + {} + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + Dim3 operator() (std::uint64_t icell) const + { + return {int(icell)+lo, 0, 0}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + IntVect intVect 
(std::uint64_t icell) const + { + return IntVect{int(icell)+lo}; + } + +#endif + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + std::uint64_t numPts () const { return npts; } +}; + } #endif /*AMREX_BOX_H*/ diff --git a/Src/Base/AMReX_BoxArray.H b/Src/Base/AMReX_BoxArray.H index 807cd9d8516..94358f580a4 100644 --- a/Src/Base/AMReX_BoxArray.H +++ b/Src/Base/AMReX_BoxArray.H @@ -515,16 +515,16 @@ struct BATransformer // for backward compatibility using BndryBATransformer = BATransformer; -/** -* \brief A collection of Boxes stored in an Array. It is a -* reference-counted concrete class, not a polymorphic one; i.e. you -* cannot use any of the List member functions with a BoxList. -*/ - class MFIter; class AmrMesh; class FabArrayBase; +/** + * \brief A collection of Boxes stored in an Array. + * + * It is a reference-counted concrete class, not a polymorphic one; i.e. you + * cannot use any of the List member functions with a BoxList. + */ class BoxArray { public: diff --git a/Src/Base/AMReX_BoxArray.cpp b/Src/Base/AMReX_BoxArray.cpp index 9413f5ae5c1..e5b41d6b9d3 100644 --- a/Src/Base/AMReX_BoxArray.cpp +++ b/Src/Base/AMReX_BoxArray.cpp @@ -61,7 +61,7 @@ BARef::BARef (const BoxList& bl) } BARef::BARef (BoxList&& bl) noexcept - : m_abox(std::move(bl.data())) + : m_abox(std::move(std::move(bl).data())) { #ifdef AMREX_MEM_PROFILING updateMemoryUsage_box(1); @@ -170,7 +170,7 @@ BARef::define (BoxList&& bl) noexcept #ifdef AMREX_MEM_PROFILING updateMemoryUsage_box(-1); #endif - m_abox = std::move(bl.data()); + m_abox = std::move(std::move(bl).data()); #ifdef AMREX_MEM_PROFILING updateMemoryUsage_box(1); #endif diff --git a/Src/Base/AMReX_BoxDomain.H b/Src/Base/AMReX_BoxDomain.H index a82e5ddc72e..af92d631607 100644 --- a/Src/Base/AMReX_BoxDomain.H +++ b/Src/Base/AMReX_BoxDomain.H @@ -55,14 +55,12 @@ std::ostream& operator<< (std::ostream& os, const BoxDomain& bd); /** * \brief A List of Disjoint Boxes. +* * A BoxDomain is a BoxList with the restriction that Boxes in the list * are disjoint. +* Note that a BoxDomain is NOT a BoxList due to the protected inheritance. +* This is a concrete class, not a polymorphic one. */ - -//Note that a BoxDomain is NOT a BoxList due to the protected inheritance. -//This is a concrete class, not a polymorphic one. - - class BoxDomain : protected BoxList diff --git a/Src/Base/AMReX_BoxList.H b/Src/Base/AMReX_BoxList.H index c0ff30025ff..ba06e746ce8 100644 --- a/Src/Base/AMReX_BoxList.H +++ b/Src/Base/AMReX_BoxList.H @@ -48,7 +48,6 @@ namespace amrex * IndexType. This class implements operations for sets of Boxes. * This is a concrete class, not a polymorphic one. */ - class BoxList { public: @@ -213,7 +212,7 @@ public: //! Returns a constant reference to the Vector. [[nodiscard]] const Vector& data () const noexcept { return m_lbox; } - void swap (BoxList& rhs) { + void swap (BoxList& rhs) noexcept { std::swap(m_lbox, rhs.m_lbox); std::swap(btype, rhs.btype); } diff --git a/Src/Base/AMReX_CArena.H b/Src/Base/AMReX_CArena.H index d68285bc878..9547bc92f21 100644 --- a/Src/Base/AMReX_CArena.H +++ b/Src/Base/AMReX_CArena.H @@ -5,13 +5,14 @@ #include #include -#include -#include +#include +#include #include #include -#include -#include +#include #include +#include +#include namespace amrex { @@ -23,7 +24,6 @@ struct MemStat; * chunks of heap space and apportions it out as requested. It merges * together neighboring chunks on each free(). 
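The `fill` kernels above and the new `BoxIndexer` move flat cell indexing to 64-bit arithmetic, so boxes with more than 2^31 points no longer overflow `int`. A minimal sketch of the decoding `BoxIndexer::operator()` performs in 3D, written here with plain divisions as a hypothetical stand-alone helper (the real struct precomputes `Math::FastDivmodU64` magic numbers so device code avoids two hardware divides per cell):

```cpp
#include <cstdint>

// Map a flat 64-bit cell index to (i,j,k) for a box with lengths
// (nx,ny,nz) and lower corner (lox,loy,loz).
struct Cell { int i, j, k; };

Cell decode (std::uint64_t icell, int nx, int ny, int lox, int loy, int loz)
{
    const auto lenxy = std::uint64_t(nx) * std::uint64_t(ny);
    const std::uint64_t z   = icell / lenxy;
    const std::uint64_t rem = icell - z*lenxy;
    const std::uint64_t y   = rem / std::uint64_t(nx);
    const std::uint64_t x   = rem - y*std::uint64_t(nx);
    return {int(x) + lox, int(y) + loy, int(z) + loz};
}
```

The `ok()` guards added to `numPts()`, `d_numPts()`, and `volume()` make the empty-box case well defined (zero points) instead of a product of negative lengths, which lets a `BoxIndexer` built from an empty box report `numPts() == 0`.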
*/ - class CArena : public Arena @@ -57,7 +57,7 @@ public: * Try to shrink in-place */ [[nodiscard]] void* - shrink_in_place (void* pt, std::size_t sz) final; + shrink_in_place (void* pt, std::size_t new_size) final; /** * \brief Free up allocated memory. Merge neighboring free memory chunks @@ -164,15 +164,15 @@ protected: MemStat* m_stat; }; + //! The list of blocks allocated via ::operator new(). + std::vector > m_alloc; + /** * \brief The type of our freelist and blocklist. * We use a set sorted from lo to hi memory addresses. */ using NL = std::set; - //! The list of blocks allocated via ::operator new(). - std::vector > m_alloc; - /** * \brief The free list of allocated but not currently used blocks. * Maintained in lo to hi memory sorted order. @@ -198,6 +198,8 @@ protected: std::mutex carena_mutex; + + friend std::ostream& operator<< (std::ostream& os, const CArena& arena); }; } diff --git a/Src/Base/AMReX_CArena.cpp b/Src/Base/AMReX_CArena.cpp index 6f7979d4750..c47f8f5ed26 100644 --- a/Src/Base/AMReX_CArena.cpp +++ b/Src/Base/AMReX_CArena.cpp @@ -14,6 +14,7 @@ namespace amrex { #include #include +#include namespace amrex { @@ -203,9 +204,61 @@ CArena::alloc_in_place (void* pt, std::size_t szmin, std::size_t szmax) } void* -CArena::shrink_in_place (void* /*pt*/, std::size_t sz) +CArena::shrink_in_place (void* pt, std::size_t new_size) { - return alloc(sz); // xxxxx TODO + if ((pt == nullptr) || (new_size == 0)) { return nullptr; } + + new_size = Arena::align(new_size); + + std::lock_guard lock(carena_mutex); + + auto busy_it = m_busylist.find(Node(pt,nullptr,0)); + if (busy_it == m_busylist.end()) { + amrex::Abort("CArena::shrink_in_place: unknown pointer"); + return nullptr; + } + AMREX_ASSERT(m_freelist.find(*busy_it) == m_freelist.end()); + + auto const old_size = busy_it->size(); + + if (new_size > old_size) { + amrex::Abort("CArena::shrink_in_place: wrong size. Cannot shrink to a larger size."); + return nullptr; + } else if (new_size == old_size) { + return pt; + } else { + auto const leftover_size = old_size - new_size; + + void* pt2 = static_cast(pt) + new_size; + Node new_free_node(pt2, busy_it->owner(), leftover_size); + + void* pt_end = static_cast(pt) + old_size; + auto free_it = m_freelist.find(Node(pt_end,nullptr,0)); + if ((free_it == m_freelist.end()) || ! new_free_node.coalescable(*free_it)) { + m_freelist.insert(free_it, new_free_node); + } else { + auto& node = const_cast(*free_it); + // This is safe because the free list is std::set and the + // modification of `block` does not change the order of elements + // in the container, even though Node's operator< uses block. 
+ node.block(pt2); + node.size(leftover_size + node.size()); + } + + const_cast(*busy_it).size(new_size); + + m_actually_used -= leftover_size; + +#ifdef AMREX_TINY_PROFILING + if (m_do_profiling) { + TinyProfiler::memory_free(old_size, busy_it->mem_stat()); + auto* stat = TinyProfiler::memory_alloc(new_size, m_profiling_stats); + const_cast(*busy_it).mem_stat(stat); + } +#endif + + return pt; + } } void @@ -439,4 +492,43 @@ CArena::PrintUsage (std::ostream& os, std::string const& name, std::string const << m_busylist.size() << " busy blocks, " << m_freelist.size() << " free blocks\n"; } +std::ostream& operator<< (std::ostream& os, const CArena& arena) +{ + os << "CArea:\n" + << " Hunk size: " << arena.m_hunk << "\n" + << " Memory allocated: " << arena.m_used << "\n" + << " Memory actually used: " << arena.m_actually_used << "\n"; + + if (arena.m_alloc.empty()) { + os << " No memory allocations\n"; + } else { + os << " List of memory alloations: (address, size)\n"; + for (auto const& a : arena.m_alloc) { + os << " " << a.first << ", " << a.second << "\n"; + } + } + + if (arena.m_freelist.empty()) { + os << " No free nodes\n"; + } else { + os << " List of free nodes: (address, owner, size)\n"; + for (auto const& a : arena.m_freelist) { + os << " " << a.block() << ", " << a.owner() << ", " + << a.size() << "\n"; + } + } + + if (arena.m_busylist.empty()) { + os << " No busy nodes\n"; + } else { + os << " List of busy nodes: (address, owner, size)\n"; + for (auto const& a : arena.m_busylist) { + os << " " << a.block() << ", " << a.owner() << ", " + << a.size() << "\n"; + } + } + + return os; +} + } diff --git a/Src/Base/AMReX_CONSTANTS.H b/Src/Base/AMReX_CONSTANTS.H index d50153275e6..648d3da5c94 100644 --- a/Src/Base/AMReX_CONSTANTS.H +++ b/Src/Base/AMReX_CONSTANTS.H @@ -4,41 +4,7 @@ #include - -#ifdef BL_LANG_FORT - -#ifndef AMREX_XSDK - -#define bigreal BL_REAL_E(1.0,30) -#define zero BL_REAL(0.0) -#define one BL_REAL(1.0) -#define two BL_REAL(2.0) -#define three BL_REAL(3.0) -#define four BL_REAL(4.0) -#define five BL_REAL(5.0) -#define six BL_REAL(6.0) -#define seven BL_REAL(7.0) -#define eight BL_REAL(8.0) -#define nine BL_REAL(9.0) -#define ten BL_REAL(10.0) -#define twelve BL_REAL(12.0) -#define fifteen BL_REAL(15.0) -#define sixteen BL_REAL(16.0) -#define twenty BL_REAL(20.0) -#define seventy BL_REAL(70.0) -#define ninety BL_REAL(90.0) -#define tenth BL_REAL(0.1) -#define eighth BL_REAL(0.125) -#define sixth BL_REAL(0.16666666666666667) -#define fifth BL_REAL(0.2) -#define fourth BL_REAL(0.25) -#define third BL_REAL(0.33333333333333333) -#define half BL_REAL(0.5) -#define two3rd BL_REAL(0.66666666666666667) -#define Pi BL_REAL(3.1415926535897932) - -#endif /* ndef AMREX_XSDK */ - -#endif /*BL_LANG_FORT*/ +/* Maybe in the future we will add some constants here. */ +/* If we do, make sure this file is Fortran safe. 
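`shrink_in_place` used to be a stub that simply allocated anew; it now shrinks for real: the busy node keeps its address, the tail is handed back to the free list (coalescing with an adjacent free block when possible), and `m_actually_used` drops by the leftover. A hedged usage sketch, with illustrative sizes:

```cpp
#include <AMReX_CArena.H>
#include <AMReX_BLassert.H>

void demo (amrex::CArena& arena)
{
    void* p = arena.alloc(4096);
    // ... discover that only 1024 bytes are actually needed ...
    void* q = arena.shrink_in_place(p, 1024);
    // The pointer is unchanged, so no data copy is required; the trailing
    // bytes (minus alignment padding) went back onto the free list.
    AMREX_ALWAYS_ASSERT(q == p);
    arena.free(q);
}
```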
*/ #endif /*BL_CONSTANTS_H*/ diff --git a/Src/Base/AMReX_CTOParallelForImpl.H b/Src/Base/AMReX_CTOParallelForImpl.H index 73ca3a25e87..35e0ec3be7b 100644 --- a/Src/Base/AMReX_CTOParallelForImpl.H +++ b/Src/Base/AMReX_CTOParallelForImpl.H @@ -27,12 +27,12 @@ struct CompileTimeOptions { namespace detail { template - std::enable_if_t::value || std::is_same::value, bool> - ParallelFor_helper2 (T const& N, F&& f, TypeList, + std::enable_if_t || std::is_same_v, bool> + ParallelFor_helper2 (T const& N, F const& f, TypeList, std::array const& runtime_options) { if (runtime_options == std::array{As::value...}) { - if constexpr (std::is_integral::value) { + if constexpr (std::is_integral_v) { ParallelFor(N, [f] AMREX_GPU_DEVICE (T i) noexcept { f(i, As{}...); @@ -50,8 +50,8 @@ namespace detail } template - std::enable_if_t::value, bool> - ParallelFor_helper2 (Box const& box, T ncomp, F&& f, TypeList, + std::enable_if_t, bool> + ParallelFor_helper2 (Box const& box, T ncomp, F const& f, TypeList, std::array const& runtime_options) { if (runtime_options == std::array{As::value...}) { @@ -66,24 +66,24 @@ namespace detail } template - std::enable_if_t::value || std::is_same::value> - ParallelFor_helper1 (T const& N, F&& f, TypeList, + std::enable_if_t || std::is_same_v> + ParallelFor_helper1 (T const& N, F const& f, TypeList, RO const& runtime_options) { bool found_option = (false || ... || - ParallelFor_helper2(N, std::forward(f), + ParallelFor_helper2(N, f, PPs{}, runtime_options)); amrex::ignore_unused(found_option); AMREX_ASSERT(found_option); } template - std::enable_if_t::value> - ParallelFor_helper1 (Box const& box, T ncomp, F&& f, TypeList, + std::enable_if_t> + ParallelFor_helper1 (Box const& box, T ncomp, F const& f, TypeList, RO const& runtime_options) { bool found_option = (false || ... || - ParallelFor_helper2(box, ncomp, std::forward(f), + ParallelFor_helper2(box, ncomp, f, PPs{}, runtime_options)); amrex::ignore_unused(found_option); AMREX_ASSERT(found_option); @@ -93,7 +93,7 @@ namespace detail #endif template -std::enable_if_t::value> +std::enable_if_t> ParallelFor (TypeList /*list_of_compile_time_options*/, std::array const& runtime_options, T N, F&& f) @@ -124,7 +124,7 @@ void ParallelFor (TypeList /*list_of_compile_time_options*/, } template -std::enable_if_t::value> +std::enable_if_t> ParallelFor (TypeList /*list_of_compile_time_options*/, std::array const& runtime_options, Box const& box, T ncomp, F&& f) @@ -184,7 +184,7 @@ ParallelFor (TypeList /*list_of_compile_time_options*/, * \param f a callable object taking an integer and working on that iteration. */ template -std::enable_if_t::value> +std::enable_if_t> ParallelFor (TypeList ctos, std::array const& option, T N, F&& f) @@ -292,7 +292,7 @@ void ParallelFor (TypeList ctos, * \param f a callable object taking three integers and working on the given cell. */ template -std::enable_if_t::value> +std::enable_if_t> ParallelFor (TypeList ctos, std::array const& option, Box const& box, T ncomp, F&& f) diff --git a/Src/Base/AMReX_CoordSys.H b/Src/Base/AMReX_CoordSys.H index d558ba053db..24096c6f428 100644 --- a/Src/Base/AMReX_CoordSys.H +++ b/Src/Base/AMReX_CoordSys.H @@ -20,7 +20,6 @@ class FArrayBox; * * Routines for mapping between physical coordinate system and index space. */ - class CoordSys { public: @@ -54,7 +53,7 @@ public: BL_ASSERT(c_sys != undef); return (c_sys == RZ); } - //! Is CoordType == cartesion? + //! Is CoordType == cartesian? 
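Context for the `CTOParallelForImpl.H` hunks: this machinery matches a runtime option array against every compile-time combination and launches the one instantiation that matches, so option branches can be resolved with `if constexpr` inside the kernel. A sketch of a call site; the option meaning and kernel body are made up for illustration:

```cpp
#include <AMReX_CTOParallelForImpl.H> // normally reached via the GPU launch headers
#include <AMReX_Array4.H>
#include <AMReX_GpuQualifiers.H>

using namespace amrex;

void scale_or_shift (Box const& box, Array4<Real> const& a, int do_scale)
{
    ParallelFor(TypeList<CompileTimeOptions<0,1>>{}, // compile both variants
                {do_scale},                          // select one at runtime
                box,
                [=] AMREX_GPU_DEVICE (int i, int j, int k, auto scale)
    {
        if constexpr (scale.value == 1) {
            a(i,j,k) *= Real(0.5); // only this variant pays for the multiply
        } else {
            a(i,j,k) += Real(1.0);
        }
    });
}
```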
[[nodiscard]] bool IsCartesian () const noexcept { BL_ASSERT(c_sys != undef); return (c_sys == cartesian); } diff --git a/Src/Base/AMReX_CoordSys.cpp b/Src/Base/AMReX_CoordSys.cpp index 888d2cc10c9..757af532cc5 100644 --- a/Src/Base/AMReX_CoordSys.cpp +++ b/Src/Base/AMReX_CoordSys.cpp @@ -482,6 +482,8 @@ CoordSys::AreaLo (const IntVect& point, int dir) const noexcept // NOLINT(readab { case 0: return dx[1]; case 1: return dx[0]; + default: + AMREX_ASSERT(0); } return 0._rt; // to silent compiler warning case RZ: @@ -490,6 +492,8 @@ CoordSys::AreaLo (const IntVect& point, int dir) const noexcept // NOLINT(readab { case 0: return Real(TWOPI)*dx[1]*xlo[0]; case 1: return ((xlo[0]+dx[0])*(xlo[0]+dx[0])-xlo[0]*xlo[0])*static_cast(0.5*TWOPI); + default: + AMREX_ASSERT(0); } return 0._rt; // to silent compiler warning default: @@ -502,6 +506,8 @@ CoordSys::AreaLo (const IntVect& point, int dir) const noexcept // NOLINT(readab case 0: return dx[1]*dx[2]; case 1: return dx[0]*dx[2]; case 2: return dx[1]*dx[0]; + default: + AMREX_ASSERT(0); } #endif return 0; @@ -520,6 +526,8 @@ CoordSys::AreaHi (const IntVect& point, int dir) const noexcept // NOLINT(readab { case 0: return dx[1]; case 1: return dx[0]; + default: + AMREX_ASSERT(0); } return 0._rt; // to silent compiler warning case RZ: @@ -528,6 +536,8 @@ CoordSys::AreaHi (const IntVect& point, int dir) const noexcept // NOLINT(readab { case 0: return Real(TWOPI)*dx[1]*xhi[0]; case 1: return (xhi[0]*xhi[0]-(xhi[0]-dx[0])*(xhi[0]-dx[0]))*static_cast(TWOPI*0.5); + default: + AMREX_ASSERT(0); } return 0._rt; // to silent compiler warning default: @@ -540,6 +550,8 @@ CoordSys::AreaHi (const IntVect& point, int dir) const noexcept // NOLINT(readab case 0: return dx[1]*dx[2]; case 1: return dx[0]*dx[2]; case 2: return dx[1]*dx[0]; + default: + AMREX_ASSERT(0); } #endif return 0._rt; diff --git a/Src/Base/AMReX_Dim3.H b/Src/Base/AMReX_Dim3.H index 9aee68330c4..320503afe55 100644 --- a/Src/Base/AMReX_Dim3.H +++ b/Src/Base/AMReX_Dim3.H @@ -13,8 +13,8 @@ struct Dim3 { int x; int y; int z; }; struct XDim3 { Real x; Real y; Real z; }; template ::value || - std::is_same::value>::type* = nullptr> + std::enable_if_t || + std::is_same_v>* = nullptr> std::ostream& operator<< (std::ostream& os, const T& d) { os << '(' << d.x << ',' << d.y << ',' << d.z << ')'; diff --git a/Src/Base/AMReX_DistributionMapping.H b/Src/Base/AMReX_DistributionMapping.H index 0707532a0fc..e9aa82f16a2 100644 --- a/Src/Base/AMReX_DistributionMapping.H +++ b/Src/Base/AMReX_DistributionMapping.H @@ -37,7 +37,6 @@ class FabArrayBase; * BoxArray are as equal across CPUs as is possible. The SFC distribution is * based on a space filling curve. */ - class DistributionMapping { public: diff --git a/Src/Base/AMReX_DistributionMapping.cpp b/Src/Base/AMReX_DistributionMapping.cpp index 9350f9203d1..958f7ef9ec6 100644 --- a/Src/Base/AMReX_DistributionMapping.cpp +++ b/Src/Base/AMReX_DistributionMapping.cpp @@ -221,9 +221,9 @@ DistributionMapping::LeastUsedCPUs (int nprocs, } if (flag_verbose_mapper) { - Print() << "LeastUsedCPUs:" << std::endl; + Print() << "LeastUsedCPUs:" << '\n'; for (const auto &p : LIpairV) { - Print() << " Rank " << p.second << " contains " << p.first << std::endl; + Print() << " Rank " << p.second << " contains " << p.first << '\n'; } } #else @@ -368,7 +368,7 @@ DistributionMapping::RoundRobinDoIt (int nboxes, bool sort) { if (flag_verbose_mapper) { - Print() << "DM: RoundRobinDoIt called..." << std::endl; + Print() << "DM: RoundRobinDoIt called..." 
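The stream changes running through `DistributionMapping.cpp` replace `std::endl` with `'\n'`. Both end the line; `std::endl` additionally requests a flush on every insertion, which is pointless inside these verbose-mapper loops (and `amrex::Print` already emits its buffered text when the temporary is destroyed). The pattern:

```cpp
#include <AMReX_Print.H>

void log_rank (int rank, long npts)
{
    // '\n' appends a newline to Print's internal buffer; the whole message
    // is emitted once, when the temporary Print object goes out of scope.
    amrex::Print() << "Rank " << rank << " contains " << npts << '\n';
    // With std::endl, each insertion would also request a flush, which is
    // the per-line overhead this diff removes.
}
```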
<< '\n'; } int nprocs = ParallelContext::NProcsSub(); @@ -426,7 +426,7 @@ DistributionMapping::RoundRobinDoIt (int nboxes, m_ref->m_pmap[(*LIpairV)[i].second] = ParallelContext::local_to_global_rank(rank); if (flag_verbose_mapper) { Print() << " Mapping box " << (*LIpairV)[i].second << " of size " - << (*LIpairV)[i].first << " to rank " << rank << std::endl; + << (*LIpairV)[i].first << " to rank " << rank << '\n'; } } } @@ -439,7 +439,7 @@ DistributionMapping::RoundRobinDoIt (int nboxes, int rank = tid*nworkers + wrkerord[tid][wid]; m_ref->m_pmap[i] = ParallelContext::local_to_global_rank(rank); if (flag_verbose_mapper) { - Print() << " Mapping box " << i << " to rank " << rank << std::endl; + Print() << " Mapping box " << i << " to rank " << rank << '\n'; } } } @@ -570,7 +570,7 @@ struct WeightedBoxList } }; -static +namespace { void knapsack (const std::vector& wgts, int nprocs, @@ -712,6 +712,7 @@ top: ; } } } +} void DistributionMapping::KnapSackDoIt (const std::vector& wgts, @@ -722,7 +723,7 @@ DistributionMapping::KnapSackDoIt (const std::vector& wgts, bool sort) { if (flag_verbose_mapper) { - Print() << "DM: KnapSackDoIt called..." << std::endl; + Print() << "DM: KnapSackDoIt called..." << '\n'; } BL_PROFILE("DistributionMapping::KnapSackDoIt()"); @@ -747,9 +748,9 @@ DistributionMapping::KnapSackDoIt (const std::vector& wgts, if (flag_verbose_mapper) { for (int i = 0, ni = static_cast(vec.size()); i < ni; ++i) { - Print() << " Bucket " << i << " contains boxes:" << std::endl; + Print() << " Bucket " << i << " contains boxes:" << '\n'; for (int x : vec[i]) { - Print() << " " << x << std::endl; + Print() << " " << x << '\n'; } } } @@ -774,7 +775,7 @@ DistributionMapping::KnapSackDoIt (const std::vector& wgts, if (flag_verbose_mapper) { for (const auto &p : LIpairV) { - Print() << " Bucket " << p.second << " total weight: " << p.first << std::endl; + Print() << " Bucket " << p.second << " total weight: " << p.first << '\n'; } } @@ -811,7 +812,7 @@ DistributionMapping::KnapSackDoIt (const std::vector& wgts, const int N = static_cast(vi.size()); if (flag_verbose_mapper) { - Print() << " Mapping bucket " << idx << " to rank " << tid << std::endl; + Print() << " Mapping bucket " << idx << " to rank " << tid << '\n'; } if (nteams == nprocs) { @@ -1171,7 +1172,7 @@ namespace { } } -static +namespace { void Distribute (const std::vector& tokens, const std::vector& wgts, @@ -1183,14 +1184,14 @@ Distribute (const std::vector& tokens, BL_PROFILE("DistributionMapping::Distribute()"); if (flag_verbose_mapper) { - Print() << "Distribute:" << std::endl; - Print() << " volpercpu: " << volpercpu << std::endl; - Print() << " Sorted SFC Tokens:" << std::endl; + Print() << "Distribute:" << '\n' + << " volpercpu: " << volpercpu << '\n' + << " Sorted SFC Tokens:" << '\n'; int idx = 0; for (const auto &t : tokens) { Print() << " " << idx++ << ": " << t.m_box << ": " - << t.m_morton << std::endl; + << t.m_morton << '\n'; } } @@ -1227,10 +1228,10 @@ Distribute (const std::vector& tokens, } if (flag_verbose_mapper) { - Print() << "Distributed SFC Tokens:" << std::endl; + Print() << "Distributed SFC Tokens:" << '\n'; int idx = 0; for (int i = 0; i < nprocs; ++i) { - Print() << " Rank/Team " << i << ":" << std::endl; + Print() << " Rank/Team " << i << ":" << '\n'; Real rank_vol = 0; for (const auto &box : v[i]) { amrex::ignore_unused(box); @@ -1238,11 +1239,11 @@ Distribute (const std::vector& tokens, BL_ASSERT(box == t.m_box); Print() << " " << idx << ": " << t.m_box << ": " - << t.m_morton << std::endl; + << 
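`knapsack` here (and `Distribute` further down) switch from `static` to an unnamed namespace. Both give internal linkage; the unnamed namespace is the spelling the core guidelines and clang-tidy prefer because it also covers types and templates. Shape of the change:

```cpp
// Old: static int helper (int x) { return 2*x; }

// New: everything inside an unnamed namespace has internal linkage,
// so the helper stays private to this translation unit.
namespace {
int helper (int x) { return 2*x; }
}

int use_helper () { return helper(21); }
```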
t.m_morton << '\n'; rank_vol += static_cast(wgts[t.m_box]); idx++; } - Print() << " Total Rank Vol: " << rank_vol << std::endl; + Print() << " Total Rank Vol: " << rank_vol << '\n'; } } @@ -1254,6 +1255,7 @@ Distribute (const std::vector& tokens, BL_ASSERT(cnt == tokens.size()); #endif } +} void DistributionMapping::SFCProcessorMapDoIt (const BoxArray& boxes, @@ -1263,7 +1265,7 @@ DistributionMapping::SFCProcessorMapDoIt (const BoxArray& boxes, Real* eff) { if (flag_verbose_mapper) { - Print() << "DM: SFCProcessorMapDoIt called..." << std::endl; + Print() << "DM: SFCProcessorMapDoIt called..." << '\n'; } BL_PROFILE("DistributionMapping::SFCProcessorMapDoIt()"); @@ -1339,7 +1341,7 @@ DistributionMapping::SFCProcessorMapDoIt (const BoxArray& boxes, if (flag_verbose_mapper) { for (const auto &p : LIpairV) { - Print() << " Bucket " << p.second << " contains " << p.first << std::endl; + Print() << " Bucket " << p.second << " contains " << p.first << '\n'; } } @@ -1382,7 +1384,7 @@ DistributionMapping::SFCProcessorMapDoIt (const BoxArray& boxes, const int Nbx = static_cast(vi.size());// # of boxes assigned to this team if (flag_verbose_mapper) { - Print() << "Mapping bucket " << LIpairV[i].second << " to rank " << ord[i] << std::endl; + Print() << "Mapping bucket " << LIpairV[i].second << " to rank " << ord[i] << '\n'; } if (nteams == nprocs) { // In this case, team id is process id. diff --git a/Src/Base/AMReX_Extension.H b/Src/Base/AMReX_Extension.H index cc299c0823e..065d23e1086 100644 --- a/Src/Base/AMReX_Extension.H +++ b/Src/Base/AMReX_Extension.H @@ -112,6 +112,9 @@ #elif defined(__GNUC__) #define AMREX_FORCE_INLINE inline __attribute__((always_inline)) +#elif defined(_MSC_VER) +#define AMREX_FORCE_INLINE inline __forceinline + #else #define AMREX_FORCE_INLINE inline @@ -133,6 +136,27 @@ #define AMREX_NO_INLINE #endif +// flatten +#if defined(_MSC_VER) +#define AMREX_FLATTEN [[msvc::flatten]] +#elif defined(__clang__) || defined(__GNUC__) +#define AMREX_FLATTEN __attribute__((flatten)) +#else +#define AMREX_FLATTEN +#endif + +// unroll loop +#define AMREX_TO_STRING_HELPER(X) #X +#define AMREX_TO_STRING(X) AMREX_TO_STRING_HELPER(X) + +#if defined(__clang__) || defined(__CUDACC__) || defined(__HIP__) || defined(__INTEL_CLANG_COMPILER) +#define AMREX_UNROLL_LOOP(n) _Pragma(AMREX_TO_STRING(unroll n)) +#elif defined(__GNUC__) +#define AMREX_UNROLL_LOOP(n) _Pragma(AMREX_TO_STRING(GCC unroll n)) +#else +#define AMREX_UNROLL_LOOP(n) +#endif + // __attribute__((weak)) #if defined(AMREX_TYPECHECK) @@ -178,6 +202,42 @@ # define AMREX_FALLTHROUGH ((void)0) #endif +// Note: following compilers support [[likely]] and [[unlikely]] +// - Clang >= 12.0 +// - GCC >= 9.0 +// - Intel >= 2021.7 +// - MSVC >= 19.26 +// - nvcc >= 12 +#if defined(__has_cpp_attribute) && __has_cpp_attribute(likely) >= 201803L +# define AMREX_LIKELY [[likely]] +# define AMREX_UNLIKELY [[unlikely]] +#else +# define AMREX_LIKELY +# define AMREX_UNLIKELY +#endif + +// Note: following compilers support assumptions, at least using builtin functions: +// - Clang >= 3.7 +// - GCC >= 5.1 +// - MSVC >= 19.20 +// - nvcc >= 11.1.0 +// - icx >= 2021.1.2 +#if defined(__has_cpp_attribute) && __has_cpp_attribute(assume) +# define AMREX_ASSUME(ASSUMPTION) [[assume(ASSUMPTION)]] +#else +# if defined(__CUDA_ARCH__) && defined(__CUDACC__) && ( (__CUDACC_VER_MAJOR__ > 11) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 2)) ) +# define AMREX_ASSUME(ASSUMPTION) __builtin_assume(ASSUMPTION) +# elif defined(AMREX_CXX_INTEL) || 
defined(__clang__) +# define AMREX_ASSUME(ASSUMPTION) __builtin_assume(ASSUMPTION) +# elif defined(_MSC_VER) +# define AMREX_ASSUME(ASSUMPTION) __assume(ASSUMPTION) +# elif defined(__GNUC__) +# define AMREX_ASSUME(ASSUMPTION) if (ASSUMPTION) {} else { __builtin_unreachable(); } +# else +# define AMREX_ASSUME(ASSUMPTION) +# endif +#endif + // CI uses -Werror -Wc++17-extension, thus we need to add the __cplusplus clause #if !defined(AMREX_NO_NODISCARD) && defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201603L # define AMREX_NODISCARD [[nodiscard]] diff --git a/Src/Base/AMReX_FACopyDescriptor.H b/Src/Base/AMReX_FACopyDescriptor.H index 5194ca7f876..6c05f069673 100644 --- a/Src/Base/AMReX_FACopyDescriptor.H +++ b/Src/Base/AMReX_FACopyDescriptor.H @@ -103,7 +103,6 @@ FabCopyDescriptor::~FabCopyDescriptor () * \brief This class orchestrates filling a destination fab of size destFabBox * from fabarray on the local processor (myProc). */ - template class FabArrayCopyDescriptor { @@ -468,7 +467,7 @@ FabArrayCopyDescriptor::CollectData () const int Who = it->procThatHasData; const auto Cnt = static_cast((it->box.numPts())*(it->nComp)); - RcvTags[Who].push_back(it); + RcvTags[Who].emplace_back(it); Total_Rcvs_Size += Cnt; @@ -698,7 +697,7 @@ FabArrayCopyDescriptor::CollectData () amrex::The_Arena()->free(md_recv_data); } - // Wait and upack data + // Wait and unpack data if (N_rcvs > 0) { Vector stats(N_rcvs); diff --git a/Src/Base/AMReX_FArrayBox.H b/Src/Base/AMReX_FArrayBox.H index 45e49ebab93..2dda4e6b0a0 100644 --- a/Src/Base/AMReX_FArrayBox.H +++ b/Src/Base/AMReX_FArrayBox.H @@ -23,10 +23,9 @@ class FArrayBox; * only want to write out 32 bit FABs. * * With the exception of the enumeration constants, this class is -* primarily for FArrayBox implementors; i.e. user's shouldn't +* primarily for FArrayBox implementers; i.e. user's shouldn't * call any of the member functions in this class directly. */ - class FABio // NOLINT(cppcoreguidelines-special-member-functions) { public: @@ -116,7 +115,7 @@ public: * \brief Pure virtual function. Derived classes MUST override this * function to skip over the next FAB f in the istream, under the * assumption that the header for the FAB f has already been - * skpped over. + * skipped over. */ virtual void skip (std::istream& is, FArrayBox& f) const = 0; @@ -224,7 +223,6 @@ private: * This class does NOT provide a copy constructor or assignment operator, * but it has a move constructor. */ - class FArrayBox : public BaseFab @@ -241,7 +239,7 @@ public: /** * \brief Construct an initial FAB with the data space allocated but - * not inititialized. ncomp is the number of components + * not initialized. ncomp is the number of components * (variables) at each data point in the Box. */ explicit FArrayBox (const Box& b, @@ -409,7 +407,7 @@ public: /** * \brief Set the FABio::Format in the program. * This is the preferred way to set the output format - * in "new" FABs. When desiging new programs, this should + * in "new" FABs. When designing new programs, this should * be the only function that needs to be called in order * to set the format. */ diff --git a/Src/Base/AMReX_FILCC_1D.F90 b/Src/Base/AMReX_FILCC_1D.F90 deleted file mode 100644 index 873f67f5941..00000000000 --- a/Src/Base/AMReX_FILCC_1D.F90 +++ /dev/null @@ -1,44 +0,0 @@ -#include - -! ----------------------------------------------------------- -!> This routine is intended to be a generic fill function -!! for cell-centered data. It knows how to extrapolate -!! 
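The new `AMReX_Extension.H` macros each wrap one optimizer hint in per-compiler spelling, expanding to nothing where unsupported: `AMREX_FLATTEN` asks for the whole call tree to be inlined, `AMREX_UNROLL_LOOP(n)` requests unrolling of a fixed-trip loop, `AMREX_LIKELY`/`AMREX_UNLIKELY` annotate branch weights, and `AMREX_ASSUME` feeds the optimizer an invariant it may rely on. A hypothetical function using several of them together:

```cpp
#include <AMReX_Extension.H>

AMREX_FORCE_INLINE AMREX_FLATTEN
double sum4 (const double* AMREX_RESTRICT p, int stride)
{
    AMREX_ASSUME(stride > 0);   // caller-guaranteed; a false assumption is UB
    double s = 0.0;
    AMREX_UNROLL_LOOP(4)
    for (int n = 0; n < 4; ++n) { s += p[n*stride]; }
    return s;
}
```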
and reflect data and is used to supplement the problem-specific -!! fill functions which call it. -!! -!! \param q <= array to fill -!! \param lo,hi => index extent of q array -!! \param domlo,domhi => index extent of problem domain -!! \param dx => cell spacing -!! \param xlo => physical location of lower left hand -!! corner of q array -!! \param bc => array of boundary flags bc(SPACEDIM,lo:hi) -!! -!! NOTE: all corner as well as edge data is filled if not EXT_DIR -! ----------------------------------------------------------- - -#ifndef AMREX_XSDK - -subroutine filcc(q,q_l1,q_h1,domlo,domhi,dx,xlo,bc) - - use amrex_fort_module - use amrex_filcc_module, only: filccn - - implicit none - - integer q_l1, q_h1 - integer domlo(1), domhi(1) - integer bc(1,2) - real(amrex_real) xlo(1), dx(1) - real(amrex_real) q(q_l1:q_h1) - - integer :: q_lo(3), q_hi(3) - - q_lo = [q_l1, 0, 0] - q_hi = [q_h1, 0, 0] - - call filccn(q_lo, q_hi, q, q_lo, q_hi, 1, domlo, domhi, dx, xlo, bc) - -end subroutine filcc - -#endif diff --git a/Src/Base/AMReX_FILCC_2D.F90 b/Src/Base/AMReX_FILCC_2D.F90 deleted file mode 100644 index 89ef77d8384..00000000000 --- a/Src/Base/AMReX_FILCC_2D.F90 +++ /dev/null @@ -1,60 +0,0 @@ -#include - -#ifndef AMREX_XSDK - -! ----------------------------------------------------------- -!> This routine is intended to be a generic fill function -!! for cell-centered data. It knows how to extrapolate -!! and reflect data and is used to supplement the problem-specific -!! fill functions which call it. -!! -!! \param q <= array to fill -!! \param lo,hi => index extent of q array -!! \param domlo,domhi => index extent of problem domain -!! \param dx => cell spacing -!! \param xlo => physical location of lower left hand -!! corner of q array -!! \param bc => array of boundary flags bc(SPACEDIM,lo:hi) -!! -!! NOTE: all corner as well as edge data is filled if not EXT_DIR -! ----------------------------------------------------------- - -subroutine filcc(q,q_l1,q_l2,q_h1,q_h2,domlo,domhi,dx,xlo,bc) - - use amrex_fort_module - use amrex_filcc_module, only: filccn - - implicit none - - integer q_l1, q_l2, q_h1, q_h2 - integer domlo(2), domhi(2) - integer bc(2,2) - real(amrex_real) xlo(2), dx(2) - real(amrex_real) q(q_l1:q_h1,q_l2:q_h2) - - integer :: q_lo(3), q_hi(3) - - q_lo = [q_l1, q_l2, 0] - q_hi = [q_h1, q_h2, 0] - - call filccn(q_lo, q_hi, q, q_lo, q_hi, 1, domlo, domhi, dx, xlo, bc) - -end subroutine filcc - -subroutine hoextraptocc(q,q_l1,q_l2,q_h1,q_h2,domlo,domhi,dx,xlo) - - use amrex_fort_module - use amrex_filcc_module, only : amrex_hoextraptocc_2d - - implicit none - - integer q_l1, q_l2, q_h1, q_h2 - integer domlo(2), domhi(2) - real(amrex_real) xlo(2), dx(2) - real(amrex_real) q(q_l1:q_h1,q_l2:q_h2) - - call amrex_hoextraptocc_2d(q,q_l1,q_l2,q_h1,q_h2,domlo,domhi,dx,xlo) - -end subroutine hoextraptocc - -#endif diff --git a/Src/Base/AMReX_FILCC_3D.F90 b/Src/Base/AMReX_FILCC_3D.F90 deleted file mode 100644 index aa3fec74ab0..00000000000 --- a/Src/Base/AMReX_FILCC_3D.F90 +++ /dev/null @@ -1,63 +0,0 @@ -#include - -#ifndef AMREX_XSDK - -! ----------------------------------------------------------- -!> This routine is intended to be a generic fill function -!! for cell centered data. It knows how to exrapolate, -!! and reflect data and can be used to supplement problem -!! specific fill functions (ie. EXT_DIR). -!! -!! \param q <= array to fill -!! \param q_l1,q_l2,q_l3,q_h1,q_h2,q_h3 => index extent of q array -!! \param domlo,hi => index extent of problem domain -!! 
\param dx => cell spacing -!! \param xlo => physical location of lower left hand -!! corner of q array -!! \param bc => array of boundary flags bc(SPACEDIM,lo:hi) -!! -!! NOTE: corner data not used in computing soln but must have -!! reasonable values for arithmetic to live -! ----------------------------------------------------------- - -subroutine filcc(q,q_l1,q_l2,q_l3,q_h1,q_h2,q_h3,domlo,domhi,dx,xlo,bc) - - use amrex_fort_module, only: rt => amrex_real - use amrex_filcc_module, only: filccn - - implicit none - - integer, intent(in ) :: q_l1, q_l2, q_l3, q_h1, q_h2, q_h3 - integer, intent(in ) :: domlo(3), domhi(3) - real(rt), intent(in ) :: xlo(3), dx(3) - real(rt), intent(inout) :: q(q_l1:q_h1,q_l2:q_h2,q_l3:q_h3) - integer, intent(in ) :: bc(3,2) - - integer :: q_lo(3), q_hi(3) - - q_lo = [q_l1, q_l2, q_l3] - q_hi = [q_h1, q_h2, q_h3] - - call filccn(q_lo, q_hi, q, q_lo, q_hi, 1, domlo, domhi, dx, xlo, bc) - -end subroutine filcc - - - -subroutine hoextraptocc(q,q_l1,q_l2,q_l3,q_h1,q_h2,q_h3,domlo,domhi,dx,xlo) - - use amrex_fort_module, only: rt => amrex_real - use amrex_filcc_module, only : amrex_hoextraptocc_3d - - implicit none - - integer, intent(in ) :: q_l1, q_l2, q_l3, q_h1, q_h2, q_h3 - integer, intent(in ) :: domlo(3), domhi(3) - real(rt), intent(in ) :: xlo(3), dx(3) - real(rt), intent(inout) :: q(q_l1:q_h1,q_l2:q_h2,q_l3:q_h3) - - call amrex_hoextraptocc_3d(q,q_l1,q_l2,q_l3,q_h1,q_h2,q_h3,domlo,domhi,dx,xlo) - -end subroutine hoextraptocc - -#endif diff --git a/Src/Base/AMReX_FPC.H b/Src/Base/AMReX_FPC.H index 77c4dfa923b..8975ed8e9b9 100644 --- a/Src/Base/AMReX_FPC.H +++ b/Src/Base/AMReX_FPC.H @@ -15,7 +15,6 @@ namespace amrex { * namespaces, and we don't like global constants, we make them static * constant data members of this class. */ - class FPC { public: diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index a8839a4bcc0..b09156b6606 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -39,81 +40,30 @@ #include #endif +#include #include #include #include +#include #include -#include -#include #include #include +#include + namespace amrex { -template ::value,int>::type = 0> +template ::value,int> = 0> Long nBytesOwned (T const&) noexcept { return 0; } template Long nBytesOwned (BaseFab const& fab) noexcept { return fab.nBytesOwned(); } -/* - A Collection of Fortran Array-like Objects - - - The FabArray class implements a collection (stored as an array) of - Fortran array-like objects. The parameterized type FAB is intended to be - any class derived from BaseFab. For example, FAB may be a BaseFab of - integers, so we could write: - - FabArray > int_fabs; - - Then int_fabs is a FabArray that can hold a collection of BaseFab - objects. - - FabArray is not just a general container class for Fortran arrays. It is - intended to hold "grid" data for use in finite difference calculations in - which the data is defined on a union of (usually disjoint) rectangular - regions embedded in a uniform index space. This region, called the valid - region, is represented by a BoxArray. For the purposes of this discussion, - the Kth Box in the BoxArray represents the interior region of the Kth grid. - - Since the intent is to be used with finite difference calculations a - FabArray also includes the notion of a boundary region for each grid. 
The - boundary region is specified by the ngrow parameter which tells the FabArray - to allocate each FAB to be ngrow cells larger in all directions than the - underlying Box. The larger region covered by the union of all the FABs is - called the region of definition. The underlying notion is that the valid - region contains the grid interior data and the region of definition includes - the interior region plus the boundary areas. - - Operations are available to copy data from the valid regions into these - boundary areas where the two overlap. The number of components, that is, - the number of values that can be stored in each cell of a FAB, is either - given as an argument to the constructor or is inherent in the definition of - the underlying FAB. Each FAB in the FabArray will have the same number of - components. - - In summary, a FabArray is an array of FABs. The Kth element contains a FAB - that holds the data for the Kth grid, a Box that defines the valid region - of the Kth grid. - - A typical use for a FabArray would be to hold the solution vector or - right-hand-side when solving a linear system of equations on a union of - rectangular grids. The copy operations would be used to copy data from the - valid regions of neighboring grids into the boundary regions after each - relaxation step of the iterative method. If a multigrid method is used, a - FabArray could be used to hold the data at each level in the multigrid - hierarchy. - - This class is a concrete class not a polymorphic one. - - This class does NOT provide a copy constructor or assignment operator. -*/ - -// -// alloc: allocate memory or not -// +/** + * \brief FabArray memory allocation information + */ struct MFInfo { + // alloc: allocate memory or not bool alloc = true; Arena* arena = nullptr; Vector tags; @@ -198,11 +148,8 @@ struct MultiArray4 { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Array4 const& operator[] (int li) const noexcept { -#if AMREX_DEVICE_COMPILE - return dp[li]; -#else - return hp[li]; -#endif + AMREX_IF_ON_DEVICE((return dp[li];)) + AMREX_IF_ON_HOST((return hp[li];)) } #ifdef AMREX_USE_GPU @@ -317,6 +264,60 @@ Add (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, int } } +/** + * \brief An Array of FortranArrayBox(FAB)-like Objects + * + * The FabArray class implements a collection (stored as an array) of + * Fortran array box-like ( \p FAB ) objects. The parameterized type \p FAB is intended to be + * any class derived from BaseFab. For example, \p FAB may be a BaseFab of + * integers, so we could write: + * + * FabArray > int_fabs; + * + * Then int_fabs is a FabArray that can hold a collection of BaseFab + * objects. + * + * FabArray is not just a general container class for Fortran arrays. It is + * intended to hold "grid" data for use in finite difference calculations in + * which the data is defined on a union of (usually disjoint) rectangular + * regions embedded in a uniform index space. This region, called the valid + * region, is represented by a BoxArray. For the purposes of this discussion, + * the Kth Box in the BoxArray represents the interior region of the Kth grid. + * + * Since the intent is to be used with finite difference calculations a + * FabArray also includes the notion of a boundary region for each grid. The + * boundary region is specified by the ngrow parameter which tells the FabArray + * to allocate each \p FAB to be ngrow cells larger in all directions than the + * underlying Box. 
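In `MultiArray4::operator[]` the `#if AMREX_DEVICE_COMPILE` preprocessor branch becomes `AMREX_IF_ON_DEVICE`/`AMREX_IF_ON_HOST`. Both arms are now always compiled, and the macros select the live one for the current target, which also works for single-pass compilers where a `__CUDA_ARCH__`-style preprocessor test is unavailable. The pattern in isolation (header choices are my assumption about where these macros live):

```cpp
#include <AMReX_Extension.H>
#include <AMReX_GpuQualifiers.H>

// Note the double parentheses: the macro argument is a statement list.
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
int pick (int on_device, int on_host)
{
    AMREX_IF_ON_DEVICE((return on_device;))
    AMREX_IF_ON_HOST((return on_host;))
}
```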
The larger region covered by the union of all the \p FABs is + * called the region of definition. The underlying notion is that the valid + * region contains the grid interior data and the region of definition includes + * the interior region plus the boundary areas. + * + * Operations are available to copy data from the valid regions into these + * boundary areas where the two overlap. The number of components, that is, + * the number of values that can be stored in each cell of a \p FAB, is either + * given as an argument to the constructor or is inherent in the definition of + * the underlying \p FAB. Each \p FAB in the FabArray will have the same number of + * components. + * + * In summary, a FabArray is an array of \p FABs. The Kth element contains a \p FAB + * that holds the data for the Kth grid, a Box that defines the valid region + * of the Kth grid. + * + * A typical use for a FabArray would be to hold the solution vector or + * right-hand-side when solving a linear system of equations on a union of + * rectangular grids. The copy operations would be used to copy data from the + * valid regions of neighboring grids into the boundary regions after each + * relaxation step of the iterative method. If a multigrid method is used, a + * FabArray could be used to hold the data at each level in the multigrid + * hierarchy. + * + * This class is a concrete class not a polymorphic one. + * + * This class does NOT provide a copy constructor or assignment operator. + * + * \tparam FAB FortranArrayBox-like object. Typically a derived class of BaseFab. Not to be confused with FabArrayBase. + */ template class FabArray : @@ -332,7 +333,7 @@ public: * if FAB is a BaseFab or its child, value_type = FAB::value_type * else value_type = FAB; */ - using value_type = typename std::conditional::value, FAB, FABType>::type::value_type; + using value_type = typename std::conditional_t::value, FAB, FABType>::value_type; using fab_type = FAB; @@ -341,8 +342,9 @@ public: FabArray () noexcept; /** - * \brief Construct an empty FabArray that has a default Arena. If - * `define` is called later with a nulltpr as MFInfo's arena, the + * \brief Construct an empty FabArray that has a default Arena. + * + * If `define` is called later with a nullptr as MFInfo's arena, the * default Arena `a` will be used. If the arena in MFInfo is not a * nullptr, the MFInfo's arena will be used. 
*/ @@ -499,69 +501,69 @@ public: FAB * fabPtr (int K) noexcept; // Here K is global index FAB const* fabPtr (int K) const noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> void prefetchToHost (const MFIter& mfi) const noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> void prefetchToDevice (const MFIter& mfi) const noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type const> array (const MFIter& mfi) const noexcept; // - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type> array (const MFIter& mfi) noexcept; // - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type const> array (int K) const noexcept; // - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type> array (int K) noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type const> const_array (const MFIter& mfi) const noexcept; // - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type const> const_array (int K) const noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type const> array (const MFIter& mfi, int start_comp) const noexcept; // - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type> array (const MFIter& mfi, int start_comp) noexcept; // - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type const> array (int K, int start_comp) const noexcept; // - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type> array (int K, int start_comp) noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type const> const_array (const MFIter& mfi, int start_comp) const noexcept; // - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type const> const_array (int K, int start_comp) const noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> MultiArray4::value_type> arrays () noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> MultiArray4::value_type const> arrays () const noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> MultiArray4::value_type const> const_arrays () const noexcept; //! Explicitly set the Kth FAB in the FabArray to point to elem. void setFab (int boxno, std::unique_ptr elem); //! Explicitly set the Kth FAB in the FabArray to point to elem. - template ::value,int> = 0> + template ,int> = 0> void setFab (int boxno, FAB&& elem); //! Explicitly set the FAB associated with mfi in the FabArray to point to elem. void setFab (const MFIter&mfi, std::unique_ptr elem); //! Explicitly set the FAB associated with mfi in the FabArray to point to elem. - template ::value,int> = 0> + template ,int> = 0> void setFab (const MFIter&mfi, FAB&& elem); //! Release ownership of the FAB. This function is not thread safe. @@ -614,11 +616,11 @@ public: IntVect const& nghost); //! Set all components in the entire region of each FAB to val. - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val); //! Set all components in the entire region of each FAB to val. - template ::value,int>::type = 0> + template ::value,int> = 0> FabArray& operator= (value_type val); /** @@ -626,13 +628,13 @@ public: * each FAB in the FabArray, starting at component comp to val. * Also set the value of nghost boundary cells. 
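Most of this header's churn is one mechanical substitution: the C++11 trait spellings (`typename std::enable_if<...>::type`, `std::is_same<...>::value`) become the terser `_t`/`_v` aliases. The SFINAE behavior is identical. Before and after, with a stand-in trait:

```cpp
#include <type_traits>

template <class T> struct IsBaseFabLike : std::false_type {};

// Old spelling:
template <class F,
          typename std::enable_if<IsBaseFabLike<F>::value, int>::type = 0>
void f_old (F const&) {}

// New spelling, as now used throughout FabArray.H:
template <class F,
          std::enable_if_t<IsBaseFabLike<F>::value, int> = 0>
void f_new (F const&) {}
```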
*/ - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, int comp, int ncomp, int nghost = 0); - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, int comp, int ncomp, @@ -644,14 +646,14 @@ public: * as nghost boundary cells, to val, provided they also intersect * with the Box region. */ - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, const Box& region, int comp, int ncomp, int nghost = 0); - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, const Box& region, int comp, @@ -661,10 +663,10 @@ public: * \brief Set all components in the valid region of each FAB in the * FabArray to val, including nghost boundary cells. */ - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, int nghost); - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, const IntVect& nghost); /** @@ -672,50 +674,50 @@ public: * FabArray to val, including nghost boundary cells, that also * intersect the Box region. */ - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, const Box& region, int nghost); - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, const Box& region, const IntVect& nghost); - template ::value,int>::type = 0> + template ::value,int> = 0> void abs (int comp, int ncomp, int nghost = 0); - template ::value,int>::type = 0> + template ::value,int> = 0> void abs (int comp, int ncomp, const IntVect& nghost); - template ::value,int>::type = 0> + template ::value,int> = 0> void plus (value_type val, int comp, int num_comp, int nghost = 0); - template ::value,int>::type = 0> + template ::value,int> = 0> void plus (value_type val, const Box& region, int comp, int num_comp, int nghost = 0); - template ::value,int>::type = 0> + template ::value,int> = 0> void mult (value_type val, int comp, int num_comp, int nghost = 0); - template ::value,int>::type = 0> + template ::value,int> = 0> void mult (value_type val, const Box& region, int comp, int num_comp, int nghost = 0); - template ::value,int>::type = 0> + template ::value,int> = 0> void invert (value_type numerator, int comp, int num_comp, int nghost = 0); - template ::value,int>::type = 0> + template ::value,int> = 0> void invert (value_type numerator, const Box& region, int comp, int num_comp, int nghost = 0); //! Set all values in the boundary region to val. - template ::value,int>::type = 0> + template ::value,int> = 0> void setBndry (value_type val); //! Set ncomp values in the boundary region, starting at start_comp to val. - template ::value,int>::type = 0> + template ::value,int> = 0> void setBndry (value_type val, int strt_comp, int ncomp); //! Set all values outside the Geometry domain to val. - template ::value,int>::type = 0> + template ::value,int> = 0> void setDomainBndry (value_type val, const Geometry& geom); //! Set ncomp values outside the Geometry domain to val, starting at start_comp. 
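For orientation, the `setVal` family whose declarations are touched here is the "fill with a constant" API. Typical calls, with illustrative values and ghost counts:

```cpp
#include <AMReX_MultiFab.H>
#include <AMReX_Geometry.H>

void init (amrex::MultiFab& mf, amrex::Box const& region,
           amrex::Geometry const& geom)
{
    mf.setVal(0.0);               // all components, valid and ghost cells
    mf.setVal(1.0, 0, 1, 2);      // component 0 only, 2 ghost cells deep
    mf.setVal(2.0, region, 0, 1); // component 0, restricted to `region`
    mf.setBndry(-1.0);            // ghost (boundary) cells only
    mf.setDomainBndry(0.0, geom); // ghost cells outside the physical domain
}
```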
- template ::value,int>::type = 0> + template ::value,int> = 0> void setDomainBndry (value_type val, int strt_comp, int ncomp, const Geometry& geom); /** @@ -1001,7 +1003,7 @@ public: void FillBoundary_nowait (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period, bool cross = false); template ::value,int>::type = 0> + class F=FAB, std::enable_if_t::value,int> = 0> void FillBoundary_finish (); void FillBoundary_test (); @@ -1110,7 +1112,7 @@ public: // (including ghost cells outside periodic boundaries) // physbnd : boundary cells outside the domain (excluding periodic boundaries) // interior : interior cells (i.e., valid cells) - template ::value,int>::type = 0> + template ::value,int> = 0> void BuildMask (const Box& phys_domain, const Periodicity& period, value_type covered, value_type notcovered, value_type physbnd, value_type interior); @@ -1118,7 +1120,7 @@ public: // The following are private functions. But we have to make them public for cuda. template ::value,int>::type = 0> + class F=FAB, std::enable_if_t::value,int> = 0> void FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period, bool cross, bool enforce_periodicity_only = false, @@ -1128,10 +1130,10 @@ public: void PC_local_cpu (const CPC& thecpc, FabArray const& src, int scomp, int dcomp, int ncomp, CpOp op); - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, const CommMetaData& thecmd, int scomp, int ncomp); - template ::value,int>::type = 0> + template ::value,int> = 0> LayoutData RecvLayoutMask (const CommMetaData& thecmd); #ifdef AMREX_USE_GPU @@ -1308,7 +1310,7 @@ private: void setFab_assert (int K, FAB const& fab) const; - template ::value,int>::type = 0> + template ::value,int> = 0> void build_arrays () const; void clear_arrays (); @@ -1490,7 +1492,7 @@ FabArray::fabPtr (int K) const noexcept } template -template ::value,int>::type> +template ::value,int>> void FabArray::prefetchToHost (const MFIter& mfi) const noexcept { @@ -1502,7 +1504,7 @@ FabArray::prefetchToHost (const MFIter& mfi) const noexcept } template -template ::value,int>::type> +template ::value,int>> void FabArray::prefetchToDevice (const MFIter& mfi) const noexcept { @@ -1514,7 +1516,7 @@ FabArray::prefetchToDevice (const MFIter& mfi) const noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type const> FabArray::array (const MFIter& mfi) const noexcept { @@ -1522,7 +1524,7 @@ FabArray::array (const MFIter& mfi) const noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type> FabArray::array (const MFIter& mfi) noexcept { @@ -1530,7 +1532,7 @@ FabArray::array (const MFIter& mfi) noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type const> FabArray::array (int K) const noexcept { @@ -1538,7 +1540,7 @@ FabArray::array (int K) const noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type> FabArray::array (int K) noexcept { @@ -1546,7 +1548,7 @@ FabArray::array (int K) noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type const> FabArray::const_array (const MFIter& mfi) const noexcept { @@ -1554,7 +1556,7 @@ FabArray::const_array (const MFIter& mfi) const noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type const> FabArray::const_array (int K) const noexcept { @@ -1562,7 +1564,7 @@ FabArray::const_array (int K) const noexcept } template -template 
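The `FillBoundary_nowait`/`FillBoundary_finish` pair whose signatures are updated here splits the ghost-cell exchange so local computation can overlap communication; `FBEP_nowait` is the engine behind both. A typical caller:

```cpp
#include <AMReX_MultiFab.H>
#include <AMReX_Geometry.H>

void exchange (amrex::MultiFab& mf, amrex::Geometry const& geom)
{
    mf.FillBoundary_nowait(geom.periodicity()); // post sends and receives
    // ... work that does not read ghost cells can run here ...
    mf.FillBoundary_finish();                   // wait and unpack
}
```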
::value,int>::type> +template ::value,int>> Array4::value_type const> FabArray::array (const MFIter& mfi, int start_comp) const noexcept { @@ -1570,7 +1572,7 @@ FabArray::array (const MFIter& mfi, int start_comp) const noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type> FabArray::array (const MFIter& mfi, int start_comp) noexcept { @@ -1578,7 +1580,7 @@ FabArray::array (const MFIter& mfi, int start_comp) noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type const> FabArray::array (int K, int start_comp) const noexcept { @@ -1586,7 +1588,7 @@ FabArray::array (int K, int start_comp) const noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type> FabArray::array (int K, int start_comp) noexcept { @@ -1594,7 +1596,7 @@ FabArray::array (int K, int start_comp) noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type const> FabArray::const_array (const MFIter& mfi, int start_comp) const noexcept { @@ -1602,7 +1604,7 @@ FabArray::const_array (const MFIter& mfi, int start_comp) const noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type const> FabArray::const_array (int K, int start_comp) const noexcept { @@ -1610,7 +1612,7 @@ FabArray::const_array (int K, int start_comp) const noexcept } template -template ::value,int>::type> +template ::value,int>> MultiArray4::value_type> FabArray::arrays () noexcept { @@ -1619,7 +1621,7 @@ FabArray::arrays () noexcept } template -template ::value,int>::type> +template ::value,int>> MultiArray4::value_type const> FabArray::arrays () const noexcept { @@ -1628,7 +1630,7 @@ FabArray::arrays () const noexcept } template -template ::value,int>::type> +template ::value,int>> MultiArray4::value_type const> FabArray::const_arrays () const noexcept { @@ -1637,7 +1639,7 @@ FabArray::const_arrays () const noexcept } template -template ::value,int>::type> +template ::value,int>> void FabArray::build_arrays () const { @@ -1781,7 +1783,7 @@ FabArray::LocalAdd (FabArray const& src, int scomp, int dcomp, int nco } template -template ::value,int>::type> +template ::value,int>> void FabArray::setVal (value_type val, int nghost) { @@ -1789,7 +1791,7 @@ FabArray::setVal (value_type val, int nghost) } template -template ::value,int>::type> +template ::value,int>> void FabArray::setVal (value_type val, const IntVect& nghost) { @@ -1797,7 +1799,7 @@ FabArray::setVal (value_type val, const IntVect& nghost) } template -template ::value,int>::type> +template ::value,int>> void FabArray::setVal (value_type val, const Box& region, int nghost) { @@ -1805,7 +1807,7 @@ FabArray::setVal (value_type val, const Box& region, int nghost) } template -template ::value,int>::type> +template ::value,int>> void FabArray::setVal (value_type val, const Box& region, const IntVect& nghost) { @@ -2158,7 +2160,7 @@ FabArray::setFab (int boxno, std::unique_ptr elem) } template -template ::value,int> > +template ,int> > void FabArray::setFab (int boxno, FAB&& elem) { @@ -2201,7 +2203,7 @@ FabArray::setFab (const MFIter& mfi, std::unique_ptr elem) } template -template ::value,int> > +template ,int> > void FabArray::setFab (const MFIter& mfi, FAB&& elem) { @@ -2223,7 +2225,7 @@ FabArray::setFab (const MFIter& mfi, FAB&& elem) } template -template ::value,int>::type> +template ::value,int>> void FabArray::setBndry (value_type val) { @@ -2231,7 +2233,7 @@ FabArray::setBndry (value_type val) } template -template 
::value,int>::type Z> +template ::value,int>Z> void FabArray::setBndry (value_type val, int strt_comp, @@ -2332,7 +2334,7 @@ FabArray::setBndry (value_type val, } template -template ::value,int>::type> +template ::value,int>> void FabArray::setDomainBndry (value_type val, const Geometry& geom) { @@ -2340,7 +2342,7 @@ FabArray::setDomainBndry (value_type val, const Geometry& geom) } template -template ::value,int>::type> +template ::value,int>> void FabArray::setDomainBndry (value_type val, int strt_comp, @@ -2422,7 +2424,7 @@ FabArray::copyTo (FAB& dest, int nghost) const } template -template ::value,int>::type> +template ::value,int>> void FabArray::setVal (value_type val) { @@ -2430,7 +2432,7 @@ FabArray::setVal (value_type val) } template -template ::value,int>::type> +template ::value,int>> FabArray& FabArray::operator= (value_type val) { @@ -2439,7 +2441,7 @@ FabArray::operator= (value_type val) } template -template ::value,int>::type> +template ::value,int>> void FabArray::setVal (value_type val, int comp, @@ -2450,7 +2452,7 @@ FabArray::setVal (value_type val, } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::setVal (value_type val, int comp, @@ -2492,7 +2494,7 @@ FabArray::setVal (value_type val, } template -template ::value,int>::type> +template ::value,int>> void FabArray::setVal (value_type val, const Box& region, @@ -2504,7 +2506,7 @@ FabArray::setVal (value_type val, } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::setVal (value_type val, const Box& region, @@ -2553,7 +2555,7 @@ FabArray::setVal (value_type val, } template -template ::value,int>::type> +template ::value,int>> void FabArray::abs (int comp, int ncomp, int nghost) { @@ -2561,7 +2563,7 @@ FabArray::abs (int comp, int ncomp, int nghost) } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::abs (int comp, int ncomp, const IntVect& nghost) { @@ -2599,7 +2601,7 @@ FabArray::abs (int comp, int ncomp, const IntVect& nghost) } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::plus (value_type val, int comp, int num_comp, int nghost) { @@ -2635,7 +2637,7 @@ FabArray::plus (value_type val, int comp, int num_comp, int nghost) } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::plus (value_type val, const Box& region, int comp, int num_comp, int nghost) { @@ -2675,7 +2677,7 @@ FabArray::plus (value_type val, const Box& region, int comp, int num_comp, } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::mult (value_type val, int comp, int num_comp, int nghost) { @@ -2711,7 +2713,7 @@ FabArray::mult (value_type val, int comp, int num_comp, int nghost) } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::mult (value_type val, const Box& region, int comp, int num_comp, int nghost) { @@ -2751,7 +2753,7 @@ FabArray::mult (value_type val, const Box& region, int comp, int num_comp, } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::invert (value_type numerator, int comp, int num_comp, int nghost) { @@ -2787,7 +2789,7 @@ FabArray::invert (value_type numerator, int comp, int num_comp, int nghost) } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::invert (value_type numerator, const Box& region, int comp, int num_comp, int nghost) { @@ -2830,7 +2832,7 @@ template void FabArray::shift (const IntVect& v) { - clearThisBD(); // The new boxarry will have a 
different ID. + clearThisBD(); // The new boxarray will have a different ID. boxarray.shift(v); addThisBD(); #ifdef AMREX_USE_OMP @@ -3321,7 +3323,7 @@ FabArray::FillBoundary_nowait (int scomp, int ncomp, const IntVect& nghost, } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::BuildMask (const Box& phys_domain, const Periodicity& period, value_type covered, value_type notcovered, @@ -3389,7 +3391,7 @@ FabArray::BuildMask (const Box& phys_domain, const Periodicity& period, } template -template ::value,int>::type> +template ::value,int>> void FabArray::setVal (value_type val, const CommMetaData& thecmd, int scomp, int ncomp) { @@ -3430,7 +3432,7 @@ FabArray::setVal (value_type val, const CommMetaData& thecmd, int scomp, in } template -template ::value,int>::type> +template ::value,int>> LayoutData FabArray::RecvLayoutMask (const CommMetaData& thecmd) { diff --git a/Src/Base/AMReX_FabArrayBase.H b/Src/Base/AMReX_FabArrayBase.H index 29d3d63b29e..09dfd5e22db 100644 --- a/Src/Base/AMReX_FabArrayBase.H +++ b/Src/Base/AMReX_FabArrayBase.H @@ -15,9 +15,11 @@ #include #endif +#include #include #include + namespace amrex { class MFIter; @@ -28,6 +30,12 @@ template class FabArray; namespace EB2 { class IndexSpace; } +/** + * \brief Base class for FabArray. + * + * Not to be confused with FArrayBox or `FAB` shorthands. + * Can be read as FArrayBox-like Array Base. + */ class FabArrayBase { friend class MFIter; @@ -645,7 +653,7 @@ public: ~ParForInfo (); std::pair const& getBlocks () const { return m_nblocks_x; } - Box const* getBoxes () const { return m_boxes; } + BoxIndexer const* getBoxes () const { return m_boxes; } ParForInfo () = delete; ParForInfo (ParForInfo const&) = delete; @@ -657,7 +665,7 @@ public: IntVect m_ng; int m_nthreads; std::pair m_nblocks_x; - Box* m_boxes = nullptr; + BoxIndexer* m_boxes = nullptr; char* m_hp = nullptr; char* m_dp = nullptr; }; @@ -715,6 +723,11 @@ public: }; +[[nodiscard]] int nComp (FabArrayBase const& fa); +[[nodiscard]] IntVect nGrowVect (FabArrayBase const& fa); +[[nodiscard]] BoxArray const& boxArray (FabArrayBase const& fa); +[[nodiscard]] DistributionMapping const& DistributionMap (FabArrayBase const& fa); + #ifdef BL_USE_MPI bool CheckRcvStats (Vector& recv_stats, const Vector& recv_size, int tag); #endif diff --git a/Src/Base/AMReX_FabArrayBase.cpp b/Src/Base/AMReX_FabArrayBase.cpp index 8dd8275f66a..eb8fc99605b 100644 --- a/Src/Base/AMReX_FabArrayBase.cpp +++ b/Src/Base/AMReX_FabArrayBase.cpp @@ -1325,8 +1325,7 @@ FabArrayBase::RB90::define (const FabArrayBase& fa) { Box bxsnd = (n==0) ? amrex::get<0>(dst_to_src)(bxrcv) : amrex::get<1>(dst_to_src)(bxrcv); - send_tags[dst_owner].push_back(FabArrayBase::CopyComTag(bxrcv, bxsnd, - krcv, ksnd)); + send_tags[dst_owner].emplace_back(bxrcv, bxsnd, krcv, ksnd); } } } @@ -1498,8 +1497,7 @@ FabArrayBase::RB180::define (const FabArrayBase& fa) if (dst_owner != myproc) // local copy will be dealt with later { Box const& bxsnd = convert(bxrcv); - send_tags[dst_owner].push_back(FabArrayBase::CopyComTag(bxrcv, bxsnd, - krcv, ksnd)); + send_tags[dst_owner].emplace_back(bxrcv, bxsnd, krcv, ksnd); } } } @@ -1683,8 +1681,7 @@ FabArrayBase::PolarB::define (const FabArrayBase& fa) if (dst_owner != myproc) // local copy will be dealt with later { Box const bxsnd = (n<4) ? 
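The four free functions added to `FabArrayBase.H` (`nComp`, `nGrowVect`, `boxArray`, `DistributionMap`) mirror the member accessors, so generic code can query any FabArray-like object through ADL-friendly calls. For example:

```cpp
#include <AMReX_FabArrayBase.H>
#include <AMReX_Print.H>

void report (amrex::FabArrayBase const& fa)
{
    amrex::Print() << "ncomp = "   << amrex::nComp(fa)
                   << ", ngrow = "  << amrex::nGrowVect(fa)
                   << ", nboxes = " << amrex::boxArray(fa).size() << '\n';
}
```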
convert(bxrcv) : convert_corner(bxrcv); - send_tags[dst_owner].push_back(FabArrayBase::CopyComTag(bxrcv, bxsnd, - krcv, ksnd)); + send_tags[dst_owner].emplace_back(bxrcv, bxsnd, krcv, ksnd); } } } @@ -2699,4 +2696,24 @@ FabArrayBase::flushParForCache () #endif +int nComp (FabArrayBase const& fa) +{ + return fa.nComp(); +} + +IntVect nGrowVect (FabArrayBase const& fa) +{ + return fa.nGrowVect(); +} + +BoxArray const& boxArray (FabArrayBase const& fa) +{ + return fa.boxArray(); +} + +DistributionMapping const& DistributionMap (FabArrayBase const& fa) +{ + return fa.DistributionMap(); +} + } diff --git a/Src/Base/AMReX_FabArrayCommI.H b/Src/Base/AMReX_FabArrayCommI.H index ea877266f9a..307ca490505 100644 --- a/Src/Base/AMReX_FabArrayCommI.H +++ b/Src/Base/AMReX_FabArrayCommI.H @@ -3,7 +3,7 @@ #include template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period, bool cross, @@ -163,7 +163,7 @@ FabArray::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::FillBoundary_finish () { @@ -666,7 +666,7 @@ FabArray::PrepareSendBuffers (const MapOfCopyComTagContainers& SndTags, nbytes += cct.sbox.numPts() * ncomp * sizeof(BUF); } - std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); + std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes); nbytes = amrex::aligned_size(acd, nbytes); // so that bytes are aligned // Also need to align the offset properly @@ -757,7 +757,7 @@ FabArray::PostRcvs (const MapOfCopyComTagContainers& RcvTags, nbytes += cct.dbox.numPts() * ncomp * sizeof(BUF); } - std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); + std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes); nbytes = amrex::aligned_size(acd, nbytes); // so that nbytes are aligned // Also need to align the offset properly @@ -992,7 +992,7 @@ FillBoundary (Vector const& mf, Vector const& scomp, } } - std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); + std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes); nbytes = amrex::aligned_size(acd, nbytes); // so that nbytes are aligned // Also need to align the offset properly @@ -1065,7 +1065,7 @@ FillBoundary (Vector const& mf, Vector const& scomp, } } - std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); + std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes); nbytes = amrex::aligned_size(acd, nbytes); // so that bytes are aligned // Also need to align the offset properly diff --git a/Src/Base/AMReX_FabArrayUtility.H b/Src/Base/AMReX_FabArrayUtility.H index 78f3355d34a..88aaa771090 100644 --- a/Src/Base/AMReX_FabArrayUtility.H +++ b/Src/Base/AMReX_FabArrayUtility.H @@ -21,7 +21,7 @@ namespace fudetail { template ::value> > typename FAB::value_type -ReduceSum_host (FabArray const& fa, IntVect const& nghost, F&& f) +ReduceSum_host (FabArray const& fa, IntVect const& nghost, F const& f) { using value_type = typename FAB::value_type; value_type sm = 0; @@ -47,7 +47,7 @@ std::enable_if_t::value, std::conditional_t::value || std::is_same::value, int, typename FAB::value_type> > -ReduceMF (FabArray const& fa, IntVect const& nghost, F&& f) +ReduceMF (FabArray const& fa, IntVect const& nghost, F const& f) { using T = std::conditional_t::value || std::is_same::value, @@ -66,7 +66,7 @@ std::enable_if_t::value && IsBaseFab::value, 
std::conditional_t::value || std::is_same::value, int, typename FAB1::value_type> > -ReduceMF (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) +ReduceMF (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F const& f) { using T = std::conditional_t::value || std::is_same::value, @@ -88,7 +88,7 @@ std::enable_if_t::value && IsBaseFab::value && IsBaseFab::value, int, typename FAB1::value_type> > ReduceMF (FabArray const& fa1, FabArray const& fa2, - FabArray const& fa3, IntVect const& nghost, F&& f) + FabArray const& fa3, IntVect const& nghost, F const& f) { using T = std::conditional_t::value || std::is_same::value, @@ -156,7 +156,7 @@ template ::value> > typename FAB1::value_type ReduceSum_host (FabArray const& fa1, FabArray const& fa2, - IntVect const& nghost, F&& f) + IntVect const& nghost, F const& f) { using value_type = typename FAB1::value_type; value_type sm = 0; @@ -234,7 +234,7 @@ template ::value> > typename FAB1::value_type ReduceSum_host (FabArray const& fa1, FabArray const& fa2, - FabArray const& fa3, IntVect const& nghost, F&& f) + FabArray const& fa3, IntVect const& nghost, F const& f) { using value_type = typename FAB1::value_type; value_type sm = 0; @@ -311,7 +311,7 @@ namespace fudetail { template ::value> > typename FAB::value_type -ReduceMin_host (FabArray const& fa, IntVect const& nghost, F&& f) +ReduceMin_host (FabArray const& fa, IntVect const& nghost, F const& f) { using value_type = typename FAB::value_type; value_type r = std::numeric_limits::max(); @@ -382,7 +382,7 @@ template ::value> > typename FAB1::value_type ReduceMin_host (FabArray const& fa1, FabArray const& fa2, - IntVect const& nghost, F&& f) + IntVect const& nghost, F const& f) { using value_type = typename FAB1::value_type; value_type r = std::numeric_limits::max(); @@ -460,7 +460,7 @@ template ::value> > typename FAB1::value_type ReduceMin_host (FabArray const& fa1, FabArray const& fa2, - FabArray const& fa3, IntVect const& nghost, F&& f) + FabArray const& fa3, IntVect const& nghost, F const& f) { using value_type = typename FAB1::value_type; value_type r = std::numeric_limits::max(); @@ -537,7 +537,7 @@ namespace fudetail { template ::value> > typename FAB::value_type -ReduceMax_host (FabArray const& fa, IntVect const& nghost, F&& f) +ReduceMax_host (FabArray const& fa, IntVect const& nghost, F const& f) { using value_type = typename FAB::value_type; value_type r = std::numeric_limits::lowest(); @@ -609,7 +609,7 @@ template ::value> > typename FAB1::value_type ReduceMax_host (FabArray const& fa1, FabArray const& fa2, - IntVect const& nghost, F&& f) + IntVect const& nghost, F const& f) { using value_type = typename FAB1::value_type; value_type r = std::numeric_limits::lowest(); @@ -687,7 +687,7 @@ template ::value> > typename FAB1::value_type ReduceMax_host (FabArray const& fa1, FabArray const& fa2, - FabArray const& fa3, IntVect const& nghost, F&& f) + FabArray const& fa3, IntVect const& nghost, F const& f) { using value_type = typename FAB1::value_type; value_type r = std::numeric_limits::lowest(); @@ -764,7 +764,7 @@ namespace fudetail { template ::value> > bool -ReduceLogicalAnd_host (FabArray const& fa, IntVect const& nghost, F&& f) +ReduceLogicalAnd_host (FabArray const& fa, IntVect const& nghost, F const& f) { int r = true; @@ -836,7 +836,7 @@ template ::value> > bool ReduceLogicalAnd_host (FabArray const& fa1, FabArray const& fa2, - IntVect const& nghost, F&& f) + IntVect const& nghost, F const& f) { int r = true; @@ -911,7 +911,7 @@ namespace fudetail { 
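The `F&& f` to `F const& f` changes across these host-side reducers are more than style: the callable is applied once per element inside nested loops, so an rvalue-reference parameter (with its implied single-use forwarding) would wrongly advertise that `f` may be consumed. A minimal standalone sketch of the pattern, with a hypothetical name (`reduce_sum_host`) standing in for the MFIter-based originals:

    #include <vector>

    // Hypothetical stand-in for the ReduceSum_host family above: the
    // callable is invoked on every element, so it is taken by const
    // reference and never forwarded or moved from.
    template <class T, class F>
    T reduce_sum_host (std::vector<T> const& data, F const& f)
    {
        T sm = T(0);
        for (auto const& x : data) {
            sm += f(x);   // f is reused on every iteration
        }
        return sm;
    }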
template ::value> > bool -ReduceLogicalOr_host (FabArray const& fa, IntVect const& nghost, F&& f) +ReduceLogicalOr_host (FabArray const& fa, IntVect const& nghost, F const& f) { int r = false; @@ -940,9 +940,9 @@ ReduceLogicalOr_host_wrapper (FabArray const& fa, IntVect const& nghost, F& template std::enable_if_t::value, bool> -ReduceLogicalOr_host (FabArray const& fa, IntVect const& nghost, F&& f) +ReduceLogicalOr_host (FabArray const& fa, IntVect const& nghost, F&& /*f*/) { - amrex::ignore_unused(fa,nghost,f); + amrex::ignore_unused(fa,nghost); amrex::Abort("ReduceLogicalOr: Launch Region is off. Device lambda cannot be called by host."); return 0; } @@ -983,7 +983,7 @@ template ::value> > bool ReduceLogicalOr_host (FabArray const& fa1, FabArray const& fa2, - IntVect const& nghost, F&& f) + IntVect const& nghost, F const& f) { int r = false; @@ -1092,7 +1092,7 @@ printCell (FabArray const& mf, const IntVect& cell, int comp = -1, } ss << dp[n-1]; amrex::AllPrint() << " At cell " << cell << " in Box " << bx - << ": " << ss.str() << std::endl; + << ": " << ss.str() << '\n'; } } } @@ -1103,7 +1103,7 @@ template & dst, FabArray const& src, int srccomp, int dstcomp, int numcomp, int nghost) { - Subtract(dst,src,srccomp,dstcomp,numcomp,nghost); + Subtract(dst,src,srccomp,dstcomp,numcomp,IntVect(nghost)); } template const& x, int xcomp, FabArray const& y, int ycomp, int n return sm; } +//! dst = val +template ,int> = 0> +void setVal (MF& dst, typename MF::value_type val) +{ + dst.setVal(val); +} + +//! dst = val in ghost cells. +template ,int> = 0> +void setBndry (MF& dst, typename MF::value_type val, int scomp, int ncomp) +{ + dst.setBndry(val, scomp, ncomp); +} + +//! dst *= val +template ,int> = 0> +void Scale (MF& dst, typename MF::value_type val, int scomp, int ncomp, int nghost) +{ + dst.mult(val, scomp, ncomp, nghost); +} + +//! dst = src +template && + IsMultiFabLike_v, int> = 0> +void LocalCopy (DMF& dst, SMF const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + amrex::Copy(dst, src, scomp, dcomp, ncomp, nghost); +} + +//! dst += src +template ,int> = 0> +void LocalAdd (MF& dst, MF const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + amrex::Add(dst, src, scomp, dcomp, ncomp, nghost); +} + +//! dst += a * src +template ,int> = 0> +void Saxpy (MF& dst, typename MF::value_type a, MF const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + MF::Saxpy(dst, a, src, scomp, dcomp, ncomp, nghost); +} + +//! dst = src + a * dst +template ,int> = 0> +void Xpay (MF& dst, typename MF::value_type a, MF const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + MF::Xpay(dst, a, src, scomp, dcomp, ncomp, nghost); +} + +//! dst = a*src_a + b*src_b +template ,int> = 0> +void LinComb (MF& dst, + typename MF::value_type a, MF const& src_a, int acomp, + typename MF::value_type b, MF const& src_b, int bcomp, + int dcomp, int ncomp, IntVect const& nghost) +{ + MF::LinComb(dst, a, src_a, acomp, b, src_b, bcomp, dcomp, ncomp, nghost); +} + +//! 
dst = src w/ MPI communication +template , int> = 0> +void ParallelCopy (MF& dst, MF const& src, int scomp, int dcomp, int ncomp, + IntVect const& ng_src = IntVect(0), + IntVect const& ng_dst = IntVect(0), + Periodicity const& period = Periodicity::NonPeriodic()) +{ + dst.ParallelCopy(src, scomp, dcomp, ncomp, ng_src, ng_dst, period); +} + +template , int> = 0> +[[nodiscard]] typename MF::value_type +norminf (MF const& mf, int scomp, int ncomp, IntVect const& nghost, + bool local = false) +{ + return mf.norminf(scomp, ncomp, nghost, local); +} + +//! dst = val +template ,int> = 0> +void setVal (Array& dst, typename MF::value_type val) +{ + for (auto& mf: dst) { + mf.setVal(val); + } +} + +//! dst = val in ghost cells. +template ,int> = 0> +void setBndry (Array& dst, typename MF::value_type val, int scomp, int ncomp) +{ + for (auto& mf : dst) { + mf.setBndry(val, scomp, ncomp); + } +} + +//! dst *= val +template ,int> = 0> +void Scale (Array& dst, typename MF::value_type val, int scomp, int ncomp, + int nghost) +{ + for (auto& mf : dst) { + mf.mult(val, scomp, ncomp, nghost); + } +} + +//! dst = src +template && + IsMultiFabLike_v, int> = 0> +void LocalCopy (Array& dst, Array const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + for (std::size_t i = 0; i < N; ++i) { + amrex::Copy(dst[i], src[i], scomp, dcomp, ncomp, nghost); + } +} + +//! dst += src +template ,int> = 0> +void LocalAdd (Array& dst, Array const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + for (std::size_t i = 0; i < N; ++i) { + amrex::Add(dst[i], src[i], scomp, dcomp, ncomp, nghost); + } +} + +//! dst += a * src +template ,int> = 0> +void Saxpy (Array& dst, typename MF::value_type a, + Array const& src, int scomp, int dcomp, int ncomp, + IntVect const& nghost) +{ + for (std::size_t i = 0; i < N; ++i) { + MF::Saxpy(dst[i], a, src[i], scomp, dcomp, ncomp, nghost); + } +} + +//! dst = src + a * dst +template ,int> = 0> +void Xpay (Array& dst, typename MF::value_type a, + Array const& src, int scomp, int dcomp, int ncomp, + IntVect const& nghost) +{ + for (std::size_t i = 0; i < N; ++i) { + MF::Xpay(dst[i], a, src[i], scomp, dcomp, ncomp, nghost); + } +} + +//! dst = a*src_a + b*src_b +template ,int> = 0> +void LinComb (Array& dst, + typename MF::value_type a, Array const& src_a, int acomp, + typename MF::value_type b, Array const& src_b, int bcomp, + int dcomp, int ncomp, IntVect const& nghost) +{ + for (std::size_t i = 0; i < N; ++i) { + MF::LinComb(dst[i], a, src_a[i], acomp, b, src_b[i], bcomp, dcomp, ncomp, nghost); + } +} + +//! 
dst = src w/ MPI communication +template , int> = 0> +void ParallelCopy (Array& dst, Array const& src, + int scomp, int dcomp, int ncomp, + IntVect const& ng_src = IntVect(0), + IntVect const& ng_dst = IntVect(0), + Periodicity const& period = Periodicity::NonPeriodic()) +{ + for (std::size_t i = 0; i < N; ++i) { + dst[i].ParallelCopy(src[i], scomp, dcomp, ncomp, ng_src, ng_dst, period); + } +} + +template , int> = 0> +[[nodiscard]] typename MF::value_type +norminf (Array const& mf, int scomp, int ncomp, IntVect const& nghost, + bool local = false) +{ + auto r = typename MF::value_type(0); + for (std::size_t i = 0; i < N; ++i) { + auto tmp = mf[i].norminf(scomp, ncomp, nghost, true); + r = std::max(r,tmp); + } + if (!local) { + ParallelAllReduce::Max(r, ParallelContext::CommunicatorSub()); + } + return r; +} + +template && (N > 0), + int> = 0> +[[nodiscard]] int nComp (Array const& mf) +{ + return mf[0].nComp(); +} + +template && (N > 0), + int> = 0> +[[nodiscard]] IntVect nGrowVect (Array const& mf) +{ + return mf[0].nGrowVect(); +} + +template && (N > 0), + int> = 0> +[[nodiscard]] BoxArray const& +boxArray (Array const& mf) +{ + return mf[0].boxArray(); +} + +template && (N > 0), + int> = 0> +[[nodiscard]] DistributionMapping const& +DistributionMap (Array const& mf) +{ + return mf[0].DistributionMap(); +} + } #endif diff --git a/Src/Base/AMReX_FabConv.H b/Src/Base/AMReX_FabConv.H index 78554000f25..25dae063de7 100644 --- a/Src/Base/AMReX_FabConv.H +++ b/Src/Base/AMReX_FabConv.H @@ -13,20 +13,18 @@ namespace amrex { -// -// A Descriptor of the Long Integer type - /** -* This class is meant to hold all information needed to completely -* describe the "int" or "Long" type on a machine. To describe an integer both -* the number of bytes and their ordering, relative to canonical -* ordering 1 .. sizeof(Long), needs to be specified. -* This allows us to write out integers in the native format on a machine, -* and then by also saving the IntDescriptor, we can read them back in on -* another machine and have enough information to construct the exact same -* values. -*/ - + * \brief A Descriptor of the Long Integer type + * + * This class is meant to hold all information needed to completely + * describe the "int" or "Long" type on a machine. To describe an integer both + * the number of bytes and their ordering, relative to canonical + * ordering 1 .. sizeof(Long), needs to be specified. + * This allows us to write out integers in the native format on a machine, + * and then by also saving the IntDescriptor, we can read them back in on + * another machine and have enough information to construct the exact same + * values. + */ class IntDescriptor { @@ -72,39 +70,37 @@ std::ostream& operator<< (std::ostream& os, const IntDescriptor& id); //! std::istream& operator>> (std::istream& is, IntDescriptor& id); - - //A Descriptor of the Real Type - /** -* \brief This class is meant to hold all information needed to completely -* describe the "Real" floating-point type on a machine. By "Real" here we -* mean either the "float" or "double" type that this version of AMReX -* was built with, which corresponds to whether BL_USE_FLOAT or -* BL_USE_DOUBLE was used to build the version of the library. -* -* To describe a "Real" type two arrays are needed: one detailing the ordering -* of the bytes in the Real, relative to the canonical ordering -* 1 .. sizeof(Real) and the other detailing the format of the floating-point -* number. 
-* -* The array detailing the format of a floating-point number is an eight-element -* array of longs containing the following information: -* -* format[0] = number of bits per number -* format[1] = number of bits in exponent -* format[2] = number of bits in mantissa -* format[3] = start bit of sign -* format[4] = start bit of exponent -* format[5] = start bit of mantissa -* format[6] = high order mantissa bit (CRAY needs this) -* format[7] = bias of exponent -* -* This allows us to write out "Real"s in the native format on a machine, -* and then by also saving the IntDescriptor, we can read them back in on -* another machine and have enough information to construct the exact same -* "Real" values, provided the Reals have the same size on the two machines. -*/ - + * \brief A Descriptor of the Real Type + * + * This class is meant to hold all information needed to completely + * describe the "Real" floating-point type on a machine. By "Real" here we + * mean either the "float" or "double" type that this version of AMReX + * was built with, which corresponds to whether BL_USE_FLOAT or + * BL_USE_DOUBLE was used to build the version of the library. + * + * To describe a "Real" type two arrays are needed: one detailing the ordering + * of the bytes in the Real, relative to the canonical ordering + * 1 .. sizeof(Real) and the other detailing the format of the floating-point + * number. + * + * The array detailing the format of a floating-point number is an eight-element + * array of longs containing the following information: + * + * format[0] = number of bits per number + * format[1] = number of bits in exponent + * format[2] = number of bits in mantissa + * format[3] = start bit of sign + * format[4] = start bit of exponent + * format[5] = start bit of mantissa + * format[6] = high order mantissa bit (CRAY needs this) + * format[7] = bias of exponent + * + * This allows us to write out "Real"s in the native format on a machine, + * and then by also saving the IntDescriptor, we can read them back in on + * another machine and have enough information to construct the exact same + * "Real" values, provided the Reals have the same size on the two machines. + */ class RealDescriptor { public: diff --git a/Src/Base/AMReX_FabConv.cpp b/Src/Base/AMReX_FabConv.cpp index 522216aadd1..e8dd870a058 100644 --- a/Src/Base/AMReX_FabConv.cpp +++ b/Src/Base/AMReX_FabConv.cpp @@ -170,7 +170,7 @@ RealDescriptor::clone () const // This exists solely to support reading "old" FABs. // -static +namespace { const int* selectOrdering (int prec, int ordering) @@ -208,6 +208,7 @@ selectOrdering (int prec, } return nullptr; } +} // // This is here solely to support reading "old" FABs. @@ -293,7 +294,7 @@ _pd_get_bit (char const* base, // for longer fields // -static +namespace { Long _pd_extract_field (char const* in, int offs, @@ -367,7 +368,6 @@ _pd_extract_field (char const* in, // template -static void _pd_btrvout (char* out, Long nitems) { @@ -387,8 +387,10 @@ _pd_btrvout (char* out, Long nitems) } } -const int BitsMax = 8*sizeof(Long); -const int REVERSE_ORDER = 2; +constexpr int BitsMax = 8*sizeof(Long); +constexpr int REVERSE_ORDER = 2; + +} // // Copy the least significant NB bits from the given Long into the byte array @@ -487,7 +489,7 @@ _pd_set_bit (char* base, int offs) // on input. // -static +namespace { void _pd_reorder (char* arr, Long nitems, @@ -512,7 +514,6 @@ _pd_reorder (char* arr, // from inord to outord. 
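The eight-slot `format` array that the rewritten RealDescriptor comment enumerates is easiest to read with concrete numbers. A sketch filled in for IEEE 754 binary64, i.e. `double` (the field values are the standard ones for that format, not copied from AMReX's internal tables, so treat the exact start-bit convention as an assumption):

    #include <array>
    using Long = long;

    // format[] for IEEE 754 binary64, following the field list above;
    // start bits assume the canonical big-endian bit numbering.
    constexpr std::array<Long,8> ieee_double_format = {
        64,   // format[0]: bits per number
        11,   // format[1]: bits in exponent
        52,   // format[2]: bits in mantissa
        0,    // format[3]: start bit of sign
        1,    // format[4]: start bit of exponent
        12,   // format[5]: start bit of mantissa
        0,    // format[6]: high-order mantissa bit is hidden, not stored
        1023, // format[7]: bias of exponent
    };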
// -static void permute_real_word_order (void* out, const void* in, @@ -786,7 +787,6 @@ PD_fconvert (void* out, _pd_reorder((char*)out, nitems, outbytes, outord); } -static void PD_fixdenormals (void* out, Long nitems, @@ -816,13 +816,15 @@ PD_fixdenormals (void* out, } } +} + // // It's really sad that I need to do this ... // #undef GETARRAY +namespace { #define GETARRAY(TYPE) \ -static \ void \ getarray (std::istream& is, \ Vector< TYPE >& ar) \ @@ -852,11 +854,12 @@ getarray (std::istream& is, \ } GETARRAY(int) GETARRAY(Long) +} #undef GETARRAY #undef PUTARRAY +namespace { #define PUTARRAY(TYPE) \ -static \ void \ putarray (std::ostream& os, \ const Vector< TYPE >& ar) \ @@ -874,6 +877,7 @@ putarray (std::ostream& os, \ } PUTARRAY(int) PUTARRAY(Long) +} #undef PUTARRAY std::ostream& @@ -917,7 +921,7 @@ operator>> (std::istream& is, return is; } -static +namespace { void PD_convert (void* out, const void* in, @@ -960,6 +964,7 @@ PD_convert (void* out, PD_fixdenormals(out, nitems, ord.format(), ord.order()); } } +} // // Convert nitems in RealDescriptor format to native Real format. diff --git a/Src/Base/AMReX_FabDataType.H b/Src/Base/AMReX_FabDataType.H new file mode 100644 index 00000000000..81537ae8065 --- /dev/null +++ b/Src/Base/AMReX_FabDataType.H @@ -0,0 +1,27 @@ +#ifndef AMREX_FAB_DATA_TYPE_H_ +#define AMREX_FAB_DATA_TYPE_H_ +#include + +#include + +namespace amrex { + +template struct FabDataType {}; +// +template +struct FabDataType > > +{ + using fab_type = typename T::fab_type; + using value_type = typename T::value_type; +}; + +template +struct FabDataType > > +{ + using fab_type = typename T::value_type::fab_type; + using value_type = typename T::value_type::value_type; +}; + +} + +#endif diff --git a/Src/Base/AMReX_FilCC_1D_C.H b/Src/Base/AMReX_FilCC_1D_C.H index 5f21541432c..4021e4b52a0 100644 --- a/Src/Base/AMReX_FilCC_1D_C.H +++ b/Src/Base/AMReX_FilCC_1D_C.H @@ -69,6 +69,7 @@ struct FilccCell q(i,0,0,n) = -q(2*ilo-i-1,0,0,n); break; } + default: { break; } } } else if (i > ihi) @@ -111,6 +112,7 @@ struct FilccCell q(i,0,0,n) = -q(2*ihi-i+1,0,0,n); break; } + default: { break; } } } } diff --git a/Src/Base/AMReX_FilCC_2D_C.H b/Src/Base/AMReX_FilCC_2D_C.H index 79f65156d48..80b92929724 100644 --- a/Src/Base/AMReX_FilCC_2D_C.H +++ b/Src/Base/AMReX_FilCC_2D_C.H @@ -74,6 +74,7 @@ struct FilccCell q(i,j,0,n) = -q(2*ilo-i-1,j,0,n); break; } + default: { break; } } } else if (i > ihi) @@ -116,6 +117,7 @@ struct FilccCell q(i,j,0,n) = -q(2*ihi-i+1,j,0,n); break; } + default: { break; } } } @@ -159,6 +161,7 @@ struct FilccCell q(i,j,0,n) = -q(i,2*jlo-j-1,0,n); break; } + default: { break; } } } else if (j > jhi) @@ -201,6 +204,7 @@ struct FilccCell q(i,j,0,n) = -q(i,2*jhi-j+1,0,n); break; } + default: { break; } } } } diff --git a/Src/Base/AMReX_FilCC_3D_C.H b/Src/Base/AMReX_FilCC_3D_C.H index 6a2e3dccfcf..f311045c0a1 100644 --- a/Src/Base/AMReX_FilCC_3D_C.H +++ b/Src/Base/AMReX_FilCC_3D_C.H @@ -73,6 +73,7 @@ struct FilccCell q(i,j,k,n) = -q(2*ilo-i-1,j,k,n); break; } + default: { break; } } } else if (i > ihi) @@ -115,6 +116,7 @@ struct FilccCell q(i,j,k,n) = -q(2*ihi-i+1,j,k,n); break; } + default: { break; } } } @@ -158,6 +160,7 @@ struct FilccCell q(i,j,k,n) = -q(i,2*jlo-j-1,k,n); break; } + default: { break; } } } else if (j > jhi) @@ -200,6 +203,7 @@ struct FilccCell q(i,j,k,n) = -q(i,2*jhi-j+1,k,n); break; } + default: { break; } } } @@ -243,6 +247,7 @@ struct FilccCell q(i,j,k,n) = -q(i,j,2*klo-k-1,n); break; } + default: { break; } } } else if (k > khi) @@ -285,6 +290,7 
@@ struct FilccCell q(i,j,k,n) = -q(i,j,2*khi-k+1,n); break; } + default: { break; } } } } diff --git a/Src/Base/AMReX_FilCC_C.cpp b/Src/Base/AMReX_FilCC_C.cpp index 7cdba486957..e2d8c6129fd 100644 --- a/Src/Base/AMReX_FilCC_C.cpp +++ b/Src/Base/AMReX_FilCC_C.cpp @@ -41,7 +41,7 @@ void fab_filcc (Box const& bx, Array4 const& qn, int ncomp, if (lo.x < ilo) { const int imin = lo.x; const int imax = ilo-1; - if (bc.lo(0) == BCType::ext_dir) { + if (bc.lo(0) == BCType::ext_dir || bc.lo(0) == BCType::ext_dir_cc) { // Do nothing. } else if (bc.lo(0) == BCType::foextrap) { for (int k = lo.z; k <= hi.z; ++k) { @@ -88,7 +88,7 @@ void fab_filcc (Box const& bx, Array4 const& qn, int ncomp, const int imin = ihi+1; const int imax = hi.x; - if (bc.hi(0) == BCType::ext_dir) { + if (bc.hi(0) == BCType::ext_dir || bc.hi(0) == BCType::ext_dir_cc) { // Do nothing. } else if (bc.hi(0) == BCType::foextrap) { for (int k = lo.z; k <= hi.z; ++k) { @@ -136,7 +136,7 @@ void fab_filcc (Box const& bx, Array4 const& qn, int ncomp, if (lo.y < jlo) { const int jmin = lo.y; const int jmax = jlo-1; - if (bc.lo(1) == BCType::ext_dir) { + if (bc.lo(1) == BCType::ext_dir || bc.lo(1) == BCType::ext_dir_cc) { // Do nothing. } else if (bc.lo(1) == BCType::foextrap) { for (int k = lo.z; k <= hi.z; ++k) { @@ -182,7 +182,7 @@ void fab_filcc (Box const& bx, Array4 const& qn, int ncomp, if (hi.y > jhi) { const int jmin = jhi+1; const int jmax = hi.y; - if (bc.hi(1) == BCType::ext_dir) { + if (bc.hi(1) == BCType::ext_dir || bc.hi(1) == BCType::ext_dir_cc) { // Do nothing. } else if (bc.hi(1) == BCType::foextrap) { for (int k = lo.z; k <= hi.z; ++k) { @@ -231,7 +231,7 @@ void fab_filcc (Box const& bx, Array4 const& qn, int ncomp, if (lo.z < klo) { const int kmin = lo.z; const int kmax = klo-1; - if (bc.lo(2) == BCType::ext_dir) { + if (bc.lo(2) == BCType::ext_dir || bc.lo(2) == BCType::ext_dir_cc) { // Do nothing. } else if (bc.lo(2) == BCType::foextrap) { for (int k = kmin; k <= kmax; ++k) { @@ -277,7 +277,7 @@ void fab_filcc (Box const& bx, Array4 const& qn, int ncomp, if (hi.z > khi) { const int kmin = khi+1; const int kmax = hi.z; - if (bc.hi(2) == BCType::ext_dir) { + if (bc.hi(2) == BCType::ext_dir || bc.hi(2) == BCType::ext_dir_cc) { // Do nothing. 
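The repeated one-line change in `fab_filcc` widens the "leave these ghost cells alone" guard from `ext_dir` to also cover `ext_dir_cc`, the cell-centered flavor of external Dirichlet. Both mean a user-supplied fill functor provides the boundary values; they differ only in where the supplied value is understood to live (on the domain face vs. at the first cell center outside the domain). A small sketch of marking such a boundary (hedged: the precise semantics of `ext_dir_cc` are as documented in AMReX_BC_TYPES.H):

    #include <AMReX_BCRec.H>
    #include <AMReX_BC_TYPES.H>

    // Mark the low-x boundary as cell-centered external Dirichlet and
    // the high-x boundary as first-order extrapolation; fab_filcc now
    // skips the low side for both ext_dir and ext_dir_cc.
    amrex::BCRec make_bc ()
    {
        amrex::BCRec bc;
        bc.setLo(0, amrex::BCType::ext_dir_cc);
        bc.setHi(0, amrex::BCType::foextrap);
        return bc;
    }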
} else if (bc.hi(2) == BCType::foextrap) { for (int k = kmin; k <= kmax; ++k) { diff --git a/Src/Base/AMReX_FilFC_1D_C.H b/Src/Base/AMReX_FilFC_1D_C.H index 43e3a1d41d0..31fe69f7fa0 100644 --- a/Src/Base/AMReX_FilFC_1D_C.H +++ b/Src/Base/AMReX_FilFC_1D_C.H @@ -66,6 +66,7 @@ struct FilfcFace q(i,0,0,n) = -q(2*ilo-i,0,0,n); break; } + default: { break; } } } else if (i == ihi) @@ -106,6 +107,7 @@ struct FilfcFace q(i,0,0,n) = -q(2*ihi-i,0,0,n); break; } + default: { break; } } } } diff --git a/Src/Base/AMReX_FilFC_2D_C.H b/Src/Base/AMReX_FilFC_2D_C.H index 0185386392a..df76a2de23b 100644 --- a/Src/Base/AMReX_FilFC_2D_C.H +++ b/Src/Base/AMReX_FilFC_2D_C.H @@ -74,6 +74,7 @@ struct FilfcFace : -q(2*ilo-i-1,j,0,n); break; } + default: { break; } } } else if (i == ihi) @@ -118,6 +119,7 @@ struct FilfcFace : -q(2*ihi-i+1,j,0,n); break; } + default: { break; } } } @@ -163,6 +165,7 @@ struct FilfcFace : -q(i,2*jlo-j-1,0,n); break; } + default: { break; } } } else if (j == jhi) @@ -207,6 +210,7 @@ struct FilfcFace : -q(i,2*jhi-j+1,0,n); break; } + default: { break; } } } } diff --git a/Src/Base/AMReX_FilFC_3D_C.H b/Src/Base/AMReX_FilFC_3D_C.H index 76c49fdc86a..1628131caae 100644 --- a/Src/Base/AMReX_FilFC_3D_C.H +++ b/Src/Base/AMReX_FilFC_3D_C.H @@ -77,6 +77,7 @@ struct FilfcFace : -q(2*ilo-i-1,j,k,n); break; } + default: { break; } } } if (i == ihi) @@ -121,6 +122,7 @@ struct FilfcFace : -q(2*ihi-i+1,j,k,n); break; } + default: { break; } } } @@ -166,6 +168,7 @@ struct FilfcFace : -q(i,2*jlo-j-1,k,n); break; } + default: { break; } } } else if (j == jhi) @@ -210,6 +213,7 @@ struct FilfcFace : -q(i,2*jhi-j+1,k,n); break; } + default: { break; } } } @@ -255,6 +259,7 @@ struct FilfcFace : -q(i,j,2*klo-k-1,n); break; } + default: { break; } } } if (k == khi) @@ -299,6 +304,7 @@ struct FilfcFace : -q(i,j,2*khi-k+1,n); break; } + default: { break; } } } } diff --git a/Src/Base/AMReX_FileSystem.cpp b/Src/Base/AMReX_FileSystem.cpp index 283844427fe..5ba4cb5b585 100644 --- a/Src/Base/AMReX_FileSystem.cpp +++ b/Src/Base/AMReX_FileSystem.cpp @@ -154,11 +154,11 @@ CreateDirectories (std::string const& path, mode_t mode, bool verbose) } if(retVal == false || verbose == true) { - for(int i(0); i < pathError.size(); ++i) { + for(auto & i : pathError) { amrex::AllPrint()<< "amrex::UtilCreateDirectory:: path errno: " - << pathError[i].first << " :: " - << strerror(pathError[i].second) - << std::endl; + << i.first << " :: " + << strerror(i.second) + << '\n'; } } diff --git a/Src/Base/AMReX_ForkJoin.cpp b/Src/Base/AMReX_ForkJoin.cpp index 2154ed90719..f0cc03ae04b 100644 --- a/Src/Base/AMReX_ForkJoin.cpp +++ b/Src/Base/AMReX_ForkJoin.cpp @@ -201,14 +201,14 @@ ForkJoin::copy_data_to_tasks () if (forked.size() <= i) { if (flag_verbose) { amrex::Print() << " Creating forked " << mf_name << "[" << idx << "] for task " << i - << (mff.strategy == Strategy::split ? " (split)" : " (whole)") << std::endl; + << (mff.strategy == Strategy::split ? 
" (split)" : " (whole)") << '\n'; } // look up the distribution mapping for this (box array, task) pair const DistributionMapping &dm = get_dm(ba, i, orig.DistributionMap()); forked.emplace_back(ba, dm, task_comp_n, mff.ngrow); } else if (flag_verbose) { amrex::Print() << " Forked " << mf_name << "[" << idx << "] for task " << i - << " already created" << std::endl; + << " already created" << '\n'; } AMREX_ASSERT(i < forked.size()); @@ -216,7 +216,7 @@ ForkJoin::copy_data_to_tasks () if (mff.intent == Intent::in || mff.intent == Intent::inout) { if (flag_verbose) { amrex::Print() << " Copying " << mf_name << "[" << idx << "] components [" - << comp_split[i].lo << ", " << comp_split[i].hi << ") into to task " << i << std::endl; + << comp_split[i].lo << ", " << comp_split[i].hi << ") into to task " << i << '\n'; } // parallel copy data into forked MF forked[i].Redistribute(orig, comp_split[i].lo, 0, task_comp_n, mff.ngrow); @@ -257,7 +257,7 @@ ForkJoin::copy_data_from_tasks () for (int i = 0; i < NTasks(); ++i) { if (flag_verbose) { amrex::Print() << " Copying " << mf_name << "[" << idx << "] components [" - << comp_split[i].lo << ", " << comp_split[i].hi << ") out from task " << i << " (unsplit)" << std::endl; + << comp_split[i].lo << ", " << comp_split[i].hi << ") out from task " << i << " (unsplit)" << '\n'; } int task_comp_n = comp_split[i].hi - comp_split[i].lo; AMREX_ASSERT(forked[i].nComp() == task_comp_n); @@ -266,7 +266,7 @@ ForkJoin::copy_data_from_tasks () } else { // mff.strategy == single or duplicate // copy all components from owner_task if (flag_verbose) { - amrex::Print() << "Copying " << mf_name << " out from task " << mff.owner_task << " (whole)" << std::endl; + amrex::Print() << "Copying " << mf_name << " out from task " << mff.owner_task << " (whole)" << '\n'; } AMREX_ASSERT(forked[mff.owner_task].nComp() == orig.nComp()); orig.Redistribute(forked[mff.owner_task], 0, 0, orig.nComp(), mff.ngrow); @@ -307,13 +307,13 @@ ForkJoin::get_dm (const BoxArray& ba, int task_idx, const DistributionMapping& d if (flag_verbose) { amrex::Print() << " Creating DM for (box array, task id) = (" - << ba.getRefID() << ", " << task_idx << ")" << std::endl; + << ba.getRefID() << ", " << task_idx << ")" << '\n'; } } else { // DM has already been created if (flag_verbose) { amrex::Print() << " DM for (box array, task id) = (" << ba.getRefID() << ", " << task_idx - << ") already created" << std::endl; + << ") already created" << '\n'; } } AMREX_ASSERT(dm_vec[task_idx] != nullptr); @@ -350,7 +350,7 @@ void ForkJoin::create_task_output_dir () { if (!task_output_dir.empty() && !amrex::FileExists(task_output_dir)) { if (flag_verbose) { - Print() << "Creating task_output_dir: " << task_output_dir << std::endl; + Print() << "Creating task_output_dir: " << task_output_dir << '\n'; } if (ParallelContext::IOProcessorSub()) { if (! 
amrex::UtilCreateDirectory(task_output_dir, 0755, flag_verbose)) { diff --git a/Src/Base/AMReX_Functional.H b/Src/Base/AMReX_Functional.H index bed6e7a4267..0098365db4a 100644 --- a/Src/Base/AMReX_Functional.H +++ b/Src/Base/AMReX_Functional.H @@ -63,6 +63,24 @@ struct LogicalOr } }; +template +struct Multiplies +{ + constexpr T operator() (const T & lhs, const T & rhs) const + { + return lhs * rhs; + } +}; + +template +struct Divides +{ + constexpr T operator() (const T & lhs, const T & rhs) const + { + return lhs / rhs; + } +}; + } #endif diff --git a/Src/Base/AMReX_Geometry.H b/Src/Base/AMReX_Geometry.H index 4017273151a..550b42f2f6a 100644 --- a/Src/Base/AMReX_Geometry.H +++ b/Src/Base/AMReX_Geometry.H @@ -16,14 +16,6 @@ #include namespace amrex { -/** -* \class Geometry -* \brief Rectangular problem domain geometry. -* -* This class describes problem domain and coordinate system for -* RECTANGULAR problem domains. Since the problem domain is RECTANGULAR, -* periodicity is meaningful. -*/ class MultiFab; class DistributionMapping; @@ -67,6 +59,14 @@ public: int coord; }; +/** + * \class Geometry + * \brief Rectangular problem domain geometry. + * + * This class describes problem domain and coordinate system for + * RECTANGULAR problem domains. Since the problem domain is RECTANGULAR, + * periodicity is meaningful. + */ class Geometry : public CoordSys diff --git a/Src/Base/AMReX_GpuAllocators.H b/Src/Base/AMReX_GpuAllocators.H index e3f76bee216..de2fc0ad6bc 100644 --- a/Src/Base/AMReX_GpuAllocators.H +++ b/Src/Base/AMReX_GpuAllocators.H @@ -162,10 +162,10 @@ namespace amrex { // template struct IsArenaAllocator - , - T>::value>> + T>>> : std::true_type {}; template diff --git a/Src/Base/AMReX_GpuAsyncArray.H b/Src/Base/AMReX_GpuAsyncArray.H index 13a1642d690..ab43496093a 100644 --- a/Src/Base/AMReX_GpuAsyncArray.H +++ b/Src/Base/AMReX_GpuAsyncArray.H @@ -24,7 +24,7 @@ extern "C" { namespace amrex { namespace Gpu { -template ::value,int> = 0> +template ,int> = 0> class AsyncArray { public: @@ -43,7 +43,7 @@ public: #endif } - template ::value && std::is_trivial::value,int>::type = 0> + template && std::is_trivial_v,int> = 0> explicit AsyncArray (const std::size_t n) { if (n == 0) { return; } diff --git a/Src/Base/AMReX_GpuAtomic.H b/Src/Base/AMReX_GpuAtomic.H index 55fc351156a..0a056ab5c38 100644 --- a/Src/Base/AMReX_GpuAtomic.H +++ b/Src/Base/AMReX_GpuAtomic.H @@ -13,11 +13,13 @@ namespace amrex { namespace Gpu::Atomic { // For Add, Min and Max, we support int, unsigned int, long, unsigned long long, float and double. +// For Multiply and Divide, we support generic types provided they are the same size as int or unsigned long long +// and have *= and /= operators. // For LogicalOr and LogicalAnd, the data type is int. // For Exch and CAS, the data type is generic. // All these functions are non-atomic in host code!!! // If one needs them to be atomic in host code, use HostDevice::Atomic::*. Currently only -// HostDevice::Atomic is supported. We could certainly add more. +// HostDevice::Atomic::Add is supported. We could certainly add more. 
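The updated header comment pins down what the new `Multiply` and `Divide` atomics require: a type whose size matches `int` or `unsigned long long` and that supports `*=` and `/=`. They are built on a compare-and-swap loop over the same-sized integer type, which is why only those two widths are allowed; `float` and `double` both qualify. A usage sketch against the public entry points this patch adds:

    #include <AMReX_GpuAtomic.H>

    // Atomically scale a value from many GPU threads; per the comment
    // above, the same call degenerates to a plain *= in host code.
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void scale_atomic (double* p, double s) noexcept
    {
        amrex::Gpu::Atomic::Multiply(p, s); // CAS loop on device
    }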
namespace detail { @@ -132,17 +134,17 @@ namespace detail { AMREX_GPU_DEVICE AMREX_FORCE_INLINE T Add_device (T* const sum, T const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicAdd(sum, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; sycl::atomic_ref a{*sum}; return a.fetch_add(value); #else - amrex::ignore_unused(sum, value); - return T(); // should never get here, but have to return something + AMREX_IF_ON_DEVICE(( return atomicAdd(sum, value); )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(sum, value); + return T(); // should never get here, but have to return something + )) #endif } @@ -175,7 +177,7 @@ namespace detail { #endif -#if defined(AMREX_USE_CUDA) && (__CUDA_ARCH__ < 600) +#if defined(AMREX_USE_CUDA) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600) AMREX_GPU_DEVICE AMREX_FORCE_INLINE double Add_device (double* const sum, double const value) noexcept @@ -195,17 +197,16 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Add (T* sum, T value) noexcept { -#if AMREX_DEVICE_COMPILE #ifdef AMREX_USE_SYCL - return Add_device(sum, value); -#else - return Add_device(sum, value); -#endif + AMREX_IF_ON_DEVICE((return Add_device(sum, value);)) #else - auto old = *sum; - *sum += value; - return old; + AMREX_IF_ON_DEVICE((return Add_device(sum, value);)) #endif + AMREX_IF_ON_HOST(( + auto old = *sum; + *sum += value; + return old; + )) } //////////////////////////////////////////////////////////////////////// @@ -252,18 +253,19 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool If (T* const add, T const value, Op&& op, Cond&& cond) noexcept { -#if AMREX_DEVICE_COMPILE - return If_device(add, value, std::forward(op), std::forward(cond)); -#else - T old = *add; - T const tmp = op(old, value); - if (cond(tmp)) { - *add = tmp; - return true; - } else { - return false; - } -#endif + AMREX_IF_ON_DEVICE(( + return If_device(add, value, std::forward(op), std::forward(cond)); + )) + AMREX_IF_ON_HOST(( + T old = *add; + T const tmp = std::forward(op)(old, value); + if (std::forward(cond)(tmp)) { + *add = tmp; + return true; + } else { + return false; + } + )) } //////////////////////////////////////////////////////////////////////// @@ -278,14 +280,11 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void AddNoRet (T* sum, T value) noexcept { -#if AMREX_DEVICE_COMPILE -#ifdef AMREX_USE_SYCL +#if defined(__SYCL_DEVICE_ONLY__) Add_device(sum, value); #else - Add_device(sum, value); -#endif -#else - *sum += value; + AMREX_IF_ON_DEVICE((Add_device(sum, value);)) + AMREX_IF_ON_HOST((*sum += value;)) #endif } @@ -293,14 +292,11 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void AddNoRet (float* const sum, float const value) noexcept { -#if AMREX_DEVICE_COMPILE #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdeprecated-declarations" - atomicAddNoRet(sum, value); + AMREX_IF_ON_DEVICE((atomicAddNoRet(sum, value);)) #pragma clang diagnostic pop -#else - *sum += value; -#endif + AMREX_IF_ON_HOST((*sum += value;)) } #endif @@ -314,18 +310,18 @@ namespace detail { AMREX_GPU_DEVICE AMREX_FORCE_INLINE T Min_device (T* const m, T const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicMin(m, value); 
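The recurring rewrite in these hunks replaces per-backend `#if defined(__CUDA_ARCH__) ... #elif ...` ladders with the `AMREX_IF_ON_DEVICE` / `AMREX_IF_ON_HOST` macro pair, which expands exactly one of the two double-parenthesized bodies depending on which compilation pass is active. The shape every converted atomic now follows, sketched on its own (`fetch_min` is a stand-in name; `Min_device` is the device helper from the hunk above, and only the SYCL path still needs a real preprocessor branch):

    // One definition serves both host and device compilation passes.
    template <class T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    T fetch_min (T* m, T value) noexcept
    {
        AMREX_IF_ON_DEVICE(( return Min_device(m, value); ))
        AMREX_IF_ON_HOST((
            auto const old = *m;
            *m = (*m < value) ? *m : value;
            return old;
        ))
    }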
-#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_min(value); #else - amrex::ignore_unused(m,value); - return T(); // should never get here, but have to return something + AMREX_IF_ON_DEVICE(( return atomicMin(m, value); )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(m,value); + return T(); // should never get here, but have to return something + )) #endif } @@ -357,13 +353,14 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Min (T* const m, T const value) noexcept { -#if AMREX_DEVICE_COMPILE - return Min_device(m, value); -#else - auto const old = *m; - *m = (*m) < value ? (*m) : value; - return old; -#endif + AMREX_IF_ON_DEVICE(( + return Min_device(m, value); + )) + AMREX_IF_ON_HOST(( + auto const old = *m; + *m = (*m) < value ? (*m) : value; + return old; + )) } //////////////////////////////////////////////////////////////////////// @@ -376,18 +373,18 @@ namespace detail { AMREX_GPU_DEVICE AMREX_FORCE_INLINE T Max_device (T* const m, T const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicMax(m, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_max(value); #else - amrex::ignore_unused(m,value); - return T(); // should never get here, but have to return something + AMREX_IF_ON_DEVICE(( return atomicMax(m, value); )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(m,value); + return T(); // should never get here, but have to return something + )) #endif } @@ -419,13 +416,14 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Max (T* const m, T const value) noexcept { -#if AMREX_DEVICE_COMPILE - return Max_device(m, value); -#else - auto const old = *m; - *m = (*m) > value ? (*m) : value; - return old; -#endif + AMREX_IF_ON_DEVICE(( + return Max_device(m, value); + )) + AMREX_IF_ON_HOST(( + auto const old = *m; + *m = (*m) > value ? (*m) : value; + return old; + )) } //////////////////////////////////////////////////////////////////////// @@ -435,19 +433,21 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int LogicalOr (int* const m, int const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicOr(m, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_or(value); #else - int const old = *m; - *m = (*m) || value; - return old; + AMREX_IF_ON_DEVICE(( + return atomicOr(m, value); + )) + AMREX_IF_ON_HOST(( + int const old = *m; + *m = (*m) || value; + return old; + )) #endif } @@ -458,19 +458,21 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int LogicalAnd (int* const m, int const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicAnd(m, value ? 
~0x0 : 0); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_and(value ? ~0x0 : 0); #else - int const old = *m; - *m = (*m) && value; - return old; + AMREX_IF_ON_DEVICE(( + return atomicAnd(m, value ? ~0x0 : 0); + )) + AMREX_IF_ON_HOST(( + int const old = *m; + *m = (*m) && value; + return old; + )) #endif } @@ -482,19 +484,21 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Exch (T* address, T val) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicExch(address, val); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*address}; return a.exchange(val); #else - auto const old = *address; - *address = val; - return old; + AMREX_IF_ON_DEVICE(( + return atomicExch(address, val); + )) + AMREX_IF_ON_HOST(( + auto const old = *address; + *address = val; + return old; + )) #endif } @@ -506,10 +510,7 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T CAS (T* const address, T compare, T const val) noexcept { // cannot be T const compare because of compare_exchange_strong -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicCAS(address, compare, val); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; @@ -517,27 +518,108 @@ namespace detail { a.compare_exchange_strong(compare, val); return compare; #else - auto const old = *address; - *address = (old == compare ? val : old); - return old; + AMREX_IF_ON_DEVICE(( + return atomicCAS(address, compare, val); + )) + AMREX_IF_ON_HOST(( + auto const old = *address; + *address = (old == compare ? 
val : old); + return old; + )) #endif } + +//////////////////////////////////////////////////////////////////////// +// Multiply +//////////////////////////////////////////////////////////////////////// + +#ifdef AMREX_USE_GPU + + template = 0> + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + T Multiply_device (T* const prod, T const value) noexcept + { + return detail::atomic_op(prod,value,amrex::Multiplies()); + } + + template = 0> + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + T Multiply_device (T* const prod, T const value) noexcept + { + return detail::atomic_op(prod,value,amrex::Multiplies()); + } + +#endif + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T Multiply (T* const prod, T const value) noexcept + { + AMREX_IF_ON_DEVICE(( + return Multiply_device(prod, value); + )) + AMREX_IF_ON_HOST(( + auto const old = *prod; + *prod *= value; + return old; + )) + } + +//////////////////////////////////////////////////////////////////////// +// Divide +//////////////////////////////////////////////////////////////////////// + +#ifdef AMREX_USE_GPU + + template = 0> + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + T Divide_device (T* const quot, T const value) noexcept + { + return detail::atomic_op(quot,value,amrex::Divides()); + } + + template = 0> + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + T Divide_device (T* const quot, T const value) noexcept + { + return detail::atomic_op(quot,value,amrex::Divides()); + } + +#endif + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T Divide (T* const quot, T const value) noexcept + { + AMREX_IF_ON_DEVICE(( + return Divide_device(quot, value); + )) + AMREX_IF_ON_HOST(( + auto const old = *quot; + *quot /= value; + return old; + )) + } } namespace HostDevice::Atomic { template - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - void Add (T* const sum, T const value) noexcept + AMREX_FORCE_INLINE + void Add_Host (T* const sum, T const value) noexcept { -#if AMREX_DEVICE_COMPILE - Gpu::Atomic::AddNoRet(sum,value); -#else #ifdef AMREX_USE_OMP #pragma omp atomic update #endif *sum += value; -#endif + } + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + void Add (T* const sum, T const value) noexcept + { + AMREX_IF_ON_DEVICE((Gpu::Atomic::AddNoRet(sum,value);)) + AMREX_IF_ON_HOST((Add_Host(sum,value);)) } } diff --git a/Src/Base/AMReX_GpuBuffer.H b/Src/Base/AMReX_GpuBuffer.H index a52dc04785b..f930cfa321e 100644 --- a/Src/Base/AMReX_GpuBuffer.H +++ b/Src/Base/AMReX_GpuBuffer.H @@ -12,7 +12,7 @@ namespace amrex::Gpu { -template ::value,int> = 0> +template ,int> = 0> class Buffer { public: diff --git a/Src/Base/AMReX_GpuComplex.H b/Src/Base/AMReX_GpuComplex.H index 205788375f4..274da82604d 100644 --- a/Src/Base/AMReX_GpuComplex.H +++ b/Src/Base/AMReX_GpuComplex.H @@ -20,9 +20,12 @@ T norm (const GpuComplex& a_z) noexcept; * work in device code with Cuda yet. * * Should be bit-wise compatible with std::complex. + * + * GpuComplex is aligned to its size (stricter than std::complex) to allow for + * coalesced memory accesses with nvidia GPUs. 
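 *
 * An illustrative consequence of the alignas below (hedged: the
 * alignment of std::complex is implementation-defined, commonly 8
 * for double on mainstream ABIs):
 *
 *     static_assert(alignof(amrex::GpuComplex<double>) == 16);
 *     static_assert(sizeof(amrex::GpuComplex<double>)  == 16);
 *
 * so a warp reading consecutive GpuComplex<double> values can issue
 * naturally aligned 16-byte transactions instead of split 8-byte ones.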
*/ template -struct GpuComplex +struct alignas(2*sizeof(T)) GpuComplex { using value_type = T; diff --git a/Src/Base/AMReX_GpuContainers.H b/Src/Base/AMReX_GpuContainers.H index 05399b2e047..012941b0055 100644 --- a/Src/Base/AMReX_GpuContainers.H +++ b/Src/Base/AMReX_GpuContainers.H @@ -5,7 +5,6 @@ #include #include #include -#include #include #include @@ -408,7 +407,7 @@ namespace amrex::Gpu { std::is_trivially_copyable_v && amrex::IsCallable::value, int> FOO = 0> - void fillAsync (IT first, IT last, F&& f) noexcept + void fillAsync (IT first, IT last, F const& f) noexcept { auto N = static_cast(std::distance(first, last)); if (N <= 0) { return; } diff --git a/Src/Base/AMReX_GpuControl.H b/Src/Base/AMReX_GpuControl.H index 0f21213a86d..1532045bec4 100644 --- a/Src/Base/AMReX_GpuControl.H +++ b/Src/Base/AMReX_GpuControl.H @@ -184,13 +184,13 @@ namespace Gpu { #else - [[nodiscard]] inline static constexpr bool inLaunchRegion () { return false; } - [[nodiscard]] inline static constexpr bool notInLaunchRegion () { return true; } - [[nodiscard]] inline static constexpr bool setLaunchRegion (bool) { return false; } + [[nodiscard]] inline constexpr bool inLaunchRegion () { return false; } + [[nodiscard]] inline constexpr bool notInLaunchRegion () { return true; } + [[nodiscard]] inline constexpr bool setLaunchRegion (bool) { return false; } - [[nodiscard]] inline static constexpr bool inGraphRegion () { return false; } - [[nodiscard]] inline static constexpr bool notInGraphRegion () { return true; } - [[nodiscard]] inline static constexpr bool setGraphRegion (bool) { return false; } + [[nodiscard]] inline constexpr bool inGraphRegion () { return false; } + [[nodiscard]] inline constexpr bool notInGraphRegion () { return true; } + [[nodiscard]] inline constexpr bool setGraphRegion (bool) { return false; } struct [[nodiscard]] LaunchSafeGuard { @@ -202,10 +202,10 @@ namespace Gpu { explicit GraphSafeGuard (bool) {} }; - [[nodiscard]] inline static constexpr bool inSingleStreamRegion () { return false; } - [[nodiscard]] inline static constexpr bool inNoSyncRegion () { return true; } - [[nodiscard]] inline static constexpr bool setSingleStreamRegion (bool) { return false; } - [[nodiscard]] inline static constexpr bool setNoSyncRegion (bool) { return true; } + [[nodiscard]] inline constexpr bool inSingleStreamRegion () { return false; } + [[nodiscard]] inline constexpr bool inNoSyncRegion () { return true; } + [[nodiscard]] inline constexpr bool setSingleStreamRegion (bool) { return false; } + [[nodiscard]] inline constexpr bool setNoSyncRegion (bool) { return true; } struct [[nodiscard]] SingleStreamRegion {}; struct [[nodiscard]] NoSyncRegion {}; diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp index df3625d13ce..d699a20a2b5 100644 --- a/Src/Base/AMReX_GpuDevice.cpp +++ b/Src/Base/AMReX_GpuDevice.cpp @@ -354,6 +354,8 @@ Device::initialize_gpu () AMREX_HIP_SAFE_CALL(hipGetDeviceProperties(&device_prop, device_id)); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(warp_size == device_prop.warpSize, "Incorrect warp size"); + // check compute capability // AMD devices do not support shared cache banking. 
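The new assertion in `Device::initialize_gpu` turns a silent miscompilation into a hard failure: AMReX bakes `warp_size` in at compile time (64 for the wave64 CDNA targets it is usually built for, 32 on NVIDIA), and warp-level primitives are wrong, not merely slow, if the device actually runs a different width, e.g. a wave32 RDNA consumer card. Condensed from the hunk above (the surrounding device setup is omitted):

    hipDeviceProp_t device_prop;
    AMREX_HIP_SAFE_CALL(hipGetDeviceProperties(&device_prop, device_id));
    // Fail fast if the binary's compiled-in warp width does not match
    // the hardware, rather than corrupting warp-synchronous reductions.
    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(warp_size == device_prop.warpSize,
                                     "Incorrect warp size");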
@@ -372,11 +374,13 @@ Device::initialize_gpu () cudaDeviceGetAttribute(&memory_pools_supported, cudaDevAttrMemoryPoolsSupported, device_id); #endif +#if (__CUDACC_VER_MAJOR__ < 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ < 4)) if (sizeof(Real) == 8) { AMREX_CUDA_SAFE_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); } else if (sizeof(Real) == 4) { AMREX_CUDA_SAFE_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); } +#endif for (int i = 0; i < max_gpu_streams; ++i) { AMREX_CUDA_SAFE_CALL(cudaStreamCreate(&gpu_stream_pool[i])); diff --git a/Src/Base/AMReX_GpuElixir.H b/Src/Base/AMReX_GpuElixir.H index c61f5678a2b..4c006e18c53 100644 --- a/Src/Base/AMReX_GpuElixir.H +++ b/Src/Base/AMReX_GpuElixir.H @@ -34,7 +34,7 @@ public: return *this; } - void append (Elixir && rhs) noexcept + void append (Elixir rhs) noexcept { m_pa.insert(m_pa.end(), rhs.m_pa.begin(), rhs.m_pa.end()); rhs.m_pa.clear(); diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index c1870d2ef58..435a11f342b 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -61,11 +62,11 @@ namespace amrex { // CPU variation template - void launch_host (L&& f0) noexcept { f0(); } + void launch_host (L&& f0) noexcept { std::forward(f0)(); } template void launch_host (L&& f0, Lambdas&&... fs) noexcept { - f0(); + std::forward(f0)(); launch_host(std::forward(fs)...); } @@ -76,11 +77,11 @@ namespace amrex { namespace Gpu { #ifdef AMREX_USE_GPU - static constexpr std::size_t numThreadsPerBlockParallelFor () { + inline constexpr std::size_t numThreadsPerBlockParallelFor () { return AMREX_GPU_MAX_THREADS; } #else - static constexpr std::size_t numThreadsPerBlockParallelFor () { return 0; } + inline constexpr std::size_t numThreadsPerBlockParallelFor () { return 0; } #endif // ************************************************ @@ -103,20 +104,21 @@ namespace Gpu { inline Box getThreadBox (const Box& bx, Long offset) noexcept { -#if AMREX_DEVICE_COMPILE - const auto len = bx.length3d(); - Long k = offset / (len[0]*len[1]); - Long j = (offset - k*(len[0]*len[1])) / len[0]; - Long i = (offset - k*(len[0]*len[1])) - j*len[0]; - IntVect iv{AMREX_D_DECL(static_cast(i), - static_cast(j), - static_cast(k))}; - iv += bx.smallEnd(); - return (bx & Box(iv,iv,bx.type())); -#else - amrex::ignore_unused(offset); - return bx; -#endif + AMREX_IF_ON_DEVICE(( + const auto len = bx.length3d(); + Long k = offset / (len[0]*len[1]); + Long j = (offset - k*(len[0]*len[1])) / len[0]; + Long i = (offset - k*(len[0]*len[1])) - j*len[0]; + IntVect iv{AMREX_D_DECL(static_cast(i), + static_cast(j), + static_cast(k))}; + iv += bx.smallEnd(); + return (bx & Box(iv,iv,bx.type())); + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(offset); + return bx; + )) } // ************************************************ diff --git a/Src/Base/AMReX_GpuLaunch.nolint.H b/Src/Base/AMReX_GpuLaunch.nolint.H index 9cf92018e0a..c7df1737517 100644 --- a/Src/Base/AMReX_GpuLaunch.nolint.H +++ b/Src/Base/AMReX_GpuLaunch.nolint.H @@ -60,7 +60,7 @@ #ifndef AMREX_USE_SYCL #define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FLAG(where_to_run,n,i,block) \ - { using amrex_i_inttype = typename std::remove_const::type; \ + { using amrex_i_inttype = std::remove_const_t; \ if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ { \ amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept \ @@ -101,7 +101,7 @@ } #define 
AMREX_HOST_DEVICE_FOR_1D_FLAG(where_to_run,n,i,block) \ - { using amrex_i_inttype = typename std::remove_const::type; \ + { using amrex_i_inttype = std::remove_const_t; \ if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ { \ amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept \ @@ -186,7 +186,7 @@ // xxxxx SYCL todo: host disabled in host device #define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FLAG(where_to_run,n,i,block) \ - { using amrex_i_inttype = typename std::remove_const::type; \ + { using amrex_i_inttype = std::remove_const_t; \ if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ { \ amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept \ @@ -220,7 +220,7 @@ } #define AMREX_HOST_DEVICE_FOR_1D_FLAG(where_to_run,n,i,block) \ - { using amrex_i_inttype = typename std::remove_const::type; \ + { using amrex_i_inttype = std::remove_const_t; \ if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ { \ amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept \ @@ -290,7 +290,7 @@ #else #define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FLAG(where_to_run,n,i,block) \ - { using amrex_i_inttype = typename std::remove_const::type; \ + { using amrex_i_inttype = std::remove_const_t; \ amrex::ignore_unused(where_to_run); \ AMREX_PRAGMA_SIMD \ for (amrex_i_inttype i = 0; i < n; ++i) { \ @@ -310,7 +310,7 @@ ); #define AMREX_HOST_DEVICE_FOR_1D_FLAG(where_to_run,n,i,block) \ - { using amrex_i_inttype = typename std::remove_const::type; \ + { using amrex_i_inttype = std::remove_const_t; \ amrex::ignore_unused(where_to_run); \ for (amrex_i_inttype i = 0; i < n; ++i) { \ block \ diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 6ce9cca0f3a..04496348cf0 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -57,46 +57,46 @@ namespace detail { template void launch (T const& n, L&& f) noexcept { - f(n); + std::forward(f)(n); } template void launch (T const& n, L&& f) noexcept { amrex::ignore_unused(MT); - f(n); + std::forward(f)(n); } -template ::value> > -void For (T n, L&& f) noexcept +template > > +void For (T n, L const& f) noexcept { for (T i = 0; i < n; ++i) { detail::call_f(f,i); } } -template ::value> > +template > > void For (T n, L&& f) noexcept { amrex::ignore_unused(MT); For(n, std::forward(f)); } -template ::value> > +template > > void For (Gpu::KernelInfo const&, T n, L&& f) noexcept { For(n, std::forward(f)); } -template ::value> > +template > > void For (Gpu::KernelInfo const&, T n, L&& f) noexcept { amrex::ignore_unused(MT); For(n, std::forward(f)); } -template ::value> > -void ParallelFor (T n, L&& f) noexcept +template > > +void ParallelFor (T n, L const& f) noexcept { AMREX_PRAGMA_SIMD for (T i = 0; i < n; ++i) { @@ -104,20 +104,20 @@ void ParallelFor (T n, L&& f) noexcept } } -template ::value> > +template > > void ParallelFor (T n, L&& f) noexcept { amrex::ignore_unused(MT); ParallelFor(n, std::forward(f)); } -template ::value> > +template > > void ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept { ParallelFor(n, std::forward(f)); } -template ::value> > +template > > void ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept { amrex::ignore_unused(MT); @@ -125,7 +125,7 @@ void ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept } template -void For (Box const& box, L&& f) noexcept +void For (Box const& box, L const& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -157,7 +157,7 
@@ void For (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept } template -void ParallelFor (Box const& box, L&& f) noexcept +void ParallelFor (Box const& box, L const& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -189,8 +189,8 @@ void ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept ParallelFor(box, std::forward(f)); } -template ::value> > -void For (Box const& box, T ncomp, L&& f) noexcept +template > > +void For (Box const& box, T ncomp, L const& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -203,28 +203,28 @@ void For (Box const& box, T ncomp, L&& f) noexcept } } -template ::value> > +template > > void For (Box const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); For(box, ncomp, std::forward(f)); } -template ::value> > +template > > void For (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { For(box, ncomp, std::forward(f)); } -template ::value> > +template > > void For (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); For(box, ncomp, std::forward(f)); } -template ::value> > -void ParallelFor (Box const& box, T ncomp, L&& f) noexcept +template > > +void ParallelFor (Box const& box, T ncomp, L const& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -238,20 +238,20 @@ void ParallelFor (Box const& box, T ncomp, L&& f) noexcept } } -template ::value> > +template > > void ParallelFor (Box const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); ParallelFor(box, ncomp, std::forward(f)); } -template ::value> > +template > > void ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { ParallelFor(box, ncomp, std::forward(f)); } -template ::value> > +template > > void ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); @@ -317,8 +317,8 @@ void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& b } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void For (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -327,8 +327,8 @@ void For (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void For (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -338,8 +338,8 @@ void For (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void For (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept @@ -348,8 +348,8 @@ void For (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void For (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept @@ -359,9 +359,9 @@ void For (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void For (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, 
L3&& f3) noexcept @@ -372,9 +372,9 @@ void For (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void For (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept @@ -386,9 +386,9 @@ void For (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void For (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, @@ -400,9 +400,9 @@ void For (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void For (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, @@ -432,14 +432,14 @@ void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept template void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { - ParallelFor(box1,box2,f1,f2); + ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); } template void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { amrex::ignore_unused(MT); - ParallelFor(box1,box2,f1,f2); + ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); } template @@ -473,8 +473,8 @@ void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -483,8 +483,8 @@ void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -494,8 +494,8 @@ void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void ParallelFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept @@ -505,8 +505,8 @@ void ParallelFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void ParallelFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept @@ -517,9 +517,9 @@ void ParallelFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept @@ -530,9 +530,9 @@ void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value>, - typename 
M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept @@ -544,9 +544,9 @@ void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void ParallelFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, @@ -558,9 +558,9 @@ void ParallelFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void ParallelFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, @@ -572,13 +572,13 @@ void ParallelFor (Gpu::KernelInfo const&, box3, ncomp3, std::forward(f3)); } -template ::value> > +template > > void HostDeviceParallelFor (T n, L&& f) noexcept { ParallelFor(n,std::forward(f)); } -template ::value> > +template > > void HostDeviceParallelFor (T n, L&& f) noexcept { amrex::ignore_unused(MT); @@ -598,13 +598,13 @@ void HostDeviceParallelFor (Box const& box, L&& f) noexcept ParallelFor(box,std::forward(f)); } -template ::value> > +template > > void HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept { ParallelFor(box,ncomp,std::forward(f)); } -template ::value> > +template > > void HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); @@ -640,8 +640,8 @@ void HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -649,8 +649,8 @@ void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -659,9 +659,9 @@ void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept @@ -672,9 +672,9 @@ void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept @@ -685,13 +685,13 @@ void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, box3,ncomp3,std::forward(f3)); } -template ::value> > +template > > void HostDeviceFor (T n, L&& f) noexcept { For(n,std::forward(f)); } -template ::value> > 
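Aside: the `is_integral`/`is_integral_v` pairs running through these hunks are a mechanical modernization. A defaulted template parameter of the form `typename M=std::enable_if_t<...>` is a SFINAE guard that removes the overload from consideration when `T` is not an integral type, and C++17's `_v` variable templates shorten the C++11 `::value` spelling. A minimal self-contained sketch of the same guard, with an invented function name (not AMReX code):

    #include <type_traits>

    // Overload participates only when T is an integral type; for any other T
    // the default template argument fails to form and the overload vanishes.
    template <typename T, typename M = std::enable_if_t<std::is_integral_v<T>> >
    void repeat_twice (T n)
    {
        for (T i = 0; i < n; ++i) { /* ... */ }
    }

    int main ()
    {
        repeat_twice(3);      // OK: int is integral
        // repeat_twice(3.0); // would not compile: no matching overload
    }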
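A second pattern sits alongside it: leaf overloads that actually invoke the callable now take `L const&` (the lambda is only called, never stored or moved), while the thin dispatching overloads keep `L&&` and pass it on via `std::forward<L>(f)` rather than a bare `f`, which preserves the functor's value category through the call chain. A minimal sketch of the two layers, with invented names:

    #include <utility>

    // Leaf: only calls the functor, so a const reference suffices.
    template <typename T, typename L>
    void for_each_index (T n, L const& f)
    {
        for (T i = 0; i < n; ++i) { f(i); }
    }

    // Shim: perfect-forwards the functor to the leaf, keeping an rvalue
    // an rvalue instead of silently degrading it to an lvalue.
    template <typename T, typename L>
    void for_each_index_shim (T n, L&& f)
    {
        for_each_index(n, std::forward<L>(f));
    }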
+template > > void HostDeviceFor (T n, L&& f) noexcept { amrex::ignore_unused(MT); @@ -711,13 +711,13 @@ void HostDeviceFor (Box const& box, L&& f) noexcept For(box,std::forward(f)); } -template ::value> > +template > > void HostDeviceFor (Box const& box, T ncomp, L&& f) noexcept { For(box,ncomp,std::forward(f)); } -template ::value> > +template > > void HostDeviceFor (Box const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); @@ -753,8 +753,8 @@ void HostDeviceFor (Box const& box1, Box const& box2, Box const& box3, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -762,8 +762,8 @@ void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -772,9 +772,9 @@ void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept @@ -785,9 +785,9 @@ void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept @@ -798,13 +798,13 @@ void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, box3,ncomp3,std::forward(f3)); } -template ::value> > +template > > void HostDeviceParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept { ParallelFor(n,std::forward(f)); } -template ::value> > +template > > void HostDeviceParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept { amrex::ignore_unused(MT); @@ -824,13 +824,13 @@ void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexc ParallelFor(box,std::forward(f)); } -template ::value> > +template > > void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { ParallelFor(box,ncomp,std::forward(f)); } -template ::value> > +template > > void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); @@ -868,8 +868,8 @@ void HostDeviceParallelFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept @@ -878,8 +878,8 @@ void HostDeviceParallelFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept @@ -889,9 +889,9 @@ void HostDeviceParallelFor (Gpu::KernelInfo const&, } template ::value>, - typename 
M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, @@ -903,9 +903,9 @@ void HostDeviceParallelFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, @@ -917,13 +917,13 @@ void HostDeviceParallelFor (Gpu::KernelInfo const&, box3,ncomp3,std::forward(f3)); } -template ::value> > +template > > void HostDeviceFor (Gpu::KernelInfo const&, T n, L&& f) noexcept { For(n,std::forward(f)); } -template ::value> > +template > > void HostDeviceFor (Gpu::KernelInfo const&, T n, L&& f) noexcept { amrex::ignore_unused(MT); @@ -943,13 +943,13 @@ void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept For(box,std::forward(f)); } -template ::value> > +template > > void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { For(box,ncomp,std::forward(f)); } -template ::value> > +template > > void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); @@ -987,8 +987,8 @@ void HostDeviceFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void HostDeviceFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept @@ -997,8 +997,8 @@ void HostDeviceFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void HostDeviceFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept @@ -1008,9 +1008,9 @@ void HostDeviceFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void HostDeviceFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, @@ -1022,9 +1022,9 @@ void HostDeviceFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void HostDeviceFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, @@ -1036,8 +1036,8 @@ void HostDeviceFor (Gpu::KernelInfo const&, box3,ncomp3,std::forward(f3)); } -template ::value> > -void ParallelForRNG (T n, L&& f) noexcept +template > > +void ParallelForRNG (T n, L const& f) noexcept { for (T i = 0; i < n; ++i) { f(i,RandomEngine{}); @@ -1045,7 +1045,7 @@ void ParallelForRNG (T n, L&& f) noexcept } template -void ParallelForRNG (Box const& box, L&& f) noexcept +void ParallelForRNG (Box const& box, L const& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -1056,8 +1056,8 @@ void ParallelForRNG (Box const& box, L&& f) noexcept }}} } -template ::value> > -void 
ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept +template > > +void ParallelForRNG (Box const& box, T ncomp, L const& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -1073,7 +1073,7 @@ void ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept template void single_task (L&& f) noexcept { - f(); + std::forward(f)(); } } diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index aea0c030152..07f28d0944a 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -7,7 +7,7 @@ namespace amrex { #ifdef AMREX_USE_SYCL template -void single_task (gpuStream_t stream, L&& f) noexcept +void single_task (gpuStream_t stream, L const& f) noexcept { auto& q = *(stream.queue); try { @@ -21,10 +21,10 @@ void single_task (gpuStream_t stream, L&& f) noexcept template void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, - gpuStream_t stream, L&& f) noexcept + gpuStream_t stream, L const& f) noexcept { - int nthreads_total = nthreads_per_block * nblocks; - std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) + const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks; + const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) / sizeof(unsigned long long); auto& q = *(stream.queue); try { @@ -36,7 +36,7 @@ void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, [=] (sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - f(Gpu::Handler{&item,shared_data.get_pointer()}); + f(Gpu::Handler{&item,shared_data.get_multi_ptr().get()}); }); }); } catch (sycl::exception const& ex) { @@ -45,9 +45,9 @@ void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, } template -void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept +void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L const& f) noexcept { - int nthreads_total = nthreads_per_block * nblocks; + const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks; auto& q = *(stream.queue); try { q.submit([&] (sycl::handler& h) { @@ -66,10 +66,10 @@ void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noe template void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, - L&& f) noexcept + L const& f) noexcept { - int nthreads_total = MT * nblocks; - std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) + const auto nthreads_total = MT * std::size_t(nblocks); + const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) / sizeof(unsigned long long); auto& q = *(stream.queue); try { @@ -82,7 +82,7 @@ void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - f(Gpu::Handler{&item,shared_data.get_pointer()}); + f(Gpu::Handler{&item,shared_data.get_multi_ptr().get()}); }); }); } catch (sycl::exception const& ex) { @@ -91,9 +91,9 @@ void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, } template -void launch (int nblocks, gpuStream_t stream, L&& f) noexcept +void launch (int nblocks, gpuStream_t stream, L const& f) noexcept { - int nthreads_total = MT * nblocks; + const auto nthreads_total = MT * std::size_t(nblocks); auto& q = *(stream.queue); try { q.submit([&] (sycl::handler& h) { @@ -112,12 +112,12 @@ void launch 
(int nblocks, gpuStream_t stream, L&& f) noexcept } template -void launch (T const& n, L&& f) noexcept +void launch (T const& n, L const& f) noexcept { if (amrex::isEmpty(n)) { return; } const auto ec = Gpu::makeExecutionConfig(n); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -188,12 +188,12 @@ namespace detail { } template ::value> > -void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept { if (amrex::isEmpty(n)) { return; } const auto ec = Gpu::makeExecutionConfig(n); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { if (info.hasReduction()) { @@ -206,11 +206,11 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (T i = item.get_global_id(0), stride = item.get_global_range(0); - i < n; i += stride) { - int n_active_threads = amrex::min(n-i+(T)item.get_local_id(0), - (T)item.get_local_range(0)); - detail::call_f(f, i, Gpu::Handler{&item, shared_data.get_pointer(), + for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0); + i < std::size_t(n); i += stride) { + int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0), + item.get_local_range(0)); + detail::call_f(f, T(i), Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } }); @@ -223,9 +223,9 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (T i = item.get_global_id(0), stride = item.get_global_range(0); - i < n; i += stride) { - detail::call_f(f, i, Gpu::Handler{&item}); + for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0); + i < std::size_t(n); i += stride) { + detail::call_f(f, T(i), Gpu::Handler{&item}); } }); }); @@ -236,17 +236,13 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept } template -void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const BoxIndexer indexer(box); + const auto ec = Gpu::makeExecutionConfig(box.numPts()); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { if (info.hasReduction()) { @@ -259,17 +255,12 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = 
item.get_global_id(0), stride = item.get_global_range(0); - icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - int n_active_threads = amrex::min(ncells-icell+(int)item.get_local_id(0), - (int)item.get_local_range(0)); - detail::call_f(f, i, j, k, Gpu::Handler{&item, shared_data.get_pointer(), + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); + icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = indexer(icell); + int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)), + std::uint64_t(item.get_local_range(0))); + detail::call_f(f, i, j, k, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } }); @@ -282,14 +273,9 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); - icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); + icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = indexer(icell); detail::call_f(f,i,j,k,Gpu::Handler{&item}); } }); @@ -301,17 +287,13 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept } template ::value> > -void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const BoxIndexer indexer(box); + const auto ec = Gpu::makeExecutionConfig(box.numPts()); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { if (info.hasReduction()) { @@ -324,18 +306,13 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); - icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - int n_active_threads = amrex::min(ncells-icell+(int)item.get_local_id(0), - (int)item.get_local_range(0)); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); + icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = indexer(icell); + int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)), + std::uint64_t(item.get_local_range(0))); detail::call_f(f, i, j, k, ncomp, - Gpu::Handler{&item, shared_data.get_pointer(), + Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } }); @@ -348,14 +325,9 @@ void ParallelFor (Gpu::KernelInfo 
const& info, Box const& box, T ncomp, L&& f) n [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); - icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); + icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = indexer(icell); detail::call_f(f,i,j,k,ncomp,Gpu::Handler{&item}); } }); @@ -367,12 +339,12 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n } template ::value> > -void ParallelForRNG (T n, L&& f) noexcept +void ParallelForRNG (T n, L const& f) noexcept { if (amrex::isEmpty(n)) { return; } const auto ec = Gpu::ExecutionConfig(n); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); auto& q = Gpu::Device::streamQueue(); auto& engdescr = *(getRandEngineDescriptor()); try { @@ -384,11 +356,11 @@ void ParallelForRNG (T n, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - int tid = item.get_global_id(0); + auto const tid = item.get_global_id(0); auto engine = engine_acc.load(tid); RandomEngine rand_eng{&engine}; - for (T i = tid, stride = item.get_global_range(0); i < n; i += stride) { - f(i,rand_eng); + for (std::size_t i = tid, stride = item.get_global_range(0); i < std::size_t(n); i += stride) { + f(T(i),rand_eng); } engine_acc.store(engine, tid); }); @@ -400,17 +372,13 @@ void ParallelForRNG (T n, L&& f) noexcept } template -void ParallelForRNG (Box const& box, L&& f) noexcept +void ParallelForRNG (Box const& box, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - const auto ec = Gpu::ExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); + const BoxIndexer indexer(box); + const auto ec = Gpu::ExecutionConfig(box.numPts()); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); auto& q = Gpu::Device::streamQueue(); auto& engdescr = *(getRandEngineDescriptor()); try { @@ -422,17 +390,12 @@ void ParallelForRNG (Box const& box, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - int tid = item.get_global_id(0); + auto const tid = item.get_global_id(0); auto engine = engine_acc.load(tid); RandomEngine rand_eng{&engine}; - for (int icell = tid, stride = item.get_global_range(0); - icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + for (std::uint64_t icell = tid, stride = item.get_global_range(0); + icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = 
indexer(icell); f(i,j,k,rand_eng); } engine_acc.store(engine, tid); @@ -445,17 +408,13 @@ void ParallelForRNG (Box const& box, L&& f) noexcept } template ::value> > -void ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept +void ParallelForRNG (Box const& box, T ncomp, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - const auto ec = Gpu::ExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); + const BoxIndexer indexer(box); + const auto ec = Gpu::ExecutionConfig(box.numPts()); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); auto& q = Gpu::Device::streamQueue(); auto& engdescr = *(getRandEngineDescriptor()); try { @@ -467,17 +426,12 @@ void ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - int tid = item.get_global_id(0); + auto const tid = item.get_global_id(0); auto engine = engine_acc.load(tid); RandomEngine rand_eng{&engine}; - for (int icell = tid, stride = item.get_global_range(0); - icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + for (std::uint64_t icell = tid, stride = item.get_global_range(0); + icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = indexer(icell); for (T n = 0; n < ncomp; ++n) { f(i,j,k,n,rand_eng); } @@ -495,20 +449,11 @@ template void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const auto ec = Gpu::makeExecutionConfig(std::max(box1.numPts(), box2.numPts())); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -518,24 +463,15 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& b [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + auto const ncells = std::max(indexer1.numPts(), indexer2.numPts()); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - 
j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + if (icell < indexer1.numPts()) { + auto [i, j, k] = indexer1(icell); f1(i,j,k); } - if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + if (icell < indexer2.numPts()) { + auto [i, j, k] = indexer2(icell); f2(i,j,k); } } @@ -552,25 +488,12 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, L1&& f1, L2&& f2, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells3 = box3.numPts(); - int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len3xy = len3.x*len3.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto len3x = len3.x; - const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const BoxIndexer indexer3(box3); + const auto ec = Gpu::makeExecutionConfig(std::max({box1.numPts(),box2.numPts(),box3.numPts()})); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -580,33 +503,19 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()}); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + if (icell < indexer1.numPts()) { + auto [i, j, k] = indexer1(icell); f1(i,j,k); } - if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + if (icell < indexer2.numPts()) { + auto [i, j, k] = indexer2(icell); f2(i,j,k); } - if (icell < ncells3) { - int k = icell / len3xy; - int j = (icell - k*len3xy) / len3x; - int i = (icell - k*len3xy) - j*len3x; - i += lo3.x; - j += lo3.y; - k += lo3.z; + if (icell < indexer3.numPts()) { + auto [i, j, k] = indexer3(icell); f3(i,j,k); } } @@ -625,20 +534,11 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box2, T2 ncomp2, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len1x = len1.x; - const auto len2x = len2.x; 
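Aside: the deleted `lenxy`/`lenx` arithmetic on both sides of these hunks decodes a flattened cell number into `(i,j,k)` with x varying fastest; `BoxIndexer` packages that decode behind one call. A rough stand-in showing the same row-major decode — a hypothetical struct under stated assumptions, not the real `BoxIndexer`:

    #include <cstdint>

    struct Cell { int i, j, k; };

    // Hypothetical flattened-index decoder: cells are numbered x-fastest,
    // then y, then z, within a box of extents (nx,ny,...) anchored at lo.
    struct FlatIndexer
    {
        int lox, loy, loz;      // lower corner of the box
        int nx, ny;             // box extents in x and y
        std::uint64_t npts;     // total number of cells in the box

        std::uint64_t numPts () const { return npts; }

        Cell operator() (std::uint64_t icell) const {
            auto i = int(icell % nx);                        // fastest-varying
            auto j = int((icell / nx) % ny);
            auto k = int(icell / (std::uint64_t(nx) * ny));  // slowest-varying
            return Cell{i + lox, j + loy, k + loz};
        }
    };

    // usage mirrors the diff: auto [i, j, k] = indexer(icell);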
- const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const auto ec = Gpu::makeExecutionConfig(std::max(box1.numPts(),box2.numPts())); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -648,26 +548,17 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + auto const ncells = std::max(indexer1.numPts(), indexer2.numPts()); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + if (icell < indexer1.numPts()) { + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { f1(i,j,k,n); } } - if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + if (icell < indexer2.numPts()) { + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { f2(i,j,k,n); } @@ -690,25 +581,12 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box3, T3 ncomp3, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells3 = box3.numPts(); - int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len3xy = len3.x*len3.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto len3x = len3.x; - const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const BoxIndexer indexer3(box3); + const auto ec = Gpu::makeExecutionConfig(std::max({box1.numPts(),box2.numPts(),box3.numPts()})); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -718,37 +596,23 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()}); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + if (icell < 
indexer1.numPts()) { + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { f1(i,j,k,n); } } - if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + if (icell < indexer2.numPts()) { + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { f2(i,j,k,n); } } - if (icell < ncells3) { - int k = icell / len3xy; - int j = (icell - k*len3xy) / len3x; - int i = (icell - k*len3xy) - j*len3x; - i += lo3.x; - j += lo3.y; - k += lo3.z; + if (icell < indexer3.numPts()) { + auto [i, j, k] = indexer3(icell); for (T3 n = 0; n < ncomp3; ++n) { f3(i,j,k,n); } @@ -765,7 +629,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, // CUDA or HIP template -void single_task (gpuStream_t stream, L&& f) noexcept +void single_task (gpuStream_t stream, L const& f) noexcept { AMREX_LAUNCH_KERNEL(Gpu::Device::warp_size, 1, 1, 0, stream, [=] AMREX_GPU_DEVICE () noexcept {f();}); @@ -774,7 +638,7 @@ void single_task (gpuStream_t stream, L&& f) noexcept template void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, - L&& f) noexcept + L const& f) noexcept { AMREX_LAUNCH_KERNEL(MT, nblocks, MT, shared_mem_bytes, stream, [=] AMREX_GPU_DEVICE () noexcept { f(); }); @@ -782,7 +646,7 @@ void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, } template -void launch (int nblocks, gpuStream_t stream, L&& f) noexcept +void launch (int nblocks, gpuStream_t stream, L const& f) noexcept { AMREX_LAUNCH_KERNEL(MT, nblocks, MT, 0, stream, [=] AMREX_GPU_DEVICE () noexcept { f(); }); @@ -791,7 +655,7 @@ void launch (int nblocks, gpuStream_t stream, L&& f) noexcept template void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, - gpuStream_t stream, L&& f) noexcept + gpuStream_t stream, L const& f) noexcept { AMREX_ASSERT(nthreads_per_block <= AMREX_GPU_MAX_THREADS); AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, nblocks, nthreads_per_block, shared_mem_bytes, @@ -806,7 +670,7 @@ void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noe } template -void launch (T const& n, L&& f) noexcept +void launch (T const& n, L const& f) noexcept { if (amrex::isEmpty(n)) { return; } const auto ec = Gpu::makeExecutionConfig(n); @@ -822,7 +686,7 @@ void launch (T const& n, L&& f) noexcept namespace detail { template AMREX_GPU_DEVICE - auto call_f (F const& f, N i, N /*nleft*/) + auto call_f (F const& f, N i, std::uint64_t /*nleft*/) noexcept -> decltype(f(0)) { f(i); @@ -830,15 +694,15 @@ namespace detail { template AMREX_GPU_DEVICE - auto call_f (F const& f, N i, N nleft) + auto call_f (F const& f, N i, std::uint64_t nleft) noexcept -> decltype(f(0,Gpu::Handler{})) { - f(i,Gpu::Handler(amrex::min(nleft,(N)blockDim.x))); + f(i,Gpu::Handler(amrex::min(nleft,(std::uint64_t)blockDim.x))); } template AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, int /*nleft*/) + auto call_f (F const& f, int i, int j, int k, std::uint64_t /*nleft*/) noexcept -> decltype(f(0,0,0)) { f(i,j,k); @@ -846,15 +710,15 @@ namespace detail { template AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, int nleft) + auto call_f (F const& f, int i, int j, int k, std::uint64_t nleft) noexcept -> decltype(f(0,0,0,Gpu::Handler{})) { - f(i,j,k,Gpu::Handler(amrex::min(nleft,(int)blockDim.x))); + f(i,j,k,Gpu::Handler(amrex::min(nleft,(std::uint64_t)blockDim.x))); } template AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int 
k, T ncomp, int /*nleft*/) + auto call_f (F const& f, int i, int j, int k, T ncomp, std::uint64_t /*nleft*/) noexcept -> decltype(f(0,0,0,0)) { for (T n = 0; n < ncomp; ++n) f(i,j,k,n); @@ -862,24 +726,24 @@ namespace detail { template AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, T ncomp, int nleft) + auto call_f (F const& f, int i, int j, int k, T ncomp, std::uint64_t nleft) noexcept -> decltype(f(0,0,0,0,Gpu::Handler{})) { - for (T n = 0; n < ncomp; ++n) f(i,j,k,n,Gpu::Handler(amrex::min(nleft,(int)blockDim.x))); + for (T n = 0; n < ncomp; ++n) f(i,j,k,n,Gpu::Handler(amrex::min(nleft,(std::uint64_t)blockDim.x))); } } template ::value> > std::enable_if_t::value> -ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept { if (amrex::isEmpty(n)) { return; } const auto ec = Gpu::makeExecutionConfig(n); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (T i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - i < n; i += stride) { - detail::call_f(f, i, (n-i+(T)threadIdx.x)); + for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x; + i < Long(n); i += stride) { + detail::call_f(f, T(i), (Long(n)-i+(Long)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); @@ -887,27 +751,18 @@ ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept template std::enable_if_t::value> -ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const&, Box const& box, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - const auto ec = Gpu::makeExecutionConfig(ncells); + const BoxIndexer indexer(box); + const auto ec = Gpu::makeExecutionConfig(box.numPts()); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; + icell < indexer.numPts(); icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - detail::call_f(f, i, j, k, (ncells-icell+(int)threadIdx.x)); + auto [i, j, k] = indexer(icell); + detail::call_f(f, i, j, k, (indexer.numPts()-icell+(std::uint64_t)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); @@ -915,26 +770,17 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept template ::value> > std::enable_if_t::value> -ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - const auto ec = Gpu::makeExecutionConfig(ncells); + const BoxIndexer indexer(box); + const auto ec = Gpu::makeExecutionConfig(box.numPts()); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = 
blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - detail::call_f(f, i, j, k, ncomp, (ncells-icell+(int)threadIdx.x)); + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; + icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = indexer(icell); + detail::call_f(f, i, j, k, ncomp, (indexer.numPts()-icell+(std::uint64_t)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); @@ -942,7 +788,7 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept template ::value> > std::enable_if_t::value> -ParallelForRNG (T n, L&& f) noexcept +ParallelForRNG (T n, L const& f) noexcept { if (amrex::isEmpty(n)) { return; } randState_t* rand_state = getRandState(); @@ -951,10 +797,10 @@ ParallelForRNG (T n, L&& f) noexcept amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - int tid = blockDim.x*blockIdx.x+threadIdx.x; + Long tid = Long(blockDim.x)*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; - for (T i = tid, stride = blockDim.x*gridDim.x; i < n; i += stride) { - f(i,engine); + for (Long i = tid, stride = Long(blockDim.x)*gridDim.x; i < Long(n); i += stride) { + f(T(i),engine); } }); Gpu::streamSynchronize(); // To avoid multiple streams using RNG @@ -963,29 +809,20 @@ ParallelForRNG (T n, L&& f) noexcept template std::enable_if_t::value> -ParallelForRNG (Box const& box, L&& f) noexcept +ParallelForRNG (Box const& box, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } randState_t* rand_state = getRandState(); - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - const auto ec = Gpu::ExecutionConfig(ncells); + const BoxIndexer indexer(box); + const auto ec = Gpu::ExecutionConfig(box.numPts()); AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - int tid = blockDim.x*blockIdx.x+threadIdx.x; + auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; - for (int icell = tid, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = indexer(icell); f(i,j,k,engine); } }); @@ -995,29 +832,20 @@ ParallelForRNG (Box const& box, L&& f) noexcept template ::value> > std::enable_if_t::value> -ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept +ParallelForRNG (Box const& box, T ncomp, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } randState_t* rand_state = getRandState(); - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - const auto ec = Gpu::ExecutionConfig(ncells); + const BoxIndexer indexer(box); + const auto ec = Gpu::ExecutionConfig(box.numPts()); AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), 
ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - int tid = blockDim.x*blockIdx.x+threadIdx.x; + auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; - for (int icell = tid, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = indexer(icell); for (T n = 0; n < ncomp; ++n) { f(i,j,k,n,engine); } @@ -1033,38 +861,20 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto ec = Gpu::makeExecutionConfig(ncells); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const auto ec = Gpu::makeExecutionConfig(std::max(box1.numPts(),box2.numPts())); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + auto const ncells = std::max(indexer1.numPts(), indexer2.numPts()); + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + if (icell < indexer1.numPts()) { + auto [i, j, k] = indexer1(icell); f1(i,j,k); } - if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + if (icell < indexer2.numPts()) { + auto [i, j, k] = indexer2(icell); f2(i,j,k); } } @@ -1079,52 +889,25 @@ ParallelFor (Gpu::KernelInfo const&, L1&& f1, L2&& f2, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells3 = box3.numPts(); - int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len3xy = len3.x*len3.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto len3x = len3.x; - const auto ec = Gpu::makeExecutionConfig(ncells); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const BoxIndexer indexer3(box3); + const auto ec = Gpu::makeExecutionConfig(std::max({box1.numPts(),box2.numPts(),box3.numPts()})); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = 
blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()}); + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + if (icell < indexer1.numPts()) { + auto [i, j, k] = indexer1(icell); f1(i,j,k); } - if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + if (icell < indexer2.numPts()) { + auto [i, j, k] = indexer2(icell); f2(i,j,k); } - if (icell < ncells3) { - int k = icell / len3xy; - int j = (icell - k*len3xy) / len3x; - int i = (icell - k*len3xy) - j*len3x; - i += lo3.x; - j += lo3.y; - k += lo3.z; + if (icell < indexer3.numPts()) { + auto [i, j, k] = indexer3(icell); f3(i,j,k); } } @@ -1141,40 +924,22 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box2, T2 ncomp2, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto ec = Gpu::makeExecutionConfig(ncells); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const auto ec = Gpu::makeExecutionConfig(std::max(box1.numPts(),box2.numPts())); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + auto const ncells = std::max(indexer1.numPts(), indexer2.numPts()); + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + if (icell < indexer1.numPts()) { + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { f1(i,j,k,n); } } - if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + if (icell < indexer2.numPts()) { + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { f2(i,j,k,n); } @@ -1195,56 +960,29 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box3, T3 ncomp3, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells3 = box3.numPts(); - int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len3xy = 
len3.x*len3.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto len3x = len3.x; - const auto ec = Gpu::makeExecutionConfig(ncells); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const BoxIndexer indexer3(box3); + const auto ec = Gpu::makeExecutionConfig(std::max({box1.numPts(),box2.numPts(),box3.numPts()})); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()}); + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + if (icell < indexer1.numPts()) { + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { f1(i,j,k,n); } } - if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + if (icell < indexer2.numPts()) { + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { f2(i,j,k,n); } } - if (icell < ncells3) { - int k = icell / len3xy; - int j = (icell - k*len3xy) / len3x; - int i = (icell - k*len3xy) - j*len3x; - i += lo3.x; - j += lo3.y; - k += lo3.z; + if (icell < indexer3.numPts()) { + auto [i, j, k] = indexer3(icell); for (T3 n = 0; n < ncomp3; ++n) { f3(i,j,k,n); } @@ -1678,7 +1416,7 @@ HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); #else AMREX_PRAGMA_SIMD - for (T i = 0; i < n; ++i) f(i); + for (T i = 0; i < n; ++i) { f(i); } #endif } } @@ -1694,7 +1432,7 @@ HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. 
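// Illustrative sketch (not part of this diff) of the interface the hunks
// above rely on: BoxIndexer flattens a Box to the range [0, numPts()) and
// decomposes a 64-bit cell index back into (i,j,k), so these kernels no
// longer overflow int for boxes with more than 2^31 cells. The members here
// are a simplified assumption, not the real AMReX implementation (which uses
// the FastDivmodU64 added later in this diff).
#include <AMReX_Dim3.H>
#include <cstdint>

struct BoxIndexerSketch
{
    std::uint64_t nx, nxy, npts;   // len.x, len.x*len.y, box.numPts()
    int lox, loy, loz;             // lower bounds of the box
    [[nodiscard]] AMREX_GPU_HOST_DEVICE
    std::uint64_t numPts () const { return npts; }
    [[nodiscard]] AMREX_GPU_HOST_DEVICE
    amrex::Dim3 operator() (std::uint64_t icell) const
    {
        std::uint64_t k = icell / nxy;
        std::uint64_t j = (icell - k*nxy) / nx;
        std::uint64_t i = (icell - k*nxy) - j*nx;
        return {int(i)+lox, int(j)+loy, int(k)+loz};   // auto [i,j,k] binds to x,y,z
    }
};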
It takes too long to compile"); #else AMREX_PRAGMA_SIMD - for (T i = 0; i < n; ++i) f(i); + for (T i = 0; i < n; ++i) { f(i); } #endif } } diff --git a/Src/Base/AMReX_GpuLaunchMacrosG.nolint.H b/Src/Base/AMReX_GpuLaunchMacrosG.nolint.H index 22c0e00b56b..f01673c9821 100644 --- a/Src/Base/AMReX_GpuLaunchMacrosG.nolint.H +++ b/Src/Base/AMReX_GpuLaunchMacrosG.nolint.H @@ -457,7 +457,7 @@ #define AMREX_GPU_HOST_DEVICE_FOR_1D(n,i,block) \ { \ auto const& amrex_i_n = n; \ - using amrex_i_inttype = typename std::remove_const::type; \ + using amrex_i_inttype = std::remove_const_t; \ if (amrex::Gpu::inLaunchRegion()) { \ amrex::ParallelFor(amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \ } else { \ @@ -468,7 +468,7 @@ #define AMREX_GPU_HOST_DEVICE_FOR_1D(n,i,block) \ { \ auto const& amrex_i_n = n; \ - using amrex_i_inttype = typename std::remove_const::type; \ + using amrex_i_inttype = std::remove_const_t; \ if (amrex::Gpu::inLaunchRegion()) { \ amrex::ParallelFor(amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \ } else { \ @@ -481,7 +481,7 @@ #define AMREX_GPU_DEVICE_FOR_1D(n,i,block) \ { \ - using amrex_i_inttype = typename std::remove_const::type; \ + using amrex_i_inttype = std::remove_const_t; \ amrex::ParallelFor(n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \ } diff --git a/Src/Base/AMReX_GpuMemory.H b/Src/Base/AMReX_GpuMemory.H index 1ffee387015..8daeceba914 100644 --- a/Src/Base/AMReX_GpuMemory.H +++ b/Src/Base/AMReX_GpuMemory.H @@ -51,7 +51,7 @@ struct Deleter { void operator() (void* pt) const noexcept { m_arena->free(pt); } }; -template ::value,int> = 0> +template ,int> = 0> struct DeviceScalar { DeviceScalar (DeviceScalar const&) = delete; @@ -104,7 +104,7 @@ private: #else - DeviceScalar (T init_val) : d(init_val) {} + DeviceScalar (T const& init_val) : d(init_val) {} DeviceScalar () = default; ~DeviceScalar () = default; diff --git a/Src/Base/AMReX_GpuQualifiers.H b/Src/Base/AMReX_GpuQualifiers.H index 1c0b5731762..4fba23a849a 100644 --- a/Src/Base/AMReX_GpuQualifiers.H +++ b/Src/Base/AMReX_GpuQualifiers.H @@ -8,6 +8,12 @@ #include #endif +#if defined(AMREX_USE_CUDA) && (defined(AMREX_CXX_PGI) || defined(AMREX_CXX_NVHPC)) +#include +#define AMREX_IF_ON_DEVICE(CODE) NV_IF_TARGET(NV_IS_DEVICE, CODE) +#define AMREX_IF_ON_HOST(CODE) NV_IF_TARGET(NV_IS_HOST, CODE) +#endif + #define AMREX_GPU_HOST __host__ #define AMREX_GPU_DEVICE __device__ #define AMREX_GPU_GLOBAL __global__ @@ -31,6 +37,29 @@ #define AMREX_DEVICE_COMPILE (__CUDA_ARCH__ || __HIP_DEVICE_COMPILE__ || __SYCL_DEVICE_ONLY__) +// Remove surrounding parentheses if present +#define AMREX_IMPL_STRIP_PARENS(X) AMREX_IMPL_ESC(AMREX_IMPL_ISH X) +#define AMREX_IMPL_ISH(...) AMREX_IMPL_ISH __VA_ARGS__ +#define AMREX_IMPL_ESC(...) AMREX_IMPL_ESC_(__VA_ARGS__) +#define AMREX_IMPL_ESC_(...) 
AMREX_IMPL_VAN_##__VA_ARGS__ +#define AMREX_IMPL_VAN_AMREX_IMPL_ISH + +#if !defined(AMREX_IF_ON_DEVICE) && !defined(AMREX_IF_ON_HOST) +#if (defined(AMREX_USE_CUDA) && defined(__CUDA_ARCH__)) || \ + (defined(AMREX_USE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \ + (defined(AMREX_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)) +#define AMREX_IF_ON_DEVICE(CODE) \ + { AMREX_IMPL_STRIP_PARENS(CODE) } +#define AMREX_IF_ON_HOST(CODE) \ + {} +#else +#define AMREX_IF_ON_DEVICE(CODE) \ + {} +#define AMREX_IF_ON_HOST(CODE) \ + { AMREX_IMPL_STRIP_PARENS(CODE) } +#endif +#endif + #ifdef AMREX_USE_SYCL # include #endif diff --git a/Src/Base/AMReX_GpuRange.H b/Src/Base/AMReX_GpuRange.H index b8d2ab89d08..ecf9a32fd2e 100644 --- a/Src/Base/AMReX_GpuRange.H +++ b/Src/Base/AMReX_GpuRange.H @@ -10,7 +10,7 @@ namespace amrex { -template ::value,int>::type = 0> +template ,int> = 0> bool isEmpty (T n) noexcept { return n <= 0; } AMREX_FORCE_INLINE bool isEmpty (Box const& b) noexcept { return b.isEmpty(); } @@ -20,11 +20,11 @@ namespace Gpu { namespace range_detail { //! integer version -template ::value,int>::type = 0> +template ,int> = 0> AMREX_GPU_HOST_DEVICE Long size (T const& b) noexcept { return static_cast(b); } -template ::value,int>::type = 0> +template ,int> = 0> AMREX_GPU_HOST_DEVICE Long at (T const& /*b*/, Long offset) noexcept { return offset; } @@ -32,31 +32,31 @@ Long at (T const& /*b*/, Long offset) noexcept { return offset; } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Long size (Box const& b) noexcept { -#if AMREX_DEVICE_COMPILE - return b.numPts(); -#else - amrex::ignore_unused(b); - return 1; -#endif + AMREX_IF_ON_DEVICE((return b.numPts();)) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(b); + return 1; + )) } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box at (Box const& b, Long offset) noexcept { -#if AMREX_DEVICE_COMPILE - auto len = b.length3d(); - Long k = offset / (len[0]*len[1]); - Long j = (offset - k*(len[0]*len[1])) / len[0]; - Long i = (offset - k*(len[0]*len[1])) - j*len[0]; - IntVect iv{AMREX_D_DECL(static_cast(i), - static_cast(j), - static_cast(k))}; - iv += b.smallEnd(); - return Box(iv,iv,b.type()); -#else - amrex::ignore_unused(offset); - return b; -#endif + AMREX_IF_ON_DEVICE(( + auto len = b.length3d(); + Long k = offset / (len[0]*len[1]); + Long j = (offset - k*(len[0]*len[1])) / len[0]; + Long i = (offset - k*(len[0]*len[1])) - j*len[0]; + IntVect iv{AMREX_D_DECL(static_cast(i), + static_cast(j), + static_cast(k))}; + iv += b.smallEnd(); + return Box(iv,iv,b.type()); + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(offset); + return b; + )) } template @@ -73,7 +73,7 @@ struct range_impl struct iterator { AMREX_GPU_HOST_DEVICE - iterator (T const& b, Long i, Long s) noexcept : mi_b(b), mi_i(i), mi_s(s) {} + iterator (T const& b, Long i, Long s) noexcept : mi_b(&b), mi_i(i), mi_s(s) {} AMREX_GPU_HOST_DEVICE void operator++ () noexcept { mi_i += mi_s; } @@ -82,23 +82,25 @@ struct range_impl bool operator!= (iterator const& rhs) const noexcept { return mi_i < rhs.mi_i; } AMREX_GPU_HOST_DEVICE - T operator* () const noexcept { return range_detail::at(mi_b,mi_i); } + T operator* () const noexcept { return range_detail::at(*mi_b,mi_i); } private: - T const& mi_b; + T const* mi_b; Long mi_i; Long mi_s; }; [[nodiscard]] AMREX_GPU_HOST_DEVICE iterator begin () const noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return iterator(m_b, blockDim.x*blockIdx.x+threadIdx.x, blockDim.x*gridDim.x); -#elif defined 
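// Usage sketch (illustrative, not part of this diff): AMREX_IF_ON_DEVICE and
// AMREX_IF_ON_HOST replace raw __CUDA_ARCH__ / __HIP_DEVICE_COMPILE__ checks,
// and under nvc++ they map onto NV_IF_TARGET from <nv/target>. The CODE
// argument is double-parenthesized so embedded commas survive macro
// expansion; AMREX_IMPL_STRIP_PARENS removes the extra pair.
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
int global_thread_id ()   // hypothetical helper, not an AMReX function
{
    AMREX_IF_ON_DEVICE(( return int(blockDim.x*blockIdx.x + threadIdx.x); ))
    AMREX_IF_ON_HOST(( return 0; ))
}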
(__SYCL_DEVICE_ONLY__) +#if defined (__SYCL_DEVICE_ONLY__) return iterator(m_b, m_gid, m_grange); #else - return iterator(m_b,0,1); + AMREX_IF_ON_DEVICE(( + return iterator(m_b, blockDim.x*blockIdx.x+threadIdx.x, blockDim.x*gridDim.x); + )) + AMREX_IF_ON_HOST(( + return iterator(m_b,0,1); + )) #endif } diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index 8b5680b41b8..ecb992983ba 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -29,6 +29,7 @@ struct Dim1 { struct gpuStream_t { sycl::queue* queue = nullptr; bool operator== (gpuStream_t const& rhs) noexcept { return queue == rhs.queue; } + bool operator!= (gpuStream_t const& rhs) noexcept { return queue != rhs.queue; } }; #endif diff --git a/Src/Base/AMReX_GpuUtility.H b/Src/Base/AMReX_GpuUtility.H index a1fa3cdd9dc..4adc111f5e2 100644 --- a/Src/Base/AMReX_GpuUtility.H +++ b/Src/Base/AMReX_GpuUtility.H @@ -26,8 +26,9 @@ namespace Gpu { template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T LDG (Array4 const& a, int i, int j, int k) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) - return __ldg(a.ptr(i,j,k)); +#if defined(AMREX_USE_CUDA) + AMREX_IF_ON_DEVICE((return __ldg(a.ptr(i,j,k));)) + AMREX_IF_ON_HOST((return a(i,j,k);)) #else return a(i,j,k); #endif @@ -36,8 +37,9 @@ namespace Gpu { template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T LDG (Array4 const& a, int i, int j, int k, int n) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) - return __ldg(a.ptr(i,j,k,n)); +#if defined(AMREX_USE_CUDA) + AMREX_IF_ON_DEVICE((return __ldg(a.ptr(i,j,k,n));)) + AMREX_IF_ON_HOST((return a(i,j,k,n);)) #else return a(i,j,k,n); #endif @@ -63,7 +65,11 @@ namespace Gpu { #if defined(AMREX_USE_HIP) hipPointerAttribute_t attrib; hipError_t r = hipPointerGetAttributes(&attrib, p); +#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) return r == hipSuccess && attrib.memoryType == hipMemoryTypeDevice; +#else + return r == hipSuccess && attrib.type == hipMemoryTypeDevice; +#endif // (HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) #elif defined(AMREX_USE_CUDA) CUpointer_attribute attrib = CU_POINTER_ATTRIBUTE_MEMORY_TYPE; CUmemorytype mem_type = static_cast(0); @@ -83,7 +89,11 @@ namespace Gpu { #if defined(AMREX_USE_HIP) hipPointerAttribute_t attrib; hipError_t r = hipPointerGetAttributes(&attrib, p); +#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) return r == hipSuccess && attrib.memoryType == hipMemoryTypeHost; +#else + return r == hipSuccess && attrib.type == hipMemoryTypeHost; +#endif // (HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) #elif defined(AMREX_USE_CUDA) CUpointer_attribute attrib = CU_POINTER_ATTRIBUTE_MEMORY_TYPE; CUmemorytype mem_type = static_cast(0); @@ -106,9 +116,15 @@ namespace Gpu { } else { hipPointerAttribute_t attrib; hipError_t r = hipPointerGetAttributes(&attrib, p); +#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) return r == hipSuccess && (attrib.memoryType == hipMemoryTypeHost || attrib.memoryType == hipMemoryTypeDevice); +#else + return r == hipSuccess && + (attrib.type == hipMemoryTypeHost || + attrib.type == hipMemoryTypeDevice); +#endif // (HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) } #elif defined(AMREX_USE_CUDA) CUpointer_attribute attrib = CU_POINTER_ATTRIBUTE_MEMORY_TYPE; diff --git a/Src/Base/AMReX_IArrayBox.H b/Src/Base/AMReX_IArrayBox.H index 4d39ace1012..db0f26d5080 100644 --- a/Src/Base/AMReX_IArrayBox.H +++ b/Src/Base/AMReX_IArrayBox.H @@ -41,7 +41,6 @@ public: * This class does NOT provide a copy constructor or 
assignment operator. */ - class IArrayBox : public BaseFab @@ -58,7 +57,7 @@ public: /** * \brief Construct an initial FAB with the data space allocated but - * not inititialized. ncomp is the number of components + * not initialized. ncomp is the number of components * (variables) at each data point in the Box. */ explicit IArrayBox (const Box& b, diff --git a/Src/Base/AMReX_INT.H b/Src/Base/AMReX_INT.H index f8ab0e9ba8f..4356c70f12c 100644 --- a/Src/Base/AMReX_INT.H +++ b/Src/Base/AMReX_INT.H @@ -31,4 +31,29 @@ namespace amrex { } #endif +#if (defined(__x86_64) || defined (__aarch64__)) && !defined(_WIN32) && (defined(__GNUC__) || defined(__clang__)) && !defined(__NVCOMPILER) + +#define AMREX_INT128_SUPPORTED 1 + +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#endif + +typedef unsigned __int128 amrex_uint128_t; // NOLINT(modernize-use-using) +typedef __int128 amrex_int128_t; // NOLINT(modernize-use-using) + +#ifdef __cplusplus +namespace amrex { + using UInt128_t = amrex_uint128_t; + using Int128_t = amrex_int128_t; +} +#endif + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + +#endif /* (defined(__x86_64) || defined (__aarch64__)) && !defined(_WIN32) && (defined(__GNUC__) || defined(__clang__)) */ + #endif diff --git a/Src/Base/AMReX_IndexType.H b/Src/Base/AMReX_IndexType.H index 02a56aae2a2..0fd613d2a99 100644 --- a/Src/Base/AMReX_IndexType.H +++ b/Src/Base/AMReX_IndexType.H @@ -19,7 +19,6 @@ namespace amrex { * enumerated type CellIndex to be either CELL or NODE; i.e. each of the * AMREX_SPACEDIM dimensions must be either CELL or NODE. */ - class IndexType { friend MPI_Datatype ParallelDescriptor::Mpi_typemap::type(); diff --git a/Src/Base/AMReX_IntVect.H b/Src/Base/AMReX_IntVect.H index fd71c93ae87..b2658a5ec94 100644 --- a/Src/Base/AMReX_IntVect.H +++ b/Src/Base/AMReX_IntVect.H @@ -42,7 +42,6 @@ int coarsen (int i, int ratio) noexcept * C++ array. In addition, the basic arithmetic operators have been overloaded * to implement scaling and translation operations. 
*/ - class IntVect { friend MPI_Datatype ParallelDescriptor::Mpi_typemap::type(); diff --git a/Src/Base/AMReX_IntegratorBase.H b/Src/Base/AMReX_IntegratorBase.H index bf99d264fb1..568e063bed5 100644 --- a/Src/Base/AMReX_IntegratorBase.H +++ b/Src/Base/AMReX_IntegratorBase.H @@ -18,7 +18,7 @@ template struct IntegratorOps; #if defined(AMREX_PARTICLES) template -struct IntegratorOps::value>::type> +struct IntegratorOps > > { static void CreateLike (amrex::Vector >& V, const T& Other) @@ -84,7 +84,7 @@ struct IntegratorOps -struct IntegratorOps, T>::value>::type> +struct IntegratorOps, T> > > { static void CreateLike (amrex::Vector >& V, const T& Other, bool Grow = false) @@ -130,7 +130,7 @@ struct IntegratorOps -struct IntegratorOps::value>::type> +struct IntegratorOps > > { static void CreateLike (amrex::Vector >& V, const T& Other, bool Grow = false) diff --git a/Src/Base/AMReX_LUSolver.H b/Src/Base/AMReX_LUSolver.H new file mode 100644 index 00000000000..bd69822ea5a --- /dev/null +++ b/Src/Base/AMReX_LUSolver.H @@ -0,0 +1,146 @@ +#ifndef AMREX_LU_SOLVER_H_ +#define AMREX_LU_SOLVER_H_ +#include + +#include +#include +#include +#include + +namespace amrex { + +// https://en.wikipedia.org/wiki/LU_decomposition + +template +class LUSolver +{ +public: + + LUSolver () = default; + + LUSolver (Array2D const& a_mat); + + void define (Array2D const& a_mat); + + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + void operator() (T* AMREX_RESTRICT x, T const* AMREX_RESTRICT b) const + { + for (int i = 0; i < N; ++i) { + x[i] = b[m_piv(i)]; + for (int k = 0; k < i; ++k) { + x[i] -= m_mat(i,k) * x[k]; + } + } + + for (int i = N-1; i >= 0; --i) { + for (int k = i+1; k < N; ++k) { + x[i] -= m_mat(i,k) * x[k]; + } + x[i] *= m_mat(i,i); + } + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE + Array2D invert () const + { + Array2D IA; + for (int j = 0; j < N; ++j) { + for (int i = 0; i < N; ++i) { + IA(i,j) = (m_piv(i) == j) ? T(1.0) : T(0.0); + for (int k = 0; k < i; ++k) { + IA(i,j) -= m_mat(i,k) * IA(k,j); + } + } + for (int i = N-1; i >= 0; --i) { + for (int k = i+1; k < N; ++k) { + IA(i,j) -= m_mat(i,k) * IA(k,j); + } + IA(i,j) *= m_mat(i,i); + } + } + return IA; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE + T determinant () const + { + T det = m_mat(0,0); + for (int i = 1; i < N; ++i) { + det *= m_mat(i,i); + } + det = T(1.0) / det; + return (m_npivs % 2 == 0) ? 
det : -det; + } + +private: + + void define_innard (); + + Array2D m_mat; + Array1D m_piv; + int m_npivs = 0; +}; + +template +LUSolver::LUSolver (Array2D const& a_mat) + : m_mat(a_mat) +{ + define_innard(); +} + +template +void LUSolver::define (Array2D const& a_mat) +{ + m_mat = a_mat; + define_innard(); +} + +template +void LUSolver::define_innard () +{ + static_assert(N > 1); + static_assert(std::is_floating_point_v); + + for (int i = 0; i < N; ++i) { m_piv(i) = i; } + m_npivs = 0; + + for (int i = 0; i < N; ++i) { + T maxA = 0; + int imax = i; + + for (int k = i; k < N; ++k) { + auto const absA = std::abs(m_mat(k,i)); + if (absA > maxA) { + maxA = absA; + imax = k; + } + } + + if (maxA < std::numeric_limits::min()) { + amrex::Abort("LUSolver: matrix is degenerate"); + } + + if (imax != i) { + std::swap(m_piv(i), m_piv(imax)); + for (int j = 0; j < N; ++j) { + std::swap(m_mat(i,j), m_mat(imax,j)); + } + ++m_npivs; + } + + for (int j = i+1; j < N; ++j) { + m_mat(j,i) /= m_mat(i,i); + for (int k = i+1; k < N; ++k) { + m_mat(j,k) -= m_mat(j,i) * m_mat(i,k); + } + } + } + + for (int i = 0; i < N; ++i) { + m_mat(i,i) = T(1) / m_mat(i,i); + } +} + +} + +#endif diff --git a/Src/Base/AMReX_Loop.H b/Src/Base/AMReX_Loop.H index 84b39107e45..f45a2198c56 100644 --- a/Src/Base/AMReX_Loop.H +++ b/Src/Base/AMReX_Loop.H @@ -8,7 +8,7 @@ namespace amrex { template AMREX_GPU_HOST_DEVICE -void Loop (Dim3 lo, Dim3 hi, F&& f) noexcept +void Loop (Dim3 lo, Dim3 hi, F const& f) noexcept { for (int k = lo.z; k <= hi.z; ++k) { for (int j = lo.y; j <= hi.y; ++j) { @@ -19,7 +19,7 @@ void Loop (Dim3 lo, Dim3 hi, F&& f) noexcept template AMREX_GPU_HOST_DEVICE -void Loop (Dim3 lo, Dim3 hi, int ncomp, F&& f) noexcept +void Loop (Dim3 lo, Dim3 hi, int ncomp, F const& f) noexcept { for (int n = 0; n < ncomp; ++n) { for (int k = lo.z; k <= hi.z; ++k) { @@ -31,7 +31,7 @@ void Loop (Dim3 lo, Dim3 hi, int ncomp, F&& f) noexcept template AMREX_GPU_HOST_DEVICE -void LoopConcurrent (Dim3 lo, Dim3 hi, F&& f) noexcept +void LoopConcurrent (Dim3 lo, Dim3 hi, F const& f) noexcept { for (int k = lo.z; k <= hi.z; ++k) { for (int j = lo.y; j <= hi.y; ++j) { @@ -43,7 +43,7 @@ void LoopConcurrent (Dim3 lo, Dim3 hi, F&& f) noexcept template AMREX_GPU_HOST_DEVICE -void LoopConcurrent (Dim3 lo, Dim3 hi, int ncomp, F&& f) noexcept +void LoopConcurrent (Dim3 lo, Dim3 hi, int ncomp, F const& f) noexcept { for (int n = 0; n < ncomp; ++n) { for (int k = lo.z; k <= hi.z; ++k) { @@ -56,7 +56,7 @@ void LoopConcurrent (Dim3 lo, Dim3 hi, int ncomp, F&& f) noexcept template AMREX_GPU_HOST_DEVICE -void Loop (Box const& bx, F&& f) noexcept +void Loop (Box const& bx, F const& f) noexcept { const auto lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); @@ -69,7 +69,7 @@ void Loop (Box const& bx, F&& f) noexcept template AMREX_GPU_HOST_DEVICE -void Loop (Box const& bx, int ncomp, F&& f) noexcept +void Loop (Box const& bx, int ncomp, F const& f) noexcept { const auto lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); @@ -83,7 +83,7 @@ void Loop (Box const& bx, int ncomp, F&& f) noexcept template AMREX_GPU_HOST_DEVICE -void LoopConcurrent (Box const& bx, F&& f) noexcept +void LoopConcurrent (Box const& bx, F const& f) noexcept { const auto lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); @@ -97,7 +97,7 @@ void LoopConcurrent (Box const& bx, F&& f) noexcept template AMREX_GPU_HOST_DEVICE -void LoopConcurrent (Box const& bx, int ncomp, F&& f) noexcept +void LoopConcurrent (Box const& bx, int ncomp, F const& f) noexcept { const auto 
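// Usage sketch for the new LUSolver (illustrative; assumes the Array2D
// template arguments stripped in the text above are Array2D<T,0,N-1,0,N-1>,
// and the matrix values are arbitrary examples).
#include <AMReX_Array.H>
#include <AMReX_LUSolver.H>

// Returns the determinant; x receives the solution of A x = b.
double lu_example (double (&x)[3])
{
    amrex::Array2D<double,0,2,0,2> A{};
    A(0,0)=4.; A(0,1)=1.; A(0,2)=0.;
    A(1,0)=1.; A(1,1)=4.; A(1,2)=1.;
    A(2,0)=0.; A(2,1)=1.; A(2,2)=4.;

    amrex::LUSolver<3,double> lu(A);   // one-time LU factorization with partial pivoting
    const double b[3] = {1., 2., 3.};
    lu(x, b);                          // forward/back substitution: solves A x = b
    return lu.determinant();           // sign accounts for the number of row swaps
}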
lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); @@ -116,7 +116,7 @@ void LoopConcurrent (Box const& bx, int ncomp, F&& f) noexcept // of the warning, we have to use the functions below for those situations. template -void LoopOnCpu (Dim3 lo, Dim3 hi, F&& f) noexcept +void LoopOnCpu (Dim3 lo, Dim3 hi, F const& f) noexcept { for (int k = lo.z; k <= hi.z; ++k) { for (int j = lo.y; j <= hi.y; ++j) { @@ -126,7 +126,7 @@ void LoopOnCpu (Dim3 lo, Dim3 hi, F&& f) noexcept } template -void LoopOnCpu (Dim3 lo, Dim3 hi, int ncomp, F&& f) noexcept +void LoopOnCpu (Dim3 lo, Dim3 hi, int ncomp, F const& f) noexcept { for (int n = 0; n < ncomp; ++n) { for (int k = lo.z; k <= hi.z; ++k) { @@ -137,7 +137,7 @@ void LoopOnCpu (Dim3 lo, Dim3 hi, int ncomp, F&& f) noexcept } template -void LoopConcurrentOnCpu (Dim3 lo, Dim3 hi, F&& f) noexcept +void LoopConcurrentOnCpu (Dim3 lo, Dim3 hi, F const& f) noexcept { for (int k = lo.z; k <= hi.z; ++k) { for (int j = lo.y; j <= hi.y; ++j) { @@ -148,7 +148,7 @@ void LoopConcurrentOnCpu (Dim3 lo, Dim3 hi, F&& f) noexcept } template -void LoopConcurrentOnCpu (Dim3 lo, Dim3 hi, int ncomp, F&& f) noexcept +void LoopConcurrentOnCpu (Dim3 lo, Dim3 hi, int ncomp, F const& f) noexcept { for (int n = 0; n < ncomp; ++n) { for (int k = lo.z; k <= hi.z; ++k) { @@ -160,7 +160,7 @@ void LoopConcurrentOnCpu (Dim3 lo, Dim3 hi, int ncomp, F&& f) noexcept } template -void LoopOnCpu (Box const& bx, F&& f) noexcept +void LoopOnCpu (Box const& bx, F const& f) noexcept { const auto lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); @@ -172,7 +172,7 @@ void LoopOnCpu (Box const& bx, F&& f) noexcept } template -void LoopOnCpu (Box const& bx, int ncomp, F&& f) noexcept +void LoopOnCpu (Box const& bx, int ncomp, F const& f) noexcept { const auto lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); @@ -185,7 +185,7 @@ void LoopOnCpu (Box const& bx, int ncomp, F&& f) noexcept } template -void LoopConcurrentOnCpu (Box const& bx, F&& f) noexcept +void LoopConcurrentOnCpu (Box const& bx, F const& f) noexcept { const auto lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); @@ -198,7 +198,7 @@ void LoopConcurrentOnCpu (Box const& bx, F&& f) noexcept } template -void LoopConcurrentOnCpu (Box const& bx, int ncomp, F&& f) noexcept +void LoopConcurrentOnCpu (Box const& bx, int ncomp, F const& f) noexcept { const auto lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); @@ -211,6 +211,30 @@ void LoopConcurrentOnCpu (Box const& bx, int ncomp, F&& f) noexcept }}}} } +// Implementation of "constexpr for" based on +// https://artificial-mind.net/blog/2020/10/31/constexpr-for +// +// Approximates what one would get from a compile-time +// unrolling of the loop +// for (int i = 0; i < N; ++i) { +// f(i); +// } +// +// The mechanism is recursive: we evaluate f(i) at the current +// i and then call the for loop at i+1. f() is a lambda function +// that provides the body of the loop and takes only an integer +// i as its argument. + +template +AMREX_GPU_HOST_DEVICE AMREX_INLINE +constexpr void constexpr_for (F const& f) +{ + if constexpr (I < N) { + f(std::integral_constant()); + constexpr_for(f); + } +} + #include } diff --git a/Src/Base/AMReX_MFIter.H b/Src/Base/AMReX_MFIter.H index bfad2d8c32f..7f0ca4d3539 100644 --- a/Src/Base/AMReX_MFIter.H +++ b/Src/Base/AMReX_MFIter.H @@ -129,10 +129,10 @@ public: [[nodiscard]] Box grownnodaltilebox (int dir, const IntVect& ng) const noexcept; //! Return the valid Box in which the current tile resides. 
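// Usage sketch for constexpr_for (illustrative): the lambda receives a
// std::integral_constant, so the index can be used where a compile-time
// constant is required. do_work is a hypothetical function template.
template <int J> void do_work ();

void unroll_example ()
{
    amrex::constexpr_for<0, 3>([] (auto i) {
        do_work<decltype(i)::value>();   // calls do_work<0>(), do_work<1>(), do_work<2>()
    });
}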
- [[nodiscard]] Box validbox () const noexcept { return fabArray.box((*index_map)[currentIndex]); } + [[nodiscard]] Box validbox () const noexcept { return fabArray->box((*index_map)[currentIndex]); } //! Return the Box of the FAB at which we currently point. - [[nodiscard]] Box fabbox () const noexcept { return fabArray.fabbox((*index_map)[currentIndex]); } + [[nodiscard]] Box fabbox () const noexcept { return fabArray->fabbox((*index_map)[currentIndex]); } //! Increment iterator to the next tile we own. void operator++ () noexcept; @@ -159,11 +159,11 @@ public: [[nodiscard]] int LocalIndex () const noexcept { return local_index_map ? (*local_index_map)[currentIndex] : currentIndex; } //! Constant reference to FabArray over which we're iterating. - [[nodiscard]] const FabArrayBase& theFabArrayBase () const noexcept { return fabArray; } + [[nodiscard]] const FabArrayBase& theFabArrayBase () const noexcept { return *fabArray; } [[nodiscard]] int tileIndex () const noexcept {return currentIndex;} - [[nodiscard]] const DistributionMapping& DistributionMap () const noexcept { return fabArray.DistributionMap(); } + [[nodiscard]] const DistributionMapping& DistributionMap () const noexcept { return fabArray->DistributionMap(); } static int allowMultipleMFIters (int allow); @@ -173,7 +173,7 @@ protected: std::unique_ptr m_fa; //!< This must be the first member! - const FabArrayBase& fabArray; + const FabArrayBase* fabArray; IntVect tile_size; diff --git a/Src/Base/AMReX_MFIter.cpp b/Src/Base/AMReX_MFIter.cpp index b05d0bbb2af..f68ab9ce35f 100644 --- a/Src/Base/AMReX_MFIter.cpp +++ b/Src/Base/AMReX_MFIter.cpp @@ -20,7 +20,7 @@ MFIter::allowMultipleMFIters (int allow) MFIter::MFIter (const FabArrayBase& fabarray_, unsigned char flags_) : - fabArray(fabarray_), + fabArray(&fabarray_), tile_size((flags_ & Tiling) ? FabArrayBase::mfiter_tile_size : IntVect::TheZeroVector()), flags(flags_), streams(Gpu::numGpuStreams()), @@ -38,7 +38,7 @@ MFIter::MFIter (const FabArrayBase& fabarray_, MFIter::MFIter (const FabArrayBase& fabarray_, bool do_tiling_) : - fabArray(fabarray_), + fabArray(&fabarray_), tile_size((do_tiling_) ? FabArrayBase::mfiter_tile_size : IntVect::TheZeroVector()), flags(do_tiling_ ? Tiling : 0), streams(Gpu::numGpuStreams()), @@ -57,7 +57,7 @@ MFIter::MFIter (const FabArrayBase& fabarray_, const IntVect& tilesize_, unsigned char flags_) : - fabArray(fabarray_), + fabArray(&fabarray_), tile_size(tilesize_), flags(flags_ | Tiling), streams(Gpu::numGpuStreams()), @@ -75,7 +75,7 @@ MFIter::MFIter (const FabArrayBase& fabarray_, MFIter::MFIter (const BoxArray& ba, const DistributionMapping& dm, unsigned char flags_) : m_fa(std::make_unique(ba,dm,1,0)), - fabArray(*m_fa), + fabArray(m_fa.get()), tile_size((flags_ & Tiling) ? FabArrayBase::mfiter_tile_size : IntVect::TheZeroVector()), flags(flags_), streams(Gpu::numGpuStreams()), @@ -99,7 +99,7 @@ MFIter::MFIter (const BoxArray& ba, const DistributionMapping& dm, unsigned char MFIter::MFIter (const BoxArray& ba, const DistributionMapping& dm, bool do_tiling_) : m_fa(std::make_unique(ba,dm,1,0)), - fabArray(*m_fa), + fabArray(m_fa.get()), tile_size((do_tiling_) ? FabArrayBase::mfiter_tile_size : IntVect::TheZeroVector()), flags(do_tiling_ ? 
Tiling : 0), streams(Gpu::numGpuStreams()), @@ -125,7 +125,7 @@ MFIter::MFIter (const BoxArray& ba, const DistributionMapping& dm, const IntVect& tilesize_, unsigned char flags_) : m_fa(std::make_unique(ba,dm,1,0)), - fabArray(*m_fa), + fabArray(m_fa.get()), tile_size(tilesize_), flags(flags_ | Tiling), streams(Gpu::numGpuStreams()), @@ -150,7 +150,7 @@ MFIter::MFIter (const BoxArray& ba, const DistributionMapping& dm, MFIter::MFIter (const BoxArray& ba, const DistributionMapping& dm, const MFItInfo& info) : m_fa(std::make_unique(ba, dm, 1, 0)), - fabArray(*m_fa), + fabArray(m_fa.get()), tile_size(info.tilesize), flags(info.do_tiling ? Tiling : 0), streams(std::max(1,std::min(Gpu::numGpuStreams(),info.num_streams))), @@ -182,7 +182,7 @@ MFIter::MFIter (const BoxArray& ba, const DistributionMapping& dm, const MFItInf MFIter::MFIter (const FabArrayBase& fabarray_, const MFItInfo& info) : - fabArray(fabarray_), + fabArray(&fabarray_), tile_size(info.tilesize), flags(info.do_tiling ? Tiling : 0), streams(std::max(1,std::min(Gpu::numGpuStreams(),info.num_streams))), @@ -282,14 +282,14 @@ MFIter::Initialize () if (flags & AllBoxes) // a very special case { - index_map = &(fabArray.IndexArray()); + index_map = &(fabArray->IndexArray()); currentIndex = 0; beginIndex = 0; endIndex = static_cast(index_map->size()); } else { - const FabArrayBase::TileArray* pta = fabArray.getTileArray(tile_size); + const FabArrayBase::TileArray* pta = fabArray->getTileArray(tile_size); index_map = &(pta->indexMap); local_index_map = &(pta->localIndexMap); @@ -366,7 +366,7 @@ MFIter::Initialize () Gpu::Device::setStreamIndex(currentIndex%streams); #endif - typ = fabArray.boxArray().ixType(); + typ = fabArray->boxArray().ixType(); } } @@ -462,7 +462,7 @@ MFIter::growntilebox (int a_ng) const noexcept { Box bx = tilebox(); IntVect ngv{a_ng}; - if (a_ng < -100) { ngv = fabArray.nGrowVect(); } + if (a_ng < -100) { ngv = fabArray->nGrowVect(); } const Box& vbx = validbox(); for (int d=0; dnGrowVect(); } return grownnodaltilebox(dir, ngv); } diff --git a/Src/Base/AMReX_MFParallelForC.H b/Src/Base/AMReX_MFParallelForC.H index cd050b2e331..b2269df1ea1 100644 --- a/Src/Base/AMReX_MFParallelForC.H +++ b/Src/Base/AMReX_MFParallelForC.H @@ -10,7 +10,7 @@ namespace amrex::experimental::detail { template std::enable_if_t::value> -ParallelFor (MF const& mf, IntVect const& nghost, IntVect const& ts, bool dynamic, F&& f) +ParallelFor (MF const& mf, IntVect const& nghost, IntVect const& ts, bool dynamic, F const& f) { #ifdef AMREX_USE_OMP #pragma omp parallel @@ -33,7 +33,7 @@ ParallelFor (MF const& mf, IntVect const& nghost, IntVect const& ts, bool dynami template std::enable_if_t::value> -ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const& ts, bool dynamic, F&& f) +ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const& ts, bool dynamic, F const& f) { #ifdef AMREX_USE_OMP #pragma omp parallel diff --git a/Src/Base/AMReX_MFParallelForG.H b/Src/Base/AMReX_MFParallelForG.H index ba65b18937d..066e46f3b89 100644 --- a/Src/Base/AMReX_MFParallelForG.H +++ b/Src/Base/AMReX_MFParallelForG.H @@ -12,36 +12,33 @@ namespace amrex { namespace detail { inline -void build_par_for_nblocks (char*& a_hp, char*& a_dp, std::pair& blocks_x, Box*& pboxes, +void build_par_for_nblocks (char*& a_hp, char*& a_dp, std::pair& blocks_x, BoxIndexer*& pboxes, Vector const& boxes, Vector const& ncells, int nthreads) { if (!ncells.empty()) { const int nboxes = ncells.size(); - const std::size_t nbytes_boxes = 
amrex::aligned_size(16, (nboxes+1) * sizeof(int)); - const std::size_t nbytes = nbytes_boxes + nboxes*sizeof(Box); + const std::size_t nbytes_boxes = amrex::aligned_size(alignof(BoxIndexer), (nboxes+1) * sizeof(int)); + const std::size_t nbytes = nbytes_boxes + nboxes*sizeof(BoxIndexer); a_hp = (char*)The_Pinned_Arena()->alloc(nbytes); int* hp_blks = (int*)a_hp; - Box* hp_boxes = (Box*)(a_hp + nbytes_boxes); + auto* hp_boxes = (BoxIndexer*)(a_hp + nbytes_boxes); hp_blks[0] = 0; - Long ntot = 0; bool same_size = true; for (int i = 0; i < nboxes; ++i) { Long nblocks = (ncells[i] + nthreads-1) / nthreads; + AMREX_ASSERT((hp_blks[i]+nblocks) <= Long(std::numeric_limits::max())); hp_blks[i+1] = hp_blks[i] + static_cast(nblocks); - ntot += nblocks; same_size = same_size && (ncells[i] == ncells[0]); - new (hp_boxes+i) Box(boxes[i]); + new (hp_boxes+i) BoxIndexer(boxes[i]); } - amrex::ignore_unused(ntot); - AMREX_ASSERT(static_cast(hp_blks[nboxes]) == ntot); // no overflow a_dp = (char*) The_Arena()->alloc(nbytes); Gpu::htod_memcpy_async(a_dp, a_hp, nbytes); blocks_x.first = hp_blks; blocks_x.second = (same_size) ? nullptr : (int*)a_dp; - pboxes = (Box*)(a_dp + nbytes_boxes); + pboxes = (BoxIndexer*)(a_dp + nbytes_boxes); } } @@ -75,7 +72,7 @@ namespace parfor_mf_detail { template std::enable_if_t::value> -ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, bool, F&& f) +ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, bool, F const& f) { const auto& index_array = mf.IndexArray(); const int nboxes = index_array.size(); @@ -94,7 +91,7 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo const int nblocks = par_for_blocks.first[nboxes]; const int block_0_size = par_for_blocks.first[1]; const int* dp_nblocks = par_for_blocks.second; - const Box* dp_boxes = parforinfo.getBoxes(); + const BoxIndexer* dp_boxes = parforinfo.getBoxes(); #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) @@ -102,13 +99,14 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo <<>> ([=] AMREX_GPU_DEVICE () noexcept { - int ibox, icell; + int ibox; + std::uint64_t icell; if (dp_nblocks) { ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast(blockIdx.x)); - icell = (blockIdx.x-dp_nblocks[ibox])*MT + threadIdx.x; + icell = std::uint64_t(blockIdx.x-dp_nblocks[ibox])*MT + threadIdx.x; } else { ibox = blockIdx.x / block_0_size; - icell = (blockIdx.x-ibox*block_0_size)*MT + threadIdx.x; + icell = std::uint64_t(blockIdx.x-ibox*block_0_size)*MT + threadIdx.x; } #elif defined(AMREX_USE_SYCL) @@ -116,27 +114,21 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo amrex::launch(nblocks, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept { - int ibox, icell; + int ibox; + std::uint64_t icell; int blockIdxx = item.get_group_linear_id(); int threadIdxx = item.get_local_linear_id(); if (dp_nblocks) { ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast(blockIdxx)); - icell = (blockIdxx-dp_nblocks[ibox])*MT + threadIdxx; + icell = std::uint64_t(blockIdxx-dp_nblocks[ibox])*MT + threadIdxx; } else { ibox = blockIdxx / block_0_size; - icell = (blockIdxx-ibox*block_0_size)*MT + threadIdxx; + icell = std::uint64_t(blockIdxx-ibox*block_0_size)*MT + threadIdxx; } #endif - Box const& b = dp_boxes[ibox]; - int ncells = b.numPts(); - if (icell < ncells) { - const auto len = amrex::length(b); - int k = icell / (len.x*len.y); - int j = (icell - k*(len.x*len.y)) / len.x; - 
int i = (icell - k*(len.x*len.y)) - j*len.x; - AMREX_D_TERM(i += b.smallEnd(0);, - j += b.smallEnd(1);, - k += b.smallEnd(2);) + BoxIndexer const& indexer = dp_boxes[ibox]; + if (icell < indexer.numPts()) { + auto [i, j, k] = indexer(icell); for (int n = 0; n < ncomp; ++n) { parfor_mf_detail::call_f(f, ibox, i, j, k, n); } diff --git a/Src/Base/AMReX_MPMD.H b/Src/Base/AMReX_MPMD.H index 79ff3dd69b4..1c7ad218c1f 100644 --- a/Src/Base/AMReX_MPMD.H +++ b/Src/Base/AMReX_MPMD.H @@ -10,6 +10,8 @@ namespace amrex::MPMD { +void Initialize_without_split (int argc, char* argv[]); + MPI_Comm Initialize (int argc, char* argv[]); void Finalize (); @@ -18,12 +20,16 @@ bool Initialized (); int MyProc (); //! Process ID in MPI_COMM_WORLD int NProcs (); //! Number of processes in MPI_COMM_WORLD +int AppNum (); //! Get the appnum (color) required for MPI_Comm_split int MyProgId (); //! Program ID class Copier { public: - Copier (BoxArray const& ba, DistributionMapping const& dm); + explicit Copier (bool); + + Copier (BoxArray const& ba, DistributionMapping const& dm, + bool send_ba = false); template void send (FabArray const& mf, int icomp, int ncomp) const; @@ -31,9 +37,16 @@ public: template void recv (FabArray& mf, int icomp, int ncomp) const; + [[nodiscard]] BoxArray const& boxArray () const; + + [[nodiscard]] DistributionMapping const& DistributionMap () const; + private: std::map m_SndTags; std::map m_RcvTags; + bool m_is_thread_safe; + BoxArray m_ba; + DistributionMapping m_dm; }; template @@ -61,7 +74,7 @@ void Copier::send (FabArray const& mf, int icomp, int ncomp) const nbytes += cct.sbox.numPts() * ncomp * sizeof(typename FAB::value_type); } - std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); + std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes); nbytes = amrex::aligned_size(acd, nbytes); // so that bytes are aligned // Also need to align the offset properly @@ -125,7 +138,7 @@ void Copier::recv (FabArray& mf, int icomp, int ncomp) const nbytes += cct.dbox.numPts() * ncomp * sizeof(typename FAB::value_type); } - std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); + std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes); nbytes = amrex::aligned_size(acd, nbytes); // so that nbytes are aligned // Also need to align the offset properly @@ -163,15 +176,16 @@ void Copier::recv (FabArray& mf, int icomp, int ncomp) const #ifdef AMREX_USE_GPU if (Gpu::inLaunchRegion() && (mf.arena()->isDevice() || mf.arena()->isManaged())) { mf.unpack_recv_buffer_gpu(mf, icomp, ncomp, recv_data, recv_size, recv_cctc, - FabArrayBase::COPY, true); + FabArrayBase::COPY, m_is_thread_safe); } else #endif { mf.unpack_recv_buffer_cpu(mf, icomp, ncomp, recv_data, recv_size, recv_cctc, - FabArrayBase::COPY, true); + FabArrayBase::COPY, m_is_thread_safe); } } + } #endif diff --git a/Src/Base/AMReX_MPMD.cpp b/Src/Base/AMReX_MPMD.cpp index 51b76d4bf94..bbaae32902c 100644 --- a/Src/Base/AMReX_MPMD.cpp +++ b/Src/Base/AMReX_MPMD.cpp @@ -17,6 +17,7 @@ namespace { MPI_Comm app_comm = MPI_COMM_NULL; int myproc; int nprocs; + int appnum; } namespace { @@ -31,7 +32,16 @@ int num_unique_elements (std::vector& v) } -MPI_Comm Initialize (int argc, char* argv[]) +/* +Initialize_without_split function assigns and checks the required +AMReX_MPMD variables. This function is internally leveraged by +Initialize function. 
+ +This function needs to be used EXPLICITLY ONLY with pyAMReX (python) +so that the communication split can be performed using a python +library, for example, mpi4py. +*/ +void Initialize_without_split (int argc, char* argv[]) { initialized = true; int flag; @@ -46,7 +56,7 @@ MPI_Comm Initialize (int argc, char* argv[]) int* p; MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_APPNUM, &p, &flag); - int appnum = *p; + appnum = *p; std::vector all_appnum(nprocs); MPI_Allgather(&appnum, 1, MPI_INT, all_appnum.data(), 1, MPI_INT, MPI_COMM_WORLD); @@ -77,13 +87,18 @@ MPI_Comm Initialize (int argc, char* argv[]) } } - if (napps == 2) { - MPI_Comm_split(MPI_COMM_WORLD, appnum, myproc, &app_comm); - } else { - std::cout << "amrex::MPMD only supports two programs." << std::endl; + if (napps != 2) { + std::cout << "amrex::MPMD only supports two programs." << '\n'; MPI_Abort(MPI_COMM_WORLD, 1); } +} + +MPI_Comm Initialize (int argc, char* argv[]) +{ + Initialize_without_split(argc,argv); + MPI_Comm_split(MPI_COMM_WORLD, appnum, myproc, &app_comm); + return app_comm; } @@ -109,12 +124,24 @@ int NProcs () return nprocs; } +/* +AppNum function is provided so that appnum (color) +can be passed to python library (mpi4py) to perform +a pythonic version of MPI_Comm_split. +*/ +int AppNum () +{ + return appnum; +} + int MyProgId () { return (myproc == ParallelDescriptor::MyProc()) ? 0 : 1; } -Copier::Copier (BoxArray const& ba, DistributionMapping const& dm) +Copier::Copier (BoxArray const& ba, DistributionMapping const& dm, + bool send_ba) + : m_ba(ba), m_dm(dm) { int rank_offset = myproc - ParallelDescriptor::MyProc(); int this_root, other_root; @@ -127,7 +154,6 @@ Copier::Copier (BoxArray const& ba, DistributionMapping const& dm) } Vector bv = ba.boxList().data(); - int this_nboxes = static_cast(ba.size()); Vector procs = dm.ProcessorMap(); if (rank_offset != 0) { @@ -138,34 +164,46 @@ Copier::Copier (BoxArray const& ba, DistributionMapping const& dm) Vector obv; Vector oprocs; - int other_nboxes; + int other_nboxes = this_nboxes; if (myproc == this_root) { if (rank_offset == 0) // the first program { MPI_Send(&this_nboxes, 1, MPI_INT, other_root, 0, MPI_COMM_WORLD); - MPI_Recv(&other_nboxes, 1, MPI_INT, other_root, 1, MPI_COMM_WORLD, + if (!send_ba) + { + MPI_Recv(&other_nboxes, 1, MPI_INT, other_root, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - obv.resize(other_nboxes); + obv.resize(other_nboxes); + } MPI_Send(bv.data(), this_nboxes, ParallelDescriptor::Mpi_typemap::type(), other_root, 2, MPI_COMM_WORLD); - MPI_Recv(obv.data(), other_nboxes, + if (!send_ba) + { + MPI_Recv(obv.data(), other_nboxes, ParallelDescriptor::Mpi_typemap::type(), other_root, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - oprocs.resize(other_nboxes); + } MPI_Send(procs.data(), this_nboxes, MPI_INT, other_root, 4, MPI_COMM_WORLD); + oprocs.resize(other_nboxes); MPI_Recv(oprocs.data(), other_nboxes, MPI_INT, other_root, 5, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } else // the second program { - MPI_Recv(&other_nboxes, 1, MPI_INT, other_root, 0, MPI_COMM_WORLD, + if (!send_ba) + { + MPI_Recv(&other_nboxes, 1, MPI_INT, other_root, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + obv.resize(other_nboxes); + } MPI_Send(&this_nboxes, 1, MPI_INT, other_root, 1, MPI_COMM_WORLD); - obv.resize(other_nboxes); - MPI_Recv(obv.data(), other_nboxes, + if (!send_ba) + { + MPI_Recv(obv.data(), other_nboxes, ParallelDescriptor::Mpi_typemap::type(), other_root, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } MPI_Send(bv.data(), this_nboxes, ParallelDescriptor::Mpi_typemap::type(), 
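// Sketch of the equivalence described above (names from this diff): the new
// Initialize() is Initialize_without_split() plus the communicator split, and
// AppNum() exposes the color so mpi4py can perform the same split in Python.
#include <AMReX_MPMD.H>
#include <mpi.h>

void mpmd_init_sketch (int argc, char* argv[])
{
    amrex::MPMD::Initialize_without_split(argc, argv);
    MPI_Comm app_comm;
    MPI_Comm_split(MPI_COMM_WORLD, amrex::MPMD::AppNum(),
                   amrex::MPMD::MyProc(), &app_comm);
    // app_comm now matches what amrex::MPMD::Initialize(argc, argv) returns.
}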
other_root, 3, MPI_COMM_WORLD); @@ -176,15 +214,23 @@ Copier::Copier (BoxArray const& ba, DistributionMapping const& dm) } } - ParallelDescriptor::Bcast(&other_nboxes, 1); - if (obv.empty()) { - obv.resize(other_nboxes); + if (!send_ba) { + ParallelDescriptor::Bcast(&other_nboxes, 1); + if (obv.empty()){ + obv.resize(other_nboxes); + } + ParallelDescriptor::Bcast(obv.data(), obv.size()); + } + + if (oprocs.empty()) { oprocs.resize(other_nboxes); } - ParallelDescriptor::Bcast(obv.data(), obv.size()); ParallelDescriptor::Bcast(oprocs.data(), oprocs.size()); - BoxArray oba(BoxList(std::move(obv))); + BoxArray oba; + if (!obv.empty()) { + oba.define(BoxList(std::move(obv))); + } // At this point, ba and bv hold our boxes, and oba holds the other // program's boxes. procs holds mpi ranks of our boxes, and oprocs holds @@ -192,32 +238,138 @@ Copier::Copier (BoxArray const& ba, DistributionMapping const& dm) // MPI_COMM_WORLD. // Build communication meta-data - - AMREX_ALWAYS_ASSERT(ba.ixType().cellCentered()); + if (!send_ba){ + AMREX_ALWAYS_ASSERT(ba.ixType() == oba.ixType()); + m_is_thread_safe = ba.ixType().cellCentered(); + }else{ + m_is_thread_safe = true; + } std::vector > isects; for (int i = 0; i < this_nboxes; ++i) { if (procs[i] == myproc) { - oba.intersections(bv[i], isects); + if (!send_ba){ + oba.intersections(bv[i], isects); + } + else{ + isects.resize(0); + isects.emplace_back(i,bv[i]); + } for (auto const& isec : isects) { const int oi = isec.first; const Box& bx = isec.second; const int orank = oprocs[oi]; - m_SndTags[orank].push_back - (FabArrayBase::CopyComTag(bx, bx, oi, i)); - m_RcvTags[orank].push_back - (FabArrayBase::CopyComTag(bx, bx, i, oi)); + m_SndTags[orank].emplace_back(bx, bx, oi, i); + m_RcvTags[orank].emplace_back(bx, bx, i, oi); } } } - for (auto& kv : m_SndTags) { - std::sort(kv.second.begin(), kv.second.end()); + if (!send_ba){ + for (auto& kv : m_SndTags) { + std::sort(kv.second.begin(), kv.second.end()); + } + for (auto& kv : m_RcvTags) { + std::sort(kv.second.begin(), kv.second.end()); + } } - for (auto& kv : m_RcvTags) { - std::sort(kv.second.begin(), kv.second.end()); +} + +Copier::Copier (bool) + : m_is_thread_safe(true) +{ + int rank_offset = myproc - ParallelDescriptor::MyProc(); + int this_root, other_root; + if (rank_offset == 0) { // First program + this_root = 0; + other_root = ParallelDescriptor::NProcs(); + } else { + this_root = rank_offset; + other_root = 0; } + + Vector bv; + int this_nboxes; + + if (myproc == this_root) { + int tags[2]; + if (rank_offset == 0) // the first program + { + tags[0] = 1; + tags[1] = 3; + } + else // the second program + { + tags[0] = 0; + tags[1] = 2; + } + + MPI_Recv(&this_nboxes, 1, MPI_INT, other_root, tags[0], MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + bv.resize(this_nboxes); + MPI_Recv(bv.data(), this_nboxes, + ParallelDescriptor::Mpi_typemap::type(), + other_root, tags[1], MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + + ParallelDescriptor::Bcast(&this_nboxes, 1); + if (bv.empty()) { + bv.resize(this_nboxes); + } + + ParallelDescriptor::Bcast(bv.data(), bv.size()); + m_ba.define(BoxList(std::move(bv))); + m_dm.define(m_ba); + Vector procs = m_dm.ProcessorMap(); + if (rank_offset != 0) { + for (int i = 0; i < this_nboxes; ++i) { + procs[i] += rank_offset; + } + } + + Vector oprocs(this_nboxes); + if (myproc == this_root) { + if (rank_offset == 0) // the first program + { + MPI_Send(procs.data(), this_nboxes, MPI_INT, other_root, 4, MPI_COMM_WORLD); + MPI_Recv(oprocs.data(), this_nboxes, MPI_INT, other_root, 5, 
MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + } + else // the second program + { + MPI_Recv(oprocs.data(), this_nboxes, MPI_INT, other_root, 4, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + MPI_Send(procs.data(), this_nboxes, MPI_INT, other_root, 5, MPI_COMM_WORLD); + } + } + + ParallelDescriptor::Bcast(oprocs.data(), oprocs.size()); + + // procs holds mpi ranks of our boxes, and oprocs holds + // mpi ranks of the other program's boxes. All mpi ranks are in + // MPI_COMM_WORLD. + + // Build communication meta-data + + for (int i = 0; i < this_nboxes; ++i) { + if (procs[i] == myproc) { + const Box& bx = m_ba[i]; + const int orank = oprocs[i]; + m_SndTags[orank].emplace_back(bx, bx, i, i); + m_RcvTags[orank].emplace_back(bx, bx, i, i); + } + } +} + +BoxArray const& Copier::boxArray () const +{ + return m_ba; +} + +DistributionMapping const& Copier::DistributionMap () const +{ + return m_dm; } } diff --git a/Src/Base/AMReX_Machine.cpp b/Src/Base/AMReX_Machine.cpp index bcb420b3f67..97de84f30c8 100644 --- a/Src/Base/AMReX_Machine.cpp +++ b/Src/Base/AMReX_Machine.cpp @@ -227,14 +227,14 @@ class Machine auto sg_rank_n = sg_g_ranks.size(); if (flag_verbose) { Print() << "Machine::find_best_nbh(): called for " << nbh_rank_n - << " of " << sg_rank_n << " ranks" << std::endl; + << " of " << sg_rank_n << " ranks" << '\n'; } Vector result; auto key = NeighborhoodCache::hash(sg_g_ranks, nbh_rank_n); if (nbh_cache.get(key, result)) { if (flag_verbose) { - Print() << "Machine::find_best_nbh(): found neighborhood in cache" << std::endl; + Print() << "Machine::find_best_nbh(): found neighborhood in cache" << '\n'; } } else { // get node IDs of current subgroup @@ -251,10 +251,10 @@ class Machine } if (flag_very_verbose) { - Print() << "SubRank: GloRank: Node ID: Node Coord:" << std::endl; + Print() << "SubRank: GloRank: Node ID: Node Coord:" << '\n'; for (int i = 0; i < sg_rank_n; ++i) { Print() << " " << i << ": " << sg_g_ranks[i] << ": " << sg_node_ids[i] - << ": " << to_str(id_to_coord(sg_node_ids[i])) << std::endl; + << ": " << to_str(id_to_coord(sg_node_ids[i])) << '\n'; } } @@ -268,8 +268,8 @@ class Machine double base_score; tie(base_nbh, base_score) = baseline_score(sg_node_ids, nbh_rank_n); - Print() << "Baseline neighborhood: " << to_str(base_nbh) << ", score = " << base_score << std::endl; - Print() << "Rank 0's neighborhood: " << to_str(local_nbh) << ", score = " << score << std::endl; + Print() << "Baseline neighborhood: " << to_str(base_nbh) << ", score = " << base_score << '\n' + << "Rank 0's neighborhood: " << to_str(local_nbh) << ", score = " << score << '\n'; } // determine the best neighborhood among ranks @@ -287,12 +287,12 @@ class Machine std::sort(local_nbh.begin(), local_nbh.end()); if (flag_verbose) { Print() << "Winning neighborhood: " << winner_rank << ": " << to_str(local_nbh) - << ", score = " << winner_score << std::endl; + << ", score = " << winner_score << '\n'; } result.reserve(nbh_rank_n); - for (int i = 0; i < local_nbh.size(); ++i) { - for (auto rank : node_ranks.at(local_nbh[i])) { + for (int i : local_nbh) { + for (auto rank : node_ranks.at(i)) { if (result.size() < nbh_rank_n) { result.push_back(rank); } @@ -302,7 +302,7 @@ class Machine } if (flag_very_verbose) { - Print() << "Ranks in neighborhood: " << to_str(result) << std::endl; + Print() << "Ranks in neighborhood: " << to_str(result) << '\n'; } return result; @@ -360,11 +360,11 @@ class Machine topo_addr = get_env_str("SLURM_TOPOLOGY_ADDR"); if (flag_verbose) { - Print() << "HOSTNAME = " << hostname << std::endl; - Print() 
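// Pairing sketch (an assumption inferred from this diff, not documented
// usage): one program ships its layout with send_ba=true, and the other
// reconstructs it through the new bool-tagged constructor.
//
//   // program A
//   amrex::MPMD::Copier copier_a(ba, dm, true);   // send_ba: sends ba and procs
//
//   // program B
//   amrex::MPMD::Copier copier_b(true);           // receives ba, builds a dm
//   auto const& oba = copier_b.boxArray();
//   auto const& odm = copier_b.DistributionMap();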
<< "NERSC_HOST = " << nersc_host << std::endl; - Print() << "SLURM_JOB_PARTITION = " << partition << std::endl; - Print() << "SLURM_NODELIST = " << node_list << std::endl; - Print() << "SLURM_TOPOLOGY_ADDR = " << topo_addr << std::endl; + Print() << "HOSTNAME = " << hostname << '\n' + << "NERSC_HOST = " << nersc_host << '\n' + << "SLURM_JOB_PARTITION = " << partition << '\n' + << "SLURM_NODELIST = " << node_list << '\n' + << "SLURM_TOPOLOGY_ADDR = " << topo_addr << '\n'; } } } @@ -379,19 +379,19 @@ class Machine if (pos != std::string::npos) { result = stoi(topo_addr.substr(pos + tag.size())); // assumes format ".*nid(\d+)" if (flag_verbose) { - Print() << "Got node ID from SLURM_TOPOLOGY_ADDR: " << result << std::endl; + Print() << "Got node ID from SLURM_TOPOLOGY_ADDR: " << result << '\n'; } } else { if (cluster_name == "escori") { tag = "cgpu"; } auto mpi_proc_name = get_mpi_processor_name(); - Print() << "MPI_Get_processor_name: " << mpi_proc_name << std::endl; + Print() << "MPI_Get_processor_name: " << mpi_proc_name << '\n'; pos = mpi_proc_name.find(tag); if (pos != std::string::npos) { result = stoi(mpi_proc_name.substr(pos + tag.size())); // assumes format ".*nid(\d+)" if (flag_verbose) { - Print() << "Got node ID from MPI_Get_processor_name(): " << result << std::endl; + Print() << "Got node ID from MPI_Get_processor_name(): " << result << '\n'; } } } @@ -422,10 +422,10 @@ class Machine for (int i = 0; i < ids.size(); ++i) { node_ranks[ids[i]].push_back(i); } - Print() << "Node ID: Node Coord: Ranks:" << std::endl; + Print() << "Node ID: Node Coord: Ranks:" << '\n'; for (const auto & p : node_ranks) { Print() << " " << p.first << ": " << to_str(id_to_coord(p.first)) - << ": " << to_str(p.second) << std::endl; + << ": " << to_str(p.second) << '\n'; } } return ids; @@ -467,7 +467,7 @@ class Machine if (flag_very_verbose) { Print() << " Distance from " << a.id << " to " << b.id - << ": " << pair_dist << std::endl; + << ": " << pair_dist << '\n'; } } } @@ -482,7 +482,7 @@ class Machine BL_PROFILE("Machine::search_local_nbh()"); if (amrex::Verbose() > 0) { - Print() << "Machine::search_local_nbh() called ..." << std::endl; + Print() << "Machine::search_local_nbh() called ..." 
<< '\n'; } Vector result; @@ -497,11 +497,11 @@ class Machine } if (flag_very_verbose) { - Print() << " Candidates:" << std::endl; + Print() << " Candidates:" << '\n'; for (const auto & p : candidates) { const auto & cand = p.second; Print() << " " << cand.id << " : " << to_str(cand.coord) - << ": " << cand.rank_n << " ranks" << std::endl; + << ": " << cand.rank_n << " ranks" << '\n'; } } @@ -518,7 +518,7 @@ class Machine << ": " << to_str(cur_node.coord) << ", ranks: " << cur_node.rank_n << ", total ranks: " << total_rank_n - << ", avg dist: " << 0 << std::endl; + << ", avg dist: " << 0 << '\n'; } if (total_rank_n >= nbh_rank_n) { return {std::move(result), 0}; @@ -541,7 +541,7 @@ class Machine Print() << " Distance from " << cand_node.id << " to " << cur_node.id << ": " << cand_dist - << ", candidate avg: " << avg_dist << std::endl; + << ", candidate avg: " << avg_dist << '\n'; } // keep track of what should be the next node to add if (avg_dist < min_avg_dist) { @@ -565,7 +565,7 @@ class Machine << ": " << to_str(cur_node.coord) << ", ranks: " << cur_node.rank_n << ", total ranks: " << total_rank_n - << ", avg dist: " << min_avg_dist << std::endl; + << ", avg dist: " << min_avg_dist << '\n'; } } } diff --git a/Src/Base/AMReX_Math.H b/Src/Base/AMReX_Math.H index 769b9bf50f4..42762001268 100644 --- a/Src/Base/AMReX_Math.H +++ b/Src/Base/AMReX_Math.H @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -57,7 +58,7 @@ using std::isinf; #endif template -constexpr std::enable_if_t::value,T> pi () +constexpr std::enable_if_t,T> pi () { return T(3.1415926535897932384626433832795029L); } @@ -68,11 +69,9 @@ double cospi (double x) { #if defined(AMREX_USE_SYCL) return sycl::cospi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::cospi(x); #else - return std::cos(pi()*x); + AMREX_IF_ON_DEVICE(( return ::cospi(x); )) + AMREX_IF_ON_HOST(( return std::cos(pi()*x); )) #endif } @@ -82,11 +81,9 @@ float cospi (float x) { #if defined(AMREX_USE_SYCL) return sycl::cospi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::cospif(x); #else - return std::cos(pi()*x); + AMREX_IF_ON_DEVICE(( return ::cospif(x); )) + AMREX_IF_ON_HOST(( return std::cos(pi()*x); )) #endif } @@ -96,11 +93,9 @@ double sinpi (double x) { #if defined(AMREX_USE_SYCL) return sycl::sinpi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::sinpi(x); #else - return std::sin(pi()*x); + AMREX_IF_ON_DEVICE(( return ::sinpi(x); )) + AMREX_IF_ON_HOST(( return std::sin(pi()*x); )) #endif } @@ -110,14 +105,32 @@ float sinpi (float x) { #if defined(AMREX_USE_SYCL) return sycl::sinpi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::sinpif(x); #else - return std::sin(pi()*x); + AMREX_IF_ON_DEVICE(( return ::sinpif(x); )) + AMREX_IF_ON_HOST(( return std::sin(pi()*x); )) #endif } +namespace detail { + AMREX_FORCE_INLINE void sincos (double x, double* sinx, double* cosx) { +#if defined(_GNU_SOURCE) && !defined(__APPLE__) + ::sincos(x, sinx, cosx); +#else + *sinx = std::sin(x); + *cosx = std::cos(x); +#endif + } + + AMREX_FORCE_INLINE void sincosf (float x, float* sinx, float* cosx) { +#if defined(_GNU_SOURCE) && !defined(__APPLE__) + ::sincosf(x, sinx, cosx); +#else + *sinx = std::sin(x); + 
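// Usage note for the refactored math wrappers (illustrative; the API is
// unchanged):
//   auto c = amrex::Math::cospi(x);            // cos(pi*x)
//   auto [s, cc] = amrex::Math::sincos(x);     // sin(x) and cos(x) in one call
// On device the CUDA/HIP intrinsics are used; on host, glibc's ::sincos is
// used when _GNU_SOURCE is defined (the detail:: helpers here), with
// std::sin/std::cos as the portable fallback.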
*cosx = std::cos(x); +#endif + } +} + //! Return sine and cosine of given number AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::pair sincos (double x) @@ -125,13 +138,9 @@ std::pair sincos (double x) std::pair r; #if defined(AMREX_USE_SYCL) r.first = sycl::sincos(x, sycl::private_ptr(&r.second)); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) || \ - (defined(_GNU_SOURCE) && !defined(__APPLE__)) - ::sincos(x, &r.first, &r.second); #else - r.first = std::sin(x); - r.second = std::cos(x); + AMREX_IF_ON_DEVICE(( ::sincos(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( detail::sincos(x, &r.first, &r.second); )) #endif return r; } @@ -143,13 +152,9 @@ std::pair sincos (float x) std::pair r; #if defined(AMREX_USE_SYCL) r.first = sycl::sincos(x, sycl::private_ptr(&r.second)); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) || \ - (defined(_GNU_SOURCE) && !defined(__APPLE__)) - ::sincosf(x, &r.first, &r.second); #else - r.first = std::sin(x); - r.second = std::cos(x); + AMREX_IF_ON_DEVICE(( ::sincosf(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( detail::sincosf(x, &r.first, &r.second); )) #endif return r; } @@ -159,11 +164,11 @@ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::pair sincospi (double x) { std::pair r; -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - ::sincospi(x, &r.first, &r.second); -#else +#if defined(AMREX_USE_SYCL) r = sincos(pi()*x); +#else + AMREX_IF_ON_DEVICE(( ::sincospi(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( r = sincos(pi()*x); )) #endif return r; } @@ -173,18 +178,18 @@ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::pair sincospi (float x) { std::pair r; -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - ::sincospif(x, &r.first, &r.second); -#else +#if defined(AMREX_USE_SYCL) r = sincos(pi()*x); +#else + AMREX_IF_ON_DEVICE(( ::sincospif(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( r = sincos(pi()*x); )) #endif return r; } //! Return pow(x, Power), where Power is an integer known at compile time template () || Power>=0>::type> + typename = std::enable_if_t() || Power>=0>> AMREX_FORCE_INLINE constexpr T powi (T x) noexcept { @@ -204,6 +209,161 @@ constexpr T powi (T x) noexcept } } +#if defined(AMREX_INT128_SUPPORTED) +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +std::uint64_t umulhi (std::uint64_t a, std::uint64_t b) +{ +#if defined(AMREX_USE_SYCL) + return sycl::mul_hi(a,b); +#else + AMREX_IF_ON_DEVICE(( return __umul64hi(a, b); )) + AMREX_IF_ON_HOST(( + auto tmp = amrex::UInt128_t(a) * amrex::UInt128_t(b); + return std::uint64_t(tmp >> 64); + )) +#endif +} +#endif + +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Object to encapsulate the fast division+modulus operation for 64b integer division. +/// +/// Example: +/// +/// +/// uint64_t quotient, remainder, dividend, divisor; +/// +/// FastDivmodU64 divmod(divisor); +/// +/// divmod(quotient, remainder, dividend); +/// +/// // quotient = (dividend / divisor) +/// // remainder = (dividend % divisor) +/// +struct FastDivmodU64 +{ + std::uint64_t divisor; + +#ifdef AMREX_INT128_SUPPORTED + std::uint64_t multiplier = 1U; + unsigned int shift_right = 0; + unsigned int round_up = 0; + + // + // Static methods + // + + /// Computes b, where 2^b is the greatest power of two that is less than or equal to x + static std::uint32_t integer_log2 (std::uint64_t x) + { + std::uint32_t n = 0; + while (x >>= 1) { + ++n; + } + return n; + } + + /// Construct the FastDivmod object, in host code only + /// + /// This precomputes some values based on the divisor and is computationally expensive. + FastDivmodU64 (std::uint64_t divisor_) + : divisor(divisor_) + { + if (divisor) { + shift_right = integer_log2(divisor); + + if ((divisor & (divisor - 1)) == 0) { + multiplier = 0; + } + else { + std::uint64_t power_of_two = (std::uint64_t(1) << shift_right); + auto n = amrex::UInt128_t(power_of_two) << 64; + std::uint64_t multiplier_lo = n / divisor; + n += power_of_two; + multiplier = n / divisor; + round_up = (multiplier_lo == multiplier ? 
1 : 0); + } + } + } + +#else + + FastDivmodU64 (std::uint64_t divisor_) : divisor(divisor_) {} + +#endif + + /// Returns the quotient of floor(dividend / divisor) + [[nodiscard]] AMREX_GPU_HOST_DEVICE + std::uint64_t divide (std::uint64_t dividend) const + { +#if defined(AMREX_INT128_SUPPORTED) + auto x = dividend; + if (multiplier) { + x = amrex::Math::umulhi(dividend + round_up, multiplier); + } + return (x >> shift_right); +#else + return dividend / divisor; +#endif + } + + /// Computes the remainder given a computed quotient and dividend + [[nodiscard]] AMREX_GPU_HOST_DEVICE + std::uint64_t modulus (std::uint64_t quotient, std::uint64_t dividend) const + { + return dividend - quotient * divisor; + } + + /// Returns the quotient of floor(dividend / divisor) and computes the remainder + [[nodiscard]] AMREX_GPU_HOST_DEVICE + std::uint64_t divmod (std::uint64_t &remainder, std::uint64_t dividend) const + { + auto quotient = divide(dividend); + remainder = modulus(quotient, dividend); + return quotient; + } + + /// Computes integer division and modulus using precomputed values. This is computationally + /// inexpensive. + AMREX_GPU_HOST_DEVICE + void operator() (std::uint64_t &quotient, std::uint64_t &remainder, std::uint64_t dividend) const + { + quotient = divmod(remainder, dividend); + } +}; + } #endif diff --git a/Src/Base/AMReX_MultiFab.H b/Src/Base/AMReX_MultiFab.H index 416c4540da5..b2fc9c80a1f 100644 --- a/Src/Base/AMReX_MultiFab.H +++ b/Src/Base/AMReX_MultiFab.H @@ -4,16 +4,13 @@ #include #include +#include #include #include #include #include #include -#ifdef AMREX_USE_EB -#include -#endif - #include namespace amrex @@ -24,16 +21,17 @@ using fMultiFab = FabArray >; class iMultiFab; /** - * \brief - * A collection (stored as an array) of FArrayBox objects. + * \brief A collection (stored as an array) of FArrayBox objects. + * * This class is useful for storing floating point data on a domain defined by * a union of rectangular regions embedded in a uniform index space. * MultiFab class extends the function of the underlying FabArray class just * as the FArrayBox class extends the function of BaseFab. - * Additional member functions are defined for I/O and simple arithmetic operations on these aggregate objects. + * Additional member functions are defined for I/O and simple arithmetic + * operations on these aggregate objects. + * * This class does NOT provide a copy constructor or assignment operator. */ - class MultiFab : public FabArray @@ -41,34 +39,36 @@ class MultiFab public: /** - * \brief Constructs an empty MultiFab. Data can be defined at a later - * time using the define member functions inherited - * from FabArray. + * \brief Constructs an empty MultiFab. + * + * Data can be defined at a later time using the define member functions + * inherited from FabArray. */ MultiFab () noexcept; /** - * \brief Constructs an empty MultiFab. Data can be defined at a later - * time using the define member functions inherited from FabArray. If - * `define` is called later with a nulltpr as MFInfo's arena, the default - * Arena `a` will be used. If the arena in MFInfo is not a nullptr, the - * MFInfo's arena will be used. + * \brief Constructs an empty MultiFab. + * + * Data can be defined at a later time using the define member functions. + * If `define` is called later with a nullptr as MFInfo's arena, the + * default Arena `a` will be used. If the arena in MFInfo is not a + * nullptr, the MFInfo's arena will be used.
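// Illustrative sketch (not part of the patch): how the FastDivmodU64 helper
// added above might be used. Only the members shown in this diff are assumed;
// the divisor value and variable names are hypothetical.
#include <cstdint>
#include <AMReX_Math.H>

std::uint64_t seconds_to_days (std::uint64_t s)
{
    amrex::Math::FastDivmodU64 divmod(86400u); // precompute once; construction is the expensive part
    std::uint64_t days, rem;
    divmod(days, rem, s);                      // days = s / 86400, rem = s % 86400 in one cheap call
    return days;
}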
*/ explicit MultiFab (Arena* a) noexcept; /** - * \brief - * Constructs a MultiFab - * \param bs a valid region + * \brief Constructs a MultiFab + * + * The size of the FArrayBox is given by the Box grown by \p ngrow, and + * the number of components is given by \p ncomp. If \p info is set to + * not allocating memory, then no FArrayBoxes are allocated at + * this time but can be defined later. + * + * \param bxs a valid region * \param dm a DistribuionMapping * \param ncomp number of components * \param ngrow number of cells the region grows * \param info MFInfo - - * The size of the FArrayBox is given by the Box grown by ngrow, and - * the number of components is given by ncomp. If info is set to - * not allocating memory, then no FArrayBoxes are allocated at - * this time but can be defined later. */ MultiFab (const BoxArray& bxs, const DistributionMapping& dm, @@ -95,10 +95,11 @@ public: #endif /** - * \brief Make an alias MultiFab. maketype must be - * amrex::make_alias. scomp is the starting component of the - * alias and ncomp is the number of components in the new aliasing - * MultiFab. + * \brief Make an alias MultiFab. + * + * Note that \p maketype must be `amrex::make_alias`, + * \p scomp is the starting component of the alias, and + * \p ncomp is the number of components in the new aliasing MultiFab. */ MultiFab (const MultiFab& rhs, MakeType maketype, int scomp, int ncomp); @@ -135,11 +136,13 @@ public: #endif MultiFab& operator= (Real r); - // + /** - * \brief Returns the minimum value contained in component comp of the - * MultiFab. The parameter nghost determines the number of - * boundary cells to search for the minimum. The default is to + * \brief Returns the minimum value contained in component \p comp of the + * MultiFab. + * + * The parameter \p nghost determines the number of + * boundary cells to search for the minimum. The default is to * search only the valid regions of the FArrayBoxes. */ [[nodiscard]] Real min (int comp, @@ -154,16 +157,18 @@ public: int nghost = 0, bool local = false) const; /** - * \brief Returns the maximum value contained in component comp of the - * MultiFab. The parameter nghost determines the number of - * boundary cells to search for the maximum. The default is to + * \brief Returns the maximum value contained in component \p comp of the + * MultiFab. + * + * The parameter \p nghost determines the number of + * boundary cells to search for the maximum. The default is to * search only the valid regions of the FArrayBoxes. */ [[nodiscard]] Real max (int comp, int nghost = 0, bool local = false) const; /** - * \brief Identical to the previous max() function, but confines its + * \brief Identical to the previous `max()` function, but confines its * search to intersection of Box b and the MultiFab. */ [[nodiscard]] Real max (const Box& region, @@ -191,7 +196,7 @@ public: /** * \brief Returns the maximum *absolute* values contained in - * each component of "comps" of the MultiFab. "nghost" ghost cells are used. + * each component of \p comps of the MultiFab. \p nghost ghost cells are used. */ [[nodiscard]] Vector norm0 (const Vector& comps, int nghost = 0, bool local = false, bool ignore_covered = false ) const; [[nodiscard]] Vector norminf (const Vector& comps, int nghost = 0, bool local = false, bool ignore_covered = false) const { @@ -199,13 +204,14 @@ public: } /** - * \brief Returns the L1 norm of component "comp" over the MultiFab. + * \brief Returns the L1 norm of component \p comp over the MultiFab. + * * No ghost cells are used. 
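// Illustrative sketch (not part of the patch): the min/max queries documented
// above. The MultiFab `mf` is assumed to be defined and filled elsewhere.
#include <AMReX_MultiFab.H>
#include <AMReX_Print.H>

void report_range (amrex::MultiFab const& mf)
{
    amrex::Real lo = mf.min(0);    // component 0, valid cells only (nghost = 0)
    amrex::Real hi = mf.max(0, 1); // component 0, also search one layer of ghost cells
    amrex::Print() << "range: [" << lo << ", " << hi << "]\n";
}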
This version has no double counting for nodal data. */ [[nodiscard]] Real norm1 (int comp, const Periodicity& period, bool ignore_covered = false) const; /** - * \brief Returns the L1 norm of component "comp" over the MultiFab. - * ngrow ghost cells are used. + * \brief Returns the L1 norm of component \p comp over the MultiFab. + * \p ngrow ghost cells are used. */ [[nodiscard]] Real norm1 (int comp = 0, int ngrow = 0, bool local = false) const; /** @@ -214,12 +220,12 @@ public: */ [[nodiscard]] Vector norm1 (const Vector& comps, int ngrow = 0, bool local = false) const; /** - * \brief Returns the L2 norm of component "comp" over the MultiFab. + * \brief Returns the L2 norm of component \p comp over the MultiFab. * No ghost cells are used. */ [[nodiscard]] Real norm2 (int comp = 0) const; /** - * \brief Returns the L2 norm of component "comp" over the MultiFab. + * \brief Returns the L2 norm of component \p comp over the MultiFab. * No ghost cells are used. This version has no double counting for nodal data. */ [[nodiscard]] Real norm2 (int comp, const Periodicity& period) const; @@ -236,16 +242,17 @@ public: using FabArray::sum; /** - * \brief Same as sum with local=false, but for non-cell-centered data, this - * skips non-unique points that are owned by multiple boxes. + * \brief Same as sum with \p local =false, but for non-cell-centered data, this + * skips non-unique points that are owned by multiple boxes. */ [[nodiscard]] Real sum_unique (int comp = 0, bool local = false, const Periodicity& period = Periodicity::NonPeriodic()) const; /** - * \brief Adds the scalar value val to the value of each cell in the - * specified subregion of the MultiFab. The subregion consists - * of the num_comp components starting at component comp. + * \brief Adds the scalar value \p val to the value of each cell in the + * specified subregion of the MultiFab. + * + * The subregion consists of the \p num_comp components starting at component \p comp. * The value of nghost specifies the number of cells in the * boundary region of each FArrayBox in the subregion that should * be modified. diff --git a/Src/Base/AMReX_MultiFabUtil.H b/Src/Base/AMReX_MultiFabUtil.H index ad1fa669f32..5c05850f346 100644 --- a/Src/Base/AMReX_MultiFabUtil.H +++ b/Src/Base/AMReX_MultiFabUtil.H @@ -19,67 +19,76 @@ namespace amrex const MultiFab& nd, int scomp, int ncomp, int ngrow = 0); - //! Average edge-based MultiFab onto cell-centered MultiFab. This fills in - //! ngrow ghost cells in the cell-centered MultiFab. Both cell centered and - //! edge centered MultiFabs need to have ngrow ghost values + /** + * \brief Average edge-based MultiFab onto cell-centered MultiFab. + * + * This fills in \p ngrow ghost cells in the cell-centered MultiFab. Both cell centered and + * edge centered MultiFabs need to have \p ngrow ghost values. + */ void average_edge_to_cellcenter (MultiFab& cc, int dcomp, const Vector& edge, int ngrow = 0); - //! Average face-based MultiFab onto cell-centered MultiFab. void average_face_to_cellcenter (MultiFab& cc, int dcomp, const Vector& fc, int ngrow = 0); - + //! Average face-based FabArray onto cell-centered FabArray. template && IsFabArray_v, int> = 0> void average_face_to_cellcenter (CMF& cc, int dcomp, const Array& fc, int ngrow = 0); - + //! Average face-based MultiFab onto cell-centered MultiFab with geometric weighting. void average_face_to_cellcenter (MultiFab& cc, const Vector& fc, const Geometry& geom); + //! Average face-based MultiFab onto cell-centered MultiFab with geometric weighting. 
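// Illustrative sketch (not part of the patch): averaging face-centered data to
// cell centers with the declarations around this hunk. The function and
// variable names here are hypothetical, and the Array-of-pointers signature is
// reconstructed under that assumption.
#include <AMReX_Array.H>
#include <AMReX_MultiFab.H>
#include <AMReX_MultiFabUtil.H>

void faces_to_cc (amrex::MultiFab& cc,
                  amrex::Array<amrex::MultiFab const*, AMREX_SPACEDIM> const& faces)
{
    // Fill cc components 0..AMREX_SPACEDIM-1 with the average of the two
    // bounding faces in each direction; no ghost cells (ngrow = 0).
    amrex::average_face_to_cellcenter(cc, 0, faces, 0);
}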
void average_face_to_cellcenter (MultiFab& cc, const Array& fc, const Geometry& geom); - - //! Average cell-centered MultiFab onto face-based MultiFab. + //! Average cell-centered MultiFab onto face-based MultiFab with geometric weighting. void average_cellcenter_to_face (const Vector& fc, const MultiFab& cc, const Geometry& geom, int ncomp = 1, bool use_harmonic_averaging = false); + //! Average cell-centered MultiFab onto face-based MultiFab with geometric weighting. void average_cellcenter_to_face (const Array& fc, const MultiFab& cc, const Geometry& geom, int ncomp = 1, bool use_harmonic_averaging = false); - //! Average fine face-based MultiFab onto crse face-based MultiFab. + //! Average fine face-based FabArray onto crse face-based FabArray. template ::value,int> = 0> void average_down_faces (const Vector& fine, const Vector& crse, const IntVect& ratio, int ngcrse = 0); + //! Average fine face-based FabArray onto crse face-based FabArray. template ::value,int> = 0> void average_down_faces (const Vector& fine, const Vector& crse, int ratio, int ngcrse = 0); + //! Average fine face-based FabArray onto crse face-based FabArray. template ::value,int> = 0> void average_down_faces (const Array& fine, const Array& crse, const IntVect& ratio, int ngcrse = 0); + //! Average fine face-based FabArray onto crse face-based FabArray. template ::value,int> = 0> void average_down_faces (const Array& fine, const Array& crse, int ratio, int ngcrse = 0); - //! This version does average down for one direction. - //! It uses the IndexType of MultiFabs to determine the direction. - //! It is expected that one direction is nodal and the rest are cell-centered. + /** + * \brief This version does average down for one face direction. + * + * It uses the IndexType of MultiFabs to determine the direction. + * It is expected that one direction is nodal and the rest are cell-centered. + */ template void average_down_faces (const FabArray& fine, FabArray& crse, const IntVect& ratio, int ngcrse=0); @@ -117,9 +126,12 @@ namespace amrex int ngcrse = 0, bool mfiter_is_definitely_safe=false); - //! Average fine cell-based MultiFab onto crse cell-centered MultiFab using - //! volume-weighting. This routine DOES NOT assume that the crse BoxArray is - //! a coarsened version of the fine BoxArray. + /** + * \brief Volume weighted average of fine MultiFab onto coarse MultiFab. + * + * Both MultiFabs are assumed to be cell-centered. This routine DOES NOT assume that + * the crse BoxArray is a coarsened version of the fine BoxArray. + */ void average_down (const MultiFab& S_fine, MultiFab& S_crse, const Geometry& fgeom, const Geometry& cgeom, int scomp, int ncomp, const IntVect& ratio); @@ -375,7 +387,7 @@ namespace amrex void FillRandom (MultiFab& mf, int scomp, int ncomp); /** - * \brief Fill MultiFab with random numbers from nornmal distribution + * \brief Fill MultiFab with random numbers from normal distribution * * All cells including ghost cells are filled. * @@ -625,18 +637,18 @@ void average_down (const FabArray& S_fine, FabArray& S_crse, - /** - * \brief Returns part of a norm based on two MultiFabs - * The MultiFabs MUST have the same underlying BoxArray. - * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n)) - * inside the summation (subject to a valid mask entry pf(mask(i,j,k,n) - */ - +/** + * \brief Returns part of a norm based on two MultiFabs. + * + * The MultiFabs MUST have the same underlying BoxArray.
+ * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n)) + * inside the summation. + */ template Real NormHelper (const MultiFab& x, int xcomp, const MultiFab& y, int ycomp, - F && f, + F const& f, int numcomp, IntVect nghost, bool local) { BL_ASSERT(x.boxArray() == y.boxArray()); @@ -684,21 +696,21 @@ NormHelper (const MultiFab& x, int xcomp, return sm; } - /** - * \brief Returns part of a norm based on three MultiFabs - * The MultiFabs MUST have the same underlying BoxArray. - * The Predicate pf is used to test the mask - * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n)) - * inside the summation (subject to a valid mask entry pf(mask(i,j,k,n) - */ - +/** + * \brief Returns part of a norm based on three MultiFabs. + * + * The MultiFabs MUST have the same underlying BoxArray. + * The Predicate pf is used to test the mask. + * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n)) + * inside the summation (subject to a valid mask entry pf(mask(i,j,k,n))). + */ template Real NormHelper (const MMF& mask, const MultiFab& x, int xcomp, const MultiFab& y, int ycomp, - Pred && pf, - F && f, + Pred const& pf, + F const& f, int numcomp, IntVect nghost, bool local) { BL_ASSERT(x.boxArray() == y.boxArray()); @@ -1024,7 +1036,7 @@ template FOO> BaseFab -ReduceToPlane (int direction, Box const& domain, FabArray const& mf, F&& f) +ReduceToPlane (int direction, Box const& domain, FabArray const& mf, F const& f) { Box domain2d = domain; domain2d.setRange(direction, 0); diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp index 93ba453cc07..5dd97fa6536 100644 --- a/Src/Base/AMReX_MultiFabUtil.cpp +++ b/Src/Base/AMReX_MultiFabUtil.cpp @@ -46,11 +46,15 @@ namespace { boxes.push_back(is.second); slice_to_full_ba_map.push_back(is.first); } - BoxArray slice_ba(boxes.data(), static_cast(boxes.size())); - DistributionMapping slice_dmap(std::move(procs)); - - return std::make_unique(slice_ba, slice_dmap, ncomp, 0, - MFInfo(), FArrayBoxFactory()); + if (!boxes.empty()) { + BoxArray slice_ba(boxes.data(), static_cast(boxes.size())); + DistributionMapping slice_dmap(std::move(procs)); + + return std::make_unique(slice_ba, slice_dmap, ncomp, 0, + MFInfo(), FArrayBoxFactory()); + } else { + return nullptr; + } } } @@ -308,9 +312,6 @@ namespace amrex // ************************************************************************************************************* - // Average fine cell-based MultiFab onto crse cell-centered MultiFab. - // We do NOT assume that the coarse layout is a coarsened version of the fine layout. - // This version DOES use volume-weighting.
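// Illustrative sketch (not part of the patch): calling the two-MultiFab
// NormHelper documented in the header hunk above. The dot-product lambda and
// the function name are hypothetical.
#include <AMReX_MultiFab.H>
#include <AMReX_MultiFabUtil.H>

amrex::Real dot (amrex::MultiFab const& x, amrex::MultiFab const& y)
{
    // Sum of x*y over one component, valid cells only (nghost = 0);
    // local=false requests the global, all-rank result.
    return amrex::NormHelper(x, 0, y, 0,
        [=] AMREX_GPU_DEVICE (amrex::Real a, amrex::Real b) { return a*b; },
        1, amrex::IntVect(0), false);
}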
void average_down (const MultiFab& S_fine, MultiFab& S_crse, const Geometry& fgeom, const Geometry& cgeom, int scomp, int ncomp, int rr) @@ -477,7 +478,7 @@ namespace amrex auto tmptype = type; tmptype.set(dir); if (dir >= AMREX_SPACEDIM || !tmptype.nodeCentered()) { - amrex::Abort("average_down_edges: not face index type"); + amrex::Abort("average_down_edges: not edge index type"); } const int ncomp = crse.nComp(); if (isMFIterSafe(fine, crse)) @@ -563,6 +564,10 @@ namespace amrex Vector slice_to_full_ba_map; std::unique_ptr slice = allocateSlice(dir, cc, ncomp, geom, coord, slice_to_full_ba_map); + if (!slice) { + return nullptr; + } + #ifdef AMREX_USE_OMP #pragma omp parallel if (Gpu::notInLaunchRegion()) #endif diff --git a/Src/Base/AMReX_MultiFabUtil_2D_C.H b/Src/Base/AMReX_MultiFabUtil_2D_C.H index 3cb97db8e2d..0a55ba0540a 100644 --- a/Src/Base/AMReX_MultiFabUtil_2D_C.H +++ b/Src/Base/AMReX_MultiFabUtil_2D_C.H @@ -120,6 +120,7 @@ void amrex_avgdown_faces (Box const& bx, Array4 const& crse, } break; } + default: { break; } } } @@ -157,6 +158,7 @@ void amrex_avgdown_faces (int i, int j, int, int n, Array4 const& crse, crse(i,j,0,n+ccomp) = c * facInv; break; } + default: { break; } } } @@ -206,6 +208,7 @@ void amrex_avgdown_edges (Box const& bx, Array4 const& crse, } break; } + default: { break; } } } @@ -242,6 +245,7 @@ void amrex_avgdown_edges (int i, int j, int, int n, Array4 const& crse, crse(i,j,0,n+ccomp) = c * facInv; break; } + default: { break; } } } diff --git a/Src/Base/AMReX_MultiFabUtil_3D_C.H b/Src/Base/AMReX_MultiFabUtil_3D_C.H index 0cc12ff1aba..5945e7e511b 100644 --- a/Src/Base/AMReX_MultiFabUtil_3D_C.H +++ b/Src/Base/AMReX_MultiFabUtil_3D_C.H @@ -164,6 +164,7 @@ void amrex_avgdown_faces (Box const& bx, Array4 const& crse, } break; } + default: { break; } } } @@ -214,6 +215,7 @@ void amrex_avgdown_faces (int i, int j, int k, int n, Array4 const& crse, crse(i,j,k,n+ccomp) = c * facInv; break; } + default: { break; } } } @@ -287,6 +289,7 @@ void amrex_avgdown_edges (Box const& bx, Array4 const& crse, } break; } + default: { break; } } } @@ -333,6 +336,7 @@ void amrex_avgdown_edges (int i, int j, int k, int n, Array4 const& crse, crse(i,j,k,n+ccomp) = c * facInv; break; } + default: { break; } } } diff --git a/Src/Base/AMReX_MultiFabUtil_nd_C.H b/Src/Base/AMReX_MultiFabUtil_nd_C.H index 3956e818316..cffec43d2f1 100644 --- a/Src/Base/AMReX_MultiFabUtil_nd_C.H +++ b/Src/Base/AMReX_MultiFabUtil_nd_C.H @@ -39,6 +39,7 @@ void amrex_fill_slice_interp (Box const& bx, Array4 slice, break; case 2: khi = 1; + default: { break; } } } else { switch (dir) @@ -51,6 +52,7 @@ void amrex_fill_slice_interp (Box const& bx, Array4 slice, break; case 2: klo = -1; + default: { break; } } } diff --git a/Src/Base/AMReX_NFiles.H b/Src/Base/AMReX_NFiles.H index 824351b50dc..bd1518dd44c 100644 --- a/Src/Base/AMReX_NFiles.H +++ b/Src/Base/AMReX_NFiles.H @@ -23,7 +23,6 @@ namespace amrex { * nfi.Stream().write((const char *) data.dataPtr(), nChars); * } */ - class NFilesIter { public: diff --git a/Src/Base/AMReX_NFiles.cpp b/Src/Base/AMReX_NFiles.cpp index 2abd27986f7..3892cabfd07 100644 --- a/Src/Base/AMReX_NFiles.cpp +++ b/Src/Base/AMReX_NFiles.cpp @@ -125,11 +125,11 @@ void NFilesIter::SetSparseFPP(const Vector &ranksToWrite) // ---- do more error checking here // ---- ranks in range, is dynamic on already mySparseFileNumber = -1; - for(int r(0); r < ranksToWrite.size(); ++r) { - if(ranksToWrite[r] < 0 || ranksToWrite[r] >= nProcs) { + for(int r : ranksToWrite) { + if(r < 0 || r >= nProcs) { 
amrex::Abort("**** Error in NFilesIter::SetSparseFPP: rank out of range."); } - if(ranksToWrite[r] == myProc) { + if(r == myProc) { if(mySparseFileNumber == -1) { mySparseFileNumber = myProc; } else { @@ -281,8 +281,8 @@ bool NFilesIter::ReadyToWrite(bool appendFirst) { BL_PROFILE("NFI::ReadyToWrite:decider"); // ---- the first message received is the coordinator ParallelDescriptor::Recv(&coordinatorProc, 1, MPI_ANY_SOURCE, deciderTag); - for(int i(0); i < setZeroProcs.size(); ++i) { // ---- tell the set zero ranks who is coordinating - ParallelDescriptor::Send(&coordinatorProc, 1, setZeroProcs[i], coordinatorTag); + for(int setZeroProc : setZeroProcs) { // ---- tell the set zero ranks who is coordinating + ParallelDescriptor::Send(&coordinatorProc, 1, setZeroProc, coordinatorTag); } unreadMessages.push_back(std::make_pair(deciderTag, setZeroProcs.size() - 1)); } @@ -552,8 +552,7 @@ Vector NFilesIter::FileNumbersWritten() void NFilesIter::CleanUpMessages() { #ifdef BL_USE_MPI BL_PROFILE("NFI::CleanUpMessages"); - for(int i(0); i < unreadMessages.size(); ++i) { - std::pair & pii = unreadMessages[i]; + for(auto & pii : unreadMessages) { int fromProc, tag(pii.first), nMessages(pii.second); #if 0 amrex::AllPrint() << ParallelDescriptor::MyProc() << ":: cleaning up " << nMessages diff --git a/Src/Base/AMReX_NonLocalBC.H b/Src/Base/AMReX_NonLocalBC.H index fc7f4cb6583..f308aae1389 100644 --- a/Src/Base/AMReX_NonLocalBC.H +++ b/Src/Base/AMReX_NonLocalBC.H @@ -264,7 +264,7 @@ struct MultiBlockCommMetaData : FabArrayBase::CommMetaData { // [concept.FabProjection] // -//! \brief This type trait tests if a type P is a projetion for FAB. +//! \brief This type trait tests if a type P is a projection for FAB. template struct IsFabProjection : IsCallableR, Dim3, int> @@ -296,9 +296,9 @@ struct Identity { static constexpr Identity identity{}; static_assert(sizeof(Identity) == 1 ); -static_assert(std::is_trivially_default_constructible::value ); -static_assert(std::is_trivially_copy_assignable::value ); -static_assert(std::is_trivially_copy_constructible::value ); +static_assert(std::is_trivially_default_constructible_v ); +static_assert(std::is_trivially_copy_assignable_v ); +static_assert(std::is_trivially_copy_constructible_v ); static_assert(IsIndexMapping() ); // NOLINT(bugprone-throw-keyword-missing) static_assert(IsFabProjection() ); // NOLINT(bugprone-throw-keyword-missing) @@ -1129,7 +1129,7 @@ FillBoundary_finish (CommHandler handler, auto cmd = makeFillBoundaryMetaData(mf, mf.nGrowVect, geom, dtos); // The metadata cmd can be cached and reused on a MultiFab/FabArray with // the same BoxArray and DistributionMapping. 
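// Illustrative sketch (not part of the patch): the caching pattern the comment
// above describes. `mf`, `geom`, `dtos`, `proj`, and `nsteps` are assumed
// context from the surrounding example, and the exact makeFillBoundaryMetaData
// signature is an assumption.
//
//     auto cmd = makeFillBoundaryMetaData(mf, mf.nGrowVect(), geom, dtos);
//     for (int step = 0; step < nsteps; ++step) {
//         // reuse cmd as long as the BoxArray and DistributionMapping are unchanged
//         FillBoundary(mf, cmd, scomp, ncomp, dtos, proj);
//     }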
- FillBoundary_finish(mf, cmd, scomp, ncomp, dtos, proj); + FillBoundary(mf, cmd, scomp, ncomp, dtos, proj); \endverbatim * * The FillBoundary capability here is more flexible than FabArray's diff --git a/Src/Base/AMReX_NonLocalBC.cpp b/Src/Base/AMReX_NonLocalBC.cpp index ebd78f7b3d1..a716150884c 100644 --- a/Src/Base/AMReX_NonLocalBC.cpp +++ b/Src/Base/AMReX_NonLocalBC.cpp @@ -73,7 +73,7 @@ void PrepareCommBuffers(CommData& comm, nbytes += cct.sbox.numPts() * object_size * n_components; } - std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); + std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes); nbytes = amrex::aligned_size(acd, nbytes); // so that nbytes are aligned // Also need to align the offset properly diff --git a/Src/Base/AMReX_OpenMP.H b/Src/Base/AMReX_OpenMP.H index 8eb8ada4513..15d6854c926 100644 --- a/Src/Base/AMReX_OpenMP.H +++ b/Src/Base/AMReX_OpenMP.H @@ -3,6 +3,7 @@ #include #ifdef AMREX_USE_OMP +#include #include namespace amrex::OpenMP { @@ -11,10 +12,15 @@ namespace amrex::OpenMP { inline int get_max_threads () { return omp_get_max_threads(); } inline int get_thread_num () { return omp_get_thread_num(); } inline int in_parallel () { return omp_in_parallel(); } + inline void set_num_threads (int num) { omp_set_num_threads(num); } + void Initialize (); + void Finalize (); + + omp_lock_t* get_lock (int ilock); } -#else +#else // AMREX_USE_OMP namespace amrex::OpenMP { @@ -22,9 +28,15 @@ namespace amrex::OpenMP { constexpr int get_max_threads () { return 1; } constexpr int get_thread_num () { return 0; } constexpr int in_parallel () { return false; } - + constexpr void set_num_threads (int) { /* nothing */ } } -#endif +#endif // AMREX_USE_OMP + +namespace amrex { + /** ... */ + int + numUniquePhysicalCores(); +} #endif diff --git a/Src/Base/AMReX_OpenMP.cpp b/Src/Base/AMReX_OpenMP.cpp new file mode 100644 index 00000000000..03c54b5358b --- /dev/null +++ b/Src/Base/AMReX_OpenMP.cpp @@ -0,0 +1,215 @@ +#include +#include +#include +#include + +#if defined(__APPLE__) +#include +#include +#endif + +#if defined(_WIN32) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace amrex +{ + int + numUniquePhysicalCores () + { + int ncores; + +#if defined(__APPLE__) + size_t len = sizeof(ncores); + // See hw.physicalcpu and hw.physicalcpu_max + // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_system_capabilities/ + // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname + if (sysctlbyname("hw.physicalcpu", &ncores, &len, NULL, 0) == -1) { + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Error receiving hw.physicalcpu! 
" + << "Defaulting to visible cores.\n"; + } + ncores = int(std::thread::hardware_concurrency()); + } +#elif defined(__linux__) + std::set> uniqueThreadSets; + int cpuIndex = 0; + + while (true) { + // for each logical CPU in cpuIndex from 0...N-1 + std::string path = "/sys/devices/system/cpu/cpu" + std::to_string(cpuIndex) + "/topology/thread_siblings_list"; + std::ifstream file(path); + if (!file.is_open()) { + break; // no further CPUs to check + } + + // find its siblings + std::vector siblings; + std::string line; + if (std::getline(file, line)) { + std::stringstream ss(line); + std::string token; + + // Possible syntax: 0-3, 8-11, 14,17 + // https://github.com/torvalds/linux/blob/v6.5/Documentation/ABI/stable/sysfs-devices-system-cpu#L68-L72 + while (std::getline(ss, token, ',')) { + size_t dashPos = token.find('-'); + if (dashPos != std::string::npos) { + // Range detected + int start = std::stoi(token.substr(0, dashPos)); + int end = std::stoi(token.substr(dashPos + 1)); + for (int i = start; i <= end; ++i) { + siblings.push_back(i); + } + } else { + siblings.push_back(std::stoi(token)); + } + } + } + + // and record the siblings group + // (assumes: ascending and unique sets per cpuIndex) + uniqueThreadSets.insert(siblings); + cpuIndex++; + } + + if (cpuIndex == 0) { + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Error reading CPU info.\n"; + } + ncores = int(std::thread::hardware_concurrency()); + } else { + ncores = int(uniqueThreadSets.size()); + } +#elif defined(_WIN32) + DWORD length = 0; + bool result = GetLogicalProcessorInformation(NULL, &length); + + if (!result) { + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Failed to get logical processor information! " + << "Defaulting to visible cores.\n"; + } + ncores = int(std::thread::hardware_concurrency()); + } + else { + std::vector buffer(length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)); + if (!GetLogicalProcessorInformation(&buffer[0], &length)) { + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Failed to get logical processor information! " + << "Defaulting to visible cores.\n"; + } + ncores = int(std::thread::hardware_concurrency()); + } else { + ncores = 0; + for (const auto& info : buffer) { + if (info.Relationship == RelationProcessorCore) { + ncores++; + } + } + } + } +#else + // TODO: + // BSD + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Unknown system. Defaulting to visible cores.\n"; + } + ncores = int(std::thread::hardware_concurrency()); +#endif + return ncores; + } +} // namespace amrex + +#ifdef AMREX_USE_OMP +namespace amrex::OpenMP +{ + namespace { + constexpr int nlocks = 128; + omp_lock_t omp_locks[nlocks]; + unsigned int initialized = 0; + } + + void Initialize () + { + if (initialized) { + ++initialized; + return; + } + + amrex::ParmParse pp("amrex"); + std::string omp_threads = "system"; + pp.queryAdd("omp_threads", omp_threads); + + auto to_int = [](std::string const & str_omp_threads) { + std::optional num; + try { num = std::stoi(str_omp_threads); } + catch (...) 
{ /* nothing */ } + return num; + }; + + if (omp_threads == "system") { + // default or OMP_NUM_THREADS environment variable + } else if (omp_threads == "nosmt") { + char const *env_omp_num_threads = std::getenv("OMP_NUM_THREADS"); + if (env_omp_num_threads == nullptr) { + omp_set_num_threads(numUniquePhysicalCores()); + } + else if (amrex::system::verbose > 1) { + amrex::Print() << "amrex.omp_threads was set to nosmt," + << "but OMP_NUM_THREADS was set. Will keep " + << "OMP_NUM_THREADS=" << env_omp_num_threads << ".\n"; + } + } else { + std::optional num_omp_threads = to_int(omp_threads); + if (num_omp_threads.has_value()) { + omp_set_num_threads(num_omp_threads.value()); + } + else { + if (amrex::system::verbose > 0) { + amrex::Print() << "amrex.omp_threads has an unknown value: " + << omp_threads + << " (try system, nosmt, or a positive integer)\n"; + } + } + } + + for (auto& lck : omp_locks) { + omp_init_lock(&lck); + } + + ++initialized; + } + + void Finalize () + { + if (initialized) { + --initialized; + if (initialized == 0) { + for (auto& lck : omp_locks) { + omp_destroy_lock(&lck); + } + } + } + } + + omp_lock_t* get_lock (int ilock) + { + ilock = ilock % nlocks; + if (ilock < 0) { ilock += nlocks; } + return omp_locks + ilock; + } + +} // namespace amrex::OpenMP +#endif // AMREX_USE_OMP diff --git a/Src/Base/AMReX_Orientation.H b/Src/Base/AMReX_Orientation.H index 263bb84a95e..61e3622b336 100644 --- a/Src/Base/AMReX_Orientation.H +++ b/Src/Base/AMReX_Orientation.H @@ -25,7 +25,6 @@ class OrientationIter; * AMREX_SPACEDIM-1 and then the AMREX_SPACEDIM high sides from direction 0 .. * AMREX_SPACEDIM-1. */ - class Orientation { public: diff --git a/Src/Base/AMReX_PArena.H b/Src/Base/AMReX_PArena.H index cc221ba7bae..75db747fd9f 100644 --- a/Src/Base/AMReX_PArena.H +++ b/Src/Base/AMReX_PArena.H @@ -15,7 +15,6 @@ namespace amrex { * \brief This arena uses CUDA stream-ordered memory allocator if available. * If not, use The_Arena(). */ - class PArena : public Arena @@ -38,6 +37,11 @@ public: [[nodiscard]] bool isDevice () const final; [[nodiscard]] bool isPinned () const final; +#ifdef AMREX_USE_GPU + //! Is this CUDA stream ordered memory allocator? + [[nodiscard]] bool isStreamOrderedArena () const final { return true; } +#endif + #ifdef AMREX_CUDA_GE_11_2 private: cudaMemPool_t m_pool; diff --git a/Src/Base/AMReX_PODVector.H b/Src/Base/AMReX_PODVector.H index 0f10dfb94ef..7800fb145ed 100644 --- a/Src/Base/AMReX_PODVector.H +++ b/Src/Base/AMReX_PODVector.H @@ -423,7 +423,7 @@ namespace amrex iterator insert (const_iterator a_pos, T&& a_item) { // This is *POD* vector after all - return insert(a_pos, 1, a_item); + return insert(a_pos, 1, std::move(a_item)); } iterator insert (const_iterator a_pos, diff --git a/Src/Base/AMReX_ParallelDescriptor.H b/Src/Base/AMReX_ParallelDescriptor.H index 46ca0f99fc3..f949ae6f45c 100644 --- a/Src/Base/AMReX_ParallelDescriptor.H +++ b/Src/Base/AMReX_ParallelDescriptor.H @@ -447,162 +447,162 @@ while ( false ) //! Real sum reduction. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T& rvar); template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T* rvar, int cnt); // Having this for backward compatibility - void ReduceRealSum (Vector >&& rvar); + void ReduceRealSum (Vector > const& rvar); // template - typename std::enable_if::value>::type - ReduceRealSum (Vector >&& rvar); + std::enable_if_t> + ReduceRealSum (Vector > const& rvar); //! Real sum reduction to specified cpu. 
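// Illustrative sketch (not part of the patch): the OpenMP lock pool added in
// AMReX_OpenMP above (AMREX_USE_OMP builds only). The bin index and the
// histogram update are hypothetical.
#include <omp.h>
#include <AMReX_OpenMP.H>

void locked_increment (int bin, double* histogram)
{
    omp_lock_t* lck = amrex::OpenMP::get_lock(bin); // bin is wrapped into [0, nlocks)
    omp_set_lock(lck);
    histogram[bin] += 1.0; // protected update of shared state
    omp_unset_lock(lck);
}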
template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T& rvar, int cpu); template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T* rvar, int cnt, int cpu); // Having this for backward compatibility - void ReduceRealSum (Vector >&& rvar, int cpu); + void ReduceRealSum (Vector > const& rvar, int cpu); // template - typename std::enable_if::value>::type - ReduceRealSum (Vector >&& rvar, int cpu); + std::enable_if_t> + ReduceRealSum (Vector > const& rvar, int cpu); //! Real max reduction. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T& rvar); template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T* rvar, int cnt); // Having this for backward compatibility - void ReduceRealMax (Vector >&& rvar); + void ReduceRealMax (Vector > const& rvar); // template - typename std::enable_if::value>::type - ReduceRealMax (Vector >&& rvar); + std::enable_if_t> + ReduceRealMax (Vector > const& rvar); //! Real max reduction to specified cpu. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T& rvar, int cpu); template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T* rvar, int cnt, int cpu); // Having this for backward compatibility - void ReduceRealMax (Vector >&& rvar, int cpu); + void ReduceRealMax (Vector > const& rvar, int cpu); // template - typename std::enable_if::value>::type - ReduceRealMax (Vector >&& rvar, int cpu); + std::enable_if_t> + ReduceRealMax (Vector > const& rvar, int cpu); //! Real min reduction. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T& rvar); template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T* rvar, int cnt); // Having this for backward compatibility - void ReduceRealMin (Vector >&& rvar); + void ReduceRealMin (Vector > const& rvar); // template - typename std::enable_if::value>::type - ReduceRealMin (Vector >&& rvar); + std::enable_if_t> + ReduceRealMin (Vector > const& rvar); //! Real min reduction to specified cpu. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T& rvar, int cpu); template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T* rvar, int cnt, int cpu); // Having this for backward compatibility - void ReduceRealMin (Vector >&& rvar, int cpu); + void ReduceRealMin (Vector > const& rvar, int cpu); // template - typename std::enable_if::value>::type - ReduceRealMin (Vector >&& rvar, int cpu); + std::enable_if_t> + ReduceRealMin (Vector > const& rvar, int cpu); //! Integer sum reduction. void ReduceIntSum (int& rvar); void ReduceIntSum (int* rvar, int cnt); - void ReduceIntSum (Vector >&& rvar); + void ReduceIntSum (Vector > const& rvar); //! Integer sum reduction to specified cpu. void ReduceIntSum (int& rvar, int cpu); void ReduceIntSum (int* rvar, int cnt, int cpu); - void ReduceIntSum (Vector >&& rvar, int cpu); + void ReduceIntSum (Vector > const& rvar, int cpu); //! Integer max reduction. void ReduceIntMax (int& rvar); void ReduceIntMax (int* rvar, int cnt); - void ReduceIntMax (Vector >&& rvar); + void ReduceIntMax (Vector > const& rvar); //! Integer max reduction to specified cpu. void ReduceIntMax (int& rvar, int cpu); void ReduceIntMax (int* rvar, int cnt, int cpu); - void ReduceIntMax (Vector >&& rvar, int cpu); + void ReduceIntMax (Vector > const& rvar, int cpu); //! Integer min reduction. 
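// Illustrative sketch (not part of the patch): the reference_wrapper overloads
// that this hunk switches from rvalue references to const lvalue references.
// The variable names and the braced-list construction of the Vector argument
// are assumptions.
#include <functional>
#include <AMReX_ParallelDescriptor.H>

void sum_three (amrex::Real& a, amrex::Real& b, amrex::Real& c)
{
    // One collective call sums all three values across ranks, in place.
    amrex::ParallelDescriptor::ReduceRealSum({std::ref(a), std::ref(b), std::ref(c)});
}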
void ReduceIntMin (int& rvar); void ReduceIntMin (int* rvar, int cnt); - void ReduceIntMin (Vector >&& rvar); + void ReduceIntMin (Vector > const& rvar); //! Integer min reduction to specified cpu. void ReduceIntMin (int& rvar, int cpu); void ReduceIntMin (int* rvar, int cnt, int cpu); - void ReduceIntMin (Vector >&& rvar, int cpu); + void ReduceIntMin (Vector > const& rvar, int cpu); //! Long sum reduction. void ReduceLongSum (Long& rvar); void ReduceLongSum (Long* rvar, int cnt); - void ReduceLongSum (Vector >&& rvar); + void ReduceLongSum (Vector > const& rvar); //! Long sum reduction to specified cpu. void ReduceLongSum (Long& rvar, int cpu); void ReduceLongSum (Long* rvar, int cnt, int cpu); - void ReduceLongSum (Vector >&& rvar, int cpu); + void ReduceLongSum (Vector > const& rvar, int cpu); //! Long max reduction. void ReduceLongMax (Long& rvar); void ReduceLongMax (Long* rvar, int cnt); - void ReduceLongMax (Vector >&& rvar); + void ReduceLongMax (Vector > const& rvar); //! Long max reduction to specified cpu. void ReduceLongMax (Long& rvar, int cpu); void ReduceLongMax (Long* rvar, int cnt, int cpu); - void ReduceLongMax (Vector >&& rvar, int cpu); + void ReduceLongMax (Vector > const& rvar, int cpu); //! Long min reduction. void ReduceLongMin (Long& rvar); void ReduceLongMin (Long* rvar, int cnt); - void ReduceLongMin (Vector >&& rvar); + void ReduceLongMin (Vector > const& rvar); //! Long min reduction to specified cpu. void ReduceLongMin (Long& rvar, int cpu); void ReduceLongMin (Long* rvar, int cnt, int cpu); - void ReduceLongMin (Vector >&& rvar, int cpu); + void ReduceLongMin (Vector > const& rvar, int cpu); //! Long and-wise reduction. void ReduceLongAnd (Long& rvar); void ReduceLongAnd (Long* rvar, int cnt); - void ReduceLongAnd (Vector >&& rvar); + void ReduceLongAnd (Vector > const& rvar); //! Long and-wise reduction to specified cpu. void ReduceLongAnd (Long& rvar, int cpu); void ReduceLongAnd (Long* rvar, int cnt, int cpu); - void ReduceLongAnd (Vector >&& rvar, int cpu); + void ReduceLongAnd (Vector > const& rvar, int cpu); //! Parallel gather. 
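// Illustrative sketch (not part of the patch): usage of the Gather declaration
// just below. Variable names are hypothetical; recvbuf is assumed to need
// nsend * NProcs() entries on the root rank.
#include <AMReX_ParallelDescriptor.H>
#include <AMReX_Vector.H>

void gather_times (amrex::Real my_time)
{
    int root = amrex::ParallelDescriptor::IOProcessorNumber();
    amrex::Vector<amrex::Real> all(amrex::ParallelDescriptor::NProcs());
    amrex::ParallelDescriptor::Gather(&my_time, 1, all.data(), root);
    // On rank `root`, all[i] now holds rank i's value.
}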
void Gather (Real const* sendbuf, int nsend, Real* recvbuf, int root); @@ -674,7 +674,7 @@ while ( false ) #ifdef BL_USE_MPI int select_comm_data_type (std::size_t nbytes); - std::size_t alignof_comm_data (std::size_t nbytes); + std::size_t sizeof_selected_comm_data_type (std::size_t nbytes); #endif } } @@ -699,7 +699,7 @@ template Message Asend (const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm) { - static_assert(!std::is_same::value, "Asend: char version has been specialized"); + static_assert(!std::is_same_v, "Asend: char version has been specialized"); BL_PROFILE_T_S("ParallelDescriptor::Asend(TsiiM)", T); BL_COMM_PROFILE(BLProfiler::AsendTsiiM, n * sizeof(T), dst_pid, tag); @@ -740,7 +740,7 @@ template Message Send (const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm) { - static_assert(!std::is_same::value, "Send: char version has been specialized"); + static_assert(!std::is_same_v, "Send: char version has been specialized"); BL_PROFILE_T_S("ParallelDescriptor::Send(Tsii)", T); @@ -788,7 +788,7 @@ template Message Arecv (T* buf, size_t n, int src_pid, int tag, MPI_Comm comm) { - static_assert(!std::is_same::value, "Arecv: char version has been specialized"); + static_assert(!std::is_same_v, "Arecv: char version has been specialized"); BL_PROFILE_T_S("ParallelDescriptor::Arecv(TsiiM)", T); BL_COMM_PROFILE(BLProfiler::ArecvTsiiM, n * sizeof(T), src_pid, tag); @@ -829,7 +829,7 @@ template Message Recv (T* buf, size_t n, int src_pid, int tag, MPI_Comm comm) { - static_assert(!std::is_same::value, "Recv: char version has been specialized"); + static_assert(!std::is_same_v, "Recv: char version has been specialized"); BL_PROFILE_T_S("ParallelDescriptor::Recv(Tsii)", T); BL_COMM_PROFILE(BLProfiler::RecvTsii, BLProfiler::BeforeCall(), src_pid, tag); @@ -996,9 +996,9 @@ ParallelDescriptor::GatherLayoutDataToVector (const LayoutData& sendbuf, Vector recvcount(nprocs, 0); recvbuf.resize(sendbuf.size()); const Vector& old_pmap = sendbuf.DistributionMap().ProcessorMap(); - for (int i=0; i - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T& rvar) { detail::DoAllReduce(&rvar,MPI_SUM,1); } template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T* rvar, int cnt) { detail::DoAllReduce(rvar,MPI_SUM,cnt); } template - typename std::enable_if::value>::type - ReduceRealSum (Vector >&& rvar) + std::enable_if_t> + ReduceRealSum (Vector > const& rvar) { int cnt = rvar.size(); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -1296,20 +1296,20 @@ void DoReduce (T* r, MPI_Op op, int cnt, int cpu) //! Real sum reduction to specified cpu. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T& rvar, int cpu) { detail::DoReduce(&rvar,MPI_SUM,1,cpu); } template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T* rvar, int cnt, int cpu) { detail::DoReduce(rvar,MPI_SUM,cnt,cpu); } template - typename std::enable_if::value>::type - ReduceRealSum (Vector >&& rvar, int cpu) + std::enable_if_t> + ReduceRealSum (Vector > const& rvar, int cpu) { int cnt = rvar.size(); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -1321,20 +1321,20 @@ void DoReduce (T* r, MPI_Op op, int cnt, int cpu) //! Real max reduction. 
template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T& rvar) { detail::DoAllReduce(&rvar,MPI_MAX,1); } template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T* rvar, int cnt) { detail::DoAllReduce(rvar,MPI_MAX,cnt); } template - typename std::enable_if::value>::type - ReduceRealMax (Vector >&& rvar) + std::enable_if_t> + ReduceRealMax (Vector > const& rvar) { int cnt = rvar.size(); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -1346,20 +1346,20 @@ void DoReduce (T* r, MPI_Op op, int cnt, int cpu) //! Real max reduction to specified cpu. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T& rvar, int cpu) { detail::DoReduce(&rvar,MPI_MAX,1,cpu); } template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T* rvar, int cnt, int cpu) { detail::DoReduce(rvar,MPI_MAX,cnt,cpu); } template - typename std::enable_if::value>::type - ReduceRealMax (Vector >&& rvar, int cpu) + std::enable_if_t> + ReduceRealMax (Vector > const& rvar, int cpu) { int cnt = rvar.size(); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -1371,20 +1371,20 @@ void DoReduce (T* r, MPI_Op op, int cnt, int cpu) //! Real min reduction. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T& rvar) { detail::DoAllReduce(&rvar,MPI_MIN,1); } template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T* rvar, int cnt) { detail::DoAllReduce(rvar,MPI_MIN,cnt); } template - typename std::enable_if::value>::type - ReduceRealMin (Vector >&& rvar) + std::enable_if_t> + ReduceRealMin (Vector > const& rvar) { int cnt = rvar.size(); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -1396,20 +1396,20 @@ void DoReduce (T* r, MPI_Op op, int cnt, int cpu) //! Real min reduction to specified cpu. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T& rvar, int cpu) { detail::DoReduce(&rvar,MPI_MIN,1,cpu); } template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T* rvar, int cnt, int cpu) { detail::DoReduce(rvar,MPI_MIN,cnt,cpu); } template - typename std::enable_if::value>::type - ReduceRealMin (Vector >&& rvar, int cpu) + std::enable_if_t> + ReduceRealMin (Vector > const& rvar, int cpu) { int cnt = rvar.size(); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -1423,81 +1423,81 @@ void DoReduce (T* r, MPI_Op op, int cnt, int cpu) //! Real sum reduction. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T& ) {} template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T*, int) {} template - typename std::enable_if::value>::type - ReduceRealSum (Vector >&&) {} + std::enable_if_t> + ReduceRealSum (Vector > const&) {} //! Real sum reduction to specified cpu. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T&, int) {} template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T*, int, int) {} template - typename std::enable_if::value>::type - ReduceRealSum (Vector >&&, int) {} + std::enable_if_t> + ReduceRealSum (Vector > const&, int) {} //! Real max reduction. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T&) {} template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T*, int) {} template - typename std::enable_if::value>::type - ReduceRealMax (Vector >&&) {} + std::enable_if_t> + ReduceRealMax (Vector > const&) {} //! 
Real max reduction to specified cpu. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T&, int) {} template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T*, int, int) {} template - typename std::enable_if::value>::type - ReduceRealMax (Vector >&&, int) {} + std::enable_if_t> + ReduceRealMax (Vector > const&, int) {} //! Real min reduction. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T&) {} template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T*, int) {} template - typename std::enable_if::value>::type - ReduceRealMin (Vector >&&) {} + std::enable_if_t> + ReduceRealMin (Vector > const&) {} //! Real min reduction to specified cpu. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T&, int) {} template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T*, int, int) {} template - typename std::enable_if::value>::type - ReduceRealMin (Vector >&&, int) {} + std::enable_if_t> + ReduceRealMin (Vector > const&, int) {} #endif } @@ -1529,9 +1529,9 @@ struct Mpi_typemap> static MPI_Datatype mpi_type = MPI_DATATYPE_NULL; if (mpi_type == MPI_DATATYPE_NULL) { using T = ValLocPair; - static_assert(std::is_trivially_copyable::value, + static_assert(std::is_trivially_copyable_v, "To communicate with MPI, ValLocPair must be trivially copyable."); - static_assert(std::is_standard_layout::value, + static_assert(std::is_standard_layout_v, "To communicate with MPI, ValLocPair must be standard layout"); T vlp[2]; diff --git a/Src/Base/AMReX_ParallelDescriptor.cpp b/Src/Base/AMReX_ParallelDescriptor.cpp index 4e959504158..50f16c17967 100644 --- a/Src/Base/AMReX_ParallelDescriptor.cpp +++ b/Src/Base/AMReX_ParallelDescriptor.cpp @@ -551,39 +551,39 @@ Comm_dup (MPI_Comm comm, MPI_Comm& newcomm) } void -ReduceRealSum (Vector >&& rvar) +ReduceRealSum (Vector > const& rvar) { - ReduceRealSum(std::move(rvar)); + ReduceRealSum(rvar); } void -ReduceRealSum (Vector >&& rvar, int cpu) +ReduceRealSum (Vector > const& rvar, int cpu) { - ReduceRealSum(std::move(rvar), cpu); + ReduceRealSum(rvar, cpu); } void -ReduceRealMax (Vector > && rvar) +ReduceRealMax (Vector > const& rvar) { - ReduceRealMax(std::move(rvar)); + ReduceRealMax(rvar); } void -ReduceRealMax (Vector >&& rvar, int cpu) +ReduceRealMax (Vector > const& rvar, int cpu) { - ReduceRealMax(std::move(rvar), cpu); + ReduceRealMax(rvar, cpu); } void -ReduceRealMin (Vector >&& rvar) +ReduceRealMin (Vector > const& rvar) { - ReduceRealMin(std::move(rvar)); + ReduceRealMin(rvar); } void -ReduceRealMin (Vector >&& rvar, int cpu) +ReduceRealMin (Vector > const& rvar, int cpu) { - ReduceRealMin(std::move(rvar), cpu); + ReduceRealMin(rvar, cpu); } void @@ -643,7 +643,7 @@ ReduceIntSum (int* r, int cnt) } void -ReduceIntSum (Vector >&& rvar) +ReduceIntSum (Vector > const& rvar) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -666,7 +666,7 @@ ReduceIntSum (int* r, int cnt, int cpu) } void -ReduceIntSum (Vector >&& rvar, int cpu) +ReduceIntSum (Vector > const& rvar, int cpu) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -689,7 +689,7 @@ ReduceIntMax (int* r, int cnt) } void -ReduceIntMax (Vector >&& rvar) +ReduceIntMax (Vector > const& rvar) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -712,7 +712,7 @@ ReduceIntMax (int* r, int cnt, int cpu) } void -ReduceIntMax 
(Vector >&& rvar, int cpu) +ReduceIntMax (Vector > const& rvar, int cpu) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -735,7 +735,7 @@ ReduceIntMin (int* r, int cnt) } void -ReduceIntMin (Vector >&& rvar) +ReduceIntMin (Vector > const& rvar) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -758,7 +758,7 @@ ReduceIntMin (int* r, int cnt, int cpu) } void -ReduceIntMin (Vector >&& rvar, int cpu) +ReduceIntMin (Vector > const& rvar, int cpu) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -781,7 +781,7 @@ ReduceLongSum (Long* r, int cnt) } void -ReduceLongSum (Vector >&& rvar) +ReduceLongSum (Vector > const& rvar) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -804,7 +804,7 @@ ReduceLongSum (Long* r, int cnt, int cpu) } void -ReduceLongSum (Vector >&& rvar, int cpu) +ReduceLongSum (Vector > const& rvar, int cpu) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -827,7 +827,7 @@ ReduceLongMax (Long* r, int cnt) } void -ReduceLongMax (Vector >&& rvar) +ReduceLongMax (Vector > const& rvar) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -850,7 +850,7 @@ ReduceLongMax (Long* r, int cnt, int cpu) } void -ReduceLongMax (Vector >&& rvar, int cpu) +ReduceLongMax (Vector > const& rvar, int cpu) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -873,7 +873,7 @@ ReduceLongMin (Long* r, int cnt) } void -ReduceLongMin (Vector >&& rvar) +ReduceLongMin (Vector > const& rvar) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -896,7 +896,7 @@ ReduceLongMin (Long* r, int cnt, int cpu) } void -ReduceLongMin (Vector >&& rvar, int cpu) +ReduceLongMin (Vector > const& rvar, int cpu) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -919,7 +919,7 @@ ReduceLongAnd (Long* r, int cnt) } void -ReduceLongAnd (Vector >&& rvar) +ReduceLongAnd (Vector > const& rvar) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -942,7 +942,7 @@ ReduceLongAnd (Long* r, int cnt, int cpu) } void -ReduceLongAnd (Vector >&& rvar,int cpu) +ReduceLongAnd (Vector > const& rvar,int cpu) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -1211,13 +1211,13 @@ void IProbe (int, int, MPI_Comm, int&, MPI_Status&) {} void Comm_dup (MPI_Comm, MPI_Comm&) {} -void ReduceRealSum (Vector >&& /*rvar*/) {} -void ReduceRealMax (Vector >&& /*rvar*/) {} -void ReduceRealMin (Vector >&& /*rvar*/) {} +void ReduceRealSum (Vector > const& /*rvar*/) {} +void ReduceRealMax (Vector > const& /*rvar*/) {} +void ReduceRealMin (Vector > const& /*rvar*/) {} -void ReduceRealSum (Vector >&& /*rvar*/, int /*cpu*/) {} -void ReduceRealMax (Vector >&& /*rvar*/, int /*cpu*/) {} -void ReduceRealMin (Vector >&& /*rvar*/, int /*cpu*/) {} +void ReduceRealSum (Vector > const& /*rvar*/, int /*cpu*/) {} +void ReduceRealMax (Vector > const& /*rvar*/, int /*cpu*/) {} +void ReduceRealMin (Vector > const& /*rvar*/, int /*cpu*/) {} void ReduceLongAnd (Long&) {} void ReduceLongSum (Long&) {} @@ -1239,15 +1239,15 @@ void ReduceLongSum (Long*,int,int) {} void ReduceLongMax (Long*,int,int) {} void ReduceLongMin (Long*,int,int) {} -void ReduceLongAnd (Vector >&& /*rvar*/) {} -void ReduceLongSum (Vector >&& /*rvar*/) {} -void ReduceLongMax (Vector >&& /*rvar*/) {} -void 
ReduceLongMin (Vector >&& /*rvar*/) {} +void ReduceLongAnd (Vector > const& /*rvar*/) {} +void ReduceLongSum (Vector > const& /*rvar*/) {} +void ReduceLongMax (Vector > const& /*rvar*/) {} +void ReduceLongMin (Vector > const& /*rvar*/) {} -void ReduceLongAnd (Vector >&& /*rvar*/, int /*cpu*/) {} -void ReduceLongSum (Vector >&& /*rvar*/, int /*cpu*/) {} -void ReduceLongMax (Vector >&& /*rvar*/, int /*cpu*/) {} -void ReduceLongMin (Vector >&& /*rvar*/, int /*cpu*/) {} +void ReduceLongAnd (Vector > const& /*rvar*/, int /*cpu*/) {} +void ReduceLongSum (Vector > const& /*rvar*/, int /*cpu*/) {} +void ReduceLongMax (Vector > const& /*rvar*/, int /*cpu*/) {} +void ReduceLongMin (Vector > const& /*rvar*/, int /*cpu*/) {} void ReduceIntSum (int&) {} void ReduceIntMax (int&) {} @@ -1265,13 +1265,13 @@ void ReduceIntSum (int*,int,int) {} void ReduceIntMax (int*,int,int) {} void ReduceIntMin (int*,int,int) {} -void ReduceIntSum (Vector >&& /*rvar*/) {} -void ReduceIntMax (Vector >&& /*rvar*/) {} -void ReduceIntMin (Vector >&& /*rvar*/) {} +void ReduceIntSum (Vector > const& /*rvar*/) {} +void ReduceIntMax (Vector > const& /*rvar*/) {} +void ReduceIntMin (Vector > const& /*rvar*/) {} -void ReduceIntSum (Vector >&& /*rvar*/, int /*cpu*/) {} -void ReduceIntMax (Vector >&& /*rvar*/, int /*cpu*/) {} -void ReduceIntMin (Vector >&& /*rvar*/, int /*cpu*/) {} +void ReduceIntSum (Vector > const& /*rvar*/, int /*cpu*/) {} +void ReduceIntMax (Vector > const& /*rvar*/, int /*cpu*/) {} +void ReduceIntMin (Vector > const& /*rvar*/, int /*cpu*/) {} void ReduceBoolAnd (bool&) {} void ReduceBoolOr (bool&) {} @@ -1376,8 +1376,8 @@ BL_FORT_PROC_DECL(BL_PD_ABORT,bl_pd_abort)() #if defined(BL_USE_MPI) && !defined(BL_AMRPROF) template <> MPI_Datatype Mpi_typemap::type() { - static_assert(std::is_trivially_copyable::value, "IntVect must be trivially copyable"); - static_assert(std::is_standard_layout::value, "IntVect must be standard layout"); + static_assert(std::is_trivially_copyable_v, "IntVect must be trivially copyable"); + static_assert(std::is_standard_layout_v, "IntVect must be standard layout"); if ( mpi_type_intvect == MPI_DATATYPE_NULL ) { @@ -1399,8 +1399,8 @@ template <> MPI_Datatype Mpi_typemap::type() template <> MPI_Datatype Mpi_typemap::type() { - static_assert(std::is_trivially_copyable::value, "IndexType must be trivially copyable"); - static_assert(std::is_standard_layout::value, "IndexType must be standard layout"); + static_assert(std::is_trivially_copyable_v, "IndexType must be trivially copyable"); + static_assert(std::is_standard_layout_v, "IndexType must be standard layout"); if ( mpi_type_indextype == MPI_DATATYPE_NULL ) { @@ -1422,8 +1422,8 @@ template <> MPI_Datatype Mpi_typemap::type() template <> MPI_Datatype Mpi_typemap::type() { - static_assert(std::is_trivially_copyable::value, "Box must be trivially copyable"); - static_assert(std::is_standard_layout::value, "Box must be standard layout"); + static_assert(std::is_trivially_copyable_v, "Box must be trivially copyable"); + static_assert(std::is_standard_layout_v, "Box must be standard layout"); if ( mpi_type_box == MPI_DATATYPE_NULL ) { @@ -1630,7 +1630,7 @@ select_comm_data_type (std::size_t nbytes) } std::size_t -alignof_comm_data (std::size_t nbytes) +sizeof_selected_comm_data_type (std::size_t nbytes) { const int t = select_comm_data_type(nbytes); if (t == 1) { diff --git a/Src/Base/AMReX_ParmParse.H b/Src/Base/AMReX_ParmParse.H index 01a0098333e..062db374029 100644 --- a/Src/Base/AMReX_ParmParse.H +++ b/Src/Base/AMReX_ParmParse.H @@ 
-267,7 +267,6 @@ class IntVect; * #endif * */ - class ParmParse { public: @@ -1063,7 +1062,7 @@ public: struct PP_entry; using Table = std::list; static void appendTable(ParmParse::Table& tab); - [[nodiscard]] const Table& table() const {return m_table;} + [[nodiscard]] const Table& table() const {return *m_table;} protected: @@ -1081,7 +1080,7 @@ protected: // //! Prefix used in keyword search. std::stack m_pstack; - Table& m_table; + Table* m_table; }; struct ParmParse::PP_entry @@ -1116,7 +1115,7 @@ public: void pop(); [[nodiscard]] std::string getPrefix() const; private: - ParmParse& m_pp; + ParmParse* m_pp; int m_np{0}; }; diff --git a/Src/Base/AMReX_ParmParse.cpp b/Src/Base/AMReX_ParmParse.cpp index c2ecfc7b37a..a2b42b64a15 100644 --- a/Src/Base/AMReX_ParmParse.cpp +++ b/Src/Base/AMReX_ParmParse.cpp @@ -29,11 +29,13 @@ extern "C" void amrex_finalize_namelist (); namespace amrex { +namespace { #ifdef AMREX_XSDK -static bool finalize_verbose = false; + bool finalize_verbose = false; #else -static bool finalize_verbose = true; + bool finalize_verbose = true; #endif +} std::string const ParmParse::FileKeyword = "FILE"; @@ -450,7 +452,6 @@ ppfound (const std::string& keyword, // except if n==-1, return the index of the last occurrence. // Return 0 if the specified occurrence does not exist. // - const ParmParse::PP_entry* ppindex (const ParmParse::Table& table, int n, @@ -1123,20 +1124,20 @@ ParmParse::getPrefix() const ParmParse::ParmParse (const std::string& prefix) : - m_table(g_table) + m_table(&g_table) { m_pstack.push(prefix); } ParmParse::ParmParse (Table& a_table) - : m_table(a_table) + : m_table(&a_table) { - m_pstack.push(""); + m_pstack.emplace(""); } ParmParse::Frame::Frame (ParmParse& pp, const std::string& pfix) : - m_pp(pp) + m_pp(&pp) { push(pfix); BL_ASSERT( m_np == 1 ); @@ -1155,7 +1156,7 @@ ParmParse::Frame::~Frame () void ParmParse::Frame::push (const std::string& str) { - m_pp.pushPrefix(str); + m_pp->pushPrefix(str); m_np++; } @@ -1163,14 +1164,14 @@ void ParmParse::Frame::pop () { BL_ASSERT( m_np > 0); - m_pp.popPrefix(); + m_pp->popPrefix(); m_np--; } std::string ParmParse::Frame::getPrefix () const { - return m_pp.getPrefix(); + return m_pp->getPrefix(); } void @@ -1179,7 +1180,8 @@ ParmParse::appendTable(ParmParse::Table& tab) g_table.splice(g_table.end(), tab); } -static +namespace { + bool unused_table_entries_q (const ParmParse::Table& table, const std::string& prefix = std::string()) { @@ -1216,7 +1218,6 @@ unused_table_entries_q (const ParmParse::Table& table, const std::string& prefix return false; } -static void finalize_table (const std::string& pfx, const ParmParse::Table& table) { @@ -1227,7 +1228,7 @@ finalize_table (const std::string& pfx, const ParmParse::Table& table) if ( !li.m_queried ) { if (finalize_verbose) { - amrex::AllPrint() << "Record " << li.m_name << std::endl; + amrex::AllPrint() << "Record " << li.m_name << '\n'; } } else @@ -1238,12 +1239,14 @@ finalize_table (const std::string& pfx, const ParmParse::Table& table) else if ( !li.m_queried ) { if (finalize_verbose) { - amrex::AllPrint() << pfx << "::" << li << std::endl; + amrex::AllPrint() << pfx << "::" << li << '\n'; } } } } +} + void ParmParse::Initialize (int argc, char** argv, @@ -1267,7 +1270,7 @@ ParmParse::QueryUnusedInputs () finalize_verbose = amrex::system::verbose; if (finalize_verbose) { amrex::OutStream() << "Unused ParmParse Variables:\n"; } finalize_table(" [TOP]", g_table); - if (finalize_verbose) { amrex::OutStream() << std::endl; } + if (finalize_verbose) { 
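// Illustrative sketch, not part of the patch: the motivation for turning the
// reference members (ParmParse::m_table, Frame::m_pp) into pointers.  A class
// with a reference member has its copy/move assignment implicitly deleted;
// storing the address instead restores assignability, while constructors that
// still take a reference preserve the "never null" invariant.  All names here
// are hypothetical.
#include <list>
#include <string>

using Table = std::list<std::string>;

struct WithRef { Table& t; };   // WithRef a{x}, b{y}; a = b;  -> ill-formed

struct WithPtr {
    explicit WithPtr (Table& a_t) : t(&a_t) {}  // callers still pass a reference
    Table& table () { return *t; }              // callers still see a reference
    Table* t;                                   // but the member is assignable
};

int main ()
{
    Table x, y;
    WithPtr a(x), b(y);
    a = b;   // fine now; with a Table& member this would not compile
}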
amrex::OutStream() << '\n'; } return true; } return false; @@ -1279,7 +1282,7 @@ ParmParse::hasUnusedInputs (const std::string& prefix) return unused_table_entries_q(g_table, prefix); } -static +namespace { void get_entries_under_prefix (std::vector& found_entries, const ParmParse::Table& table, @@ -1308,6 +1311,7 @@ get_entries_under_prefix (std::vector& found_entries, } } } +} std::vector ParmParse::getUnusedInputs (const std::string& prefix) @@ -1333,7 +1337,7 @@ ParmParse::Finalize () finalize_verbose = amrex::system::verbose; if (finalize_verbose) { amrex::OutStream() << "Unused ParmParse Variables:\n"; } finalize_table(" [TOP]", g_table); - if (finalize_verbose) { amrex::OutStream() << std::endl; } + if (finalize_verbose) { amrex::OutStream() << '\n'; } // // First loop through and delete all queried entries. // @@ -1356,10 +1360,10 @@ ParmParse::dumpTable (std::ostream& os, bool prettyPrint) for (auto const& li : g_table) { if(prettyPrint && li.m_queried) { - os << li.print() << std::endl; + os << li.print() << '\n'; } else { - os << li << std::endl; + os << li << '\n'; } } } @@ -1371,7 +1375,7 @@ ParmParse::countval (const char* name, // // First find n'th occurrence of name in table. // - const PP_entry* def = ppindex(m_table, n, prefixedName(name), false); + const PP_entry* def = ppindex(*m_table, n, prefixedName(name), false); return def == nullptr ? 0 : static_cast(def->m_vals.size()); } @@ -1382,7 +1386,7 @@ ParmParse::getkth (const char* name, bool& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void @@ -1390,7 +1394,7 @@ ParmParse::get (const char* name, bool& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int @@ -1399,7 +1403,7 @@ ParmParse::querykth (const char* name, bool& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int @@ -1407,7 +1411,7 @@ ParmParse::query (const char* name, bool& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1421,25 +1425,25 @@ ParmParse::add (const char* name, void ParmParse::getkth (const char* name, int k, int& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, int& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, int& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, int& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1452,28 +1456,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table, 
prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1487,25 +1491,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, long& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, long& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, long& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, long& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1519,28 +1523,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1553,25 +1557,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, long long& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, long long& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, long long& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, long long& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1584,28 +1588,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } void 
ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1618,25 +1622,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, float& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, float& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, float& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, float& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1649,28 +1653,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix, num_val,k); + return squeryarr(*m_table, prefixedName(name),ref,start_ix, num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1685,25 +1689,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, double& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, double& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, double& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, double& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1716,28 +1720,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, 
int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix, num_val,k); + return squeryarr(*m_table, prefixedName(name),ref,start_ix, num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1752,25 +1756,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, std::string& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, std::string& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, std::string& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, std::string& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1783,28 +1787,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix, num_val,k); + return squeryarr(*m_table, prefixedName(name),ref,start_ix, num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1819,25 +1823,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, IntVect& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, IntVect& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, IntVect& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, IntVect& ref, int ival) const { - return squeryval(m_table, 
prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1850,28 +1854,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix, num_val,k); + return squeryarr(*m_table, prefixedName(name),ref,start_ix, num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1884,25 +1888,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, Box& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, Box& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, Box& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, Box& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1915,28 +1919,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix, num_val,k); + return squeryarr(*m_table, prefixedName(name),ref,start_ix, num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1954,7 +1958,7 @@ int ParmParse::countname (const std::string& name) const { int cnt = 0; - for (auto const& li : m_table) + for (auto const& li : *m_table) { if ( ppfound(prefixedName(name), li, false) ) { @@ -1968,7 +1972,7 @@ int ParmParse::countRecords (const std::string& name) const { int cnt = 0; - for (auto const& li : m_table) + for (auto const& li : *m_table) { if ( ppfound(prefixedName(name), li, true) ) { @@ -1985,14 +1989,14 @@ ParmParse::countRecords (const std::string& name) const bool ParmParse::contains (const char* name) const { - for (auto const& li : m_table) + 
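// Illustrative usage sketch for the accessor family being edited above (query,
// get, getarr, queryarr, and their *kth variants).  It mirrors the ParmParse
// interface as it appears in this patch; the input names ("prob.ncell",
// "prob.lo") are made up, and the two-argument calls assume the header's usual
// default arguments for ival/start_ix/num_val.
#include <AMReX_ParmParse.H>
#include <vector>

void read_inputs ()
{
    amrex::ParmParse pp("prob");   // every lookup below is prefixed: "prob.*"
    int ncell = 64;
    pp.query("ncell", ncell);      // optional: keeps the default if absent
    std::vector<double> lo;
    pp.getarr("lo", lo);           // required: aborts if "prob.lo" is missing
}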
for (auto const& li : *m_table) { if ( ppfound(prefixedName(name), li, false)) { // // Found an entry; mark all occurrences of name as used. // - for (auto& lli : m_table) + for (auto& lli : *m_table) { if ( ppfound(prefixedName(name), lli, false) ) { @@ -2009,9 +2013,9 @@ int ParmParse::remove (const char* name) { int r = 0; - for (auto it = m_table.begin(); it != m_table.end(); ) { + for (auto it = m_table->begin(); it != m_table->end(); ) { if (ppfound(prefixedName(name), *it, false)) { - it = m_table.erase(it); + it = m_table->erase(it); ++r; } else { ++it; @@ -2023,10 +2027,10 @@ ParmParse::remove (const char* name) ParmParse::Record ParmParse::getRecord (const std::string& name, int n) const { - const PP_entry* pe = ppindex(m_table, n, prefixedName(name), true); + const PP_entry* pe = ppindex(*m_table, n, prefixedName(name), true); if ( pe == nullptr ) { - amrex::ErrorStream() << "ParmParse::getRecord: record " << name << " not found" << std::endl; + amrex::ErrorStream() << "ParmParse::getRecord: record " << name << " not found" << '\n'; amrex::Abort(); return Record(ParmParse()); } else { diff --git a/Src/Base/AMReX_PhysBCFunct.H b/Src/Base/AMReX_PhysBCFunct.H index 2d0906f27ad..ff8b73b5f0e 100644 --- a/Src/Base/AMReX_PhysBCFunct.H +++ b/Src/Base/AMReX_PhysBCFunct.H @@ -78,7 +78,7 @@ public: int dcomp, int numcomp, Geometry const& geom, Real time, const Vector& bcr, int bcomp, - int orig_comp, FF&& fillfunc); + int orig_comp, FF const& fillfunc); void nddoit (Box const& bx, FArrayBox& dest, int dcomp, int numcomp, @@ -355,7 +355,7 @@ GpuBndryFuncFab::ccfcdoit (Box const& bx, FArrayBox& dest, int dcomp, int numcomp, Geometry const& geom, Real time, const Vector& bcr, int bcomp, - int orig_comp, FF&& fillfunc) + int orig_comp, FF const& fillfunc) { const IntVect& len = bx.length(); diff --git a/Src/Base/AMReX_PlotFileDataImpl.cpp b/Src/Base/AMReX_PlotFileDataImpl.cpp index 1fbf5044a50..b85c17ad93c 100644 --- a/Src/Base/AMReX_PlotFileDataImpl.cpp +++ b/Src/Base/AMReX_PlotFileDataImpl.cpp @@ -141,7 +141,7 @@ PlotFileDataImpl::get (int level, std::string const& varname) noexcept int gid = mfi.index(); FArrayBox& dstfab = mf[mfi]; std::unique_ptr srcfab(m_vismf[level]->readFAB(gid, icomp)); - dstfab.copy(*srcfab); + dstfab.copy(*srcfab); } } return mf; diff --git a/Src/Base/AMReX_PlotFileUtil.cpp b/Src/Base/AMReX_PlotFileUtil.cpp index df8ff405a10..52324dda506 100644 --- a/Src/Base/AMReX_PlotFileUtil.cpp +++ b/Src/Base/AMReX_PlotFileUtil.cpp @@ -98,8 +98,8 @@ WriteGenericPlotfileHeader (std::ostream &HeaderFile, HeaderFile << varnames.size() << '\n'; - for (int ivar = 0; ivar < varnames.size(); ++ivar) { - HeaderFile << varnames[ivar] << "\n"; + for (const auto & varname : varnames) { + HeaderFile << varname << "\n"; } HeaderFile << AMREX_SPACEDIM << '\n'; HeaderFile << time << '\n'; diff --git a/Src/Base/AMReX_REAL.H b/Src/Base/AMReX_REAL.H index 6383808a3de..54815fa3da1 100644 --- a/Src/Base/AMReX_REAL.H +++ b/Src/Base/AMReX_REAL.H @@ -127,44 +127,6 @@ inline namespace literals { } // namespace amrex #endif -#else - -#ifndef AMREX_XSDK - -/* - The REAL_T macro specifies the precision of the floating-point - calculations in Fortran code. It will be either real*4 or - real*8 depending upon which of the symbols BL_USE_FLOAT or - BL_USE_DOUBLE, respectively, is defined during compilations. For - portability, you should write floating-point code in terms of this - macro, instead of using real*4 or real*8 directly. 
- - Note that exactly one of these macros must be defined - when compiling any module that uses floating-point. -*/ - -#ifdef BL_USE_FLOAT -# define REAL_T REAL -#if __STDC__==1 || defined(__INTEL_COMPILER) -# define BL_REAL(a) a##E0 -# define BL_REAL_E(a,b) a##E##b -#else -# define BL_REAL(a) a/**/E0 -# define BL_REAL_E(a,b) a/**/E/**/b -#endif -#else -# define REAL_T DOUBLE PRECISION -#if __STDC__==1 || defined(__INTEL_COMPILER) -# define BL_REAL(a) a##D0 -# define BL_REAL_E(a,b) a##D##b -#else -# define BL_REAL(a) a/**/D0 -# define BL_REAL_E(a,b) a/**/D/**/b -#endif -#endif - -#endif /* ndef AMREX_XSDK */ - #endif /* !BL_LANG_FORT */ #endif /*BL_REAL_H*/ diff --git a/Src/Base/AMReX_Random.H b/Src/Base/AMReX_Random.H index 675c12082d5..50b2c2693b0 100644 --- a/Src/Base/AMReX_Random.H +++ b/Src/Base/AMReX_Random.H @@ -23,24 +23,29 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Real Random (RandomEngine const& random_engine) { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) -#ifdef BL_USE_FLOAT - return 1.0f - curand_uniform(random_engine.rand_state); +#if defined (__SYCL_DEVICE_ONLY__) + mkl::rng::device::uniform distr; + return mkl::rng::device::generate(distr, *random_engine.engine); #else - return 1.0 - curand_uniform_double(random_engine.rand_state); -#endif -#elif defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) #ifdef BL_USE_FLOAT - return 1.0f - hiprand_uniform(random_engine.rand_state); + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return 1.0f - hiprand_uniform(random_engine.rand_state); , + return 1.0f - curand_uniform(random_engine.rand_state); + ) + )) #else - return 1.0 - hiprand_uniform_double(random_engine.rand_state); + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return 1.0 - hiprand_uniform_double(random_engine.rand_state); , + return 1.0 - curand_uniform_double(random_engine.rand_state); + ) + )) #endif -#elif defined (__SYCL_DEVICE_ONLY__) - mkl::rng::device::uniform distr; - return mkl::rng::device::generate(distr, *random_engine.engine); -#else - amrex::ignore_unused(random_engine); - return Random(); + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return Random(); + )) #endif } @@ -56,24 +61,29 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Real RandomNormal (Real mean, Real stddev, RandomEngine const& random_engine) { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) -#ifdef BL_USE_FLOAT - return stddev * curand_normal(random_engine.rand_state) + mean; +#if defined (__SYCL_DEVICE_ONLY__) + mkl::rng::device::gaussian distr(mean, stddev); + return mkl::rng::device::generate(distr, *random_engine.engine); #else - return stddev * curand_normal_double(random_engine.rand_state) + mean; -#endif -#elif defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) #ifdef BL_USE_FLOAT - return stddev * hiprand_normal(random_engine.rand_state) + mean; + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return stddev * hiprand_normal(random_engine.rand_state) + mean; , + return stddev * curand_normal(random_engine.rand_state) + mean; + ) + )) #else - return stddev * hiprand_normal_double(random_engine.rand_state) + mean; + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return stddev * hiprand_normal_double(random_engine.rand_state) + mean; , + return stddev * curand_normal_double(random_engine.rand_state) + mean; + ) + )) #endif -#elif defined (__SYCL_DEVICE_ONLY__) - mkl::rng::device::gaussian distr(mean, stddev); - return mkl::rng::device::generate(distr, *random_engine.engine); -#else - amrex::ignore_unused(random_engine); - 
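// Illustrative sketch, not part of the patch: the shape of the
// AMREX_IF_ON_DEVICE((...)) / AMREX_IF_ON_HOST((...)) dispatch that replaces
// the `#if defined(__CUDA_ARCH__)` blocks above.  The toy macros below are
// what such a dispatch reduces to in a host-only build: the device branch is
// discarded at preprocessing time (so its curand-style calls never need to
// compile on the host) and the host branch is kept.  The double parentheses
// let a whole statement list travel as a single macro argument.
#include <cstdio>

#define TOY_STRIP_PARENS(...) __VA_ARGS__
#define TOY_IF_ON_DEVICE(code)                        // tokens vanish on the host
#define TOY_IF_ON_HOST(code) { TOY_STRIP_PARENS code }

double uniform01 ()
{
    TOY_IF_ON_DEVICE((
        return curand_style_uniform();                // GPU-only branch
    ))
    TOY_IF_ON_HOST((
        return 0.5;                                   // stand-in for the host RNG
    ))
}

int main () { std::printf("%g\n", uniform01()); }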
return RandomNormal(mean, stddev); + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return RandomNormal(mean, stddev); + )) #endif } @@ -91,16 +101,20 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE unsigned int RandomPoisson (Real lambda, RandomEngine const& random_engine) { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) - return curand_poisson(random_engine.rand_state, lambda); -#elif defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return hiprand_poisson(random_engine.rand_state, lambda); -#elif defined (__SYCL_DEVICE_ONLY__) +#if defined (__SYCL_DEVICE_ONLY__) mkl::rng::device::poisson distr(lambda); return mkl::rng::device::generate(distr, *random_engine.engine); #else - amrex::ignore_unused(random_engine); - return RandomPoisson(lambda); + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return hiprand_poisson(random_engine.rand_state, lambda); , + return curand_poisson(random_engine.rand_state, lambda); + ) + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return RandomPoisson(lambda); + )) #endif } @@ -116,22 +130,23 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE unsigned int Random_int (unsigned int n, RandomEngine const& random_engine) { -#if AMREX_DEVICE_COMPILE #if defined(__SYCL_DEVICE_ONLY__) mkl::rng::device::uniform distr(0,n); return mkl::rng::device::generate(distr, *random_engine.engine); #else - unsigned int rand; - constexpr unsigned int RAND_M = 4294967295; // 2**32-1 - do { - AMREX_HIP_OR_CUDA( rand = hiprand(random_engine.rand_state);, - rand = curand(random_engine.rand_state) ); - } while (rand > (RAND_M - RAND_M % n)); - return rand % n; -#endif -#else - amrex::ignore_unused(random_engine); - return Random_int(n); + AMREX_IF_ON_DEVICE(( + unsigned int rand; + constexpr unsigned int RAND_M = 4294967295; // 2**32-1 + do { + AMREX_HIP_OR_CUDA( rand = hiprand(random_engine.rand_state);, + rand = curand(random_engine.rand_state) ); + } while (rand > (RAND_M - RAND_M % n)); + return rand % n; + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return Random_int(n); + )) #endif } diff --git a/Src/Base/AMReX_Random.cpp b/Src/Base/AMReX_Random.cpp index 9e1059e6798..a0b72155c50 100644 --- a/Src/Base/AMReX_Random.cpp +++ b/Src/Base/AMReX_Random.cpp @@ -19,9 +19,15 @@ namespace namespace amrex { #ifdef AMREX_USE_SYCL sycl_rng_descr* rand_engine_descr = nullptr; -//xxxxx oneapi::mkl::rng::philox4x32x10* gpu_rand_generator = nullptr; #else amrex::randState_t* gpu_rand_state = nullptr; +#endif +} + +namespace { +#ifdef AMREX_USE_SYCL + oneapi::mkl::rng::philox4x32x10* gpu_rand_generator = nullptr; +#else amrex::randGenerator_t gpu_rand_generator = nullptr; #endif } @@ -44,8 +50,8 @@ void ResizeRandomSeed (amrex::ULong gpu_seed) rand_engine_descr = new sycl_rng_descr (Gpu::Device::streamQueue(), sycl::range<1>(N), gpu_seed, 1); -//xxxxx gpu_rand_generator = new std::remove_pointer_t -// (Gpu::Device::streamQueue(), gpu_seed+1234ULL); + gpu_rand_generator = new std::remove_pointer_t + (Gpu::Device::streamQueue(), gpu_seed+1234ULL); #elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) @@ -192,7 +198,7 @@ UniqueRandomSubset (Vector &uSet, int setSize, int poolSize, uSet = uSetTemp; if(printSet) { for(int i(0); i < uSet.size(); ++i) { - AllPrint() << "uSet[" << i << "] = " << uSet[i] << std::endl; + AllPrint() << "uSet[" << i << "] = " << uSet[i] << '\n'; } } } @@ -212,11 +218,11 @@ DeallocateRandomSeedDevArray () Gpu::streamSynchronize(); rand_engine_descr = nullptr; } -//xxxxx if 
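// Illustrative sketch, not part of the patch: the rejection loop used by
// Random_int above.  Taking `rand % n` directly over-weights small results
// whenever 2^32 is not a multiple of n; discarding draws from the incomplete
// block at the top of the 32-bit range essentially removes that modulo bias.
// std::mt19937 stands in for the curand/hiprand generators; n must be > 0.
#include <cstdint>
#include <random>

unsigned int random_int_unbiased (unsigned int n, std::mt19937& gen)
{
    constexpr std::uint32_t RAND_M = 4294967295u;  // 2**32-1
    std::uint32_t r;
    do {
        r = gen();                        // uniform over [0, 2^32)
    } while (r > RAND_M - RAND_M % n);    // reject the biased tail
    return r % n;
}

int main ()
{
    std::mt19937 gen(12345);
    return int(random_int_unbiased(6, gen));  // uniform in [0,6)
}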
(gpu_rand_generator != nullptr) { -// delete gpu_rand_generator; -// Gpu::streamSynchronize(); -// gpu_rand_generator = nullptr; -// } + if (gpu_rand_generator != nullptr) { + delete gpu_rand_generator; + Gpu::streamSynchronize(); + gpu_rand_generator = nullptr; + } #else if (gpu_rand_state != nullptr) { @@ -258,15 +264,9 @@ void FillRandom (Real* p, Long N) #elif defined(AMREX_USE_SYCL) -//xxxxx oneapi::mkl::rng::uniform distr; -// auto event = oneapi::mkl::rng::generate(distr, gpu_rand_generator, N, p); -// event.wait(); - - amrex::ParallelForRNG(N, [=] AMREX_GPU_DEVICE (Long i, RandomEngine const& eng) - { - p[i] = Random(eng); - }); - Gpu::streamSynchronize(); + oneapi::mkl::rng::uniform distr; + auto event = oneapi::mkl::rng::generate(distr, *gpu_rand_generator, N, p); + event.wait(); #else std::uniform_real_distribution distribution(Real(0.0), Real(1.0)); @@ -299,15 +299,9 @@ void FillRandomNormal (Real* p, Long N, Real mean, Real stddev) #elif defined(AMREX_USE_SYCL) -//xxxxx oneapi::mkl::rng::gaussian distr(mean, stddev); -// auto event = oneapi::mkl::rng::generate(distr, gpu_rand_generator, N, p); -// event.wait(); - - amrex::ParallelForRNG(N, [=] AMREX_GPU_DEVICE (Long i, RandomEngine const& eng) - { - p[i] = RandomNormal(mean, stddev, eng); - }); - Gpu::streamSynchronize(); + oneapi::mkl::rng::gaussian distr(mean, stddev); + auto event = oneapi::mkl::rng::generate(distr, *gpu_rand_generator, N, p); + event.wait(); #else @@ -322,30 +316,6 @@ void FillRandomNormal (Real* p, Long N, Real mean, Real stddev) } // namespace amrex - -// -// Fortran entry points for amrex::Random(). -// - -#if !defined(AMREX_XSDK) && !defined(BL_NO_FORT) -BL_FORT_PROC_DECL(BLUTILINITRAND,blutilinitrand)(const int* sd) -{ - amrex::ULong seed = *sd; - amrex::InitRandom(seed); -} - -BL_FORT_PROC_DECL(BLINITRAND,blinitrand)(const int* sd) -{ - amrex::ULong seed = *sd; - amrex::InitRandom(seed); -} - -BL_FORT_PROC_DECL(BLUTILRAND,blutilrand)(amrex::Real* rn) -{ - *rn = amrex::Random(); -} -#endif - extern "C" { double amrex_random () { diff --git a/Src/Base/AMReX_RealVect.H b/Src/Base/AMReX_RealVect.H index 83cc747b030..9e1d72700f7 100644 --- a/Src/Base/AMReX_RealVect.H +++ b/Src/Base/AMReX_RealVect.H @@ -28,7 +28,6 @@ namespace amrex C++ array. In addition, the basic arithmetic operators have been overloaded to implement scaling and translation operations. */ - class RealVect { public: @@ -521,13 +520,13 @@ public: /** This is a RealVect all of whose components are equal to zero. */ - static const RealVect Zero; + static AMREX_EXPORT const RealVect Zero; /// /** This is a RealVect all of whose components are equal to one. 
*/ - static const RealVect Unit; + static AMREX_EXPORT const RealVect Unit; /*@}*/ diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index ae8ae3ca0c1..1060b91f319 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -67,14 +67,14 @@ namespace Reduce::detail { template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - void for_each_init (T& t) + constexpr void for_each_init (T& t) { P().init(amrex::get(t)); } template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - void for_each_init (T& t) + constexpr void for_each_init (T& t) { P().init(amrex::get(t)); for_each_init(t); @@ -200,11 +200,11 @@ struct ReduceOpLogicalAnd template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - std::enable_if_t::value> + std::enable_if_t> local_update (T& d, T s) const noexcept { d = d && s; } template - constexpr std::enable_if_t::value> + constexpr std::enable_if_t> init (T& t) const noexcept { t = true; } }; @@ -232,11 +232,11 @@ struct ReduceOpLogicalOr template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - std::enable_if_t::value> + std::enable_if_t> local_update (T& d, T s) const noexcept { d = d || s; } template - constexpr std::enable_if_t::value> + constexpr std::enable_if_t> init (T& t) const noexcept { t = false; } }; @@ -366,7 +366,7 @@ public: // This is public for CUDA template - void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F&&f) + void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F const& f) { using ReduceTuple = typename D::Type; const int nboxes = mf.local_size(); @@ -376,11 +376,12 @@ public: const int nblocks = par_for_blocks.first[nboxes]; const int block_0_size = par_for_blocks.first[1]; const int* dp_nblocks = par_for_blocks.second; - const Box* dp_boxes = parforinfo.getBoxes(); + const BoxIndexer* dp_boxes = parforinfo.getBoxes(); auto const& stream = Gpu::gpuStream(); auto pdst = reduce_data.devicePtr(stream); int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks()); + AMREX_ASSERT(Long(nblocks_ec)*2 <= Long(std::numeric_limits::max())); reduce_data.nBlocks(stream) = nblocks_ec; reduce_data.updateMaxStreamIndex(stream); @@ -405,25 +406,19 @@ public: dst = r; } for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) { - int ibox, icell; + int ibox; + std::uint64_t icell; if (dp_nblocks) { ibox = amrex::bisect(dp_nblocks, 0, nboxes, iblock); - icell = (iblock-dp_nblocks[ibox])*AMREX_GPU_MAX_THREADS + threadIdx.x; + icell = std::uint64_t(iblock-dp_nblocks[ibox])*AMREX_GPU_MAX_THREADS + threadIdx.x; } else { ibox = iblock / block_0_size; - icell = (iblock-ibox*block_0_size)*AMREX_GPU_MAX_THREADS + threadIdx.x; + icell = std::uint64_t(iblock-ibox*block_0_size)*AMREX_GPU_MAX_THREADS + threadIdx.x; } - Box const& b = dp_boxes[ibox]; - int ncells = b.numPts(); - if (icell < ncells) { - const auto len = amrex::length(b); - int k = icell / (len.x*len.y); - int j = (icell - k*(len.x*len.y)) / len.x; - int i = (icell - k*(len.x*len.y)) - j*len.x; - AMREX_D_TERM(i += b.smallEnd(0);, - j += b.smallEnd(1);, - k += b.smallEnd(2);); + BoxIndexer const& indexer = dp_boxes[ibox]; + if (icell < indexer.numPts()) { + auto [i, j, k] = indexer(icell); Reduce::detail::mf_call_f (f, ibox, i, j, k, ncomp, r); } @@ -496,7 +491,7 @@ public: } template - void eval (Box const& box, D & reduce_data, F&& f) + void eval (Box const& box, D & reduce_data, F const& f) { using ReduceTuple = typename D::Type; auto const& stream = Gpu::gpuStream(); @@ -556,7 +551,7 @@ public: template ::value> > - void eval (Box const& box, N 
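// Illustrative sketch, not part of the patch: the mapping the new BoxIndexer
// abstracts in eval_mf above.  A box's cells are numbered with i fastest, then
// j, then k, and the flat index is now carried as std::uint64_t so that boxes
// with more than 2^31 cells decompose correctly.  The real BoxIndexer may also
// avoid the cost of 64-bit division; this plain-division toy only shows the
// index arithmetic.
#include <array>
#include <cstdint>

struct ToyIndexer {
    int lenx, leny;        // box extents in x and y
    int lox, loy, loz;     // box smallEnd
    std::array<int,3> operator() (std::uint64_t icell) const {
        auto plane = std::uint64_t(lenx) * leny;
        int k = int(icell / plane);
        int j = int((icell - k*plane) / std::uint64_t(lenx));
        int i = int(icell - k*plane - std::uint64_t(j)*lenx);
        return {i + lox, j + loy, k + loz};
    }
};

int main ()
{
    ToyIndexer box{4, 3, 10, 20, 30};  // extents 4x3, smallEnd (10,20,30)
    auto ijk = box(13);                // plane=12: k=1, j=0, i=1 -> (11,20,31)
    return ijk[0];
}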
ncomp, D & reduce_data, F&& f) + void eval (Box const& box, N ncomp, D & reduce_data, F const& f) { using ReduceTuple = typename D::Type; auto const& stream = Gpu::gpuStream(); @@ -617,7 +612,7 @@ public: template ::value> > - void eval (N n, D & reduce_data, F&& f) + void eval (N n, D & reduce_data, F const& f) { if (n <= 0) { return; } using ReduceTuple = typename D::Type; @@ -771,7 +766,7 @@ T Sum (N n, T const* v, T init_val = 0) template ::value> > -T Sum (N n, F&& f, T init_val = 0) +T Sum (N n, F const& f, T init_val = 0) { ReduceOps reduce_op; ReduceData reduce_data(reduce_op); @@ -794,7 +789,7 @@ T Min (N n, T const* v, T init_val = std::numeric_limits::max()) template ::value> > -T Min (N n, F&& f, T init_val = std::numeric_limits::max()) +T Min (N n, F const& f, T init_val = std::numeric_limits::max()) { ReduceOps reduce_op; ReduceData reduce_data(reduce_op); @@ -817,7 +812,7 @@ T Max (N n, T const* v, T init_val = std::numeric_limits::lowest()) template ::value> > -T Max (N n, F&& f, T init_val = std::numeric_limits::lowest()) +T Max (N n, F const& f, T init_val = std::numeric_limits::lowest()) { ReduceOps reduce_op; ReduceData reduce_data(reduce_op); @@ -842,7 +837,7 @@ std::pair MinMax (N n, T const* v) template ::value> > -std::pair MinMax (N n, F&& f) +std::pair MinMax (N n, F const& f) { ReduceOps reduce_op; ReduceData reduce_data(reduce_op); @@ -856,7 +851,7 @@ std::pair MinMax (N n, F&& f) } template ::value> > -bool AnyOf (N n, T const* v, P&& pred) +bool AnyOf (N n, T const* v, P const& pred) { Gpu::LaunchSafeGuard lsg(true); Gpu::DeviceScalar ds(0); @@ -912,7 +907,7 @@ bool AnyOf (N n, T const* v, P&& pred) } template -bool AnyOf (Box const& box, P&& pred) +bool AnyOf (Box const& box, P const& pred) { Gpu::LaunchSafeGuard lsg(true); Gpu::DeviceScalar ds(0); @@ -1042,8 +1037,8 @@ private: template AMREX_FORCE_INLINE static auto call_f (Box const& box, typename D::Type & r, F const& f) - noexcept -> std::enable_if_t, - typename D::Type>::value> + noexcept -> std::enable_if_t, + typename D::Type>> { using ReduceTuple = typename D::Type; const auto lo = amrex::lbound(box); @@ -1058,8 +1053,8 @@ private: template AMREX_FORCE_INLINE static auto call_f (Box const& box, typename D::Type & r, F const& f) - noexcept -> std::enable_if_t, - typename D::Type>::value> + noexcept -> std::enable_if_t, + typename D::Type>> { using ReduceTuple = typename D::Type; Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box)); @@ -1069,7 +1064,7 @@ public: template std::enable_if_t::value && IsCallable::value> - eval (MF const& mf, IntVect const& nghost, D & reduce_data, F&& f) + eval (MF const& mf, IntVect const& nghost, D & reduce_data, F const& f) { using ReduceTuple = typename D::Type; #ifdef AMREX_USE_OMP @@ -1091,7 +1086,7 @@ public: template std::enable_if_t::value && IsCallable::value> - eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F&& f) + eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F const& f) { using ReduceTuple = typename D::Type; #ifdef AMREX_USE_OMP @@ -1116,12 +1111,12 @@ public: void eval (Box const& box, D & reduce_data, F&& f) { auto& rr = reduce_data.reference(OpenMP::get_thread_num()); - call_f(box, rr, f); + call_f(box, rr, std::forward(f)); } template ::value> > - void eval (Box const& box, N ncomp, D & reduce_data, F&& f) + typename M=std::enable_if_t> > + void eval (Box const& box, N ncomp, D & reduce_data, F const& f) { using ReduceTuple = typename D::Type; auto& rr = 
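// Illustrative sketch, not part of the patch: the host-side fallback pattern
// used by the Reduce::Sum/Min/Max overloads in this part of AMReX_Reduce.H --
// a plain loop over a callable `f(i)` that picks up an OpenMP reduction clause
// when OpenMP is enabled.  `toy_sum` is a made-up stand-in.
#include <cstdio>

template <typename T, typename N, typename F>
T toy_sum (N n, F const& f, T init_val = 0)
{
    T r = init_val;
#ifdef _OPENMP
#pragma omp parallel for reduction(+:r)
#endif
    for (N i = 0; i < n; ++i) { r += f(i); }
    return r;
}

int main ()
{
    double s = toy_sum<double>(100, [] (int i) { return 0.5 * i; });
    std::printf("%g\n", s);  // 2475
}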
reduce_data.reference(OpenMP::get_thread_num()); @@ -1136,8 +1131,8 @@ public: } template ::value> > - void eval (N n, D & reduce_data, F&& f) + typename M=std::enable_if_t> > + void eval (N n, D & reduce_data, F const& f) { using ReduceTuple = typename D::Type; auto& rr = reduce_data.reference(OpenMP::get_thread_num()); @@ -1170,8 +1165,8 @@ public: namespace Reduce { template ::value> > -T Sum (N n, F&& f, T init_val = 0) + typename M=std::enable_if_t> > +T Sum (N n, F const& f, T init_val = 0) { T r = init_val; #ifdef AMREX_USE_OMP @@ -1183,15 +1178,15 @@ T Sum (N n, F&& f, T init_val = 0) return r; } -template ::value> > +template > > T Sum (N n, T const* v, T init_val = 0) { return Sum(n, [=] (N i) -> T { return v[i]; }, init_val); } template ::value> > -T Min (N n, F&& f, T init_val = std::numeric_limits::max()) + typename M=std::enable_if_t> > +T Min (N n, F const& f, T init_val = std::numeric_limits::max()) { T r = init_val; #ifdef AMREX_USE_OMP @@ -1203,15 +1198,15 @@ T Min (N n, F&& f, T init_val = std::numeric_limits::max()) return r; } -template ::value> > +template > > T Min (N n, T const* v, T init_val = std::numeric_limits::max()) { return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val); } template ::value> > -T Max (N n, F&& f, T init_val = std::numeric_limits::lowest()) + typename M=std::enable_if_t> > +T Max (N n, F const& f, T init_val = std::numeric_limits::lowest()) { T r = init_val; #ifdef AMREX_USE_OMP @@ -1223,15 +1218,15 @@ T Max (N n, F&& f, T init_val = std::numeric_limits::lowest()) return r; } -template ::value> > +template > > T Max (N n, T const* v, T init_val = std::numeric_limits::lowest()) { return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val); } template ::value> > -std::pair Min (N n, F&& f) + typename M=std::enable_if_t> > +std::pair Min (N n, F const& f) { T r_min = std::numeric_limits::max(); T r_max = std::numeric_limits::lowest(); @@ -1246,20 +1241,20 @@ std::pair Min (N n, F&& f) return std::make_pair(r_min,r_max); } -template ::value> > +template > > std::pair MinMax (N n, T const* v) { return Reduce::MinMax(n, [=] (N i) -> T { return v[i]; }); } -template ::value> > +template > > bool AnyOf (N n, T const* v, P&& pred) { - return std::any_of(v, v+n, pred); + return std::any_of(v, v+n, std::forward
(pred)); } template -bool AnyOf (Box const& box, P&&pred) +bool AnyOf (Box const& box, P const& pred) { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -1275,6 +1270,34 @@ bool AnyOf (Box const& box, P&&pred) #endif +/** + * \brief Return a GpuTuple containing the identity element for each operation in ReduceOps. + * For example 0, +inf and -inf for ReduceOpSum, ReduceOpMin and ReduceOpMax respectively. + */ +template +AMREX_GPU_HOST_DEVICE +constexpr GpuTuple +IdentityTuple (GpuTuple, ReduceOps) noexcept +{ + GpuTuple r{}; + Reduce::detail::for_each_init<0, decltype(r), Ps...>(r); + return r; +} + +/** + * \brief Return a GpuTuple containing the identity element for each ReduceOp in TypeList. + * For example 0, +inf and -inf for ReduceOpSum, ReduceOpMin and ReduceOpMax respectively. + */ +template +AMREX_GPU_HOST_DEVICE +constexpr GpuTuple +IdentityTuple (GpuTuple, TypeList) noexcept +{ + GpuTuple r{}; + Reduce::detail::for_each_init<0, decltype(r), Ps...>(r); + return r; +} + } #endif diff --git a/Src/Base/AMReX_RungeKutta.H b/Src/Base/AMReX_RungeKutta.H index cfac0851cab..2dc8514f9c6 100644 --- a/Src/Base/AMReX_RungeKutta.H +++ b/Src/Base/AMReX_RungeKutta.H @@ -4,8 +4,6 @@ #include -namespace amrex::RungeKutta { - /** * \brief Functions for Runge-Kutta methods * @@ -48,6 +46,7 @@ namespace amrex::RungeKutta { * FillPatcher class can be useful for implementing such a callable. See * AmrLevel::RK for an example. */ +namespace amrex::RungeKutta { struct PostStageNoOp { template @@ -156,8 +155,8 @@ void rk4_update_4 (MF& Unew, MF const& Uold, Array const& rkk, Real dt6) * \param post_stage post-processing stage results */ template -void RK2 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, - P&& post_stage = PostStageNoOp()) +void RK2 (MF& Uold, MF& Unew, Real time, Real dt, F const& frhs, FB const& fillbndry, + P const& post_stage = PostStageNoOp()) { BL_PROFILE("RungeKutta2"); @@ -194,8 +193,8 @@ void RK2 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, */ template -void RK3 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, - R&& store_crse_data, P&& post_stage = PostStageNoOp()) +void RK3 (MF& Uold, MF& Unew, Real time, Real dt, F const& frhs, FB const& fillbndry, + R const& store_crse_data, P const& post_stage = PostStageNoOp()) { BL_PROFILE("RungeKutta3"); @@ -244,8 +243,8 @@ void RK3 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, */ template -void RK4 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, - R&& store_crse_data, P&& post_stage = PostStageNoOp()) +void RK4 (MF& Uold, MF& Unew, Real time, Real dt, F const& frhs, FB const& fillbndry, + R const& store_crse_data, P const& post_stage = PostStageNoOp()) { BL_PROFILE("RungeKutta4"); diff --git a/Src/Base/AMReX_SPACE.H b/Src/Base/AMReX_SPACE.H index 91c4cc3a222..5328bb40b06 100644 --- a/Src/Base/AMReX_SPACE.H +++ b/Src/Base/AMReX_SPACE.H @@ -161,29 +161,6 @@ namespace amrex # define AMREX_D_PICK(a,b,c) c #endif -#ifndef AMREX_XSDK - -/* for backward compatibility */ -#if (AMREX_SPACEDIM == 1) -# define D_DECL(a,b,c) a -# define D_EXPR(a,b,c) ((void)((a),0)) -# define D_PICK(a,b,c) a -# define D_TERM(a,b,c) a -#elif (AMREX_SPACEDIM == 2) -# define D_DECL(a,b,c) a,b -# define D_EXPR(a,b,c) ((void)((a),(b),0)) -# define D_PICK(a,b,c) b -# define D_TERM(a,b,c) a b -#elif (AMREX_SPACEDIM == 3) -# define D_DECL(a,b,c) a,b,c -# define D_EXPR(a,b,c) ((void)((a),(b),(c),0)) -# define D_PICK(a,b,c) c -# define D_TERM(a,b,c) a b 
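// Illustrative usage for the IdentityTuple helpers documented above: seed a
// running reduction with each operation's identity element instead of
// hand-writing {0, +inf, -inf}.  Hedged sketch -- the exact template
// parameters are elided in this rendering of the patch, so the call shape
// below follows the \brief comments rather than a verified signature.
#include <AMReX_Reduce.H>
#include <AMReX_TypeList.H>

void seed_running_reduction ()
{
    using RT  = amrex::GpuTuple<amrex::Real, amrex::Real, amrex::Real>;
    using Ops = amrex::TypeList<amrex::ReduceOpSum,
                                amrex::ReduceOpMin,
                                amrex::ReduceOpMax>;
    auto r = amrex::IdentityTuple(RT{}, Ops{});
    // get<0>(r) == 0; get<1>(r) == max Real; get<2>(r) == lowest Real
    amrex::ignore_unused(r);
}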
c -#endif - -#endif - - #if (AMREX_SPACEDIM == 1) # define AMREX_1D_ONLY(a) a # define AMREX_2D_ONLY(a) ((void)0) diff --git a/Src/Base/AMReX_Scan.H b/Src/Base/AMReX_Scan.H index 11fdfd8bd70..4c94960cba6 100644 --- a/Src/Base/AMReX_Scan.H +++ b/Src/Base/AMReX_Scan.H @@ -187,7 +187,7 @@ struct BlockStatus #ifndef AMREX_SYCL_NO_MULTIPASS_SCAN template -T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum) +T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum) { if (n <= 0) { return 0; } constexpr int nwarps_per_block = 8; @@ -400,15 +400,15 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum #ifndef AMREX_SYCL_NO_MULTIPASS_SCAN if (nblocks > 1) { - return PrefixSum_mp(n, std::forward(fin), std::forward(fout), type, retSum); + return PrefixSum_mp(n, std::forward(fin), std::forward(fout), type, a_ret_sum); } #endif std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block) + sizeof(int); auto stream = Gpu::gpuStream(); - using BlockStatusT = typename std::conditional) <= 8, - detail::BlockStatus, detail::BlockStatus >::type; + using BlockStatusT = std::conditional_t) <= 8, + detail::BlockStatus, detail::BlockStatus >; std::size_t nbytes_blockstatus = Arena::align(sizeof(BlockStatusT)*nblocks); std::size_t nbytes_blockid = Arena::align(sizeof(unsigned int)); @@ -627,7 +627,7 @@ template ::value && (std::is_same,Type::Inclusive>::value || std::is_same,Type::Exclusive>::value)> > -T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum) +T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum) { if (n <= 0) { return 0; } constexpr int nwarps_per_block = 4; @@ -770,7 +770,7 @@ template ::value && (std::is_same,Type::Inclusive>::value || std::is_same,Type::Exclusive>::value)> > -T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum) +T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum) { if (n <= 0) { return 0; } constexpr int nwarps_per_block = 8; @@ -903,7 +903,7 @@ template ::value && (std::is_same,Type::Inclusive>::value || std::is_same,Type::Exclusive>::value)> > -T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum) +T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum) { if (n <= 0) { return 0; } constexpr int nwarps_per_block = 4; @@ -915,8 +915,8 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum) std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block) + sizeof(int); auto stream = Gpu::gpuStream(); - using BlockStatusT = typename std::conditional) <= 8, - detail::BlockStatus, detail::BlockStatus >::type; + using BlockStatusT = std::conditional_t) <= 8, + detail::BlockStatus, detail::BlockStatus >; std::size_t nbytes_blockstatus = Arena::align(sizeof(BlockStatusT)*nblocks); std::size_t nbytes_blockid = Arena::align(sizeof(unsigned int)); @@ -1281,10 +1281,10 @@ T ExclusiveSum (N n, T const* in, T * out, RetSum a_ret_sum = retSum) #else // !defined(AMREX_USE_GPU) template ::value && - (std::is_same,Type::Inclusive>::value || - std::is_same,Type::Exclusive>::value)> > -T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum = retSum) + typename M=std::enable_if_t && + (std::is_same_v,Type::Inclusive> || + std::is_same_v,Type::Exclusive>)> > +T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum = retSum) { if (n <= 0) { return 0; } T totalsum = 0; @@ -1292,7 +1292,7 @@ T PrefixSum (N n, FIN && 
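// Illustrative sketch, not part of the patch: the dimension-dispatch idiom
// behind the AMREX_D_* macros whose old unprefixed D_* aliases are removed
// above.  The macro keeps only the first AMREX_SPACEDIM arguments, so one
// source line can declare 1-, 2-, or 3-D data.  TOY_D_DECL is a toy copy.
#define TOY_SPACEDIM 2
#if (TOY_SPACEDIM == 1)
# define TOY_D_DECL(a,b,c) a
#elif (TOY_SPACEDIM == 2)
# define TOY_D_DECL(a,b,c) a,b
#else
# define TOY_D_DECL(a,b,c) a,b,c
#endif

int main ()
{
    int lo[] = { TOY_D_DECL(0, 0, 0) };     // 2-D build: int lo[] = {0, 0};
    return int(sizeof(lo) / sizeof(lo[0])); // 2
}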
fin, FOUT && fout, TYPE, RetSum = retSum) T x = fin(i); T y = totalsum; totalsum += x; - AMREX_IF_CONSTEXPR (std::is_same,Type::Inclusive>::value) { + AMREX_IF_CONSTEXPR (std::is_same_v,Type::Inclusive>) { y += x; } fout(i, y); @@ -1301,7 +1301,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum = retSum) } // The return value is the total sum. -template ::value> > +template > > T InclusiveSum (N n, T const* in, T * out, RetSum /*a_ret_sum*/ = retSum) { #if (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10) @@ -1314,7 +1314,7 @@ T InclusiveSum (N n, T const* in, T * out, RetSum /*a_ret_sum*/ = retSum) } // The return value is the total sum. -template ::value> > +template > > T ExclusiveSum (N n, T const* in, T * out, RetSum /*a_ret_sum*/ = retSum) { if (n <= 0) { return 0; } diff --git a/Src/Base/AMReX_TableData.H b/Src/Base/AMReX_TableData.H index b7572e2a1cf..8ff5f608334 100644 --- a/Src/Base/AMReX_TableData.H +++ b/Src/Base/AMReX_TableData.H @@ -22,12 +22,11 @@ struct Table1D int begin = 1; int end = 0; - AMREX_GPU_HOST_DEVICE - constexpr Table1D () noexcept {} + constexpr Table1D () noexcept = default; - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE - constexpr Table1D (Table1D::type> const& rhs) noexcept + constexpr Table1D (Table1D> const& rhs) noexcept : p(rhs.p), begin(rhs.begin), end(rhs.end) @@ -43,7 +42,7 @@ struct Table1D AMREX_GPU_HOST_DEVICE explicit operator bool () const noexcept { return p != nullptr; } - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (int i) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -57,16 +56,17 @@ struct Table1D void index_assert (int i) const { if (i < begin || i >= end) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d) is out of bound (%d:%d)\n", - i, begin, end-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << ") is out of bound (" - << begin << ":" << end-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d) is out of bound (%d:%d)\n", + i, begin, end-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << ") is out of bound (" + << begin << ":" << end-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -80,12 +80,11 @@ struct Table2D GpuArray begin{{1,1}}; GpuArray end{{0,0}}; - AMREX_GPU_HOST_DEVICE - constexpr Table2D () noexcept {} + constexpr Table2D () noexcept = default; - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE - constexpr Table2D (Table2D::type> const& rhs) noexcept + constexpr Table2D (Table2D> const& rhs) noexcept : p(rhs.p), jstride(rhs.jstride), begin(rhs.begin), @@ -105,7 +104,7 @@ struct Table2D AMREX_GPU_HOST_DEVICE explicit operator bool () const noexcept { return p != nullptr; } - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (int i, int j) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -120,17 +119,18 @@ struct Table2D { if (i < begin[0] || i >= end[0] || j < begin[1] || j >= end[1]) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d) is out of bound (%d:%d,%d:%d)\n", - i, j, begin[0], end[0]-1, begin[1], end[1]-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << ") is out of bound (" - << begin[0] << ":" << end[0]-1 - << "," << begin[1] << ":" << end[1]-1 << ")"; - 
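// Illustrative sketch, not part of the patch: the inclusive/exclusive
// semantics of the scalar PrefixSum fallback above.  For input {3,1,4} an
// exclusive scan writes {0,3,4}; an inclusive scan writes {3,4,8}; both return
// the total 8.  The `if (inclusive)` branch mirrors the `y += x` adjustment in
// the host code.
#include <cstdio>

template <typename T, int N>
T toy_prefix_sum (const T (&in)[N], T (&out)[N], bool inclusive)
{
    T totalsum = 0;
    for (int i = 0; i < N; ++i) {
        T x = in[i];
        T y = totalsum;               // exclusive result for slot i
        totalsum += x;
        if (inclusive) { y += x; }
        out[i] = y;
    }
    return totalsum;                  // the return value is the total sum
}

int main ()
{
    int in[] = {3, 1, 4}, out[3];
    std::printf("%d\n", toy_prefix_sum(in, out, false));  // out={0,3,4}; prints 8
}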
amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d) is out of bound (%d:%d,%d:%d)\n", + i, j, begin[0], end[0]-1, begin[1], end[1]-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << ") is out of bound (" + << begin[0] << ":" << end[0]-1 + << "," << begin[1] << ":" << end[1]-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -145,12 +145,11 @@ struct Table3D GpuArray begin{{1,1,1}}; GpuArray end{{0,0,0}}; - AMREX_GPU_HOST_DEVICE - constexpr Table3D () noexcept {} + constexpr Table3D () noexcept = default; - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE - constexpr Table3D (Table3D::type> const& rhs) noexcept + constexpr Table3D (Table3D> const& rhs) noexcept : p(rhs.p), jstride(rhs.jstride), kstride(rhs.kstride), @@ -172,7 +171,7 @@ struct Table3D AMREX_GPU_HOST_DEVICE explicit operator bool () const noexcept { return p != nullptr; } - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (int i, int j, int k) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -188,19 +187,20 @@ struct Table3D if (i < begin[0] || i >= end[0] || j < begin[1] || j >= end[1] || k < begin[2] || k >= end[2]) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d)\n", - i, j, k, begin[0], end[0]-1, begin[1], end[1]-1, - begin[2], end[2]-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << "," << k << ") is out of bound (" - << begin[0] << ":" << end[0]-1 - << "," << begin[1] << ":" << end[1]-1 - << "," << begin[2] << ":" << end[2]-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d)\n", + i, j, k, begin[0], end[0]-1, begin[1], end[1]-1, + begin[2], end[2]-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << "," << k << ") is out of bound (" + << begin[0] << ":" << end[0]-1 + << "," << begin[1] << ":" << end[1]-1 + << "," << begin[2] << ":" << end[2]-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -216,12 +216,11 @@ struct Table4D GpuArray begin{{1,1,1,1}}; GpuArray end{{0,0,0,0}}; - AMREX_GPU_HOST_DEVICE - constexpr Table4D () noexcept {} + constexpr Table4D () noexcept = default; - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE - constexpr Table4D (Table4D::type> const& rhs) noexcept + constexpr Table4D (Table4D> const& rhs) noexcept : p(rhs.p), jstride(rhs.jstride), kstride(rhs.kstride), @@ -245,7 +244,7 @@ struct Table4D AMREX_GPU_HOST_DEVICE explicit operator bool () const noexcept { return p != nullptr; } - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (int i, int j, int k, int n) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -262,20 +261,21 @@ struct Table4D j < begin[1] || j >= end[1] || k < begin[2] || k >= end[2] || n < begin[3] || n >= end[3]) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,%d:%d)\n", - i, j, k, n, begin[0], end[0]-1, begin[1], end[1]-1, - begin[2], end[2]-1, begin[3], end[3]-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << "," << k << "," << n << ") is out of bound (" - << begin[0] << ":" << end[0]-1 - << "," << begin[1] << ":" << end[1]-1 - << "," << begin[2] 
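// Illustrative sketch, not part of the patch: how the TableND types above lay
// out memory.  Like the rest of AMReX, the tables are Fortran-ordered -- the
// first index varies fastest and each later dimension advances by a
// precomputed stride (jstride, kstride, ...).  The begin/end arrays allow
// arbitrary (e.g. 1-based) lower bounds, which is exactly the window that
// index_assert checks.  ToyTable2D is a made-up analogue of Table2D.
struct ToyTable2D {
    double* p = nullptr;
    long jstride = 0;       // distance in elements between consecutive j
    int begin[2] = {1, 1};
    int end[2]   = {0, 0};
    double& operator() (int i, int j) const {
        return p[(i - begin[0]) + (j - begin[1]) * jstride];
    }
};

int main ()
{
    double data[6] = {0, 1, 2, 3, 4, 5};       // 3x2 table, 1-based indices
    ToyTable2D t{data, 3, {1, 1}, {4, 3}};
    return int(t(2, 2));                       // (2-1) + (2-1)*3 = 4 -> data[4]
}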
<< ":" << end[2]-1 - << "," << begin[3] << ":" << end[3]-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,%d:%d)\n", + i, j, k, n, begin[0], end[0]-1, begin[1], end[1]-1, + begin[2], end[2]-1, begin[3], end[3]-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << "," << k << "," << n << ") is out of bound (" + << begin[0] << ":" << end[0]-1 + << "," << begin[1] << ":" << end[1]-1 + << "," << begin[2] << ":" << end[2]-1 + << "," << begin[3] << ":" << end[3]-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -333,7 +333,7 @@ public: std::conditional_t, Table4D > > >; - TableData () noexcept; + TableData () noexcept = default; explicit TableData (Arena* ar) noexcept; @@ -347,11 +347,11 @@ public: ~TableData () noexcept; - constexpr int dim () const noexcept { return N; } + [[nodiscard]] constexpr int dim () const noexcept { return N; } void resize (Array const& lo, Array const& hi, Arena* ar = nullptr); - Long size () const noexcept; + [[nodiscard]] Long size () const noexcept; Array const& lo () const noexcept { return m_lo; } @@ -376,9 +376,6 @@ private: bool m_ptr_owner = false; }; -template -TableData::TableData () noexcept {} - template TableData::TableData (Array const& lo, Array const& hi, Arena* ar) : DataAllocator{ar}, m_lo(lo), m_hi(hi) @@ -404,14 +401,17 @@ template TableData& TableData::operator= (TableData && rhs) noexcept { - m_arena = rhs.m_arena; - m_dptr = rhs.m_dptr; - m_lo = rhs.m_lo; - m_hi = rhs.m_hi; - m_truesize = rhs.m_truesize; - m_ptr_owner = rhs.m_ptr_owner; - rhs.m_dptr = nullptr; - rhs.m_ptr_owner = false; + if (this != &rhs) { + clear(); + m_arena = rhs.m_arena; + m_dptr = rhs.m_dptr; + m_lo = rhs.m_lo; + m_hi = rhs.m_hi; + m_truesize = rhs.m_truesize; + m_ptr_owner = rhs.m_ptr_owner; + rhs.m_dptr = nullptr; + rhs.m_ptr_owner = false; + } return *this; } diff --git a/Src/Base/AMReX_TagParallelFor.H b/Src/Base/AMReX_TagParallelFor.H index 5aa748a3d61..ee8d089ee73 100644 --- a/Src/Base/AMReX_TagParallelFor.H +++ b/Src/Base/AMReX_TagParallelFor.H @@ -7,6 +7,7 @@ #include #include #include +#include #include namespace amrex { @@ -74,10 +75,10 @@ struct Array4BoxValTag { template struct VectorTag { T* p; - int m_size; + Long m_size; [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - int size () const noexcept { return m_size; } + Long size () const noexcept { return m_size; } }; #ifdef AMREX_USE_GPU @@ -85,20 +86,20 @@ struct VectorTag { namespace detail { template -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::enable_if_t().box())>, Box>::value, - int> + Long> get_tag_size (T const& tag) noexcept { + AMREX_ASSERT(tag.box().numPts() < Long(std::numeric_limits::max())); return static_cast(tag.box().numPts()); } template -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::enable_if_t().size())> >::value, - int> + Long> get_tag_size (T const& tag) noexcept { + AMREX_ASSERT(tag.size() < Long(std::numeric_limits::max())); return tag.size(); } @@ -151,6 +152,7 @@ ParallelFor_doit (Vector const& tags, F && f) const int ntags = tags.size(); if (ntags == 0) { return; } + Long l_ntotwarps = 0; int ntotwarps = 0; Vector nwarps; nwarps.reserve(ntags+1); @@ -158,8 +160,9 @@ ParallelFor_doit (Vector const& tags, F && f) { auto& tag = tags[i]; nwarps.push_back(ntotwarps); - ntotwarps += static_cast((get_tag_size(tag) - + Gpu::Device::warp_size-1) / Gpu::Device::warp_size); + auto nw = (get_tag_size(tag) + Gpu::Device::warp_size-1) / 
diff --git a/Src/Base/AMReX_TinyProfiler.cpp b/Src/Base/AMReX_TinyProfiler.cpp
index 3d935589409..82f3718c735 100644
--- a/Src/Base/AMReX_TinyProfiler.cpp
+++ b/Src/Base/AMReX_TinyProfiler.cpp
@@ -93,6 +93,12 @@ TinyProfiler::start () noexcept
 #endif
 
     if (!regionstack.empty()) {
+#ifdef AMREX_USE_GPU
+        if (device_synchronize_around_region) {
+            amrex::Gpu::streamSynchronize();
+        }
+#endif
+
 #ifdef AMREX_USE_CUPTI
         if (uCUPTI) {
             cudaDeviceSynchronize();
@@ -111,12 +117,6 @@ TinyProfiler::start () noexcept
         in_parallel_region = false;
 #endif
 
-#ifdef AMREX_USE_GPU
-        if (device_synchronize_around_region) {
-            amrex::Gpu::streamSynchronize();
-        }
-#endif
-
 #ifdef AMREX_USE_CUDA
         nvtxRangePush(fname.c_str());
 #elif defined(AMREX_USE_HIP) && defined(AMREX_USE_ROCTX)
@@ -149,8 +149,14 @@ TinyProfiler::stop () noexcept
 #ifdef AMREX_USE_OMP
 #pragma omp master
 #endif
-    if (!stats.empty())
-    {
+    if (!stats.empty()) {
+
+#ifdef AMREX_USE_GPU
+        if (device_synchronize_around_region) {
+            amrex::Gpu::streamSynchronize();
+        }
+#endif
+
         double t;
         int nKernelCalls = 0;
 #ifdef AMREX_USE_CUPTI
@@ -207,12 +213,6 @@ TinyProfiler::stop () noexcept
             std::get<1>(parent) += dtin;
         }
 
-#ifdef AMREX_USE_GPU
-        if (device_synchronize_around_region) {
-            amrex::Gpu::streamSynchronize();
-        }
-#endif
-
 #ifdef AMREX_USE_CUDA
         nvtxRangePop();
 #elif defined(AMREX_USE_HIP) && defined(AMREX_USE_ROCTX)
@@ -242,8 +242,12 @@ TinyProfiler::stop (unsigned boxUintID) noexcept
 #ifdef AMREX_USE_OMP
 #pragma omp master
 #endif
-    if (!stats.empty())
-    {
+    if (!stats.empty()) {
+
+        if (device_synchronize_around_region) {
+            amrex::Gpu::streamSynchronize();
+        }
+
         double t;
         cudaDeviceSynchronize();
         cuptiActivityFlushAll(0);
@@ -293,10 +297,6 @@ TinyProfiler::stop (unsigned boxUintID) noexcept
             std::get<1>(parent) += dtin;
         }
 
-        if (device_synchronize_around_region) {
-            amrex::Gpu::streamSynchronize();
-        }
-
 #ifdef AMREX_USE_CUDA
         nvtxRangePop();
 #elif defined(AMREX_USE_HIP) && defined(AMREX_USE_ROCTX)
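[Editor's note: the TinyProfiler hunks above move the optional device synchronization so it happens before the timer starts or stops, rather than after the NVTX/rocTX range is pushed or popped; otherwise time spent waiting on still-running asynchronous kernels could be attributed to the wrong region. The synchronization is off by default and is typically enabled at run time via the tiny-profiler runtime parameter, e.g. in an inputs file:

    tiny_profiler.device_synchronize_around_region = 1

Regions themselves are created with the usual scoped macro, a sketch of which is:

    void advance ()
    {
        BL_PROFILE("advance()");   // starts a TinyProfiler region; stops at scope exit
        // ... launch GPU work ...
    }
]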
diff --git a/Src/Base/AMReX_Tuple.H b/Src/Base/AMReX_Tuple.H
index 5cdfbee634f..2aca55a0c6b 100644
--- a/Src/Base/AMReX_Tuple.H
+++ b/Src/Base/AMReX_Tuple.H
@@ -24,7 +24,7 @@ namespace detail {
 template <std::size_t I, typename T>
 struct gpu_tuple_element
 {
-    template <typename U=T, std::enable_if_t<std::is_default_constructible<U>::value,int> = 0>
+    template <typename U=T, std::enable_if_t<std::is_default_constructible_v<U>,int> = 0>
     AMREX_GPU_HOST_DEVICE
     constexpr gpu_tuple_element () {} // NOLINT
 
         : m_value(a_value)
     {}
 
-    template <typename U, std::enable_if_t<std::is_convertible<U,T>::value,int> = 0>
+    template <typename U, std::enable_if_t<std::is_convertible_v<U,T>,int> = 0>
     explicit constexpr gpu_tuple_element (U && a_value) // NOLINT(bugprone-forwarding-reference-overload)
         : m_value(std::forward<U>(a_value))
     {}
@@ -47,7 +47,7 @@ struct gpu_tuple_impl
     : public gpu_tuple_impl<I+1, Tail...>,
       public gpu_tuple_element<I, Head>
 {
-    template <typename U=Head, std::enable_if_t<std::is_default_constructible<U>::value,int> = 0>
+    template <typename U=Head, std::enable_if_t<std::is_default_constructible_v<U>,int> = 0>
     AMREX_GPU_HOST_DEVICE
     constexpr gpu_tuple_impl () {} // NOLINT
 
@@ -56,7 +56,7 @@ struct gpu_tuple_impl
           gpu_tuple_element<I, Head>(a_head)
     {}
 
-    template <typename UH, typename... UT, std::enable_if_t<std::is_convertible<UH,Head>::value,int> = 0>
+    template <typename UH, typename... UT, std::enable_if_t<std::is_convertible_v<UH,Head>,int> = 0>
     constexpr gpu_tuple_impl (UH&& a_head, UT &&... a_tail)
         : gpu_tuple_impl<I+1, Tail...>(std::forward<UT>(a_tail)...),
           gpu_tuple_element<I, Head>(std::forward<UH>(a_head))
@@ -68,7 +68,7 @@ struct gpu_tuple_impl
     : public gpu_tuple_element<I, Head>
 {
-    template <typename U=Head, std::enable_if_t<std::is_default_constructible<U>::value,int> = 0>
+    template <typename U=Head, std::enable_if_t<std::is_default_constructible_v<U>,int> = 0>
     AMREX_GPU_HOST_DEVICE
     constexpr gpu_tuple_impl () {} // NOLINT
 
@@ -76,7 +76,7 @@ struct gpu_tuple_impl
         : gpu_tuple_element<I, Head>(a_head)
     {}
 
-    template <typename U, std::enable_if_t<std::is_convertible<U,Head>::value,int> = 0>
+    template <typename U, std::enable_if_t<std::is_convertible_v<U,Head>,int> = 0>
     explicit constexpr gpu_tuple_impl (U&& a_head) // NOLINT(bugprone-forwarding-reference-overload)
         : gpu_tuple_element<I, Head>(std::forward<U>(a_head))
     {}
@@ -241,7 +241,7 @@ namespace detail {
     template <typename T> struct unwrap { using type = T; };
     template <typename T> struct unwrap<std::reference_wrapper<T> > { using type = T&; };
     template <typename T>
-    using tuple_decay_t = typename unwrap<typename std::decay<T>::type>::type;
+    using tuple_decay_t = typename unwrap<std::decay_t<T>>::type;
 }
 
 template <typename R, typename TP1, typename TP2, std::size_t... N1, std::size_t... N2>
 AMREX_GPU_HOST_DEVICE
 constexpr R
-make_tuple (TP1 && a, TP2 && b,
+make_tuple (TP1 const& a, TP2 const& b,
             std::index_sequence<N1...> const& /*n1*/, std::index_sequence<N2...> const& /*n2*/)
 {
     return R(amrex::get<N1>(a)..., amrex::get<N2>(b)...);
@@ -297,9 +297,9 @@ TupleCat (TP1 && a, TP2 && b)
     -> typename detail::tuple_cat_result<detail::tuple_decay_t<TP1>,
                                          detail::tuple_decay_t<TP2> >::type;
     return detail::make_tuple<R>
-        (a, b,
-         std::make_index_sequence<GpuTupleSize<typename std::decay<TP1>::type>::value>(),
-         std::make_index_sequence<GpuTupleSize<typename std::decay<TP2>::type>::value>());
+        (std::forward<TP1>(a), std::forward<TP2>(b),
+         std::make_index_sequence<GpuTupleSize<std::decay_t<TP1>>::value>(),
+         std::make_index_sequence<GpuTupleSize<std::decay_t<TP2>>::value>());
 }
 
@@ -345,7 +345,7 @@ namespace detail {
     apply_impl (F&& f, TP&& t, std::index_sequence<N...> /*is*/)
         -> typename detail::apply_result<F, detail::tuple_decay_t<TP> >::type
     {
-        return f(amrex::get<N>(std::forward<TP>(t))...);
+        return std::forward<F>(f)(amrex::get<N>(std::forward<TP>(t))...);
     }
 }
 
@@ -355,7 +355,7 @@ constexpr auto
 Apply (F&& f, TP&& t) -> typename detail::apply_result<F, detail::tuple_decay_t<TP> >::type
 {
     return detail::apply_impl(std::forward<F>(f), std::forward<TP>(t),
-                              std::make_index_sequence<GpuTupleSize<typename std::decay<TP>::type>::value>());
+                              std::make_index_sequence<GpuTupleSize<std::decay_t<TP>>::value>());
 }
 
 // Tie
 
@@ -378,6 +378,20 @@ ForwardAsTuple (Ts&&... args) noexcept
     return GpuTuple<Ts&&...>(std::forward<Ts>(args)...);
 }
 
+// MakeZeroTuple
+
+/**
+ * \brief Return a GpuTuple containing all zeros.
+ * Note that a default-constructed GpuTuple can have uninitialized values.
+ */
+template <typename... Ts>
+AMREX_GPU_HOST_DEVICE
+constexpr GpuTuple<Ts...>
+MakeZeroTuple (GpuTuple<Ts...>) noexcept
+{
+    return GpuTuple<Ts...>(static_cast<Ts>(0)...);
+}
+
 }
 
 #endif /*AMREX_TUPLE_H_*/
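[Editor's note on the new MakeZeroTuple above: the argument is used only to deduce the element types, so one typically passes a default-constructed GpuTuple and keeps the value-initialized result, e.g. as the identity value of a sum reduction. A minimal usage sketch:

    amrex::GpuTuple<amrex::Real,int> t =
        amrex::MakeZeroTuple(amrex::GpuTuple<amrex::Real,int>{});
    // amrex::get<0>(t) == 0.0 and amrex::get<1>(t) == 0, whereas the
    // elements of a default-constructed GpuTuple may be uninitialized.
]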
diff --git a/Src/Base/AMReX_TypeList.H b/Src/Base/AMReX_TypeList.H
index 3bd0a837069..75201087b7d 100644
--- a/Src/Base/AMReX_TypeList.H
+++ b/Src/Base/AMReX_TypeList.H
@@ -34,13 +34,13 @@ using TypeAt = typename detail::TypeListGet<I,T>::type;
 
 namespace detail {
     template <typename F, typename... Ts, std::size_t... N>
-    constexpr void for_each_impl (F&&f, std::index_sequence<N...>)
+    constexpr void for_each_impl (F const&f, std::index_sequence<N...>)
     {
         (f(TypeAt<N,TypeList<Ts...>>{}), ...);
     }
 
     template <typename F, typename... Ts, std::size_t... N>
-    constexpr bool for_each_until_impl (F&&f, std::index_sequence<N...>)
+    constexpr bool for_each_until_impl (F const&f, std::index_sequence<N...>)
     {
         return (f(TypeAt<N,TypeList<Ts...>>{}) || ...);
     }
@@ -93,7 +93,7 @@ ForEach (TypeList<Ts...>, F&& f)
     // dst and src are either MultiFab or fMultiFab
     auto tt = CartesianProduct(TypeList<MultiFab,fMultiFab>{}, TypeList<MultiFab,fMultiFab>{});
-    bool r = ForEachUtil(tt, [&] (auto t) -> bool
+    bool r = ForEachUntil(tt, [&] (auto t) -> bool
     {
         using MF0 = TypeAt<0,decltype(t)>;
         using MF1 = TypeAt<1,decltype(t)>;
@@ -151,6 +151,53 @@ constexpr auto CartesianProduct (Ls...) {
     return (TypeList<TypeList<>>{} * ... * Ls{});
 }
 
+namespace detail {
+    // return TypeList<T, T, T, ..., T> by using the fast power algorithm
+    template <class T, std::size_t N>
+    constexpr auto SingleTypeMultiplier_impl () {
+        if constexpr (N == 0) {
+            return TypeList<>{};
+        } else if constexpr (N == 1) {
+            return TypeList<T>{};
+        } else if constexpr (N % 2 == 0) {
+            return SingleTypeMultiplier_impl<T, N/2>() + SingleTypeMultiplier_impl<T, N/2>();
+        } else {
+            return SingleTypeMultiplier_impl<T, N-1>() + TypeList<T>{};
+        }
+    }
+
+    // overload of SingleTypeMultiplier for multiple types:
+    // convert T[N] to T, T, T, T, ... (N times with N >= 1)
+    template <class T, std::size_t N>
+    constexpr auto SingleTypeMultiplier (const T (&)[N]) {
+        return SingleTypeMultiplier_impl<T, N>();
+    }
+
+    // overload of SingleTypeMultiplier for one regular type
+    template <class T>
+    constexpr auto SingleTypeMultiplier (T) {
+        return TypeList<T>{};
+    }
+
+    // apply the types of the input TypeList as template arguments to TParam
+    template