diff --git a/.codespell-ignore-words b/.codespell-ignore-words index 4e265268e63..4069bf5a026 100644 --- a/.codespell-ignore-words +++ b/.codespell-ignore-words @@ -27,6 +27,7 @@ parms pres ptd recuse +shft siz structed te diff --git a/.github/workflows/apps.yml b/.github/workflows/apps.yml index 45afcab4d77..0e7f3641984 100644 --- a/.github/workflows/apps.yml +++ b/.github/workflows/apps.yml @@ -11,7 +11,7 @@ jobs: name: Castro runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Get Latest Release Tag run: | CASTRO_TAG=$(wget https://github.com/AMReX-Astro/Castro/releases/latest 2>&1 | grep Location: | awk '{print $2}' | awk -F/ '{print $NF}') @@ -19,13 +19,13 @@ jobs: MICROPHYSICS_TAG=$(wget https://github.com/AMReX-Astro/Microphysics/releases/latest 2>&1 | grep Location: | awk '{print $2}' | awk -F/ '{print $NF}') echo "MICROPHYSICS_TAG=$MICROPHYSICS_TAG" >> $GITHUB_ENV - name: Download Castro - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: 'AMReX-Astro/Castro' ref: ${{env.CASTRO_TAG}} path: 'Castro' - name: Download Microphysics - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: 'AMReX-Astro/Microphysics' ref: ${{env.MICROPHYSICS_TAG}} @@ -35,7 +35,7 @@ jobs: .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -52,7 +52,7 @@ jobs: export AMREX_HOME=${PWD} export MICROPHYSICS_HOME=${PWD}/Microphysics cd Castro/Exec/hydro_tests/Sedov/ - make -j2 CCACHE=ccache USE_MPI=FALSE + make -j4 CCACHE=ccache USE_MPI=FALSE ccache -s du -hs ~/.cache/ccache @@ -61,13 +61,13 @@ jobs: name: WarpX runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Get Latest Release Tag run: | WARPX_TAG=$(wget https://github.com/ECP-WarpX/WarpX/releases/latest 2>&1 | grep Location: | awk '{print $2}' | awk -F/ '{print $NF}') echo "WARPX_TAG=$WARPX_TAG" >> $GITHUB_ENV - name: Download WarpX - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: 'ECP-WarpX/WarpX' ref: ${{env.WARPX_TAG}} @@ -77,7 +77,7 @@ jobs: .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -97,7 +97,7 @@ jobs: -DWarpX_OPENPMD=OFF \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build WarpX/build -j 2 + cmake --build WarpX/build -j 4 ccache -s du -hs ~/.cache/ccache @@ -111,7 +111,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/ascent.yml b/.github/workflows/ascent.yml index 83d2f7ebac3..74421cebac3 100644 --- a/.github/workflows/ascent.yml +++ b/.github/workflows/ascent.yml @@ -18,7 +18,7 @@ jobs: container: image: alpinedav/ascent:0.9.2 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Configure run: | . 
/ascent_docker_setup_env.sh @@ -26,9 +26,10 @@ jobs: -DCMAKE_BUILD_TYPE=Debug \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ - -DAMReX_ASCENT=ON + -DAMReX_ASCENT=ON \ + -DAMReX_CONDUIT=ON - name: Build run: | . /ascent_docker_setup_env.sh - cmake --build build -j 2 + cmake --build build -j 4 diff --git a/.github/workflows/bittree.yml b/.github/workflows/bittree.yml new file mode 100644 index 00000000000..cf50e16631c --- /dev/null +++ b/.github/workflows/bittree.yml @@ -0,0 +1,127 @@ +name: bittree + +on: [push, pull_request] + +concurrency: + group: ${{ github.ref }}-${{ github.head_ref }}-bittree + cancel-in-progress: true + +jobs: + bittree-2d: + name: Bittree 2D + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Dependencies + run: | + .github/workflows/dependencies/dependencies.sh + .github/workflows/dependencies/dependencies_clang-tidy.sh 15 + .github/workflows/dependencies/dependencies_ccache.sh + - name: Set Up Cache + uses: actions/cache@v4 + with: + path: ~/.cache/ccache + key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} + restore-keys: | + ccache-${{ github.workflow }}-${{ github.job }}-git- + - name: Check out Bittree + uses: actions/checkout@v4 + with: + repository: Flash-X/Bittree + path: bittree + - name: Build Bittree + run: | + cd ${{ github.workspace }}/bittree + python setup.py library --dim 2 --prefix ${{ github.workspace }}/libbittree + cd build + make -j4 + make install + - name: Build and Run Test + run: | + export CCACHE_COMPRESS=1 + export CCACHE_COMPRESSLEVEL=10 + export CCACHE_MAXSIZE=80M + export CCACHE_EXTRAFILES=${{ github.workspace }}/.clang-tidy + export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt + ccache -z + + export AMREX_BITTREE_HOME=${{ github.workspace }}/libbittree + cd ${{ github.workspace }}/Tests/Amr/Advection_AmrCore/Exec + make -j4 USE_MPI=TRUE USE_BITTREE=TRUE DIM=2 TEST=TRUE \ + CCACHE=ccache + mpiexec -n 2 ./main2d.gnu.TEST.MPI.ex inputs_bittree amr.plot_int=1000 + + ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt + make -j4 -k -f clang-tidy-ccache-misses.mak \ + CLANG_TIDY=clang-tidy-15 \ + CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" + + ccache -s + du -hs ~/.cache/ccache + + bittree-3d: + name: Bittree 3D + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Dependencies + run: | + .github/workflows/dependencies/dependencies.sh + .github/workflows/dependencies/dependencies_clang-tidy.sh 15 + .github/workflows/dependencies/dependencies_ccache.sh + - name: Set Up Cache + uses: actions/cache@v4 + with: + path: ~/.cache/ccache + key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} + restore-keys: | + ccache-${{ github.workflow }}-${{ github.job }}-git- + - name: Check out Bittree + uses: actions/checkout@v4 + with: + repository: Flash-X/Bittree + path: bittree + - name: Build Bittree + run: | + cd ${{ github.workspace }}/bittree + python setup.py library --dim 3 --prefix ${{ github.workspace }}/libbittree + cd build + make -j4 + make install + - name: Build and Run Test + run: | + export CCACHE_COMPRESS=1 + export CCACHE_COMPRESSLEVEL=10 + export CCACHE_MAXSIZE=80M + export CCACHE_EXTRAFILES=${{ github.workspace }}/.clang-tidy + export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt + ccache -z + + export AMREX_BITTREE_HOME=${{ github.workspace }}/libbittree + cd ${{ github.workspace }}/Tests/Amr/Advection_AmrCore/Exec + make -j4 USE_MPI=TRUE 
USE_BITTREE=TRUE DIM=3 TEST=TRUE BL_NO_FORT=TRUE\ + CCACHE=ccache + mpiexec -n 2 ./main3d.gnu.TEST.MPI.ex inputs_bittree max_step=10 + + ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt + make -j4 -k -f clang-tidy-ccache-misses.mak \ + CLANG_TIDY=clang-tidy-15 \ + CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" + + ccache -s + du -hs ~/.cache/ccache + + save_pr_number: + if: github.event_name == 'pull_request' + runs-on: ubuntu-latest + steps: + - name: Save PR number + env: + PR_NUMBER: ${{ github.event.number }} + run: | + echo $PR_NUMBER > pr_number.txt + - uses: actions/upload-artifact@v4 + with: + name: pr_number + path: pr_number.txt + retention-days: 1 diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml index eea7e576af6..b4886ff9623 100644 --- a/.github/workflows/clang.yml +++ b/.github/workflows/clang.yml @@ -15,14 +15,14 @@ jobs: name: Clang@7.0 C++17 SP NOMPI Debug [lib] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_clang.sh 7 .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -54,17 +54,17 @@ jobs: -DCMAKE_CXX_COMPILER=$(which clang++-7) \ -DCMAKE_Fortran_COMPILER=$(which gfortran) \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 make install make test_install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" export PATH=/tmp/my-amrex/bin:$PATH - which fcompare + which amrex_fcompare ctest --output-on-failure @@ -75,14 +75,14 @@ jobs: name: Clang@14.0 C++17 SP Particles DP Mesh Debug [tests] runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_clang.sh 14 .github/workflows/dependencies/dependencies_clang-tidy.sh 14 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -114,10 +114,10 @@ jobs: -DCMAKE_CXX_COMPILER=$(which clang++-14) \ -DCMAKE_Fortran_COMPILER=$(which gfortran) \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -131,14 +131,14 @@ jobs: name: Clang NOMPI Release [configure 2D] runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_clang.sh 14 .github/workflows/dependencies/dependencies_clang-tidy.sh 14 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: 
ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -154,12 +154,12 @@ jobs: ccache -z ./configure --dim 2 --with-fortran no --comp llvm --with-mpi no - make -j2 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS="-fno-operator-names" \ + make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS="-fno-operator-names" \ CCACHE=ccache make install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -175,7 +175,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/cleanup-cache-postpr.yml b/.github/workflows/cleanup-cache-postpr.yml index 73d6eaf0903..9a2ffb0f61a 100644 --- a/.github/workflows/cleanup-cache-postpr.yml +++ b/.github/workflows/cleanup-cache-postpr.yml @@ -16,7 +16,7 @@ jobs: env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Clean up ccache run: | gh extension install actions/gh-actions-cache @@ -31,7 +31,10 @@ jobs: set +e keys=$(gh actions-cache list -L 100 -R $REPO -B $BRANCH | cut -f 1) + # $keys might contain spaces. Thus we set IFS to \n. + IFS=$'\n' for k in $keys do - gh actions-cache delete $k -R $REPO -B $BRANCH --confirm + gh actions-cache delete "$k" -R $REPO -B $BRANCH --confirm done + unset IFS diff --git a/.github/workflows/cleanup-cache.yml b/.github/workflows/cleanup-cache.yml index 2a1a2f254a3..d18acbaa788 100644 --- a/.github/workflows/cleanup-cache.yml +++ b/.github/workflows/cleanup-cache.yml @@ -2,7 +2,7 @@ name: CleanUpCache on: workflow_run: - workflows: [LinuxClang, cuda, LinuxGcc, hip, Hypre, intel, macos, PETSc, SUNDIALS, windows, CodeQL, smoke, apps] + workflows: [bittree, LinuxClang, cuda, LinuxGcc, hip, Hypre, intel, macos, PETSc, SUNDIALS, CodeQL, smoke, apps] types: - completed @@ -16,7 +16,7 @@ jobs: env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Clean up ccache run: | gh extension install actions/gh-actions-cache @@ -27,7 +27,7 @@ jobs: EVENT=${{ github.event.workflow_run.event }} # Triggering workflow run name (e.g., LinuxClang) - WORKFLOW_NAME=${{ github.event.workflow_run.name }} + WORKFLOW_NAME="${{ github.event.workflow_run.name }}" if [[ $EVENT == "pull_request" ]]; then gh run download ${{ github.event.workflow_run.id }} -n pr_number @@ -45,16 +45,19 @@ jobs: # The goal is to keep the last used key of each job and delete all others. # something like ccache-LinuxClang- - keyprefix=ccache-${WORKFLOW_NAME}- + keyprefix="ccache-${WORKFLOW_NAME}-" - cached_jobs=$(gh actions-cache list -L 100 -R $REPO -B $BRANCH --key $keyprefix | awk -F '-git-' '{print $1}' | sort | uniq) + cached_jobs=$(gh actions-cache list -L 100 -R $REPO -B $BRANCH --key "$keyprefix" | awk -F '-git-' '{print $1}' | sort | uniq) # cached_jobs is something like "ccache-LinuxClang-configure-1d ccache-LinuxClang-configure-2d". + # It might also contain spaces. Thus we set IFS to \n. 
+ IFS=$'\n' for j in $cached_jobs do - old_keys=$(gh actions-cache list -L 100 -R $REPO -B $BRANCH --key ${j}-git- --sort last-used | cut -f 1 | tail -n +2) + old_keys=$(gh actions-cache list -L 100 -R $REPO -B $BRANCH --key "${j}-git-" --sort last-used | cut -f 1 | tail -n +2) for k in $old_keys do - gh actions-cache delete $k -R $REPO -B $BRANCH --confirm + gh actions-cache delete "$k" -R $REPO -B $BRANCH --confirm done done + unset IFS diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 0e03e384633..c7340ee449b 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -14,6 +14,7 @@ concurrency: jobs: analyze: + if: ${{ github.repository == 'AMReX-Codes/amrex' || github.event_name != 'schedule' }} name: Analyze runs-on: ubuntu-latest permissions: @@ -28,7 +29,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Packages (C++) if: ${{ matrix.language == 'cpp' }} @@ -40,7 +41,7 @@ jobs: - name: Set Up Cache if: ${{ matrix.language == 'cpp' }} - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -58,14 +59,14 @@ jobs: -DCMAKE_CXX_COMPILER="/usr/local/bin/g++" - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} queries: +security-and-quality config-file: ./.github/workflows/codeql/codeql-config.yml - name: Build (py) - uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 if: ${{ matrix.language == 'python' }} - name: Build (C++) @@ -76,7 +77,7 @@ jobs: export CCACHE_MAXSIZE=30M ccache -z - cmake --build build -j 2 + cmake --build build -j 4 ccache -s du -hs ~/.cache/ccache @@ -85,10 +86,10 @@ jobs: touch Src/Base/AMReX.cpp export CCACHE_DISABLE=1 cd build - make -j 2 + make -j 4 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + uses: github/codeql-action/analyze@v3 with: category: "/language:${{ matrix.language }}" @@ -101,7 +102,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index 491b839c00e..bec24c5c7ca 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install codespell run: | diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index b4abc1c0a05..1ceed52ba63 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -12,13 +12,13 @@ jobs: name: CUDA@11.2 GNU@9.3.0 C++17 Release [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_nvcc.sh 11.2 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -52,7 +52,7 @@ jobs: -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ccache -s du -hs ~/.cache/ccache @@ -71,13 +71,13 @@ jobs: CUDAARCHS: "70" AMReX_CUDA_ARCH: "7.0" 
steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_llvm_cuda11_clang15.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -104,7 +104,7 @@ jobs: -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ccache -s du -hs ~/.cache/ccache @@ -114,13 +114,14 @@ jobs: name: NVHPC NVCC/NVC++ C++17 Release [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | + .github/workflows/dependencies/ubuntu_free_disk_space.sh .github/workflows/dependencies/dependencies_nvhpc.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -164,7 +165,7 @@ jobs: -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ccache -s du -hs ~/.cache/ccache @@ -174,13 +175,13 @@ jobs: name: CUDA@11.2 GNU@9.3.0 [configure 3D] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_nvcc.sh 11.2 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -199,7 +200,7 @@ jobs: # /home/runner/work/amrex/amrex/Src/Base/AMReX_GpuLaunchGlobal.H:16:41: error: unused parameter ‘f0’ [-Werror=unused-parameter] # 16 | AMREX_GPU_GLOBAL void launch_global (L f0) { f0(); } # - make -j2 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS="-fno-operator-names -Wno-unused-parameter" CCACHE=ccache CUDA_ARCH="7.0 7.2" + make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS="-fno-operator-names -Wno-unused-parameter" CCACHE=ccache CUDA_ARCH="7.0 7.2" make install ccache -s @@ -214,7 +215,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/dependencies/dependencies_codeplay.sh b/.github/workflows/dependencies/dependencies_codeplay.sh index dfd2e5a03f7..0f723a0bfc0 100755 --- a/.github/workflows/dependencies/dependencies_codeplay.sh +++ b/.github/workflows/dependencies/dependencies_codeplay.sh @@ -6,10 +6,25 @@ set -eu -o pipefail -curl -o oneapi_nvidia.sh -L "https://developer.codeplay.com/api/v1/products/download?product=oneapi&variant=nvidia&filters[]=linux&aat=$1" -chmod +x oneapi_nvidia.sh -sudo ./oneapi_nvidia.sh --yes +# `man apt.conf`: +# Number of retries to perform. If this is non-zero APT will retry +# failed files the given number of times. 
+echo 'Acquire::Retries "3";' | sudo tee /etc/apt/apt.conf.d/80-retries -curl -o oneapi_amd.sh -L "https://developer.codeplay.com/api/v1/products/download?product=oneapi&variant=amd&filters[]=linux&aat=$1" -chmod +x oneapi_amd.sh -sudo ./oneapi_amd.sh --yes +# https://developer.codeplay.com/apt/index.html +sudo wget -qO - https://developer.codeplay.com/apt/public.key | gpg --dearmor | sudo tee /usr/share/keyrings/codeplay-keyring.gpg > /dev/null +echo "deb [signed-by=/usr/share/keyrings/codeplay-keyring.gpg] https://developer.codeplay.com/apt all main" | sudo tee /etc/apt/sources.list.d/codeplay.list + +sudo apt-get clean +sudo apt-get update + +# try apt-get install up to five times, to tolerate transient connection failures +status=1 +for itry in {1..5} +do + sudo apt-get install -y --no-install-recommends \ $1 \ && { sudo apt-get clean; sudo apt-get update; status=0; break; } \ || { sleep 10; } +done +if [[ ${status} -ne 0 ]]; then exit 1; fi diff --git a/.github/workflows/dependencies/dependencies_hip.sh b/.github/workflows/dependencies/dependencies_hip.sh index 852342e4ac1..4673a7caed5 100755 --- a/.github/workflows/dependencies/dependencies_hip.sh +++ b/.github/workflows/dependencies/dependencies_hip.sh @@ -45,6 +45,9 @@ sudo apt-get install -y --no-install-recommends \ rocrand-dev \ rocprim-dev +# hiprand-dev is a new package that does not exist in old versions +sudo apt-get install -y --no-install-recommends hiprand-dev || true + # activate # source /etc/profile.d/rocm.sh diff --git a/.github/workflows/dependencies/ubuntu_free_disk_space.sh b/.github/workflows/dependencies/ubuntu_free_disk_space.sh new file mode 100755 index 00000000000..6b3e5b2f55e --- /dev/null +++ b/.github/workflows/dependencies/ubuntu_free_disk_space.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# +# Copyright 2023 The AMReX Community +# +# License: BSD-3-Clause-LBNL + +# Don't want to use the following line because apt-get remove may fail if +# the package specified does not exist.
+# set -eu -o pipefail + +# Large packages +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 + +echo 'Removing some packages we do not need' + +df -h + +apt list --installed + +sudo apt-get remove -y '^apache.*' +sudo apt-get remove -y '^aspnetcore.*' +sudo apt-get remove -y '^azure.*' +sudo apt-get remove -y '^dotnet.*' +sudo apt-get remove -y '^firebird.*' +sudo apt-get remove -y '^firefox.*' +sudo apt-get remove -y '^google.*' +sudo apt-get remove -y '^hhvm.*' +sudo apt-get remove -y '^microsoft.*' +sudo apt-get remove -y '^mongodb.*' +sudo apt-get remove -y '^mono-.*' +sudo apt-get remove -y '^monodoc-.*' +sudo apt-get remove -y '^mysql.*' +sudo apt-get remove -y '^php.*' +sudo apt-get remove -y '^powershell.*' +sudo apt-get remove -y '^snapd.*' +sudo apt-get remove -y '^temurin.*' + +sudo apt-get autoremove -y + +df -h diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 82e387cbff4..49d564b36b3 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: persist-credentials: false @@ -27,12 +27,11 @@ jobs: - name: Deploy if: github.event_name == 'push' && github.repository == 'AMReX-Codes/amrex' && github.ref == 'refs/heads/development' - uses: JamesIves/github-pages-deploy-action@3.7.1 + uses: JamesIves/github-pages-deploy-action@v4.5.0 with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - ACCESS_TOKEN: ${{ secrets.DEPLOY_DOCS }} - REPOSITORY_NAME: AMReX-Codes/AMReX-Codes.github.io - BRANCH: main # The branch the action should deploy to. - FOLDER: build # The folder the action should deploy. - TARGET_FOLDER: amrex # The folder the action should deploy to. - CLEAN: false # Do not remove existing files from the deploy target. + ssh-key: ${{ secrets.AMREX_BUILD_DOCS }} + repository-name: AMReX-Codes/AMReX-Codes.github.io + branch: main # The branch the action should deploy to. + folder: build # The folder the action should deploy. + target-folder: amrex # The folder the action should deploy to. + clean: false # Do not remove existing files from the deploy target. 
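Aside on the retry loop added in .github/workflows/dependencies/dependencies_codeplay.sh above: retrying apt-get a fixed number of times with a sleep between attempts is a reusable pattern for flaky CI mirrors. A minimal standalone sketch of the same idea, in the same spirit as the patch; the helper name apt_install_with_retries is illustrative only and is not part of this patch:

#!/usr/bin/env bash
# Sketch: retry `apt-get install` up to 5 times before giving up.
# (apt_install_with_retries is a hypothetical helper, not part of this diff.)
apt_install_with_retries () {
    local status=1
    for itry in {1..5}
    do
        sudo apt-get install -y --no-install-recommends "$@" \
            && { status=0; break; } \
            || { sleep 10; sudo apt-get update; }   # back off, refresh indexes, retry
    done
    return ${status}
}

apt_install_with_retries ccache cmake   # example invocation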
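Likewise, the IFS=$'\n' changes in cleanup-cache.yml and cleanup-cache-postpr.yml above exist because cache keys can contain spaces, so an unquoted $keys would otherwise split one key into several words. A small sketch of the behavior, using hard-coded sample keys rather than real `gh actions-cache` output:

#!/usr/bin/env bash
# Two sample keys; the first contains a space (hypothetical values).
keys=$'ccache-Linux Clang-job-git-abc\nccache-gcc-job-git-def'

IFS=$'\n'        # split on newlines only, so spaces inside a key survive
for k in $keys
do
    echo "would delete: $k"   # "$k" stays quoted to keep the key intact
done
unset IFS        # restore default word splitting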
diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index 6915a246018..8c2e8403f5d 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -16,14 +16,14 @@ jobs: name: GNU@8.4 C++17 Release [lib] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_gcc.sh 8 .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -50,17 +50,17 @@ jobs: -DCMAKE_CXX_COMPILER=$(which g++-8) \ -DCMAKE_Fortran_COMPILER=$(which gfortran-8) \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 make install make test_install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" export PATH=/tmp/my-amrex/bin:$PATH - which fcompare + which amrex_fcompare ctest --output-on-failure @@ -72,14 +72,14 @@ jobs: name: GNU@9.3 C++17 3D Debug Fortran [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -104,10 +104,10 @@ jobs: -DAMReX_FORTRAN=ON \ -DAMReX_SPACEDIM=3 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -120,14 +120,14 @@ jobs: name: GNU@9.3 C++17 2D Debug Fortran [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -152,10 +152,10 @@ jobs: -DAMReX_FORTRAN=ON \ -DAMReX_SPACEDIM=2 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -168,14 +168,14 @@ jobs: name: GNU@9.3 C++17 1D Debug Fortran [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh 
.github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -201,10 +201,10 @@ jobs: -DAMReX_FORTRAN=ON \ -DAMReX_SPACEDIM=1 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -218,14 +218,14 @@ jobs: name: GNU@10.1 C++20 OMP [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_gcc.sh 10 .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -259,11 +259,11 @@ jobs: -DCMAKE_CXX_COMPILER=$(which g++-10) \ -DCMAKE_Fortran_COMPILER=$(which gfortran-10) \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 # Let's not use clang-tidy for this test because it wants to use C++20. # ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - # make -j2 -f clang-tidy-ccache-misses.mak \ + # make -j4 -k -f clang-tidy-ccache-misses.mak \ # CLANG_TIDY=clang-tidy-12 \ # CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -277,14 +277,14 @@ jobs: name: GNU@8.4 C++17 NOMPI [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_gcc.sh 8 .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -317,10 +317,10 @@ jobs: -DCMAKE_CXX_COMPILER=$(which g++-8) \ -DCMAKE_Fortran_COMPILER=$(which gfortran-8) \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -334,14 +334,14 @@ jobs: name: GNU@12 C++17 w/o Fortran [tests] runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_gcc.sh 12 .github/workflows/dependencies/dependencies_clang-tidy.sh 14 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -381,10 +381,10 @@ jobs: -DCMAKE_CXX_COMPILER=$(which g++-12) \ -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 ${{github.workspace}}/Tools/C_scripts/mmclt.py 
--input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -398,14 +398,14 @@ jobs: name: GNU@9.3 Release [configure 1D] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -421,12 +421,12 @@ jobs: ccache -z ./configure --dim 1 - make -j2 XTRA_CXXFLAGS=-fno-operator-names \ + make -j4 XTRA_CXXFLAGS=-fno-operator-names \ CCACHE=ccache make install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -438,14 +438,14 @@ jobs: name: GNU@11.2 Release [configure 3D] runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 15 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -461,12 +461,12 @@ jobs: ccache -z ./configure --dim 3 --enable-eb yes --enable-xsdk-defaults yes - make -j2 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ + make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ CCACHE=ccache make install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-15 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -478,14 +478,14 @@ jobs: name: GNU@9.3 Release [configure 3D] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -501,12 +501,12 @@ jobs: ccache -z ./configure --dim 3 --enable-eb no --enable-xsdk-defaults no --single-precision yes --single-precision-particles yes --enable-tiny-profile yes - make -j2 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ + make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ CCACHE=ccache make install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -518,14 +518,14 
@@ jobs: name: GNU@9.3 OMP Debug [configure 3D] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -541,12 +541,12 @@ jobs: ccache -z ./configure --dim 3 --enable-eb yes --enable-xsdk-defaults yes --with-omp yes --debug yes - make -j2 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ + make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ CCACHE=ccache make install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -558,14 +558,14 @@ jobs: name: GNU Plotfile Tools [tools] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -581,11 +581,11 @@ jobs: ccache -z cd Tools/Plotfile - make -j2 USE_MPI=FALSE USE_OMP=FALSE WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ + make -j4 USE_MPI=FALSE USE_OMP=FALSE WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ CCACHE=ccache ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -597,14 +597,14 @@ jobs: name: GNU@9.3 C++17 [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 12 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -627,10 +627,10 @@ jobs: -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_ENABLE_TESTS=ON \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -649,7 +649,7 @@ jobs: CXX: h5pcc CC: h5cc steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh @@ -659,7 +659,7 @@ jobs: run: | cd Tests/HDF5Benchmark export OLCF_HDF5_ROOT=/usr/lib/x86_64-linux-gnu/hdf5/openmpi - make -j 2 + make -j 4 mpirun -np 2 ./main3d.gnu.TPROF.MPI.ex ./inputs h5dump -d 
"level_0/data:offsets=0" -s "1" -c "1" ./plt00000.h5 h5dump -d "level_0/data:datatype=1" -s "1" -c "1" ./plt00000/particle0/particle0.h5 @@ -673,7 +673,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/hip.yml b/.github/workflows/hip.yml index 0c0e59a96c4..0a66bac0ab3 100644 --- a/.github/workflows/hip.yml +++ b/.github/workflows/hip.yml @@ -11,13 +11,13 @@ jobs: name: HIP ROCm Flang C++17 [tests] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_hip.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -66,7 +66,7 @@ jobs: -DCMAKE_Fortran_COMPILER=$(which flang) \ -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ccache -s du -hs ~/.cache/ccache @@ -75,13 +75,13 @@ jobs: name: HIP ROCm GFortran@9.3 C++17 [tests-hipcc] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_hip.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -128,7 +128,7 @@ jobs: -DCMAKE_Fortran_COMPILER=$(which gfortran) \ -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build_full_legacywrapper -j 2 + cmake --build build_full_legacywrapper -j 4 ccache -s du -hs ~/.cache/ccache @@ -138,13 +138,13 @@ jobs: name: HIP EB [configure 2D] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_hip.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -158,7 +158,7 @@ jobs: ccache -z ./configure --dim 2 --with-hip yes --enable-eb yes --enable-xsdk-defaults yes --with-mpi no --with-omp no --single-precision yes --single-precision-particles yes - make -j2 WARN_ALL=TRUE XTRA_CXXFLAGS="-fno-operator-names" AMD_ARCH=gfx90a CCACHE=ccache + make -j4 WARN_ALL=TRUE XTRA_CXXFLAGS="-fno-operator-names" AMD_ARCH=gfx90a CCACHE=ccache make install ccache -s @@ -169,13 +169,13 @@ jobs: name: HIP EB 3D GMake runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_hip.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -189,7 +189,7 @@ jobs: ccache -z cd Tests/LinearSolvers/NodeEB - make -j2 USE_HIP=TRUE USE_MPI=FALSE BL_NO_FORT=TRUE WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names AMD_ARCH=gfx90a CCACHE=ccache + make -j4 USE_HIP=TRUE USE_MPI=FALSE BL_NO_FORT=TRUE WARN_ALL=TRUE WARN_ERROR=TRUE 
XTRA_CXXFLAGS=-fno-operator-names AMD_ARCH=gfx90a CCACHE=ccache ccache -s du -hs ~/.cache/ccache @@ -203,7 +203,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/hypre.yml b/.github/workflows/hypre.yml index 50423f3942a..b3916176f79 100644 --- a/.github/workflows/hypre.yml +++ b/.github/workflows/hypre.yml @@ -13,14 +13,14 @@ jobs: env: AMREX_HYPRE_HOME: ${HOME}/.cache/hypre-2.26.0-cuda steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_nvcc.sh 11.2 sudo apt-get install -y libcublas-dev-11-2 libcusparse-dev-11-2 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: | ~/.cache/ccache @@ -38,7 +38,7 @@ jobs: ./configure --with-cxxstandard=17 --with-cuda --enable-unified-memory \ --with-cuda-home=/usr/local/cuda --with-gpu-arch="80" \ --prefix=${{ env.AMREX_HYPRE_HOME }} - make -j 2 + make -j 4 make install cd ../../ fi @@ -53,7 +53,7 @@ jobs: export CUDA_PATH=/usr/local/cuda export PATH=${PATH}:/usr/local/cuda/bin cd Tests/LinearSolvers/CellEB - make -j2 USE_MPI=TRUE USE_HYPRE=TRUE DIM=2 USE_CUDA=TRUE CCACHE=ccache + make -j4 USE_MPI=TRUE USE_HYPRE=TRUE DIM=2 USE_CUDA=TRUE CCACHE=ccache ccache -s du -h -d1 ~/.cache @@ -62,14 +62,14 @@ jobs: name: GCC 3D Hypre@2.21.0 runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 14 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -81,7 +81,7 @@ jobs: tar xfz v2.21.0.tar.gz cd hypre-2.21.0/src ./configure --with-cxxstandard=17 --enable-bigint - make -j 2 + make -j 4 make install cd ../../ - name: Build and Run Test @@ -95,12 +95,12 @@ jobs: export AMREX_HYPRE_HOME=${PWD}/hypre-2.21.0/src/hypre cd Tests/LinearSolvers/ABecLaplacian_C - make -j2 USE_MPI=TRUE USE_HYPRE=TRUE DIM=3 \ + make -j4 USE_MPI=TRUE USE_HYPRE=TRUE DIM=3 \ CCACHE=ccache mpiexec -n 2 ./main3d.gnu.MPI.ex inputs.hypre ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -111,14 +111,14 @@ jobs: name: GCC EB 2D Hypre@2.28.0 runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 14 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -130,7 +130,7 @@ jobs: tar xfz v2.28.0.tar.gz cd hypre-2.28.0/src ./configure --with-cxxstandard=17 - make -j 2 + make -j 4 make install cd ../../ - name: Build and Run Test @@ -144,11 +144,11 @@ jobs: export AMREX_HYPRE_HOME=${PWD}/hypre-2.28.0/src/hypre cd 
Tests/LinearSolvers/Hypre - make -j2 USE_MPI=TRUE USE_HYPRE=TRUE DIM=2 CCACHE=ccache + make -j4 USE_MPI=TRUE USE_HYPRE=TRUE DIM=2 CCACHE=ccache mpiexec -n 2 ./main2d.gnu.MPI.ex inputs.2d ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -164,7 +164,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index 031f7d331bf..e251612b599 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -11,13 +11,13 @@ jobs: name: oneAPI SYCL [tests] runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_dpcpp.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -44,8 +44,9 @@ jobs: -DCMAKE_C_COMPILER=$(which icx) \ -DCMAKE_CXX_COMPILER=$(which icpx) \ -DCMAKE_Fortran_COMPILER=$(which ifx) \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build --parallel 2 + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DAMReX_PARALLEL_LINK_JOBS=4 + cmake --build build --parallel 4 ccache -s du -hs ~/.cache/ccache @@ -54,13 +55,13 @@ jobs: name: oneAPI SYCL [tests w/ EB] runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_dpcpp.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -86,8 +87,9 @@ jobs: -DAMReX_GPU_BACKEND=SYCL \ -DCMAKE_C_COMPILER=$(which icx) \ -DCMAKE_CXX_COMPILER=$(which icpx) \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build --parallel 2 + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DAMReX_PARALLEL_LINK_JOBS=4 + cmake --build build --parallel 4 ccache -s du -hs ~/.cache/ccache @@ -95,29 +97,24 @@ jobs: tests-oneapi-sycl-eb-nvidia: name: oneAPI SYCL for Nvidia GPUs [tests w/ EB] runs-on: ubuntu-latest - env: - CODEPLAYTOKEN: ${{ secrets.CODEPLAYTOKEN }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies - if: ${{ env.CODEPLAYTOKEN != '' }} run: | - .github/workflows/dependencies/dependencies_nvcc.sh + .github/workflows/dependencies/ubuntu_free_disk_space.sh .github/workflows/dependencies/dependencies_dpcpp.sh - .github/workflows/dependencies/dependencies_codeplay.sh ${{ env.CODEPLAYTOKEN }} + .github/workflows/dependencies/dependencies_nvcc.sh 12.0 + .github/workflows/dependencies/dependencies_codeplay.sh oneapi-nvidia-12.0 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - if: ${{ env.CODEPLAYTOKEN != '' }} - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} restore-keys: | ccache-${{ github.workflow }}-${{ github.job }}-git- - name: Build & Install - if: ${{ 
env.CODEPLAYTOKEN != '' }} - # clang currently supports CUDA up to version 11.5 and a warning is issued with newer versions - env: {CXXFLAGS: "-fsycl -fsycl-targets=nvptx64-nvidia-cuda -fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor -Wno-unknown-cuda-version"} + env: {CXXFLAGS: "-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --offload-arch=sm_80 -fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor"} run: | export CCACHE_COMPRESS=1 export CCACHE_COMPRESSLEVEL=10 @@ -136,8 +133,9 @@ jobs: -DAMReX_GPU_BACKEND=SYCL \ -DCMAKE_C_COMPILER=$(which icx) \ -DCMAKE_CXX_COMPILER=$(which clang++) \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build --parallel 2 + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DAMReX_PARALLEL_LINK_JOBS=4 + cmake --build build --parallel 4 ccache -s du -hs ~/.cache/ccache @@ -145,27 +143,23 @@ jobs: no-tests-oneapi-sycl-amd: name: oneAPI SYCL for AMD GPUs runs-on: ubuntu-20.04 - env: - CODEPLAYTOKEN: ${{ secrets.CODEPLAYTOKEN }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies - if: ${{ env.CODEPLAYTOKEN != '' }} run: | - .github/workflows/dependencies/dependencies_hip.sh 5.4.6 + .github/workflows/dependencies/ubuntu_free_disk_space.sh .github/workflows/dependencies/dependencies_dpcpp.sh - .github/workflows/dependencies/dependencies_codeplay.sh ${{ env.CODEPLAYTOKEN }} + .github/workflows/dependencies/dependencies_hip.sh 5.4.3 + .github/workflows/dependencies/dependencies_codeplay.sh oneapi-amd-5.4.3 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - if: ${{ env.CODEPLAYTOKEN != '' }} - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} restore-keys: | ccache-${{ github.workflow }}-${{ github.job }}-git- - name: Build & Install - if: ${{ env.CODEPLAYTOKEN != '' }} env: {CXXFLAGS: "-fsycl -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx90a -fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor"} run: | export CCACHE_COMPRESS=1 @@ -185,8 +179,9 @@ jobs: -DAMReX_SYCL_SUB_GROUP_SIZE=64 \ -DCMAKE_C_COMPILER=$(which icx) \ -DCMAKE_CXX_COMPILER=$(which clang++) \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build --parallel 2 + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DAMReX_PARALLEL_LINK_JOBS=4 + cmake --build build --parallel 4 ccache -s du -hs ~/.cache/ccache @@ -196,14 +191,15 @@ jobs: name: ICC [tests] runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | + .github/workflows/dependencies/ubuntu_free_disk_space.sh .github/workflows/dependencies/dependencies_dpcpp.sh sudo apt-get install -y intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -224,12 +220,11 @@ jobs: -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ - -DAMReX_FORTRAN=ON \ + -DAMReX_FORTRAN=OFF \ -DCMAKE_C_COMPILER=$(which icc) \ 
-DCMAKE_CXX_COMPILER=$(which icpc) \ - -DCMAKE_Fortran_COMPILER=$(which ifort) \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build --parallel 2 + cmake --build build --parallel 4 cmake --build build --target install cmake --build build --target test_install @@ -253,7 +248,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 1c405c2d66d..007ac29cb3a 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -12,11 +12,11 @@ jobs: name: AppleClang Universal w/o MPI [tests-universal] runs-on: macos-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: .github/workflows/dependencies/dependencies_mac.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: /Users/runner/Library/Caches/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -53,11 +53,11 @@ jobs: name: AppleClang@11.0 GFortran@9.3 [tests] runs-on: macos-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: .github/workflows/dependencies/dependencies_mac.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: /Users/runner/Library/Caches/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -95,7 +95,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/petsc.yml b/.github/workflows/petsc.yml index 6d0b92b1343..0e79fddd020 100644 --- a/.github/workflows/petsc.yml +++ b/.github/workflows/petsc.yml @@ -11,14 +11,14 @@ jobs: name: GCC 2D EB PETSc@3.18.1 runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 14 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -31,7 +31,7 @@ jobs: cd petsc-3.18.1 export PETSC_DIR=${PWD} ./configure --prefix=${PWD}/petsc - make -j 2 + make -j 4 make install cd ../ - name: Build and Run Test @@ -45,12 +45,12 @@ jobs: export AMREX_PETSC_HOME=${PWD}/petsc-3.18.1/petsc cd Tests/LinearSolvers/CellEB - make -j2 USE_MPI=TRUE USE_PETSC=TRUE DIM=2 TEST=TRUE \ + make -j4 USE_MPI=TRUE USE_PETSC=TRUE DIM=2 TEST=TRUE \ CCACHE=ccache mpiexec -n 2 ./main2d.gnu.TEST.MPI.ex inputs.rt.2d.petsc ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -66,7 +66,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/post-pr.yml b/.github/workflows/post-pr.yml index f5b914033b7..2768ef376cc 100644 --- 
a/.github/workflows/post-pr.yml +++ b/.github/workflows/post-pr.yml @@ -13,7 +13,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/sensei.yml b/.github/workflows/sensei.yml index fc5a0db3059..52f8e418a6b 100644 --- a/.github/workflows/sensei.yml +++ b/.github/workflows/sensei.yml @@ -21,7 +21,7 @@ jobs: container: image: senseiinsitu/ci:fedora35-amrex-20220613 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup run: mkdir build - name: Configure @@ -35,4 +35,4 @@ jobs: - name: Build run: | cd build - cmake --build . -j 2 + cmake --build . -j 4 diff --git a/.github/workflows/smoke.yml b/.github/workflows/smoke.yml index 080a17fd984..f3d0b899df9 100644 --- a/.github/workflows/smoke.yml +++ b/.github/workflows/smoke.yml @@ -11,14 +11,14 @@ jobs: name: GNU Multi-D runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_clang-tidy.sh 15 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -42,12 +42,12 @@ jobs: -DAMReX_EB=ON \ -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 make install make test_install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-15 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -58,7 +58,7 @@ jobs: -DAMReX_ROOT=../../../installdir \ -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - make -j 2 + make -j 4 mpiexec -n 2 ./install_test ../../Amr/Advection_AmrCore/Exec/inputs-ci ccache -s @@ -73,7 +73,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index 8cd87d66e87..00a86b00ac4 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -10,21 +10,21 @@ jobs: tabs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Tabs run: .github/workflows/style/check_tabs.sh trailing_whitespaces: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Trailing Whitespaces run: .github/workflows/style/check_trailing_whitespaces.sh doxygen: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Doxygen run: | sudo apt-get install -y --no-install-recommends doxygen diff --git a/.github/workflows/sundials.yml b/.github/workflows/sundials.yml index 12dfd42c159..c4735479105 100644 --- a/.github/workflows/sundials.yml +++ b/.github/workflows/sundials.yml @@ -15,13 +15,13 @@ jobs: CCACHE_COMPRESSLEVEL: 10 CCACHE_MAXSIZE: 20M steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache 
- uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -41,7 +41,7 @@ jobs: -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_C_COMPILER_LAUNCHER=ccache - make -j2 + make -j4 make install cd ../.. - name: Compile Test @@ -57,10 +57,10 @@ jobs: -DSUNDIALS_ROOT=${PWD}/sundials-6.5.0/instdir \ -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j4 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -75,13 +75,13 @@ jobs: CCACHE_COMPRESSLEVEL: 10 CCACHE_MAXSIZE: 75M steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Dependencies run: | .github/workflows/dependencies/dependencies_nvcc.sh .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/ccache key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} @@ -110,7 +110,7 @@ jobs: -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache - make -j2 + make -j4 make install cd ../.. - name: Compile Test @@ -129,7 +129,7 @@ jobs: -DSUNDIALS_ROOT=${PWD}/sundials-6.5.0/instdir \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache - cmake --build build -j 2 + cmake --build build -j 4 ccache -s du -hs ~/.cache/ccache @@ -143,7 +143,7 @@ jobs: PR_NUMBER: ${{ github.event.number }} run: | echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: pr_number path: pr_number.txt diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 985e143108f..ab3db558649 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -12,101 +12,104 @@ jobs: name: MSVC C++17 w/o Fortran w/o MPI runs-on: windows-latest steps: - - uses: actions/checkout@v3 - - uses: seanmiddleditch/gha-setup-ninja@master - - name: Set Up Cache - uses: actions/cache@v3 - with: - path: ~/.ccache - key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} - restore-keys: | - ccache-${{ github.workflow }}-${{ github.job }}-git- - - name: Install Ccache - run: | - Invoke-WebRequest https://github.com/ccache/ccache/releases/download/v4.8/ccache-4.8-windows-x86_64.zip -OutFile ccache-4.8-windows-x86_64.zip - Expand-Archive ccache-4.8-windows-x86_64.zip + - uses: actions/checkout@v4 + #- name: Set Up Cache + # uses: actions/cache@v3 + # with: + # path: ~/.ccache + # key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} + # restore-keys: | + # ccache-${{ github.workflow }}-${{ github.job }}-git- + #- name: Install Ccache + # run: | + # Invoke-WebRequest https://github.com/ccache/ccache/releases/download/v4.8/ccache-4.8-windows-x86_64.zip -OutFile ccache-4.8-windows-x86_64.zip + # Expand-Archive ccache-4.8-windows-x86_64.zip - name: Build & Install run: | - $ccachepath = Join-Path $pwd "ccache-4.8-windows-x86_64" - $Env:PATH += ";$ccachepath" - $ccachecachedir = Join-Path $HOME ".ccache" - $Env:CCACHE_DIR="$ccachecachedir" - $Env:CCACHE_DIR - $Env:CCACHE_COMPRESS='1' - $Env:CCACHE_COMPRESSLEVEL='10' - 
$Env:CCACHE_MAXSIZE='105M' - ccache -z + #$ccachepath = Join-Path $pwd "ccache-4.8-windows-x86_64" + #$Env:PATH += ";$ccachepath" + #$ccachecachedir = Join-Path $HOME ".ccache" + #$Env:CCACHE_DIR="$ccachecachedir" + #$Env:CCACHE_DIR + #$Env:CCACHE_COMPRESS='1' + #$Env:CCACHE_COMPRESSLEVEL='10' + #$Env:CCACHE_MAXSIZE='105M' + #ccache -z cmake -S . -B build ` - -G "Ninja" ` - -DCMAKE_BUILD_TYPE=Debug ` -DBUILD_SHARED_LIBS=ON ` -DCMAKE_VERBOSE_MAKEFILE=ON ` -DAMReX_EB=OFF ` -DAMReX_ENABLE_TESTS=ON ` -DAMReX_FORTRAN=OFF ` - -DAMReX_MPI=OFF ` - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build --config Debug -j 2 + -DAMReX_MPI=OFF + #-DCMAKE_CXX_COMPILER_LAUNCHER=ccache + cmake --build build --config Debug -j 4 + + cmake --build build --config Debug --target install - ccache -s + $Env:PATH += ";D:\\a\amrex\amrex\installdir\bin" + cmake --build build --config Debug --target test_install + + #ccache -s # Build libamrex and all test (static) test_msvc_static: name: MSVC C++17 w/o Fortran w/o MPI static runs-on: windows-latest steps: - - uses: actions/checkout@v3 - - uses: seanmiddleditch/gha-setup-ninja@master - - name: Set Up Cache - uses: actions/cache@v3 - with: - path: ~/.ccache - key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} - restore-keys: | - ccache-${{ github.workflow }}-${{ github.job }}-git- - - name: Install Ccache - run: | - Invoke-WebRequest https://github.com/ccache/ccache/releases/download/v4.8/ccache-4.8-windows-x86_64.zip -OutFile ccache-4.8-windows-x86_64.zip - Expand-Archive ccache-4.8-windows-x86_64.zip + - uses: actions/checkout@v4 + #- name: Set Up Cache + # uses: actions/cache@v3 + # with: + # path: ~/.ccache + # key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }} + # restore-keys: | + # ccache-${{ github.workflow }}-${{ github.job }}-git- + #- name: Install Ccache + # run: | + # Invoke-WebRequest https://github.com/ccache/ccache/releases/download/v4.8/ccache-4.8-windows-x86_64.zip -OutFile ccache-4.8-windows-x86_64.zip + # Expand-Archive ccache-4.8-windows-x86_64.zip - name: Build & Install run: | - $ccachepath = Join-Path $pwd "ccache-4.8-windows-x86_64" - $Env:PATH += ";$ccachepath" - $ccachecachedir = Join-Path $HOME ".ccache" - $Env:CCACHE_DIR="$ccachecachedir" - $Env:CCACHE_COMPRESS='1' - $Env:CCACHE_COMPRESSLEVEL='10' - $Env:CCACHE_MAXSIZE='135M' - ccache -z + #$ccachepath = Join-Path $pwd "ccache-4.8-windows-x86_64" + #$Env:PATH += ";$ccachepath" + #$ccachecachedir = Join-Path $HOME ".ccache" + #$Env:CCACHE_DIR="$ccachecachedir" + #$Env:CCACHE_COMPRESS='1' + #$Env:CCACHE_COMPRESSLEVEL='10' + #$Env:CCACHE_MAXSIZE='135M' + #ccache -z cmake -S . 
-B build ` - -G "Ninja" ` - -DCMAKE_BUILD_TYPE=RelWithDebInfo ` -DCMAKE_VERBOSE_MAKEFILE=ON ` -DAMReX_EB=ON ` -DAMReX_ENABLE_TESTS=ON ` -DAMReX_FORTRAN=OFF ` - -DAMReX_MPI=OFF ` - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build --config RelWithDebInfo -j 2 + -DAMReX_MPI=OFF + #-DCMAKE_CXX_COMPILER_LAUNCHER=ccache + cmake --build build --config RelWithDebInfo -j 4 + + cmake --build build --config RelWithDebInfo --target install + cmake --build build --config RelWithDebInfo --target test_install - ccache -s + #ccache -s # Build libamrex and all tests tests_clang: name: Clang C++17 w/o Fortran w/o MPI runs-on: windows-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: seanmiddleditch/gha-setup-ninja@master - name: Build & Install shell: cmd + env: + CMAKE_GENERATOR_TOOLSET: "ClangCl" + CMAKE_GENERATOR: "Visual Studio 17 2022" run: | call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\vc\Auxiliary\build\vcvarsall.bat" x64 cmake -S . -B build ^ - -T "ClangCl" ^ - -DCMAKE_BUILD_TYPE=Release ^ -DBUILD_SHARED_LIBS=ON ^ -DCMAKE_VERBOSE_MAKEFILE=ON ^ -DAMReX_EB=ON ^ @@ -114,19 +117,25 @@ jobs: -DAMReX_FORTRAN=OFF ^ -DAMReX_MPI=OFF ^ -DAMReX_OMP=ON - cmake --build build --config Release -j 2 + cmake --build build --config Release -j 4 - save_pr_number: - if: github.event_name == 'pull_request' - runs-on: ubuntu-latest - steps: - - name: Save PR number - env: - PR_NUMBER: ${{ github.event.number }} - run: | - echo $PR_NUMBER > pr_number.txt - - uses: actions/upload-artifact@v3 - with: - name: pr_number - path: pr_number.txt - retention-days: 1 + cmake --build build --config Release --target install + + set "PATH=%PATH%;D:\\a\amrex\amrex\installdir\bin" + cmake --build build --config Release --target test_install + + # If we add ccache back, don't forget to update cleanup-cache.yml + #save_pr_number: + # if: github.event_name == 'pull_request' + # runs-on: ubuntu-latest + # steps: + # - name: Save PR number + # env: + # PR_NUMBER: ${{ github.event.number }} + # run: | + # echo $PR_NUMBER > pr_number.txt + # - uses: actions/upload-artifact@v3 + # with: + # name: pr_number + # path: pr_number.txt + # retention-days: 1 diff --git a/CHANGES b/CHANGES index e27552e7cb4..a946f87cf23 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,306 @@ +# 24.03 + + -- Fix GPU restart for pure SoA particles (#3783) + + -- fix for ref_ratio=1 (#3786) + + -- Update GMRES/MLMG interface (#3779) + + -- Ref ratio 3 (#3781) + + -- Curl Curl solver: 4-color Gauss-Seidel smoother (#3778) + + -- assert wavefront size (#3777) + + -- Curl of Curl solver: Tweak restriction (#3765) + + -- Adjust debug info argument for HIP compiler (#3761) + + -- fixed bug in MLCurlCurL::xdoty() to prevent doing MPI sum twice. 
(#3774) + + -- Implement portable assumptions with AMREX_ASSUME (#3770) + Fix bug in AMREX_ASSUME (#3773) + + -- Pure SoA: `NextID` as `Long` (#3772) + + -- GNU Make: set COMP_VERSION for hipcc and nvcc (#3771) + + -- Make MFParallelFor safer from int overflow (#3768) + + -- getParticleTileData: HostVector must be initialized during resize (#3769) + + -- add AMREX_LIKELY and AMREX_UNLIKELY (#3767) + + -- Only do a htod memcpy in getParticleTileData when necessary (#3760) + + -- Sync GPU stream before getting the time in TinyProfiler (#3763) + + -- Add a logspace-like function in AMReX_Algorithm.H (#3754) + + -- Fix offset in send buffer of single precision particle communication (#3758) + + -- EdgeFluxRegister for MHD (#3633) + + -- Fix a typo in SYCL version of scan (#3757) + + -- Interpolation from node-centered general mapped coordinates to tracers (#3750) + + -- Remove various deprecated stuff not prefixed by amrex or bl (#3713) + + -- GNU Make: Link flags (#3711) + + -- Remove UB from is_aligned (#3751) + + -- Add partitionParticles Function (#3743) + + -- Minor new functions in AMReX_MPMD to provide flexibility for python binding (#3748) + + -- Box::numPts() returns 0 for empty boxes (#3747) + + -- New Linear Solver: Curl of Curl (#3682) + + -- Use long integer in GPU kernels (#3742) + + -- Add `ParticleIDWrapper::make_invalid()` (#3735) + +# 24.02 + + -- Disable m_aos for SoA Particle (#3736) + Update AoS Restrict (#3738) + + -- Add helper function for setting id and cpu simultaneously. (#3733) + + -- Disable SYCL on Nvidia and AMD CIs (#3726) + + -- ParticleCopyPlan for SoA Particles (#3732) + + -- Do not use std::forward twice on the same object in AmrParticleLocator (#3734) + + -- Particle Copy Plan: Default Vals (#3729) + + -- Fix circular header file dependency (#3725) + + -- Provide portable Gpu::Atomic::Multiply and Gpu::Atomic::Divide implemented with CAS. (#3724) + + -- Plotfile Tools: Add missing option to fcompare usage print (#3722) + + -- Update documentation for STL files (#3723) + + -- Add TypeMultiplier, MakeZeroTuple and IdentityTuple (#3718) + + -- Interpolation routines for tracers with mapped_z. 
(#3714) + + -- relax constraint that real_comp_names.size() == pc.NumRealComps() + NStructReal for pure SoA plotfiles (#3717) + + -- Fix warnings in DenseBins::build with serial bin policy (#3716) + + -- lockAdd: case of 2D plane in 3D (#3700) + + -- Clean up interpolation routines in AMReX_TracerParticle_mod_K.H and AMReX_Particle_mod_K.H (#3679) + + -- Add macro for loop unrolling across compilers (#3701) + + -- Add a linspace-like function in AMReX_Algorithm.H (#3698) + + -- use amrex::Gpu::memcpy for packParticleIDs (#3699) + + -- BaseFab::lockAdd: Faster version of BaseFab::atomicAdd for OpenMP (#3696) + + -- GMRES (#3648) + + -- Add special named flag for invalid particles (#3688) + + -- Fix BL_PROFILE_TINY_FLUSH (#3695) + + -- Align GpuComplex to its size (#3691) + + -- Fix Advection_AmrCore test (#3690) + +# 24.01 + + -- MLMG: Use free functions instead of MF member functions (#3681) + + -- Add a few free functions for MLMG (#3680) + + -- Eliminating Matrix operations in MLMG CG bottom solver if initial vector is zero (#3668) + + -- Add a for loop that is unrolled at compile time (#3674) + + -- Add PTD version of getParticleCell (#3675) + + -- Improve ParIter docs (#3676) + + -- Fix CI for ROCm 6.0 (#3673) + + -- PureSoA IdCpu fixes (#3671) + + -- CMake: AMReX_PARALLEL_LINK_JOBS (#3628) + + -- Clang-Tidy in CI: Keep Going after Errors (#3667) + + -- Delete empty lines below comments on classes and functions (#3669) + + -- Documentation for Profiling: Hot Spots and Load Balance (#3622) + + -- Fix warnings in SortParticlesForDeposition (#3664) + + -- Fix Resize Issue of Fab with the Async Arena (#3663) + + -- Fix SuperParticle `push_back` (#3661) + + -- Pure SoA Particle: Separate Array for IdCPU (#3585) + + -- Limit the scope of gpu_rand_generator (#3659) + + -- Fix a typo in doxygen for NonLocalBC::FillBoundary (#3658) + + -- GNU Make: Fix name collision for aurora (#3656) + + -- two separate fixes -- particle_compare and ref_ratio=1 (#3655) + + -- Clarify documentation on setEBDirichlet() and fix link to AMReX-Hydro (#3652) + + -- Robustify the Cache Cleanup Scripts (#3650) + + -- Disable CodeQL scheduled jobs on forks (#3649) + + -- Work around compiler bug in nvcc 12.2 by using functor instead of lambda (#3653) + +# 23.12 + + -- solve_cg: avoid use of MF `z` (#3637) + + -- Fix: nosmt OMP Threads Default (#3647) + `amrex.omp_threads`: Can Avoid SMT (#3607) + + -- When checking for periodic outs on GPU, copy full particle data (#3646) + + -- MLEBABecLap: Support Robin BC at Domain Boundaries (#3617) + + -- Ascent: SoA Particle Support (#3350) + + -- solve_bicgstab: use fewer MFs (#3635) + + -- solve_bicgstab: cut use of `s` (#3629) + + -- Bug fix for amrex::Subtract when called with integer nghost (#3634) + + -- Fix typo in `MLMGT<MF>::getGradSolution` when `MF` is different from `AMF` (#3631) + + -- SUNDIALS: Use sunrealtype instead of realtype (#3632) + + -- SYCL: Use get_multi_ptr instead of get_pointer (#3630) + + -- Plotfile Tools: GPU support (#3626) + + -- solve_cg: use linop.make instead of MF constructor (#3627) + + -- CArena: shrink_in_place and operator<< (#3621) + + -- solve_bicgstab: use linop.make instead of MF constructor (#3619) + + -- replace AMREX_DEVICE_COMPILE with AMREX_IF_ON_DEVICE and AMREX_IF_ON_HOST (#3591) + + -- [Breaking] Prefix `amrex_` to each plotfile Tool (#3600) + + -- FillRandom: Use MKL host API (#3536) + + -- use hipPointerAttribute_t.type as HIP is removing hipPointerAttribute_t.memoryType (#3610) + +# 23.11 + + -- Give FlashFluxRegisters ways to
accumulate data in registers (#3597) + + -- `AMReXBuildInfo.cmake`: AMReX_DIR (#3609) + + -- update doc for amrex::Abort on GPU (#3605) + + -- Add runtime particle components to HDF5 wrapper (#3596) + + -- Windows: Fix Installed AMReXBuildInfo.cmake (#3606) + + -- Print AMReX version at the beginning of Initialize (#3604) + + -- Install Move Tools to `shared/amrex` (#3599) + + -- Revert "Add ability for GCC 8 in CMake to build fgradient which uses std::filesystem" (#3601) + + -- Avoid std::filesystem (#3602) + + -- Fix Assertion in MLEBNodeFDLaplacian (#3594) + + -- Fix a memory "leak" in VisMF's persistent streams (#3592) + + -- RealVect Static: Export (#3589) + + -- change MaxCnt from 4 to max(4,max_level+1) for how many iterations we… (#3588) + … allow in creation of the initial grid hierarchy + + -- Add Bittree CI (#3577) + + -- BCType::ext_dir_cc (#3581) + + -- Disable CCache in Windows CIs (#3566) + + -- Fix ICC CI by Freeing up Disk Space (#3583) + + -- Docs: Link pyAMReX (#3582) + + -- NodeABecLaplacian: Reuse (#3579) + + -- simplify how 2d surface integrals are computed (#3571) + + -- Adding bittree interface to improve regridding performance in octree mode (#3555) + + -- MLNodeABecLaplacian (#3559) + + -- Fix Boundary Centroid in a Corner Case in 2D (#3568) + +# 23.10 + + -- Bugfix typo in AMReX_SundialsIntegrator.H Nvar vs NVar, the + declared/used variable is NVar (#3573) + + -- Code Spell (#3563) + + -- Add Fortran interface for average_down_faces (#3553) + + -- PureSoA: Disable AoS Access (#3290) + + -- Another terrain fix for MPI (#3557) + Fix Increment, OK, and EnforcePeriodicWhere for terrain-fitted particles. (#3556) + + -- Added cvode functionality to SUNDIALS integrator (#3436) + + -- ParmParse::addfile needs Init (#3440) + + -- Make the same changes to ApplyInitialRedistribution as to ApplyMLRedistribution (#3554) + + -- Reset EB Fab Type (#3552) + EB Data outside domain (#3549) + + -- We weren't defining cent_hat out far enough (#3548) + + -- Add Fortran interface for FillCoarsePatch for face variables (#3542) + + -- print_state/printCell: Make it work without managed memory (#3543) + + -- FillPatch Fortran Interface: Fix incorrect size of Vector (#3546) + + -- ReduceOps: reset result readiness flag (#3545) + + -- Fix Fortran interface for FillPatch for face variables (#3541) + + -- Support multiple CUDA architectures at compilation (#3535) + + -- Add Kestrel machine and remove Rhodes machine from Make.nrel (#3533) + + -- Explicitly flush when writing to the terse run log (#3532) + + -- Missing header in AMReX_GpuComplex.H (#3531) + + -- Add global domain id offset to conduit wrapper (#3524) + # 23.09 -- Fix InitRandomPerBox for 1D & 2D (#3527) @@ -20,7 +323,7 @@ -- Simplify filterParticles Kernel (#3510) - -- Generatize particle-to-cell assignment function (#3499) + -- Generalize particle-to-cell assignment function (#3499) Follow-on to 3499 (#3514) ParticleLocator: Make Assignor optional template parameter (#3515) @@ -208,7 +511,7 @@ # 23.07 - -- Allow users to change the default vector growth stategy (#3389) + -- Allow users to change the default vector growth strategy (#3389) -- Communications arena implementation (#3388) diff --git a/Docs/sphinx_documentation/source/AMReX_Profiling_Tools.rst b/Docs/sphinx_documentation/source/AMReX_Profiling_Tools.rst index 8726f51a2ba..cdd774488e4 100644 --- a/Docs/sphinx_documentation/source/AMReX_Profiling_Tools.rst +++ b/Docs/sphinx_documentation/source/AMReX_Profiling_Tools.rst @@ -93,6 +93,47 @@ it is also recommended to 
wrap any ``BL_PROFILE_TINY_FLUSH();`` calls in informative ``amrex::Print()`` lines to ensure accurate identification of each set of timers. +Hot Spots and Load Balance +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The output of TinyProfiler can help us to identify hot spots. For example, +the following output shows the top three hot spots of a linear solver test +running on 4 MPI processes. + +.. highlight:: console + +:: + + -------------------------------------------------------------------------------------------- + Name NCalls Excl. Min Excl. Avg Excl. Max Max % + -------------------------------------------------------------------------------------------- + MLPoisson::Fsmooth() 560 0.4775 0.4793 0.4815 34.97% + MLPoisson::Fapply() 114 0.1103 0.113 0.1167 8.48% + FabArray::Xpay() 109 0.1 0.1013 0.1038 7.54% + +In this test, there are 16 boxes evenly distributed among 4 MPI processes. The +output above shows that the load is perfectly balanced. However, if the load +is not balanced, the results can be very different and sometimes +misleading. For example, if we put 2, 2, 6 and 6 boxes on processes 0, 1, 2 +and 3, respectively, the top three hot spots now include two MPI +communication functions, ``FillBoundary`` and ``ParallelCopy``. + +.. highlight:: console + +:: + + -------------------------------------------------------------------------------------------- + Name NCalls Excl. Min Excl. Avg Excl. Max Max % + -------------------------------------------------------------------------------------------- + FillBoundary_finish() 607 0.01568 0.3367 0.6574 41.97% + MLPoisson::Fsmooth() 560 0.2133 0.4047 0.5973 38.13% + FabArray::ParallelCopy_finish() 231 0.002977 0.09748 0.1895 12.10% + +The reason that the MPI communication appears slow is that the lightly +loaded processes have to wait for messages sent by the heavily loaded +processes. See also :ref:`sec:profopts` for a diagnostic option that may +provide more insight on the load imbalance. + .. 
_sec:full:profiling: Full Profiling diff --git a/Docs/sphinx_documentation/source/AmrCore.rst b/Docs/sphinx_documentation/source/AmrCore.rst index 6aadd62250e..0e4e6932126 100644 --- a/Docs/sphinx_documentation/source/AmrCore.rst +++ b/Docs/sphinx_documentation/source/AmrCore.rst @@ -648,11 +648,11 @@ interface to a Fortran routine that tags cells (in this case, :fortran:`state_er const int* thi = tilebox.hiVect(); // tag cells for refinement - state_error(tptr, ARLIM_3D(tlo), ARLIM_3D(thi), + state_error(tptr, AMREX_ARLIM_3D(tlo), AMREX_ARLIM_3D(thi), BL_TO_FORTRAN_3D(state[mfi]), &tagval, &clearval, - ARLIM_3D(tilebox.loVect()), ARLIM_3D(tilebox.hiVect()), - ZFILL(dx), ZFILL(prob_lo), &time, &phierr[lev]); + AMREX_ARLIM_3D(tilebox.loVect()), AMREX_ARLIM_3D(tilebox.hiVect()), + AMREX_ZFILL(dx), AMREX_ZFILL(prob_lo), &time, &phierr[lev]); // // Now update the tags in the TagBox in the tilebox region // to be equal to itags diff --git a/Docs/sphinx_documentation/source/AmrLevel.rst b/Docs/sphinx_documentation/source/AmrLevel.rst index 6f0a1f6fb18..dfbae5cde25 100644 --- a/Docs/sphinx_documentation/source/AmrLevel.rst +++ b/Docs/sphinx_documentation/source/AmrLevel.rst @@ -110,7 +110,7 @@ the :cpp:`AmrLevelAdv` class, e.g., int lo_bc[BL_SPACEDIM]; int hi_bc[BL_SPACEDIM]; for (int i = 0; i < BL_SPACEDIM; ++i) { - lo_bc[i] = hi_bc[i] = INT_DIR; // periodic boundaries + lo_bc[i] = hi_bc[i] = amrex::BCType::int_dir; // periodic boundaries } BCRec bc(lo_bc, hi_bc); diff --git a/Docs/sphinx_documentation/source/Basics.rst b/Docs/sphinx_documentation/source/Basics.rst index fd2d12ee0b8..8c5fe3dbba5 100644 --- a/Docs/sphinx_documentation/source/Basics.rst +++ b/Docs/sphinx_documentation/source/Basics.rst @@ -1942,7 +1942,8 @@ tiling flag is on. One can change the default size using :cpp:`ParmParse` | | FArrayBoxes. | | +-----------------------------------------------------+------------------------------------------------------+ -Dynamic tiling, which runs one box per OpenMP thread, is also available. +Dynamic tiling, which runs one box per OpenMP thread, either with or without +tiling the box, is also available. This is useful when the underlying work cannot benefit from thread parallelization. Dynamic tiling is implemented using the :cpp:`MFItInfo` object and requires the :cpp:`MFIter` loop to be defined in an OpenMP @@ -1981,9 +1982,13 @@ Dynamic tiling also allows explicit definition of a tile size: ... } -Usually :cpp:`MFIter` is used for accessing multiple MultiFabs like the second -example, in which two MultiFabs, :cpp:`U` and :cpp:`F`, use :cpp:`MFIter` via -:cpp:`operator[]`. These different MultiFabs may have different BoxArrays. For +Note that :cpp:`EnableTiling()`, with no argument, will use the default tile size. + +Usually :cpp:`MFIter` is used for accessing multiple MultiFabs, like +the second example in the previous section on :ref:`sec:basics:mfiter:notiling` +in which two MultiFabs, :cpp:`U` and :cpp:`F`, use :cpp:`MFIter` via +:cpp:`array()` and :cpp:`const_array()` functions. These different MultiFabs +may have different BoxArrays. For example, :cpp:`U` might be cell-centered, whereas :cpp:`F` might be nodal in :math:`x`-direction and cell in other directions. The :cpp:`MFIter::validbox` and :cpp:`tilebox` functions return Boxes of the same type as the @@ -2537,12 +2542,27 @@ The basic idea behind physical boundary conditions is as follows: ext_dir "External Dirichlet". It is the user's responsibility to write a routine - to fill ghost cells (more details below). 
+ to fill ghost cells (more details below). The boundary location + is on the domain face even when the data inside the domain are + cell-centered. + + ext_dir_cc + "External Dirichlet". It is the user's responsibility to write a routine + to fill ghost cells (more details below). The boundary location + is at the cell center of ghost cells outside the domain. foextrap "First Order Extrapolation" First order extrapolation from last cell in interior. + hoextrap + "High Order Extrapolation". The boundary location is on the domain + face even when the data inside the domain are cell-centered. + + hoextrapcc + "High Order Extrapolation". The boundary location is at the cell + center of ghost cells outside the domain. + reflect_even Reflection from interior cells with sign unchanged, :math:`q(-i) = q(i)`. @@ -2797,3 +2817,6 @@ Backtrace files are produced by AMReX signal handler by default when segfault occurs or ``Abort`` is called. If the application does not want AMReX to handle this, ``ParmParse`` parameter `amrex.signal_handling=0` can be used to disable it. + +See :ref:`sec:gpu:assertion` for considerations on using these functions in +GPU-enabled code. diff --git a/Docs/sphinx_documentation/source/Debugging.rst b/Docs/sphinx_documentation/source/Debugging.rst index 89eee31c2bd..aa5a9dcc9a3 100644 --- a/Docs/sphinx_documentation/source/Debugging.rst +++ b/Docs/sphinx_documentation/source/Debugging.rst @@ -24,7 +24,7 @@ handling of floating point exceptions: ``amrex.fpe_trap_invalid`` for NaNs, ``amrex.fpe_trap_zero`` for division by zero and ``amrex.fpe_trap_overflow`` for overflow. To more effectively trap the use of uninitialized values, AMReX also initializes ``FArrayBox``\ s in -``MulitFab``\ s and arrays allocated by ``bl_allocate`` to signaling NaNs when it is compiled +``MultiFab``\ s and arrays allocated by ``bl_allocate`` to signaling NaNs when it is compiled with ``TEST=TRUE`` or ``DEBUG=TRUE`` in GNU make, or with ``-DCMAKE_BUILD_TYPE=Debug`` in CMake. One can also control the setting for ``FArrayBox`` using the runtime parameter, ``fab.init_snan``. Note for Macs, M1 and M2 chips using Arm64 architecture are not able to trap division by zero. diff --git a/Docs/sphinx_documentation/source/EB.rst b/Docs/sphinx_documentation/source/EB.rst index 446e7305e50..9b847a3259c 100644 --- a/Docs/sphinx_documentation/source/EB.rst +++ b/Docs/sphinx_documentation/source/EB.rst @@ -48,7 +48,7 @@ Here is a simple example of initializing the database for an embedded sphere. EB2::Build(shop, geom, 0, 0); Alternatively, the EB information can be initialized from an STL file -specified by a :cpp:`ParmParse` parameter ``eb2.stl_file``. The +specified by a :cpp:`ParmParse` parameter ``eb2.stl_file``. (This also requires setting ``eb2.geom_type = stl``.) The initialization is done by calling .. highlight:: c++ :: @@ -386,6 +386,9 @@ testing cell types and getting neighbor information. For example end do end do + +..
_sec:EB:redistribution: + Small Cell Problem and Redistribution ===================================== diff --git a/Docs/sphinx_documentation/source/GPU.rst b/Docs/sphinx_documentation/source/GPU.rst index 1391015f31e..c75440a27fd 100644 --- a/Docs/sphinx_documentation/source/GPU.rst +++ b/Docs/sphinx_documentation/source/GPU.rst @@ -217,7 +217,7 @@ variables to configure the build +------------------------------+-------------------------------------------------+-------------+-----------------+ | SYCL_SUB_GROUP_SIZE | Specify subgroup size | 32 | 64, 32, 16 | +------------------------------+-------------------------------------------------+-------------+-----------------+ - | SYCL_MAX_PARALLEL_LINK_JOBS | Number of parallel jobs in device link | 1 | 1, 2, 3, etc. | + | SYCL_PARALLEL_LINK_JOBS | Number of parallel jobs in device link | 1 | 1, 2, 3, etc. | +------------------------------+-------------------------------------------------+-------------+-----------------+ .. raw:: latex @@ -307,7 +307,7 @@ If autodetection fails, a list of "common" architectures is assumed. Building for multiple CUDA architectures will generally result in a larger library and longer build times. **Note that AMReX supports NVIDIA GPU architectures with compute capability 6.0 or higher and -CUDA Toolkit version 9.0 or higher.** +CUDA Toolkit version 11.0 or higher.** In order to import the CUDA-enabled AMReX library into your CMake project, you need to include the following code into the appropriate CMakeLists.txt file: @@ -428,22 +428,24 @@ Below is an example configuration for SYCL: .. table:: AMReX SYCL-specific build options - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | Variable Name | Description | Default | Possible values | - +==============================+=================================================+=============+=================+ - | AMReX_SYCL_AOT | Enable SYCL ahead-of-time compilation | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | AMReX_SYCL_AOT_GRF_MODE | Specify AOT register file mode | Default | Default, Large, | - | | | | AutoLarge | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | AMREX_INTEL_ARCH | Specify target if AOT is enabled | None | pvc, etc. 
| - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | AMReX_SYCL_SPLIT_KERNEL | Enable SYCL kernel splitting | YES | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | AMReX_SYCL_ONEDPL | Enable SYCL's oneDPL algorithms | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | AMReX_SYCL_SUB_GROUP_SIZE | Specify subgroup size | 32 | 64, 32, 16 | - +------------------------------+-------------------------------------------------+-------------+-----------------+ + +-------------------------------+----------------------------------------------+-------------+------------------+ + | Variable Name | Description | Default | Possible values | + +===============================+==============================================+=============+==================+ + | AMReX_SYCL_AOT | Enable SYCL ahead-of-time compilation | NO | YES, NO | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMReX_SYCL_AOT_GRF_MODE | Specify AOT register file mode | Default | Default, Large, | + | | | | AutoLarge | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMREX_INTEL_ARCH | Specify target if AOT is enabled | None | pvc, etc. | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMReX_SYCL_SPLIT_KERNEL | Enable SYCL kernel splitting | YES | YES, NO | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMReX_SYCL_ONEDPL | Enable SYCL's oneDPL algorithms | NO | YES, NO | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMReX_SYCL_SUB_GROUP_SIZE | Specify subgroup size | 32 | 64, 32, 16 | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMReX_PARALLEL_LINK_JOBS | Specify number of parallel link jobs | 1 | positive integer | + +-------------------------------+----------------------------------------------+-------------+------------------+ .. raw:: latex \end{center} @@ -489,11 +491,10 @@ GPU support. When AMReX is compiled with ``USE_OMP_OFFLOAD=TRUE``, ``AMREX_USE_OMP_OFFLOAD`` is defined. -In addition to AMReX's preprocessor macros, CUDA provides the -``__CUDA_ARCH__`` macro which is only defined when in device code. -HIP and Sycl provide similar macros. -``AMREX_DEVICE_COMPILE`` should be used when a ``__host__ __device__`` -function requires separate code for the CPU and GPU implementations. +The macros ``AMREX_IF_ON_DEVICE((code_for_device))`` and +``AMREX_IF_ON_HOST((code_for_host))`` should be used when a +``__host__ __device__`` function requires separate code for the +CPU and GPU implementations. .. =================================================================== @@ -795,7 +796,7 @@ As another example, the following function computes the max- and 1-norm of a :: GpuTuple compute_norms (MultiFab const& mf, - iMulitiFab const& mask) + iMultiFab const& mask) { auto const& data_ma = mf.const_arrays(); auto const& mask_ma = mask.const_arrays(); @@ -1553,9 +1554,13 @@ Assertions and Error Checking To help debugging, we often use :cpp:`amrex::Assert` and :cpp:`amrex::Abort`. 
These functions are GPU safe and can be used in GPU kernels. However, implementing these functions requires additional -GPU registers, which will reduce overall performance. Therefore, it -is preferred to implement such calls in debug mode only by wrapping the -calls using ``#ifdef AMREX_DEBUG``. +GPU registers, which will reduce overall performance. Therefore, by +default these functions and the macro ``AMREX_ALWAYS_ASSERT`` are no-ops +for optimized builds (e.g., ``DEBUG=FALSE`` using the GNU Make build +system) when called from kernels run on GPU. Calls to these functions from +GPU kernels are active for debug builds and can optionally be activated +at compile time for optimized builds (e.g., ``DEBUG=FALSE`` and +``USE_ASSERTION=TRUE`` using the GNU Make build system). In CPU code, :cpp:`AMREX_GPU_ERROR_CHECK()` can be called to check the health of previous GPU launches. This call @@ -1738,14 +1743,14 @@ by "amrex" in your :cpp:`inputs` file. +----------------------------+-----------------------------------------------------------------------+-------------+----------+ | | Description | Type | Default | +============================+=======================================================================+=============+==========+ -| use_gpu_aware_mpi | Whether to use GPU memory for communication buffers during MPI calls. | Bool | False | -| | If true, the buffers will use device memory. If false, they will use | | | -| | pinned memory. In practice, we find it is usually not worth it to use | | | -| | GPU aware MPI. | | | +| use_gpu_aware_mpi | Whether to use GPU memory for communication buffers during MPI calls. | Bool | 0 | +| | If true, the buffers will use device memory. If false (i.e., 0), they | | | +| | will use pinned memory. In practice, we find it is not always worth | | | +| | it to use GPU aware MPI. | | | +----------------------------+-----------------------------------------------------------------------+-------------+----------+ -| abort_on_out_of_gpu_memory | If the size of free memory on the GPU is less than the size of a | Bool | False | +| abort_on_out_of_gpu_memory | If the size of free memory on the GPU is less than the size of a | Bool | 0 | | | requested allocation, AMReX will call AMReX::Abort() with an error | | | | | describing how much free memory there is and what was requested. | | | +----------------------------+-----------------------------------------------------------------------+-------------+----------+ -| the_arena_is_managed | Whether :cpp:`The_Arena()` allocates managed memory. | Bool | False | +| the_arena_is_managed | Whether :cpp:`The_Arena()` allocates managed memory. | Bool | 0 | +----------------------------+-----------------------------------------------------------------------+-------------+----------+ diff --git a/Docs/sphinx_documentation/source/InputsComputeBackends.rst b/Docs/sphinx_documentation/source/InputsComputeBackends.rst new file mode 100644 index 00000000000..26e5d527508 --- /dev/null +++ b/Docs/sphinx_documentation/source/InputsComputeBackends.rst @@ -0,0 +1,21 @@ +.. _Chap:InputsComputeBackends: + +Compute Backends +================ + +The following inputs must be preceded by ``amrex.`` and determine runtime options of CPU or GPU compute implementations. 
+ ++------------------------+-----------------------------------------------------------------------+-------------+------------+ +| Parameter | Description | Type | Default | ++========================+=======================================================================+=============+============+ +| ``omp_threads`` | If OpenMP is enabled, this can be used to set the default number of | String | ``system`` | +| | threads. The special value ``nosmt`` can be used to avoid using | or Int | | +| | threads for virtual cores (aka Hyperthreading or SMT), as is default | | | +| | in OpenMP, and instead only spawns threads equal to the number of | | | +| | physical cores in the system. | | | +| | For the values ``system`` and ``nosmt``, the environment variable | | | +| | ``OMP_NUM_THREADS`` takes precedence. For Integer values, | | | +| | ``OMP_NUM_THREADS`` is ignored. | | | ++------------------------+-----------------------------------------------------------------------+-------------+------------+ + +For GPU-specific parameters, see also the :ref:`GPU chapter <Chap:GPU>`. diff --git a/Docs/sphinx_documentation/source/InputsPlotFiles.rst b/Docs/sphinx_documentation/source/InputsPlotFiles.rst index 64b202ebf5c..9e8789a90ac 100644 --- a/Docs/sphinx_documentation/source/InputsPlotFiles.rst +++ b/Docs/sphinx_documentation/source/InputsPlotFiles.rst @@ -12,7 +12,7 @@ as whether a plotfile should be written out immediately after restarting a simul | plot_int | Frequency of plotfile output; | Int | -1 | | | if -1 then no plotfiles will be written | | | +---------------------+-----------------------------------------------------------------------+-------------+-----------+ -| plotfile_on_restart | Should we write a plotfile when we restart (only used if plot_int>0) | Bool | False | +| plotfile_on_restart | Should we write a plotfile when we restart (only used if plot_int>0) | Bool | 0 (false) | +---------------------+-----------------------------------------------------------------------+-------------+-----------+ | plot_file | Prefix to use for plotfile output | String | plt | +---------------------+-----------------------------------------------------------------------+-------------+-----------+ diff --git a/Docs/sphinx_documentation/source/Inputs_Chapter.rst b/Docs/sphinx_documentation/source/Inputs_Chapter.rst index 0a64aeb492c..43ead40b3c6 100644 --- a/Docs/sphinx_documentation/source/Inputs_Chapter.rst +++ b/Docs/sphinx_documentation/source/Inputs_Chapter.rst @@ -9,6 +9,7 @@ Run-time Inputs InputsProblemDefinition InputsTimeStepping InputsLoadBalancing + InputsComputeBackends InputsPlotFiles InputsCheckpoint diff --git a/Docs/sphinx_documentation/source/LinearSolvers.rst b/Docs/sphinx_documentation/source/LinearSolvers.rst index 87048bc8195..ab0ba3506a9 100644 --- a/Docs/sphinx_documentation/source/LinearSolvers.rst +++ b/Docs/sphinx_documentation/source/LinearSolvers.rst @@ -293,6 +293,18 @@ For Robin boundary conditions, the ghost cells in store the numerical values in the condition, :math:`a\phi + b\frac{\partial\phi}{\partial n} = f`. +4) The nodal solver provides the option to use an overset mask: + +.. highlight:: c++ + +:: + + // omask is either 0 or 1. 1 means the node is an unknown. 0 means it's known. + void setOversetMask (int amrlev, const iMultiFab& a_dmask); + +Note that this is an integer (not bool) MultiFab, so the values must be either 0 or 1. + + ..
_sec:linearsolver:pars: Parameters @@ -483,7 +495,9 @@ To set homogeneous Dirichlet boundary conditions, call ml_ebabeclap->setEBHomogDirichlet(lev, coeff); where coeff can be a real number (i.e. the value is the same at every cell) -or is the MultiFab holding the coefficient of the gradient at each cell with an EB face. +or a MultiFab holding the coefficient of the gradient at each cell with an EB face. +In other words, coeff is :math:`\beta` in the canonical form given in equation :eq:`eqn::abeclap` +located at the EB surface centroid. To set inhomogeneous Dirichlet boundary conditions, call @@ -494,8 +508,9 @@ To set inhomogeneous Dirichlet boundary conditions, call ml_ebabeclap->setEBDirichlet(lev, phi_on_eb, coeff); where phi_on_eb is the MultiFab holding the Dirichlet values in every cut cell, -and coeff again is a real number (i.e. the value is the same at every cell) -or a MultiFab holding the coefficient of the gradient at each cell with an EB face. +and coeff again is a real number +or a MultiFab holding the coefficient of the gradient at each cell with an EB face, +i.e. :math:`\beta` in equation :eq:`eqn::abeclap` located at the EB surface centroid. Currently there are options to define the face-based coefficients on face centers vs face centroids, and to interpret the solution variable @@ -565,7 +580,7 @@ The following parameter should be set to True if the problem to be solved has a In this case, the solution is only defined to within a constant. Setting this parameter to True replaces one row in the matrix sent to hypre from AMReX by a row that sets the value at one cell to 0. -- :cpp:`hypre.adjust_singular_matrix`: Default is False. +- :cpp:`hypre.adjust_singular_matrix`: Default is false. The following parameters can be set in the inputs file to control the choice of preconditioner and smoother: diff --git a/Docs/sphinx_documentation/source/Particle.rst b/Docs/sphinx_documentation/source/Particle.rst index da5fabb02b2..e3a28591a72 100644 --- a/Docs/sphinx_documentation/source/Particle.rst +++ b/Docs/sphinx_documentation/source/Particle.rst @@ -86,7 +86,8 @@ tracked as the particle positions change. To do this, we provide the :: - ParticleContainer<3, 2, 4, 4> mypc; + using MyParticleContainer = ParticleContainer<3, 2, 4, 4>; + MyParticleContainer mypc; Like the :cpp:`Particle` class itself, the :cpp:`ParticleContainer` class is templated. The first two template parameters have the same meaning as @@ -375,8 +376,8 @@ example, to iterate over all the AoS data: :: - using MyParIter = ConstParIter<2*BL_SPACEDIM>; - for (MyParIter pti(pc, lev); pti.isValid(); ++pti) { + using MyParConstIter = MyParticleContainer::ParConstIterType; + for (MyParConstIter pti(pc, lev); pti.isValid(); ++pti) { const auto& particles = pti.GetArrayOfStructs(); for (const auto& p : particles) { // do stuff with p... @@ -392,7 +393,7 @@ skipped. 
You can also access the SoA data using the :cpp:`ParIter` as follows: :: - using MyParIter = ParIter<0, 0, 2, 2>; + using MyParIter = MyParticleContainer::ParIterType; for (MyParIter pti(pc, lev); pti.isValid(); ++pti) { auto& particle_attributes = pti.GetStructOfArrays(); RealVector& real_comp0 = particle_attributes.GetRealData(0); @@ -713,7 +714,7 @@ with OpenMP, the first thing to look at is whether there are enough tiles availa +-------------------+-----------------------------------------------------------------------+-------------+-------------+ | | Description | Type | Default | +===================+=======================================================================+=============+=============+ -| do_tiling | Whether to use tiling for particles. Should be on when using OpenMP, | Bool | False | +| do_tiling | Whether to use tiling for particles. Should be on when using OpenMP, | Bool | false | | | and off when running on GPUs. | | | +-------------------+-----------------------------------------------------------------------+-------------+-------------+ | tile_size | If tiling is on, the maximum tile_size to use in each direction | Ints | 1024000,8,8 | @@ -739,7 +740,7 @@ problems with particle IO, you could try varying some / all of these parameters. | datadigits_read | This is for backwards compatibility; don't use unless you need to read | Int | 5 | | | an old (pre mid 2017) AMReX dataset. | | | +-------------------+-----------------------------------------------------------------------+-------------+-------------+ -| use_prepost | This is an optimization for large particle datasets that groups MPI | Bool | False | +| use_prepost | This is an optimization for large particle datasets that groups MPI | Bool | false | | | calls needed during the IO together. Try it if seeing poor IO speeds | | | | | on large problems. | | | +-------------------+-----------------------------------------------------------------------+-------------+-------------+ diff --git a/Docs/sphinx_documentation/source/Python_Chapter.rst b/Docs/sphinx_documentation/source/Python_Chapter.rst new file mode 100644 index 00000000000..76061ea2ebe --- /dev/null +++ b/Docs/sphinx_documentation/source/Python_Chapter.rst @@ -0,0 +1,13 @@ +.. role:: cpp(code) + :language: c++ + +.. _Chap:Python: + +Python Interface +================ + + +The core of AMReX is written in C++. +For users who want to write all of their programs in Python, or C++ application developers who would like to add Python interfaces to their applications for scripting, rapid prototyping, code coupling and/or AI/ML workflows, many AMReX classes, functions and all data containers are now also available. + +Please see `pyAMReX <https://github.com/AMReX-Codes/pyamrex>`__ (`manual <https://pyamrex.readthedocs.io>`__) for further details. diff --git a/Docs/sphinx_documentation/source/conf.py b/Docs/sphinx_documentation/source/conf.py index dc29ab6e041..8cb17c78e1f 100644 --- a/Docs/sphinx_documentation/source/conf.py +++ b/Docs/sphinx_documentation/source/conf.py @@ -42,7 +42,7 @@ def get_amrex_version(): intersphinx_mapping = { 'amrex_tutorials': ('https://amrex-codes.github.io/amrex/tutorials_html/', None), - 'amrex_hydro':('https://amrex-codes.github.io/amrex/hydro_html/', None) + 'amrex_hydro':('https://amrex-fluids.github.io/amrex-hydro/docs_html/', None) } # Add any paths that contain templates here, relative to this directory.
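A usage sketch for the container-derived iterator aliases documented in the Particle.rst hunks above: the container parameters <3, 2, 4, 4> are taken from the documentation example, while the function name and the update it applies are illustrative assumptions, not part of this patch.

::

    #include <AMReX_Particles.H>

    using MyParticleContainer = amrex::ParticleContainer<3, 2, 4, 4>;

    // Hypothetical helper: double the first SoA real component on one level.
    void double_first_real_comp (MyParticleContainer& pc, int lev)
    {
        // ParIterType is derived from the container type itself, so the
        // iterator can never drift out of sync with the container's
        // template parameters, which is the point of the change above.
        using MyParIter = MyParticleContainer::ParIterType;
        for (MyParIter pti(pc, lev); pti.isValid(); ++pti) {
            auto& particle_attributes = pti.GetStructOfArrays();
            auto& real_comp0 = particle_attributes.GetRealData(0);
            for (auto& x : real_comp0) { x *= 2.0; } // host-side loop
        }
    }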
diff --git a/Docs/sphinx_documentation/source/index.rst b/Docs/sphinx_documentation/source/index.rst index 8cdced26a19..b748a6394a5 100644 --- a/Docs/sphinx_documentation/source/index.rst +++ b/Docs/sphinx_documentation/source/index.rst @@ -51,6 +51,7 @@ Documentation on migration from BoxLib is available in the AMReX repository at D LinearSolvers_Chapter Particle_Chapter Fortran_Chapter + Python_Chapter EB_Chapter TimeIntegration_Chapter GPU_Chapter diff --git a/GNUmakefile.in b/GNUmakefile.in index ab31f4e192b..dda52f90d04 100644 --- a/GNUmakefile.in +++ b/GNUmakefile.in @@ -49,6 +49,9 @@ endif ifeq ($(USE_SUNDIALS),TRUE) Pdirs += Extern/SUNDIALS endif +ifeq ($(USE_BITTREE),TRUE) + Pdirs += Extern/Bittree +endif Ppack := $(foreach dir, $(Pdirs), $(AMREX_HOME)/Src/$(dir)/Make.package) include $(Ppack) diff --git a/LICENSE b/LICENSE index 60027deecd0..a9a04e67eac 100644 --- a/LICENSE +++ b/LICENSE @@ -1,33 +1,31 @@ -Copyright (c) 2017, The Regents of the University of California, -through Lawrence Berkeley National Laboratory and the Alliance for -Sustainable Energy, LLC., through National Renewable Energy Laboratory -(subject to receipt of any required approvals from the U.S. Dept. of -Energy). All rights reserved. +AMReX Copyright (c) 2024, The Regents of the University of California, +through Lawrence Berkeley National Laboratory (subject to receipt of any +required approvals from the U.S. Dept. of Energy). All rights reserved. Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: +modification, are permitted provided that the following conditions are met: -(1) Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. +(1) Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. (2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -(3) Neither the name of the University of California, Lawrence -Berkeley National Laboratory, Alliance for Sustainable Energy, LLC., -National Renewable Energy Laboratory, U.S. Dept. of Energy nor the -names of its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. +(3) Neither the name of the University of California, Lawrence Berkeley +National Laboratory, U.S. Dept. of Energy nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md index 786fa11babc..c7efc128376 100644 --- a/README.md +++ b/README.md @@ -93,25 +93,24 @@ Any level of changes are welcomed: documentation, bug fixes, new test problems, new solvers, etc. For more details on how to contribute to AMReX, please see [CONTRIBUTING.md](CONTRIBUTING.md). -## License +## Copyright Notice + +AMReX Copyright (c) 2024, The Regents of the University of California, +through Lawrence Berkeley National Laboratory (subject to receipt of any +required approvals from the U.S. Dept. of Energy). All rights reserved. + +If you have questions about your rights to use or distribute this software, +please contact Berkeley Lab's Intellectual Property Office at IPO@lbl.gov. -AMReX Copyright (c) 2017, The Regents of the University of California, -through Lawrence Berkeley National Laboratory and the Alliance for -Sustainable Energy, LLC., through National Renewable Energy Laboratory -(subject to receipt of any required approvals from the U.S. Dept. of -Energy). All rights reserved. - -If you have questions about your rights to use or distribute this -software, please contact Berkeley Lab's Innovation & Partnerships -Office at IPO@lbl.gov. - -NOTICE. This Software was developed under funding from the -U.S. Department of Energy and the U.S. Government consequently retains -certain rights. As such, the U.S. Government has been granted for -itself and others acting on its behalf a paid-up, nonexclusive, -irrevocable, worldwide license in the Software to reproduce, -distribute copies to the public, prepare derivative works, and perform -publicly and display publicly, and to permit other to do so. +NOTICE. This Software was developed under funding from the U.S. Department +of Energy and the U.S. Government consequently retains certain rights. As +such, the U.S. Government has been granted for itself and others acting on +its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the +Software to reproduce, distribute copies to the public, prepare derivative +works, and perform publicly and display publicly, and to permit others to do +so. + +## License License for AMReX can be found at [LICENSE](LICENSE). 
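The Src/Amr hunks below replace ``std::endl`` with ``'\n'`` plus an explicit ``flush()``. A minimal sketch of the pattern being adopted (the function and variable names here are made up for illustration, not taken from the patch):

::

    #include <fstream>

    // std::endl inserts '\n' AND flushes on every call; writing '\n' and
    // flushing explicitly preserves the flush where it is wanted, while
    // making it easy to omit on hot paths that do not need it.
    void append_step (std::ofstream& runlog, long step, double time)
    {
        runlog << step << " " << time << '\n';
        runlog.flush();
    }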
diff --git a/Src/Amr/AMReX_Amr.H b/Src/Amr/AMReX_Amr.H index a7173fd105a..bb18ec9d160 100644 --- a/Src/Amr/AMReX_Amr.H +++ b/Src/Amr/AMReX_Amr.H @@ -30,7 +30,6 @@ class AmrInSituBridge; * not belong on a single level, like establishing and updating the hierarchy * of levels, global timestepping, and managing the different AmrLevels */ - class Amr : public AmrCore { diff --git a/Src/Amr/AMReX_Amr.cpp b/Src/Amr/AMReX_Amr.cpp index 0b1dfb6f886..30ca8f8dc00 100644 --- a/Src/Amr/AMReX_Amr.cpp +++ b/Src/Amr/AMReX_Amr.cpp @@ -1830,8 +1830,8 @@ Amr::checkPoint () amrex::FileOpenFailed(FAHeaderFilesName); } - for(int i(0); i < FAHeaderNames.size(); ++i) { - FAHeaderFile << FAHeaderNames[i] << '\n'; + for(const auto & FAHeaderName : FAHeaderNames) { + FAHeaderFile << FAHeaderName << '\n'; } } } @@ -2206,7 +2206,8 @@ Amr::coarseTimeStep (Real stop_time) } if (record_run_info_terse && ParallelDescriptor::IOProcessor()) { runlog_terse << level_steps[0] << " " << cumtime << " " << dt_level[0]; - runlog_terse << std::endl; // Make sure we flush! + runlog_terse << '\n'; + runlog_terse.flush(); } int check_test = 0; @@ -2346,11 +2347,11 @@ Amr::coarseTimeStep (Real stop_time) if(ParallelDescriptor::IOProcessor()) { if (to_checkpoint) { - amrex::ErrorStream() << "Stopped by user w/ checkpoint" << std::endl; + amrex::ErrorStream() << "Stopped by user w/ checkpoint" << '\n'; } else { - amrex::ErrorStream() << "Stopped by user w/o checkpoint" << std::endl; + amrex::ErrorStream() << "Stopped by user w/o checkpoint" << '\n'; } } } @@ -2735,7 +2736,7 @@ Amr::regrid (int lbase, << time << " : REGRID with lbase = " << lbase - << std::endl; + << '\n'; if (verbose > 1) { @@ -2916,7 +2917,8 @@ Amr::printGridInfo (std::ostream& os, } } - os << std::endl; // Make sure we flush! + os << '\n'; + os.flush(); } @@ -3085,7 +3087,7 @@ Amr::bldFineLevels (Real strt_time) { bool grids_the_same; - const int MaxCnt = 4; + const int MaxCnt = std::max(4, max_level+1); int count = 0; diff --git a/Src/Amr/AMReX_AmrLevel.H b/Src/Amr/AMReX_AmrLevel.H index d4ac6c7c70d..37d0bdb9200 100644 --- a/Src/Amr/AMReX_AmrLevel.H +++ b/Src/Amr/AMReX_AmrLevel.H @@ -34,7 +34,6 @@ class TagBoxArray; * AmrLevel functions both as a container for state data on a level * and also manages the advancement of data in time. */ - class AmrLevel { friend class Amr; @@ -554,8 +553,8 @@ private: // // The data. // - AmrLevel& m_amrlevel; - MultiFab& m_leveldata; + AmrLevel* m_amrlevel; + MultiFab* m_leveldata; std::vector< std::pair<int,int> > m_range; MultiFab m_fabs; int m_ncomp; @@ -592,8 +591,8 @@ private: // // The data.
// - AmrLevel& m_amrlevel; - MultiFab& m_leveldata; + AmrLevel* m_amrlevel; + MultiFab* m_leveldata; MultiFabCopyDescriptor m_mfcd; Vector< Vector<MultiFabId> > m_mfid; // [level][oldnew] Interpolater* m_map = nullptr; diff --git a/Src/Amr/AMReX_AmrLevel.cpp b/Src/Amr/AMReX_AmrLevel.cpp index 7cdd83e1cae..dff9c88b61c 100644 --- a/Src/Amr/AMReX_AmrLevel.cpp +++ b/Src/Amr/AMReX_AmrLevel.cpp @@ -663,17 +663,17 @@ AmrLevel::setPhysBoundaryValues (FArrayBox& dest, FillPatchIteratorHelper::FillPatchIteratorHelper (AmrLevel& amrlevel, MultiFab& leveldata) : - m_amrlevel(amrlevel), - m_leveldata(leveldata), - m_mfid(m_amrlevel.level+1) + m_amrlevel(&amrlevel), + m_leveldata(&leveldata), + m_mfid(m_amrlevel->level+1) {} FillPatchIterator::FillPatchIterator (AmrLevel& amrlevel, MultiFab& leveldata) : MFIter(leveldata), - m_amrlevel(amrlevel), - m_leveldata(leveldata), + m_amrlevel(&amrlevel), + m_leveldata(&leveldata), m_ncomp(0) { MFIter::depth = 0; @@ -688,9 +688,9 @@ FillPatchIteratorHelper::FillPatchIteratorHelper (AmrLevel& amrlevel, int ncomp, InterpBase* mapper) : - m_amrlevel(amrlevel), - m_leveldata(leveldata), - m_mfid(m_amrlevel.level+1), + m_amrlevel(&amrlevel), + m_leveldata(&leveldata), + m_mfid(m_amrlevel->level+1), m_time(time), m_growsize(boxGrow), m_index(index), @@ -709,8 +709,8 @@ FillPatchIterator::FillPatchIterator (AmrLevel& amrlevel, int ncomp) : MFIter(leveldata), - m_amrlevel(amrlevel), - m_leveldata(leveldata), + m_amrlevel(&amrlevel), + m_leveldata(&leveldata), m_ncomp(ncomp) { BL_ASSERT(scomp >= 0); @@ -726,12 +726,13 @@ FillPatchIterator::FillPatchIterator (AmrLevel& amrlevel, #endif } -static +namespace { bool NeedToTouchUpPhysCorners (const Geometry& geom) { return geom.isAnyPeriodic() && !geom.isAllPeriodic(); } +} void FillPatchIteratorHelper::Initialize (int boxGrow, @@ -756,24 +757,24 @@ FillPatchIteratorHelper::Initialize (int boxGrow, m_index = idx; m_scomp = scomp; m_ncomp = ncomp; - m_FixUpCorners = NeedToTouchUpPhysCorners(m_amrlevel.geom); + m_FixUpCorners = NeedToTouchUpPhysCorners(m_amrlevel->geom); const int MyProc = ParallelDescriptor::MyProc(); - auto& amrLevels = m_amrlevel.parent->getAmrLevels(); - const AmrLevel& topLevel = *amrLevels[m_amrlevel.level]; + auto& amrLevels = m_amrlevel->parent->getAmrLevels(); + const AmrLevel& topLevel = *amrLevels[m_amrlevel->level]; const Box& topPDomain = topLevel.state[m_index].getDomain(); - const IndexType& boxType = m_leveldata.boxArray().ixType(); + const IndexType& boxType = m_leveldata->boxArray().ixType(); const bool extrap = AmrLevel::desc_lst[m_index].extrap(); // // Check that the interpolaters are identical. // BL_ASSERT(AmrLevel::desc_lst[m_index].identicalInterps(scomp,ncomp)); - for (int l = 0; l <= m_amrlevel.level; ++l) + for (int l = 0; l <= m_amrlevel->level; ++l) { amrLevels[l]->state[m_index].RegisterData(m_mfcd, m_mfid[l]); } - for (int i = 0, N = static_cast<int>(m_leveldata.boxArray().size()); i < N; ++i) + for (int i = 0, N = static_cast<int>(m_leveldata->boxArray().size()); i < N; ++i) { // // A couple typedefs we'll use in the next code segment. @@ -782,20 +783,20 @@ FillPatchIteratorHelper::Initialize (int boxGrow, using IntAAAFBIDMapValType = std::map<int,Vector<Vector<Vector<FillBoxId> > > >::value_type; - if (m_leveldata.DistributionMap()[i] != MyProc) { continue; } + if (m_leveldata->DistributionMap()[i] != MyProc) { continue; } // // Insert with a hint since the indices are ordered lowest to highest.
// IntAAAFBIDMapValType v1(i,Vector > >()); - m_fbid.insert(m_fbid.end(),v1)->second.resize(m_amrlevel.level+1); + m_fbid.insert(m_fbid.end(),v1)->second.resize(m_amrlevel->level+1); IntAABoxMapValType v2(i,Vector >()); - m_fbox.insert(m_fbox.end(),v2)->second.resize(m_amrlevel.level+1); - m_cbox.insert(m_cbox.end(),v2)->second.resize(m_amrlevel.level+1); + m_fbox.insert(m_fbox.end(),v2)->second.resize(m_amrlevel->level+1); + m_cbox.insert(m_cbox.end(),v2)->second.resize(m_amrlevel->level+1); - m_ba.insert(m_ba.end(),std::map::value_type(i,amrex::grow(m_leveldata.boxArray()[i],m_growsize))); + m_ba.insert(m_ba.end(),std::map::value_type(i,amrex::grow(m_leveldata->boxArray()[i],m_growsize))); } BoxList tempUnfillable(boxType); @@ -862,7 +863,7 @@ FillPatchIteratorHelper::Initialize (int boxGrow, Vector< Vector >& TheFineBoxes = m_fbox[bxidx]; Vector< Vector< Vector > >& TheFBIDs = m_fbid[bxidx]; - for (int l = m_amrlevel.level; l >= 0 && !Done; --l) + for (int l = m_amrlevel->level; l >= 0 && !Done; --l) { unfillableThisLevel.clear(); @@ -892,7 +893,7 @@ FillPatchIteratorHelper::Initialize (int boxGrow, { crse_boxes.push_back(fbx); - if (l != m_amrlevel.level) + if (l != m_amrlevel->level) { const Box& cbox = m_map->CoarseBox(fbx,fine_ratio); @@ -999,15 +1000,15 @@ FillPatchIterator::Initialize (int boxGrow, m_ncomp = ncomp; m_range = desc.sameInterps(scomp,ncomp); - m_fabs.define(m_leveldata.boxArray(),m_leveldata.DistributionMap(), - m_ncomp,boxGrow,MFInfo(),m_leveldata.Factory()); + m_fabs.define(m_leveldata->boxArray(),m_leveldata->DistributionMap(), + m_ncomp,boxGrow,MFInfo(),m_leveldata->Factory()); - const Geometry& geom = m_amrlevel.Geom(); + const Geometry& geom = m_amrlevel->Geom(); m_fabs.setDomainBndry(std::numeric_limits::quiet_NaN(), geom); - const IndexType& boxType = m_leveldata.boxArray().ixType(); - const int level = m_amrlevel.level; + const IndexType& boxType = m_leveldata->boxArray().ixType(); + const int level = m_amrlevel->level; for (int i = 0, DComp = 0; i < static_cast(m_range.size()); i++) { @@ -1021,8 +1022,8 @@ FillPatchIterator::Initialize (int boxGrow, else { if (level == 1 || - amrex::ProperlyNested(m_amrlevel.crse_ratio, - m_amrlevel.parent->blockingFactor(m_amrlevel.level), + amrex::ProperlyNested(m_amrlevel->crse_ratio, + m_amrlevel->parent->blockingFactor(m_amrlevel->level), boxGrow, boxType, desc.interp(SComp))) { FillFromTwoLevels(time, idx, SComp, DComp, NComp); @@ -1034,7 +1035,7 @@ FillPatchIterator::Initialize (int boxGrow, # endif { IntVect new_blocking_factor = AmrLevel::ProperBlockingFactor - (m_amrlevel, boxGrow, boxType, desc, SComp); + (*m_amrlevel, boxGrow, boxType, desc, SComp); amrex::Print() << "WARNING: Grids are not properly nested. Consider using amr.blocking_factor = " << AMREX_D_TERM(new_blocking_factor[0], << " " << new_blocking_factor[1], @@ -1049,7 +1050,7 @@ FillPatchIterator::Initialize (int boxGrow, first = false; if (ParallelDescriptor::IOProcessor() && amrex::Verbose()) { IntVect new_blocking_factor = AmrLevel::ProperBlockingFactor - (m_amrlevel, boxGrow, boxType, desc, SComp); + (*m_amrlevel, boxGrow, boxType, desc, SComp); amrex::Print() << "WARNING: Grids are not properly nested. We might have to use\n" << " two coarse levels to do fillpatch. 
Consider using\n"; if (new_blocking_factor < IntVect{AMREX_D_DECL(128,128,128)}) { @@ -1060,8 +1061,8 @@ FillPatchIterator::Initialize (int boxGrow, } } - auto* fph = new FillPatchIteratorHelper(m_amrlevel, - m_leveldata, + auto* fph = new FillPatchIteratorHelper(*m_amrlevel, + *m_leveldata, boxGrow, time, idx, @@ -1086,7 +1087,7 @@ FillPatchIterator::Initialize (int boxGrow, // // Call hack to touch up fillPatched data. // - m_amrlevel.set_preferred_boundary_values(m_fabs, + m_amrlevel->set_preferred_boundary_values(m_fabs, idx, scomp, 0, @@ -1098,15 +1099,15 @@ void FillPatchIterator::FillFromLevel0 (Real time, int idx, int scomp, int dcomp, int ncomp) { BL_PROFILE("FillPatchIterator::FillFromLevel0()"); - BL_ASSERT(m_amrlevel.level == 0); + BL_ASSERT(m_amrlevel->level == 0); - StateData& statedata = m_amrlevel.state[idx]; + StateData& statedata = m_amrlevel->state[idx]; Vector smf; Vector stime; statedata.getData(smf,stime,time); - const Geometry& geom = m_amrlevel.geom; + const Geometry& geom = m_amrlevel->geom; StateDataPhysBCFunct physbcf(statedata,scomp,geom); @@ -1117,13 +1118,13 @@ void FillPatchIterator::FillFromTwoLevels (Real time, int idx, int scomp, int dcomp, int ncomp) { BL_PROFILE("FillPatchIterator::FillFromTwoLevels()"); - int ilev_fine = m_amrlevel.level; + int ilev_fine = m_amrlevel->level; int ilev_crse = ilev_fine-1; BL_ASSERT(ilev_crse >= 0); - AmrLevel& fine_level = m_amrlevel; - AmrLevel& crse_level = m_amrlevel.parent->getLevel(ilev_crse); + AmrLevel& fine_level = *m_amrlevel; + AmrLevel& crse_level = m_amrlevel->parent->getLevel(ilev_crse); const Geometry& geom_fine = fine_level.geom; const Geometry& geom_crse = crse_level.geom; @@ -1154,7 +1155,8 @@ FillPatchIterator::FillFromTwoLevels (Real time, int idx, int scomp, int dcomp, desc.getBCs(),scomp); } -static +namespace { + bool HasPhysBndry (const Box& b, const Box& dmn, @@ -1174,7 +1176,6 @@ HasPhysBndry (const Box& b, return false; } -static void FixUpPhysCorners (FArrayBox& fab, AmrLevel& TheLevel, @@ -1258,6 +1259,8 @@ FixUpPhysCorners (FArrayBox& fab, } } +} + void FillPatchIteratorHelper::fill (FArrayBox& fab, int dcomp, @@ -1268,17 +1271,17 @@ FillPatchIteratorHelper::fill (FArrayBox& fab, BL_ASSERT(fab.box() == m_ba[idx]); BL_ASSERT(fab.nComp() >= dcomp + m_ncomp); - Vector< Vector > > cfab(m_amrlevel.level+1); + Vector< Vector > > cfab(m_amrlevel->level+1); Vector< Vector >& TheCrseBoxes = m_cbox[idx]; Vector< Vector >& TheFineBoxes = m_fbox[idx]; Vector< Vector< Vector > >& TheFBIDs = m_fbid[idx]; const bool extrap = AmrLevel::desc_lst[m_index].extrap(); - auto& amrLevels = m_amrlevel.parent->getAmrLevels(); + auto& amrLevels = m_amrlevel->parent->getAmrLevels(); // // Build all coarse fabs from which we'll interpolate and // fill them with coarse data as best we can. // - for (int l = 0; l <= m_amrlevel.level; l++) + for (int l = 0; l <= m_amrlevel->level; l++) { StateData& TheState = amrLevels[l]->state[m_index]; const Vector& CrseBoxes = TheCrseBoxes[l]; @@ -1323,7 +1326,7 @@ FillPatchIteratorHelper::fill (FArrayBox& fab, // // Now work from the bottom up interpolating to next higher level. // - for (int l = 0; l < m_amrlevel.level; l++) + for (int l = 0; l < m_amrlevel->level; l++) { auto& CrseFabs = cfab[l]; AmrLevel& TheLevel = *amrLevels[l]; @@ -1442,8 +1445,8 @@ FillPatchIteratorHelper::fill (FArrayBox& fab, // // Copy intersect finefab into next level m_cboxes. 
// - for (int j = 0, K = static_cast(FinerCrseFabs.size()); j < K; ++j) { - FinerCrseFabs[j]->copy(finefab); + for (const auto & FinerCrseFab : FinerCrseFabs) { + FinerCrseFab->copy(finefab); } } @@ -1452,15 +1455,15 @@ FillPatchIteratorHelper::fill (FArrayBox& fab, // // Now for the finest level stuff. // - StateData& FineState = m_amrlevel.state[m_index]; + StateData& FineState = m_amrlevel->state[m_index]; const Box& FineDomain = FineState.getDomain(); - const Geometry& FineGeom = m_amrlevel.geom; - auto& FinestCrseFabs = cfab[m_amrlevel.level]; + const Geometry& FineGeom = m_amrlevel->geom; + auto& FinestCrseFabs = cfab[m_amrlevel->level]; // // Copy intersect coarse into destination fab. // - for (int i = 0, N = static_cast(FinestCrseFabs.size()); i < N; ++i) { - fab.copy(*FinestCrseFabs[i],0,dcomp,m_ncomp); + for (const auto & FinestCrseFab : FinestCrseFabs) { + fab.copy(*FinestCrseFab,0,dcomp,m_ncomp); } if (FineGeom.isAnyPeriodic() && !FineDomain.contains(fab.box())) @@ -1469,17 +1472,17 @@ FillPatchIteratorHelper::fill (FArrayBox& fab, FineGeom.periodicShift(FineDomain,fab.box(),pshifts); - for (int i = 0, N = static_cast(FinestCrseFabs.size()); i < N; i++) + for (const auto & FinestCrseFab : FinestCrseFabs) { for (const auto& iv : pshifts) { fab.shift(iv); - Box src_dst = FinestCrseFabs[i]->box() & fab.box(); + Box src_dst = FinestCrseFab->box() & fab.box(); src_dst &= FineDomain; if (src_dst.ok()) { - fab.copy(*FinestCrseFabs[i],src_dst,0,src_dst,dcomp,m_ncomp); + fab.copy(*FinestCrseFab,src_dst,0,src_dst,dcomp,m_ncomp); } fab.shift(-iv); @@ -1495,7 +1498,7 @@ FillPatchIteratorHelper::fill (FArrayBox& fab, // if (! FineState.getDomain().contains(fab.box())) { - m_amrlevel.setPhysBoundaryValues(fab, + m_amrlevel->setPhysBoundaryValues(fab, m_index, m_time, dcomp, @@ -1506,7 +1509,7 @@ FillPatchIteratorHelper::fill (FArrayBox& fab, if (m_FixUpCorners) { - FixUpPhysCorners(fab,m_amrlevel,m_index,m_time,m_scomp,dcomp,m_ncomp); + FixUpPhysCorners(fab,*m_amrlevel,m_index,m_time,m_scomp,dcomp,m_ncomp); Gpu::streamSynchronize(); // In case this runs on GPU } } @@ -1843,10 +1846,10 @@ AmrLevel::UpdateDistributionMaps ( DistributionMapping& update_dmap ) if (dmap.size() == mapsize) { dmap = update_dmap; } - for (int i = 0; i < state.size(); ++i) + for (auto & i : state) { - if (state[i].DistributionMap().size() == mapsize) - { state[i].setDistributionMap(update_dmap); } + if (i.DistributionMap().size() == mapsize) + { i.setDistributionMap(update_dmap); } } } diff --git a/Src/Amr/AMReX_Derive.H b/Src/Amr/AMReX_Derive.H index e1a7310a7b2..1e0cceb7894 100644 --- a/Src/Amr/AMReX_Derive.H +++ b/Src/Amr/AMReX_Derive.H @@ -100,7 +100,6 @@ class DescriptorList; * from the state data contained in AmrLevel and its derivatives. Some * examples might be kinetic energy, vorticity, concentration gradients ... */ - class DeriveRec { friend class DeriveList; @@ -339,7 +338,6 @@ private: * * DeriveList manages and provides access to the list of DeriveRecs. */ - class DeriveList { public: diff --git a/Src/Amr/AMReX_LevelBld.H b/Src/Amr/AMReX_LevelBld.H index 8b421265bfc..bb79184ca5e 100644 --- a/Src/Amr/AMReX_LevelBld.H +++ b/Src/Amr/AMReX_LevelBld.H @@ -18,7 +18,6 @@ namespace amrex { * Abstract base class specifying an interface for building problem-specific * AmrLevels. 
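The loop rewrites above (over `FAHeaderNames`, `FinerCrseFabs`, `FinestCrseFabs`, and `state`) all apply one pattern: an index loop whose index is used only for element access becomes a range-based for, which also removes the `static_cast<int>(...size())` noise. Sketch:

    #include <iostream>
    #include <string>
    #include <vector>

    void print_all (const std::vector<std::string>& names)
    {
        // before: for (int i = 0; i < static_cast<int>(names.size()); ++i) ...
        for (const auto& name : names) {
            std::cout << name << '\n';
        }
    }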
*/ - class LevelBld { public: diff --git a/Src/Amr/AMReX_StateData.H b/Src/Amr/AMReX_StateData.H index 251e6482a45..e030eacb5b1 100644 --- a/Src/Amr/AMReX_StateData.H +++ b/Src/Amr/AMReX_StateData.H @@ -29,7 +29,6 @@ class StateDataPhysBCFunct; * * StateData holds state data on a level for the current and previous time step. */ - class StateData { friend class StateDataPhysBCFunct; @@ -471,7 +470,7 @@ public: private: StateData* statedata; int src_comp; - const Geometry& geom; + const Geometry* geom; }; diff --git a/Src/Amr/AMReX_StateData.cpp b/Src/Amr/AMReX_StateData.cpp index fffa8074e8b..7fd2c0c5745 100644 --- a/Src/Amr/AMReX_StateData.cpp +++ b/Src/Amr/AMReX_StateData.cpp @@ -863,7 +863,7 @@ StateData::printTimeInterval (std::ostream &os) const StateDataPhysBCFunct::StateDataPhysBCFunct (StateData&sd, int sc, const Geometry& geom_) : statedata(&sd), src_comp(sc), - geom(geom_) + geom(&geom_) { } void @@ -876,8 +876,8 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int const Box& domain_mt = amrex::convert(statedata->getDomain(),mf.ixType()); const int* domainlo = domain_mt.loVect(); const int* domainhi = domain_mt.hiVect(); - const Real* dx = geom.CellSize(); - const RealBox& prob_domain = geom.ProbDomain(); + const Real* dx = geom->CellSize(); + const RealBox& prob_domain = geom->ProbDomain(); bool has_bndryfunc_fab = statedata->desc->hasBndryFuncFab(); bool run_on_gpu = statedata->desc->RunOnGPU() && Gpu::inLaunchRegion(); @@ -901,7 +901,7 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int bool is_periodic = false; for (int i = 0; i < AMREX_SPACEDIM; ++i) { bool touch = bx.smallEnd(i) < domainlo[i] || bx.bigEnd(i) > domainhi[i]; - if (geom.isPeriodic(i)) { + if (geom->isPeriodic(i)) { is_periodic = is_periodic || touch; } else { has_phys_bc = has_phys_bc || touch; @@ -911,7 +911,7 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int if (has_phys_bc) { if (has_bndryfunc_fab) { - statedata->FillBoundary(bx, dest, time, geom, dest_comp, src_comp, num_comp); + statedata->FillBoundary(bx, dest, time, *geom, dest_comp, src_comp, num_comp); } else { statedata->FillBoundary(dest, time, dx, prob_domain, dest_comp, src_comp, num_comp); } @@ -922,7 +922,7 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int for (int dir = 0; dir < AMREX_SPACEDIM; dir++) { - if (!geom.isPeriodic(dir)) + if (!(geom->isPeriodic(dir))) { const int lo = domainlo[dir] - bx.smallEnd(dir); const int hi = bx.bigEnd(dir) - domainhi[dir]; @@ -933,12 +933,12 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int for (int dir = 0; dir < AMREX_SPACEDIM; dir++) { - if (!geom.isPeriodic(dir)) { continue; } + if (!(geom->isPeriodic(dir))) { continue; } Box lo_slab = bx; Box hi_slab = bx; - lo_slab.shift(dir, geom.period(dir)); - hi_slab.shift(dir,-geom.period(dir)); + lo_slab.shift(dir, geom->period(dir)); + hi_slab.shift(dir,-geom->period(dir)); lo_slab &= GrownDomain; hi_slab &= GrownDomain; @@ -950,7 +950,7 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int tmp.resize(lo_slab,num_comp); Elixir elitmp = tmp.elixir(); Array4 const& tmpa = tmp.array(); - const int ishift = -geom.period(dir); + const int ishift = -geom->period(dir); amrex::launch(lo_slab, [=] AMREX_GPU_DEVICE (Box const& tbx) noexcept { @@ -971,7 +971,7 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int } }); if (has_bndryfunc_fab) { - 
statedata->FillBoundary(lo_slab, tmp, time, geom, 0, src_comp, num_comp); + statedata->FillBoundary(lo_slab, tmp, time, *geom, 0, src_comp, num_comp); } else { statedata->FillBoundary(tmp, time, dx, prob_domain, 0, src_comp, num_comp); } @@ -999,10 +999,10 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int #endif { tmp.resize(lo_slab,num_comp); - const Box db = amrex::shift(lo_slab, dir, -geom.period(dir)); + const Box db = amrex::shift(lo_slab, dir, -geom->period(dir)); tmp.copy(dest, db, dest_comp, lo_slab, 0, num_comp); if (has_bndryfunc_fab) { - statedata->FillBoundary(lo_slab, tmp, time, geom, 0, src_comp, num_comp); + statedata->FillBoundary(lo_slab, tmp, time, *geom, 0, src_comp, num_comp); } else { statedata->FillBoundary(tmp, time, dx, prob_domain, 0, src_comp, num_comp); } @@ -1018,7 +1018,7 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int tmp.resize(hi_slab,num_comp); Elixir elitmp = tmp.elixir(); Array4 const& tmpa = tmp.array(); - const int ishift = geom.period(dir); + const int ishift = geom->period(dir); amrex::launch(hi_slab, [=] AMREX_GPU_DEVICE (Box const& tbx) noexcept { @@ -1039,7 +1039,7 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int } }); if (has_bndryfunc_fab) { - statedata->FillBoundary(hi_slab, tmp, time, geom, 0, src_comp, num_comp); + statedata->FillBoundary(hi_slab, tmp, time, *geom, 0, src_comp, num_comp); } else { statedata->FillBoundary(tmp, time, dx, prob_domain, 0, src_comp, num_comp); } @@ -1067,10 +1067,10 @@ StateDataPhysBCFunct::operator() (MultiFab& mf, int dest_comp, int num_comp, Int #endif { tmp.resize(hi_slab,num_comp); - const Box db = amrex::shift(hi_slab, dir, geom.period(dir)); + const Box db = amrex::shift(hi_slab, dir, geom->period(dir)); tmp.copy(dest, db, dest_comp, hi_slab, 0, num_comp); if (has_bndryfunc_fab) { - statedata->FillBoundary(hi_slab, tmp, time, geom, 0, src_comp, num_comp); + statedata->FillBoundary(hi_slab, tmp, time, *geom, 0, src_comp, num_comp); } else { statedata->FillBoundary(tmp, time, dx, prob_domain, 0, src_comp, num_comp); } diff --git a/Src/Amr/AMReX_StateDescriptor.H b/Src/Amr/AMReX_StateDescriptor.H index 6cd6c92cdd3..2830b955705 100644 --- a/Src/Amr/AMReX_StateDescriptor.H +++ b/Src/Amr/AMReX_StateDescriptor.H @@ -29,7 +29,6 @@ namespace amrex { /** * \brief Attributes of StateData. */ - class StateDescriptor { friend class DescriptorList; @@ -434,7 +433,6 @@ private: * * A container class for StateDescriptors. */ - class DescriptorList { public: diff --git a/Src/Amr/AMReX_StateDescriptor.cpp b/Src/Amr/AMReX_StateDescriptor.cpp index 161090d1a7f..4f1e3c3fe1b 100644 --- a/Src/Amr/AMReX_StateDescriptor.cpp +++ b/Src/Amr/AMReX_StateDescriptor.cpp @@ -19,7 +19,7 @@ StateDescriptor::bf_thread_safety (const int* /*lo*/,const int* /*hi*/, if (!bf_ext_dir_threadsafe) { bool has_ext_dir = false; for (int i=0; i<2*AMREX_SPACEDIM*ng && !has_ext_dir; ++i) { - has_ext_dir = bc[i]==BCType::ext_dir; + has_ext_dir = ((bc[i]==BCType::ext_dir) || (bc[i]==BCType::ext_dir_cc)); } if (has_ext_dir) { thread_safe = false; } } diff --git a/Src/AmrCore/AMReX_AmrCore.H b/Src/AmrCore/AMReX_AmrCore.H index 20428b40930..2969b986a75 100644 --- a/Src/AmrCore/AMReX_AmrCore.H +++ b/Src/AmrCore/AMReX_AmrCore.H @@ -20,7 +20,6 @@ class AmrParGDB; * virtual functions to allocate, initialize and delete data. It also * requires the derived class to tag cells for refinement. 
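The `bf_thread_safety` hunk above widens the check that forces user boundary-fill functions onto a single thread: any external-Dirichlet boundary type now counts, both the classic `ext_dir` and the cell-centered `ext_dir_cc`. A standalone restatement with a hypothetical helper name:

    #include <AMReX_BC_TYPES.H>

    // True if any BC entry is an external Dirichlet type; user-supplied
    // fill functions for such BCs are not assumed to be thread safe.
    bool has_external_dirichlet (const int* bc, int nentries)
    {
        for (int i = 0; i < nentries; ++i) {
            if (bc[i] == amrex::BCType::ext_dir ||
                bc[i] == amrex::BCType::ext_dir_cc) {
                return true;
            }
        }
        return false;
    }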
*/ - class AmrCore : public AmrMesh { diff --git a/Src/AmrCore/AMReX_AmrCore.cpp b/Src/AmrCore/AMReX_AmrCore.cpp index 502b3f5cb23..1e56c1652d2 100644 --- a/Src/AmrCore/AMReX_AmrCore.cpp +++ b/Src/AmrCore/AMReX_AmrCore.cpp @@ -1,6 +1,5 @@ #include -#include #ifdef AMREX_PARTICLES #include @@ -104,7 +103,7 @@ AmrCore::regrid (int lbase, Real time, bool) DistributionMapping level_dmap = dmap[lev]; if (ba_changed) { level_grids = new_grids[lev]; - level_dmap = DistributionMapping(level_grids); + level_dmap = MakeDistributionMap(lev, level_grids); } const auto old_num_setdm = num_setdm; RemakeLevel(lev, time, level_grids, level_dmap); @@ -117,7 +116,7 @@ AmrCore::regrid (int lbase, Real time, bool) } else // a new level { - DistributionMapping new_dmap(new_grids[lev]); + DistributionMapping new_dmap = MakeDistributionMap(lev, new_grids[lev]); const auto old_num_setdm = num_setdm; MakeNewLevelFromCoarse(lev, time, new_grids[lev], new_dmap); SetBoxArray(lev, new_grids[lev]); @@ -227,7 +226,8 @@ AmrCore::printGridSummary (std::ostream& os, int min_lev, int max_lev) const noe } } - os << std::endl; // Make sure we flush! + os << '\n'; + os.flush(); } } diff --git a/Src/AmrCore/AMReX_AmrMesh.H b/Src/AmrCore/AMReX_AmrMesh.H index a3c6fbc8f62..f5d49f5c5da 100644 --- a/Src/AmrCore/AMReX_AmrMesh.H +++ b/Src/AmrCore/AMReX_AmrMesh.H @@ -11,6 +11,10 @@ #include #include +#ifdef AMREX_USE_BITTREE +#include +#endif + namespace amrex { struct AmrInfo { @@ -166,7 +170,7 @@ public: void SetGridEff (Real eff) noexcept { grid_eff = eff; } void SetNProper (int n) noexcept { n_proper = n; } - //! Set ref_ratio would require rebuiling Geometry objects. + //! Set ref_ratio would require rebuilding Geometry objects. void SetFinestLevel (int new_finest_level) noexcept { finest_level = new_finest_level; } void SetDistributionMap (int lev, const DistributionMapping& dmap_in) noexcept; @@ -253,6 +257,8 @@ public: [[nodiscard]] long CountCells (int lev) noexcept; + [[nodiscard]] virtual DistributionMapping MakeDistributionMap (int lev, BoxArray const& ba); + protected: int finest_level; //!< Current finest level. 
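The new virtual `MakeDistributionMap` gives applications a single hook for how boxes are assigned to ranks whenever a level is created or remade (it is what `AmrCore::regrid` now calls in the hunk above). A hedged sketch of an override; the class name and the round-robin policy are purely illustrative, not part of this PR:

    #include <AMReX_AmrMesh.H>
    #include <AMReX_ParallelDescriptor.H>

    class MyAmrMesh : public amrex::AmrMesh
    {
    public:
        using amrex::AmrMesh::AmrMesh;

        amrex::DistributionMapping
        MakeDistributionMap (int /*lev*/, amrex::BoxArray const& ba) override
        {
            // Toy policy: deal boxes out to MPI ranks in order.
            amrex::Vector<int> pmap(ba.size());
            const int nprocs = amrex::ParallelDescriptor::NProcs();
            for (int i = 0; i < static_cast<int>(ba.size()); ++i) {
                pmap[i] = i % nprocs;
            }
            return amrex::DistributionMapping(std::move(pmap));
        }
    };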
@@ -260,6 +266,11 @@ protected: Vector dmap; Vector grids; +#ifdef AMREX_USE_BITTREE + bool use_bittree = false; + std::unique_ptr btmesh; +#endif + unsigned int num_setdm = 0; unsigned int num_setba = 0; diff --git a/Src/AmrCore/AMReX_AmrMesh.cpp b/Src/AmrCore/AMReX_AmrMesh.cpp index 70a8df1dabf..0ed59002f2e 100644 --- a/Src/AmrCore/AMReX_AmrMesh.cpp +++ b/Src/AmrCore/AMReX_AmrMesh.cpp @@ -5,6 +5,13 @@ #include #include #include +#include + +#ifdef AMREX_USE_BITTREE +#include +#endif + +#include namespace amrex { @@ -376,6 +383,10 @@ AmrMesh::InitAmrMesh (int max_level_in, const Vector& n_cell_in, finest_level = -1; +#ifdef AMREX_USE_BITTREE + pp.queryAdd("use_bittree",use_bittree); +#endif + if (check_input) { checkInput(); } } @@ -437,6 +448,26 @@ AmrMesh::LevelDefined (int lev) noexcept return lev <= max_level && !grids[lev].empty() && !dmap[lev].empty(); } +DistributionMapping +AmrMesh::MakeDistributionMap (int lev, BoxArray const& ba) +{ + + BL_PROFILE("AmrMesh::MakeDistributionMap()"); + + if (verbose) { + amrex::Print() << "Creating new distribution map on level: " << lev << "\n"; + } + +#ifdef AMREX_USE_BITTREE + // if (use_bittree) { + // return DistributionMapping(ba); + // } else +#endif + { + return DistributionMapping(ba); + } +} + void AmrMesh::ChopGrids (int lev, BoxArray& ba, int target_size) const { @@ -514,6 +545,10 @@ AmrMesh::MakeNewGrids (int lbase, Real time, int& new_finest, Vector& if (new_grids.size() < max_crse+2) { new_grids.resize(max_crse+2); } +#ifdef AMREX_USE_BITTREE + if(!use_bittree) { +#endif + // // Construct problem domain at each level. // @@ -728,13 +763,47 @@ AmrMesh::MakeNewGrids (int lbase, Real time, int& new_finest, Vector& } new_bx.Bcast(); // Broadcast the new BoxList to other processes - // - // Refine up to levf. - // - new_bx.refine(ref_ratio[levc]); - BL_ASSERT(new_bx.isDisjoint()); + bool odd_ref_ratio = false; + for (auto const& rr : ref_ratio[levc]) { + if (rr != 1 && (rr%2 != 0)) { + odd_ref_ratio = true; + } + } + + if (odd_ref_ratio) + { + // This approach imposes max_grid_size (suitably scaled) before + // refining so as to ensure fine grids align with coarse grids + + // + // Impose max_grid_size (suitably coarsened) + // + AMREX_ASSERT(max_grid_size[levf].allGE(ref_ratio[levc])); + new_grids[levf] = BoxArray(std::move(new_bx), max_grid_size[levf]/ref_ratio[levc]); + + // + // Refine up to levf. + // + new_grids[levf].refine(ref_ratio[levc]); + } + else + { + // This approach imposes max_grid_size after refining. + // For ref_ratio = 3 this can create fine grids that do not correctly divide by 3, + // but we leave it here so as not to change the gridding in + // existing ref_ratio = 2 or 4 applications - new_grids[levf] = BoxArray(std::move(new_bx), max_grid_size[levf]); + // + // Refine up to levf. + // + new_bx.refine(ref_ratio[levc]); + + // + // Impose max_grid_size + // + new_grids[levf] = BoxArray(std::move(new_bx), max_grid_size[levf]); + } + BL_ASSERT(new_grids[levf].isDisjoint()); } } } @@ -774,6 +843,72 @@ AmrMesh::MakeNewGrids (int lbase, Real time, int& new_finest, Vector& } } } + +#ifdef AMREX_USE_BITTREE + } +#endif + +#ifdef AMREX_USE_BITTREE + // Bittree version + if(use_bittree) { + // Initialize BT refinement + btmesh->refine_init(); + + // ------------------------------------------------------------------- + // Use tagging data to mark BT for refinement, then use the new bitmap + // to calculate the new grids. 
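The odd-ratio gridding branch above is easiest to see with numbers (values illustrative): take `ref_ratio = 3` and `max_grid_size = 64`. Chopping after refinement puts the first interior grid edge at fine index 64, and 64/3 is not an integer, so that edge does not coincide with a coarse cell boundary. Chopping the coarse boxes at `max_grid_size/ref_ratio` first and refining afterwards makes alignment automatic:

    int main ()
    {
        const int ratio = 3, max_grid_size = 64;

        // New path for odd ratios: chop coarse boxes first, then refine.
        const int coarse_chop = max_grid_size / ratio; // 21 coarse cells
        const int fine_chunk  = coarse_chop * ratio;   // 63 fine cells
        return (fine_chunk % ratio == 0) ? 0 : 1;      // aligned by construction
    }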
+ auto tree0 = btmesh->getTree(); + + // [1] Error Estimation and tagging + // btTags is indexed by bitid, Bittree's internal indexing scheme. + // For any id, btTags = 1 if should be parent, -1 if should not be parent (or not exist). + std::vector btTags(tree0->id_upper_bound(),0); + + for (int lev=max_crse; lev>=lbase; --lev) { + + TagBoxArray tags(grids[lev],dmap[lev], n_error_buf[lev]); + ErrorEst(lev, tags, time, 0); + tags.buffer(n_error_buf[lev]); + + for (MFIter mfi(tags); mfi.isValid(); ++mfi) { + auto const& tagbox = tags.const_array(mfi); + bool has_set_tags = amrex::Reduce::AnyOf(mfi.validbox(), + [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + return tagbox(i,j,k)!=TagBox::CLEAR; + }); + + // Set the values of btTags. + int bitid = btUnit::getBitid(btmesh.get(),false,lev,mfi.index()); + // TODO Check lev == tree0->block_level(bitid) + if(has_set_tags) { + btTags[bitid] = 1; + } + else { + btTags[bitid] = -1; + } + } + } + + // [2] btRefine - check for proper octree nesting and update bitmap + MPI_Comm comm = ParallelContext::CommunicatorSub(); + int changed = btUnit::btRefine(btmesh.get(), btTags, max_crse, lbase, grids, dmap, comm); + + // [3] btCalculateGrids - use new bitmap to generate new grids + if (changed>0) { + btUnit::btCalculateGrids(btmesh.get(),lbase,new_finest,new_grids,max_grid_size); + } else { + new_finest = finest_level; + for(int i=0; i<=finest_level; ++i) { + new_grids[i] = grids[i]; + } + } + + // Finalize BT refinement + btmesh->refine_apply(); + } +#endif + } void @@ -783,11 +918,48 @@ AmrMesh::MakeNewGrids (Real time) { finest_level = 0; - const BoxArray& ba = MakeBaseGrids(); - DistributionMapping dm(ba); + BoxArray ba; + DistributionMapping dm; const auto old_num_setdm = num_setdm; const auto old_num_setba = num_setba; +#ifdef AMREX_USE_BITTREE + if(!use_bittree) { +#endif + ba = MakeBaseGrids(); + dm = MakeDistributionMap(0, ba); + +#ifdef AMREX_USE_BITTREE + } + else { + //Initialize Bittree + + // top = number of grids on coarsest level in each direction + std::vector top(AMREX_SPACEDIM,0); + IntVect ncells = geom[0].Domain().length(); + for(int i=0; i includes(ngrids,1); + + btmesh = std::make_unique(top.data(),includes.data()); + + // Set BCs + for(int d=0; d value, AMRErrorTag::TEST test, std::string field, const AMRErrorTagInfo& info = AMRErrorTagInfo()) noexcept - : m_test(test), m_field(std::move(field)), m_info(info) + : m_test(test), m_field(std::move(field)), m_info(info), m_ngrow(SetNGrow()) { AMREX_ASSERT(!value.empty()); m_value.resize(info.m_max_level); @@ -480,7 +474,6 @@ std::ostream& operator << (std::ostream& os, const ErrorList& elst); for (auto i = int(value.size()); i < m_value.size(); ++i) { m_value[i] = value[value.size()-1]; } - m_ngrow = SetNGrow(); } AMRErrorTag (AMRErrorTag::UserFunc* userfunc, diff --git a/Src/AmrCore/AMReX_ErrorList.cpp b/Src/AmrCore/AMReX_ErrorList.cpp index 783a06504be..c9d1a7feef8 100644 --- a/Src/AmrCore/AMReX_ErrorList.cpp +++ b/Src/AmrCore/AMReX_ErrorList.cpp @@ -194,7 +194,9 @@ ErrorList::operator[] (int k) const noexcept return *vec[k]; } -static const char* err_name[] = { "Special", "Standard", "UseAverage" }; +namespace { + const char* err_name[] = { "Special", "Standard", "UseAverage" }; +} std::ostream& operator << (std::ostream& os, diff --git a/Src/AmrCore/AMReX_FillPatchUtil_I.H b/Src/AmrCore/AMReX_FillPatchUtil_I.H index 36d5e9ee853..b3e1ee64aae 100644 --- a/Src/AmrCore/AMReX_FillPatchUtil_I.H +++ b/Src/AmrCore/AMReX_FillPatchUtil_I.H @@ -189,7 +189,7 @@ void FillPatchInterp (MultiFab& 
mf_fine_patch, int fcomp, MultiFab const& mf_crs MFInterpolater* mapper, const Vector& bcs, int bcscomp); template -std::enable_if_t::value && !std::is_same::value> +std::enable_if_t::value && !std::is_same_v> FillPatchInterp (MF& mf_fine_patch, int fcomp, MF const& mf_crse_patch, int ccomp, int ncomp, IntVect const& ng, const Geometry& cgeom, const Geometry& fgeom, Box const& dest_domain, const IntVect& ratio, @@ -240,7 +240,7 @@ FillPatchInterp (MF& mf_fine_patch, int fcomp, MF const& mf_crse_patch, int ccom } template -std::enable_if_t::value && !std::is_same::value> +std::enable_if_t::value && !std::is_same_v> InterpFace (Interp *interp, MF const& mf_crse_patch, int crse_comp, MF& mf_refined_patch, int fine_comp, @@ -298,9 +298,9 @@ namespace { // ======== FArrayBox template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_crse_patch (FabArrayBase::FPinfo const& fpc, int ncomp) { MF mf_crse_patch(fpc.ba_crse_patch, fpc.dm_patch, ncomp, 0, MFInfo(), @@ -309,9 +309,9 @@ namespace { } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_crse_patch (FabArrayBase::FPinfo const& fpc, int ncomp, IndexType idx_type) { MF mf_crse_patch(amrex::convert(fpc.ba_crse_patch, idx_type), fpc.dm_patch, @@ -320,9 +320,9 @@ namespace { } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_fine_patch (FabArrayBase::FPinfo const& fpc, int ncomp) { MF mf_fine_patch(fpc.ba_fine_patch, fpc.dm_patch, ncomp, 0, MFInfo(), @@ -331,9 +331,9 @@ namespace { } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_fine_patch (FabArrayBase::FPinfo const& fpc, int ncomp, IndexType idx_type) { MF mf_fine_patch(amrex::convert(fpc.ba_fine_patch, idx_type), fpc.dm_patch, @@ -342,9 +342,9 @@ namespace { } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_refined_patch (FabArrayBase::FPinfo const& fpc, int ncomp, IndexType idx_type, IntVect ratio) { MF mf_refined_patch(amrex::convert( amrex::refine( amrex::coarsen(fpc.ba_fine_patch, ratio), ratio), idx_type), @@ -353,9 +353,9 @@ namespace { } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_crse_mask (FabArrayBase::FPinfo const& fpc, int ncomp, IndexType idx_type, IntVect ratio) { MF mf_crse_mask(amrex::convert(amrex::coarsen(fpc.ba_fine_patch, ratio), idx_type), @@ -364,9 +364,9 @@ namespace { } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> void mf_set_domain_bndry (MF &mf, Geometry const & geom) { mf.setDomainBndry(std::numeric_limits::quiet_NaN(), geom); @@ -376,63 +376,63 @@ namespace { // ======== Not FArrayBox template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_crse_patch (FabArrayBase::FPinfo const& fpc, int ncomp) { return MF(fpc.ba_crse_patch, fpc.dm_patch, ncomp, 0); } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_crse_patch (FabArrayBase::FPinfo const& fpc, int ncomp, IndexType idx_type) { return MF(amrex::convert(fpc.ba_crse_patch, idx_type), fpc.dm_patch, ncomp, 0); } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_fine_patch (FabArrayBase::FPinfo const& fpc, int ncomp) { return MF(fpc.ba_fine_patch, fpc.dm_patch, ncomp, 0); } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_fine_patch (FabArrayBase::FPinfo const& fpc, int ncomp, IndexType idx_type) { return MF(amrex::convert(fpc.ba_fine_patch, idx_type), fpc.dm_patch, ncomp, 0); } template ::value, - int>::type = 0> + 
std::enable_if_t, + int> = 0> MF make_mf_refined_patch (FabArrayBase::FPinfo const& fpc, int ncomp, IndexType idx_type, IntVect ratio) { return MF(amrex::convert( amrex::refine( amrex::coarsen(fpc.ba_fine_patch, ratio), ratio), idx_type), fpc.dm_patch, ncomp, 0); } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> MF make_mf_crse_mask (FabArrayBase::FPinfo const& fpc, int ncomp, IndexType idx_type, IntVect ratio) { return MF(amrex::convert(amrex::coarsen(fpc.ba_fine_patch, ratio), idx_type), fpc.dm_patch, ncomp, 0); } template ::value, - int>::type = 0> + std::enable_if_t, + int> = 0> void mf_set_domain_bndry (MF &/*mf*/, Geometry const & /*geom*/) { // nothing diff --git a/Src/AmrCore/AMReX_FillPatcher.H b/Src/AmrCore/AMReX_FillPatcher.H index 46d1107dea3..5ff1c9550d3 100644 --- a/Src/AmrCore/AMReX_FillPatcher.H +++ b/Src/AmrCore/AMReX_FillPatcher.H @@ -15,13 +15,13 @@ namespace amrex { * with interpolation of the coarse data. Then it fills the fine ghost * cells overlapping fine level valid cells with the fine level data. If * the valid cells of the destination need to be filled, it will be done as - * well. Finally, it will fill the physical bounbary using the user + * well. Finally, it will fill the physical boundary using the user * provided functor. The `fill` member function can be used to do the * operations just described. Alternatively, one can also use the * `fillCoarseFineBoundary` to fill the ghost cells at the coarse/fine * boundary only. Then one can manually call FillBoundary to fill the other * ghost cells, and use the physical BC functor to handle the physical - * boundeary. + * boundary. * * The communication of the coarse data needed for spatial interpolation is * optimized at the cost of being error-prone. One must follow the @@ -42,7 +42,7 @@ namespace amrex { * * (3) When to destroy? Usually, we do time stepping on a coarse level * first. Then we recursively do time stepping on fine levels. After the - * finer level finishes, we do reflux and averge the fine data down to the + * finer level finishes, we do reflux and average the fine data down to the * coarse level. After that we should destroy the FillPatcher object * associated with these two levels, because the coarse data stored in the * object has become outdated. For AmrCore based codes, you could use @@ -68,7 +68,6 @@ namespace amrex { * See AmrLevel::RK for an example of using the RungeKutta functions and * FillPatcher together. */ - template class FillPatcher { @@ -118,7 +117,7 @@ public: * \param fbc for filling fine level physical BC * \param fbccomp starting component of the fine level BC functor * \param bcs BCRec specifying physical boundary types - * \parame bcscomp starting component of the BCRec Vector. + * \param bcscomp starting component of the BCRec Vector. * \param pre_interp optional pre-interpolation hook for modifying the coarse data * \param post_interp optional post-interpolation hook for modifying the fine data */ diff --git a/Src/AmrCore/AMReX_FluxRegister.H b/Src/AmrCore/AMReX_FluxRegister.H index 4178eb289ba..f5983e18872 100644 --- a/Src/AmrCore/AMReX_FluxRegister.H +++ b/Src/AmrCore/AMReX_FluxRegister.H @@ -14,7 +14,6 @@ namespace amrex { * * Stores and manipulates fluxes at coarse-fine interfaces.
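The `AMRErrorTag` constructor hunk earlier moves `m_ngrow` from an assignment in the body into the member initializer list. The rule it leans on: members are initialized in declaration order, so `m_ngrow(SetNGrow())` is well defined provided `m_ngrow` is declared after the members that `SetNGrow()` reads. A standalone sketch of the rule (illustrative names):

    #include <string>
    #include <utility>

    struct Tagger {
        explicit Tagger (std::string f)
            : m_field(std::move(f)),
              m_ngrow(compute_ngrow()) // safe: m_field is already constructed
        {}

    private:
        int compute_ngrow () const { return m_field.empty() ? 0 : 1; }

        std::string m_field; // declared first, initialized first
        int m_ngrow;         // declared last, initialized last
    };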
*/ - class FluxRegister : public BndryRegister diff --git a/Src/AmrCore/AMReX_InterpFaceReg_3D_C.H b/Src/AmrCore/AMReX_InterpFaceReg_3D_C.H index c55cda3c357..2df7fef055a 100644 --- a/Src/AmrCore/AMReX_InterpFaceReg_3D_C.H +++ b/Src/AmrCore/AMReX_InterpFaceReg_3D_C.H @@ -12,11 +12,10 @@ void interp_face_reg (int i, int j, int k, IntVect const& rr, Array4 const int jc = amrex::coarsen(j,rr[1]); int kc = amrex::coarsen(k,rr[2]); if (idim == 0) { - if (jc == domface.smallEnd(1) || jc == domface.bigEnd(1)) { - for (int n = 0; n < ncomp; ++n) { - fine(i,j,k,n+scomp) = crse(ic,jc,kc,n); - } - } else { + for (int n = 0; n < ncomp; ++n) { + fine(i,j,k,n+scomp) = crse(ic,jc,kc,n); + } + if (jc != domface.smallEnd(1) && jc != domface.bigEnd(1) && rr[1] > 1) { Real sfy = Real(1.0); for (int n = 0; n < ncomp; ++n) { Real dc = Real(0.5) * (crse(ic,jc+1,kc,n) - crse(ic,jc-1,kc,n)); @@ -32,11 +31,11 @@ void interp_face_reg (int i, int j, int k, IntVect const& rr, Array4 const } Real yoff = (static_cast(j - jc*rr[1]) + Real(0.5)) / Real(rr[1]) - Real(0.5); for (int n = 0; n < ncomp; ++n) { - fine(i,j,k,n+scomp) = crse(ic,jc,kc,n) + yoff * slope(i,j,k,n) * sfy; + fine(i,j,k,n+scomp) += yoff * slope(i,j,k,n) * sfy; } } - if (kc != domface.smallEnd(2) && kc != domface.bigEnd(2)) { + if (kc != domface.smallEnd(2) && kc != domface.bigEnd(2) && rr[2] > 1) { Real sfz = Real(1.0); for (int n = 0; n < ncomp; ++n) { Real dc = Real(0.5) * (crse(ic,jc,kc+1,n) - crse(ic,jc,kc-1,n)); @@ -56,11 +55,10 @@ void interp_face_reg (int i, int j, int k, IntVect const& rr, Array4 const } } } else if (idim == 1) { - if (ic == domface.smallEnd(0) || ic == domface.bigEnd(0)) { - for (int n = 0; n < ncomp; ++n) { - fine(i,j,k,n+scomp) = crse(ic,jc,kc,n); - } - } else { + for (int n = 0; n < ncomp; ++n) { + fine(i,j,k,n+scomp) = crse(ic,jc,kc,n); + } + if (ic != domface.smallEnd(0) && ic != domface.bigEnd(0) && rr[0] > 1) { Real sfx = Real(1.0); for (int n = 0; n < ncomp; ++n) { Real dc = Real(0.5) * (crse(ic+1,jc,kc,n) - crse(ic-1,jc,kc,n)); @@ -76,11 +74,11 @@ void interp_face_reg (int i, int j, int k, IntVect const& rr, Array4 const } Real xoff = (static_cast(i - ic*rr[0]) + Real(0.5)) / Real(rr[0]) - Real(0.5); for (int n = 0; n < ncomp; ++n) { - fine(i,j,k,n+scomp) = crse(ic,jc,kc,n) + xoff * slope(i,j,k,n) * sfx; + fine(i,j,k,n+scomp) += xoff * slope(i,j,k,n) * sfx; } } - if (kc != domface.smallEnd(2) && kc != domface.bigEnd(2)) { + if (kc != domface.smallEnd(2) && kc != domface.bigEnd(2) && rr[2] > 1) { Real sfz = Real(1.0); for (int n = 0; n < ncomp; ++n) { Real dc = Real(0.5) * (crse(ic,jc,kc+1,n) - crse(ic,jc,kc-1,n)); @@ -100,11 +98,10 @@ void interp_face_reg (int i, int j, int k, IntVect const& rr, Array4 const } } } else { - if (ic == domface.smallEnd(0) || ic == domface.bigEnd(0)) { - for (int n = 0; n < ncomp; ++n) { - fine(i,j,k,n+scomp) = crse(ic,jc,kc,n); - } - } else { + for (int n = 0; n < ncomp; ++n) { + fine(i,j,k,n+scomp) = crse(ic,jc,kc,n); + } + if (ic != domface.smallEnd(0) && ic != domface.bigEnd(0) && rr[0] > 1) { Real sfx = Real(1.0); for (int n = 0; n < ncomp; ++n) { Real dc = Real(0.5) * (crse(ic+1,jc,kc,n) - crse(ic-1,jc,kc,n)); @@ -120,11 +117,11 @@ void interp_face_reg (int i, int j, int k, IntVect const& rr, Array4 const } Real xoff = (static_cast(i - ic*rr[0]) + Real(0.5)) / Real(rr[0]) - Real(0.5); for (int n = 0; n < ncomp; ++n) { - fine(i,j,k,n+scomp) = crse(ic,jc,kc,n) + xoff * slope(i,j,k,n) * sfx; + fine(i,j,k,n+scomp) += xoff * slope(i,j,k,n) * sfx; } } - if (jc != domface.smallEnd(1) && jc 
!= domface.bigEnd(1)) { + if (jc != domface.smallEnd(1) && jc != domface.bigEnd(1) && rr[1] > 1) { Real sfy = Real(1.0); for (int n = 0; n < ncomp; ++n) { Real dc = Real(0.5) * (crse(ic,jc+1,kc,n) - crse(ic,jc-1,kc,n)); diff --git a/Src/AmrCore/AMReX_InterpFaceRegister.H b/Src/AmrCore/AMReX_InterpFaceRegister.H index 5e9f92784e7..c54879bcaf6 100644 --- a/Src/AmrCore/AMReX_InterpFaceRegister.H +++ b/Src/AmrCore/AMReX_InterpFaceRegister.H @@ -10,9 +10,8 @@ namespace amrex { /** * \brief InterpFaceRegister is a coarse/fine boundary register for - * interpolation of face data at the coarse/fine boundadry. + * interpolation of face data at the coarse/fine boundary. */ - class InterpFaceRegister { public: @@ -31,7 +30,7 @@ public: Geometry const& fgeom, IntVect const& ref_ratio); /** - * \brief Defines an InterpFaceRegister objecct. + * \brief Defines an InterpFaceRegister object. * * \param fba The fine level BoxArray * \param fdm The fine level DistributionMapping diff --git a/Src/AmrCore/AMReX_Interp_2D_C.H b/Src/AmrCore/AMReX_Interp_2D_C.H index 20f8b1c3a0f..a47f265011d 100644 --- a/Src/AmrCore/AMReX_Interp_2D_C.H +++ b/Src/AmrCore/AMReX_Interp_2D_C.H @@ -128,6 +128,7 @@ facediv_face_interp (int ci, int cj, int /*ck*/, break; } + default : { break; } } } diff --git a/Src/AmrCore/AMReX_Interp_3D_C.H b/Src/AmrCore/AMReX_Interp_3D_C.H index daa19f6d5e3..709b44761d0 100644 --- a/Src/AmrCore/AMReX_Interp_3D_C.H +++ b/Src/AmrCore/AMReX_Interp_3D_C.H @@ -204,6 +204,7 @@ facediv_face_interp (int ci, int cj, int ck, break; } + default : { break; } } } diff --git a/Src/AmrCore/AMReX_Interp_C.H b/Src/AmrCore/AMReX_Interp_C.H index 86e7935a436..01e920680c0 100644 --- a/Src/AmrCore/AMReX_Interp_C.H +++ b/Src/AmrCore/AMReX_Interp_C.H @@ -15,8 +15,8 @@ namespace amrex { // // Fill fine values with piecewise-constant interpolation of coarse data. -// Operate only on faces that overlap--ie, only fill the fine faces that make up each -// coarse face, leave the in-between faces alone. +// Operate only on faces that overlap -- i.e., only fill the fine faces that +// make up each coarse face, leave the in-between faces alone. // template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void @@ -84,6 +84,154 @@ face_linear_face_interp_z (int fi, int fj, int fk, int n, Array4 const& fine, } } +// +// Fill fine values with tangential interpolation of coarse data. +// Operate only on faces that overlap -- i.e., only fill the fine faces that +// make up each coarse face, leave the in-between faces alone. +// +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void +face_cons_linear_face_interp (int i, int j, int k, int n, Array4 const& fine, + Array4 const& crse, Array4 const& mask, + IntVect const& ratio, Box const& per_grown_domain, int dim) noexcept +{ + int ci = amrex::coarsen(i, ratio[0]); + +#if (AMREX_SPACEDIM == 1) + amrex::ignore_unused(per_grown_domain); + int cj = 0; +#else + int cj = amrex::coarsen(j, ratio[1]); +#endif + +#if (AMREX_SPACEDIM == 3) + int ck = amrex::coarsen(k, ratio[2]); +#else + int ck = 0; +#endif + + if (dim == 0 && ci*ratio[0] == i) { + // Check solve mask to ensure we don't overwrite valid fine data. 
+ if (!mask || mask(ci, cj, ck, n)) { + fine(i, j, k, n) = crse(ci, cj, ck, n); +#if (AMREX_SPACEDIM >= 2) + if (cj > per_grown_domain.smallEnd(1) && cj < per_grown_domain.bigEnd(1) && ratio[1] > 1) { + Real sfy = Real(1.0); + Real dc = Real(0.5) * (crse(ci,cj+1,ck,n) - crse(ci,cj-1,ck,n)); + Real df = Real(2.0) * (crse(ci,cj+1,ck,n) - crse(ci,cj ,ck,n)); + Real db = Real(2.0) * (crse(ci,cj ,ck,n) - crse(ci,cj-1,ck,n)); + Real sy = (df*db >= Real(0.0)) ? + amrex::min(std::abs(df),std::abs(db)) : Real(0.); + sy = std::copysign(Real(1.),dc)*amrex::min(sy,std::abs(dc)); + if (dc != Real(0.0)) { + sfy = amrex::min(sfy, sy / dc); + } + Real slope = dc; + Real yoff = (static_cast(j - cj*ratio[1]) + Real(0.5)) / Real(ratio[1]) - Real(0.5); + fine(i,j,k,n) += yoff * slope * sfy; + } // jc +#if (AMREX_SPACEDIM == 3) + if (ck > per_grown_domain.smallEnd(2) && ck < per_grown_domain.bigEnd(2) && ratio[2] > 1) { + Real sfz = Real(1.0); + Real dc = Real(0.5) * (crse(ci,cj,ck+1,n) - crse(ci,cj,ck-1,n)); + Real df = Real(2.0) * (crse(ci,cj,ck+1,n) - crse(ci,cj,ck ,n)); + Real db = Real(2.0) * (crse(ci,cj,ck ,n) - crse(ci,cj,ck-1,n)); + Real sz = (df*db >= Real(0.0)) ? + amrex::min(std::abs(df),std::abs(db)) : Real(0.); + sz = std::copysign(Real(1.),dc)*amrex::min(sz,std::abs(dc)); + if (dc != Real(0.0)) { + sfz = amrex::min(sfz, sz / dc); + } + Real slope = dc; + Real zoff = (static_cast(k - ck*ratio[2]) + Real(0.5)) / Real(ratio[2]) - Real(0.5); + fine(i,j,k,n) += zoff * slope * sfz; + } // ck +#endif +#endif + } // mask + } // dim + +#if (AMREX_SPACEDIM >= 2) + if (dim == 1 && cj*ratio[1] == j) { + // Check solve mask to ensure we don't overwrite valid fine data. + if (!mask || mask(ci, cj, ck, n)) { + fine(i, j, k, n) = crse(ci, cj, ck, n); + if (ci > per_grown_domain.smallEnd(0) && ci < per_grown_domain.bigEnd(0) && ratio[0] > 1) { + Real sfx = Real(1.0); + Real dc = Real(0.5) * (crse(ci+1,cj,ck,n) - crse(ci-1,cj,ck,n)); + Real df = Real(2.0) * (crse(ci+1,cj,ck,n) - crse(ci ,cj,ck,n)); + Real db = Real(2.0) * (crse(ci ,cj,ck,n) - crse(ci-1,cj,ck,n)); + Real sx = (df*db >= Real(0.0)) ? + amrex::min(std::abs(df),std::abs(db)) : Real(0.); + sx = std::copysign(Real(1.),dc)*amrex::min(sx,std::abs(dc)); + if (dc != Real(0.0)) { + sfx = amrex::min(sfx, sx / dc); + } + Real slope = dc; + Real xoff = (static_cast(i - ci*ratio[0]) + Real(0.5)) / Real(ratio[0]) - Real(0.5); + fine(i,j,k,n) += xoff * slope * sfx; + } // ci +#if (AMREX_SPACEDIM == 3) + if (ck > per_grown_domain.smallEnd(2) && ck < per_grown_domain.bigEnd(2) && ratio[2] > 1) { + Real sfz = Real(1.0); + Real dc = Real(0.5) * (crse(ci,cj,ck+1,n) - crse(ci,cj,ck-1,n)); + Real df = Real(2.0) * (crse(ci,cj,ck+1,n) - crse(ci,cj,ck ,n)); + Real db = Real(2.0) * (crse(ci,cj,ck ,n) - crse(ci,cj,ck-1,n)); + Real sz = (df*db >= Real(0.0)) ? + amrex::min(std::abs(df),std::abs(db)) : Real(0.); + sz = std::copysign(Real(1.),dc)*amrex::min(sz,std::abs(dc)); + if (dc != Real(0.0)) { + sfz = amrex::min(sfz, sz / dc); + } + Real slope = dc; + Real zoff = (static_cast(k - ck*ratio[2]) + Real(0.5)) / Real(ratio[2]) - Real(0.5); + fine(i,j,k,n) += zoff * slope * sfz; + } // ck +#endif // SPACEDIM >= 3 + } // mask + } // dim == 1 +#endif // SPACEDIM >= 2 + +#if (AMREX_SPACEDIM == 3) + if (dim == 2 && ck*ratio[2] == k) { + // Check solve mask to ensure we don't overwrite valid fine data. 
+ if (!mask || mask(ci, cj, ck, n)) { + fine(i, j, k, n) = crse(ci, cj, ck, n); + if (ci > per_grown_domain.smallEnd(0) && ci < per_grown_domain.bigEnd(0) && ratio[0] > 1) { + Real sfx = Real(1.0); + Real dc = Real(0.5) * (crse(ci+1,cj,ck,n) - crse(ci-1,cj,ck,n)); + Real df = Real(2.0) * (crse(ci+1,cj,ck,n) - crse(ci ,cj,ck,n)); + Real db = Real(2.0) * (crse(ci ,cj,ck,n) - crse(ci-1,cj,ck,n)); + Real sx = (df*db >= Real(0.0)) ? + amrex::min(std::abs(df),std::abs(db)) : Real(0.); + sx = std::copysign(Real(1.),dc)*amrex::min(sx,std::abs(dc)); + if (dc != Real(0.0)) { + sfx = amrex::min(sfx, sx / dc); + } + Real slope = dc; + Real xoff = (static_cast(i - ci*ratio[0]) + Real(0.5)) / Real(ratio[0]) - Real(0.5); + fine(i,j,k,n) += xoff * slope * sfx; + } // ci + if (cj > per_grown_domain.smallEnd(1) && cj < per_grown_domain.bigEnd(1) && ratio[1] > 1) { + Real sfy = Real(1.0); + Real dc = Real(0.5) * (crse(ci,cj+1,ck,n) - crse(ci,cj-1,ck,n)); + Real df = Real(2.0) * (crse(ci,cj+1,ck,n) - crse(ci,cj ,ck,n)); + Real db = Real(2.0) * (crse(ci,cj ,ck,n) - crse(ci,cj-1,ck,n)); + Real sy = (df*db >= Real(0.0)) ? + amrex::min(std::abs(df),std::abs(db)) : Real(0.); + sy = std::copysign(Real(1.),dc)*amrex::min(sy,std::abs(dc)); + if (dc != Real(0.0)) { + sfy = amrex::min(sfy, sy / dc); + } + Real slope = dc; + Real yoff = (static_cast(j - cj*ratio[1]) + Real(0.5)) / Real(ratio[1]) - Real(0.5); + fine(i,j,k,n) += yoff * slope * sfy; + } // cj + } // mask + } // dim == 2 +#endif +} + // // Do linear in dir, pc transverse to dir, leave alone the fine values // lining up with coarse edges--assume these have been set to hold the diff --git a/Src/AmrCore/AMReX_Interpolater.H b/Src/AmrCore/AMReX_Interpolater.H index d2fe66b0cbd..c13fb283a35 100644 --- a/Src/AmrCore/AMReX_Interpolater.H +++ b/Src/AmrCore/AMReX_Interpolater.H @@ -17,7 +17,6 @@ class IArrayBox; * * Specifies interpolater interface for coarse-to-fine interpolation in space. */ - class Interpolater : public InterpBase { @@ -160,7 +159,6 @@ public: * * Bilinear interpolation on node centered data. */ - class NodeBilinear : public Interpolater @@ -219,7 +217,6 @@ public: * * Bilinear interpolation on cell centered data. */ - class CellBilinear : public Interpolater @@ -286,7 +283,6 @@ public: * sum_ivar a(ic,jc,ivar)*fab(if,jf,ivar) = 0 is satisfied * in all fine cells if,jf covering coarse cell ic,jc. */ - class CellConservativeLinear : public Interpolater @@ -344,7 +340,6 @@ protected: * Linear conservative interpolation on cell centered data * but with protection against undershoots or overshoots. */ - class CellConservativeProtected : public CellConservativeLinear @@ -393,7 +388,6 @@ public: * * Quadratic interpolation on cell centered data. */ - class CellQuadratic : public Interpolater @@ -451,7 +445,6 @@ public: /** * \brief Piecewise Constant interpolation on cell centered data. */ - class PCInterp : public Interpolater @@ -512,7 +505,6 @@ public: * in constructing the polynomial, the average of the polynomial inside that * cell is equal to the cell averaged value of the original data. */ - class CellConservativeQuartic : public Interpolater @@ -574,7 +566,6 @@ public: * a given coarse cell will have the same divergence, even when the coarse * grid divergence is spatially varying. */ - class FaceDivFree : public Interpolater @@ -663,11 +654,10 @@ public: /** -* \brief Bilinear interpolation on face data. +* \brief Piecewise constant tangential interpolation / linear normal interpolation of face data. * -* Bilinear interpolation on data. 
+* Piecewise constant tangential interpolation / linear normal interpolation of face data. */ - class FaceLinear : public Interpolater @@ -782,6 +772,127 @@ public: RunOn runon) override; +}; + +/** +* \brief Bilinear tangential interpolation / linear normal interpolation of face data. +* +* Bilinear tangential interpolation / linear normal interpolation of face data. +*/ +class FaceConservativeLinear + : + public Interpolater +{ +public: + /** + * \brief Returns coarsened box given fine box and refinement ratio. + * + * \param fine + * \param ratio + */ + Box CoarseBox (const Box& fine, int ratio) override; + + /** + * \brief Returns coarsened box given fine box and refinement ratio. + * + * \param fine + * \param ratio + */ + Box CoarseBox (const Box& fine, const IntVect& ratio) override; + + /** + * \brief Coarse to fine interpolation in space. + * + * \param crse + * \param crse_comp + * \param fine + * \param fine_comp + * \param ncomp + * \param fine_region + * \param ratio + * \param crse_geom + * \param fine_geom + * \param bcr + * \param actual_comp + * \param actual_state + */ + void interp (const FArrayBox& crse, + int crse_comp, + FArrayBox& fine, + int fine_comp, + int ncomp, + const Box& fine_region, + const IntVect& ratio, + const Geometry& crse_geom, + const Geometry& fine_geom, + Vector const& bcr, + int actual_comp, + int actual_state, + RunOn runon) override; + + /** + * \brief Coarse to fine interpolation in space for face-based data. + * + * \param crse + * \param crse_comp + * \param fine + * \param fine_comp + * \param ncomp + * \param fine_region + * \param ratio + * \param solve_mask + * \param crse_geom + * \param fine_geom + * \param bcr + * \param bccomp + * \param runon + */ + void interp_face (const FArrayBox& crse, + int crse_comp, + FArrayBox& fine, + int fine_comp, + int ncomp, + const Box& fine_region, + const IntVect& ratio, + const IArrayBox& solve_mask, + const Geometry& crse_geom, + const Geometry& fine_geom, + Vector const & bcr, + int bccomp, + RunOn runon) override; + + /** + * \brief Coarse to fine interpolation in space. + * + * \param crse + * \param crse_comp + * \param fine + * \param fine_comp + * \param ncomp + * \param fine_region + * \param ratio + * \param crse_geom + * \param fine_geom + * \param bcr + * \param actual_comp + * \param actual_state + */ + void interp_arr (Array const& crse, + int crse_comp, + Array const& fine, + int fine_comp, + int ncomp, + const Box& fine_region, + const IntVect& ratio, + Array const& solve_mask, + const Geometry& /*crse_geom*/, + const Geometry& /*fine_geom*/, + Vector > const& /*bcr*/, + int /*actual_comp*/, + int /*actual_state*/, + RunOn runon) override; + + }; /** @@ -789,7 +900,6 @@ public: * * Quartic interpolation on cell centered data. 
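Both the reworked `interp_face_reg` and the new `face_cons_linear_face_interp` build their tangential corrections from the same limited slope: a monotonized-central limiter that takes the smaller one-sided difference, caps it by the central difference, and drops to zero at extrema; the fine face then receives `crse + off * slope`, with the offset `off` in (-1/2, 1/2). A one-dimensional sketch of the net effect (not the library routine itself):

    #include <algorithm>
    #include <cmath>

    double limited_slope (double cm, double c0, double cp)
    {
        const double dc = 0.5 * (cp - cm); // central difference
        const double df = 2.0 * (cp - c0); // forward difference, doubled
        const double db = 2.0 * (c0 - cm); // backward difference, doubled
        double s = (df * db >= 0.0) ? std::min(std::abs(df), std::abs(db)) : 0.0;
        return std::copysign(1.0, dc) * std::min(s, std::abs(dc));
    }

    // For a fine face at tangential offset off inside its coarse face:
    //   fine = crse0 + off * limited_slope(crse_minus, crse0, crse_plus);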
*/ - class CellQuartic : public Interpolater @@ -847,6 +957,7 @@ extern AMREX_EXPORT PCInterp pc_interp; extern AMREX_EXPORT NodeBilinear node_bilinear_interp; extern AMREX_EXPORT FaceDivFree face_divfree_interp; extern AMREX_EXPORT FaceLinear face_linear_interp; +extern AMREX_EXPORT FaceConservativeLinear face_cons_linear_interp; extern AMREX_EXPORT CellConservativeLinear lincc_interp; extern AMREX_EXPORT CellConservativeLinear cell_cons_interp; extern AMREX_EXPORT CellBilinear cell_bilinear_interp; diff --git a/Src/AmrCore/AMReX_Interpolater.cpp b/Src/AmrCore/AMReX_Interpolater.cpp index 601b8b4b861..b5e855feb13 100644 --- a/Src/AmrCore/AMReX_Interpolater.cpp +++ b/Src/AmrCore/AMReX_Interpolater.cpp @@ -11,7 +11,7 @@ namespace amrex { /* - * PCInterp, NodeBilinear, FaceLinear, CellConservativeLinear, and + * PCInterp, NodeBilinear, FaceLinear, CellConservativeLinear and + * CellBilinear are supported for all dimensions on cpu and gpu. * * CellConservativeProtected only works in 2D and 3D on cpu and gpu @@ -23,6 +23,8 @@ namespace amrex { * * CellConservativeQuartic only works with ref ratio of 2 on cpu and gpu. * + * FaceConservativeLinear works in 2D and 3D on cpu and gpu. + * * FaceDivFree works in 2D and 3D on cpu and gpu. * The algorithm is restricted to ref ratio of 2. */ @@ -33,6 +35,7 @@ namespace amrex { PCInterp pc_interp; NodeBilinear node_bilinear_interp; FaceLinear face_linear_interp; +FaceConservativeLinear face_cons_linear_interp; FaceDivFree face_divfree_interp; CellConservativeLinear lincc_interp; CellConservativeLinear cell_cons_interp(false); @@ -142,7 +145,14 @@ FaceLinear::interp (const FArrayBox& crse, RunOn runon) { // - // This version is called from InterpFromCoarseLevel + // This version is called from FillPatchInterp which is called by + // InterpFromCoarseLevel in AMReX_FillPatchUtil_I.H + // + // It assumes no existing fine values that need to be preserved (unlike interp_face below) + // + // Inside each call to face_linear_interp_* (in AMReX_Interp_*D_C.H), we do: + // * on fine faces which overlie crse faces, the fine value is set to the crse value (piecewise constant) + // * on fine faces which are between two crse faces, the fine value is set to the average of the crse values (linear) // BL_PROFILE("FaceLinear::interp()"); @@ -193,6 +203,18 @@ FaceLinear::interp_face (const FArrayBox& crse, const int /*bccomp*/, RunOn runon) { + // + // This version is called from InterpFace which is called from the version FillPatchTwoLevels_doit + // that takes a single MF (in AMReX_FillPatchUtil_I.H) + // + // It assumes there are existing fine values which we want to preserve (unlike interp above) + // + // We do the interpolation in two steps: + // 1) face_linear_face_interp_*: on fine faces which overlie crse faces, the fine value is set to the crse value (piecewise constant) ONLY IF + // there is not already fine data there + // 2) face_linear_interp_*: on fine faces which are between two crse faces, the fine value is set to the average of the values + // on the faces overlying -- this uses only the results of step 1, it does not take the crse values + // BL_PROFILE("FaceLinear::interp_face()"); AMREX_ASSERT(AMREX_D_TERM(fine_region.type(0),+fine_region.type(1),+fine_region.type(2)) == 1); @@ -283,6 +305,17 @@ void FaceLinear::interp_arr (Array const& crse, const int /*actual_state*/, const RunOn runon) { + // + // This version is called from FillPatchTwoLevels_doit (that takes an Array of MF*) in AMReX_FillPatchUtil_I.H + // + // It assumes there are existing fine
values which we want to preserve (like face_interp, unlike interp above) + // + // We do the interpolation in two steps: + // 1) face_linear_face_interp_*: on fine faces which overlie crse faces, the fine value is set to the crse value (piecewise constant) ONLY IF + // there is not already fine data there + // 2) face_linear_interp_*: on fine faces which are between two crse faces, the fine value is set to the average of the values + // on the faces overlying -- this uses only the results of step 1, it does not take the crse values + // BL_PROFILE("FaceLinear::interp_arr()"); Array types; @@ -377,6 +410,307 @@ void FaceLinear::interp_arr (Array const& crse, }); } +Box +FaceConservativeLinear::CoarseBox (const Box& fine, int ratio) +{ + return CoarseBox(fine, IntVect(ratio)); +} + +Box +FaceConservativeLinear::CoarseBox (const Box& fine, const IntVect& ratio) +{ + IntVect ng(1); + for (int i = 0; i < AMREX_SPACEDIM; i++) { + if ( (fine.type(i) == IndexType::NODE) || (ratio[i] == 1) ) { + ng[i] = 0; + } + } + Box b = amrex::coarsen(fine,ratio); b.grow(ng); + + for (int i = 0; i < AMREX_SPACEDIM; i++) { + if (b.type(i) == IndexType::NODE) { + if (b.type(i) == IndexType::NODE && b.length(i) < 2) { + // Don't want degenerate boxes in nodal direction. + b.growHi(i,1); + } + } + } + return b; +} + +void +FaceConservativeLinear::interp (const FArrayBox& crse, + int crse_comp, + FArrayBox& fine, + int fine_comp, + int ncomp, + const Box& fine_region, + const IntVect& ratio, + const Geometry& crse_geom, + const Geometry& fine_geom, + Vector const& bcr, + int /*actual_comp*/, + int /*actual_state*/, + RunOn runon) +{ + // + // This version is called from FillPatchInterp which is called by + // InterpFromCoarseLevel in AMReX_FillPatchUtil_I.H + // + // It assumes no existing fine values that need to be preserved thus does not send a mask to interp_face + // + BL_PROFILE("FaceConservativeLinear::interp()"); + + AMREX_ASSERT(AMREX_D_TERM(fine_region.type(0),+fine_region.type(1),+fine_region.type(2)) == 1); + + // We intentionally do not allocate the mask so that all faces are filled from coarse values + IArrayBox dummy_mask; + int bccomp = 0; // This is also a dummy -- it's not used + interp_face(crse,crse_comp,fine,fine_comp,ncomp,fine_region,ratio,dummy_mask, + crse_geom,fine_geom,bcr,bccomp,runon); +} + +void +FaceConservativeLinear::interp_face (const FArrayBox& crse, + const int crse_comp, + FArrayBox& fine, + const int fine_comp, + const int ncomp, + const Box& fine_region, + const IntVect& ratio, + const IArrayBox& solve_mask, + const Geometry& crse_geom, + const Geometry& /*fine_geom */, + Vector const& /*bcr*/, + const int /*bccomp*/, + RunOn runon) +{ + // + // This version is called from InterpFace which is called from the version FillPatchTwoLevels_doit + // that takes a single MF (in AMReX_FillPatchUtil_I.H) + // + // It assumes there are existing fine values which we want to preserve (unlike interp above) + // + // We do the interpolation in two steps: + // 1) face_cons_linear_face_interp: on fine faces which overlie crse faces, slopes are computed (linear in 2d, bilinear in 3d) + // and the fine value is over-written ONLY IF there is not already fine data there (assuming the mask is used) + // 2) face_linear_interp_*: on fine faces which are between two crse faces, the fine value is set to the average of the values + // on the faces overlying -- this uses only the results of step 1 + // NOTE: we use the same routines as used by FaceLinear since this interpolation is only in the normal 
direction + // + BL_PROFILE("FaceConservativeLinear::interp_face()"); + + AMREX_ASSERT(AMREX_D_TERM(fine_region.type(0),+fine_region.type(1),+fine_region.type(2)) == 1); + Array4 const& fine_arr = fine.array(fine_comp); + Array4 const& crse_arr = crse.const_array(crse_comp); + Array4 mask_arr; + if (solve_mask.isAllocated()) { + mask_arr = solve_mask.const_array(); + } + + // We don't need to worry about face-based domain because this is only used in the tangential interpolation + Box per_grown_domain = crse_geom.Domain(); + for (int dim = 0; dim < AMREX_SPACEDIM; dim++) { + if (crse_geom.isPeriodic(dim)) { + per_grown_domain.grow(dim,1); + } + } + + // + // Fill fine ghost faces with interpolation of coarse data that is conservative linear + // in the tangential direction. + // Operate only on faces that overlap--ie, only fill the fine faces that make up each + // coarse face, leave the in-between faces alone. + // The mask ensures we do not overwrite valid fine cells. + // + if (fine_region.type(0) == IndexType::NODE) + { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon,fine_region,ncomp,i,j,k,n, + { + face_cons_linear_face_interp(i,j,k,n,fine_arr,crse_arr,mask_arr,ratio,per_grown_domain,0); + }); + } +#if (AMREX_SPACEDIM >= 2) + else if (fine_region.type(1) == IndexType::NODE) + { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon,fine_region,ncomp,i,j,k,n, + { + face_cons_linear_face_interp(i,j,k,n,fine_arr,crse_arr,mask_arr,ratio,per_grown_domain,1); + }); + } +#if (AMREX_SPACEDIM == 3) + else + { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon,fine_region,ncomp,i,j,k,n, + { + face_cons_linear_face_interp(i,j,k,n,fine_arr,crse_arr,mask_arr,ratio,per_grown_domain,2); + }); + } +#endif +#endif + + // + // Interpolate unfilled grow cells using best data from + // surrounding faces of valid region, and pc-interpd data + // on fine faces overlaying coarse edges. 
+ // + if (fine_region.type(0) == IndexType::NODE) + { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon,fine_region,ncomp,i,j,k,n, + { + face_linear_interp_x(i,j,k,n,fine_arr,ratio); + }); + } +#if (AMREX_SPACEDIM >= 2) + else if (fine_region.type(1) == IndexType::NODE) + { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon,fine_region,ncomp,i,j,k,n, + { + face_linear_interp_y(i,j,k,n,fine_arr,ratio); + }); + } +#if (AMREX_SPACEDIM == 3) + else + { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon,fine_region,ncomp,i,j,k,n, + { + face_linear_interp_z(i,j,k,n,fine_arr,ratio); + }); + } +#endif +#endif +} + +void FaceConservativeLinear::interp_arr (Array const& crse, + const int crse_comp, + Array const& fine, + const int fine_comp, + const int ncomp, + const Box& fine_region, + const IntVect& ratio, + Array const& solve_mask, + const Geometry& crse_geom, + const Geometry& /*fine_geom*/, + Vector > const& /*bcr*/, + const int /*actual_comp*/, + const int /*actual_state*/, + const RunOn runon) +{ + // + // This version is called from FillPatchTwoLevels_doit (that takes an Array of MF*) in AMReX_FillPatchUtil_I.H + // + // It assumes there are existing fine values which we want to preserve (like face_interp, unlike interp above) + // + // We do the interpolation in two steps: + // 1) face_cons_linear_face_interp_*: on fine faces which overlie crse faces, we compute tangential slopes + // to compute the fine values (linear in 2d, bilinear in 3d) ONLY IF there is not already fine data there + // 2) face_cons_linear_interp_*: on fine faces which are between two crse faces, the fine value is set to the average of the values + // on the faces overlying -- this uses only the results of step 1, it does not take the crse values + // NOTE: here we use the same routines as used by FaceLinear since this interpolation is only in the normal direction + // + BL_PROFILE("FaceConservativeLinear::interp_arr()"); + + Array types; + for (int d=0; d, AMREX_SPACEDIM> crse_arr; + GpuArray, AMREX_SPACEDIM> fine_arr; + GpuArray, AMREX_SPACEDIM> mask_arr; + for (int d=0; dconst_array(crse_comp); + fine_arr[d] = fine[d]->array(fine_comp); + if (solve_mask[d] != nullptr) + { mask_arr[d] = solve_mask[d]->const_array(0); } + } + + // We don't need to worry about face-based domain because this is only used in the tangential interpolation + Box per_grown_domain = crse_geom.Domain(); + for (int dim = 0; dim < AMREX_SPACEDIM; dim++) { + if (crse_geom.isPeriodic(dim)) { + per_grown_domain.grow(dim,1); + } + } + + // + // Fill fine ghost faces with interpolation of coarse data that is conservative linear + // in the tangential direction. + // Operate only on faces that overlap--ie, only fill the fine faces that make up each + // coarse face, leave the in-between faces alone. + // The mask ensures we do not overwrite valid fine cells. + // + // Fuse the launches, 1 for each dimension, into a single launch. + AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM_FLAG(runon, + amrex::convert(fine_region,types[0]), bx0, + { + AMREX_LOOP_3D(bx0, i, j, k, + { + for (int n=0; n @@ -145,7 +144,6 @@ public: * * A container class for TagBoxes. 
*/ - class TagBoxArray : public FabArray diff --git a/Src/AmrCore/Make.package b/Src/AmrCore/Make.package index df3c2e83d40..bd0bddd6704 100644 --- a/Src/AmrCore/Make.package +++ b/Src/AmrCore/Make.package @@ -1,5 +1,6 @@ -CEXE_headers += AMReX_AmrCore.H AMReX_Cluster.H AMReX_ErrorList.H AMReX_FillPatchUtil.H AMReX_FillPatchUtil_I.H AMReX_FluxRegister.H \ +CEXE_headers += AMReX_AmrCore.H AMReX_Cluster.H AMReX_ErrorList.H AMReX_FillPatchUtil.H \ + AMReX_FillPatchUtil_I.H AMReX_FluxRegister.H \ AMReX_Interpolater.H AMReX_MFInterpolater.H AMReX_TagBox.H AMReX_AmrMesh.H \ AMReX_InterpBase.H CEXE_sources += AMReX_AmrCore.cpp AMReX_Cluster.cpp AMReX_ErrorList.cpp AMReX_FillPatchUtil.cpp AMReX_FluxRegister.cpp \ diff --git a/Src/Base/AMReX.H b/Src/Base/AMReX.H index c539a1d8e75..2b88553bcdf 100644 --- a/Src/Base/AMReX.H +++ b/Src/Base/AMReX.H @@ -113,16 +113,15 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Error (const char* msg = nullptr) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(msg); + AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);)) #else - if (msg) { AMREX_DEVICE_PRINTF("Error %s\n", msg); } - AMREX_DEVICE_ASSERT(0); -#endif -#else - Error_host("Error", msg); + AMREX_IF_ON_DEVICE(( + if (msg) { AMREX_DEVICE_PRINTF("Error %s\n", msg); } + AMREX_DEVICE_ASSERT(0); + )) #endif + AMREX_IF_ON_HOST((Error_host("Error", msg);)) } //! Print out warning message to cerr. @@ -132,15 +131,12 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Warning (const char * msg) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(msg); -#else - if (msg) { AMREX_DEVICE_PRINTF("Warning %s\n", msg); } -#endif + AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);)) #else - Warning_host(msg); + AMREX_IF_ON_DEVICE((if (msg) { AMREX_DEVICE_PRINTF("Warning %s\n", msg); })) #endif + AMREX_IF_ON_HOST((Warning_host(msg);)) } //! Print out message to cerr and exit via abort(). 
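The AMReX.H hunks here replace the old `#if AMREX_DEVICE_COMPILE` preprocessor split with the AMREX_IF_ON_DEVICE/AMREX_IF_ON_HOST function-style macros, so the NDEBUG conditional no longer has to nest a second preprocessor branch. A minimal sketch of the pattern, assuming an amrex translation unit; the function itself is hypothetical, only the macros are from this patch:

    #include <sstream>

    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void ReportOutOfRange (int i, int n)
    {
        // Device pass: printf-style diagnostics only.
        AMREX_IF_ON_DEVICE((
            AMREX_DEVICE_PRINTF("index %d is out of bound (0:%d)\n", i, n-1);
        ))
        // Host pass: the usual stream-based error path.
        AMREX_IF_ON_HOST((
            std::stringstream ss;
            ss << "index " << i << " is out of bound (0:" << n-1 << ")";
            amrex::Abort(ss.str());
        ))
    }

Both branches are compiled in a single pass; the macro discards the body on the inactive side, and the double parentheses let a body containing commas pass through as one macro argument.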
@@ -148,16 +144,15 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Abort (const char * msg = nullptr) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(msg); + AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);)) #else - if (msg) { AMREX_DEVICE_PRINTF("Abort %s\n", msg); } - AMREX_DEVICE_ASSERT(0); -#endif -#else - Error_host("Abort", msg); + AMREX_IF_ON_DEVICE(( + if (msg) { AMREX_DEVICE_PRINTF("Abort %s\n", msg); } + AMREX_DEVICE_ASSERT(0); + )) #endif + AMREX_IF_ON_HOST((Error_host("Abort", msg);)) } /** @@ -170,22 +165,21 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Assert (const char* EX, const char* file, int line, const char* msg = nullptr) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(EX,file,line,msg); -#else - if (msg) { - AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d, Msg: %s", - EX, file, line, msg); - } else { - AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d", - EX, file, line); - } - AMREX_DEVICE_ASSERT(0); -#endif + AMREX_IF_ON_DEVICE((amrex::ignore_unused(EX,file,line,msg);)) #else - Assert_host(EX,file,line,msg); + AMREX_IF_ON_DEVICE(( + if (msg) { + AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d, Msg: %s", + EX, file, line, msg); + } else { + AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d", + EX, file, line); + } + AMREX_DEVICE_ASSERT(0); + )) #endif + AMREX_IF_ON_HOST((Assert_host(EX,file,line,msg);)) } /** diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp index f0e56952de2..2d6e7626c80 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -52,6 +52,7 @@ #endif #ifdef AMREX_USE_OMP +#include #include #endif @@ -72,7 +73,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -443,6 +446,10 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, pp.queryAdd("verbose", system::verbose); } + if (system::verbose > 0) { + amrex::Print() << "Initializing AMReX (" << amrex::Version() << ")...\n"; + } + #ifdef AMREX_USE_MPI if (system::verbose > 0) { amrex::Print() << "MPI initialized with " @@ -450,20 +457,22 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, << " MPI processes\n"; int provided = -1; MPI_Query_thread(&provided); - amrex::Print() << "MPI initialized with thread support level " << provided << std::endl; + amrex::Print() << "MPI initialized with thread support level " << provided << '\n'; } #endif #ifdef AMREX_USE_OMP + amrex::OpenMP::Initialize(); + + // status output if (system::verbose > 0) { // static_assert(_OPENMP >= 201107, "OpenMP >= 3.1 is required."); amrex::Print() << "OMP initialized with " << omp_get_max_threads() << " OMP threads\n"; } -#endif -#if defined(AMREX_USE_MPI) && defined(AMREX_USE_OMP) + // warn if over-subscription is detected if (system::verbose > 0) { auto ncores = int(std::thread::hardware_concurrency()); if (ncores != 0 && // It might be zero according to the C++ standard. 
@@ -472,8 +481,10 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, amrex::Print(amrex::ErrorStream()) << "AMReX Warning: You might be oversubscribing CPU cores with OMP threads.\n" << " There are " << ncores << " cores per node.\n" - << " There are " << ParallelDescriptor::NProcsPerNode() << " MPI ranks per node.\n" - << " But OMP is initialized with " << omp_get_max_threads() << " threads per rank.\n" +#if defined(AMREX_USE_MPI) + << " There are " << ParallelDescriptor::NProcsPerNode() << " MPI ranks (processes) per node.\n" +#endif + << " But OMP is initialized with " << omp_get_max_threads() << " threads per process.\n" << " You should consider setting OMP_NUM_THREADS=" << ncores/ParallelDescriptor::NProcsPerNode() << " or less in the environment.\n"; } @@ -677,7 +688,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, #endif if (system::verbose > 0) { - amrex::Print() << "AMReX (" << amrex::Version() << ") initialized" << std::endl; + amrex::Print() << "AMReX (" << amrex::Version() << ") initialized" << '\n'; } BL_TINY_PROFILE_INITIALIZE(); @@ -748,7 +759,7 @@ amrex::Finalize (amrex::AMReX* pamrex) << "min used in a thread: " << mp_min << " MB, " << "max used in a thread: " << mp_max << " MB, " #endif - << "tot used: " << mp_tot << " MB." << std::endl; + << "tot used: " << mp_tot << " MB." << '\n'; } } else { int global_max = mp_tot; @@ -806,6 +817,10 @@ amrex::Finalize (amrex::AMReX* pamrex) Gpu::Device::Finalize(); #endif +#ifdef AMREX_USE_OMP + amrex::OpenMP::Finalize(); +#endif + #if defined(AMREX_USE_UPCXX) upcxx::finalize(); #endif @@ -823,7 +838,7 @@ amrex::Finalize (amrex::AMReX* pamrex) #endif if (amrex::system::verbose > 0 && is_ioproc) { - amrex::OutStream() << "AMReX (" << amrex::Version() << ") finalized" << std::endl; + amrex::OutStream() << "AMReX (" << amrex::Version() << ") finalized" << '\n'; } } diff --git a/Src/Base/AMReX_Algorithm.H b/Src/Base/AMReX_Algorithm.H index b418f3cc1c0..666bd04e230 100644 --- a/Src/Base/AMReX_Algorithm.H +++ b/Src/Base/AMReX_Algorithm.H @@ -89,7 +89,7 @@ namespace amrex // Reference: https://en.cppreference.com/w/cpp/types/numeric_limits/epsilon template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - typename std::enable_if::value,bool>::type + std::enable_if_t,bool> almostEqual (T x, T y, int ulp = 2) { // the machine epsilon has to be scaled to the magnitude of the values used @@ -100,7 +100,7 @@ namespace amrex } template ::value,int>::type FOO = 0> + std::enable_if_t,int>FOO = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T bisect (T lo, T hi, F f, T tol=1e-12, int max_iter=100) { @@ -141,7 +141,7 @@ namespace amrex // It is assumed that the input data are sorted and T[lo] <= v < T[hi]. // Note that this is different from std::lower_bound. 
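Since the contract spelled out in the comment above differs from std::lower_bound, a short usage sketch may help (made-up data; assumes amrex headers): for sorted interval edges d, amrex::bisect returns the index i of the interval with d[i] <= v < d[i+1], whereas std::lower_bound would return an iterator to the first element not less than v.

    #include <AMReX_Algorithm.H>
    #include <cassert>

    void bisect_demo ()
    {
        // Five edges bound four intervals; v = 3.0 lies in interval 2
        // because d[2] = 2 <= 3 < 4 = d[3].
        double d[] = {0.0, 1.0, 2.0, 4.0, 8.0};
        int i = amrex::bisect(d, 0, 4, 3.0);
        assert(i == 2);
    }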
template ::value,int>::type = 0> + std::enable_if_t,int> = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE I bisect (T const* d, I lo, I hi, T const& v) { while (lo <= hi) { @@ -161,51 +161,89 @@ namespace amrex AMREX_GPU_HOST_DEVICE ItType upper_bound (ItType first, ItType last, const ValType& val) { -#if AMREX_DEVICE_COMPILE - std::ptrdiff_t count = last-first; - while(count>0){ - auto it = first; - const auto step = count/2; - it += step; - if (!(val < *it)){ - first = ++it; - count -= step + 1; + AMREX_IF_ON_DEVICE(( + std::ptrdiff_t count = last-first; + while(count>0){ + auto it = first; + const auto step = count/2; + it += step; + if (!(val < *it)){ + first = ++it; + count -= step + 1; + } + else{ + count = step; + } } - else{ - count = step; - } - } - - return first; -#else - return std::upper_bound(first, last, val); -#endif + return first; + )) + AMREX_IF_ON_HOST(( + return std::upper_bound(first, last, val); + )) } template AMREX_GPU_HOST_DEVICE ItType lower_bound (ItType first, ItType last, const ValType& val) { -#ifdef AMREX_DEVICE_COMPILE - std::ptrdiff_t count = last-first; - while(count>0) - { - auto it = first; - const auto step = count/2; - it += step; - if (*it < val){ - first = ++it; - count -= step + 1; + AMREX_IF_ON_DEVICE(( + std::ptrdiff_t count = last-first; + while(count>0) + { + auto it = first; + const auto step = count/2; + it += step; + if (*it < val){ + first = ++it; + count -= step + 1; + } + else{ + count = step; + } } - else{ - count = step; + + return first; + )) + AMREX_IF_ON_HOST(( + return std::lower_bound(first, last, val); + )) + } + + template::value_type> && + std::is_floating_point_v, + int> = 0> + AMREX_GPU_HOST_DEVICE + void linspace (ItType first, const ItType& last, const ValType& start, const ValType& stop) + { + const std::ptrdiff_t count = last-first; + if (count >= 2){ + const auto delta = (stop - start)/(count - 1); + for (std::ptrdiff_t i = 0; i < count-1; ++i){ + *(first++) = start + i*delta; } + *first = stop; } + } - return first; -#else - return std::lower_bound(first, last, val); -#endif + template::value_type> && + std::is_floating_point_v, + int> = 0> + AMREX_GPU_HOST_DEVICE + void logspace (ItType first, const ItType& last, + const ValType& start, const ValType& stop, const ValType& base) + { + const std::ptrdiff_t count = last-first; + if (count >= 2){ + const auto delta = (stop - start)/(count - 1); + for (std::ptrdiff_t i = 0; i < count-1; ++i){ + *(first++) = std::pow(base, start + i*delta); + } + *first = std::pow(base, stop); + } } namespace detail { @@ -218,104 +256,125 @@ struct clz_tag : clzl_tag {}; // unsigned long, and unsigned long long inputs. Because the sizes of these data types // vary on different platforms, we work with fixed-width integer types. 
// these tags and overloads select the smallest version of __builtin_clz that will hold the input type -template ::type> +template > AMREX_FORCE_INLINE int builtin_clz_wrapper (clz_tag, T x) noexcept { return static_cast(__builtin_clz(x) - (sizeof(unsigned int) * CHAR_BIT - sizeof(T) * CHAR_BIT)); } -template ::type> +template > AMREX_FORCE_INLINE int builtin_clz_wrapper (clzl_tag, T x) noexcept { return static_cast(__builtin_clzl(x) - (sizeof(unsigned long) * CHAR_BIT - sizeof(T) * CHAR_BIT)); } -template ::type> +template > AMREX_FORCE_INLINE int builtin_clz_wrapper (clzll_tag, T x) noexcept { return static_cast(__builtin_clzll(x) - (sizeof(unsigned long long) * CHAR_BIT - sizeof(T) * CHAR_BIT)); } -#ifdef AMREX_USE_CUDA - -// likewise with CUDA, there are __clz functions that take (signed) int and long long int -template ::type> -AMREX_GPU_DEVICE AMREX_FORCE_INLINE -int clz_wrapper (clz_tag, T x) noexcept -{ - return __clz((int) x) - (sizeof(int) * CHAR_BIT - sizeof(T) * CHAR_BIT); -} - -template ::type> -AMREX_GPU_DEVICE AMREX_FORCE_INLINE -int clz_wrapper (clzll_tag, T x) noexcept -{ - return __clzll((long long int) x) - (sizeof(long long int) * CHAR_BIT - sizeof(T) * CHAR_BIT); } -#endif -} +template ,std::uint8_t> || + std::is_same_v,std::uint16_t> || + std::is_same_v,std::uint32_t> || + std::is_same_v,std::uint64_t>, int> = 0> +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +int clz (T x) noexcept; AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint8_t x) noexcept +int clz_generic (std::uint8_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else +#if !defined(__NVCOMPILER) static constexpr int clz_lookup[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; +#else + constexpr int clz_lookup[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; +#endif auto upper = x >> 4; auto lower = x & 0xF; return upper ? clz_lookup[upper] : 4 + clz_lookup[lower]; -#endif } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint16_t x) noexcept +int clz_generic (std::uint16_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else auto upper = std::uint8_t(x >> 8); auto lower = std::uint8_t(x & 0xFF); return upper ? clz(upper) : 8 + clz(lower); -#endif } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint32_t x) noexcept +int clz_generic (std::uint32_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else auto upper = std::uint16_t(x >> 16); auto lower = std::uint16_t(x & 0xFFFF); return upper ? 
clz(upper) : 16 + clz(lower); -#endif } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint64_t x) noexcept +int clz_generic (std::uint64_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else auto upper = std::uint32_t(x >> 32); auto lower = std::uint32_t(x & 0xFFFFFFFF); return upper ? clz(upper) : 32 + clz(lower); +} + +#if defined AMREX_USE_CUDA + +namespace detail { + // likewise with CUDA, there are __clz functions that take (signed) int and long long int + template > + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + int clz_wrapper (clz_tag, T x) noexcept + { + return __clz((int) x) - (sizeof(int) * CHAR_BIT - sizeof(T) * CHAR_BIT); + } + + template > + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + int clz_wrapper (clzll_tag, T x) noexcept + { + return __clzll((long long int) x) - (sizeof(long long int) * CHAR_BIT - sizeof(T) * CHAR_BIT); + } +} + +template ,std::uint8_t> || + std::is_same_v,std::uint16_t> || + std::is_same_v,std::uint32_t> || + std::is_same_v,std::uint64_t>, int> > +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +int clz (T x) noexcept +{ + AMREX_IF_ON_DEVICE((return detail::clz_wrapper(detail::clz_tag{}, x);)) +#if AMREX_HAS_BUILTIN_CLZ + AMREX_IF_ON_HOST((return detail::builtin_clz_wrapper(detail::clz_tag{}, x);)) +#else + AMREX_IF_ON_HOST((return clz_generic(x);)) #endif } +#else // !defined AMREX_USE_CUDA + +template ,std::uint8_t> || + std::is_same_v,std::uint16_t> || + std::is_same_v,std::uint32_t> || + std::is_same_v,std::uint64_t>, int> > +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +int clz (T x) noexcept +{ +#if (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) + return detail::builtin_clz_wrapper(detail::clz_tag{}, x); +#else + return clz_generic(x); +#endif +} + +#endif // defined AMREX_USE_CUDA + } #endif diff --git a/Src/Base/AMReX_Any.H b/Src/Base/AMReX_Any.H index 2b9edc084b7..87196c9bf4d 100644 --- a/Src/Base/AMReX_Any.H +++ b/Src/Base/AMReX_Any.H @@ -88,7 +88,7 @@ private: struct innards final : innards_base // NOLINT(cppcoreguidelines-special-member-functions) { innards (MF && mf) - : m_mf(std::forward(mf)) + : m_mf(std::move(mf)) {} ~innards () final = default; diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H index e42ebdc9cd2..2a6cbb25a08 100644 --- a/Src/Base/AMReX_Arena.H +++ b/Src/Base/AMReX_Arena.H @@ -22,7 +22,9 @@ inline std::size_t aligned_size (std::size_t align_requirement, std::size_t size inline bool is_aligned (const void* p, std::size_t alignment) noexcept { - return (reinterpret_cast(p) % alignment) == 0; + auto* q = const_cast(p); + auto space = alignment; + return std::align(alignment, alignment, q, space) == p; } class Arena; @@ -82,7 +84,6 @@ struct ArenaInfo * A virtual base class for objects that manage their own dynamic * memory allocation. */ - class Arena { public: @@ -157,6 +158,11 @@ public: */ virtual void registerForProfiling (const std::string& memory_name); +#ifdef AMREX_USE_GPU + //! Is this GPU stream ordered memory allocator? 
+ [[nodiscard]] virtual bool isStreamOrderedArena () const { return false; } +#endif + /** * \brief Given a minimum required arena size of sz bytes, this returns * the next largest arena size that will align to align_size bytes diff --git a/Src/Base/AMReX_Array.H b/Src/Base/AMReX_Array.H index aaf8298ee99..525133cde87 100644 --- a/Src/Base/AMReX_Array.H +++ b/Src/Base/AMReX_Array.H @@ -362,7 +362,7 @@ namespace amrex { * (the index \c i moves the fastest) */ template ::value,int>::type=0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j) const noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); @@ -376,7 +376,7 @@ namespace amrex { * (the index \c i moves the fastest) */ template ::value,int>::type=0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j) noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); @@ -390,7 +390,7 @@ namespace amrex { * is used (the index \c j moves the fastest). */ template ::value,int>::type=0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j) const noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); @@ -404,7 +404,7 @@ namespace amrex { * is used (the index \c j moves the fastest). */ template ::value,int>::type=0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j) noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); @@ -654,7 +654,7 @@ namespace amrex { * (the index \c i moves the fastest) */ template ::value,int>::type=0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j, int k) const noexcept { return arr[i+j*(XHI-XLO+1)+k*((XHI-XLO+1)*(YHI-YLO+1)) @@ -668,7 +668,7 @@ namespace amrex { * (the index \c i moves the fastest) */ template ::value,int>::type=0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j, int k) noexcept { return arr[i+j*(XHI-XLO+1)+k*((XHI-XLO+1)*(YHI-YLO+1)) @@ -682,7 +682,7 @@ namespace amrex { * is used (the index \c k moves the fastest). */ template ::value,int>::type=0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j, int k) const noexcept { return arr[k+j*(ZHI-ZLO+1)+i*((ZHI-ZLO+1)*(YHI-YLO+1)) @@ -696,7 +696,7 @@ namespace amrex { * is used (the index \c k moves the fastest). 
*/ template ::value,int>::type=0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j, int k) noexcept { return arr[k+j*(ZHI-ZLO+1)+i*((ZHI-ZLO+1)*(YHI-YLO+1)) diff --git a/Src/Base/AMReX_Array4.H b/Src/Base/AMReX_Array4.H index b2ff0fcb549..fc27ca77b7a 100644 --- a/Src/Base/AMReX_Array4.H +++ b/Src/Base/AMReX_Array4.H @@ -24,9 +24,9 @@ namespace amrex { {} template ::value,int> = 0> + std::enable_if_t,int> = 0> AMREX_GPU_HOST_DEVICE - constexpr CellData (CellData::type> const& rhs) noexcept + constexpr CellData (CellData> const& rhs) noexcept : p(rhs.p), stride(rhs.stride), ncomp(rhs.ncomp) {} @@ -37,18 +37,19 @@ namespace amrex { int nComp() const noexcept { return ncomp; } template ::value,int> = 0> + std::enable_if_t,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator[] (int n) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) if (n < 0 || n >= ncomp) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" %d is out of bound (0:%d)", n, ncomp-1); -#else - std::stringstream ss; - ss << " " << n << " is out of bound: (0:" << ncomp-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" %d is out of bound (0:%d)", n, ncomp-1); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " " << n << " is out of bound: (0:" << ncomp-1 << ")"; + amrex::Abort(ss.str()); + )) } #endif return p[n*stride]; @@ -69,9 +70,9 @@ namespace amrex { AMREX_GPU_HOST_DEVICE constexpr Array4 () noexcept : p(nullptr) {} - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE - constexpr Array4 (Array4::type> const& rhs) noexcept + constexpr Array4 (Array4> const& rhs) noexcept : p(rhs.p), jstride(rhs.jstride), kstride(rhs.kstride), @@ -93,9 +94,9 @@ namespace amrex { {} template ::type, - typename std::remove_const::type>::value,int>::type = 0> + std::enable_if_t + , + std::remove_const_t>,int> = 0> AMREX_GPU_HOST_DEVICE constexpr Array4 (Array4 const& rhs, int start_comp) noexcept : p((T*)(rhs.p+start_comp*rhs.nstride)), @@ -108,9 +109,9 @@ namespace amrex { {} template ::type, - typename std::remove_const::type>::value,int>::type = 0> + std::enable_if_t + , + std::remove_const_t>,int> = 0> AMREX_GPU_HOST_DEVICE constexpr Array4 (Array4 const& rhs, int start_comp, int num_comps) noexcept : p((T*)(rhs.p+start_comp*rhs.nstride)), @@ -125,7 +126,7 @@ namespace amrex { AMREX_GPU_HOST_DEVICE explicit operator bool() const noexcept { return p != nullptr; } - template ::value,int>::type = 0> + template ,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (int i, int j, int k) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -134,7 +135,7 @@ namespace amrex { return p[(i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride]; } - template ::value,int>::type = 0> + template ,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (int i, int j, int k, int n) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -143,7 +144,7 @@ namespace amrex { return p[(i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride+n*nstride]; } - template ::value,int>::type = 0> + template ,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T* ptr (int i, int j, int k) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -152,7 +153,7 @@ namespace amrex { return p + ((i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride); } - template ::value,int>::type = 0> + 
template ,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T* ptr (int i, int j, int k, int n) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -161,7 +162,7 @@ namespace amrex { return p + ((i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride+n*nstride); } - template ::value,int>::type = 0> + template ,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (IntVect const& iv) const noexcept { #if (AMREX_SPACEDIM == 1) @@ -173,7 +174,7 @@ namespace amrex { #endif } - template ::value,int>::type = 0> + template ,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (IntVect const& iv, int n) const noexcept { #if (AMREX_SPACEDIM == 1) @@ -185,7 +186,7 @@ namespace amrex { #endif } - template ::value,int>::type = 0> + template ,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T* ptr (IntVect const& iv) const noexcept { #if (AMREX_SPACEDIM == 1) @@ -197,7 +198,7 @@ namespace amrex { #endif } - template ::value,int>::type = 0> + template ,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T* ptr (IntVect const& iv, int n) const noexcept { #if (AMREX_SPACEDIM == 1) @@ -233,21 +234,22 @@ namespace amrex { { if (i=end.x || j=end.y || k=end.z || n < 0 || n >= ncomp) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,0:%d)\n", - i, j, k, n, begin.x, end.x-1, begin.y, end.y-1, - begin.z, end.z-1, ncomp-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << "," << k << "," << n - << ") is out of bound (" - << begin.x << ":" << end.x-1 << "," - << begin.y << ":" << end.y-1 << "," - << begin.z << ":" << end.z-1 << "," - << "0:" << ncomp-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,0:%d)\n", + i, j, k, n, begin.x, end.x-1, begin.y, end.y-1, + begin.z, end.z-1, ncomp-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << "," << k << "," << n + << ") is out of bound (" + << begin.x << ":" << end.x-1 << "," + << begin.y << ":" << end.y-1 << "," + << begin.z << ":" << end.z-1 << "," + << "0:" << ncomp-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -298,7 +300,7 @@ namespace amrex { template struct HasMultiComp : std::false_type {}; // template - struct HasMultiComp= 1>::type> + struct HasMultiComp= 1>> : std::true_type {}; // @@ -319,14 +321,14 @@ namespace amrex { return this->Array4::operator()(i,j,k); } - template ::value,int>::type = 0> + template ::value,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE typename U::reference_type operator() (int i, int j, int k, int n) const noexcept { return this->Array4::operator()(i,j,k,0)[n]; } - template ::value,int>::type = 0> + template ::value,int> = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (int i, int j, int k, int n) const noexcept { return this->Array4::operator()(i,j,k,n); diff --git a/Src/Base/AMReX_ArrayLim.H b/Src/Base/AMReX_ArrayLim.H index 2654f511a9b..7b32363d7fc 100644 --- a/Src/Base/AMReX_ArrayLim.H +++ b/Src/Base/AMReX_ArrayLim.H @@ -8,37 +8,6 @@ #include -/* -** C++ stuff ... 
-*/ - -#ifndef AMREX_XSDK - -#if AMREX_SPACEDIM==1 -#define ARLIM_P(x) const int& -#define ARLIM(x) (x)[0] -#define ARLIM_3D(x) amrex::GpuArray{(x)[0], 0, 0}.data() -#define ARLIM_ANYD(x) amrex::GpuArray{(x)[0], 0, 0}.data() -#define ZFILL(x) amrex::GpuArray{(x)[0], 0., 0.}.data() -#define AMREX_REAL_ANYD(x) AMREX_ZFILL(x) -#elif AMREX_SPACEDIM==2 -#define ARLIM_P(x) const int&,const int& -#define ARLIM(x) (x)[0],(x)[1] -#define ARLIM_3D(x) amrex::GpuArray{(x)[0], (x)[1], 0}.data() -#define ARLIM_ANYD(x) amrex::GpuArray{(x)[0], (x)[1], 0}.data() -#define ZFILL(x) amrex::GpuArray{(x)[0], (x)[1], 0.}.data() -#define AMREX_REAL_ANYD(x) AMREX_ZFILL(x) -#elif AMREX_SPACEDIM==3 -#define ARLIM_P(x) const int&,const int&,const int& -#define ARLIM(x) (x)[0],(x)[1],(x)[2] -#define ARLIM_3D(x) x -#define ARLIM_ANYD(x) x -#define ZFILL(x) x -#define AMREX_REAL_ANYD(x) AMREX_ZFILL(x) -#endif - -#endif /* ndef AMREX_XSDK */ - #if AMREX_SPACEDIM==1 #define AMREX_ARLIM_P(x) const int& #define AMREX_ARLIM(x) (x)[0] @@ -81,133 +50,6 @@ #define BL_TO_FORTRAN_BOX(x) AMREX_ARLIM_3D((x).loVect()), AMREX_ARLIM_3D((x).hiVect()) -#else - -#if !defined(BL_LANG_FORT) - -#ifndef AMREX_XSDK - -/* C stuff */ -#if AMREX_SPACEDIM == 1 -#define ARLIM_P(x) const int* -#elif AMREX_SPACEDIM == 2 -#define ARLIM_P(x) const int*, const int* -#else -#define ARLIM_P(x) const int*, const int*, const int* -#endif - -#endif /* ndef AMREX_XSDK */ - -#if AMREX_SPACEDIM == 1 -#define AMREX_ARLIM_P(x) const int* -#elif AMREX_SPACEDIM == 2 -#define AMREX_ARLIM_P(x) const int*, const int* -#else -#define AMREX_ARLIM_P(x) const int*, const int*, const int* -#endif - -#else - -#ifndef AMREX_XSDK - -/* -** Fortran stuff ... -*/ - -#if __STDC__==1 || defined(__INTEL_COMPILER) - -#if (AMREX_SPACEDIM == 1) -#define DIMS(a) a##_l1, a##_h1 -#define DIMDEC(a) a##_l1, a##_h1 -#define DIMV(a) a##_l1:a##_h1 -#define DIM1(a) a##_l1:a##_h1 -#define ARG_L1(a) a##_l1 -#define ARG_H1(a) a##_h1 -#define DIMARG(a) a##(1) -#endif - -#if (AMREX_SPACEDIM == 2) -#define DIMS(a) a##_l1, a##_l2, a##_h1, a##_h2 -#define DIMDEC(a) a##_l1, a##_l2, a##_h1, a##_h2 -#define DIMV(a) a##_l1:a##_h1, a##_l2:a##_h2 -#define DIM1(a) a##_l1:a##_h1 -#define DIM2(a) a##_l2:a##_h2 -#define ARG_L1(a) a##_l1 -#define ARG_L2(a) a##_l2 -#define ARG_H1(a) a##_h1 -#define ARG_H2(a) a##_h2 -#define DIMARG(a) a##(1),a##(2) -#endif - -#if (AMREX_SPACEDIM == 3) -#define DIMS(a) a##_l1, a##_l2, a##_l3, a##_h1, a##_h2, a##_h3 -#define DIMDEC(a) a##_l1, a##_l2, a##_l3, a##_h1, a##_h2, a##_h3 -#define DIMV(a) a##_l1:a##_h1, a##_l2:a##_h2, a##_l3:a##_h3 -#define DIM1(a) a##_l1:a##_h1 -#define DIM2(a) a##_l2:a##_h2 -#define DIM3(a) a##_l3:a##_h3 -#define DIM12(a) a##_l1:a##_h1, a##_l2:a##_h2 -#define DIM23(a) a##_l2:a##_h2, a##_l3:a##_h3 -#define DIM13(a) a##_l1:a##_h1, a##_l3:a##_h3 -#define ARG_L1(a) a##_l1 -#define ARG_L2(a) a##_l2 -#define ARG_L3(a) a##_l3 -#define ARG_H1(a) a##_h1 -#define ARG_H2(a) a##_h2 -#define ARG_H3(a) a##_h3 -#define DIMARG(a) a##(1),a##(2),a##(3) -#endif - -#else - -#if (AMREX_SPACEDIM == 1) -#define DIMS(a) a/**/_l1, a/**/_h1 -#define DIMDEC(a) a/**/_l1, a/**/_h1 -#define DIMV(a) a/**/_l1:a/**/_h1 -#define DIM1(a) a/**/_l1:a/**/_h1 -#define ARG_L1(a) a/**/_l1 -#define ARG_H1(a) a/**/_h1 -#define DIMARG(a) a/**/(1) -#endif - -#if (AMREX_SPACEDIM == 2) -#define DIMS(a) a/**/_l1, a/**/_l2, a/**/_h1, a/**/_h2 -#define DIMDEC(a) a/**/_l1, a/**/_l2, a/**/_h1, a/**/_h2 -#define DIMV(a) a/**/_l1:a/**/_h1, a/**/_l2:a/**/_h2 -#define DIM1(a) a/**/_l1:a/**/_h1 -#define 
DIM2(a) a/**/_l2:a/**/_h2 -#define ARG_L1(a) a/**/_l1 -#define ARG_L2(a) a/**/_l2 -#define ARG_H1(a) a/**/_h1 -#define ARG_H2(a) a/**/_h2 -#define DIMARG(a) a/**/(1),a/**/(2) -#endif - -#if (AMREX_SPACEDIM == 3) -#define DIMS(a) a/**/_l1, a/**/_l2, a/**/_l3, a/**/_h1, a/**/_h2, a/**/_h3 -#define DIMDEC(a) a/**/_l1, a/**/_l2, a/**/_l3, a/**/_h1, a/**/_h2, a/**/_h3 -#define DIMV(a) a/**/_l1:a/**/_h1, a/**/_l2:a/**/_h2, a/**/_l3:a/**/_h3 -#define DIM1(a) a/**/_l1:a/**/_h1 -#define DIM2(a) a/**/_l2:a/**/_h2 -#define DIM3(a) a/**/_l3:a/**/_h3 -#define DIM12(a) a/**/_l1:a/**/_h1, a/**/_l2:a/**/_h2 -#define DIM23(a) a/**/_l2:a/**/_h2, a/**/_l3:a/**/_h3 -#define DIM13(a) a/**/_l1:a/**/_h1, a/**/_l3:a/**/_h3 -#define ARG_L1(a) a/**/_l1 -#define ARG_L2(a) a/**/_l2 -#define ARG_L3(a) a/**/_l3 -#define ARG_H1(a) a/**/_h1 -#define ARG_H2(a) a/**/_h2 -#define ARG_H3(a) a/**/_h3 -#define DIMARG(a) a/**/(1),a/**/(2),a/**/(3) -#endif - -#endif /*__STDC__==1*/ - -#endif /* ndef AMREX_XSDK */ - -#endif /*else of !defined(BL_LANG_FORT)*/ - -#endif /*else of __cplusplus*/ +#endif /* __cplusplus */ -#endif /*BL_ARRAYLIM_H*/ +#endif /* AMREX_ARRAYLIM_H_ */ diff --git a/Src/Base/AMReX_BArena.H b/Src/Base/AMReX_BArena.H index 9a3b4aa0f1b..d587d100859 100644 --- a/Src/Base/AMReX_BArena.H +++ b/Src/Base/AMReX_BArena.H @@ -11,7 +11,6 @@ namespace amrex { * This is the simplest dynamic memory management class derived from Arena. * Makes calls to std::malloc and std::free. */ - class BArena : public Arena diff --git a/Src/Base/AMReX_BCRec.H b/Src/Base/AMReX_BCRec.H index c39634cfb09..d23da777eda 100644 --- a/Src/Base/AMReX_BCRec.H +++ b/Src/Base/AMReX_BCRec.H @@ -10,10 +10,9 @@ namespace amrex { /** * \brief Boundary Condition Records. * Necessary information and functions for computing boundary conditions. +* +* This class has standard layout. And we should keep it so! */ - -// This class has standard layout. And we should keep it so! - class BCRec { public: @@ -43,7 +42,7 @@ public: {} /* * \brief Yet another constructor. Inherits bndry types from bc_domain - * when bx lies on edge of domain otherwise gets interior Dirchlet. + * when bx lies on edge of domain otherwise gets interior Dirichlet. */ AMREX_GPU_HOST_DEVICE BCRec (const Box& bx, diff --git a/Src/Base/AMReX_BCUtil.H b/Src/Base/AMReX_BCUtil.H index dd4e814610c..3ebc53a0576 100644 --- a/Src/Base/AMReX_BCUtil.H +++ b/Src/Base/AMReX_BCUtil.H @@ -9,11 +9,12 @@ namespace amrex { // This is for filling cell-centered data outside physical domain - // (excluding periodic boundaries). It only fills - // BCType::foextrap, BCType::hoextrap, BCType::reflect_even, and - // BCType::reflect::odd. It does not fill BCType::ext_dir (i.e., - // external Dirichlet). If you have BCType::ext_dir, you can - // copy, paste and rename this function, and then modify it. + // (excluding periodic boundaries). It only fills BCType::foextrap, + // BCType::hoextrap, BCType::hoextrapcc, BCType::reflect_even, and + // BCType::reflect::odd. It does not fill BCType::ext_dir and + // BCType::ext_dir_cc (i.e., external Dirichlet). If you have + // BCType::ext_dir or BCType::ext_dir_cc, you can copy, paste and rename + // this function, and then modify it. 
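A short usage sketch of the function declared below, under the constraint the comment above describes (made-up BC choices; every face here uses foextrap, which FillDomainBoundary handles, whereas ext_dir/ext_dir_cc would require a custom fill):

    #include <AMReX_BCUtil.H>

    void fill_outside_domain (amrex::MultiFab& phi, amrex::Geometry const& geom)
    {
        using namespace amrex;
        Vector<BCRec> bc(phi.nComp());
        for (int n = 0; n < phi.nComp(); ++n) {
            for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
                bc[n].setLo(idim, BCType::foextrap); // first-order extrapolation
                bc[n].setHi(idim, BCType::foextrap);
            }
        }
        FillDomainBoundary(phi, geom, bc); // fills ghost cells outside the domain
    }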
void FillDomainBoundary (MultiFab& phi, const Geometry& geom, const Vector& bc); } diff --git a/Src/Base/AMReX_BCUtil.cpp b/Src/Base/AMReX_BCUtil.cpp index c0a645ba05c..dc5b4f4f10c 100644 --- a/Src/Base/AMReX_BCUtil.cpp +++ b/Src/Base/AMReX_BCUtil.cpp @@ -15,7 +15,7 @@ void dummy_cpu_fill_extdir (Box const& /*bx*/, Array4 const& /*dest*/, const BCRec* /*bcr*/, const int /*bcomp*/, const int /*orig_comp*/) { - // do something for external Dirichlet (BCType::ext_dir) if there are + // do something for external Dirichlet (BCType::ext_dir or BCType::ext_dir_cc) if there are } struct dummy_gpu_fill_extdir @@ -27,7 +27,7 @@ struct dummy_gpu_fill_extdir const BCRec* /*bcr*/, const int /*bcomp*/, const int /*orig_comp*/) const { - // do something for external Dirichlet (BCType::ext_dir) if there are + // do something for external Dirichlet (BCType::ext_dir or BCType::ext_dir_cc) if there are } }; diff --git a/Src/Base/AMReX_BC_TYPES.H b/Src/Base/AMReX_BC_TYPES.H index 872ac1eff5d..f35175be450 100644 --- a/Src/Base/AMReX_BC_TYPES.H +++ b/Src/Base/AMReX_BC_TYPES.H @@ -44,10 +44,8 @@ SYMMETRY | Un REFLECT_ODD | REFLECT_EVEN | REFLECT_EVEN | INT_DIR : data taken from other grids or interpolated -EXT_DIR : data specified on EDGE (FACE) of bndry for the linear solvers - for application codes, the location of the EXT_DIR data will - depend on how they do reconstruction, and may be edge or - cell-centered. +EXT_DIR : data specified on EDGE (FACE) +EXT_DIR_CC : data specified at cell center HOEXTRAP : higher order extrapolation to EDGE of bndry HOEXTRAPCC : linear extrapolation to CELL of bndry FOEXTRAP : first order extrapolation from last cell in interior @@ -74,6 +72,7 @@ enum mathematicalBndryTypes : int { ext_dir = 3, hoextrap = 4, hoextrapcc = 5, + ext_dir_cc = 6, user_1 = 1001, user_2 = 1002, user_3 = 1003 @@ -83,25 +82,4 @@ enum mathematicalBndryTypes : int { } #endif -#ifndef AMREX_XSDK - -#define BOGUS_BC (-666) - -#define REFLECT_ODD (-1) -#define INT_DIR 0 -#define REFLECT_EVEN 1 -#define FOEXTRAP 2 -#define EXT_DIR 3 -#define HOEXTRAP 4 -#define HOEXTRAPCC 5 - -#define Interior 0 -#define Inflow 1 -#define Outflow 2 -#define Symmetry 3 -#define SlipWall 4 -#define NoSlipWall 5 - -#endif - #endif diff --git a/Src/Base/AMReX_BLBackTrace.cpp b/Src/Base/AMReX_BLBackTrace.cpp index 4a4d527dbb1..e5699996933 100644 --- a/Src/Base/AMReX_BLBackTrace.cpp +++ b/Src/Base/AMReX_BLBackTrace.cpp @@ -70,6 +70,7 @@ BLBackTrace::handler(int s) case SIGABRT: amrex::ErrorStream() << "SIGABRT\n"; break; + default: break; } #if defined(AMREX_BACKTRACE_SUPPORTED) || defined(AMREX_TINY_PROFILING) @@ -99,19 +100,19 @@ BLBackTrace::handler(int s) fclose(p); } - amrex::ErrorStream() << "See " << errfilename << " file for details" << std::endl; + amrex::ErrorStream() << "See " << errfilename << " file for details" << '\n'; if (!bt_stack.empty()) { std::ofstream errfile; errfile.open(errfilename.c_str(), std::ofstream::out | std::ofstream::app); if (errfile.is_open()) { - errfile << std::endl; + errfile << '\n'; while (!bt_stack.empty()) { errfile << "== BACKTRACE == " << bt_stack.top().first <<", " << bt_stack.top().second << "\n"; bt_stack.pop(); } - errfile << std::endl; + errfile << '\n'; } } @@ -148,7 +149,7 @@ BLBackTrace::print_backtrace_info (const std::string& filename) { amrex::Print() << "Warning @ BLBackTrace::print_backtrace_info: " << filename << " is not a valid output file." 
- << std::endl; + << '\n'; } } @@ -371,7 +372,7 @@ BLBTer::BLBTer(const std::string& s, const char* file, int line) std::ostringstream ss0; ss0 << "Proc. " << ParallelDescriptor::MyProc() << ": \"" << s << "\""; - BLBackTrace::bt_stack.push(std::make_pair(ss0.str(), line_file)); + BLBackTrace::bt_stack.emplace(ss0.str(), line_file); #endif } diff --git a/Src/Base/AMReX_BLProfiler.H b/Src/Base/AMReX_BLProfiler.H index 3b72a627881..b7f8e6a1c27 100644 --- a/Src/Base/AMReX_BLProfiler.H +++ b/Src/Base/AMReX_BLProfiler.H @@ -514,7 +514,7 @@ inline std::string BLProfiler::CommStats::CFTToString(CommFuncType cft) { #define BL_PROFILE_REGION_VAR(fname, rvname) #define BL_PROFILE_REGION_VAR_START(fname, rvname) #define BL_PROFILE_REGION_VAR_STOP(fname, rvname) -#define BL_PROFILE_TINY_FLUSH() amrex::TinyProfiler::Finalize(true); TinyProfiler::MemoryFinalize(true) +#define BL_PROFILE_TINY_FLUSH() amrex::TinyProfiler::Finalize(true); amrex::TinyProfiler::MemoryFinalize(true) #define BL_PROFILE_FLUSH() #define BL_TRACE_PROFILE_FLUSH() #define BL_TRACE_PROFILE_SETFLUSHSIZE(fsize) diff --git a/Src/Base/AMReX_BaseFab.H b/Src/Base/AMReX_BaseFab.H index 99132038397..c1212fb7a0a 100644 --- a/Src/Base/AMReX_BaseFab.H +++ b/Src/Base/AMReX_BaseFab.H @@ -19,11 +19,10 @@ #include #include #include +#include #include - -#ifdef AMREX_USE_OMP -#include -#endif +#include +#include #include #include @@ -90,62 +89,14 @@ makeArray4 (T* p, Box const& bx, int ncomp) noexcept return Array4{p, amrex::begin(bx), amrex::end(bx), ncomp}; } -/** -* \brief A Fortran Array-like Object -* BaseFab emulates the Fortran array concept. -* Useful operations can be performed upon -* BaseFabs in C++, and they provide a convenient interface to -* Fortran when it is necessary to retreat into that language. - -* BaseFab is a template class. Through use of the -* template, a BaseFab may be based upon any class. So far at least, -* most applications have been based upon simple types like integers, -* real*4s, or real*8s. Most applications do not use BaseFabs -* directly, but utilize specialized classes derived from BaseFab. - -* Classes derived from BaseFab include FArrayBox, IArrayBox, TagBox, -* Mask, EBFArrayBox, EBCellFlag and CutFab. - -* BaseFab objects depend on the dimensionality of space -* (indirectly through the DOMAIN Box member). It is -* typical to define the macro SPACEDIM to be 1, 2, or 3 to indicate -* the dimension of space. See the discussion of class Box for more -* information. A BaseFab contains a Box DOMAIN, which indicates the -* integer indexing space over which the array is defined. A BaseFab -* also has NVAR components. By components, we mean that for each -* point in the rectangular indexing space, there are NVAR values -* associated with that point. A Fortran array corresponding to a -* BaseFab would have (SPACEDIM+1) dimensions. - -* By design, the array layout in a BaseFab mirrors that of a -* Fortran array. The first index (x direction for example) varies -* most rapidly, the next index (y direction), if any, varies next -* fastest. The component index varies last, after all the spatial -* indices. - -* It is sometimes convenient to be able to treat a sub-array within an -* existing BaseFab as a BaseFab in its own right. This is often -* referred to as aliasing the BaseFab. Note that when aliasing is -* used, the BaseFabs domain will not, in general, be the same as the -* parent BaseFabs domain, nor will the number of components. 
-* BaseFab is a dimension dependent class, so SPACEDIM must be -* defined as either 1, 2, or 3 when compiling. - -* This is NOT a polymorphic class. - -* It does NOT provide a copy constructor or assignment operator. - -* T MUST have a default constructor and an assignment operator. -*/ - template -typename std::enable_if::value>::type +std::enable_if_t> placementNew (T* const /*ptr*/, Long /*n*/) {} template -std::enable_if_t::value - && !std::is_arithmetic::value> +std::enable_if_t + && !std::is_arithmetic_v> placementNew (T* const ptr, Long n) { for (Long i = 0; i < n; ++i) { @@ -154,7 +105,7 @@ placementNew (T* const ptr, Long n) } template -std::enable_if_t::value> +std::enable_if_t> placementNew (T* const ptr, Long n) { AMREX_HOST_DEVICE_FOR_1D ( n, i, @@ -164,12 +115,12 @@ placementNew (T* const ptr, Long n) } template -typename std::enable_if::value>::type +std::enable_if_t> placementDelete (T* const /*ptr*/, Long /*n*/) {} template -typename std::enable_if::value>::type +std::enable_if_t> placementDelete (T* const ptr, Long n) { AMREX_HOST_DEVICE_FOR_1D (n, i, @@ -178,6 +129,54 @@ placementDelete (T* const ptr, Long n) }); } +/** + * \brief A FortranArrayBox(FAB)-like object + * + * BaseFab emulates the Fortran array concept. + * Useful operations can be performed upon + * BaseFabs in C++, and they provide a convenient interface to + * Fortran when it is necessary to retreat into that language. + * + * BaseFab is a template class. Through use of the + * template, a BaseFab may be based upon any class. So far at least, + * most applications have been based upon simple types like integers, + * real*4s, or real*8s. Most applications do not use BaseFabs + * directly, but utilize specialized classes derived from BaseFab. + * + * Classes derived from BaseFab include FArrayBox, IArrayBox, TagBox, + * Mask, EBFArrayBox, EBCellFlag and CutFab. + * + * BaseFab objects depend on the dimensionality of space + * (indirectly through the DOMAIN Box member). It is + * typical to define the macro SPACEDIM to be 1, 2, or 3 to indicate + * the dimension of space. See the discussion of class Box for more + * information. A BaseFab contains a Box DOMAIN, which indicates the + * integer indexing space over which the array is defined. A BaseFab + * also has NVAR components. By components, we mean that for each + * point in the rectangular indexing space, there are NVAR values + * associated with that point. A Fortran array corresponding to a + * BaseFab would have (SPACEDIM+1) dimensions. + * + * By design, the array layout in a BaseFab mirrors that of a + * Fortran array. The first index (x direction for example) varies + * most rapidly, the next index (y direction), if any, varies next + * fastest. The component index varies last, after all the spatial + * indices. + * + * It is sometimes convenient to be able to treat a sub-array within an + * existing BaseFab as a BaseFab in its own right. This is often + * referred to as aliasing the BaseFab. Note that when aliasing is + * used, the BaseFabs domain will not, in general, be the same as the + * parent BaseFabs domain, nor will the number of components. + * BaseFab is a dimension dependent class, so SPACEDIM must be + * defined as either 1, 2, or 3 when compiling. + * + * This is NOT a polymorphic class. + * + * It does NOT provide a copy constructor or assignment operator. + * + * \tparam T MUST have a default constructor and an assignment operator. 
+ */ template class BaseFab : public DataAllocator @@ -251,7 +250,7 @@ public: */ void resize (const Box& b, int N = 1, Arena* ar = nullptr); - template ::value,int>::type = 0> + template ,int> = 0> [[nodiscard]] Elixir elixir () noexcept; /** @@ -543,7 +542,7 @@ public: int numcomp = 1) noexcept; /** * \brief As above, except that the destination Box is specified, - * but the source Box is taken to the equal to the source + * but the source Box is taken to the equal to the destination * Box, and all components of the destination BaseFab are * copied. */ @@ -979,6 +978,19 @@ public: BaseFab& atomicAdd (const BaseFab& src, const Box& srcbox, const Box& destbox, int srccomp, int destcomp, int numcomp=1) noexcept; + /** + * \brief Atomically add srcbox region of src FAB to destbox region of this FAB. + * The srcbox and destbox must be same size. When OMP is on, this uses OMP locks + * in the implementation and it's usually faster than atomicAdd. + */ +#if defined(AMREX_USE_GPU) + template +#else + template +#endif + BaseFab& lockAdd (const BaseFab& src, const Box& srcbox, const Box& destbox, + int srccomp, int destcomp, int numcomp) noexcept; + //! FAB SAXPY (y[i] <- y[i] + a * x[i]), in place. #if defined(AMREX_USE_GPU) template @@ -1631,6 +1643,9 @@ protected: Long truesize = 0L; //!< nvar*numpts that was allocated on heap. bool ptr_owner = false; //!< Owner of T*? bool shared_memory = false; //!< Is the memory allocated in shared memory? +#ifdef AMREX_USE_GPU + gpuStream_t alloc_stream{}; +#endif }; template @@ -1902,6 +1917,9 @@ BaseFab::define () this->truesize = this->nvar*this->domain.numPts(); this->ptr_owner = true; this->dptr = static_cast(this->alloc(this->truesize*sizeof(T))); +#ifdef AMREX_USE_GPU + this->alloc_stream = Gpu::gpuStream(); +#endif placementNew(this->dptr, this->truesize); @@ -2003,6 +2021,9 @@ BaseFab::BaseFab (BaseFab&& rhs) noexcept dptr(rhs.dptr), domain(rhs.domain), nvar(rhs.nvar), truesize(rhs.truesize), ptr_owner(rhs.ptr_owner), shared_memory(rhs.shared_memory) +#ifdef AMREX_USE_GPU + , alloc_stream(rhs.alloc_stream) +#endif { rhs.dptr = nullptr; rhs.ptr_owner = false; @@ -2021,6 +2042,9 @@ BaseFab::operator= (BaseFab&& rhs) noexcept truesize = rhs.truesize; ptr_owner = rhs.ptr_owner; shared_memory = rhs.shared_memory; +#ifdef AMREX_USE_GPU + alloc_stream = rhs.alloc_stream; +#endif rhs.dptr = nullptr; rhs.ptr_owner = false; @@ -2062,7 +2086,11 @@ BaseFab::resize (const Box& b, int n, Arena* ar) this->dptr = nullptr; define(); } - else if (this->nvar*this->domain.numPts() > this->truesize) + else if (this->nvar*this->domain.numPts() > this->truesize +#ifdef AMREX_USE_GPU + || (arena()->isStreamOrderedArena() && alloc_stream != Gpu::gpuStream()) +#endif + ) { if (this->shared_memory) { amrex::Abort("BaseFab::resize: BaseFab in shared memory cannot increase size"); @@ -2075,7 +2103,7 @@ BaseFab::resize (const Box& b, int n, Arena* ar) } template -template ::value,int>::type> +template ,int>> Elixir BaseFab::elixir () noexcept { @@ -2114,7 +2142,14 @@ BaseFab::clear () noexcept placementDelete(this->dptr, this->truesize); +#ifdef AMREX_USE_GPU + auto current_stream = Gpu::Device::gpuStream(); + Gpu::Device::setStream(alloc_stream); +#endif this->free(this->dptr); +#ifdef AMREX_USE_GPU + Gpu::Device::setStream(current_stream); +#endif if (this->nvar > 1) { amrex::update_fab_stats(-this->truesize/this->nvar, -this->truesize, sizeof(T)); @@ -3277,6 +3312,110 @@ BaseFab::atomicAdd (const BaseFab& src, const Box& srcbox, const Box& dest return *this; } +template 
+template +BaseFab& +BaseFab::lockAdd (const BaseFab& src, const Box& srcbox, const Box& destbox, + int srccomp, int destcomp, int numcomp) noexcept +{ +#if defined(AMREX_USE_OMP) && (AMREX_SPACEDIM > 1) +#if defined(AMREX_USE_GPU) + if (run_on == RunOn::Host || Gpu::notInLaunchRegion()) { +#endif + BL_ASSERT(destbox.ok()); + BL_ASSERT(src.box().contains(srcbox)); + BL_ASSERT(box().contains(destbox)); + BL_ASSERT(destbox.sameSize(srcbox)); + BL_ASSERT(srccomp >= 0 && srccomp+numcomp <= src.nComp()); + BL_ASSERT(destcomp >= 0 && destcomp+numcomp <= nComp()); + + Array4 const& d = this->array(); + Array4 const& s = src.const_array(); + auto const& dlo = amrex::lbound(destbox); + auto const& dhi = amrex::ubound(destbox); + auto const& len = amrex::length(destbox); + auto const& slo = amrex::lbound(srcbox); + Dim3 const offset{slo.x-dlo.x, slo.y-dlo.y, slo.z-dlo.z}; + + int planedim; + int nplanes; + int plo; + if (len.z == 1) { + planedim = 1; + nplanes = len.y; + plo = dlo.y; + } else { + planedim = 2; + nplanes = len.z; + plo = dlo.z; + } + + auto* mask = (bool*) amrex_mempool_alloc(sizeof(bool)*nplanes); + for (int ip = 0; ip < nplanes; ++ip) { + mask[ip] = false; + } + + int mm = 0; + int planes_left = nplanes; + while (planes_left > 0) { + AMREX_ASSERT(mm < nplanes); + auto const m = mm + plo; + auto* lock = OpenMP::get_lock(m); + if (omp_test_lock(lock)) + { + auto lo = dlo; + auto hi = dhi; + if (planedim == 1) { + lo.y = m; + hi.y = m; + } else { + lo.z = m; + hi.z = m; + } + + for (int n = 0; n < numcomp; ++n) { + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + auto * pdst = d.ptr(dlo.x,j ,k ,n+destcomp); + auto const* psrc = s.ptr(slo.x,j+offset.y,k+offset.z,n+ srccomp); +#pragma omp simd + for (int ii = 0; ii < len.x; ++ii) { + pdst[ii] += psrc[ii]; + } + } + } + } + + mask[mm] = true; + --planes_left; + omp_unset_lock(lock); + if (planes_left == 0) { break; } + } + + ++mm; + for (int ip = 0; ip < nplanes; ++ip) { + int new_mm = (mm+ip) % nplanes; + if ( ! mask[new_mm] ) { + mm = new_mm; + break; + } + } + } + + amrex_mempool_free(mask); + + return *this; + +#if defined(AMREX_USE_GPU) + } else { + return this->template atomicAdd(src, srcbox, destbox, srccomp, destcomp, numcomp); + } +#endif +#else + return this->template atomicAdd(src, srcbox, destbox, srccomp, destcomp, numcomp); +#endif +} + template template BaseFab& @@ -3505,7 +3644,6 @@ BaseFab::protected_divide (const BaseFab& src, const Box& srcbox, const Bo * and stored in component comp of this FAB. * This fab is returned as a reference for chaining. 
*/ - template template BaseFab& diff --git a/Src/Base/AMReX_BaseFabUtility.H b/Src/Base/AMReX_BaseFabUtility.H index 3dafadcd74b..6fef4918c61 100644 --- a/Src/Base/AMReX_BaseFabUtility.H +++ b/Src/Base/AMReX_BaseFabUtility.H @@ -22,11 +22,11 @@ cast (BaseFab& tofab, BaseFab const& fromfab, } template ::value, - int>::type FOO = 0> -void fill (BaseFab& aos_fab, F && f) + std::is_trivially_destructible_v, + int>FOO = 0> +void fill (BaseFab& aos_fab, F const& f) { Box const& box = aos_fab.box(); auto const& aos = aos_fab.array(); @@ -36,37 +36,31 @@ void fill (BaseFab& aos_fab, F && f) "amrex::fill: sizeof(STRUCT) != sizeof(T)*STRUCTSIZE"); #ifdef AMREX_USE_GPU if (Gpu::inLaunchRegion()) { - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - int ntotcells = box.numPts(); + BoxIndexer indexer(box); + const auto ntotcells = std::uint64_t(box.numPts()); int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128; - int nblocks = (ntotcells+nthreads_per_block-1)/nthreads_per_block; + std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block; + AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits::max())); + auto nblocks = int(nblocks_long); std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT); T* p = (T*)aos_fab.dataPtr(); #ifdef AMREX_USE_SYCL amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept { - int icell = handler.globalIdx(); - unsigned int blockDimx = handler.blockDim(); - unsigned int threadIdxx = handler.threadIdx(); - unsigned int blockIdxx = handler.blockIdx(); + auto const icell = std::uint64_t(handler.globalIdx()); + std::uint64_t const blockDimx = handler.blockDim(); + std::uint64_t const threadIdxx = handler.threadIdx(); + std::uint64_t const blockIdxx = handler.blockIdx(); auto const shared = (T*)handler.sharedMemory(); - if (icell < ntotcells) { + if (icell < indexer.numPts()) { auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT; - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + auto [i, j, k] = indexer(icell); f(*ga, i, j, k); } handler.sharedBarrier(); - for (unsigned int m = threadIdxx, - mend = amrex::min(blockDimx, ntotcells-blockDimx*blockIdxx) * STRUCTSIZE; + for (std::uint64_t m = threadIdxx, + mend = amrex::min(blockDimx, indexer.numPts()-blockDimx*blockIdxx) * STRUCTSIZE; m < mend; m += blockDimx) { p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m]; } @@ -75,31 +69,26 @@ void fill (BaseFab& aos_fab, F && f) amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - int icell = blockDim.x*blockIdx.x+threadIdx.x; + std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x; Gpu::SharedMemory gsm; T* const shared = gsm.dataPtr(); - if (icell < ntotcells) { - auto ga = new(shared+threadIdx.x*STRUCTSIZE) STRUCT; - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + if (icell < indexer.numPts()) { + auto ga = new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT; + auto [i, j, k] = indexer(icell); f(*ga, i, j, k); } __syncthreads(); - for (unsigned int m = threadIdx.x, - mend = amrex::min(blockDim.x, ntotcells-blockDim.x*blockIdx.x) * STRUCTSIZE; + for (std::uint64_t m = threadIdx.x, + mend = amrex::min(blockDim.x, 
indexer.numPts()-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE; m < mend; m += blockDim.x) { - p[blockDim.x*blockIdx.x*STRUCTSIZE+m] = shared[m]; + p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m]; } }); #endif } else #endif { - amrex::LoopOnCpu(box, [=] (int i, int j, int k) noexcept + amrex::LoopOnCpu(box, [&] (int i, int j, int k) noexcept { f(aos(i,j,k), i, j, k); }); diff --git a/Src/Base/AMReX_Box.H b/Src/Base/AMReX_Box.H index 0a32d637d4f..82a2ad9cd13 100644 --- a/Src/Base/AMReX_Box.H +++ b/Src/Base/AMReX_Box.H @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -338,9 +339,10 @@ public: */ [[nodiscard]] AMREX_GPU_HOST_DEVICE Long numPts () const noexcept { - return AMREX_D_TERM( static_cast(length(0)), - *static_cast(length(1)), - *static_cast(length(2))); + return ok() ? AMREX_D_TERM( static_cast(length(0)), + *static_cast(length(1)), + *static_cast(length(2))) + : Long(0); } /** @@ -349,8 +351,10 @@ public: */ [[nodiscard]] AMREX_GPU_HOST_DEVICE double d_numPts () const noexcept { - BL_ASSERT(ok()); - return AMREX_D_TERM(double(length(0)), *double(length(1)), *double(length(2))); + return ok() ? AMREX_D_TERM( double(length(0)), + *double(length(1)), + *double(length(2))) + : 0.0; } /** @@ -360,9 +364,10 @@ public: */ [[nodiscard]] AMREX_GPU_HOST_DEVICE Long volume () const noexcept { - return AMREX_D_TERM( static_cast(length(0)-btype[0]), - *static_cast(length(1)-btype[1]), - *static_cast(length(2)-btype[2])); + return ok() ? AMREX_D_TERM( static_cast(length(0)-btype[0]), + *static_cast(length(1)-btype[1]), + *static_cast(length(2)-btype[2])) + : Long(0); } /** @@ -1835,6 +1840,94 @@ Box makeSingleCellBox (int i, int j, int k, IndexType typ = IndexType::TheCellTy return Box(IntVect(AMREX_D_DECL(i,j,k)),IntVect(AMREX_D_DECL(i,j,k)),typ); } +struct BoxIndexer +{ + std::uint64_t npts; + +#if (AMREX_SPACEDIM == 3) + Math::FastDivmodU64 fdxy; + Math::FastDivmodU64 fdx; + IntVect lo; + + BoxIndexer (Box const& box) + : npts(box.numPts()), + fdxy(std::uint64_t(box.length(0))*std::uint64_t(box.length(1))), + fdx (std::uint64_t(box.length(0))), + lo (box.smallEnd()) + {} + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + Dim3 operator() (std::uint64_t icell) const + { + std::uint64_t x, y, z, rem; + fdxy(z, rem, icell); + fdx(y, x, rem); + return {int(x)+lo[0], int(y)+lo[1], int(z)+lo[2]}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + IntVect intVect (std::uint64_t icell) const + { + std::uint64_t x, y, z, rem; + fdxy(z, rem, icell); + fdx(y, x, rem); + return {int(x)+lo[0], int(y)+lo[1], int(z)+lo[2]}; + } + +#elif (AMREX_SPACEDIM == 2) + + Math::FastDivmodU64 fdx; + IntVect lo; + + BoxIndexer (Box const& box) + : npts(box.numPts()), + fdx (std::uint64_t(box.length(0))), + lo (box.smallEnd()) + {} + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + Dim3 operator() (std::uint64_t icell) const + { + std::uint64_t x, y; + fdx(y, x, icell); + return {int(x)+lo[0], int(y)+lo[1], 0}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + IntVect intVect (std::uint64_t icell) const + { + std::uint64_t x, y; + fdx(y, x, icell); + return {int(x)+lo[0], int(y)+lo[1]}; + } + +#elif (AMREX_SPACEDIM == 1) + + int lo; + + BoxIndexer (Box const& box) + : npts(box.numPts()), + lo(box.smallEnd(0)) + {} + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + Dim3 operator() (std::uint64_t icell) const + { + return {int(icell)+lo, 0, 0}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + IntVect intVect 
(std::uint64_t icell) const + { + return IntVect{int(icell)+lo}; + } + +#endif + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + std::uint64_t numPts () const { return npts; } +}; + } #endif /*AMREX_BOX_H*/ diff --git a/Src/Base/AMReX_BoxArray.H b/Src/Base/AMReX_BoxArray.H index 807cd9d8516..94358f580a4 100644 --- a/Src/Base/AMReX_BoxArray.H +++ b/Src/Base/AMReX_BoxArray.H @@ -515,16 +515,16 @@ struct BATransformer // for backward compatibility using BndryBATransformer = BATransformer; -/** -* \brief A collection of Boxes stored in an Array. It is a -* reference-counted concrete class, not a polymorphic one; i.e. you -* cannot use any of the List member functions with a BoxList. -*/ - class MFIter; class AmrMesh; class FabArrayBase; +/** + * \brief A collection of Boxes stored in an Array. + * + * It is a reference-counted concrete class, not a polymorphic one; i.e. you + * cannot use any of the List member functions with a BoxList. + */ class BoxArray { public: diff --git a/Src/Base/AMReX_BoxArray.cpp b/Src/Base/AMReX_BoxArray.cpp index 9413f5ae5c1..e5b41d6b9d3 100644 --- a/Src/Base/AMReX_BoxArray.cpp +++ b/Src/Base/AMReX_BoxArray.cpp @@ -61,7 +61,7 @@ BARef::BARef (const BoxList& bl) } BARef::BARef (BoxList&& bl) noexcept - : m_abox(std::move(bl.data())) + : m_abox(std::move(std::move(bl).data())) { #ifdef AMREX_MEM_PROFILING updateMemoryUsage_box(1); @@ -170,7 +170,7 @@ BARef::define (BoxList&& bl) noexcept #ifdef AMREX_MEM_PROFILING updateMemoryUsage_box(-1); #endif - m_abox = std::move(bl.data()); + m_abox = std::move(std::move(bl).data()); #ifdef AMREX_MEM_PROFILING updateMemoryUsage_box(1); #endif diff --git a/Src/Base/AMReX_BoxDomain.H b/Src/Base/AMReX_BoxDomain.H index a82e5ddc72e..af92d631607 100644 --- a/Src/Base/AMReX_BoxDomain.H +++ b/Src/Base/AMReX_BoxDomain.H @@ -55,14 +55,12 @@ std::ostream& operator<< (std::ostream& os, const BoxDomain& bd); /** * \brief A List of Disjoint Boxes. +* * A BoxDomain is a BoxList with the restriction that Boxes in the list * are disjoint. +* Note that a BoxDomain is NOT a BoxList due to the protected inheritance. +* This is a concrete class, not a polymorphic one. */ - -//Note that a BoxDomain is NOT a BoxList due to the protected inheritance. -//This is a concrete class, not a polymorphic one. - - class BoxDomain : protected BoxList diff --git a/Src/Base/AMReX_BoxList.H b/Src/Base/AMReX_BoxList.H index c0ff30025ff..ba06e746ce8 100644 --- a/Src/Base/AMReX_BoxList.H +++ b/Src/Base/AMReX_BoxList.H @@ -48,7 +48,6 @@ namespace amrex * IndexType. This class implements operations for sets of Boxes. * This is a concrete class, not a polymorphic one. */ - class BoxList { public: @@ -213,7 +212,7 @@ public: //! Returns a constant reference to the Vector. [[nodiscard]] const Vector& data () const noexcept { return m_lbox; } - void swap (BoxList& rhs) { + void swap (BoxList& rhs) noexcept { std::swap(m_lbox, rhs.m_lbox); std::swap(btype, rhs.btype); } diff --git a/Src/Base/AMReX_CArena.H b/Src/Base/AMReX_CArena.H index d68285bc878..9547bc92f21 100644 --- a/Src/Base/AMReX_CArena.H +++ b/Src/Base/AMReX_CArena.H @@ -5,13 +5,14 @@ #include #include -#include -#include +#include +#include #include #include -#include -#include +#include #include +#include +#include namespace amrex { @@ -23,7 +24,6 @@ struct MemStat; * chunks of heap space and apportions it out as requested. It merges * together neighboring chunks on each free(). 
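The `fill` kernels above and the new `BoxIndexer` move flat cell indexing to 64-bit arithmetic, so boxes with more than 2^31 points no longer overflow `int`. A minimal sketch of the decoding `BoxIndexer::operator()` performs in 3D, written here with plain divisions as a hypothetical stand-alone helper (the real struct precomputes `Math::FastDivmodU64` magic numbers so device code avoids two hardware divides per cell):

```cpp
#include <cstdint>

// Map a flat 64-bit cell index to (i,j,k) for a box with lengths
// (nx,ny,nz) and lower corner (lox,loy,loz).
struct Cell { int i, j, k; };

Cell decode (std::uint64_t icell, int nx, int ny, int lox, int loy, int loz)
{
    const auto lenxy = std::uint64_t(nx) * std::uint64_t(ny);
    const std::uint64_t z   = icell / lenxy;
    const std::uint64_t rem = icell - z*lenxy;
    const std::uint64_t y   = rem / std::uint64_t(nx);
    const std::uint64_t x   = rem - y*std::uint64_t(nx);
    return {int(x) + lox, int(y) + loy, int(z) + loz};
}
```

The `ok()` guards added to `numPts()`, `d_numPts()`, and `volume()` make the empty-box case well defined (zero points) instead of a product of negative lengths, which lets a `BoxIndexer` built from an empty box report `numPts() == 0`.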
*/ - class CArena : public Arena @@ -57,7 +57,7 @@ public: * Try to shrink in-place */ [[nodiscard]] void* - shrink_in_place (void* pt, std::size_t sz) final; + shrink_in_place (void* pt, std::size_t new_size) final; /** * \brief Free up allocated memory. Merge neighboring free memory chunks @@ -164,15 +164,15 @@ protected: MemStat* m_stat; }; + //! The list of blocks allocated via ::operator new(). + std::vector > m_alloc; + /** * \brief The type of our freelist and blocklist. * We use a set sorted from lo to hi memory addresses. */ using NL = std::set; - //! The list of blocks allocated via ::operator new(). - std::vector > m_alloc; - /** * \brief The free list of allocated but not currently used blocks. * Maintained in lo to hi memory sorted order. @@ -198,6 +198,8 @@ protected: std::mutex carena_mutex; + + friend std::ostream& operator<< (std::ostream& os, const CArena& arena); }; } diff --git a/Src/Base/AMReX_CArena.cpp b/Src/Base/AMReX_CArena.cpp index 6f7979d4750..c47f8f5ed26 100644 --- a/Src/Base/AMReX_CArena.cpp +++ b/Src/Base/AMReX_CArena.cpp @@ -14,6 +14,7 @@ namespace amrex { #include #include +#include namespace amrex { @@ -203,9 +204,61 @@ CArena::alloc_in_place (void* pt, std::size_t szmin, std::size_t szmax) } void* -CArena::shrink_in_place (void* /*pt*/, std::size_t sz) +CArena::shrink_in_place (void* pt, std::size_t new_size) { - return alloc(sz); // xxxxx TODO + if ((pt == nullptr) || (new_size == 0)) { return nullptr; } + + new_size = Arena::align(new_size); + + std::lock_guard lock(carena_mutex); + + auto busy_it = m_busylist.find(Node(pt,nullptr,0)); + if (busy_it == m_busylist.end()) { + amrex::Abort("CArena::shrink_in_place: unknown pointer"); + return nullptr; + } + AMREX_ASSERT(m_freelist.find(*busy_it) == m_freelist.end()); + + auto const old_size = busy_it->size(); + + if (new_size > old_size) { + amrex::Abort("CArena::shrink_in_place: wrong size. Cannot shrink to a larger size."); + return nullptr; + } else if (new_size == old_size) { + return pt; + } else { + auto const leftover_size = old_size - new_size; + + void* pt2 = static_cast(pt) + new_size; + Node new_free_node(pt2, busy_it->owner(), leftover_size); + + void* pt_end = static_cast(pt) + old_size; + auto free_it = m_freelist.find(Node(pt_end,nullptr,0)); + if ((free_it == m_freelist.end()) || ! new_free_node.coalescable(*free_it)) { + m_freelist.insert(free_it, new_free_node); + } else { + auto& node = const_cast(*free_it); + // This is safe because the free list is std::set and the + // modification of `block` does not change the order of elements + // in the container, even though Node's operator< uses block. 
+ node.block(pt2); + node.size(leftover_size + node.size()); + } + + const_cast(*busy_it).size(new_size); + + m_actually_used -= leftover_size; + +#ifdef AMREX_TINY_PROFILING + if (m_do_profiling) { + TinyProfiler::memory_free(old_size, busy_it->mem_stat()); + auto* stat = TinyProfiler::memory_alloc(new_size, m_profiling_stats); + const_cast(*busy_it).mem_stat(stat); + } +#endif + + return pt; + } } void @@ -439,4 +492,43 @@ CArena::PrintUsage (std::ostream& os, std::string const& name, std::string const << m_busylist.size() << " busy blocks, " << m_freelist.size() << " free blocks\n"; } +std::ostream& operator<< (std::ostream& os, const CArena& arena) +{ + os << "CArea:\n" + << " Hunk size: " << arena.m_hunk << "\n" + << " Memory allocated: " << arena.m_used << "\n" + << " Memory actually used: " << arena.m_actually_used << "\n"; + + if (arena.m_alloc.empty()) { + os << " No memory allocations\n"; + } else { + os << " List of memory alloations: (address, size)\n"; + for (auto const& a : arena.m_alloc) { + os << " " << a.first << ", " << a.second << "\n"; + } + } + + if (arena.m_freelist.empty()) { + os << " No free nodes\n"; + } else { + os << " List of free nodes: (address, owner, size)\n"; + for (auto const& a : arena.m_freelist) { + os << " " << a.block() << ", " << a.owner() << ", " + << a.size() << "\n"; + } + } + + if (arena.m_busylist.empty()) { + os << " No busy nodes\n"; + } else { + os << " List of busy nodes: (address, owner, size)\n"; + for (auto const& a : arena.m_busylist) { + os << " " << a.block() << ", " << a.owner() << ", " + << a.size() << "\n"; + } + } + + return os; +} + } diff --git a/Src/Base/AMReX_CONSTANTS.H b/Src/Base/AMReX_CONSTANTS.H index d50153275e6..648d3da5c94 100644 --- a/Src/Base/AMReX_CONSTANTS.H +++ b/Src/Base/AMReX_CONSTANTS.H @@ -4,41 +4,7 @@ #include - -#ifdef BL_LANG_FORT - -#ifndef AMREX_XSDK - -#define bigreal BL_REAL_E(1.0,30) -#define zero BL_REAL(0.0) -#define one BL_REAL(1.0) -#define two BL_REAL(2.0) -#define three BL_REAL(3.0) -#define four BL_REAL(4.0) -#define five BL_REAL(5.0) -#define six BL_REAL(6.0) -#define seven BL_REAL(7.0) -#define eight BL_REAL(8.0) -#define nine BL_REAL(9.0) -#define ten BL_REAL(10.0) -#define twelve BL_REAL(12.0) -#define fifteen BL_REAL(15.0) -#define sixteen BL_REAL(16.0) -#define twenty BL_REAL(20.0) -#define seventy BL_REAL(70.0) -#define ninety BL_REAL(90.0) -#define tenth BL_REAL(0.1) -#define eighth BL_REAL(0.125) -#define sixth BL_REAL(0.16666666666666667) -#define fifth BL_REAL(0.2) -#define fourth BL_REAL(0.25) -#define third BL_REAL(0.33333333333333333) -#define half BL_REAL(0.5) -#define two3rd BL_REAL(0.66666666666666667) -#define Pi BL_REAL(3.1415926535897932) - -#endif /* ndef AMREX_XSDK */ - -#endif /*BL_LANG_FORT*/ +/* Maybe in the future we will add some constants here. */ +/* If we do, make sure this file is Fortran safe. 
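`shrink_in_place` used to be a stub that simply allocated anew; it now shrinks for real: the busy node keeps its address, the tail is handed back to the free list (coalescing with an adjacent free block when possible), and `m_actually_used` drops by the leftover. A hedged usage sketch, with illustrative sizes:

```cpp
#include <AMReX_CArena.H>
#include <AMReX_BLassert.H>

void demo (amrex::CArena& arena)
{
    void* p = arena.alloc(4096);
    // ... discover that only 1024 bytes are actually needed ...
    void* q = arena.shrink_in_place(p, 1024);
    // The pointer is unchanged, so no data copy is required; the trailing
    // bytes (minus alignment padding) went back onto the free list.
    AMREX_ALWAYS_ASSERT(q == p);
    arena.free(q);
}
```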
*/ #endif /*BL_CONSTANTS_H*/ diff --git a/Src/Base/AMReX_CTOParallelForImpl.H b/Src/Base/AMReX_CTOParallelForImpl.H index 73ca3a25e87..35e0ec3be7b 100644 --- a/Src/Base/AMReX_CTOParallelForImpl.H +++ b/Src/Base/AMReX_CTOParallelForImpl.H @@ -27,12 +27,12 @@ struct CompileTimeOptions { namespace detail { template - std::enable_if_t::value || std::is_same::value, bool> - ParallelFor_helper2 (T const& N, F&& f, TypeList, + std::enable_if_t || std::is_same_v, bool> + ParallelFor_helper2 (T const& N, F const& f, TypeList, std::array const& runtime_options) { if (runtime_options == std::array{As::value...}) { - if constexpr (std::is_integral::value) { + if constexpr (std::is_integral_v) { ParallelFor(N, [f] AMREX_GPU_DEVICE (T i) noexcept { f(i, As{}...); @@ -50,8 +50,8 @@ namespace detail } template - std::enable_if_t::value, bool> - ParallelFor_helper2 (Box const& box, T ncomp, F&& f, TypeList, + std::enable_if_t, bool> + ParallelFor_helper2 (Box const& box, T ncomp, F const& f, TypeList, std::array const& runtime_options) { if (runtime_options == std::array{As::value...}) { @@ -66,24 +66,24 @@ namespace detail } template - std::enable_if_t::value || std::is_same::value> - ParallelFor_helper1 (T const& N, F&& f, TypeList, + std::enable_if_t || std::is_same_v> + ParallelFor_helper1 (T const& N, F const& f, TypeList, RO const& runtime_options) { bool found_option = (false || ... || - ParallelFor_helper2(N, std::forward(f), + ParallelFor_helper2(N, f, PPs{}, runtime_options)); amrex::ignore_unused(found_option); AMREX_ASSERT(found_option); } template - std::enable_if_t::value> - ParallelFor_helper1 (Box const& box, T ncomp, F&& f, TypeList, + std::enable_if_t> + ParallelFor_helper1 (Box const& box, T ncomp, F const& f, TypeList, RO const& runtime_options) { bool found_option = (false || ... || - ParallelFor_helper2(box, ncomp, std::forward(f), + ParallelFor_helper2(box, ncomp, f, PPs{}, runtime_options)); amrex::ignore_unused(found_option); AMREX_ASSERT(found_option); @@ -93,7 +93,7 @@ namespace detail #endif template -std::enable_if_t::value> +std::enable_if_t> ParallelFor (TypeList /*list_of_compile_time_options*/, std::array const& runtime_options, T N, F&& f) @@ -124,7 +124,7 @@ void ParallelFor (TypeList /*list_of_compile_time_options*/, } template -std::enable_if_t::value> +std::enable_if_t> ParallelFor (TypeList /*list_of_compile_time_options*/, std::array const& runtime_options, Box const& box, T ncomp, F&& f) @@ -184,7 +184,7 @@ ParallelFor (TypeList /*list_of_compile_time_options*/, * \param f a callable object taking an integer and working on that iteration. */ template -std::enable_if_t::value> +std::enable_if_t> ParallelFor (TypeList ctos, std::array const& option, T N, F&& f) @@ -292,7 +292,7 @@ void ParallelFor (TypeList ctos, * \param f a callable object taking three integers and working on the given cell. */ template -std::enable_if_t::value> +std::enable_if_t> ParallelFor (TypeList ctos, std::array const& option, Box const& box, T ncomp, F&& f) diff --git a/Src/Base/AMReX_CoordSys.H b/Src/Base/AMReX_CoordSys.H index d558ba053db..24096c6f428 100644 --- a/Src/Base/AMReX_CoordSys.H +++ b/Src/Base/AMReX_CoordSys.H @@ -20,7 +20,6 @@ class FArrayBox; * * Routines for mapping between physical coordinate system and index space. */ - class CoordSys { public: @@ -54,7 +53,7 @@ public: BL_ASSERT(c_sys != undef); return (c_sys == RZ); } - //! Is CoordType == cartesion? + //! Is CoordType == cartesian? 
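Context for the `CTOParallelForImpl.H` hunks: this machinery matches a runtime option array against every compile-time combination and launches the one instantiation that matches, so option branches can be resolved with `if constexpr` inside the kernel. A sketch of a call site; the option meaning and kernel body are made up for illustration:

```cpp
#include <AMReX_CTOParallelForImpl.H> // normally reached via the GPU launch headers
#include <AMReX_Array4.H>
#include <AMReX_GpuQualifiers.H>

using namespace amrex;

void scale_or_shift (Box const& box, Array4<Real> const& a, int do_scale)
{
    ParallelFor(TypeList<CompileTimeOptions<0,1>>{}, // compile both variants
                {do_scale},                          // select one at runtime
                box,
                [=] AMREX_GPU_DEVICE (int i, int j, int k, auto scale)
    {
        if constexpr (scale.value == 1) {
            a(i,j,k) *= Real(0.5); // only this variant pays for the multiply
        } else {
            a(i,j,k) += Real(1.0);
        }
    });
}
```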
[[nodiscard]] bool IsCartesian () const noexcept { BL_ASSERT(c_sys != undef); return (c_sys == cartesian); } diff --git a/Src/Base/AMReX_CoordSys.cpp b/Src/Base/AMReX_CoordSys.cpp index 888d2cc10c9..757af532cc5 100644 --- a/Src/Base/AMReX_CoordSys.cpp +++ b/Src/Base/AMReX_CoordSys.cpp @@ -482,6 +482,8 @@ CoordSys::AreaLo (const IntVect& point, int dir) const noexcept // NOLINT(readab { case 0: return dx[1]; case 1: return dx[0]; + default: + AMREX_ASSERT(0); } return 0._rt; // to silent compiler warning case RZ: @@ -490,6 +492,8 @@ CoordSys::AreaLo (const IntVect& point, int dir) const noexcept // NOLINT(readab { case 0: return Real(TWOPI)*dx[1]*xlo[0]; case 1: return ((xlo[0]+dx[0])*(xlo[0]+dx[0])-xlo[0]*xlo[0])*static_cast(0.5*TWOPI); + default: + AMREX_ASSERT(0); } return 0._rt; // to silent compiler warning default: @@ -502,6 +506,8 @@ CoordSys::AreaLo (const IntVect& point, int dir) const noexcept // NOLINT(readab case 0: return dx[1]*dx[2]; case 1: return dx[0]*dx[2]; case 2: return dx[1]*dx[0]; + default: + AMREX_ASSERT(0); } #endif return 0; @@ -520,6 +526,8 @@ CoordSys::AreaHi (const IntVect& point, int dir) const noexcept // NOLINT(readab { case 0: return dx[1]; case 1: return dx[0]; + default: + AMREX_ASSERT(0); } return 0._rt; // to silent compiler warning case RZ: @@ -528,6 +536,8 @@ CoordSys::AreaHi (const IntVect& point, int dir) const noexcept // NOLINT(readab { case 0: return Real(TWOPI)*dx[1]*xhi[0]; case 1: return (xhi[0]*xhi[0]-(xhi[0]-dx[0])*(xhi[0]-dx[0]))*static_cast(TWOPI*0.5); + default: + AMREX_ASSERT(0); } return 0._rt; // to silent compiler warning default: @@ -540,6 +550,8 @@ CoordSys::AreaHi (const IntVect& point, int dir) const noexcept // NOLINT(readab case 0: return dx[1]*dx[2]; case 1: return dx[0]*dx[2]; case 2: return dx[1]*dx[0]; + default: + AMREX_ASSERT(0); } #endif return 0._rt; diff --git a/Src/Base/AMReX_Dim3.H b/Src/Base/AMReX_Dim3.H index 9aee68330c4..320503afe55 100644 --- a/Src/Base/AMReX_Dim3.H +++ b/Src/Base/AMReX_Dim3.H @@ -13,8 +13,8 @@ struct Dim3 { int x; int y; int z; }; struct XDim3 { Real x; Real y; Real z; }; template ::value || - std::is_same::value>::type* = nullptr> + std::enable_if_t || + std::is_same_v>* = nullptr> std::ostream& operator<< (std::ostream& os, const T& d) { os << '(' << d.x << ',' << d.y << ',' << d.z << ')'; diff --git a/Src/Base/AMReX_DistributionMapping.H b/Src/Base/AMReX_DistributionMapping.H index 0707532a0fc..e9aa82f16a2 100644 --- a/Src/Base/AMReX_DistributionMapping.H +++ b/Src/Base/AMReX_DistributionMapping.H @@ -37,7 +37,6 @@ class FabArrayBase; * BoxArray are as equal across CPUs as is possible. The SFC distribution is * based on a space filling curve. */ - class DistributionMapping { public: diff --git a/Src/Base/AMReX_DistributionMapping.cpp b/Src/Base/AMReX_DistributionMapping.cpp index 9350f9203d1..958f7ef9ec6 100644 --- a/Src/Base/AMReX_DistributionMapping.cpp +++ b/Src/Base/AMReX_DistributionMapping.cpp @@ -221,9 +221,9 @@ DistributionMapping::LeastUsedCPUs (int nprocs, } if (flag_verbose_mapper) { - Print() << "LeastUsedCPUs:" << std::endl; + Print() << "LeastUsedCPUs:" << '\n'; for (const auto &p : LIpairV) { - Print() << " Rank " << p.second << " contains " << p.first << std::endl; + Print() << " Rank " << p.second << " contains " << p.first << '\n'; } } #else @@ -368,7 +368,7 @@ DistributionMapping::RoundRobinDoIt (int nboxes, bool sort) { if (flag_verbose_mapper) { - Print() << "DM: RoundRobinDoIt called..." << std::endl; + Print() << "DM: RoundRobinDoIt called..." 
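The stream changes running through `DistributionMapping.cpp` replace `std::endl` with `'\n'`. Both end the line; `std::endl` additionally requests a flush on every insertion, which is pointless inside these verbose-mapper loops (and `amrex::Print` already emits its buffered text when the temporary is destroyed). The pattern:

```cpp
#include <AMReX_Print.H>

void log_rank (int rank, long npts)
{
    // '\n' appends a newline to Print's internal buffer; the whole message
    // is emitted once, when the temporary Print object goes out of scope.
    amrex::Print() << "Rank " << rank << " contains " << npts << '\n';
    // With std::endl, each insertion would also request a flush, which is
    // the per-line overhead this diff removes.
}
```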
<< '\n'; } int nprocs = ParallelContext::NProcsSub(); @@ -426,7 +426,7 @@ DistributionMapping::RoundRobinDoIt (int nboxes, m_ref->m_pmap[(*LIpairV)[i].second] = ParallelContext::local_to_global_rank(rank); if (flag_verbose_mapper) { Print() << " Mapping box " << (*LIpairV)[i].second << " of size " - << (*LIpairV)[i].first << " to rank " << rank << std::endl; + << (*LIpairV)[i].first << " to rank " << rank << '\n'; } } } @@ -439,7 +439,7 @@ DistributionMapping::RoundRobinDoIt (int nboxes, int rank = tid*nworkers + wrkerord[tid][wid]; m_ref->m_pmap[i] = ParallelContext::local_to_global_rank(rank); if (flag_verbose_mapper) { - Print() << " Mapping box " << i << " to rank " << rank << std::endl; + Print() << " Mapping box " << i << " to rank " << rank << '\n'; } } } @@ -570,7 +570,7 @@ struct WeightedBoxList } }; -static +namespace { void knapsack (const std::vector& wgts, int nprocs, @@ -712,6 +712,7 @@ top: ; } } } +} void DistributionMapping::KnapSackDoIt (const std::vector& wgts, @@ -722,7 +723,7 @@ DistributionMapping::KnapSackDoIt (const std::vector& wgts, bool sort) { if (flag_verbose_mapper) { - Print() << "DM: KnapSackDoIt called..." << std::endl; + Print() << "DM: KnapSackDoIt called..." << '\n'; } BL_PROFILE("DistributionMapping::KnapSackDoIt()"); @@ -747,9 +748,9 @@ DistributionMapping::KnapSackDoIt (const std::vector& wgts, if (flag_verbose_mapper) { for (int i = 0, ni = static_cast(vec.size()); i < ni; ++i) { - Print() << " Bucket " << i << " contains boxes:" << std::endl; + Print() << " Bucket " << i << " contains boxes:" << '\n'; for (int x : vec[i]) { - Print() << " " << x << std::endl; + Print() << " " << x << '\n'; } } } @@ -774,7 +775,7 @@ DistributionMapping::KnapSackDoIt (const std::vector& wgts, if (flag_verbose_mapper) { for (const auto &p : LIpairV) { - Print() << " Bucket " << p.second << " total weight: " << p.first << std::endl; + Print() << " Bucket " << p.second << " total weight: " << p.first << '\n'; } } @@ -811,7 +812,7 @@ DistributionMapping::KnapSackDoIt (const std::vector& wgts, const int N = static_cast(vi.size()); if (flag_verbose_mapper) { - Print() << " Mapping bucket " << idx << " to rank " << tid << std::endl; + Print() << " Mapping bucket " << idx << " to rank " << tid << '\n'; } if (nteams == nprocs) { @@ -1171,7 +1172,7 @@ namespace { } } -static +namespace { void Distribute (const std::vector& tokens, const std::vector& wgts, @@ -1183,14 +1184,14 @@ Distribute (const std::vector& tokens, BL_PROFILE("DistributionMapping::Distribute()"); if (flag_verbose_mapper) { - Print() << "Distribute:" << std::endl; - Print() << " volpercpu: " << volpercpu << std::endl; - Print() << " Sorted SFC Tokens:" << std::endl; + Print() << "Distribute:" << '\n' + << " volpercpu: " << volpercpu << '\n' + << " Sorted SFC Tokens:" << '\n'; int idx = 0; for (const auto &t : tokens) { Print() << " " << idx++ << ": " << t.m_box << ": " - << t.m_morton << std::endl; + << t.m_morton << '\n'; } } @@ -1227,10 +1228,10 @@ Distribute (const std::vector& tokens, } if (flag_verbose_mapper) { - Print() << "Distributed SFC Tokens:" << std::endl; + Print() << "Distributed SFC Tokens:" << '\n'; int idx = 0; for (int i = 0; i < nprocs; ++i) { - Print() << " Rank/Team " << i << ":" << std::endl; + Print() << " Rank/Team " << i << ":" << '\n'; Real rank_vol = 0; for (const auto &box : v[i]) { amrex::ignore_unused(box); @@ -1238,11 +1239,11 @@ Distribute (const std::vector& tokens, BL_ASSERT(box == t.m_box); Print() << " " << idx << ": " << t.m_box << ": " - << t.m_morton << std::endl; + << 
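`knapsack` here (and `Distribute` further down) switch from `static` to an unnamed namespace. Both give internal linkage; the unnamed namespace is the spelling the core guidelines and clang-tidy prefer because it also covers types and templates. Shape of the change:

```cpp
// Old: static int helper (int x) { return 2*x; }

// New: everything inside an unnamed namespace has internal linkage,
// so the helper stays private to this translation unit.
namespace {
int helper (int x) { return 2*x; }
}

int use_helper () { return helper(21); }
```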
t.m_morton << '\n'; rank_vol += static_cast(wgts[t.m_box]); idx++; } - Print() << " Total Rank Vol: " << rank_vol << std::endl; + Print() << " Total Rank Vol: " << rank_vol << '\n'; } } @@ -1254,6 +1255,7 @@ Distribute (const std::vector& tokens, BL_ASSERT(cnt == tokens.size()); #endif } +} void DistributionMapping::SFCProcessorMapDoIt (const BoxArray& boxes, @@ -1263,7 +1265,7 @@ DistributionMapping::SFCProcessorMapDoIt (const BoxArray& boxes, Real* eff) { if (flag_verbose_mapper) { - Print() << "DM: SFCProcessorMapDoIt called..." << std::endl; + Print() << "DM: SFCProcessorMapDoIt called..." << '\n'; } BL_PROFILE("DistributionMapping::SFCProcessorMapDoIt()"); @@ -1339,7 +1341,7 @@ DistributionMapping::SFCProcessorMapDoIt (const BoxArray& boxes, if (flag_verbose_mapper) { for (const auto &p : LIpairV) { - Print() << " Bucket " << p.second << " contains " << p.first << std::endl; + Print() << " Bucket " << p.second << " contains " << p.first << '\n'; } } @@ -1382,7 +1384,7 @@ DistributionMapping::SFCProcessorMapDoIt (const BoxArray& boxes, const int Nbx = static_cast(vi.size());// # of boxes assigned to this team if (flag_verbose_mapper) { - Print() << "Mapping bucket " << LIpairV[i].second << " to rank " << ord[i] << std::endl; + Print() << "Mapping bucket " << LIpairV[i].second << " to rank " << ord[i] << '\n'; } if (nteams == nprocs) { // In this case, team id is process id. diff --git a/Src/Base/AMReX_Extension.H b/Src/Base/AMReX_Extension.H index cc299c0823e..065d23e1086 100644 --- a/Src/Base/AMReX_Extension.H +++ b/Src/Base/AMReX_Extension.H @@ -112,6 +112,9 @@ #elif defined(__GNUC__) #define AMREX_FORCE_INLINE inline __attribute__((always_inline)) +#elif defined(_MSC_VER) +#define AMREX_FORCE_INLINE inline __forceinline + #else #define AMREX_FORCE_INLINE inline @@ -133,6 +136,27 @@ #define AMREX_NO_INLINE #endif +// flatten +#if defined(_MSC_VER) +#define AMREX_FLATTEN [[msvc::flatten]] +#elif defined(__clang__) || defined(__GNUC__) +#define AMREX_FLATTEN __attribute__((flatten)) +#else +#define AMREX_FLATTEN +#endif + +// unroll loop +#define AMREX_TO_STRING_HELPER(X) #X +#define AMREX_TO_STRING(X) AMREX_TO_STRING_HELPER(X) + +#if defined(__clang__) || defined(__CUDACC__) || defined(__HIP__) || defined(__INTEL_CLANG_COMPILER) +#define AMREX_UNROLL_LOOP(n) _Pragma(AMREX_TO_STRING(unroll n)) +#elif defined(__GNUC__) +#define AMREX_UNROLL_LOOP(n) _Pragma(AMREX_TO_STRING(GCC unroll n)) +#else +#define AMREX_UNROLL_LOOP(n) +#endif + // __attribute__((weak)) #if defined(AMREX_TYPECHECK) @@ -178,6 +202,42 @@ # define AMREX_FALLTHROUGH ((void)0) #endif +// Note: following compilers support [[likely]] and [[unlikely]] +// - Clang >= 12.0 +// - GCC >= 9.0 +// - Intel >= 2021.7 +// - MSVC >= 19.26 +// - nvcc >= 12 +#if defined(__has_cpp_attribute) && __has_cpp_attribute(likely) >= 201803L +# define AMREX_LIKELY [[likely]] +# define AMREX_UNLIKELY [[unlikely]] +#else +# define AMREX_LIKELY +# define AMREX_UNLIKELY +#endif + +// Note: following compilers support assumptions, at least using builtin functions: +// - Clang >= 3.7 +// - GCC >= 5.1 +// - MSVC >= 19.20 +// - nvcc >= 11.1.0 +// - icx >= 2021.1.2 +#if defined(__has_cpp_attribute) && __has_cpp_attribute(assume) +# define AMREX_ASSUME(ASSUMPTION) [[assume(ASSUMPTION)]] +#else +# if defined(__CUDA_ARCH__) && defined(__CUDACC__) && ( (__CUDACC_VER_MAJOR__ > 11) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 2)) ) +# define AMREX_ASSUME(ASSUMPTION) __builtin_assume(ASSUMPTION) +# elif defined(AMREX_CXX_INTEL) || 
defined(__clang__) +# define AMREX_ASSUME(ASSUMPTION) __builtin_assume(ASSUMPTION) +# elif defined(_MSC_VER) +# define AMREX_ASSUME(ASSUMPTION) __assume(ASSUMPTION) +# elif defined(__GNUC__) +# define AMREX_ASSUME(ASSUMPTION) if (ASSUMPTION) {} else { __builtin_unreachable(); } +# else +# define AMREX_ASSUME(ASSUMPTION) +# endif +#endif + // CI uses -Werror -Wc++17-extension, thus we need to add the __cplusplus clause #if !defined(AMREX_NO_NODISCARD) && defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201603L # define AMREX_NODISCARD [[nodiscard]] diff --git a/Src/Base/AMReX_FACopyDescriptor.H b/Src/Base/AMReX_FACopyDescriptor.H index 5194ca7f876..6c05f069673 100644 --- a/Src/Base/AMReX_FACopyDescriptor.H +++ b/Src/Base/AMReX_FACopyDescriptor.H @@ -103,7 +103,6 @@ FabCopyDescriptor::~FabCopyDescriptor () * \brief This class orchestrates filling a destination fab of size destFabBox * from fabarray on the local processor (myProc). */ - template class FabArrayCopyDescriptor { @@ -468,7 +467,7 @@ FabArrayCopyDescriptor::CollectData () const int Who = it->procThatHasData; const auto Cnt = static_cast((it->box.numPts())*(it->nComp)); - RcvTags[Who].push_back(it); + RcvTags[Who].emplace_back(it); Total_Rcvs_Size += Cnt; @@ -698,7 +697,7 @@ FabArrayCopyDescriptor::CollectData () amrex::The_Arena()->free(md_recv_data); } - // Wait and upack data + // Wait and unpack data if (N_rcvs > 0) { Vector stats(N_rcvs); diff --git a/Src/Base/AMReX_FArrayBox.H b/Src/Base/AMReX_FArrayBox.H index 45e49ebab93..2dda4e6b0a0 100644 --- a/Src/Base/AMReX_FArrayBox.H +++ b/Src/Base/AMReX_FArrayBox.H @@ -23,10 +23,9 @@ class FArrayBox; * only want to write out 32 bit FABs. * * With the exception of the enumeration constants, this class is -* primarily for FArrayBox implementors; i.e. user's shouldn't +* primarily for FArrayBox implementers; i.e. user's shouldn't * call any of the member functions in this class directly. */ - class FABio // NOLINT(cppcoreguidelines-special-member-functions) { public: @@ -116,7 +115,7 @@ public: * \brief Pure virtual function. Derived classes MUST override this * function to skip over the next FAB f in the istream, under the * assumption that the header for the FAB f has already been - * skpped over. + * skipped over. */ virtual void skip (std::istream& is, FArrayBox& f) const = 0; @@ -224,7 +223,6 @@ private: * This class does NOT provide a copy constructor or assignment operator, * but it has a move constructor. */ - class FArrayBox : public BaseFab @@ -241,7 +239,7 @@ public: /** * \brief Construct an initial FAB with the data space allocated but - * not inititialized. ncomp is the number of components + * not initialized. ncomp is the number of components * (variables) at each data point in the Box. */ explicit FArrayBox (const Box& b, @@ -409,7 +407,7 @@ public: /** * \brief Set the FABio::Format in the program. * This is the preferred way to set the output format - * in "new" FABs. When desiging new programs, this should + * in "new" FABs. When designing new programs, this should * be the only function that needs to be called in order * to set the format. */ diff --git a/Src/Base/AMReX_FILCC_1D.F90 b/Src/Base/AMReX_FILCC_1D.F90 deleted file mode 100644 index 873f67f5941..00000000000 --- a/Src/Base/AMReX_FILCC_1D.F90 +++ /dev/null @@ -1,44 +0,0 @@ -#include - -! ----------------------------------------------------------- -!> This routine is intended to be a generic fill function -!! for cell-centered data. It knows how to extrapolate -!! 
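The new `AMReX_Extension.H` macros each wrap one optimizer hint in per-compiler spelling, expanding to nothing where unsupported: `AMREX_FLATTEN` asks for the whole call tree to be inlined, `AMREX_UNROLL_LOOP(n)` requests unrolling of a fixed-trip loop, `AMREX_LIKELY`/`AMREX_UNLIKELY` annotate branch weights, and `AMREX_ASSUME` feeds the optimizer an invariant it may rely on. A hypothetical function using several of them together:

```cpp
#include <AMReX_Extension.H>

AMREX_FORCE_INLINE AMREX_FLATTEN
double sum4 (const double* AMREX_RESTRICT p, int stride)
{
    AMREX_ASSUME(stride > 0);   // caller-guaranteed; a false assumption is UB
    double s = 0.0;
    AMREX_UNROLL_LOOP(4)
    for (int n = 0; n < 4; ++n) { s += p[n*stride]; }
    return s;
}
```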
and reflect data and is used to supplement the problem-specific -!! fill functions which call it. -!! -!! \param q <= array to fill -!! \param lo,hi => index extent of q array -!! \param domlo,domhi => index extent of problem domain -!! \param dx => cell spacing -!! \param xlo => physical location of lower left hand -!! corner of q array -!! \param bc => array of boundary flags bc(SPACEDIM,lo:hi) -!! -!! NOTE: all corner as well as edge data is filled if not EXT_DIR -! ----------------------------------------------------------- - -#ifndef AMREX_XSDK - -subroutine filcc(q,q_l1,q_h1,domlo,domhi,dx,xlo,bc) - - use amrex_fort_module - use amrex_filcc_module, only: filccn - - implicit none - - integer q_l1, q_h1 - integer domlo(1), domhi(1) - integer bc(1,2) - real(amrex_real) xlo(1), dx(1) - real(amrex_real) q(q_l1:q_h1) - - integer :: q_lo(3), q_hi(3) - - q_lo = [q_l1, 0, 0] - q_hi = [q_h1, 0, 0] - - call filccn(q_lo, q_hi, q, q_lo, q_hi, 1, domlo, domhi, dx, xlo, bc) - -end subroutine filcc - -#endif diff --git a/Src/Base/AMReX_FILCC_2D.F90 b/Src/Base/AMReX_FILCC_2D.F90 deleted file mode 100644 index 89ef77d8384..00000000000 --- a/Src/Base/AMReX_FILCC_2D.F90 +++ /dev/null @@ -1,60 +0,0 @@ -#include - -#ifndef AMREX_XSDK - -! ----------------------------------------------------------- -!> This routine is intended to be a generic fill function -!! for cell-centered data. It knows how to extrapolate -!! and reflect data and is used to supplement the problem-specific -!! fill functions which call it. -!! -!! \param q <= array to fill -!! \param lo,hi => index extent of q array -!! \param domlo,domhi => index extent of problem domain -!! \param dx => cell spacing -!! \param xlo => physical location of lower left hand -!! corner of q array -!! \param bc => array of boundary flags bc(SPACEDIM,lo:hi) -!! -!! NOTE: all corner as well as edge data is filled if not EXT_DIR -! ----------------------------------------------------------- - -subroutine filcc(q,q_l1,q_l2,q_h1,q_h2,domlo,domhi,dx,xlo,bc) - - use amrex_fort_module - use amrex_filcc_module, only: filccn - - implicit none - - integer q_l1, q_l2, q_h1, q_h2 - integer domlo(2), domhi(2) - integer bc(2,2) - real(amrex_real) xlo(2), dx(2) - real(amrex_real) q(q_l1:q_h1,q_l2:q_h2) - - integer :: q_lo(3), q_hi(3) - - q_lo = [q_l1, q_l2, 0] - q_hi = [q_h1, q_h2, 0] - - call filccn(q_lo, q_hi, q, q_lo, q_hi, 1, domlo, domhi, dx, xlo, bc) - -end subroutine filcc - -subroutine hoextraptocc(q,q_l1,q_l2,q_h1,q_h2,domlo,domhi,dx,xlo) - - use amrex_fort_module - use amrex_filcc_module, only : amrex_hoextraptocc_2d - - implicit none - - integer q_l1, q_l2, q_h1, q_h2 - integer domlo(2), domhi(2) - real(amrex_real) xlo(2), dx(2) - real(amrex_real) q(q_l1:q_h1,q_l2:q_h2) - - call amrex_hoextraptocc_2d(q,q_l1,q_l2,q_h1,q_h2,domlo,domhi,dx,xlo) - -end subroutine hoextraptocc - -#endif diff --git a/Src/Base/AMReX_FILCC_3D.F90 b/Src/Base/AMReX_FILCC_3D.F90 deleted file mode 100644 index aa3fec74ab0..00000000000 --- a/Src/Base/AMReX_FILCC_3D.F90 +++ /dev/null @@ -1,63 +0,0 @@ -#include - -#ifndef AMREX_XSDK - -! ----------------------------------------------------------- -!> This routine is intended to be a generic fill function -!! for cell centered data. It knows how to exrapolate, -!! and reflect data and can be used to supplement problem -!! specific fill functions (ie. EXT_DIR). -!! -!! \param q <= array to fill -!! \param q_l1,q_l2,q_l3,q_h1,q_h2,q_h3 => index extent of q array -!! \param domlo,hi => index extent of problem domain -!! 
\param dx => cell spacing -!! \param xlo => physical location of lower left hand -!! corner of q array -!! \param bc => array of boundary flags bc(SPACEDIM,lo:hi) -!! -!! NOTE: corner data not used in computing soln but must have -!! reasonable values for arithmetic to live -! ----------------------------------------------------------- - -subroutine filcc(q,q_l1,q_l2,q_l3,q_h1,q_h2,q_h3,domlo,domhi,dx,xlo,bc) - - use amrex_fort_module, only: rt => amrex_real - use amrex_filcc_module, only: filccn - - implicit none - - integer, intent(in ) :: q_l1, q_l2, q_l3, q_h1, q_h2, q_h3 - integer, intent(in ) :: domlo(3), domhi(3) - real(rt), intent(in ) :: xlo(3), dx(3) - real(rt), intent(inout) :: q(q_l1:q_h1,q_l2:q_h2,q_l3:q_h3) - integer, intent(in ) :: bc(3,2) - - integer :: q_lo(3), q_hi(3) - - q_lo = [q_l1, q_l2, q_l3] - q_hi = [q_h1, q_h2, q_h3] - - call filccn(q_lo, q_hi, q, q_lo, q_hi, 1, domlo, domhi, dx, xlo, bc) - -end subroutine filcc - - - -subroutine hoextraptocc(q,q_l1,q_l2,q_l3,q_h1,q_h2,q_h3,domlo,domhi,dx,xlo) - - use amrex_fort_module, only: rt => amrex_real - use amrex_filcc_module, only : amrex_hoextraptocc_3d - - implicit none - - integer, intent(in ) :: q_l1, q_l2, q_l3, q_h1, q_h2, q_h3 - integer, intent(in ) :: domlo(3), domhi(3) - real(rt), intent(in ) :: xlo(3), dx(3) - real(rt), intent(inout) :: q(q_l1:q_h1,q_l2:q_h2,q_l3:q_h3) - - call amrex_hoextraptocc_3d(q,q_l1,q_l2,q_l3,q_h1,q_h2,q_h3,domlo,domhi,dx,xlo) - -end subroutine hoextraptocc - -#endif diff --git a/Src/Base/AMReX_FPC.H b/Src/Base/AMReX_FPC.H index 77c4dfa923b..8975ed8e9b9 100644 --- a/Src/Base/AMReX_FPC.H +++ b/Src/Base/AMReX_FPC.H @@ -15,7 +15,6 @@ namespace amrex { * namespaces, and we don't like global constants, we make them static * constant data members of this class. */ - class FPC { public: diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index a8839a4bcc0..b09156b6606 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -39,81 +40,30 @@ #include #endif +#include #include #include #include +#include #include -#include -#include #include #include +#include + namespace amrex { -template ::value,int>::type = 0> +template ::value,int> = 0> Long nBytesOwned (T const&) noexcept { return 0; } template Long nBytesOwned (BaseFab const& fab) noexcept { return fab.nBytesOwned(); } -/* - A Collection of Fortran Array-like Objects - - - The FabArray class implements a collection (stored as an array) of - Fortran array-like objects. The parameterized type FAB is intended to be - any class derived from BaseFab. For example, FAB may be a BaseFab of - integers, so we could write: - - FabArray > int_fabs; - - Then int_fabs is a FabArray that can hold a collection of BaseFab - objects. - - FabArray is not just a general container class for Fortran arrays. It is - intended to hold "grid" data for use in finite difference calculations in - which the data is defined on a union of (usually disjoint) rectangular - regions embedded in a uniform index space. This region, called the valid - region, is represented by a BoxArray. For the purposes of this discussion, - the Kth Box in the BoxArray represents the interior region of the Kth grid. - - Since the intent is to be used with finite difference calculations a - FabArray also includes the notion of a boundary region for each grid. 
The - boundary region is specified by the ngrow parameter which tells the FabArray - to allocate each FAB to be ngrow cells larger in all directions than the - underlying Box. The larger region covered by the union of all the FABs is - called the region of definition. The underlying notion is that the valid - region contains the grid interior data and the region of definition includes - the interior region plus the boundary areas. - - Operations are available to copy data from the valid regions into these - boundary areas where the two overlap. The number of components, that is, - the number of values that can be stored in each cell of a FAB, is either - given as an argument to the constructor or is inherent in the definition of - the underlying FAB. Each FAB in the FabArray will have the same number of - components. - - In summary, a FabArray is an array of FABs. The Kth element contains a FAB - that holds the data for the Kth grid, a Box that defines the valid region - of the Kth grid. - - A typical use for a FabArray would be to hold the solution vector or - right-hand-side when solving a linear system of equations on a union of - rectangular grids. The copy operations would be used to copy data from the - valid regions of neighboring grids into the boundary regions after each - relaxation step of the iterative method. If a multigrid method is used, a - FabArray could be used to hold the data at each level in the multigrid - hierarchy. - - This class is a concrete class not a polymorphic one. - - This class does NOT provide a copy constructor or assignment operator. -*/ - -// -// alloc: allocate memory or not -// +/** + * \brief FabArray memory allocation information + */ struct MFInfo { + // alloc: allocate memory or not bool alloc = true; Arena* arena = nullptr; Vector tags; @@ -198,11 +148,8 @@ struct MultiArray4 { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Array4 const& operator[] (int li) const noexcept { -#if AMREX_DEVICE_COMPILE - return dp[li]; -#else - return hp[li]; -#endif + AMREX_IF_ON_DEVICE((return dp[li];)) + AMREX_IF_ON_HOST((return hp[li];)) } #ifdef AMREX_USE_GPU @@ -317,6 +264,60 @@ Add (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, int } } +/** + * \brief An Array of FortranArrayBox(FAB)-like Objects + * + * The FabArray class implements a collection (stored as an array) of + * Fortran array box-like ( \p FAB ) objects. The parameterized type \p FAB is intended to be + * any class derived from BaseFab. For example, \p FAB may be a BaseFab of + * integers, so we could write: + * + * FabArray > int_fabs; + * + * Then int_fabs is a FabArray that can hold a collection of BaseFab + * objects. + * + * FabArray is not just a general container class for Fortran arrays. It is + * intended to hold "grid" data for use in finite difference calculations in + * which the data is defined on a union of (usually disjoint) rectangular + * regions embedded in a uniform index space. This region, called the valid + * region, is represented by a BoxArray. For the purposes of this discussion, + * the Kth Box in the BoxArray represents the interior region of the Kth grid. + * + * Since the intent is to be used with finite difference calculations a + * FabArray also includes the notion of a boundary region for each grid. The + * boundary region is specified by the ngrow parameter which tells the FabArray + * to allocate each \p FAB to be ngrow cells larger in all directions than the + * underlying Box. 
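In `MultiArray4::operator[]` the `#if AMREX_DEVICE_COMPILE` preprocessor branch becomes `AMREX_IF_ON_DEVICE`/`AMREX_IF_ON_HOST`. Both arms are now always compiled, and the macros select the live one for the current target, which also works for single-pass compilers where a `__CUDA_ARCH__`-style preprocessor test is unavailable. The pattern in isolation (header choices are my assumption about where these macros live):

```cpp
#include <AMReX_Extension.H>
#include <AMReX_GpuQualifiers.H>

// Note the double parentheses: the macro argument is a statement list.
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
int pick (int on_device, int on_host)
{
    AMREX_IF_ON_DEVICE((return on_device;))
    AMREX_IF_ON_HOST((return on_host;))
}
```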
The larger region covered by the union of all the \p FABs is + * called the region of definition. The underlying notion is that the valid + * region contains the grid interior data and the region of definition includes + * the interior region plus the boundary areas. + * + * Operations are available to copy data from the valid regions into these + * boundary areas where the two overlap. The number of components, that is, + * the number of values that can be stored in each cell of a \p FAB, is either + * given as an argument to the constructor or is inherent in the definition of + * the underlying \p FAB. Each \p FAB in the FabArray will have the same number of + * components. + * + * In summary, a FabArray is an array of \p FABs. The Kth element contains a \p FAB + * that holds the data for the Kth grid, a Box that defines the valid region + * of the Kth grid. + * + * A typical use for a FabArray would be to hold the solution vector or + * right-hand-side when solving a linear system of equations on a union of + * rectangular grids. The copy operations would be used to copy data from the + * valid regions of neighboring grids into the boundary regions after each + * relaxation step of the iterative method. If a multigrid method is used, a + * FabArray could be used to hold the data at each level in the multigrid + * hierarchy. + * + * This class is a concrete class not a polymorphic one. + * + * This class does NOT provide a copy constructor or assignment operator. + * + * \tparam FAB FortranArrayBox-like object. Typically a derived class of BaseFab. Not to be confused with FabArrayBase. + */ template class FabArray : @@ -332,7 +333,7 @@ public: * if FAB is a BaseFab or its child, value_type = FAB::value_type * else value_type = FAB; */ - using value_type = typename std::conditional::value, FAB, FABType>::type::value_type; + using value_type = typename std::conditional_t::value, FAB, FABType>::value_type; using fab_type = FAB; @@ -341,8 +342,9 @@ public: FabArray () noexcept; /** - * \brief Construct an empty FabArray that has a default Arena. If - * `define` is called later with a nulltpr as MFInfo's arena, the + * \brief Construct an empty FabArray that has a default Arena. + * + * If `define` is called later with a nullptr as MFInfo's arena, the * default Arena `a` will be used. If the arena in MFInfo is not a * nullptr, the MFInfo's arena will be used. 
*/ @@ -499,69 +501,69 @@ public: FAB * fabPtr (int K) noexcept; // Here K is global index FAB const* fabPtr (int K) const noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> void prefetchToHost (const MFIter& mfi) const noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> void prefetchToDevice (const MFIter& mfi) const noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type const> array (const MFIter& mfi) const noexcept; // - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type> array (const MFIter& mfi) noexcept; // - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type const> array (int K) const noexcept; // - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type> array (int K) noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type const> const_array (const MFIter& mfi) const noexcept; // - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type const> const_array (int K) const noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type const> array (const MFIter& mfi, int start_comp) const noexcept; // - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type> array (const MFIter& mfi, int start_comp) noexcept; // - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type const> array (int K, int start_comp) const noexcept; // - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type> array (int K, int start_comp) noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type const> const_array (const MFIter& mfi, int start_comp) const noexcept; // - template ::value,int>::type = 0> + template ::value,int> = 0> Array4::value_type const> const_array (int K, int start_comp) const noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> MultiArray4::value_type> arrays () noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> MultiArray4::value_type const> arrays () const noexcept; - template ::value,int>::type = 0> + template ::value,int> = 0> MultiArray4::value_type const> const_arrays () const noexcept; //! Explicitly set the Kth FAB in the FabArray to point to elem. void setFab (int boxno, std::unique_ptr elem); //! Explicitly set the Kth FAB in the FabArray to point to elem. - template ::value,int> = 0> + template ,int> = 0> void setFab (int boxno, FAB&& elem); //! Explicitly set the FAB associated with mfi in the FabArray to point to elem. void setFab (const MFIter&mfi, std::unique_ptr elem); //! Explicitly set the FAB associated with mfi in the FabArray to point to elem. - template ::value,int> = 0> + template ,int> = 0> void setFab (const MFIter&mfi, FAB&& elem); //! Release ownership of the FAB. This function is not thread safe. @@ -614,11 +616,11 @@ public: IntVect const& nghost); //! Set all components in the entire region of each FAB to val. - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val); //! Set all components in the entire region of each FAB to val. - template ::value,int>::type = 0> + template ::value,int> = 0> FabArray& operator= (value_type val); /** @@ -626,13 +628,13 @@ public: * each FAB in the FabArray, starting at component comp to val. * Also set the value of nghost boundary cells. 
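Most of this header's churn is one mechanical substitution: the C++11 trait spellings (`typename std::enable_if<...>::type`, `std::is_same<...>::value`) become the terser `_t`/`_v` aliases. The SFINAE behavior is identical. Before and after, with a stand-in trait:

```cpp
#include <type_traits>

template <class T> struct IsBaseFabLike : std::false_type {};

// Old spelling:
template <class F,
          typename std::enable_if<IsBaseFabLike<F>::value, int>::type = 0>
void f_old (F const&) {}

// New spelling, as now used throughout FabArray.H:
template <class F,
          std::enable_if_t<IsBaseFabLike<F>::value, int> = 0>
void f_new (F const&) {}
```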
*/ - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, int comp, int ncomp, int nghost = 0); - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, int comp, int ncomp, @@ -644,14 +646,14 @@ public: * as nghost boundary cells, to val, provided they also intersect * with the Box region. */ - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, const Box& region, int comp, int ncomp, int nghost = 0); - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, const Box& region, int comp, @@ -661,10 +663,10 @@ public: * \brief Set all components in the valid region of each FAB in the * FabArray to val, including nghost boundary cells. */ - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, int nghost); - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, const IntVect& nghost); /** @@ -672,50 +674,50 @@ public: * FabArray to val, including nghost boundary cells, that also * intersect the Box region. */ - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, const Box& region, int nghost); - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, const Box& region, const IntVect& nghost); - template ::value,int>::type = 0> + template ::value,int> = 0> void abs (int comp, int ncomp, int nghost = 0); - template ::value,int>::type = 0> + template ::value,int> = 0> void abs (int comp, int ncomp, const IntVect& nghost); - template ::value,int>::type = 0> + template ::value,int> = 0> void plus (value_type val, int comp, int num_comp, int nghost = 0); - template ::value,int>::type = 0> + template ::value,int> = 0> void plus (value_type val, const Box& region, int comp, int num_comp, int nghost = 0); - template ::value,int>::type = 0> + template ::value,int> = 0> void mult (value_type val, int comp, int num_comp, int nghost = 0); - template ::value,int>::type = 0> + template ::value,int> = 0> void mult (value_type val, const Box& region, int comp, int num_comp, int nghost = 0); - template ::value,int>::type = 0> + template ::value,int> = 0> void invert (value_type numerator, int comp, int num_comp, int nghost = 0); - template ::value,int>::type = 0> + template ::value,int> = 0> void invert (value_type numerator, const Box& region, int comp, int num_comp, int nghost = 0); //! Set all values in the boundary region to val. - template ::value,int>::type = 0> + template ::value,int> = 0> void setBndry (value_type val); //! Set ncomp values in the boundary region, starting at start_comp to val. - template ::value,int>::type = 0> + template ::value,int> = 0> void setBndry (value_type val, int strt_comp, int ncomp); //! Set all values outside the Geometry domain to val. - template ::value,int>::type = 0> + template ::value,int> = 0> void setDomainBndry (value_type val, const Geometry& geom); //! Set ncomp values outside the Geometry domain to val, starting at start_comp. 
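For orientation, the `setVal` family whose declarations are touched here is the "fill with a constant" API. Typical calls, with illustrative values and ghost counts:

```cpp
#include <AMReX_MultiFab.H>
#include <AMReX_Geometry.H>

void init (amrex::MultiFab& mf, amrex::Box const& region,
           amrex::Geometry const& geom)
{
    mf.setVal(0.0);               // all components, valid and ghost cells
    mf.setVal(1.0, 0, 1, 2);      // component 0 only, 2 ghost cells deep
    mf.setVal(2.0, region, 0, 1); // component 0, restricted to `region`
    mf.setBndry(-1.0);            // ghost (boundary) cells only
    mf.setDomainBndry(0.0, geom); // ghost cells outside the physical domain
}
```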
- template ::value,int>::type = 0> + template ::value,int> = 0> void setDomainBndry (value_type val, int strt_comp, int ncomp, const Geometry& geom); /** @@ -1001,7 +1003,7 @@ public: void FillBoundary_nowait (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period, bool cross = false); template ::value,int>::type = 0> + class F=FAB, std::enable_if_t::value,int> = 0> void FillBoundary_finish (); void FillBoundary_test (); @@ -1110,7 +1112,7 @@ public: // (including ghost cells outside periodic boundaries) // physbnd : boundary cells outside the domain (excluding periodic boundaries) // interior : interior cells (i.e., valid cells) - template ::value,int>::type = 0> + template ::value,int> = 0> void BuildMask (const Box& phys_domain, const Periodicity& period, value_type covered, value_type notcovered, value_type physbnd, value_type interior); @@ -1118,7 +1120,7 @@ public: // The following are private functions. But we have to make them public for cuda. template ::value,int>::type = 0> + class F=FAB, std::enable_if_t::value,int> = 0> void FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period, bool cross, bool enforce_periodicity_only = false, @@ -1128,10 +1130,10 @@ public: void PC_local_cpu (const CPC& thecpc, FabArray const& src, int scomp, int dcomp, int ncomp, CpOp op); - template ::value,int>::type = 0> + template ::value,int> = 0> void setVal (value_type val, const CommMetaData& thecmd, int scomp, int ncomp); - template ::value,int>::type = 0> + template ::value,int> = 0> LayoutData RecvLayoutMask (const CommMetaData& thecmd); #ifdef AMREX_USE_GPU @@ -1308,7 +1310,7 @@ private: void setFab_assert (int K, FAB const& fab) const; - template ::value,int>::type = 0> + template ::value,int> = 0> void build_arrays () const; void clear_arrays (); @@ -1490,7 +1492,7 @@ FabArray::fabPtr (int K) const noexcept } template -template ::value,int>::type> +template ::value,int>> void FabArray::prefetchToHost (const MFIter& mfi) const noexcept { @@ -1502,7 +1504,7 @@ FabArray::prefetchToHost (const MFIter& mfi) const noexcept } template -template ::value,int>::type> +template ::value,int>> void FabArray::prefetchToDevice (const MFIter& mfi) const noexcept { @@ -1514,7 +1516,7 @@ FabArray::prefetchToDevice (const MFIter& mfi) const noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type const> FabArray::array (const MFIter& mfi) const noexcept { @@ -1522,7 +1524,7 @@ FabArray::array (const MFIter& mfi) const noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type> FabArray::array (const MFIter& mfi) noexcept { @@ -1530,7 +1532,7 @@ FabArray::array (const MFIter& mfi) noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type const> FabArray::array (int K) const noexcept { @@ -1538,7 +1540,7 @@ FabArray::array (int K) const noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type> FabArray::array (int K) noexcept { @@ -1546,7 +1548,7 @@ FabArray::array (int K) noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type const> FabArray::const_array (const MFIter& mfi) const noexcept { @@ -1554,7 +1556,7 @@ FabArray::const_array (const MFIter& mfi) const noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type const> FabArray::const_array (int K) const noexcept { @@ -1562,7 +1564,7 @@ FabArray::const_array (int K) const noexcept } template -template 
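The `FillBoundary_nowait`/`FillBoundary_finish` pair whose signatures are updated here splits the ghost-cell exchange so local computation can overlap communication; `FBEP_nowait` is the engine behind both. A typical caller:

```cpp
#include <AMReX_MultiFab.H>
#include <AMReX_Geometry.H>

void exchange (amrex::MultiFab& mf, amrex::Geometry const& geom)
{
    mf.FillBoundary_nowait(geom.periodicity()); // post sends and receives
    // ... work that does not read ghost cells can run here ...
    mf.FillBoundary_finish();                   // wait and unpack
}
```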
::value,int>::type> +template ::value,int>> Array4::value_type const> FabArray::array (const MFIter& mfi, int start_comp) const noexcept { @@ -1570,7 +1572,7 @@ FabArray::array (const MFIter& mfi, int start_comp) const noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type> FabArray::array (const MFIter& mfi, int start_comp) noexcept { @@ -1578,7 +1580,7 @@ FabArray::array (const MFIter& mfi, int start_comp) noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type const> FabArray::array (int K, int start_comp) const noexcept { @@ -1586,7 +1588,7 @@ FabArray::array (int K, int start_comp) const noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type> FabArray::array (int K, int start_comp) noexcept { @@ -1594,7 +1596,7 @@ FabArray::array (int K, int start_comp) noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type const> FabArray::const_array (const MFIter& mfi, int start_comp) const noexcept { @@ -1602,7 +1604,7 @@ FabArray::const_array (const MFIter& mfi, int start_comp) const noexcept } template -template ::value,int>::type> +template ::value,int>> Array4::value_type const> FabArray::const_array (int K, int start_comp) const noexcept { @@ -1610,7 +1612,7 @@ FabArray::const_array (int K, int start_comp) const noexcept } template -template ::value,int>::type> +template ::value,int>> MultiArray4::value_type> FabArray::arrays () noexcept { @@ -1619,7 +1621,7 @@ FabArray::arrays () noexcept } template -template ::value,int>::type> +template ::value,int>> MultiArray4::value_type const> FabArray::arrays () const noexcept { @@ -1628,7 +1630,7 @@ FabArray::arrays () const noexcept } template -template ::value,int>::type> +template ::value,int>> MultiArray4::value_type const> FabArray::const_arrays () const noexcept { @@ -1637,7 +1639,7 @@ FabArray::const_arrays () const noexcept } template -template ::value,int>::type> +template ::value,int>> void FabArray::build_arrays () const { @@ -1781,7 +1783,7 @@ FabArray::LocalAdd (FabArray const& src, int scomp, int dcomp, int nco } template -template ::value,int>::type> +template ::value,int>> void FabArray::setVal (value_type val, int nghost) { @@ -1789,7 +1791,7 @@ FabArray::setVal (value_type val, int nghost) } template -template ::value,int>::type> +template ::value,int>> void FabArray::setVal (value_type val, const IntVect& nghost) { @@ -1797,7 +1799,7 @@ FabArray::setVal (value_type val, const IntVect& nghost) } template -template ::value,int>::type> +template ::value,int>> void FabArray::setVal (value_type val, const Box& region, int nghost) { @@ -1805,7 +1807,7 @@ FabArray::setVal (value_type val, const Box& region, int nghost) } template -template ::value,int>::type> +template ::value,int>> void FabArray::setVal (value_type val, const Box& region, const IntVect& nghost) { @@ -2158,7 +2160,7 @@ FabArray::setFab (int boxno, std::unique_ptr elem) } template -template ::value,int> > +template ,int> > void FabArray::setFab (int boxno, FAB&& elem) { @@ -2201,7 +2203,7 @@ FabArray::setFab (const MFIter& mfi, std::unique_ptr elem) } template -template ::value,int> > +template ,int> > void FabArray::setFab (const MFIter& mfi, FAB&& elem) { @@ -2223,7 +2225,7 @@ FabArray::setFab (const MFIter& mfi, FAB&& elem) } template -template ::value,int>::type> +template ::value,int>> void FabArray::setBndry (value_type val) { @@ -2231,7 +2233,7 @@ FabArray::setBndry (value_type val) } template -template 
::value,int>::type Z> +template ::value,int>Z> void FabArray::setBndry (value_type val, int strt_comp, @@ -2332,7 +2334,7 @@ FabArray::setBndry (value_type val, } template -template ::value,int>::type> +template ::value,int>> void FabArray::setDomainBndry (value_type val, const Geometry& geom) { @@ -2340,7 +2342,7 @@ FabArray::setDomainBndry (value_type val, const Geometry& geom) } template -template ::value,int>::type> +template ::value,int>> void FabArray::setDomainBndry (value_type val, int strt_comp, @@ -2422,7 +2424,7 @@ FabArray::copyTo (FAB& dest, int nghost) const } template -template ::value,int>::type> +template ::value,int>> void FabArray::setVal (value_type val) { @@ -2430,7 +2432,7 @@ FabArray::setVal (value_type val) } template -template ::value,int>::type> +template ::value,int>> FabArray& FabArray::operator= (value_type val) { @@ -2439,7 +2441,7 @@ FabArray::operator= (value_type val) } template -template ::value,int>::type> +template ::value,int>> void FabArray::setVal (value_type val, int comp, @@ -2450,7 +2452,7 @@ FabArray::setVal (value_type val, } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::setVal (value_type val, int comp, @@ -2492,7 +2494,7 @@ FabArray::setVal (value_type val, } template -template ::value,int>::type> +template ::value,int>> void FabArray::setVal (value_type val, const Box& region, @@ -2504,7 +2506,7 @@ FabArray::setVal (value_type val, } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::setVal (value_type val, const Box& region, @@ -2553,7 +2555,7 @@ FabArray::setVal (value_type val, } template -template ::value,int>::type> +template ::value,int>> void FabArray::abs (int comp, int ncomp, int nghost) { @@ -2561,7 +2563,7 @@ FabArray::abs (int comp, int ncomp, int nghost) } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::abs (int comp, int ncomp, const IntVect& nghost) { @@ -2599,7 +2601,7 @@ FabArray::abs (int comp, int ncomp, const IntVect& nghost) } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::plus (value_type val, int comp, int num_comp, int nghost) { @@ -2635,7 +2637,7 @@ FabArray::plus (value_type val, int comp, int num_comp, int nghost) } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::plus (value_type val, const Box& region, int comp, int num_comp, int nghost) { @@ -2675,7 +2677,7 @@ FabArray::plus (value_type val, const Box& region, int comp, int num_comp, } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::mult (value_type val, int comp, int num_comp, int nghost) { @@ -2711,7 +2713,7 @@ FabArray::mult (value_type val, int comp, int num_comp, int nghost) } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::mult (value_type val, const Box& region, int comp, int num_comp, int nghost) { @@ -2751,7 +2753,7 @@ FabArray::mult (value_type val, const Box& region, int comp, int num_comp, } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::invert (value_type numerator, int comp, int num_comp, int nghost) { @@ -2787,7 +2789,7 @@ FabArray::invert (value_type numerator, int comp, int num_comp, int nghost) } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::invert (value_type numerator, const Box& region, int comp, int num_comp, int nghost) { @@ -2830,7 +2832,7 @@ template void FabArray::shift (const IntVect& v) { - clearThisBD(); // The new boxarry will have a 
different ID. + clearThisBD(); // The new boxarray will have a different ID. boxarray.shift(v); addThisBD(); #ifdef AMREX_USE_OMP @@ -3321,7 +3323,7 @@ FabArray::FillBoundary_nowait (int scomp, int ncomp, const IntVect& nghost, } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::BuildMask (const Box& phys_domain, const Periodicity& period, value_type covered, value_type notcovered, @@ -3389,7 +3391,7 @@ FabArray::BuildMask (const Box& phys_domain, const Periodicity& period, } template -template ::value,int>::type> +template ::value,int>> void FabArray::setVal (value_type val, const CommMetaData& thecmd, int scomp, int ncomp) { @@ -3430,7 +3432,7 @@ FabArray::setVal (value_type val, const CommMetaData& thecmd, int scomp, in } template -template ::value,int>::type> +template ::value,int>> LayoutData FabArray::RecvLayoutMask (const CommMetaData& thecmd) { diff --git a/Src/Base/AMReX_FabArrayBase.H b/Src/Base/AMReX_FabArrayBase.H index 29d3d63b29e..09dfd5e22db 100644 --- a/Src/Base/AMReX_FabArrayBase.H +++ b/Src/Base/AMReX_FabArrayBase.H @@ -15,9 +15,11 @@ #include #endif +#include #include #include + namespace amrex { class MFIter; @@ -28,6 +30,12 @@ template class FabArray; namespace EB2 { class IndexSpace; } +/** + * \brief Base class for FabArray. + * + * Not to be confused with FArrayBox or `FAB` shorthands. + * Can be read as FArrayBox-like Array Base. + */ class FabArrayBase { friend class MFIter; @@ -645,7 +653,7 @@ public: ~ParForInfo (); std::pair const& getBlocks () const { return m_nblocks_x; } - Box const* getBoxes () const { return m_boxes; } + BoxIndexer const* getBoxes () const { return m_boxes; } ParForInfo () = delete; ParForInfo (ParForInfo const&) = delete; @@ -657,7 +665,7 @@ public: IntVect m_ng; int m_nthreads; std::pair m_nblocks_x; - Box* m_boxes = nullptr; + BoxIndexer* m_boxes = nullptr; char* m_hp = nullptr; char* m_dp = nullptr; }; @@ -715,6 +723,11 @@ public: }; +[[nodiscard]] int nComp (FabArrayBase const& fa); +[[nodiscard]] IntVect nGrowVect (FabArrayBase const& fa); +[[nodiscard]] BoxArray const& boxArray (FabArrayBase const& fa); +[[nodiscard]] DistributionMapping const& DistributionMap (FabArrayBase const& fa); + #ifdef BL_USE_MPI bool CheckRcvStats (Vector& recv_stats, const Vector& recv_size, int tag); #endif diff --git a/Src/Base/AMReX_FabArrayBase.cpp b/Src/Base/AMReX_FabArrayBase.cpp index 8dd8275f66a..eb8fc99605b 100644 --- a/Src/Base/AMReX_FabArrayBase.cpp +++ b/Src/Base/AMReX_FabArrayBase.cpp @@ -1325,8 +1325,7 @@ FabArrayBase::RB90::define (const FabArrayBase& fa) { Box bxsnd = (n==0) ? amrex::get<0>(dst_to_src)(bxrcv) : amrex::get<1>(dst_to_src)(bxrcv); - send_tags[dst_owner].push_back(FabArrayBase::CopyComTag(bxrcv, bxsnd, - krcv, ksnd)); + send_tags[dst_owner].emplace_back(bxrcv, bxsnd, krcv, ksnd); } } } @@ -1498,8 +1497,7 @@ FabArrayBase::RB180::define (const FabArrayBase& fa) if (dst_owner != myproc) // local copy will be dealt with later { Box const& bxsnd = convert(bxrcv); - send_tags[dst_owner].push_back(FabArrayBase::CopyComTag(bxrcv, bxsnd, - krcv, ksnd)); + send_tags[dst_owner].emplace_back(bxrcv, bxsnd, krcv, ksnd); } } } @@ -1683,8 +1681,7 @@ FabArrayBase::PolarB::define (const FabArrayBase& fa) if (dst_owner != myproc) // local copy will be dealt with later { Box const bxsnd = (n<4) ? 
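The four free functions added to `FabArrayBase.H` (`nComp`, `nGrowVect`, `boxArray`, `DistributionMap`) mirror the member accessors, so generic code can query any FabArray-like object through ADL-friendly calls. For example:

```cpp
#include <AMReX_FabArrayBase.H>
#include <AMReX_Print.H>

void report (amrex::FabArrayBase const& fa)
{
    amrex::Print() << "ncomp = "   << amrex::nComp(fa)
                   << ", ngrow = "  << amrex::nGrowVect(fa)
                   << ", nboxes = " << amrex::boxArray(fa).size() << '\n';
}
```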
convert(bxrcv) : convert_corner(bxrcv); - send_tags[dst_owner].push_back(FabArrayBase::CopyComTag(bxrcv, bxsnd, - krcv, ksnd)); + send_tags[dst_owner].emplace_back(bxrcv, bxsnd, krcv, ksnd); } } } @@ -2699,4 +2696,24 @@ FabArrayBase::flushParForCache () #endif +int nComp (FabArrayBase const& fa) +{ + return fa.nComp(); +} + +IntVect nGrowVect (FabArrayBase const& fa) +{ + return fa.nGrowVect(); +} + +BoxArray const& boxArray (FabArrayBase const& fa) +{ + return fa.boxArray(); +} + +DistributionMapping const& DistributionMap (FabArrayBase const& fa) +{ + return fa.DistributionMap(); +} + } diff --git a/Src/Base/AMReX_FabArrayCommI.H b/Src/Base/AMReX_FabArrayCommI.H index ea877266f9a..307ca490505 100644 --- a/Src/Base/AMReX_FabArrayCommI.H +++ b/Src/Base/AMReX_FabArrayCommI.H @@ -3,7 +3,7 @@ #include template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period, bool cross, @@ -163,7 +163,7 @@ FabArray::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, } template -template ::value,int>::type Z> +template ::value,int>Z> void FabArray::FillBoundary_finish () { @@ -666,7 +666,7 @@ FabArray::PrepareSendBuffers (const MapOfCopyComTagContainers& SndTags, nbytes += cct.sbox.numPts() * ncomp * sizeof(BUF); } - std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); + std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes); nbytes = amrex::aligned_size(acd, nbytes); // so that bytes are aligned // Also need to align the offset properly @@ -757,7 +757,7 @@ FabArray::PostRcvs (const MapOfCopyComTagContainers& RcvTags, nbytes += cct.dbox.numPts() * ncomp * sizeof(BUF); } - std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); + std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes); nbytes = amrex::aligned_size(acd, nbytes); // so that nbytes are aligned // Also need to align the offset properly @@ -992,7 +992,7 @@ FillBoundary (Vector const& mf, Vector const& scomp, } } - std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); + std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes); nbytes = amrex::aligned_size(acd, nbytes); // so that nbytes are aligned // Also need to align the offset properly @@ -1065,7 +1065,7 @@ FillBoundary (Vector const& mf, Vector const& scomp, } } - std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); + std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes); nbytes = amrex::aligned_size(acd, nbytes); // so that bytes are aligned // Also need to align the offset properly diff --git a/Src/Base/AMReX_FabArrayUtility.H b/Src/Base/AMReX_FabArrayUtility.H index 78f3355d34a..88aaa771090 100644 --- a/Src/Base/AMReX_FabArrayUtility.H +++ b/Src/Base/AMReX_FabArrayUtility.H @@ -21,7 +21,7 @@ namespace fudetail { template ::value> > typename FAB::value_type -ReduceSum_host (FabArray const& fa, IntVect const& nghost, F&& f) +ReduceSum_host (FabArray const& fa, IntVect const& nghost, F const& f) { using value_type = typename FAB::value_type; value_type sm = 0; @@ -47,7 +47,7 @@ std::enable_if_t::value, std::conditional_t::value || std::is_same::value, int, typename FAB::value_type> > -ReduceMF (FabArray const& fa, IntVect const& nghost, F&& f) +ReduceMF (FabArray const& fa, IntVect const& nghost, F const& f) { using T = std::conditional_t::value || std::is_same::value, @@ -66,7 +66,7 @@ std::enable_if_t::value && IsBaseFab::value, 
std::conditional_t::value || std::is_same::value, int, typename FAB1::value_type> > -ReduceMF (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) +ReduceMF (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F const& f) { using T = std::conditional_t::value || std::is_same::value, @@ -88,7 +88,7 @@ std::enable_if_t::value && IsBaseFab::value && IsBaseFab::value, int, typename FAB1::value_type> > ReduceMF (FabArray const& fa1, FabArray const& fa2, - FabArray const& fa3, IntVect const& nghost, F&& f) + FabArray const& fa3, IntVect const& nghost, F const& f) { using T = std::conditional_t::value || std::is_same::value, @@ -156,7 +156,7 @@ template ::value> > typename FAB1::value_type ReduceSum_host (FabArray const& fa1, FabArray const& fa2, - IntVect const& nghost, F&& f) + IntVect const& nghost, F const& f) { using value_type = typename FAB1::value_type; value_type sm = 0; @@ -234,7 +234,7 @@ template ::value> > typename FAB1::value_type ReduceSum_host (FabArray const& fa1, FabArray const& fa2, - FabArray const& fa3, IntVect const& nghost, F&& f) + FabArray const& fa3, IntVect const& nghost, F const& f) { using value_type = typename FAB1::value_type; value_type sm = 0; @@ -311,7 +311,7 @@ namespace fudetail { template ::value> > typename FAB::value_type -ReduceMin_host (FabArray const& fa, IntVect const& nghost, F&& f) +ReduceMin_host (FabArray const& fa, IntVect const& nghost, F const& f) { using value_type = typename FAB::value_type; value_type r = std::numeric_limits::max(); @@ -382,7 +382,7 @@ template ::value> > typename FAB1::value_type ReduceMin_host (FabArray const& fa1, FabArray const& fa2, - IntVect const& nghost, F&& f) + IntVect const& nghost, F const& f) { using value_type = typename FAB1::value_type; value_type r = std::numeric_limits::max(); @@ -460,7 +460,7 @@ template ::value> > typename FAB1::value_type ReduceMin_host (FabArray const& fa1, FabArray const& fa2, - FabArray const& fa3, IntVect const& nghost, F&& f) + FabArray const& fa3, IntVect const& nghost, F const& f) { using value_type = typename FAB1::value_type; value_type r = std::numeric_limits::max(); @@ -537,7 +537,7 @@ namespace fudetail { template ::value> > typename FAB::value_type -ReduceMax_host (FabArray const& fa, IntVect const& nghost, F&& f) +ReduceMax_host (FabArray const& fa, IntVect const& nghost, F const& f) { using value_type = typename FAB::value_type; value_type r = std::numeric_limits::lowest(); @@ -609,7 +609,7 @@ template ::value> > typename FAB1::value_type ReduceMax_host (FabArray const& fa1, FabArray const& fa2, - IntVect const& nghost, F&& f) + IntVect const& nghost, F const& f) { using value_type = typename FAB1::value_type; value_type r = std::numeric_limits::lowest(); @@ -687,7 +687,7 @@ template ::value> > typename FAB1::value_type ReduceMax_host (FabArray const& fa1, FabArray const& fa2, - FabArray const& fa3, IntVect const& nghost, F&& f) + FabArray const& fa3, IntVect const& nghost, F const& f) { using value_type = typename FAB1::value_type; value_type r = std::numeric_limits::lowest(); @@ -764,7 +764,7 @@ namespace fudetail { template ::value> > bool -ReduceLogicalAnd_host (FabArray const& fa, IntVect const& nghost, F&& f) +ReduceLogicalAnd_host (FabArray const& fa, IntVect const& nghost, F const& f) { int r = true; @@ -836,7 +836,7 @@ template ::value> > bool ReduceLogicalAnd_host (FabArray const& fa1, FabArray const& fa2, - IntVect const& nghost, F&& f) + IntVect const& nghost, F const& f) { int r = true; @@ -911,7 +911,7 @@ namespace fudetail { 
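The `F&& f` to `F const& f` changes across these host-side reducers are more than style: the callable is applied once per element inside nested loops, so an rvalue-reference parameter (with its implied single-use forwarding) would wrongly advertise that `f` may be consumed. A minimal standalone sketch of the pattern, with a hypothetical name (`reduce_sum_host`) standing in for the MFIter-based originals:

    #include <vector>

    // Hypothetical stand-in for the ReduceSum_host family above: the
    // callable is invoked on every element, so it is taken by const
    // reference and never forwarded or moved from.
    template <class T, class F>
    T reduce_sum_host (std::vector<T> const& data, F const& f)
    {
        T sm = T(0);
        for (auto const& x : data) {
            sm += f(x);   // f is reused on every iteration
        }
        return sm;
    }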
template ::value> > bool -ReduceLogicalOr_host (FabArray const& fa, IntVect const& nghost, F&& f) +ReduceLogicalOr_host (FabArray const& fa, IntVect const& nghost, F const& f) { int r = false; @@ -940,9 +940,9 @@ ReduceLogicalOr_host_wrapper (FabArray const& fa, IntVect const& nghost, F& template std::enable_if_t::value, bool> -ReduceLogicalOr_host (FabArray const& fa, IntVect const& nghost, F&& f) +ReduceLogicalOr_host (FabArray const& fa, IntVect const& nghost, F&& /*f*/) { - amrex::ignore_unused(fa,nghost,f); + amrex::ignore_unused(fa,nghost); amrex::Abort("ReduceLogicalOr: Launch Region is off. Device lambda cannot be called by host."); return 0; } @@ -983,7 +983,7 @@ template ::value> > bool ReduceLogicalOr_host (FabArray const& fa1, FabArray const& fa2, - IntVect const& nghost, F&& f) + IntVect const& nghost, F const& f) { int r = false; @@ -1092,7 +1092,7 @@ printCell (FabArray const& mf, const IntVect& cell, int comp = -1, } ss << dp[n-1]; amrex::AllPrint() << " At cell " << cell << " in Box " << bx - << ": " << ss.str() << std::endl; + << ": " << ss.str() << '\n'; } } } @@ -1103,7 +1103,7 @@ template & dst, FabArray const& src, int srccomp, int dstcomp, int numcomp, int nghost) { - Subtract(dst,src,srccomp,dstcomp,numcomp,nghost); + Subtract(dst,src,srccomp,dstcomp,numcomp,IntVect(nghost)); } template const& x, int xcomp, FabArray const& y, int ycomp, int n return sm; } +//! dst = val +template ,int> = 0> +void setVal (MF& dst, typename MF::value_type val) +{ + dst.setVal(val); +} + +//! dst = val in ghost cells. +template ,int> = 0> +void setBndry (MF& dst, typename MF::value_type val, int scomp, int ncomp) +{ + dst.setBndry(val, scomp, ncomp); +} + +//! dst *= val +template ,int> = 0> +void Scale (MF& dst, typename MF::value_type val, int scomp, int ncomp, int nghost) +{ + dst.mult(val, scomp, ncomp, nghost); +} + +//! dst = src +template && + IsMultiFabLike_v, int> = 0> +void LocalCopy (DMF& dst, SMF const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + amrex::Copy(dst, src, scomp, dcomp, ncomp, nghost); +} + +//! dst += src +template ,int> = 0> +void LocalAdd (MF& dst, MF const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + amrex::Add(dst, src, scomp, dcomp, ncomp, nghost); +} + +//! dst += a * src +template ,int> = 0> +void Saxpy (MF& dst, typename MF::value_type a, MF const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + MF::Saxpy(dst, a, src, scomp, dcomp, ncomp, nghost); +} + +//! dst = src + a * dst +template ,int> = 0> +void Xpay (MF& dst, typename MF::value_type a, MF const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + MF::Xpay(dst, a, src, scomp, dcomp, ncomp, nghost); +} + +//! dst = a*src_a + b*src_b +template ,int> = 0> +void LinComb (MF& dst, + typename MF::value_type a, MF const& src_a, int acomp, + typename MF::value_type b, MF const& src_b, int bcomp, + int dcomp, int ncomp, IntVect const& nghost) +{ + MF::LinComb(dst, a, src_a, acomp, b, src_b, bcomp, dcomp, ncomp, nghost); +} + +//! 
dst = src w/ MPI communication +template , int> = 0> +void ParallelCopy (MF& dst, MF const& src, int scomp, int dcomp, int ncomp, + IntVect const& ng_src = IntVect(0), + IntVect const& ng_dst = IntVect(0), + Periodicity const& period = Periodicity::NonPeriodic()) +{ + dst.ParallelCopy(src, scomp, dcomp, ncomp, ng_src, ng_dst, period); +} + +template , int> = 0> +[[nodiscard]] typename MF::value_type +norminf (MF const& mf, int scomp, int ncomp, IntVect const& nghost, + bool local = false) +{ + return mf.norminf(scomp, ncomp, nghost, local); +} + +//! dst = val +template ,int> = 0> +void setVal (Array& dst, typename MF::value_type val) +{ + for (auto& mf: dst) { + mf.setVal(val); + } +} + +//! dst = val in ghost cells. +template ,int> = 0> +void setBndry (Array& dst, typename MF::value_type val, int scomp, int ncomp) +{ + for (auto& mf : dst) { + mf.setBndry(val, scomp, ncomp); + } +} + +//! dst *= val +template ,int> = 0> +void Scale (Array& dst, typename MF::value_type val, int scomp, int ncomp, + int nghost) +{ + for (auto& mf : dst) { + mf.mult(val, scomp, ncomp, nghost); + } +} + +//! dst = src +template && + IsMultiFabLike_v, int> = 0> +void LocalCopy (Array& dst, Array const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + for (std::size_t i = 0; i < N; ++i) { + amrex::Copy(dst[i], src[i], scomp, dcomp, ncomp, nghost); + } +} + +//! dst += src +template ,int> = 0> +void LocalAdd (Array& dst, Array const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + for (std::size_t i = 0; i < N; ++i) { + amrex::Add(dst[i], src[i], scomp, dcomp, ncomp, nghost); + } +} + +//! dst += a * src +template ,int> = 0> +void Saxpy (Array& dst, typename MF::value_type a, + Array const& src, int scomp, int dcomp, int ncomp, + IntVect const& nghost) +{ + for (std::size_t i = 0; i < N; ++i) { + MF::Saxpy(dst[i], a, src[i], scomp, dcomp, ncomp, nghost); + } +} + +//! dst = src + a * dst +template ,int> = 0> +void Xpay (Array& dst, typename MF::value_type a, + Array const& src, int scomp, int dcomp, int ncomp, + IntVect const& nghost) +{ + for (std::size_t i = 0; i < N; ++i) { + MF::Xpay(dst[i], a, src[i], scomp, dcomp, ncomp, nghost); + } +} + +//! dst = a*src_a + b*src_b +template ,int> = 0> +void LinComb (Array& dst, + typename MF::value_type a, Array const& src_a, int acomp, + typename MF::value_type b, Array const& src_b, int bcomp, + int dcomp, int ncomp, IntVect const& nghost) +{ + for (std::size_t i = 0; i < N; ++i) { + MF::LinComb(dst[i], a, src_a[i], acomp, b, src_b[i], bcomp, dcomp, ncomp, nghost); + } +} + +//! 
dst = src w/ MPI communication +template , int> = 0> +void ParallelCopy (Array& dst, Array const& src, + int scomp, int dcomp, int ncomp, + IntVect const& ng_src = IntVect(0), + IntVect const& ng_dst = IntVect(0), + Periodicity const& period = Periodicity::NonPeriodic()) +{ + for (std::size_t i = 0; i < N; ++i) { + dst[i].ParallelCopy(src[i], scomp, dcomp, ncomp, ng_src, ng_dst, period); + } +} + +template , int> = 0> +[[nodiscard]] typename MF::value_type +norminf (Array const& mf, int scomp, int ncomp, IntVect const& nghost, + bool local = false) +{ + auto r = typename MF::value_type(0); + for (std::size_t i = 0; i < N; ++i) { + auto tmp = mf[i].norminf(scomp, ncomp, nghost, true); + r = std::max(r,tmp); + } + if (!local) { + ParallelAllReduce::Max(r, ParallelContext::CommunicatorSub()); + } + return r; +} + +template && (N > 0), + int> = 0> +[[nodiscard]] int nComp (Array const& mf) +{ + return mf[0].nComp(); +} + +template && (N > 0), + int> = 0> +[[nodiscard]] IntVect nGrowVect (Array const& mf) +{ + return mf[0].nGrowVect(); +} + +template && (N > 0), + int> = 0> +[[nodiscard]] BoxArray const& +boxArray (Array const& mf) +{ + return mf[0].boxArray(); +} + +template && (N > 0), + int> = 0> +[[nodiscard]] DistributionMapping const& +DistributionMap (Array const& mf) +{ + return mf[0].DistributionMap(); +} + } #endif diff --git a/Src/Base/AMReX_FabConv.H b/Src/Base/AMReX_FabConv.H index 78554000f25..25dae063de7 100644 --- a/Src/Base/AMReX_FabConv.H +++ b/Src/Base/AMReX_FabConv.H @@ -13,20 +13,18 @@ namespace amrex { -// -// A Descriptor of the Long Integer type - /** -* This class is meant to hold all information needed to completely -* describe the "int" or "Long" type on a machine. To describe an integer both -* the number of bytes and their ordering, relative to canonical -* ordering 1 .. sizeof(Long), needs to be specified. -* This allows us to write out integers in the native format on a machine, -* and then by also saving the IntDescriptor, we can read them back in on -* another machine and have enough information to construct the exact same -* values. -*/ - + * \brief A Descriptor of the Long Integer type + * + * This class is meant to hold all information needed to completely + * describe the "int" or "Long" type on a machine. To describe an integer both + * the number of bytes and their ordering, relative to canonical + * ordering 1 .. sizeof(Long), needs to be specified. + * This allows us to write out integers in the native format on a machine, + * and then by also saving the IntDescriptor, we can read them back in on + * another machine and have enough information to construct the exact same + * values. + */ class IntDescriptor { @@ -72,39 +70,37 @@ std::ostream& operator<< (std::ostream& os, const IntDescriptor& id); //! std::istream& operator>> (std::istream& is, IntDescriptor& id); - - //A Descriptor of the Real Type - /** -* \brief This class is meant to hold all information needed to completely -* describe the "Real" floating-point type on a machine. By "Real" here we -* mean either the "float" or "double" type that this version of AMReX -* was built with, which corresponds to whether BL_USE_FLOAT or -* BL_USE_DOUBLE was used to build the version of the library. -* -* To describe a "Real" type two arrays are needed: one detailing the ordering -* of the bytes in the Real, relative to the canonical ordering -* 1 .. sizeof(Real) and the other detailing the format of the floating-point -* number. 
-* -* The array detailing the format of a floating-point number is an eight-element -* array of longs containing the following information: -* -* format[0] = number of bits per number -* format[1] = number of bits in exponent -* format[2] = number of bits in mantissa -* format[3] = start bit of sign -* format[4] = start bit of exponent -* format[5] = start bit of mantissa -* format[6] = high order mantissa bit (CRAY needs this) -* format[7] = bias of exponent -* -* This allows us to write out "Real"s in the native format on a machine, -* and then by also saving the IntDescriptor, we can read them back in on -* another machine and have enough information to construct the exact same -* "Real" values, provided the Reals have the same size on the two machines. -*/ - + * \brief A Descriptor of the Real Type + * + * This class is meant to hold all information needed to completely + * describe the "Real" floating-point type on a machine. By "Real" here we + * mean either the "float" or "double" type that this version of AMReX + * was built with, which corresponds to whether BL_USE_FLOAT or + * BL_USE_DOUBLE was used to build the version of the library. + * + * To describe a "Real" type two arrays are needed: one detailing the ordering + * of the bytes in the Real, relative to the canonical ordering + * 1 .. sizeof(Real) and the other detailing the format of the floating-point + * number. + * + * The array detailing the format of a floating-point number is an eight-element + * array of longs containing the following information: + * + * format[0] = number of bits per number + * format[1] = number of bits in exponent + * format[2] = number of bits in mantissa + * format[3] = start bit of sign + * format[4] = start bit of exponent + * format[5] = start bit of mantissa + * format[6] = high order mantissa bit (CRAY needs this) + * format[7] = bias of exponent + * + * This allows us to write out "Real"s in the native format on a machine, + * and then by also saving the IntDescriptor, we can read them back in on + * another machine and have enough information to construct the exact same + * "Real" values, provided the Reals have the same size on the two machines. + */ class RealDescriptor { public: diff --git a/Src/Base/AMReX_FabConv.cpp b/Src/Base/AMReX_FabConv.cpp index 522216aadd1..e8dd870a058 100644 --- a/Src/Base/AMReX_FabConv.cpp +++ b/Src/Base/AMReX_FabConv.cpp @@ -170,7 +170,7 @@ RealDescriptor::clone () const // This exists solely to support reading "old" FABs. // -static +namespace { const int* selectOrdering (int prec, int ordering) @@ -208,6 +208,7 @@ selectOrdering (int prec, } return nullptr; } +} // // This is here solely to support reading "old" FABs. @@ -293,7 +294,7 @@ _pd_get_bit (char const* base, // for longer fields // -static +namespace { Long _pd_extract_field (char const* in, int offs, @@ -367,7 +368,6 @@ _pd_extract_field (char const* in, // template -static void _pd_btrvout (char* out, Long nitems) { @@ -387,8 +387,10 @@ _pd_btrvout (char* out, Long nitems) } } -const int BitsMax = 8*sizeof(Long); -const int REVERSE_ORDER = 2; +constexpr int BitsMax = 8*sizeof(Long); +constexpr int REVERSE_ORDER = 2; + +} // // Copy the least significant NB bits from the given Long into the byte array @@ -487,7 +489,7 @@ _pd_set_bit (char* base, int offs) // on input. // -static +namespace { void _pd_reorder (char* arr, Long nitems, @@ -512,7 +514,6 @@ _pd_reorder (char* arr, // from inord to outord. 
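The eight-slot `format` array that the rewritten RealDescriptor comment enumerates is easiest to read with concrete numbers. A sketch filled in for IEEE 754 binary64, i.e. `double` (the field values are the standard ones for that format, not copied from AMReX's internal tables, so treat the exact start-bit convention as an assumption):

    #include <array>
    using Long = long;

    // format[] for IEEE 754 binary64, following the field list above;
    // start bits assume the canonical big-endian bit numbering.
    constexpr std::array<Long,8> ieee_double_format = {
        64,   // format[0]: bits per number
        11,   // format[1]: bits in exponent
        52,   // format[2]: bits in mantissa
        0,    // format[3]: start bit of sign
        1,    // format[4]: start bit of exponent
        12,   // format[5]: start bit of mantissa
        0,    // format[6]: high-order mantissa bit is hidden, not stored
        1023, // format[7]: bias of exponent
    };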
// -static void permute_real_word_order (void* out, const void* in, @@ -786,7 +787,6 @@ PD_fconvert (void* out, _pd_reorder((char*)out, nitems, outbytes, outord); } -static void PD_fixdenormals (void* out, Long nitems, @@ -816,13 +816,15 @@ PD_fixdenormals (void* out, } } +} + // // It's really sad that I need to do this ... // #undef GETARRAY +namespace { #define GETARRAY(TYPE) \ -static \ void \ getarray (std::istream& is, \ Vector< TYPE >& ar) \ @@ -852,11 +854,12 @@ getarray (std::istream& is, \ } GETARRAY(int) GETARRAY(Long) +} #undef GETARRAY #undef PUTARRAY +namespace { #define PUTARRAY(TYPE) \ -static \ void \ putarray (std::ostream& os, \ const Vector< TYPE >& ar) \ @@ -874,6 +877,7 @@ putarray (std::ostream& os, \ } PUTARRAY(int) PUTARRAY(Long) +} #undef PUTARRAY std::ostream& @@ -917,7 +921,7 @@ operator>> (std::istream& is, return is; } -static +namespace { void PD_convert (void* out, const void* in, @@ -960,6 +964,7 @@ PD_convert (void* out, PD_fixdenormals(out, nitems, ord.format(), ord.order()); } } +} // // Convert nitems in RealDescriptor format to native Real format. diff --git a/Src/Base/AMReX_FabDataType.H b/Src/Base/AMReX_FabDataType.H new file mode 100644 index 00000000000..81537ae8065 --- /dev/null +++ b/Src/Base/AMReX_FabDataType.H @@ -0,0 +1,27 @@ +#ifndef AMREX_FAB_DATA_TYPE_H_ +#define AMREX_FAB_DATA_TYPE_H_ +#include + +#include + +namespace amrex { + +template struct FabDataType {}; +// +template +struct FabDataType > > +{ + using fab_type = typename T::fab_type; + using value_type = typename T::value_type; +}; + +template +struct FabDataType > > +{ + using fab_type = typename T::value_type::fab_type; + using value_type = typename T::value_type::value_type; +}; + +} + +#endif diff --git a/Src/Base/AMReX_FilCC_1D_C.H b/Src/Base/AMReX_FilCC_1D_C.H index 5f21541432c..4021e4b52a0 100644 --- a/Src/Base/AMReX_FilCC_1D_C.H +++ b/Src/Base/AMReX_FilCC_1D_C.H @@ -69,6 +69,7 @@ struct FilccCell q(i,0,0,n) = -q(2*ilo-i-1,0,0,n); break; } + default: { break; } } } else if (i > ihi) @@ -111,6 +112,7 @@ struct FilccCell q(i,0,0,n) = -q(2*ihi-i+1,0,0,n); break; } + default: { break; } } } } diff --git a/Src/Base/AMReX_FilCC_2D_C.H b/Src/Base/AMReX_FilCC_2D_C.H index 79f65156d48..80b92929724 100644 --- a/Src/Base/AMReX_FilCC_2D_C.H +++ b/Src/Base/AMReX_FilCC_2D_C.H @@ -74,6 +74,7 @@ struct FilccCell q(i,j,0,n) = -q(2*ilo-i-1,j,0,n); break; } + default: { break; } } } else if (i > ihi) @@ -116,6 +117,7 @@ struct FilccCell q(i,j,0,n) = -q(2*ihi-i+1,j,0,n); break; } + default: { break; } } } @@ -159,6 +161,7 @@ struct FilccCell q(i,j,0,n) = -q(i,2*jlo-j-1,0,n); break; } + default: { break; } } } else if (j > jhi) @@ -201,6 +204,7 @@ struct FilccCell q(i,j,0,n) = -q(i,2*jhi-j+1,0,n); break; } + default: { break; } } } } diff --git a/Src/Base/AMReX_FilCC_3D_C.H b/Src/Base/AMReX_FilCC_3D_C.H index 6a2e3dccfcf..f311045c0a1 100644 --- a/Src/Base/AMReX_FilCC_3D_C.H +++ b/Src/Base/AMReX_FilCC_3D_C.H @@ -73,6 +73,7 @@ struct FilccCell q(i,j,k,n) = -q(2*ilo-i-1,j,k,n); break; } + default: { break; } } } else if (i > ihi) @@ -115,6 +116,7 @@ struct FilccCell q(i,j,k,n) = -q(2*ihi-i+1,j,k,n); break; } + default: { break; } } } @@ -158,6 +160,7 @@ struct FilccCell q(i,j,k,n) = -q(i,2*jlo-j-1,k,n); break; } + default: { break; } } } else if (j > jhi) @@ -200,6 +203,7 @@ struct FilccCell q(i,j,k,n) = -q(i,2*jhi-j+1,k,n); break; } + default: { break; } } } @@ -243,6 +247,7 @@ struct FilccCell q(i,j,k,n) = -q(i,j,2*klo-k-1,n); break; } + default: { break; } } } else if (k > khi) @@ -285,6 +290,7 
@@ struct FilccCell q(i,j,k,n) = -q(i,j,2*khi-k+1,n); break; } + default: { break; } } } } diff --git a/Src/Base/AMReX_FilCC_C.cpp b/Src/Base/AMReX_FilCC_C.cpp index 7cdba486957..e2d8c6129fd 100644 --- a/Src/Base/AMReX_FilCC_C.cpp +++ b/Src/Base/AMReX_FilCC_C.cpp @@ -41,7 +41,7 @@ void fab_filcc (Box const& bx, Array4 const& qn, int ncomp, if (lo.x < ilo) { const int imin = lo.x; const int imax = ilo-1; - if (bc.lo(0) == BCType::ext_dir) { + if (bc.lo(0) == BCType::ext_dir || bc.lo(0) == BCType::ext_dir_cc) { // Do nothing. } else if (bc.lo(0) == BCType::foextrap) { for (int k = lo.z; k <= hi.z; ++k) { @@ -88,7 +88,7 @@ void fab_filcc (Box const& bx, Array4 const& qn, int ncomp, const int imin = ihi+1; const int imax = hi.x; - if (bc.hi(0) == BCType::ext_dir) { + if (bc.hi(0) == BCType::ext_dir || bc.hi(0) == BCType::ext_dir_cc) { // Do nothing. } else if (bc.hi(0) == BCType::foextrap) { for (int k = lo.z; k <= hi.z; ++k) { @@ -136,7 +136,7 @@ void fab_filcc (Box const& bx, Array4 const& qn, int ncomp, if (lo.y < jlo) { const int jmin = lo.y; const int jmax = jlo-1; - if (bc.lo(1) == BCType::ext_dir) { + if (bc.lo(1) == BCType::ext_dir || bc.lo(1) == BCType::ext_dir_cc) { // Do nothing. } else if (bc.lo(1) == BCType::foextrap) { for (int k = lo.z; k <= hi.z; ++k) { @@ -182,7 +182,7 @@ void fab_filcc (Box const& bx, Array4 const& qn, int ncomp, if (hi.y > jhi) { const int jmin = jhi+1; const int jmax = hi.y; - if (bc.hi(1) == BCType::ext_dir) { + if (bc.hi(1) == BCType::ext_dir || bc.hi(1) == BCType::ext_dir_cc) { // Do nothing. } else if (bc.hi(1) == BCType::foextrap) { for (int k = lo.z; k <= hi.z; ++k) { @@ -231,7 +231,7 @@ void fab_filcc (Box const& bx, Array4 const& qn, int ncomp, if (lo.z < klo) { const int kmin = lo.z; const int kmax = klo-1; - if (bc.lo(2) == BCType::ext_dir) { + if (bc.lo(2) == BCType::ext_dir || bc.lo(2) == BCType::ext_dir_cc) { // Do nothing. } else if (bc.lo(2) == BCType::foextrap) { for (int k = kmin; k <= kmax; ++k) { @@ -277,7 +277,7 @@ void fab_filcc (Box const& bx, Array4 const& qn, int ncomp, if (hi.z > khi) { const int kmin = khi+1; const int kmax = hi.z; - if (bc.hi(2) == BCType::ext_dir) { + if (bc.hi(2) == BCType::ext_dir || bc.hi(2) == BCType::ext_dir_cc) { // Do nothing. 
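The repeated one-line change in `fab_filcc` widens the "leave these ghost cells alone" guard from `ext_dir` to also cover `ext_dir_cc`, the cell-centered flavor of external Dirichlet. Both mean a user-supplied fill functor provides the boundary values; they differ only in where the supplied value is understood to live (on the domain face vs. at the first cell center outside the domain). A small sketch of marking such a boundary (hedged: the precise semantics of `ext_dir_cc` are as documented in AMReX_BC_TYPES.H):

    #include <AMReX_BCRec.H>
    #include <AMReX_BC_TYPES.H>

    // Mark the low-x boundary as cell-centered external Dirichlet and
    // the high-x boundary as first-order extrapolation; fab_filcc now
    // skips the low side for both ext_dir and ext_dir_cc.
    amrex::BCRec make_bc ()
    {
        amrex::BCRec bc;
        bc.setLo(0, amrex::BCType::ext_dir_cc);
        bc.setHi(0, amrex::BCType::foextrap);
        return bc;
    }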
} else if (bc.hi(2) == BCType::foextrap) { for (int k = kmin; k <= kmax; ++k) { diff --git a/Src/Base/AMReX_FilFC_1D_C.H b/Src/Base/AMReX_FilFC_1D_C.H index 43e3a1d41d0..31fe69f7fa0 100644 --- a/Src/Base/AMReX_FilFC_1D_C.H +++ b/Src/Base/AMReX_FilFC_1D_C.H @@ -66,6 +66,7 @@ struct FilfcFace q(i,0,0,n) = -q(2*ilo-i,0,0,n); break; } + default: { break; } } } else if (i == ihi) @@ -106,6 +107,7 @@ struct FilfcFace q(i,0,0,n) = -q(2*ihi-i,0,0,n); break; } + default: { break; } } } } diff --git a/Src/Base/AMReX_FilFC_2D_C.H b/Src/Base/AMReX_FilFC_2D_C.H index 0185386392a..df76a2de23b 100644 --- a/Src/Base/AMReX_FilFC_2D_C.H +++ b/Src/Base/AMReX_FilFC_2D_C.H @@ -74,6 +74,7 @@ struct FilfcFace : -q(2*ilo-i-1,j,0,n); break; } + default: { break; } } } else if (i == ihi) @@ -118,6 +119,7 @@ struct FilfcFace : -q(2*ihi-i+1,j,0,n); break; } + default: { break; } } } @@ -163,6 +165,7 @@ struct FilfcFace : -q(i,2*jlo-j-1,0,n); break; } + default: { break; } } } else if (j == jhi) @@ -207,6 +210,7 @@ struct FilfcFace : -q(i,2*jhi-j+1,0,n); break; } + default: { break; } } } } diff --git a/Src/Base/AMReX_FilFC_3D_C.H b/Src/Base/AMReX_FilFC_3D_C.H index 76c49fdc86a..1628131caae 100644 --- a/Src/Base/AMReX_FilFC_3D_C.H +++ b/Src/Base/AMReX_FilFC_3D_C.H @@ -77,6 +77,7 @@ struct FilfcFace : -q(2*ilo-i-1,j,k,n); break; } + default: { break; } } } if (i == ihi) @@ -121,6 +122,7 @@ struct FilfcFace : -q(2*ihi-i+1,j,k,n); break; } + default: { break; } } } @@ -166,6 +168,7 @@ struct FilfcFace : -q(i,2*jlo-j-1,k,n); break; } + default: { break; } } } else if (j == jhi) @@ -210,6 +213,7 @@ struct FilfcFace : -q(i,2*jhi-j+1,k,n); break; } + default: { break; } } } @@ -255,6 +259,7 @@ struct FilfcFace : -q(i,j,2*klo-k-1,n); break; } + default: { break; } } } if (k == khi) @@ -299,6 +304,7 @@ struct FilfcFace : -q(i,j,2*khi-k+1,n); break; } + default: { break; } } } } diff --git a/Src/Base/AMReX_FileSystem.cpp b/Src/Base/AMReX_FileSystem.cpp index 283844427fe..5ba4cb5b585 100644 --- a/Src/Base/AMReX_FileSystem.cpp +++ b/Src/Base/AMReX_FileSystem.cpp @@ -154,11 +154,11 @@ CreateDirectories (std::string const& path, mode_t mode, bool verbose) } if(retVal == false || verbose == true) { - for(int i(0); i < pathError.size(); ++i) { + for(auto & i : pathError) { amrex::AllPrint()<< "amrex::UtilCreateDirectory:: path errno: " - << pathError[i].first << " :: " - << strerror(pathError[i].second) - << std::endl; + << i.first << " :: " + << strerror(i.second) + << '\n'; } } diff --git a/Src/Base/AMReX_ForkJoin.cpp b/Src/Base/AMReX_ForkJoin.cpp index 2154ed90719..f0cc03ae04b 100644 --- a/Src/Base/AMReX_ForkJoin.cpp +++ b/Src/Base/AMReX_ForkJoin.cpp @@ -201,14 +201,14 @@ ForkJoin::copy_data_to_tasks () if (forked.size() <= i) { if (flag_verbose) { amrex::Print() << " Creating forked " << mf_name << "[" << idx << "] for task " << i - << (mff.strategy == Strategy::split ? " (split)" : " (whole)") << std::endl; + << (mff.strategy == Strategy::split ? 
" (split)" : " (whole)") << '\n'; } // look up the distribution mapping for this (box array, task) pair const DistributionMapping &dm = get_dm(ba, i, orig.DistributionMap()); forked.emplace_back(ba, dm, task_comp_n, mff.ngrow); } else if (flag_verbose) { amrex::Print() << " Forked " << mf_name << "[" << idx << "] for task " << i - << " already created" << std::endl; + << " already created" << '\n'; } AMREX_ASSERT(i < forked.size()); @@ -216,7 +216,7 @@ ForkJoin::copy_data_to_tasks () if (mff.intent == Intent::in || mff.intent == Intent::inout) { if (flag_verbose) { amrex::Print() << " Copying " << mf_name << "[" << idx << "] components [" - << comp_split[i].lo << ", " << comp_split[i].hi << ") into to task " << i << std::endl; + << comp_split[i].lo << ", " << comp_split[i].hi << ") into to task " << i << '\n'; } // parallel copy data into forked MF forked[i].Redistribute(orig, comp_split[i].lo, 0, task_comp_n, mff.ngrow); @@ -257,7 +257,7 @@ ForkJoin::copy_data_from_tasks () for (int i = 0; i < NTasks(); ++i) { if (flag_verbose) { amrex::Print() << " Copying " << mf_name << "[" << idx << "] components [" - << comp_split[i].lo << ", " << comp_split[i].hi << ") out from task " << i << " (unsplit)" << std::endl; + << comp_split[i].lo << ", " << comp_split[i].hi << ") out from task " << i << " (unsplit)" << '\n'; } int task_comp_n = comp_split[i].hi - comp_split[i].lo; AMREX_ASSERT(forked[i].nComp() == task_comp_n); @@ -266,7 +266,7 @@ ForkJoin::copy_data_from_tasks () } else { // mff.strategy == single or duplicate // copy all components from owner_task if (flag_verbose) { - amrex::Print() << "Copying " << mf_name << " out from task " << mff.owner_task << " (whole)" << std::endl; + amrex::Print() << "Copying " << mf_name << " out from task " << mff.owner_task << " (whole)" << '\n'; } AMREX_ASSERT(forked[mff.owner_task].nComp() == orig.nComp()); orig.Redistribute(forked[mff.owner_task], 0, 0, orig.nComp(), mff.ngrow); @@ -307,13 +307,13 @@ ForkJoin::get_dm (const BoxArray& ba, int task_idx, const DistributionMapping& d if (flag_verbose) { amrex::Print() << " Creating DM for (box array, task id) = (" - << ba.getRefID() << ", " << task_idx << ")" << std::endl; + << ba.getRefID() << ", " << task_idx << ")" << '\n'; } } else { // DM has already been created if (flag_verbose) { amrex::Print() << " DM for (box array, task id) = (" << ba.getRefID() << ", " << task_idx - << ") already created" << std::endl; + << ") already created" << '\n'; } } AMREX_ASSERT(dm_vec[task_idx] != nullptr); @@ -350,7 +350,7 @@ void ForkJoin::create_task_output_dir () { if (!task_output_dir.empty() && !amrex::FileExists(task_output_dir)) { if (flag_verbose) { - Print() << "Creating task_output_dir: " << task_output_dir << std::endl; + Print() << "Creating task_output_dir: " << task_output_dir << '\n'; } if (ParallelContext::IOProcessorSub()) { if (! 
amrex::UtilCreateDirectory(task_output_dir, 0755, flag_verbose)) { diff --git a/Src/Base/AMReX_Functional.H b/Src/Base/AMReX_Functional.H index bed6e7a4267..0098365db4a 100644 --- a/Src/Base/AMReX_Functional.H +++ b/Src/Base/AMReX_Functional.H @@ -63,6 +63,24 @@ struct LogicalOr } }; +template +struct Multiplies +{ + constexpr T operator() (const T & lhs, const T & rhs) const + { + return lhs * rhs; + } +}; + +template +struct Divides +{ + constexpr T operator() (const T & lhs, const T & rhs) const + { + return lhs / rhs; + } +}; + } #endif diff --git a/Src/Base/AMReX_Geometry.H b/Src/Base/AMReX_Geometry.H index 4017273151a..550b42f2f6a 100644 --- a/Src/Base/AMReX_Geometry.H +++ b/Src/Base/AMReX_Geometry.H @@ -16,14 +16,6 @@ #include namespace amrex { -/** -* \class Geometry -* \brief Rectangular problem domain geometry. -* -* This class describes problem domain and coordinate system for -* RECTANGULAR problem domains. Since the problem domain is RECTANGULAR, -* periodicity is meaningful. -*/ class MultiFab; class DistributionMapping; @@ -67,6 +59,14 @@ public: int coord; }; +/** + * \class Geometry + * \brief Rectangular problem domain geometry. + * + * This class describes problem domain and coordinate system for + * RECTANGULAR problem domains. Since the problem domain is RECTANGULAR, + * periodicity is meaningful. + */ class Geometry : public CoordSys diff --git a/Src/Base/AMReX_GpuAllocators.H b/Src/Base/AMReX_GpuAllocators.H index e3f76bee216..de2fc0ad6bc 100644 --- a/Src/Base/AMReX_GpuAllocators.H +++ b/Src/Base/AMReX_GpuAllocators.H @@ -162,10 +162,10 @@ namespace amrex { // template struct IsArenaAllocator - , - T>::value>> + T>>> : std::true_type {}; template diff --git a/Src/Base/AMReX_GpuAsyncArray.H b/Src/Base/AMReX_GpuAsyncArray.H index 13a1642d690..ab43496093a 100644 --- a/Src/Base/AMReX_GpuAsyncArray.H +++ b/Src/Base/AMReX_GpuAsyncArray.H @@ -24,7 +24,7 @@ extern "C" { namespace amrex { namespace Gpu { -template ::value,int> = 0> +template ,int> = 0> class AsyncArray { public: @@ -43,7 +43,7 @@ public: #endif } - template ::value && std::is_trivial::value,int>::type = 0> + template && std::is_trivial_v,int> = 0> explicit AsyncArray (const std::size_t n) { if (n == 0) { return; } diff --git a/Src/Base/AMReX_GpuAtomic.H b/Src/Base/AMReX_GpuAtomic.H index 55fc351156a..0a056ab5c38 100644 --- a/Src/Base/AMReX_GpuAtomic.H +++ b/Src/Base/AMReX_GpuAtomic.H @@ -13,11 +13,13 @@ namespace amrex { namespace Gpu::Atomic { // For Add, Min and Max, we support int, unsigned int, long, unsigned long long, float and double. +// For Multiply and Divide, we support generic types provided they are the same size as int or unsigned long long +// and have *= and /= operators. // For LogicalOr and LogicalAnd, the data type is int. // For Exch and CAS, the data type is generic. // All these functions are non-atomic in host code!!! // If one needs them to be atomic in host code, use HostDevice::Atomic::*. Currently only -// HostDevice::Atomic is supported. We could certainly add more. +// HostDevice::Atomic::Add is supported. We could certainly add more. 
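The updated header comment pins down what the new `Multiply` and `Divide` atomics require: a type whose size matches `int` or `unsigned long long` and that supports `*=` and `/=`. They are built on a compare-and-swap loop over the same-sized integer type, which is why only those two widths are allowed; `float` and `double` both qualify. A usage sketch against the public entry points this patch adds:

    #include <AMReX_GpuAtomic.H>

    // Atomically scale a value from many GPU threads; per the comment
    // above, the same call degenerates to a plain *= in host code.
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void scale_atomic (double* p, double s) noexcept
    {
        amrex::Gpu::Atomic::Multiply(p, s); // CAS loop on device
    }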
namespace detail { @@ -132,17 +134,17 @@ namespace detail { AMREX_GPU_DEVICE AMREX_FORCE_INLINE T Add_device (T* const sum, T const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicAdd(sum, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; sycl::atomic_ref a{*sum}; return a.fetch_add(value); #else - amrex::ignore_unused(sum, value); - return T(); // should never get here, but have to return something + AMREX_IF_ON_DEVICE(( return atomicAdd(sum, value); )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(sum, value); + return T(); // should never get here, but have to return something + )) #endif } @@ -175,7 +177,7 @@ namespace detail { #endif -#if defined(AMREX_USE_CUDA) && (__CUDA_ARCH__ < 600) +#if defined(AMREX_USE_CUDA) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600) AMREX_GPU_DEVICE AMREX_FORCE_INLINE double Add_device (double* const sum, double const value) noexcept @@ -195,17 +197,16 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Add (T* sum, T value) noexcept { -#if AMREX_DEVICE_COMPILE #ifdef AMREX_USE_SYCL - return Add_device(sum, value); -#else - return Add_device(sum, value); -#endif + AMREX_IF_ON_DEVICE((return Add_device(sum, value);)) #else - auto old = *sum; - *sum += value; - return old; + AMREX_IF_ON_DEVICE((return Add_device(sum, value);)) #endif + AMREX_IF_ON_HOST(( + auto old = *sum; + *sum += value; + return old; + )) } //////////////////////////////////////////////////////////////////////// @@ -252,18 +253,19 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool If (T* const add, T const value, Op&& op, Cond&& cond) noexcept { -#if AMREX_DEVICE_COMPILE - return If_device(add, value, std::forward(op), std::forward(cond)); -#else - T old = *add; - T const tmp = op(old, value); - if (cond(tmp)) { - *add = tmp; - return true; - } else { - return false; - } -#endif + AMREX_IF_ON_DEVICE(( + return If_device(add, value, std::forward(op), std::forward(cond)); + )) + AMREX_IF_ON_HOST(( + T old = *add; + T const tmp = std::forward(op)(old, value); + if (std::forward(cond)(tmp)) { + *add = tmp; + return true; + } else { + return false; + } + )) } //////////////////////////////////////////////////////////////////////// @@ -278,14 +280,11 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void AddNoRet (T* sum, T value) noexcept { -#if AMREX_DEVICE_COMPILE -#ifdef AMREX_USE_SYCL +#if defined(__SYCL_DEVICE_ONLY__) Add_device(sum, value); #else - Add_device(sum, value); -#endif -#else - *sum += value; + AMREX_IF_ON_DEVICE((Add_device(sum, value);)) + AMREX_IF_ON_HOST((*sum += value;)) #endif } @@ -293,14 +292,11 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void AddNoRet (float* const sum, float const value) noexcept { -#if AMREX_DEVICE_COMPILE #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdeprecated-declarations" - atomicAddNoRet(sum, value); + AMREX_IF_ON_DEVICE((atomicAddNoRet(sum, value);)) #pragma clang diagnostic pop -#else - *sum += value; -#endif + AMREX_IF_ON_HOST((*sum += value;)) } #endif @@ -314,18 +310,18 @@ namespace detail { AMREX_GPU_DEVICE AMREX_FORCE_INLINE T Min_device (T* const m, T const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicMin(m, value); 
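The recurring rewrite in these hunks replaces per-backend `#if defined(__CUDA_ARCH__) ... #elif ...` ladders with the `AMREX_IF_ON_DEVICE` / `AMREX_IF_ON_HOST` macro pair, which expands exactly one of the two double-parenthesized bodies depending on which compilation pass is active. The shape every converted atomic now follows, sketched on its own (`fetch_min` is a stand-in name; `Min_device` is the device helper from the hunk above, and only the SYCL path still needs a real preprocessor branch):

    // One definition serves both host and device compilation passes.
    template <class T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    T fetch_min (T* m, T value) noexcept
    {
        AMREX_IF_ON_DEVICE(( return Min_device(m, value); ))
        AMREX_IF_ON_HOST((
            auto const old = *m;
            *m = (*m < value) ? *m : value;
            return old;
        ))
    }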
-#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_min(value); #else - amrex::ignore_unused(m,value); - return T(); // should never get here, but have to return something + AMREX_IF_ON_DEVICE(( return atomicMin(m, value); )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(m,value); + return T(); // should never get here, but have to return something + )) #endif } @@ -357,13 +353,14 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Min (T* const m, T const value) noexcept { -#if AMREX_DEVICE_COMPILE - return Min_device(m, value); -#else - auto const old = *m; - *m = (*m) < value ? (*m) : value; - return old; -#endif + AMREX_IF_ON_DEVICE(( + return Min_device(m, value); + )) + AMREX_IF_ON_HOST(( + auto const old = *m; + *m = (*m) < value ? (*m) : value; + return old; + )) } //////////////////////////////////////////////////////////////////////// @@ -376,18 +373,18 @@ namespace detail { AMREX_GPU_DEVICE AMREX_FORCE_INLINE T Max_device (T* const m, T const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicMax(m, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_max(value); #else - amrex::ignore_unused(m,value); - return T(); // should never get here, but have to return something + AMREX_IF_ON_DEVICE(( return atomicMax(m, value); )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(m,value); + return T(); // should never get here, but have to return something + )) #endif } @@ -419,13 +416,14 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Max (T* const m, T const value) noexcept { -#if AMREX_DEVICE_COMPILE - return Max_device(m, value); -#else - auto const old = *m; - *m = (*m) > value ? (*m) : value; - return old; -#endif + AMREX_IF_ON_DEVICE(( + return Max_device(m, value); + )) + AMREX_IF_ON_HOST(( + auto const old = *m; + *m = (*m) > value ? (*m) : value; + return old; + )) } //////////////////////////////////////////////////////////////////////// @@ -435,19 +433,21 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int LogicalOr (int* const m, int const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicOr(m, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_or(value); #else - int const old = *m; - *m = (*m) || value; - return old; + AMREX_IF_ON_DEVICE(( + return atomicOr(m, value); + )) + AMREX_IF_ON_HOST(( + int const old = *m; + *m = (*m) || value; + return old; + )) #endif } @@ -458,19 +458,21 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int LogicalAnd (int* const m, int const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicAnd(m, value ? 
~0x0 : 0); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_and(value ? ~0x0 : 0); #else - int const old = *m; - *m = (*m) && value; - return old; + AMREX_IF_ON_DEVICE(( + return atomicAnd(m, value ? ~0x0 : 0); + )) + AMREX_IF_ON_HOST(( + int const old = *m; + *m = (*m) && value; + return old; + )) #endif } @@ -482,19 +484,21 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Exch (T* address, T val) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicExch(address, val); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*address}; return a.exchange(val); #else - auto const old = *address; - *address = val; - return old; + AMREX_IF_ON_DEVICE(( + return atomicExch(address, val); + )) + AMREX_IF_ON_HOST(( + auto const old = *address; + *address = val; + return old; + )) #endif } @@ -506,10 +510,7 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T CAS (T* const address, T compare, T const val) noexcept { // cannot be T const compare because of compare_exchange_strong -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicCAS(address, compare, val); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; @@ -517,27 +518,108 @@ namespace detail { a.compare_exchange_strong(compare, val); return compare; #else - auto const old = *address; - *address = (old == compare ? val : old); - return old; + AMREX_IF_ON_DEVICE(( + return atomicCAS(address, compare, val); + )) + AMREX_IF_ON_HOST(( + auto const old = *address; + *address = (old == compare ? 
val : old); + return old; + )) #endif } + +//////////////////////////////////////////////////////////////////////// +// Multiply +//////////////////////////////////////////////////////////////////////// + +#ifdef AMREX_USE_GPU + + template = 0> + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + T Multiply_device (T* const prod, T const value) noexcept + { + return detail::atomic_op(prod,value,amrex::Multiplies()); + } + + template = 0> + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + T Multiply_device (T* const prod, T const value) noexcept + { + return detail::atomic_op(prod,value,amrex::Multiplies()); + } + +#endif + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T Multiply (T* const prod, T const value) noexcept + { + AMREX_IF_ON_DEVICE(( + return Multiply_device(prod, value); + )) + AMREX_IF_ON_HOST(( + auto const old = *prod; + *prod *= value; + return old; + )) + } + +//////////////////////////////////////////////////////////////////////// +// Divide +//////////////////////////////////////////////////////////////////////// + +#ifdef AMREX_USE_GPU + + template = 0> + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + T Divide_device (T* const quot, T const value) noexcept + { + return detail::atomic_op(quot,value,amrex::Divides()); + } + + template = 0> + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + T Divide_device (T* const quot, T const value) noexcept + { + return detail::atomic_op(quot,value,amrex::Divides()); + } + +#endif + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T Divide (T* const quot, T const value) noexcept + { + AMREX_IF_ON_DEVICE(( + return Divide_device(quot, value); + )) + AMREX_IF_ON_HOST(( + auto const old = *quot; + *quot /= value; + return old; + )) + } } namespace HostDevice::Atomic { template - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - void Add (T* const sum, T const value) noexcept + AMREX_FORCE_INLINE + void Add_Host (T* const sum, T const value) noexcept { -#if AMREX_DEVICE_COMPILE - Gpu::Atomic::AddNoRet(sum,value); -#else #ifdef AMREX_USE_OMP #pragma omp atomic update #endif *sum += value; -#endif + } + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + void Add (T* const sum, T const value) noexcept + { + AMREX_IF_ON_DEVICE((Gpu::Atomic::AddNoRet(sum,value);)) + AMREX_IF_ON_HOST((Add_Host(sum,value);)) } } diff --git a/Src/Base/AMReX_GpuBuffer.H b/Src/Base/AMReX_GpuBuffer.H index a52dc04785b..f930cfa321e 100644 --- a/Src/Base/AMReX_GpuBuffer.H +++ b/Src/Base/AMReX_GpuBuffer.H @@ -12,7 +12,7 @@ namespace amrex::Gpu { -template ::value,int> = 0> +template ,int> = 0> class Buffer { public: diff --git a/Src/Base/AMReX_GpuComplex.H b/Src/Base/AMReX_GpuComplex.H index 205788375f4..274da82604d 100644 --- a/Src/Base/AMReX_GpuComplex.H +++ b/Src/Base/AMReX_GpuComplex.H @@ -20,9 +20,12 @@ T norm (const GpuComplex& a_z) noexcept; * work in device code with Cuda yet. * * Should be bit-wise compatible with std::complex. + * + * GpuComplex is aligned to its size (stricter than std::complex) to allow for + * coalesced memory accesses with nvidia GPUs. 
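 *
 * An illustrative consequence of the alignas below (hedged: the
 * alignment of std::complex is implementation-defined, commonly 8
 * for double on mainstream ABIs):
 *
 *     static_assert(alignof(amrex::GpuComplex<double>) == 16);
 *     static_assert(sizeof(amrex::GpuComplex<double>)  == 16);
 *
 * so a warp reading consecutive GpuComplex<double> values can issue
 * naturally aligned 16-byte transactions instead of split 8-byte ones.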
*/ template -struct GpuComplex +struct alignas(2*sizeof(T)) GpuComplex { using value_type = T; diff --git a/Src/Base/AMReX_GpuContainers.H b/Src/Base/AMReX_GpuContainers.H index 05399b2e047..012941b0055 100644 --- a/Src/Base/AMReX_GpuContainers.H +++ b/Src/Base/AMReX_GpuContainers.H @@ -5,7 +5,6 @@ #include #include #include -#include #include #include @@ -408,7 +407,7 @@ namespace amrex::Gpu { std::is_trivially_copyable_v && amrex::IsCallable::value, int> FOO = 0> - void fillAsync (IT first, IT last, F&& f) noexcept + void fillAsync (IT first, IT last, F const& f) noexcept { auto N = static_cast(std::distance(first, last)); if (N <= 0) { return; } diff --git a/Src/Base/AMReX_GpuControl.H b/Src/Base/AMReX_GpuControl.H index 0f21213a86d..1532045bec4 100644 --- a/Src/Base/AMReX_GpuControl.H +++ b/Src/Base/AMReX_GpuControl.H @@ -184,13 +184,13 @@ namespace Gpu { #else - [[nodiscard]] inline static constexpr bool inLaunchRegion () { return false; } - [[nodiscard]] inline static constexpr bool notInLaunchRegion () { return true; } - [[nodiscard]] inline static constexpr bool setLaunchRegion (bool) { return false; } + [[nodiscard]] inline constexpr bool inLaunchRegion () { return false; } + [[nodiscard]] inline constexpr bool notInLaunchRegion () { return true; } + [[nodiscard]] inline constexpr bool setLaunchRegion (bool) { return false; } - [[nodiscard]] inline static constexpr bool inGraphRegion () { return false; } - [[nodiscard]] inline static constexpr bool notInGraphRegion () { return true; } - [[nodiscard]] inline static constexpr bool setGraphRegion (bool) { return false; } + [[nodiscard]] inline constexpr bool inGraphRegion () { return false; } + [[nodiscard]] inline constexpr bool notInGraphRegion () { return true; } + [[nodiscard]] inline constexpr bool setGraphRegion (bool) { return false; } struct [[nodiscard]] LaunchSafeGuard { @@ -202,10 +202,10 @@ namespace Gpu { explicit GraphSafeGuard (bool) {} }; - [[nodiscard]] inline static constexpr bool inSingleStreamRegion () { return false; } - [[nodiscard]] inline static constexpr bool inNoSyncRegion () { return true; } - [[nodiscard]] inline static constexpr bool setSingleStreamRegion (bool) { return false; } - [[nodiscard]] inline static constexpr bool setNoSyncRegion (bool) { return true; } + [[nodiscard]] inline constexpr bool inSingleStreamRegion () { return false; } + [[nodiscard]] inline constexpr bool inNoSyncRegion () { return true; } + [[nodiscard]] inline constexpr bool setSingleStreamRegion (bool) { return false; } + [[nodiscard]] inline constexpr bool setNoSyncRegion (bool) { return true; } struct [[nodiscard]] SingleStreamRegion {}; struct [[nodiscard]] NoSyncRegion {}; diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp index df3625d13ce..d699a20a2b5 100644 --- a/Src/Base/AMReX_GpuDevice.cpp +++ b/Src/Base/AMReX_GpuDevice.cpp @@ -354,6 +354,8 @@ Device::initialize_gpu () AMREX_HIP_SAFE_CALL(hipGetDeviceProperties(&device_prop, device_id)); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(warp_size == device_prop.warpSize, "Incorrect warp size"); + // check compute capability // AMD devices do not support shared cache banking. 
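The new assertion in `Device::initialize_gpu` turns a silent miscompilation into a hard failure: AMReX bakes `warp_size` in at compile time (64 for the wave64 CDNA targets it is usually built for, 32 on NVIDIA), and warp-level primitives are wrong, not merely slow, if the device actually runs a different width, e.g. a wave32 RDNA consumer card. Condensed from the hunk above (the surrounding device setup is omitted):

    hipDeviceProp_t device_prop;
    AMREX_HIP_SAFE_CALL(hipGetDeviceProperties(&device_prop, device_id));
    // Fail fast if the binary's compiled-in warp width does not match
    // the hardware, rather than corrupting warp-synchronous reductions.
    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(warp_size == device_prop.warpSize,
                                     "Incorrect warp size");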
@@ -372,11 +374,13 @@ Device::initialize_gpu () cudaDeviceGetAttribute(&memory_pools_supported, cudaDevAttrMemoryPoolsSupported, device_id); #endif +#if (__CUDACC_VER_MAJOR__ < 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ < 4)) if (sizeof(Real) == 8) { AMREX_CUDA_SAFE_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); } else if (sizeof(Real) == 4) { AMREX_CUDA_SAFE_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); } +#endif for (int i = 0; i < max_gpu_streams; ++i) { AMREX_CUDA_SAFE_CALL(cudaStreamCreate(&gpu_stream_pool[i])); diff --git a/Src/Base/AMReX_GpuElixir.H b/Src/Base/AMReX_GpuElixir.H index c61f5678a2b..4c006e18c53 100644 --- a/Src/Base/AMReX_GpuElixir.H +++ b/Src/Base/AMReX_GpuElixir.H @@ -34,7 +34,7 @@ public: return *this; } - void append (Elixir && rhs) noexcept + void append (Elixir rhs) noexcept { m_pa.insert(m_pa.end(), rhs.m_pa.begin(), rhs.m_pa.end()); rhs.m_pa.clear(); diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index c1870d2ef58..435a11f342b 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -61,11 +62,11 @@ namespace amrex { // CPU variation template - void launch_host (L&& f0) noexcept { f0(); } + void launch_host (L&& f0) noexcept { std::forward(f0)(); } template void launch_host (L&& f0, Lambdas&&... fs) noexcept { - f0(); + std::forward(f0)(); launch_host(std::forward(fs)...); } @@ -76,11 +77,11 @@ namespace amrex { namespace Gpu { #ifdef AMREX_USE_GPU - static constexpr std::size_t numThreadsPerBlockParallelFor () { + inline constexpr std::size_t numThreadsPerBlockParallelFor () { return AMREX_GPU_MAX_THREADS; } #else - static constexpr std::size_t numThreadsPerBlockParallelFor () { return 0; } + inline constexpr std::size_t numThreadsPerBlockParallelFor () { return 0; } #endif // ************************************************ @@ -103,20 +104,21 @@ namespace Gpu { inline Box getThreadBox (const Box& bx, Long offset) noexcept { -#if AMREX_DEVICE_COMPILE - const auto len = bx.length3d(); - Long k = offset / (len[0]*len[1]); - Long j = (offset - k*(len[0]*len[1])) / len[0]; - Long i = (offset - k*(len[0]*len[1])) - j*len[0]; - IntVect iv{AMREX_D_DECL(static_cast(i), - static_cast(j), - static_cast(k))}; - iv += bx.smallEnd(); - return (bx & Box(iv,iv,bx.type())); -#else - amrex::ignore_unused(offset); - return bx; -#endif + AMREX_IF_ON_DEVICE(( + const auto len = bx.length3d(); + Long k = offset / (len[0]*len[1]); + Long j = (offset - k*(len[0]*len[1])) / len[0]; + Long i = (offset - k*(len[0]*len[1])) - j*len[0]; + IntVect iv{AMREX_D_DECL(static_cast(i), + static_cast(j), + static_cast(k))}; + iv += bx.smallEnd(); + return (bx & Box(iv,iv,bx.type())); + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(offset); + return bx; + )) } // ************************************************ diff --git a/Src/Base/AMReX_GpuLaunch.nolint.H b/Src/Base/AMReX_GpuLaunch.nolint.H index 9cf92018e0a..c7df1737517 100644 --- a/Src/Base/AMReX_GpuLaunch.nolint.H +++ b/Src/Base/AMReX_GpuLaunch.nolint.H @@ -60,7 +60,7 @@ #ifndef AMREX_USE_SYCL #define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FLAG(where_to_run,n,i,block) \ - { using amrex_i_inttype = typename std::remove_const::type; \ + { using amrex_i_inttype = std::remove_const_t; \ if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ { \ amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept \ @@ -101,7 +101,7 @@ } #define 
AMREX_HOST_DEVICE_FOR_1D_FLAG(where_to_run,n,i,block) \ - { using amrex_i_inttype = typename std::remove_const::type; \ + { using amrex_i_inttype = std::remove_const_t; \ if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ { \ amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept \ @@ -186,7 +186,7 @@ // xxxxx SYCL todo: host disabled in host device #define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FLAG(where_to_run,n,i,block) \ - { using amrex_i_inttype = typename std::remove_const::type; \ + { using amrex_i_inttype = std::remove_const_t; \ if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ { \ amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept \ @@ -220,7 +220,7 @@ } #define AMREX_HOST_DEVICE_FOR_1D_FLAG(where_to_run,n,i,block) \ - { using amrex_i_inttype = typename std::remove_const::type; \ + { using amrex_i_inttype = std::remove_const_t; \ if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ { \ amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept \ @@ -290,7 +290,7 @@ #else #define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FLAG(where_to_run,n,i,block) \ - { using amrex_i_inttype = typename std::remove_const::type; \ + { using amrex_i_inttype = std::remove_const_t; \ amrex::ignore_unused(where_to_run); \ AMREX_PRAGMA_SIMD \ for (amrex_i_inttype i = 0; i < n; ++i) { \ @@ -310,7 +310,7 @@ ); #define AMREX_HOST_DEVICE_FOR_1D_FLAG(where_to_run,n,i,block) \ - { using amrex_i_inttype = typename std::remove_const::type; \ + { using amrex_i_inttype = std::remove_const_t; \ amrex::ignore_unused(where_to_run); \ for (amrex_i_inttype i = 0; i < n; ++i) { \ block \ diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 6ce9cca0f3a..04496348cf0 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -57,46 +57,46 @@ namespace detail { template void launch (T const& n, L&& f) noexcept { - f(n); + std::forward(f)(n); } template void launch (T const& n, L&& f) noexcept { amrex::ignore_unused(MT); - f(n); + std::forward(f)(n); } -template ::value> > -void For (T n, L&& f) noexcept +template > > +void For (T n, L const& f) noexcept { for (T i = 0; i < n; ++i) { detail::call_f(f,i); } } -template ::value> > +template > > void For (T n, L&& f) noexcept { amrex::ignore_unused(MT); For(n, std::forward(f)); } -template ::value> > +template > > void For (Gpu::KernelInfo const&, T n, L&& f) noexcept { For(n, std::forward(f)); } -template ::value> > +template > > void For (Gpu::KernelInfo const&, T n, L&& f) noexcept { amrex::ignore_unused(MT); For(n, std::forward(f)); } -template ::value> > -void ParallelFor (T n, L&& f) noexcept +template > > +void ParallelFor (T n, L const& f) noexcept { AMREX_PRAGMA_SIMD for (T i = 0; i < n; ++i) { @@ -104,20 +104,20 @@ void ParallelFor (T n, L&& f) noexcept } } -template ::value> > +template > > void ParallelFor (T n, L&& f) noexcept { amrex::ignore_unused(MT); ParallelFor(n, std::forward(f)); } -template ::value> > +template > > void ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept { ParallelFor(n, std::forward(f)); } -template ::value> > +template > > void ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept { amrex::ignore_unused(MT); @@ -125,7 +125,7 @@ void ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept } template -void For (Box const& box, L&& f) noexcept +void For (Box const& box, L const& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -157,7 +157,7 
@@ void For (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept } template -void ParallelFor (Box const& box, L&& f) noexcept +void ParallelFor (Box const& box, L const& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -189,8 +189,8 @@ void ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept ParallelFor(box, std::forward(f)); } -template ::value> > -void For (Box const& box, T ncomp, L&& f) noexcept +template > > +void For (Box const& box, T ncomp, L const& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -203,28 +203,28 @@ void For (Box const& box, T ncomp, L&& f) noexcept } } -template ::value> > +template > > void For (Box const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); For(box, ncomp, std::forward(f)); } -template ::value> > +template > > void For (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { For(box, ncomp, std::forward(f)); } -template ::value> > +template > > void For (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); For(box, ncomp, std::forward(f)); } -template ::value> > -void ParallelFor (Box const& box, T ncomp, L&& f) noexcept +template > > +void ParallelFor (Box const& box, T ncomp, L const& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -238,20 +238,20 @@ void ParallelFor (Box const& box, T ncomp, L&& f) noexcept } } -template ::value> > +template > > void ParallelFor (Box const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); ParallelFor(box, ncomp, std::forward(f)); } -template ::value> > +template > > void ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { ParallelFor(box, ncomp, std::forward(f)); } -template ::value> > +template > > void ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); @@ -317,8 +317,8 @@ void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& b } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void For (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -327,8 +327,8 @@ void For (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void For (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -338,8 +338,8 @@ void For (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void For (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept @@ -348,8 +348,8 @@ void For (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void For (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept @@ -359,9 +359,9 @@ void For (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void For (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, 
L3&& f3) noexcept @@ -372,9 +372,9 @@ void For (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void For (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept @@ -386,9 +386,9 @@ void For (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void For (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, @@ -400,9 +400,9 @@ void For (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void For (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, @@ -432,14 +432,14 @@ void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept template void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { - ParallelFor(box1,box2,f1,f2); + ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); } template void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { amrex::ignore_unused(MT); - ParallelFor(box1,box2,f1,f2); + ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); } template @@ -473,8 +473,8 @@ void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -483,8 +483,8 @@ void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -494,8 +494,8 @@ void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void ParallelFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept @@ -505,8 +505,8 @@ void ParallelFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void ParallelFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept @@ -517,9 +517,9 @@ void ParallelFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept @@ -530,9 +530,9 @@ void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value>, - typename 
M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept @@ -544,9 +544,9 @@ void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void ParallelFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, @@ -558,9 +558,9 @@ void ParallelFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void ParallelFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, @@ -572,13 +572,13 @@ void ParallelFor (Gpu::KernelInfo const&, box3, ncomp3, std::forward(f3)); } -template ::value> > +template > > void HostDeviceParallelFor (T n, L&& f) noexcept { ParallelFor(n,std::forward(f)); } -template ::value> > +template > > void HostDeviceParallelFor (T n, L&& f) noexcept { amrex::ignore_unused(MT); @@ -598,13 +598,13 @@ void HostDeviceParallelFor (Box const& box, L&& f) noexcept ParallelFor(box,std::forward(f)); } -template ::value> > +template > > void HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept { ParallelFor(box,ncomp,std::forward(f)); } -template ::value> > +template > > void HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); @@ -640,8 +640,8 @@ void HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -649,8 +649,8 @@ void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -659,9 +659,9 @@ void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept @@ -672,9 +672,9 @@ void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept @@ -685,13 +685,13 @@ void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, box3,ncomp3,std::forward(f3)); } -template ::value> > +template > > void HostDeviceFor (T n, L&& f) noexcept { For(n,std::forward(f)); } -template ::value> > 
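Aside: the `is_integral`/`is_integral_v` pairs running through these hunks are a mechanical modernization. A defaulted template parameter of the form `typename M=std::enable_if_t<...>` is a SFINAE guard that removes the overload from consideration when `T` is not an integral type, and C++17's `_v` variable templates shorten the C++11 `::value` spelling. A minimal self-contained sketch of the same guard, with an invented function name (not AMReX code):

    #include <type_traits>

    // Overload participates only when T is an integral type; for any other T
    // the default template argument fails to form and the overload vanishes.
    template <typename T, typename M = std::enable_if_t<std::is_integral_v<T>> >
    void repeat_twice (T n)
    {
        for (T i = 0; i < n; ++i) { /* ... */ }
    }

    int main ()
    {
        repeat_twice(3);      // OK: int is integral
        // repeat_twice(3.0); // would not compile: no matching overload
    }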
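A second pattern sits alongside it: leaf overloads that actually invoke the callable now take `L const&` (the lambda is only called, never stored or moved), while the thin dispatching overloads keep `L&&` and pass it on via `std::forward<L>(f)` rather than a bare `f`, which preserves the functor's value category through the call chain. A minimal sketch of the two layers, with invented names:

    #include <utility>

    // Leaf: only calls the functor, so a const reference suffices.
    template <typename T, typename L>
    void for_each_index (T n, L const& f)
    {
        for (T i = 0; i < n; ++i) { f(i); }
    }

    // Shim: perfect-forwards the functor to the leaf, keeping an rvalue
    // an rvalue instead of silently degrading it to an lvalue.
    template <typename T, typename L>
    void for_each_index_shim (T n, L&& f)
    {
        for_each_index(n, std::forward<L>(f));
    }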
+template > > void HostDeviceFor (T n, L&& f) noexcept { amrex::ignore_unused(MT); @@ -711,13 +711,13 @@ void HostDeviceFor (Box const& box, L&& f) noexcept For(box,std::forward(f)); } -template ::value> > +template > > void HostDeviceFor (Box const& box, T ncomp, L&& f) noexcept { For(box,ncomp,std::forward(f)); } -template ::value> > +template > > void HostDeviceFor (Box const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); @@ -753,8 +753,8 @@ void HostDeviceFor (Box const& box1, Box const& box2, Box const& box3, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -762,8 +762,8 @@ void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -772,9 +772,9 @@ void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept @@ -785,9 +785,9 @@ void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept @@ -798,13 +798,13 @@ void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, box3,ncomp3,std::forward(f3)); } -template ::value> > +template > > void HostDeviceParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept { ParallelFor(n,std::forward(f)); } -template ::value> > +template > > void HostDeviceParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept { amrex::ignore_unused(MT); @@ -824,13 +824,13 @@ void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexc ParallelFor(box,std::forward(f)); } -template ::value> > +template > > void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { ParallelFor(box,ncomp,std::forward(f)); } -template ::value> > +template > > void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); @@ -868,8 +868,8 @@ void HostDeviceParallelFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept @@ -878,8 +878,8 @@ void HostDeviceParallelFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept @@ -889,9 +889,9 @@ void HostDeviceParallelFor (Gpu::KernelInfo const&, } template ::value>, - typename 
M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, @@ -903,9 +903,9 @@ void HostDeviceParallelFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, @@ -917,13 +917,13 @@ void HostDeviceParallelFor (Gpu::KernelInfo const&, box3,ncomp3,std::forward(f3)); } -template ::value> > +template > > void HostDeviceFor (Gpu::KernelInfo const&, T n, L&& f) noexcept { For(n,std::forward(f)); } -template ::value> > +template > > void HostDeviceFor (Gpu::KernelInfo const&, T n, L&& f) noexcept { amrex::ignore_unused(MT); @@ -943,13 +943,13 @@ void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept For(box,std::forward(f)); } -template ::value> > +template > > void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { For(box,ncomp,std::forward(f)); } -template ::value> > +template > > void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { amrex::ignore_unused(MT); @@ -987,8 +987,8 @@ void HostDeviceFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void HostDeviceFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept @@ -997,8 +997,8 @@ void HostDeviceFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t> > void HostDeviceFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept @@ -1008,9 +1008,9 @@ void HostDeviceFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void HostDeviceFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, @@ -1022,9 +1022,9 @@ void HostDeviceFor (Gpu::KernelInfo const&, } template ::value>, - typename M2=std::enable_if_t::value>, - typename M3=std::enable_if_t::value> > + typename M1=std::enable_if_t>, + typename M2=std::enable_if_t>, + typename M3=std::enable_if_t> > void HostDeviceFor (Gpu::KernelInfo const&, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, @@ -1036,8 +1036,8 @@ void HostDeviceFor (Gpu::KernelInfo const&, box3,ncomp3,std::forward(f3)); } -template ::value> > -void ParallelForRNG (T n, L&& f) noexcept +template > > +void ParallelForRNG (T n, L const& f) noexcept { for (T i = 0; i < n; ++i) { f(i,RandomEngine{}); @@ -1045,7 +1045,7 @@ void ParallelForRNG (T n, L&& f) noexcept } template -void ParallelForRNG (Box const& box, L&& f) noexcept +void ParallelForRNG (Box const& box, L const& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -1056,8 +1056,8 @@ void ParallelForRNG (Box const& box, L&& f) noexcept }}} } -template ::value> > -void 
ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept +template > > +void ParallelForRNG (Box const& box, T ncomp, L const& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -1073,7 +1073,7 @@ void ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept template void single_task (L&& f) noexcept { - f(); + std::forward(f)(); } } diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index aea0c030152..07f28d0944a 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -7,7 +7,7 @@ namespace amrex { #ifdef AMREX_USE_SYCL template -void single_task (gpuStream_t stream, L&& f) noexcept +void single_task (gpuStream_t stream, L const& f) noexcept { auto& q = *(stream.queue); try { @@ -21,10 +21,10 @@ void single_task (gpuStream_t stream, L&& f) noexcept template void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, - gpuStream_t stream, L&& f) noexcept + gpuStream_t stream, L const& f) noexcept { - int nthreads_total = nthreads_per_block * nblocks; - std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) + const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks; + const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) / sizeof(unsigned long long); auto& q = *(stream.queue); try { @@ -36,7 +36,7 @@ void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, [=] (sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - f(Gpu::Handler{&item,shared_data.get_pointer()}); + f(Gpu::Handler{&item,shared_data.get_multi_ptr().get()}); }); }); } catch (sycl::exception const& ex) { @@ -45,9 +45,9 @@ void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, } template -void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept +void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L const& f) noexcept { - int nthreads_total = nthreads_per_block * nblocks; + const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks; auto& q = *(stream.queue); try { q.submit([&] (sycl::handler& h) { @@ -66,10 +66,10 @@ void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noe template void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, - L&& f) noexcept + L const& f) noexcept { - int nthreads_total = MT * nblocks; - std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) + const auto nthreads_total = MT * std::size_t(nblocks); + const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) / sizeof(unsigned long long); auto& q = *(stream.queue); try { @@ -82,7 +82,7 @@ void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - f(Gpu::Handler{&item,shared_data.get_pointer()}); + f(Gpu::Handler{&item,shared_data.get_multi_ptr().get()}); }); }); } catch (sycl::exception const& ex) { @@ -91,9 +91,9 @@ void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, } template -void launch (int nblocks, gpuStream_t stream, L&& f) noexcept +void launch (int nblocks, gpuStream_t stream, L const& f) noexcept { - int nthreads_total = MT * nblocks; + const auto nthreads_total = MT * std::size_t(nblocks); auto& q = *(stream.queue); try { q.submit([&] (sycl::handler& h) { @@ -112,12 +112,12 @@ void launch 
(int nblocks, gpuStream_t stream, L&& f) noexcept } template -void launch (T const& n, L&& f) noexcept +void launch (T const& n, L const& f) noexcept { if (amrex::isEmpty(n)) { return; } const auto ec = Gpu::makeExecutionConfig(n); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -188,12 +188,12 @@ namespace detail { } template ::value> > -void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept { if (amrex::isEmpty(n)) { return; } const auto ec = Gpu::makeExecutionConfig(n); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { if (info.hasReduction()) { @@ -206,11 +206,11 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (T i = item.get_global_id(0), stride = item.get_global_range(0); - i < n; i += stride) { - int n_active_threads = amrex::min(n-i+(T)item.get_local_id(0), - (T)item.get_local_range(0)); - detail::call_f(f, i, Gpu::Handler{&item, shared_data.get_pointer(), + for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0); + i < std::size_t(n); i += stride) { + int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0), + item.get_local_range(0)); + detail::call_f(f, T(i), Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } }); @@ -223,9 +223,9 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (T i = item.get_global_id(0), stride = item.get_global_range(0); - i < n; i += stride) { - detail::call_f(f, i, Gpu::Handler{&item}); + for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0); + i < std::size_t(n); i += stride) { + detail::call_f(f, T(i), Gpu::Handler{&item}); } }); }); @@ -236,17 +236,13 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept } template -void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const BoxIndexer indexer(box); + const auto ec = Gpu::makeExecutionConfig(box.numPts()); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { if (info.hasReduction()) { @@ -259,17 +255,12 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = 
item.get_global_id(0), stride = item.get_global_range(0); - icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - int n_active_threads = amrex::min(ncells-icell+(int)item.get_local_id(0), - (int)item.get_local_range(0)); - detail::call_f(f, i, j, k, Gpu::Handler{&item, shared_data.get_pointer(), + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); + icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = indexer(icell); + int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)), + std::uint64_t(item.get_local_range(0))); + detail::call_f(f, i, j, k, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } }); @@ -282,14 +273,9 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); - icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); + icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = indexer(icell); detail::call_f(f,i,j,k,Gpu::Handler{&item}); } }); @@ -301,17 +287,13 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept } template ::value> > -void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const BoxIndexer indexer(box); + const auto ec = Gpu::makeExecutionConfig(box.numPts()); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { if (info.hasReduction()) { @@ -324,18 +306,13 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); - icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - int n_active_threads = amrex::min(ncells-icell+(int)item.get_local_id(0), - (int)item.get_local_range(0)); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); + icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = indexer(icell); + int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)), + std::uint64_t(item.get_local_range(0))); detail::call_f(f, i, j, k, ncomp, - Gpu::Handler{&item, shared_data.get_pointer(), + Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } }); @@ -348,14 +325,9 @@ void ParallelFor (Gpu::KernelInfo 
const& info, Box const& box, T ncomp, L&& f) n [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); - icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); + icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = indexer(icell); detail::call_f(f,i,j,k,ncomp,Gpu::Handler{&item}); } }); @@ -367,12 +339,12 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n } template ::value> > -void ParallelForRNG (T n, L&& f) noexcept +void ParallelForRNG (T n, L const& f) noexcept { if (amrex::isEmpty(n)) { return; } const auto ec = Gpu::ExecutionConfig(n); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); auto& q = Gpu::Device::streamQueue(); auto& engdescr = *(getRandEngineDescriptor()); try { @@ -384,11 +356,11 @@ void ParallelForRNG (T n, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - int tid = item.get_global_id(0); + auto const tid = item.get_global_id(0); auto engine = engine_acc.load(tid); RandomEngine rand_eng{&engine}; - for (T i = tid, stride = item.get_global_range(0); i < n; i += stride) { - f(i,rand_eng); + for (std::size_t i = tid, stride = item.get_global_range(0); i < std::size_t(n); i += stride) { + f(T(i),rand_eng); } engine_acc.store(engine, tid); }); @@ -400,17 +372,13 @@ void ParallelForRNG (T n, L&& f) noexcept } template -void ParallelForRNG (Box const& box, L&& f) noexcept +void ParallelForRNG (Box const& box, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - const auto ec = Gpu::ExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); + const BoxIndexer indexer(box); + const auto ec = Gpu::ExecutionConfig(box.numPts()); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); auto& q = Gpu::Device::streamQueue(); auto& engdescr = *(getRandEngineDescriptor()); try { @@ -422,17 +390,12 @@ void ParallelForRNG (Box const& box, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - int tid = item.get_global_id(0); + auto const tid = item.get_global_id(0); auto engine = engine_acc.load(tid); RandomEngine rand_eng{&engine}; - for (int icell = tid, stride = item.get_global_range(0); - icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + for (std::uint64_t icell = tid, stride = item.get_global_range(0); + icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = 
indexer(icell); f(i,j,k,rand_eng); } engine_acc.store(engine, tid); @@ -445,17 +408,13 @@ void ParallelForRNG (Box const& box, L&& f) noexcept } template ::value> > -void ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept +void ParallelForRNG (Box const& box, T ncomp, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - const auto ec = Gpu::ExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); + const BoxIndexer indexer(box); + const auto ec = Gpu::ExecutionConfig(box.numPts()); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); auto& q = Gpu::Device::streamQueue(); auto& engdescr = *(getRandEngineDescriptor()); try { @@ -467,17 +426,12 @@ void ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - int tid = item.get_global_id(0); + auto const tid = item.get_global_id(0); auto engine = engine_acc.load(tid); RandomEngine rand_eng{&engine}; - for (int icell = tid, stride = item.get_global_range(0); - icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + for (std::uint64_t icell = tid, stride = item.get_global_range(0); + icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = indexer(icell); for (T n = 0; n < ncomp; ++n) { f(i,j,k,n,rand_eng); } @@ -495,20 +449,11 @@ template void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const auto ec = Gpu::makeExecutionConfig(std::max(box1.numPts(), box2.numPts())); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -518,24 +463,15 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& b [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + auto const ncells = std::max(indexer1.numPts(), indexer2.numPts()); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - 
j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + if (icell < indexer1.numPts()) { + auto [i, j, k] = indexer1(icell); f1(i,j,k); } - if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + if (icell < indexer2.numPts()) { + auto [i, j, k] = indexer2(icell); f2(i,j,k); } } @@ -552,25 +488,12 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, L1&& f1, L2&& f2, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells3 = box3.numPts(); - int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len3xy = len3.x*len3.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto len3x = len3.x; - const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const BoxIndexer indexer3(box3); + const auto ec = Gpu::makeExecutionConfig(std::max({box1.numPts(),box2.numPts(),box3.numPts()})); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -580,33 +503,19 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()}); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + if (icell < indexer1.numPts()) { + auto [i, j, k] = indexer1(icell); f1(i,j,k); } - if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + if (icell < indexer2.numPts()) { + auto [i, j, k] = indexer2(icell); f2(i,j,k); } - if (icell < ncells3) { - int k = icell / len3xy; - int j = (icell - k*len3xy) / len3x; - int i = (icell - k*len3xy) - j*len3x; - i += lo3.x; - j += lo3.y; - k += lo3.z; + if (icell < indexer3.numPts()) { + auto [i, j, k] = indexer3(icell); f3(i,j,k); } } @@ -625,20 +534,11 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box2, T2 ncomp2, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len1x = len1.x; - const auto len2x = len2.x; 
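Aside: the deleted `lenxy`/`lenx` arithmetic on both sides of these hunks decodes a flattened cell number into `(i,j,k)` with x varying fastest; `BoxIndexer` packages that decode behind one call. A rough stand-in showing the same row-major decode — a hypothetical struct under stated assumptions, not the real `BoxIndexer`:

    #include <cstdint>

    struct Cell { int i, j, k; };

    // Hypothetical flattened-index decoder: cells are numbered x-fastest,
    // then y, then z, within a box of extents (nx,ny,...) anchored at lo.
    struct FlatIndexer
    {
        int lox, loy, loz;      // lower corner of the box
        int nx, ny;             // box extents in x and y
        std::uint64_t npts;     // total number of cells in the box

        std::uint64_t numPts () const { return npts; }

        Cell operator() (std::uint64_t icell) const {
            auto i = int(icell % nx);                        // fastest-varying
            auto j = int((icell / nx) % ny);
            auto k = int(icell / (std::uint64_t(nx) * ny));  // slowest-varying
            return Cell{i + lox, j + loy, k + loz};
        }
    };

    // usage mirrors the diff: auto [i, j, k] = indexer(icell);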
- const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const auto ec = Gpu::makeExecutionConfig(std::max(box1.numPts(),box2.numPts())); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -648,26 +548,17 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + auto const ncells = std::max(indexer1.numPts(), indexer2.numPts()); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + if (icell < indexer1.numPts()) { + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { f1(i,j,k,n); } } - if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + if (icell < indexer2.numPts()) { + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { f2(i,j,k,n); } @@ -690,25 +581,12 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box3, T3 ncomp3, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells3 = box3.numPts(); - int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len3xy = len3.x*len3.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto len3x = len3.x; - const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const BoxIndexer indexer3(box3); + const auto ec = Gpu::makeExecutionConfig(std::max({box1.numPts(),box2.numPts(),box3.numPts()})); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -718,37 +596,23 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()}); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + if (icell < 
indexer1.numPts()) { + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { f1(i,j,k,n); } } - if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + if (icell < indexer2.numPts()) { + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { f2(i,j,k,n); } } - if (icell < ncells3) { - int k = icell / len3xy; - int j = (icell - k*len3xy) / len3x; - int i = (icell - k*len3xy) - j*len3x; - i += lo3.x; - j += lo3.y; - k += lo3.z; + if (icell < indexer3.numPts()) { + auto [i, j, k] = indexer3(icell); for (T3 n = 0; n < ncomp3; ++n) { f3(i,j,k,n); } @@ -765,7 +629,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, // CUDA or HIP template -void single_task (gpuStream_t stream, L&& f) noexcept +void single_task (gpuStream_t stream, L const& f) noexcept { AMREX_LAUNCH_KERNEL(Gpu::Device::warp_size, 1, 1, 0, stream, [=] AMREX_GPU_DEVICE () noexcept {f();}); @@ -774,7 +638,7 @@ void single_task (gpuStream_t stream, L&& f) noexcept template void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, - L&& f) noexcept + L const& f) noexcept { AMREX_LAUNCH_KERNEL(MT, nblocks, MT, shared_mem_bytes, stream, [=] AMREX_GPU_DEVICE () noexcept { f(); }); @@ -782,7 +646,7 @@ void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, } template -void launch (int nblocks, gpuStream_t stream, L&& f) noexcept +void launch (int nblocks, gpuStream_t stream, L const& f) noexcept { AMREX_LAUNCH_KERNEL(MT, nblocks, MT, 0, stream, [=] AMREX_GPU_DEVICE () noexcept { f(); }); @@ -791,7 +655,7 @@ void launch (int nblocks, gpuStream_t stream, L&& f) noexcept template void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, - gpuStream_t stream, L&& f) noexcept + gpuStream_t stream, L const& f) noexcept { AMREX_ASSERT(nthreads_per_block <= AMREX_GPU_MAX_THREADS); AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, nblocks, nthreads_per_block, shared_mem_bytes, @@ -806,7 +670,7 @@ void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noe } template -void launch (T const& n, L&& f) noexcept +void launch (T const& n, L const& f) noexcept { if (amrex::isEmpty(n)) { return; } const auto ec = Gpu::makeExecutionConfig(n); @@ -822,7 +686,7 @@ void launch (T const& n, L&& f) noexcept namespace detail { template AMREX_GPU_DEVICE - auto call_f (F const& f, N i, N /*nleft*/) + auto call_f (F const& f, N i, std::uint64_t /*nleft*/) noexcept -> decltype(f(0)) { f(i); @@ -830,15 +694,15 @@ namespace detail { template AMREX_GPU_DEVICE - auto call_f (F const& f, N i, N nleft) + auto call_f (F const& f, N i, std::uint64_t nleft) noexcept -> decltype(f(0,Gpu::Handler{})) { - f(i,Gpu::Handler(amrex::min(nleft,(N)blockDim.x))); + f(i,Gpu::Handler(amrex::min(nleft,(std::uint64_t)blockDim.x))); } template AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, int /*nleft*/) + auto call_f (F const& f, int i, int j, int k, std::uint64_t /*nleft*/) noexcept -> decltype(f(0,0,0)) { f(i,j,k); @@ -846,15 +710,15 @@ namespace detail { template AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, int nleft) + auto call_f (F const& f, int i, int j, int k, std::uint64_t nleft) noexcept -> decltype(f(0,0,0,Gpu::Handler{})) { - f(i,j,k,Gpu::Handler(amrex::min(nleft,(int)blockDim.x))); + f(i,j,k,Gpu::Handler(amrex::min(nleft,(std::uint64_t)blockDim.x))); } template AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int 
k, T ncomp, int /*nleft*/) + auto call_f (F const& f, int i, int j, int k, T ncomp, std::uint64_t /*nleft*/) noexcept -> decltype(f(0,0,0,0)) { for (T n = 0; n < ncomp; ++n) f(i,j,k,n); @@ -862,24 +726,24 @@ namespace detail { template AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, T ncomp, int nleft) + auto call_f (F const& f, int i, int j, int k, T ncomp, std::uint64_t nleft) noexcept -> decltype(f(0,0,0,0,Gpu::Handler{})) { - for (T n = 0; n < ncomp; ++n) f(i,j,k,n,Gpu::Handler(amrex::min(nleft,(int)blockDim.x))); + for (T n = 0; n < ncomp; ++n) f(i,j,k,n,Gpu::Handler(amrex::min(nleft,(std::uint64_t)blockDim.x))); } } template ::value> > std::enable_if_t::value> -ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept { if (amrex::isEmpty(n)) { return; } const auto ec = Gpu::makeExecutionConfig(n); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (T i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - i < n; i += stride) { - detail::call_f(f, i, (n-i+(T)threadIdx.x)); + for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x; + i < Long(n); i += stride) { + detail::call_f(f, T(i), (Long(n)-i+(Long)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); @@ -887,27 +751,18 @@ ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept template std::enable_if_t::value> -ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const&, Box const& box, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - const auto ec = Gpu::makeExecutionConfig(ncells); + const BoxIndexer indexer(box); + const auto ec = Gpu::makeExecutionConfig(box.numPts()); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; + icell < indexer.numPts(); icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - detail::call_f(f, i, j, k, (ncells-icell+(int)threadIdx.x)); + auto [i, j, k] = indexer(icell); + detail::call_f(f, i, j, k, (indexer.numPts()-icell+(std::uint64_t)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); @@ -915,26 +770,17 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept template ::value> > std::enable_if_t::value> -ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - const auto ec = Gpu::makeExecutionConfig(ncells); + const BoxIndexer indexer(box); + const auto ec = Gpu::makeExecutionConfig(box.numPts()); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = 
blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - detail::call_f(f, i, j, k, ncomp, (ncells-icell+(int)threadIdx.x)); + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; + icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = indexer(icell); + detail::call_f(f, i, j, k, ncomp, (indexer.numPts()-icell+(std::uint64_t)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); @@ -942,7 +788,7 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept template ::value> > std::enable_if_t::value> -ParallelForRNG (T n, L&& f) noexcept +ParallelForRNG (T n, L const& f) noexcept { if (amrex::isEmpty(n)) { return; } randState_t* rand_state = getRandState(); @@ -951,10 +797,10 @@ ParallelForRNG (T n, L&& f) noexcept amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - int tid = blockDim.x*blockIdx.x+threadIdx.x; + Long tid = Long(blockDim.x)*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; - for (T i = tid, stride = blockDim.x*gridDim.x; i < n; i += stride) { - f(i,engine); + for (Long i = tid, stride = Long(blockDim.x)*gridDim.x; i < Long(n); i += stride) { + f(T(i),engine); } }); Gpu::streamSynchronize(); // To avoid multiple streams using RNG @@ -963,29 +809,20 @@ ParallelForRNG (T n, L&& f) noexcept template std::enable_if_t::value> -ParallelForRNG (Box const& box, L&& f) noexcept +ParallelForRNG (Box const& box, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } randState_t* rand_state = getRandState(); - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - const auto ec = Gpu::ExecutionConfig(ncells); + const BoxIndexer indexer(box); + const auto ec = Gpu::ExecutionConfig(box.numPts()); AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - int tid = blockDim.x*blockIdx.x+threadIdx.x; + auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; - for (int icell = tid, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = indexer(icell); f(i,j,k,engine); } }); @@ -995,29 +832,20 @@ ParallelForRNG (Box const& box, L&& f) noexcept template ::value> > std::enable_if_t::value> -ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept +ParallelForRNG (Box const& box, T ncomp, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } randState_t* rand_state = getRandState(); - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - const auto ec = Gpu::ExecutionConfig(ncells); + const BoxIndexer indexer(box); + const auto ec = Gpu::ExecutionConfig(box.numPts()); AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), 
ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - int tid = blockDim.x*blockIdx.x+threadIdx.x; + auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; - for (int icell = tid, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) { + auto [i, j, k] = indexer(icell); for (T n = 0; n < ncomp; ++n) { f(i,j,k,n,engine); } @@ -1033,38 +861,20 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto ec = Gpu::makeExecutionConfig(ncells); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const auto ec = Gpu::makeExecutionConfig(std::max(box1.numPts(),box2.numPts())); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + auto const ncells = std::max(indexer1.numPts(), indexer2.numPts()); + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + if (icell < indexer1.numPts()) { + auto [i, j, k] = indexer1(icell); f1(i,j,k); } - if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + if (icell < indexer2.numPts()) { + auto [i, j, k] = indexer2(icell); f2(i,j,k); } } @@ -1079,52 +889,25 @@ ParallelFor (Gpu::KernelInfo const&, L1&& f1, L2&& f2, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells3 = box3.numPts(); - int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len3xy = len3.x*len3.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto len3x = len3.x; - const auto ec = Gpu::makeExecutionConfig(ncells); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const BoxIndexer indexer3(box3); + const auto ec = Gpu::makeExecutionConfig(std::max({box1.numPts(),box2.numPts(),box3.numPts()})); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = 
blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()}); + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + if (icell < indexer1.numPts()) { + auto [i, j, k] = indexer1(icell); f1(i,j,k); } - if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + if (icell < indexer2.numPts()) { + auto [i, j, k] = indexer2(icell); f2(i,j,k); } - if (icell < ncells3) { - int k = icell / len3xy; - int j = (icell - k*len3xy) / len3x; - int i = (icell - k*len3xy) - j*len3x; - i += lo3.x; - j += lo3.y; - k += lo3.z; + if (icell < indexer3.numPts()) { + auto [i, j, k] = indexer3(icell); f3(i,j,k); } } @@ -1141,40 +924,22 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box2, T2 ncomp2, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto ec = Gpu::makeExecutionConfig(ncells); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const auto ec = Gpu::makeExecutionConfig(std::max(box1.numPts(),box2.numPts())); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + auto const ncells = std::max(indexer1.numPts(), indexer2.numPts()); + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + if (icell < indexer1.numPts()) { + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { f1(i,j,k,n); } } - if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + if (icell < indexer2.numPts()) { + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { f2(i,j,k,n); } @@ -1195,56 +960,29 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box3, T3 ncomp3, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells3 = box3.numPts(); - int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len3xy = 
len3.x*len3.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto len3x = len3.x; - const auto ec = Gpu::makeExecutionConfig(ncells); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const BoxIndexer indexer3(box3); + const auto ec = Gpu::makeExecutionConfig(std::max({box1.numPts(),box2.numPts(),box3.numPts()})); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()}); + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + if (icell < indexer1.numPts()) { + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { f1(i,j,k,n); } } - if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + if (icell < indexer2.numPts()) { + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { f2(i,j,k,n); } } - if (icell < ncells3) { - int k = icell / len3xy; - int j = (icell - k*len3xy) / len3x; - int i = (icell - k*len3xy) - j*len3x; - i += lo3.x; - j += lo3.y; - k += lo3.z; + if (icell < indexer3.numPts()) { + auto [i, j, k] = indexer3(icell); for (T3 n = 0; n < ncomp3; ++n) { f3(i,j,k,n); } @@ -1678,7 +1416,7 @@ HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); #else AMREX_PRAGMA_SIMD - for (T i = 0; i < n; ++i) f(i); + for (T i = 0; i < n; ++i) { f(i); } #endif } } @@ -1694,7 +1432,7 @@ HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. 
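// Illustrative sketch (not part of this diff) of the interface the hunks
// above rely on: BoxIndexer flattens a Box to the range [0, numPts()) and
// decomposes a 64-bit cell index back into (i,j,k), so these kernels no
// longer overflow int for boxes with more than 2^31 cells. The members here
// are a simplified assumption, not the real AMReX implementation (which uses
// the FastDivmodU64 added later in this diff).
#include <AMReX_Dim3.H>
#include <cstdint>

struct BoxIndexerSketch
{
    std::uint64_t nx, nxy, npts;   // len.x, len.x*len.y, box.numPts()
    int lox, loy, loz;             // lower bounds of the box
    [[nodiscard]] AMREX_GPU_HOST_DEVICE
    std::uint64_t numPts () const { return npts; }
    [[nodiscard]] AMREX_GPU_HOST_DEVICE
    amrex::Dim3 operator() (std::uint64_t icell) const
    {
        std::uint64_t k = icell / nxy;
        std::uint64_t j = (icell - k*nxy) / nx;
        std::uint64_t i = (icell - k*nxy) - j*nx;
        return {int(i)+lox, int(j)+loy, int(k)+loz};   // auto [i,j,k] binds to x,y,z
    }
};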
It takes too long to compile"); #else AMREX_PRAGMA_SIMD - for (T i = 0; i < n; ++i) f(i); + for (T i = 0; i < n; ++i) { f(i); } #endif } } diff --git a/Src/Base/AMReX_GpuLaunchMacrosG.nolint.H b/Src/Base/AMReX_GpuLaunchMacrosG.nolint.H index 22c0e00b56b..f01673c9821 100644 --- a/Src/Base/AMReX_GpuLaunchMacrosG.nolint.H +++ b/Src/Base/AMReX_GpuLaunchMacrosG.nolint.H @@ -457,7 +457,7 @@ #define AMREX_GPU_HOST_DEVICE_FOR_1D(n,i,block) \ { \ auto const& amrex_i_n = n; \ - using amrex_i_inttype = typename std::remove_const::type; \ + using amrex_i_inttype = std::remove_const_t; \ if (amrex::Gpu::inLaunchRegion()) { \ amrex::ParallelFor(amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \ } else { \ @@ -468,7 +468,7 @@ #define AMREX_GPU_HOST_DEVICE_FOR_1D(n,i,block) \ { \ auto const& amrex_i_n = n; \ - using amrex_i_inttype = typename std::remove_const::type; \ + using amrex_i_inttype = std::remove_const_t; \ if (amrex::Gpu::inLaunchRegion()) { \ amrex::ParallelFor(amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \ } else { \ @@ -481,7 +481,7 @@ #define AMREX_GPU_DEVICE_FOR_1D(n,i,block) \ { \ - using amrex_i_inttype = typename std::remove_const::type; \ + using amrex_i_inttype = std::remove_const_t; \ amrex::ParallelFor(n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \ } diff --git a/Src/Base/AMReX_GpuMemory.H b/Src/Base/AMReX_GpuMemory.H index 1ffee387015..8daeceba914 100644 --- a/Src/Base/AMReX_GpuMemory.H +++ b/Src/Base/AMReX_GpuMemory.H @@ -51,7 +51,7 @@ struct Deleter { void operator() (void* pt) const noexcept { m_arena->free(pt); } }; -template ::value,int> = 0> +template ,int> = 0> struct DeviceScalar { DeviceScalar (DeviceScalar const&) = delete; @@ -104,7 +104,7 @@ private: #else - DeviceScalar (T init_val) : d(init_val) {} + DeviceScalar (T const& init_val) : d(init_val) {} DeviceScalar () = default; ~DeviceScalar () = default; diff --git a/Src/Base/AMReX_GpuQualifiers.H b/Src/Base/AMReX_GpuQualifiers.H index 1c0b5731762..4fba23a849a 100644 --- a/Src/Base/AMReX_GpuQualifiers.H +++ b/Src/Base/AMReX_GpuQualifiers.H @@ -8,6 +8,12 @@ #include #endif +#if defined(AMREX_USE_CUDA) && (defined(AMREX_CXX_PGI) || defined(AMREX_CXX_NVHPC)) +#include +#define AMREX_IF_ON_DEVICE(CODE) NV_IF_TARGET(NV_IS_DEVICE, CODE) +#define AMREX_IF_ON_HOST(CODE) NV_IF_TARGET(NV_IS_HOST, CODE) +#endif + #define AMREX_GPU_HOST __host__ #define AMREX_GPU_DEVICE __device__ #define AMREX_GPU_GLOBAL __global__ @@ -31,6 +37,29 @@ #define AMREX_DEVICE_COMPILE (__CUDA_ARCH__ || __HIP_DEVICE_COMPILE__ || __SYCL_DEVICE_ONLY__) +// Remove surrounding parentheses if present +#define AMREX_IMPL_STRIP_PARENS(X) AMREX_IMPL_ESC(AMREX_IMPL_ISH X) +#define AMREX_IMPL_ISH(...) AMREX_IMPL_ISH __VA_ARGS__ +#define AMREX_IMPL_ESC(...) AMREX_IMPL_ESC_(__VA_ARGS__) +#define AMREX_IMPL_ESC_(...) 
AMREX_IMPL_VAN_##__VA_ARGS__ +#define AMREX_IMPL_VAN_AMREX_IMPL_ISH + +#if !defined(AMREX_IF_ON_DEVICE) && !defined(AMREX_IF_ON_HOST) +#if (defined(AMREX_USE_CUDA) && defined(__CUDA_ARCH__)) || \ + (defined(AMREX_USE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \ + (defined(AMREX_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)) +#define AMREX_IF_ON_DEVICE(CODE) \ + { AMREX_IMPL_STRIP_PARENS(CODE) } +#define AMREX_IF_ON_HOST(CODE) \ + {} +#else +#define AMREX_IF_ON_DEVICE(CODE) \ + {} +#define AMREX_IF_ON_HOST(CODE) \ + { AMREX_IMPL_STRIP_PARENS(CODE) } +#endif +#endif + #ifdef AMREX_USE_SYCL # include #endif diff --git a/Src/Base/AMReX_GpuRange.H b/Src/Base/AMReX_GpuRange.H index b8d2ab89d08..ecf9a32fd2e 100644 --- a/Src/Base/AMReX_GpuRange.H +++ b/Src/Base/AMReX_GpuRange.H @@ -10,7 +10,7 @@ namespace amrex { -template ::value,int>::type = 0> +template ,int> = 0> bool isEmpty (T n) noexcept { return n <= 0; } AMREX_FORCE_INLINE bool isEmpty (Box const& b) noexcept { return b.isEmpty(); } @@ -20,11 +20,11 @@ namespace Gpu { namespace range_detail { //! integer version -template ::value,int>::type = 0> +template ,int> = 0> AMREX_GPU_HOST_DEVICE Long size (T const& b) noexcept { return static_cast(b); } -template ::value,int>::type = 0> +template ,int> = 0> AMREX_GPU_HOST_DEVICE Long at (T const& /*b*/, Long offset) noexcept { return offset; } @@ -32,31 +32,31 @@ Long at (T const& /*b*/, Long offset) noexcept { return offset; } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Long size (Box const& b) noexcept { -#if AMREX_DEVICE_COMPILE - return b.numPts(); -#else - amrex::ignore_unused(b); - return 1; -#endif + AMREX_IF_ON_DEVICE((return b.numPts();)) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(b); + return 1; + )) } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box at (Box const& b, Long offset) noexcept { -#if AMREX_DEVICE_COMPILE - auto len = b.length3d(); - Long k = offset / (len[0]*len[1]); - Long j = (offset - k*(len[0]*len[1])) / len[0]; - Long i = (offset - k*(len[0]*len[1])) - j*len[0]; - IntVect iv{AMREX_D_DECL(static_cast(i), - static_cast(j), - static_cast(k))}; - iv += b.smallEnd(); - return Box(iv,iv,b.type()); -#else - amrex::ignore_unused(offset); - return b; -#endif + AMREX_IF_ON_DEVICE(( + auto len = b.length3d(); + Long k = offset / (len[0]*len[1]); + Long j = (offset - k*(len[0]*len[1])) / len[0]; + Long i = (offset - k*(len[0]*len[1])) - j*len[0]; + IntVect iv{AMREX_D_DECL(static_cast(i), + static_cast(j), + static_cast(k))}; + iv += b.smallEnd(); + return Box(iv,iv,b.type()); + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(offset); + return b; + )) } template @@ -73,7 +73,7 @@ struct range_impl struct iterator { AMREX_GPU_HOST_DEVICE - iterator (T const& b, Long i, Long s) noexcept : mi_b(b), mi_i(i), mi_s(s) {} + iterator (T const& b, Long i, Long s) noexcept : mi_b(&b), mi_i(i), mi_s(s) {} AMREX_GPU_HOST_DEVICE void operator++ () noexcept { mi_i += mi_s; } @@ -82,23 +82,25 @@ struct range_impl bool operator!= (iterator const& rhs) const noexcept { return mi_i < rhs.mi_i; } AMREX_GPU_HOST_DEVICE - T operator* () const noexcept { return range_detail::at(mi_b,mi_i); } + T operator* () const noexcept { return range_detail::at(*mi_b,mi_i); } private: - T const& mi_b; + T const* mi_b; Long mi_i; Long mi_s; }; [[nodiscard]] AMREX_GPU_HOST_DEVICE iterator begin () const noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return iterator(m_b, blockDim.x*blockIdx.x+threadIdx.x, blockDim.x*gridDim.x); -#elif defined 
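// Usage sketch (illustrative, not part of this diff): AMREX_IF_ON_DEVICE and
// AMREX_IF_ON_HOST replace raw __CUDA_ARCH__ / __HIP_DEVICE_COMPILE__ checks,
// and under nvc++ they map onto NV_IF_TARGET from <nv/target>. The CODE
// argument is double-parenthesized so embedded commas survive macro
// expansion; AMREX_IMPL_STRIP_PARENS removes the extra pair.
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
int global_thread_id ()   // hypothetical helper, not an AMReX function
{
    AMREX_IF_ON_DEVICE(( return int(blockDim.x*blockIdx.x + threadIdx.x); ))
    AMREX_IF_ON_HOST(( return 0; ))
}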
(__SYCL_DEVICE_ONLY__) +#if defined (__SYCL_DEVICE_ONLY__) return iterator(m_b, m_gid, m_grange); #else - return iterator(m_b,0,1); + AMREX_IF_ON_DEVICE(( + return iterator(m_b, blockDim.x*blockIdx.x+threadIdx.x, blockDim.x*gridDim.x); + )) + AMREX_IF_ON_HOST(( + return iterator(m_b,0,1); + )) #endif } diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index 8b5680b41b8..ecb992983ba 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -29,6 +29,7 @@ struct Dim1 { struct gpuStream_t { sycl::queue* queue = nullptr; bool operator== (gpuStream_t const& rhs) noexcept { return queue == rhs.queue; } + bool operator!= (gpuStream_t const& rhs) noexcept { return queue != rhs.queue; } }; #endif diff --git a/Src/Base/AMReX_GpuUtility.H b/Src/Base/AMReX_GpuUtility.H index a1fa3cdd9dc..4adc111f5e2 100644 --- a/Src/Base/AMReX_GpuUtility.H +++ b/Src/Base/AMReX_GpuUtility.H @@ -26,8 +26,9 @@ namespace Gpu { template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T LDG (Array4 const& a, int i, int j, int k) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) - return __ldg(a.ptr(i,j,k)); +#if defined(AMREX_USE_CUDA) + AMREX_IF_ON_DEVICE((return __ldg(a.ptr(i,j,k));)) + AMREX_IF_ON_HOST((return a(i,j,k);)) #else return a(i,j,k); #endif @@ -36,8 +37,9 @@ namespace Gpu { template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T LDG (Array4 const& a, int i, int j, int k, int n) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) - return __ldg(a.ptr(i,j,k,n)); +#if defined(AMREX_USE_CUDA) + AMREX_IF_ON_DEVICE((return __ldg(a.ptr(i,j,k,n));)) + AMREX_IF_ON_HOST((return a(i,j,k,n);)) #else return a(i,j,k,n); #endif @@ -63,7 +65,11 @@ namespace Gpu { #if defined(AMREX_USE_HIP) hipPointerAttribute_t attrib; hipError_t r = hipPointerGetAttributes(&attrib, p); +#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) return r == hipSuccess && attrib.memoryType == hipMemoryTypeDevice; +#else + return r == hipSuccess && attrib.type == hipMemoryTypeDevice; +#endif // (HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) #elif defined(AMREX_USE_CUDA) CUpointer_attribute attrib = CU_POINTER_ATTRIBUTE_MEMORY_TYPE; CUmemorytype mem_type = static_cast(0); @@ -83,7 +89,11 @@ namespace Gpu { #if defined(AMREX_USE_HIP) hipPointerAttribute_t attrib; hipError_t r = hipPointerGetAttributes(&attrib, p); +#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) return r == hipSuccess && attrib.memoryType == hipMemoryTypeHost; +#else + return r == hipSuccess && attrib.type == hipMemoryTypeHost; +#endif // (HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) #elif defined(AMREX_USE_CUDA) CUpointer_attribute attrib = CU_POINTER_ATTRIBUTE_MEMORY_TYPE; CUmemorytype mem_type = static_cast(0); @@ -106,9 +116,15 @@ namespace Gpu { } else { hipPointerAttribute_t attrib; hipError_t r = hipPointerGetAttributes(&attrib, p); +#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) return r == hipSuccess && (attrib.memoryType == hipMemoryTypeHost || attrib.memoryType == hipMemoryTypeDevice); +#else + return r == hipSuccess && + (attrib.type == hipMemoryTypeHost || + attrib.type == hipMemoryTypeDevice); +#endif // (HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) } #elif defined(AMREX_USE_CUDA) CUpointer_attribute attrib = CU_POINTER_ATTRIBUTE_MEMORY_TYPE; diff --git a/Src/Base/AMReX_IArrayBox.H b/Src/Base/AMReX_IArrayBox.H index 4d39ace1012..db0f26d5080 100644 --- a/Src/Base/AMReX_IArrayBox.H +++ b/Src/Base/AMReX_IArrayBox.H @@ -41,7 +41,6 @@ public: * This class does NOT provide a copy constructor or 
assignment operator. */ - class IArrayBox : public BaseFab @@ -58,7 +57,7 @@ public: /** * \brief Construct an initial FAB with the data space allocated but - * not inititialized. ncomp is the number of components + * not initialized. ncomp is the number of components * (variables) at each data point in the Box. */ explicit IArrayBox (const Box& b, diff --git a/Src/Base/AMReX_INT.H b/Src/Base/AMReX_INT.H index f8ab0e9ba8f..4356c70f12c 100644 --- a/Src/Base/AMReX_INT.H +++ b/Src/Base/AMReX_INT.H @@ -31,4 +31,29 @@ namespace amrex { } #endif +#if (defined(__x86_64) || defined (__aarch64__)) && !defined(_WIN32) && (defined(__GNUC__) || defined(__clang__)) && !defined(__NVCOMPILER) + +#define AMREX_INT128_SUPPORTED 1 + +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#endif + +typedef unsigned __int128 amrex_uint128_t; // NOLINT(modernize-use-using) +typedef __int128 amrex_int128_t; // NOLINT(modernize-use-using) + +#ifdef __cplusplus +namespace amrex { + using UInt128_t = amrex_uint128_t; + using Int128_t = amrex_int128_t; +} +#endif + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + +#endif /* (defined(__x86_64) || defined (__aarch64__)) && !defined(_WIN32) && (defined(__GNUC__) || defined(__clang__)) */ + #endif diff --git a/Src/Base/AMReX_IndexType.H b/Src/Base/AMReX_IndexType.H index 02a56aae2a2..0fd613d2a99 100644 --- a/Src/Base/AMReX_IndexType.H +++ b/Src/Base/AMReX_IndexType.H @@ -19,7 +19,6 @@ namespace amrex { * enumerated type CellIndex to be either CELL or NODE; i.e. each of the * AMREX_SPACEDIM dimensions must be either CELL or NODE. */ - class IndexType { friend MPI_Datatype ParallelDescriptor::Mpi_typemap::type(); diff --git a/Src/Base/AMReX_IntVect.H b/Src/Base/AMReX_IntVect.H index fd71c93ae87..b2658a5ec94 100644 --- a/Src/Base/AMReX_IntVect.H +++ b/Src/Base/AMReX_IntVect.H @@ -42,7 +42,6 @@ int coarsen (int i, int ratio) noexcept * C++ array. In addition, the basic arithmetic operators have been overloaded * to implement scaling and translation operations. 
*/ - class IntVect { friend MPI_Datatype ParallelDescriptor::Mpi_typemap::type(); diff --git a/Src/Base/AMReX_IntegratorBase.H b/Src/Base/AMReX_IntegratorBase.H index bf99d264fb1..568e063bed5 100644 --- a/Src/Base/AMReX_IntegratorBase.H +++ b/Src/Base/AMReX_IntegratorBase.H @@ -18,7 +18,7 @@ template struct IntegratorOps; #if defined(AMREX_PARTICLES) template -struct IntegratorOps::value>::type> +struct IntegratorOps > > { static void CreateLike (amrex::Vector >& V, const T& Other) @@ -84,7 +84,7 @@ struct IntegratorOps -struct IntegratorOps, T>::value>::type> +struct IntegratorOps, T> > > { static void CreateLike (amrex::Vector >& V, const T& Other, bool Grow = false) @@ -130,7 +130,7 @@ struct IntegratorOps -struct IntegratorOps::value>::type> +struct IntegratorOps > > { static void CreateLike (amrex::Vector >& V, const T& Other, bool Grow = false) diff --git a/Src/Base/AMReX_LUSolver.H b/Src/Base/AMReX_LUSolver.H new file mode 100644 index 00000000000..bd69822ea5a --- /dev/null +++ b/Src/Base/AMReX_LUSolver.H @@ -0,0 +1,146 @@ +#ifndef AMREX_LU_SOLVER_H_ +#define AMREX_LU_SOLVER_H_ +#include + +#include +#include +#include +#include + +namespace amrex { + +// https://en.wikipedia.org/wiki/LU_decomposition + +template +class LUSolver +{ +public: + + LUSolver () = default; + + LUSolver (Array2D const& a_mat); + + void define (Array2D const& a_mat); + + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + void operator() (T* AMREX_RESTRICT x, T const* AMREX_RESTRICT b) const + { + for (int i = 0; i < N; ++i) { + x[i] = b[m_piv(i)]; + for (int k = 0; k < i; ++k) { + x[i] -= m_mat(i,k) * x[k]; + } + } + + for (int i = N-1; i >= 0; --i) { + for (int k = i+1; k < N; ++k) { + x[i] -= m_mat(i,k) * x[k]; + } + x[i] *= m_mat(i,i); + } + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE + Array2D invert () const + { + Array2D IA; + for (int j = 0; j < N; ++j) { + for (int i = 0; i < N; ++i) { + IA(i,j) = (m_piv(i) == j) ? T(1.0) : T(0.0); + for (int k = 0; k < i; ++k) { + IA(i,j) -= m_mat(i,k) * IA(k,j); + } + } + for (int i = N-1; i >= 0; --i) { + for (int k = i+1; k < N; ++k) { + IA(i,j) -= m_mat(i,k) * IA(k,j); + } + IA(i,j) *= m_mat(i,i); + } + } + return IA; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE + T determinant () const + { + T det = m_mat(0,0); + for (int i = 1; i < N; ++i) { + det *= m_mat(i,i); + } + det = T(1.0) / det; + return (m_npivs % 2 == 0) ? 
det : -det; + } + +private: + + void define_innard (); + + Array2D m_mat; + Array1D m_piv; + int m_npivs = 0; +}; + +template +LUSolver::LUSolver (Array2D const& a_mat) + : m_mat(a_mat) +{ + define_innard(); +} + +template +void LUSolver::define (Array2D const& a_mat) +{ + m_mat = a_mat; + define_innard(); +} + +template +void LUSolver::define_innard () +{ + static_assert(N > 1); + static_assert(std::is_floating_point_v); + + for (int i = 0; i < N; ++i) { m_piv(i) = i; } + m_npivs = 0; + + for (int i = 0; i < N; ++i) { + T maxA = 0; + int imax = i; + + for (int k = i; k < N; ++k) { + auto const absA = std::abs(m_mat(k,i)); + if (absA > maxA) { + maxA = absA; + imax = k; + } + } + + if (maxA < std::numeric_limits::min()) { + amrex::Abort("LUSolver: matrix is degenerate"); + } + + if (imax != i) { + std::swap(m_piv(i), m_piv(imax)); + for (int j = 0; j < N; ++j) { + std::swap(m_mat(i,j), m_mat(imax,j)); + } + ++m_npivs; + } + + for (int j = i+1; j < N; ++j) { + m_mat(j,i) /= m_mat(i,i); + for (int k = i+1; k < N; ++k) { + m_mat(j,k) -= m_mat(j,i) * m_mat(i,k); + } + } + } + + for (int i = 0; i < N; ++i) { + m_mat(i,i) = T(1) / m_mat(i,i); + } +} + +} + +#endif diff --git a/Src/Base/AMReX_Loop.H b/Src/Base/AMReX_Loop.H index 84b39107e45..f45a2198c56 100644 --- a/Src/Base/AMReX_Loop.H +++ b/Src/Base/AMReX_Loop.H @@ -8,7 +8,7 @@ namespace amrex { template AMREX_GPU_HOST_DEVICE -void Loop (Dim3 lo, Dim3 hi, F&& f) noexcept +void Loop (Dim3 lo, Dim3 hi, F const& f) noexcept { for (int k = lo.z; k <= hi.z; ++k) { for (int j = lo.y; j <= hi.y; ++j) { @@ -19,7 +19,7 @@ void Loop (Dim3 lo, Dim3 hi, F&& f) noexcept template AMREX_GPU_HOST_DEVICE -void Loop (Dim3 lo, Dim3 hi, int ncomp, F&& f) noexcept +void Loop (Dim3 lo, Dim3 hi, int ncomp, F const& f) noexcept { for (int n = 0; n < ncomp; ++n) { for (int k = lo.z; k <= hi.z; ++k) { @@ -31,7 +31,7 @@ void Loop (Dim3 lo, Dim3 hi, int ncomp, F&& f) noexcept template AMREX_GPU_HOST_DEVICE -void LoopConcurrent (Dim3 lo, Dim3 hi, F&& f) noexcept +void LoopConcurrent (Dim3 lo, Dim3 hi, F const& f) noexcept { for (int k = lo.z; k <= hi.z; ++k) { for (int j = lo.y; j <= hi.y; ++j) { @@ -43,7 +43,7 @@ void LoopConcurrent (Dim3 lo, Dim3 hi, F&& f) noexcept template AMREX_GPU_HOST_DEVICE -void LoopConcurrent (Dim3 lo, Dim3 hi, int ncomp, F&& f) noexcept +void LoopConcurrent (Dim3 lo, Dim3 hi, int ncomp, F const& f) noexcept { for (int n = 0; n < ncomp; ++n) { for (int k = lo.z; k <= hi.z; ++k) { @@ -56,7 +56,7 @@ void LoopConcurrent (Dim3 lo, Dim3 hi, int ncomp, F&& f) noexcept template AMREX_GPU_HOST_DEVICE -void Loop (Box const& bx, F&& f) noexcept +void Loop (Box const& bx, F const& f) noexcept { const auto lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); @@ -69,7 +69,7 @@ void Loop (Box const& bx, F&& f) noexcept template AMREX_GPU_HOST_DEVICE -void Loop (Box const& bx, int ncomp, F&& f) noexcept +void Loop (Box const& bx, int ncomp, F const& f) noexcept { const auto lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); @@ -83,7 +83,7 @@ void Loop (Box const& bx, int ncomp, F&& f) noexcept template AMREX_GPU_HOST_DEVICE -void LoopConcurrent (Box const& bx, F&& f) noexcept +void LoopConcurrent (Box const& bx, F const& f) noexcept { const auto lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); @@ -97,7 +97,7 @@ void LoopConcurrent (Box const& bx, F&& f) noexcept template AMREX_GPU_HOST_DEVICE -void LoopConcurrent (Box const& bx, int ncomp, F&& f) noexcept +void LoopConcurrent (Box const& bx, int ncomp, F const& f) noexcept { const auto 
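// Usage sketch for the new LUSolver (illustrative; assumes the Array2D
// template arguments stripped in the text above are Array2D<T,0,N-1,0,N-1>,
// and the matrix values are arbitrary examples).
#include <AMReX_Array.H>
#include <AMReX_LUSolver.H>

// Returns the determinant; x receives the solution of A x = b.
double lu_example (double (&x)[3])
{
    amrex::Array2D<double,0,2,0,2> A{};
    A(0,0)=4.; A(0,1)=1.; A(0,2)=0.;
    A(1,0)=1.; A(1,1)=4.; A(1,2)=1.;
    A(2,0)=0.; A(2,1)=1.; A(2,2)=4.;

    amrex::LUSolver<3,double> lu(A);   // one-time LU factorization with partial pivoting
    const double b[3] = {1., 2., 3.};
    lu(x, b);                          // forward/back substitution: solves A x = b
    return lu.determinant();           // sign accounts for the number of row swaps
}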
lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); @@ -116,7 +116,7 @@ void LoopConcurrent (Box const& bx, int ncomp, F&& f) noexcept // of the warning, we have to use the functions below for those situations. template -void LoopOnCpu (Dim3 lo, Dim3 hi, F&& f) noexcept +void LoopOnCpu (Dim3 lo, Dim3 hi, F const& f) noexcept { for (int k = lo.z; k <= hi.z; ++k) { for (int j = lo.y; j <= hi.y; ++j) { @@ -126,7 +126,7 @@ void LoopOnCpu (Dim3 lo, Dim3 hi, F&& f) noexcept } template -void LoopOnCpu (Dim3 lo, Dim3 hi, int ncomp, F&& f) noexcept +void LoopOnCpu (Dim3 lo, Dim3 hi, int ncomp, F const& f) noexcept { for (int n = 0; n < ncomp; ++n) { for (int k = lo.z; k <= hi.z; ++k) { @@ -137,7 +137,7 @@ void LoopOnCpu (Dim3 lo, Dim3 hi, int ncomp, F&& f) noexcept } template -void LoopConcurrentOnCpu (Dim3 lo, Dim3 hi, F&& f) noexcept +void LoopConcurrentOnCpu (Dim3 lo, Dim3 hi, F const& f) noexcept { for (int k = lo.z; k <= hi.z; ++k) { for (int j = lo.y; j <= hi.y; ++j) { @@ -148,7 +148,7 @@ void LoopConcurrentOnCpu (Dim3 lo, Dim3 hi, F&& f) noexcept } template -void LoopConcurrentOnCpu (Dim3 lo, Dim3 hi, int ncomp, F&& f) noexcept +void LoopConcurrentOnCpu (Dim3 lo, Dim3 hi, int ncomp, F const& f) noexcept { for (int n = 0; n < ncomp; ++n) { for (int k = lo.z; k <= hi.z; ++k) { @@ -160,7 +160,7 @@ void LoopConcurrentOnCpu (Dim3 lo, Dim3 hi, int ncomp, F&& f) noexcept } template -void LoopOnCpu (Box const& bx, F&& f) noexcept +void LoopOnCpu (Box const& bx, F const& f) noexcept { const auto lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); @@ -172,7 +172,7 @@ void LoopOnCpu (Box const& bx, F&& f) noexcept } template -void LoopOnCpu (Box const& bx, int ncomp, F&& f) noexcept +void LoopOnCpu (Box const& bx, int ncomp, F const& f) noexcept { const auto lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); @@ -185,7 +185,7 @@ void LoopOnCpu (Box const& bx, int ncomp, F&& f) noexcept } template -void LoopConcurrentOnCpu (Box const& bx, F&& f) noexcept +void LoopConcurrentOnCpu (Box const& bx, F const& f) noexcept { const auto lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); @@ -198,7 +198,7 @@ void LoopConcurrentOnCpu (Box const& bx, F&& f) noexcept } template -void LoopConcurrentOnCpu (Box const& bx, int ncomp, F&& f) noexcept +void LoopConcurrentOnCpu (Box const& bx, int ncomp, F const& f) noexcept { const auto lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); @@ -211,6 +211,30 @@ void LoopConcurrentOnCpu (Box const& bx, int ncomp, F&& f) noexcept }}}} } +// Implementation of "constexpr for" based on +// https://artificial-mind.net/blog/2020/10/31/constexpr-for +// +// Approximates what one would get from a compile-time +// unrolling of the loop +// for (int i = 0; i < N; ++i) { +// f(i); +// } +// +// The mechanism is recursive: we evaluate f(i) at the current +// i and then call the for loop at i+1. f() is a lambda function +// that provides the body of the loop and takes only an integer +// i as its argument. + +template +AMREX_GPU_HOST_DEVICE AMREX_INLINE +constexpr void constexpr_for (F const& f) +{ + if constexpr (I < N) { + f(std::integral_constant()); + constexpr_for(f); + } +} + #include } diff --git a/Src/Base/AMReX_MFIter.H b/Src/Base/AMReX_MFIter.H index bfad2d8c32f..7f0ca4d3539 100644 --- a/Src/Base/AMReX_MFIter.H +++ b/Src/Base/AMReX_MFIter.H @@ -129,10 +129,10 @@ public: [[nodiscard]] Box grownnodaltilebox (int dir, const IntVect& ng) const noexcept; //! Return the valid Box in which the current tile resides. 
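// Usage sketch for constexpr_for (illustrative): the lambda receives a
// std::integral_constant, so the index can be used where a compile-time
// constant is required. do_work is a hypothetical function template.
template <int J> void do_work ();

void unroll_example ()
{
    amrex::constexpr_for<0, 3>([] (auto i) {
        do_work<decltype(i)::value>();   // calls do_work<0>(), do_work<1>(), do_work<2>()
    });
}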
- [[nodiscard]] Box validbox () const noexcept { return fabArray.box((*index_map)[currentIndex]); } + [[nodiscard]] Box validbox () const noexcept { return fabArray->box((*index_map)[currentIndex]); } //! Return the Box of the FAB at which we currently point. - [[nodiscard]] Box fabbox () const noexcept { return fabArray.fabbox((*index_map)[currentIndex]); } + [[nodiscard]] Box fabbox () const noexcept { return fabArray->fabbox((*index_map)[currentIndex]); } //! Increment iterator to the next tile we own. void operator++ () noexcept; @@ -159,11 +159,11 @@ public: [[nodiscard]] int LocalIndex () const noexcept { return local_index_map ? (*local_index_map)[currentIndex] : currentIndex; } //! Constant reference to FabArray over which we're iterating. - [[nodiscard]] const FabArrayBase& theFabArrayBase () const noexcept { return fabArray; } + [[nodiscard]] const FabArrayBase& theFabArrayBase () const noexcept { return *fabArray; } [[nodiscard]] int tileIndex () const noexcept {return currentIndex;} - [[nodiscard]] const DistributionMapping& DistributionMap () const noexcept { return fabArray.DistributionMap(); } + [[nodiscard]] const DistributionMapping& DistributionMap () const noexcept { return fabArray->DistributionMap(); } static int allowMultipleMFIters (int allow); @@ -173,7 +173,7 @@ protected: std::unique_ptr m_fa; //!< This must be the first member! - const FabArrayBase& fabArray; + const FabArrayBase* fabArray; IntVect tile_size; diff --git a/Src/Base/AMReX_MFIter.cpp b/Src/Base/AMReX_MFIter.cpp index b05d0bbb2af..f68ab9ce35f 100644 --- a/Src/Base/AMReX_MFIter.cpp +++ b/Src/Base/AMReX_MFIter.cpp @@ -20,7 +20,7 @@ MFIter::allowMultipleMFIters (int allow) MFIter::MFIter (const FabArrayBase& fabarray_, unsigned char flags_) : - fabArray(fabarray_), + fabArray(&fabarray_), tile_size((flags_ & Tiling) ? FabArrayBase::mfiter_tile_size : IntVect::TheZeroVector()), flags(flags_), streams(Gpu::numGpuStreams()), @@ -38,7 +38,7 @@ MFIter::MFIter (const FabArrayBase& fabarray_, MFIter::MFIter (const FabArrayBase& fabarray_, bool do_tiling_) : - fabArray(fabarray_), + fabArray(&fabarray_), tile_size((do_tiling_) ? FabArrayBase::mfiter_tile_size : IntVect::TheZeroVector()), flags(do_tiling_ ? Tiling : 0), streams(Gpu::numGpuStreams()), @@ -57,7 +57,7 @@ MFIter::MFIter (const FabArrayBase& fabarray_, const IntVect& tilesize_, unsigned char flags_) : - fabArray(fabarray_), + fabArray(&fabarray_), tile_size(tilesize_), flags(flags_ | Tiling), streams(Gpu::numGpuStreams()), @@ -75,7 +75,7 @@ MFIter::MFIter (const FabArrayBase& fabarray_, MFIter::MFIter (const BoxArray& ba, const DistributionMapping& dm, unsigned char flags_) : m_fa(std::make_unique(ba,dm,1,0)), - fabArray(*m_fa), + fabArray(m_fa.get()), tile_size((flags_ & Tiling) ? FabArrayBase::mfiter_tile_size : IntVect::TheZeroVector()), flags(flags_), streams(Gpu::numGpuStreams()), @@ -99,7 +99,7 @@ MFIter::MFIter (const BoxArray& ba, const DistributionMapping& dm, unsigned char MFIter::MFIter (const BoxArray& ba, const DistributionMapping& dm, bool do_tiling_) : m_fa(std::make_unique(ba,dm,1,0)), - fabArray(*m_fa), + fabArray(m_fa.get()), tile_size((do_tiling_) ? FabArrayBase::mfiter_tile_size : IntVect::TheZeroVector()), flags(do_tiling_ ? 
Tiling : 0), streams(Gpu::numGpuStreams()), @@ -125,7 +125,7 @@ MFIter::MFIter (const BoxArray& ba, const DistributionMapping& dm, const IntVect& tilesize_, unsigned char flags_) : m_fa(std::make_unique(ba,dm,1,0)), - fabArray(*m_fa), + fabArray(m_fa.get()), tile_size(tilesize_), flags(flags_ | Tiling), streams(Gpu::numGpuStreams()), @@ -150,7 +150,7 @@ MFIter::MFIter (const BoxArray& ba, const DistributionMapping& dm, MFIter::MFIter (const BoxArray& ba, const DistributionMapping& dm, const MFItInfo& info) : m_fa(std::make_unique(ba, dm, 1, 0)), - fabArray(*m_fa), + fabArray(m_fa.get()), tile_size(info.tilesize), flags(info.do_tiling ? Tiling : 0), streams(std::max(1,std::min(Gpu::numGpuStreams(),info.num_streams))), @@ -182,7 +182,7 @@ MFIter::MFIter (const BoxArray& ba, const DistributionMapping& dm, const MFItInf MFIter::MFIter (const FabArrayBase& fabarray_, const MFItInfo& info) : - fabArray(fabarray_), + fabArray(&fabarray_), tile_size(info.tilesize), flags(info.do_tiling ? Tiling : 0), streams(std::max(1,std::min(Gpu::numGpuStreams(),info.num_streams))), @@ -282,14 +282,14 @@ MFIter::Initialize () if (flags & AllBoxes) // a very special case { - index_map = &(fabArray.IndexArray()); + index_map = &(fabArray->IndexArray()); currentIndex = 0; beginIndex = 0; endIndex = static_cast(index_map->size()); } else { - const FabArrayBase::TileArray* pta = fabArray.getTileArray(tile_size); + const FabArrayBase::TileArray* pta = fabArray->getTileArray(tile_size); index_map = &(pta->indexMap); local_index_map = &(pta->localIndexMap); @@ -366,7 +366,7 @@ MFIter::Initialize () Gpu::Device::setStreamIndex(currentIndex%streams); #endif - typ = fabArray.boxArray().ixType(); + typ = fabArray->boxArray().ixType(); } } @@ -462,7 +462,7 @@ MFIter::growntilebox (int a_ng) const noexcept { Box bx = tilebox(); IntVect ngv{a_ng}; - if (a_ng < -100) { ngv = fabArray.nGrowVect(); } + if (a_ng < -100) { ngv = fabArray->nGrowVect(); } const Box& vbx = validbox(); for (int d=0; dnGrowVect(); } return grownnodaltilebox(dir, ngv); } diff --git a/Src/Base/AMReX_MFParallelForC.H b/Src/Base/AMReX_MFParallelForC.H index cd050b2e331..b2269df1ea1 100644 --- a/Src/Base/AMReX_MFParallelForC.H +++ b/Src/Base/AMReX_MFParallelForC.H @@ -10,7 +10,7 @@ namespace amrex::experimental::detail { template std::enable_if_t::value> -ParallelFor (MF const& mf, IntVect const& nghost, IntVect const& ts, bool dynamic, F&& f) +ParallelFor (MF const& mf, IntVect const& nghost, IntVect const& ts, bool dynamic, F const& f) { #ifdef AMREX_USE_OMP #pragma omp parallel @@ -33,7 +33,7 @@ ParallelFor (MF const& mf, IntVect const& nghost, IntVect const& ts, bool dynami template std::enable_if_t::value> -ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const& ts, bool dynamic, F&& f) +ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const& ts, bool dynamic, F const& f) { #ifdef AMREX_USE_OMP #pragma omp parallel diff --git a/Src/Base/AMReX_MFParallelForG.H b/Src/Base/AMReX_MFParallelForG.H index ba65b18937d..066e46f3b89 100644 --- a/Src/Base/AMReX_MFParallelForG.H +++ b/Src/Base/AMReX_MFParallelForG.H @@ -12,36 +12,33 @@ namespace amrex { namespace detail { inline -void build_par_for_nblocks (char*& a_hp, char*& a_dp, std::pair& blocks_x, Box*& pboxes, +void build_par_for_nblocks (char*& a_hp, char*& a_dp, std::pair& blocks_x, BoxIndexer*& pboxes, Vector const& boxes, Vector const& ncells, int nthreads) { if (!ncells.empty()) { const int nboxes = ncells.size(); - const std::size_t nbytes_boxes = 
amrex::aligned_size(16, (nboxes+1) * sizeof(int)); - const std::size_t nbytes = nbytes_boxes + nboxes*sizeof(Box); + const std::size_t nbytes_boxes = amrex::aligned_size(alignof(BoxIndexer), (nboxes+1) * sizeof(int)); + const std::size_t nbytes = nbytes_boxes + nboxes*sizeof(BoxIndexer); a_hp = (char*)The_Pinned_Arena()->alloc(nbytes); int* hp_blks = (int*)a_hp; - Box* hp_boxes = (Box*)(a_hp + nbytes_boxes); + auto* hp_boxes = (BoxIndexer*)(a_hp + nbytes_boxes); hp_blks[0] = 0; - Long ntot = 0; bool same_size = true; for (int i = 0; i < nboxes; ++i) { Long nblocks = (ncells[i] + nthreads-1) / nthreads; + AMREX_ASSERT((hp_blks[i]+nblocks) <= Long(std::numeric_limits::max())); hp_blks[i+1] = hp_blks[i] + static_cast(nblocks); - ntot += nblocks; same_size = same_size && (ncells[i] == ncells[0]); - new (hp_boxes+i) Box(boxes[i]); + new (hp_boxes+i) BoxIndexer(boxes[i]); } - amrex::ignore_unused(ntot); - AMREX_ASSERT(static_cast(hp_blks[nboxes]) == ntot); // no overflow a_dp = (char*) The_Arena()->alloc(nbytes); Gpu::htod_memcpy_async(a_dp, a_hp, nbytes); blocks_x.first = hp_blks; blocks_x.second = (same_size) ? nullptr : (int*)a_dp; - pboxes = (Box*)(a_dp + nbytes_boxes); + pboxes = (BoxIndexer*)(a_dp + nbytes_boxes); } } @@ -75,7 +72,7 @@ namespace parfor_mf_detail { template std::enable_if_t::value> -ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, bool, F&& f) +ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, bool, F const& f) { const auto& index_array = mf.IndexArray(); const int nboxes = index_array.size(); @@ -94,7 +91,7 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo const int nblocks = par_for_blocks.first[nboxes]; const int block_0_size = par_for_blocks.first[1]; const int* dp_nblocks = par_for_blocks.second; - const Box* dp_boxes = parforinfo.getBoxes(); + const BoxIndexer* dp_boxes = parforinfo.getBoxes(); #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) @@ -102,13 +99,14 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo <<>> ([=] AMREX_GPU_DEVICE () noexcept { - int ibox, icell; + int ibox; + std::uint64_t icell; if (dp_nblocks) { ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast(blockIdx.x)); - icell = (blockIdx.x-dp_nblocks[ibox])*MT + threadIdx.x; + icell = std::uint64_t(blockIdx.x-dp_nblocks[ibox])*MT + threadIdx.x; } else { ibox = blockIdx.x / block_0_size; - icell = (blockIdx.x-ibox*block_0_size)*MT + threadIdx.x; + icell = std::uint64_t(blockIdx.x-ibox*block_0_size)*MT + threadIdx.x; } #elif defined(AMREX_USE_SYCL) @@ -116,27 +114,21 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo amrex::launch(nblocks, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept { - int ibox, icell; + int ibox; + std::uint64_t icell; int blockIdxx = item.get_group_linear_id(); int threadIdxx = item.get_local_linear_id(); if (dp_nblocks) { ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast(blockIdxx)); - icell = (blockIdxx-dp_nblocks[ibox])*MT + threadIdxx; + icell = std::uint64_t(blockIdxx-dp_nblocks[ibox])*MT + threadIdxx; } else { ibox = blockIdxx / block_0_size; - icell = (blockIdxx-ibox*block_0_size)*MT + threadIdxx; + icell = std::uint64_t(blockIdxx-ibox*block_0_size)*MT + threadIdxx; } #endif - Box const& b = dp_boxes[ibox]; - int ncells = b.numPts(); - if (icell < ncells) { - const auto len = amrex::length(b); - int k = icell / (len.x*len.y); - int j = (icell - k*(len.x*len.y)) / len.x; - 
int i = (icell - k*(len.x*len.y)) - j*len.x; - AMREX_D_TERM(i += b.smallEnd(0);, - j += b.smallEnd(1);, - k += b.smallEnd(2);) + BoxIndexer const& indexer = dp_boxes[ibox]; + if (icell < indexer.numPts()) { + auto [i, j, k] = indexer(icell); for (int n = 0; n < ncomp; ++n) { parfor_mf_detail::call_f(f, ibox, i, j, k, n); } diff --git a/Src/Base/AMReX_MPMD.H b/Src/Base/AMReX_MPMD.H index 79ff3dd69b4..1c7ad218c1f 100644 --- a/Src/Base/AMReX_MPMD.H +++ b/Src/Base/AMReX_MPMD.H @@ -10,6 +10,8 @@ namespace amrex::MPMD { +void Initialize_without_split (int argc, char* argv[]); + MPI_Comm Initialize (int argc, char* argv[]); void Finalize (); @@ -18,12 +20,16 @@ bool Initialized (); int MyProc (); //! Process ID in MPI_COMM_WORLD int NProcs (); //! Number of processes in MPI_COMM_WORLD +int AppNum (); //! Get the appnum (color) required for MPI_Comm_split int MyProgId (); //! Program ID class Copier { public: - Copier (BoxArray const& ba, DistributionMapping const& dm); + explicit Copier (bool); + + Copier (BoxArray const& ba, DistributionMapping const& dm, + bool send_ba = false); template void send (FabArray const& mf, int icomp, int ncomp) const; @@ -31,9 +37,16 @@ public: template void recv (FabArray& mf, int icomp, int ncomp) const; + [[nodiscard]] BoxArray const& boxArray () const; + + [[nodiscard]] DistributionMapping const& DistributionMap () const; + private: std::map m_SndTags; std::map m_RcvTags; + bool m_is_thread_safe; + BoxArray m_ba; + DistributionMapping m_dm; }; template @@ -61,7 +74,7 @@ void Copier::send (FabArray const& mf, int icomp, int ncomp) const nbytes += cct.sbox.numPts() * ncomp * sizeof(typename FAB::value_type); } - std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); + std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes); nbytes = amrex::aligned_size(acd, nbytes); // so that bytes are aligned // Also need to align the offset properly @@ -125,7 +138,7 @@ void Copier::recv (FabArray& mf, int icomp, int ncomp) const nbytes += cct.dbox.numPts() * ncomp * sizeof(typename FAB::value_type); } - std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); + std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes); nbytes = amrex::aligned_size(acd, nbytes); // so that nbytes are aligned // Also need to align the offset properly @@ -163,15 +176,16 @@ void Copier::recv (FabArray& mf, int icomp, int ncomp) const #ifdef AMREX_USE_GPU if (Gpu::inLaunchRegion() && (mf.arena()->isDevice() || mf.arena()->isManaged())) { mf.unpack_recv_buffer_gpu(mf, icomp, ncomp, recv_data, recv_size, recv_cctc, - FabArrayBase::COPY, true); + FabArrayBase::COPY, m_is_thread_safe); } else #endif { mf.unpack_recv_buffer_cpu(mf, icomp, ncomp, recv_data, recv_size, recv_cctc, - FabArrayBase::COPY, true); + FabArrayBase::COPY, m_is_thread_safe); } } + } #endif diff --git a/Src/Base/AMReX_MPMD.cpp b/Src/Base/AMReX_MPMD.cpp index 51b76d4bf94..bbaae32902c 100644 --- a/Src/Base/AMReX_MPMD.cpp +++ b/Src/Base/AMReX_MPMD.cpp @@ -17,6 +17,7 @@ namespace { MPI_Comm app_comm = MPI_COMM_NULL; int myproc; int nprocs; + int appnum; } namespace { @@ -31,7 +32,16 @@ int num_unique_elements (std::vector& v) } -MPI_Comm Initialize (int argc, char* argv[]) +/* +Initialize_without_split function assigns and checks the required +AMReX_MPMD variables. This function is internally leveraged by +Initialize function. 
+ +This function needs to be used EXPLICITLY ONLY with pyAMReX (python) +so that the communication split can be performed using a python +library, for example, mpi4py. +*/ +void Initialize_without_split (int argc, char* argv[]) { initialized = true; int flag; @@ -46,7 +56,7 @@ MPI_Comm Initialize (int argc, char* argv[]) int* p; MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_APPNUM, &p, &flag); - int appnum = *p; + appnum = *p; std::vector all_appnum(nprocs); MPI_Allgather(&appnum, 1, MPI_INT, all_appnum.data(), 1, MPI_INT, MPI_COMM_WORLD); @@ -77,13 +87,18 @@ MPI_Comm Initialize (int argc, char* argv[]) } } - if (napps == 2) { - MPI_Comm_split(MPI_COMM_WORLD, appnum, myproc, &app_comm); - } else { - std::cout << "amrex::MPMD only supports two programs." << std::endl; + if (napps != 2) { + std::cout << "amrex::MPMD only supports two programs." << '\n'; MPI_Abort(MPI_COMM_WORLD, 1); } +} + +MPI_Comm Initialize (int argc, char* argv[]) +{ + Initialize_without_split(argc,argv); + MPI_Comm_split(MPI_COMM_WORLD, appnum, myproc, &app_comm); + return app_comm; } @@ -109,12 +124,24 @@ int NProcs () return nprocs; } +/* +AppNum function is provided so that appnum (color) +can be passed to python library (mpi4py) to perform +a pythonic version of MPI_Comm_split. +*/ +int AppNum () +{ + return appnum; +} + int MyProgId () { return (myproc == ParallelDescriptor::MyProc()) ? 0 : 1; } -Copier::Copier (BoxArray const& ba, DistributionMapping const& dm) +Copier::Copier (BoxArray const& ba, DistributionMapping const& dm, + bool send_ba) + : m_ba(ba), m_dm(dm) { int rank_offset = myproc - ParallelDescriptor::MyProc(); int this_root, other_root; @@ -127,7 +154,6 @@ Copier::Copier (BoxArray const& ba, DistributionMapping const& dm) } Vector bv = ba.boxList().data(); - int this_nboxes = static_cast(ba.size()); Vector procs = dm.ProcessorMap(); if (rank_offset != 0) { @@ -138,34 +164,46 @@ Copier::Copier (BoxArray const& ba, DistributionMapping const& dm) Vector obv; Vector oprocs; - int other_nboxes; + int other_nboxes = this_nboxes; if (myproc == this_root) { if (rank_offset == 0) // the first program { MPI_Send(&this_nboxes, 1, MPI_INT, other_root, 0, MPI_COMM_WORLD); - MPI_Recv(&other_nboxes, 1, MPI_INT, other_root, 1, MPI_COMM_WORLD, + if (!send_ba) + { + MPI_Recv(&other_nboxes, 1, MPI_INT, other_root, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - obv.resize(other_nboxes); + obv.resize(other_nboxes); + } MPI_Send(bv.data(), this_nboxes, ParallelDescriptor::Mpi_typemap::type(), other_root, 2, MPI_COMM_WORLD); - MPI_Recv(obv.data(), other_nboxes, + if (!send_ba) + { + MPI_Recv(obv.data(), other_nboxes, ParallelDescriptor::Mpi_typemap::type(), other_root, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - oprocs.resize(other_nboxes); + } MPI_Send(procs.data(), this_nboxes, MPI_INT, other_root, 4, MPI_COMM_WORLD); + oprocs.resize(other_nboxes); MPI_Recv(oprocs.data(), other_nboxes, MPI_INT, other_root, 5, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } else // the second program { - MPI_Recv(&other_nboxes, 1, MPI_INT, other_root, 0, MPI_COMM_WORLD, + if (!send_ba) + { + MPI_Recv(&other_nboxes, 1, MPI_INT, other_root, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + obv.resize(other_nboxes); + } MPI_Send(&this_nboxes, 1, MPI_INT, other_root, 1, MPI_COMM_WORLD); - obv.resize(other_nboxes); - MPI_Recv(obv.data(), other_nboxes, + if (!send_ba) + { + MPI_Recv(obv.data(), other_nboxes, ParallelDescriptor::Mpi_typemap::type(), other_root, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } MPI_Send(bv.data(), this_nboxes, ParallelDescriptor::Mpi_typemap::type(), 
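// Sketch of the equivalence described above (names from this diff): the new
// Initialize() is Initialize_without_split() plus the communicator split, and
// AppNum() exposes the color so mpi4py can perform the same split in Python.
#include <AMReX_MPMD.H>
#include <mpi.h>

void mpmd_init_sketch (int argc, char* argv[])
{
    amrex::MPMD::Initialize_without_split(argc, argv);
    MPI_Comm app_comm;
    MPI_Comm_split(MPI_COMM_WORLD, amrex::MPMD::AppNum(),
                   amrex::MPMD::MyProc(), &app_comm);
    // app_comm now matches what amrex::MPMD::Initialize(argc, argv) returns.
}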
other_root, 3, MPI_COMM_WORLD); @@ -176,15 +214,23 @@ Copier::Copier (BoxArray const& ba, DistributionMapping const& dm) } } - ParallelDescriptor::Bcast(&other_nboxes, 1); - if (obv.empty()) { - obv.resize(other_nboxes); + if (!send_ba) { + ParallelDescriptor::Bcast(&other_nboxes, 1); + if (obv.empty()){ + obv.resize(other_nboxes); + } + ParallelDescriptor::Bcast(obv.data(), obv.size()); + } + + if (oprocs.empty()) { oprocs.resize(other_nboxes); } - ParallelDescriptor::Bcast(obv.data(), obv.size()); ParallelDescriptor::Bcast(oprocs.data(), oprocs.size()); - BoxArray oba(BoxList(std::move(obv))); + BoxArray oba; + if (!obv.empty()) { + oba.define(BoxList(std::move(obv))); + } // At this point, ba and bv hold our boxes, and oba holds the other // program's boxes. procs holds mpi ranks of our boxes, and oprocs holds @@ -192,32 +238,138 @@ Copier::Copier (BoxArray const& ba, DistributionMapping const& dm) // MPI_COMM_WORLD. // Build communication meta-data - - AMREX_ALWAYS_ASSERT(ba.ixType().cellCentered()); + if (!send_ba){ + AMREX_ALWAYS_ASSERT(ba.ixType() == oba.ixType()); + m_is_thread_safe = ba.ixType().cellCentered(); + }else{ + m_is_thread_safe = true; + } std::vector > isects; for (int i = 0; i < this_nboxes; ++i) { if (procs[i] == myproc) { - oba.intersections(bv[i], isects); + if (!send_ba){ + oba.intersections(bv[i], isects); + } + else{ + isects.resize(0); + isects.emplace_back(i,bv[i]); + } for (auto const& isec : isects) { const int oi = isec.first; const Box& bx = isec.second; const int orank = oprocs[oi]; - m_SndTags[orank].push_back - (FabArrayBase::CopyComTag(bx, bx, oi, i)); - m_RcvTags[orank].push_back - (FabArrayBase::CopyComTag(bx, bx, i, oi)); + m_SndTags[orank].emplace_back(bx, bx, oi, i); + m_RcvTags[orank].emplace_back(bx, bx, i, oi); } } } - for (auto& kv : m_SndTags) { - std::sort(kv.second.begin(), kv.second.end()); + if (!send_ba){ + for (auto& kv : m_SndTags) { + std::sort(kv.second.begin(), kv.second.end()); + } + for (auto& kv : m_RcvTags) { + std::sort(kv.second.begin(), kv.second.end()); + } } - for (auto& kv : m_RcvTags) { - std::sort(kv.second.begin(), kv.second.end()); +} + +Copier::Copier (bool) + : m_is_thread_safe(true) +{ + int rank_offset = myproc - ParallelDescriptor::MyProc(); + int this_root, other_root; + if (rank_offset == 0) { // First program + this_root = 0; + other_root = ParallelDescriptor::NProcs(); + } else { + this_root = rank_offset; + other_root = 0; } + + Vector bv; + int this_nboxes; + + if (myproc == this_root) { + int tags[2]; + if (rank_offset == 0) // the first program + { + tags[0] = 1; + tags[1] = 3; + } + else // the second program + { + tags[0] = 0; + tags[1] = 2; + } + + MPI_Recv(&this_nboxes, 1, MPI_INT, other_root, tags[0], MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + bv.resize(this_nboxes); + MPI_Recv(bv.data(), this_nboxes, + ParallelDescriptor::Mpi_typemap::type(), + other_root, tags[1], MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + + ParallelDescriptor::Bcast(&this_nboxes, 1); + if (bv.empty()) { + bv.resize(this_nboxes); + } + + ParallelDescriptor::Bcast(bv.data(), bv.size()); + m_ba.define(BoxList(std::move(bv))); + m_dm.define(m_ba); + Vector procs = m_dm.ProcessorMap(); + if (rank_offset != 0) { + for (int i = 0; i < this_nboxes; ++i) { + procs[i] += rank_offset; + } + } + + Vector oprocs(this_nboxes); + if (myproc == this_root) { + if (rank_offset == 0) // the first program + { + MPI_Send(procs.data(), this_nboxes, MPI_INT, other_root, 4, MPI_COMM_WORLD); + MPI_Recv(oprocs.data(), this_nboxes, MPI_INT, other_root, 5, 
MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + } + else // the second program + { + MPI_Recv(oprocs.data(), this_nboxes, MPI_INT, other_root, 4, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + MPI_Send(procs.data(), this_nboxes, MPI_INT, other_root, 5, MPI_COMM_WORLD); + } + } + + ParallelDescriptor::Bcast(oprocs.data(), oprocs.size()); + + // procs holds mpi ranks of our boxes, and oprocs holds + // mpi ranks of the other program's boxes. All mpi ranks are in + // MPI_COMM_WORLD. + + // Build communication meta-data + + for (int i = 0; i < this_nboxes; ++i) { + if (procs[i] == myproc) { + const Box& bx = m_ba[i]; + const int orank = oprocs[i]; + m_SndTags[orank].emplace_back(bx, bx, i, i); + m_RcvTags[orank].emplace_back(bx, bx, i, i); + } + } +} + +BoxArray const& Copier::boxArray () const +{ + return m_ba; +} + +DistributionMapping const& Copier::DistributionMap () const +{ + return m_dm; } } diff --git a/Src/Base/AMReX_Machine.cpp b/Src/Base/AMReX_Machine.cpp index bcb420b3f67..97de84f30c8 100644 --- a/Src/Base/AMReX_Machine.cpp +++ b/Src/Base/AMReX_Machine.cpp @@ -227,14 +227,14 @@ class Machine auto sg_rank_n = sg_g_ranks.size(); if (flag_verbose) { Print() << "Machine::find_best_nbh(): called for " << nbh_rank_n - << " of " << sg_rank_n << " ranks" << std::endl; + << " of " << sg_rank_n << " ranks" << '\n'; } Vector result; auto key = NeighborhoodCache::hash(sg_g_ranks, nbh_rank_n); if (nbh_cache.get(key, result)) { if (flag_verbose) { - Print() << "Machine::find_best_nbh(): found neighborhood in cache" << std::endl; + Print() << "Machine::find_best_nbh(): found neighborhood in cache" << '\n'; } } else { // get node IDs of current subgroup @@ -251,10 +251,10 @@ class Machine } if (flag_very_verbose) { - Print() << "SubRank: GloRank: Node ID: Node Coord:" << std::endl; + Print() << "SubRank: GloRank: Node ID: Node Coord:" << '\n'; for (int i = 0; i < sg_rank_n; ++i) { Print() << " " << i << ": " << sg_g_ranks[i] << ": " << sg_node_ids[i] - << ": " << to_str(id_to_coord(sg_node_ids[i])) << std::endl; + << ": " << to_str(id_to_coord(sg_node_ids[i])) << '\n'; } } @@ -268,8 +268,8 @@ class Machine double base_score; tie(base_nbh, base_score) = baseline_score(sg_node_ids, nbh_rank_n); - Print() << "Baseline neighborhood: " << to_str(base_nbh) << ", score = " << base_score << std::endl; - Print() << "Rank 0's neighborhood: " << to_str(local_nbh) << ", score = " << score << std::endl; + Print() << "Baseline neighborhood: " << to_str(base_nbh) << ", score = " << base_score << '\n' + << "Rank 0's neighborhood: " << to_str(local_nbh) << ", score = " << score << '\n'; } // determine the best neighborhood among ranks @@ -287,12 +287,12 @@ class Machine std::sort(local_nbh.begin(), local_nbh.end()); if (flag_verbose) { Print() << "Winning neighborhood: " << winner_rank << ": " << to_str(local_nbh) - << ", score = " << winner_score << std::endl; + << ", score = " << winner_score << '\n'; } result.reserve(nbh_rank_n); - for (int i = 0; i < local_nbh.size(); ++i) { - for (auto rank : node_ranks.at(local_nbh[i])) { + for (int i : local_nbh) { + for (auto rank : node_ranks.at(i)) { if (result.size() < nbh_rank_n) { result.push_back(rank); } @@ -302,7 +302,7 @@ class Machine } if (flag_very_verbose) { - Print() << "Ranks in neighborhood: " << to_str(result) << std::endl; + Print() << "Ranks in neighborhood: " << to_str(result) << '\n'; } return result; @@ -360,11 +360,11 @@ class Machine topo_addr = get_env_str("SLURM_TOPOLOGY_ADDR"); if (flag_verbose) { - Print() << "HOSTNAME = " << hostname << std::endl; - Print() 
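// Pairing sketch (an assumption inferred from this diff, not documented
// usage): one program ships its layout with send_ba=true, and the other
// reconstructs it through the new bool-tagged constructor.
//
//   // program A
//   amrex::MPMD::Copier copier_a(ba, dm, true);   // send_ba: sends ba and procs
//
//   // program B
//   amrex::MPMD::Copier copier_b(true);           // receives ba, builds a dm
//   auto const& oba = copier_b.boxArray();
//   auto const& odm = copier_b.DistributionMap();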
<< "NERSC_HOST = " << nersc_host << std::endl; - Print() << "SLURM_JOB_PARTITION = " << partition << std::endl; - Print() << "SLURM_NODELIST = " << node_list << std::endl; - Print() << "SLURM_TOPOLOGY_ADDR = " << topo_addr << std::endl; + Print() << "HOSTNAME = " << hostname << '\n' + << "NERSC_HOST = " << nersc_host << '\n' + << "SLURM_JOB_PARTITION = " << partition << '\n' + << "SLURM_NODELIST = " << node_list << '\n' + << "SLURM_TOPOLOGY_ADDR = " << topo_addr << '\n'; } } } @@ -379,19 +379,19 @@ class Machine if (pos != std::string::npos) { result = stoi(topo_addr.substr(pos + tag.size())); // assumes format ".*nid(\d+)" if (flag_verbose) { - Print() << "Got node ID from SLURM_TOPOLOGY_ADDR: " << result << std::endl; + Print() << "Got node ID from SLURM_TOPOLOGY_ADDR: " << result << '\n'; } } else { if (cluster_name == "escori") { tag = "cgpu"; } auto mpi_proc_name = get_mpi_processor_name(); - Print() << "MPI_Get_processor_name: " << mpi_proc_name << std::endl; + Print() << "MPI_Get_processor_name: " << mpi_proc_name << '\n'; pos = mpi_proc_name.find(tag); if (pos != std::string::npos) { result = stoi(mpi_proc_name.substr(pos + tag.size())); // assumes format ".*nid(\d+)" if (flag_verbose) { - Print() << "Got node ID from MPI_Get_processor_name(): " << result << std::endl; + Print() << "Got node ID from MPI_Get_processor_name(): " << result << '\n'; } } } @@ -422,10 +422,10 @@ class Machine for (int i = 0; i < ids.size(); ++i) { node_ranks[ids[i]].push_back(i); } - Print() << "Node ID: Node Coord: Ranks:" << std::endl; + Print() << "Node ID: Node Coord: Ranks:" << '\n'; for (const auto & p : node_ranks) { Print() << " " << p.first << ": " << to_str(id_to_coord(p.first)) - << ": " << to_str(p.second) << std::endl; + << ": " << to_str(p.second) << '\n'; } } return ids; @@ -467,7 +467,7 @@ class Machine if (flag_very_verbose) { Print() << " Distance from " << a.id << " to " << b.id - << ": " << pair_dist << std::endl; + << ": " << pair_dist << '\n'; } } } @@ -482,7 +482,7 @@ class Machine BL_PROFILE("Machine::search_local_nbh()"); if (amrex::Verbose() > 0) { - Print() << "Machine::search_local_nbh() called ..." << std::endl; + Print() << "Machine::search_local_nbh() called ..." 
<< '\n'; } Vector result; @@ -497,11 +497,11 @@ class Machine } if (flag_very_verbose) { - Print() << " Candidates:" << std::endl; + Print() << " Candidates:" << '\n'; for (const auto & p : candidates) { const auto & cand = p.second; Print() << " " << cand.id << " : " << to_str(cand.coord) - << ": " << cand.rank_n << " ranks" << std::endl; + << ": " << cand.rank_n << " ranks" << '\n'; } } @@ -518,7 +518,7 @@ class Machine << ": " << to_str(cur_node.coord) << ", ranks: " << cur_node.rank_n << ", total ranks: " << total_rank_n - << ", avg dist: " << 0 << std::endl; + << ", avg dist: " << 0 << '\n'; } if (total_rank_n >= nbh_rank_n) { return {std::move(result), 0}; @@ -541,7 +541,7 @@ class Machine Print() << " Distance from " << cand_node.id << " to " << cur_node.id << ": " << cand_dist - << ", candidate avg: " << avg_dist << std::endl; + << ", candidate avg: " << avg_dist << '\n'; } // keep track of what should be the next node to add if (avg_dist < min_avg_dist) { @@ -565,7 +565,7 @@ class Machine << ": " << to_str(cur_node.coord) << ", ranks: " << cur_node.rank_n << ", total ranks: " << total_rank_n - << ", avg dist: " << min_avg_dist << std::endl; + << ", avg dist: " << min_avg_dist << '\n'; } } } diff --git a/Src/Base/AMReX_Math.H b/Src/Base/AMReX_Math.H index 769b9bf50f4..42762001268 100644 --- a/Src/Base/AMReX_Math.H +++ b/Src/Base/AMReX_Math.H @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -57,7 +58,7 @@ using std::isinf; #endif template -constexpr std::enable_if_t::value,T> pi () +constexpr std::enable_if_t,T> pi () { return T(3.1415926535897932384626433832795029L); } @@ -68,11 +69,9 @@ double cospi (double x) { #if defined(AMREX_USE_SYCL) return sycl::cospi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::cospi(x); #else - return std::cos(pi()*x); + AMREX_IF_ON_DEVICE(( return ::cospi(x); )) + AMREX_IF_ON_HOST(( return std::cos(pi()*x); )) #endif } @@ -82,11 +81,9 @@ float cospi (float x) { #if defined(AMREX_USE_SYCL) return sycl::cospi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::cospif(x); #else - return std::cos(pi()*x); + AMREX_IF_ON_DEVICE(( return ::cospif(x); )) + AMREX_IF_ON_HOST(( return std::cos(pi()*x); )) #endif } @@ -96,11 +93,9 @@ double sinpi (double x) { #if defined(AMREX_USE_SYCL) return sycl::sinpi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::sinpi(x); #else - return std::sin(pi()*x); + AMREX_IF_ON_DEVICE(( return ::sinpi(x); )) + AMREX_IF_ON_HOST(( return std::sin(pi()*x); )) #endif } @@ -110,14 +105,32 @@ float sinpi (float x) { #if defined(AMREX_USE_SYCL) return sycl::sinpi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::sinpif(x); #else - return std::sin(pi()*x); + AMREX_IF_ON_DEVICE(( return ::sinpif(x); )) + AMREX_IF_ON_HOST(( return std::sin(pi()*x); )) #endif } +namespace detail { + AMREX_FORCE_INLINE void sincos (double x, double* sinx, double* cosx) { +#if defined(_GNU_SOURCE) && !defined(__APPLE__) + ::sincos(x, sinx, cosx); +#else + *sinx = std::sin(x); + *cosx = std::cos(x); +#endif + } + + AMREX_FORCE_INLINE void sincosf (float x, float* sinx, float* cosx) { +#if defined(_GNU_SOURCE) && !defined(__APPLE__) + ::sincosf(x, sinx, cosx); +#else + *sinx = std::sin(x); + 
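// Usage note for the refactored math wrappers (illustrative; the API is
// unchanged):
//   auto c = amrex::Math::cospi(x);            // cos(pi*x)
//   auto [s, cc] = amrex::Math::sincos(x);     // sin(x) and cos(x) in one call
// On device the CUDA/HIP intrinsics are used; on host, glibc's ::sincos is
// used when _GNU_SOURCE is defined (the detail:: helpers here), with
// std::sin/std::cos as the portable fallback.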
*cosx = std::cos(x); +#endif + } +} + //! Return sine and cosine of given number AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::pair sincos (double x) @@ -125,13 +138,9 @@ std::pair sincos (double x) std::pair r; #if defined(AMREX_USE_SYCL) r.first = sycl::sincos(x, sycl::private_ptr(&r.second)); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) || \ - (defined(_GNU_SOURCE) && !defined(__APPLE__)) - ::sincos(x, &r.first, &r.second); #else - r.first = std::sin(x); - r.second = std::cos(x); + AMREX_IF_ON_DEVICE(( ::sincos(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( detail::sincos(x, &r.first, &r.second); )) #endif return r; } @@ -143,13 +152,9 @@ std::pair sincos (float x) std::pair r; #if defined(AMREX_USE_SYCL) r.first = sycl::sincos(x, sycl::private_ptr(&r.second)); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) || \ - (defined(_GNU_SOURCE) && !defined(__APPLE__)) - ::sincosf(x, &r.first, &r.second); #else - r.first = std::sin(x); - r.second = std::cos(x); + AMREX_IF_ON_DEVICE(( ::sincosf(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( detail::sincosf(x, &r.first, &r.second); )) #endif return r; } @@ -159,11 +164,11 @@ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::pair sincospi (double x) { std::pair r; -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - ::sincospi(x, &r.first, &r.second); -#else +#if defined(AMREX_USE_SYCL) r = sincos(pi()*x); +#else + AMREX_IF_ON_DEVICE(( ::sincospi(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( r = sincos(pi()*x); )) #endif return r; } @@ -173,18 +178,18 @@ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::pair sincospi (float x) { std::pair r; -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - ::sincospif(x, &r.first, &r.second); -#else +#if defined(AMREX_USE_SYCL) r = sincos(pi()*x); +#else + AMREX_IF_ON_DEVICE(( ::sincospif(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( r = sincos(pi()*x); )) #endif return r; } //! Return pow(x, Power), where Power is an integer known at compile time template () || Power>=0>::type> + typename = std::enable_if_t() || Power>=0>> AMREX_FORCE_INLINE constexpr T powi (T x) noexcept { @@ -204,6 +209,161 @@ constexpr T powi (T x) noexcept } } +#if defined(AMREX_INT128_SUPPORTED) +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +std::uint64_t umulhi (std::uint64_t a, std::uint64_t b) +{ +#if defined(AMREX_USE_SYCL) + return sycl::mul_hi(a,b); +#else + AMREX_IF_ON_DEVICE(( return __umul64hi(a, b); )) + AMREX_IF_ON_HOST(( + auto tmp = amrex::UInt128_t(a) * amrex::UInt128_t(b); + return std::uint64_t(tmp >> 64); + )) +#endif +} +#endif + +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Object to encapsulate the fast division+modulus operation for 64b integer division. +/// +/// Example: +/// +/// +/// uint64_t quotient, remainder, dividend, divisor; +/// +/// FastDivmodU64 divmod(divisor); +/// +/// divmod(quotient, remainder, dividend); +/// +/// // quotient = (dividend / divisor) +/// // remainder = (dividend % divisor) +/// +struct FastDivmodU64 +{ + std::uint64_t divisor; + +#ifdef AMREX_INT128_SUPPORTED + std::uint64_t multiplier = 1U; + unsigned int shift_right = 0; + unsigned int round_up = 0; + + // + // Static methods + // + + /// Computes b, where 2^b is the greatest power of two that is less than or equal to x + static std::uint32_t integer_log2 (std::uint64_t x) + { + std::uint32_t n = 0; + while (x >>= 1) { + ++n; + } + return n; + } + + /// Construct the FastDivmod object, in host code only + /// + /// This precomputes some values based on the divisor and is computationally expensive. + FastDivmodU64 (std::uint64_t divisor_) + : divisor(divisor_) + { + if (divisor) { + shift_right = integer_log2(divisor); + + if ((divisor & (divisor - 1)) == 0) { + multiplier = 0; + } + else { + std::uint64_t power_of_two = (std::uint64_t(1) << shift_right); + auto n = amrex::UInt128_t(power_of_two) << 64; + std::uint64_t multiplier_lo = n / divisor; + n += power_of_two; + multiplier = n / divisor; + round_up = (multiplier_lo == multiplier ? 
1 : 0); + } + } + } + +#else + + FastDivmodU64 (std::uint64_t divisor_) : divisor(divisor_) {} + +#endif + + /// Returns the quotient of floor(dividend / divisor) + [[nodiscard]] AMREX_GPU_HOST_DEVICE + std::uint64_t divide (std::uint64_t dividend) const + { +#if defined(AMREX_INT128_SUPPORTED) + auto x = dividend; + if (multiplier) { + x = amrex::Math::umulhi(dividend + round_up, multiplier); + } + return (x >> shift_right); +#else + return dividend / divisor; +#endif + } + + /// Computes the remainder given a computed quotient and dividend + [[nodiscard]] AMREX_GPU_HOST_DEVICE + std::uint64_t modulus (std::uint64_t quotient, std::uint64_t dividend) const + { + return dividend - quotient * divisor; + } + + /// Returns the quotient of floor(dividend / divisor) and computes the remainder + [[nodiscard]] AMREX_GPU_HOST_DEVICE + std::uint64_t divmod (std::uint64_t &remainder, std::uint64_t dividend) const + { + auto quotient = divide(dividend); + remainder = modulus(quotient, dividend); + return quotient; + } + + /// Computes integer division and modulus using precomputed values. This is computationally + /// inexpensive. + AMREX_GPU_HOST_DEVICE + void operator() (std::uint64_t &quotient, std::uint64_t &remainder, std::uint64_t dividend) const + { + quotient = divmod(remainder, dividend); + } +}; + } #endif diff --git a/Src/Base/AMReX_MultiFab.H b/Src/Base/AMReX_MultiFab.H index 416c4540da5..b2fc9c80a1f 100644 --- a/Src/Base/AMReX_MultiFab.H +++ b/Src/Base/AMReX_MultiFab.H @@ -4,16 +4,13 @@ #include #include +#include #include #include #include #include #include -#ifdef AMREX_USE_EB -#include -#endif - #include namespace amrex @@ -24,16 +21,17 @@ using fMultiFab = FabArray >; class iMultiFab; /** - * \brief - * A collection (stored as an array) of FArrayBox objects. + * \brief A collection (stored as an array) of FArrayBox objects. + * * This class is useful for storing floating point data on a domain defined by * a union of rectangular regions embedded in a uniform index space. * MultiFab class extends the function of the underlying FabArray class just * as the FArrayBox class extends the function of BaseFab. - * Additional member functions are defined for I/O and simple arithmetic operations on these aggregate objects. + * Additional member functions are defined for I/O and simple arithmetic + * operations on these aggregate objects. + * * This class does NOT provide a copy constructor or assignment operator. */ - class MultiFab : public FabArray @@ -41,34 +39,36 @@ class MultiFab public: /** - * \brief Constructs an empty MultiFab. Data can be defined at a later - * time using the define member functions inherited - * from FabArray. + * \brief Constructs an empty MultiFab. + * + * Data can be defined at a later time using the define member functions + * inherited from FabArray. */ MultiFab () noexcept; /** - * \brief Constructs an empty MultiFab. Data can be defined at a later - * time using the define member functions inherited from FabArray. If - * `define` is called later with a nulltpr as MFInfo's arena, the default - * Arena `a` will be used. If the arena in MFInfo is not a nullptr, the - * MFInfo's arena will be used. + * \brief Constructs an empty MultiFab. + * + * Data can be defined at a later time using the define member functions. + * If `define` is called later with a nullptr as MFInfo's arena, the + * default Arena `a` will be used. If the arena in MFInfo is not a + * nullptr, the MFInfo's arena will be used.
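// Illustrative sketch (not part of the patch): how the FastDivmodU64 helper
// added above might be used. Only the members shown in this diff are assumed;
// the divisor value and variable names are hypothetical.
#include <cstdint>
#include <AMReX_Math.H>

std::uint64_t seconds_to_days (std::uint64_t s)
{
    amrex::Math::FastDivmodU64 divmod(86400u); // precompute once; construction is the expensive part
    std::uint64_t days, rem;
    divmod(days, rem, s);                      // days = s / 86400, rem = s % 86400 in one cheap call
    return days;
}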
*/ explicit MultiFab (Arena* a) noexcept; /** - * \brief - * Constructs a MultiFab - * \param bs a valid region + * \brief Constructs a MultiFab + * + * The size of the FArrayBox is given by the Box grown by \p ngrow, and + * the number of components is given by \p ncomp. If \p info is set to + * not allocating memory, then no FArrayBoxes are allocated at + * this time but can be defined later. + * + * \param bxs a valid region * \param dm a DistribuionMapping * \param ncomp number of components * \param ngrow number of cells the region grows * \param info MFInfo - - * The size of the FArrayBox is given by the Box grown by ngrow, and - * the number of components is given by ncomp. If info is set to - * not allocating memory, then no FArrayBoxes are allocated at - * this time but can be defined later. */ MultiFab (const BoxArray& bxs, const DistributionMapping& dm, @@ -95,10 +95,11 @@ public: #endif /** - * \brief Make an alias MultiFab. maketype must be - * amrex::make_alias. scomp is the starting component of the - * alias and ncomp is the number of components in the new aliasing - * MultiFab. + * \brief Make an alias MultiFab. + * + * Note that \p maketype must be `amrex::make_alias`, + * \p scomp is the starting component of the alias, and + * \p ncomp is the number of components in the new aliasing MultiFab. */ MultiFab (const MultiFab& rhs, MakeType maketype, int scomp, int ncomp); @@ -135,11 +136,13 @@ public: #endif MultiFab& operator= (Real r); - // + /** - * \brief Returns the minimum value contained in component comp of the - * MultiFab. The parameter nghost determines the number of - * boundary cells to search for the minimum. The default is to + * \brief Returns the minimum value contained in component \p comp of the + * MultiFab. + * + * The parameter \p nghost determines the number of + * boundary cells to search for the minimum. The default is to * search only the valid regions of the FArrayBoxes. */ [[nodiscard]] Real min (int comp, @@ -154,16 +157,18 @@ public: int nghost = 0, bool local = false) const; /** - * \brief Returns the maximum value contained in component comp of the - * MultiFab. The parameter nghost determines the number of - * boundary cells to search for the maximum. The default is to + * \brief Returns the maximum value contained in component \p comp of the + * MultiFab. + * + * The parameter \p nghost determines the number of + * boundary cells to search for the maximum. The default is to * search only the valid regions of the FArrayBoxes. */ [[nodiscard]] Real max (int comp, int nghost = 0, bool local = false) const; /** - * \brief Identical to the previous max() function, but confines its + * \brief Identical to the previous `max()` function, but confines its * search to intersection of Box b and the MultiFab. */ [[nodiscard]] Real max (const Box& region, @@ -191,7 +196,7 @@ public: /** * \brief Returns the maximum *absolute* values contained in - * each component of "comps" of the MultiFab. "nghost" ghost cells are used. + * each component of \p comps of the MultiFab. \p nghost ghost cells are used. */ [[nodiscard]] Vector norm0 (const Vector& comps, int nghost = 0, bool local = false, bool ignore_covered = false ) const; [[nodiscard]] Vector norminf (const Vector& comps, int nghost = 0, bool local = false, bool ignore_covered = false) const { @@ -199,13 +204,14 @@ public: } /** - * \brief Returns the L1 norm of component "comp" over the MultiFab. + * \brief Returns the L1 norm of component \p comp over the MultiFab. + * * No ghost cells are used. 
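// Illustrative sketch (not part of the patch): the min/max queries documented
// above. The MultiFab `mf` is assumed to be defined and filled elsewhere.
#include <AMReX_MultiFab.H>
#include <AMReX_Print.H>

void report_range (amrex::MultiFab const& mf)
{
    amrex::Real lo = mf.min(0);    // component 0, valid cells only (nghost = 0)
    amrex::Real hi = mf.max(0, 1); // component 0, also search one layer of ghost cells
    amrex::Print() << "range: [" << lo << ", " << hi << "]\n";
}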
This version has no double counting for nodal data. */ [[nodiscard]] Real norm1 (int comp, const Periodicity& period, bool ignore_covered = false) const; /** - * \brief Returns the L1 norm of component "comp" over the MultiFab. - * ngrow ghost cells are used. + * \brief Returns the L1 norm of component \p comp over the MultiFab. + * \p ngrow ghost cells are used. */ [[nodiscard]] Real norm1 (int comp = 0, int ngrow = 0, bool local = false) const; /** @@ -214,12 +220,12 @@ public: */ [[nodiscard]] Vector norm1 (const Vector& comps, int ngrow = 0, bool local = false) const; /** - * \brief Returns the L2 norm of component "comp" over the MultiFab. + * \brief Returns the L2 norm of component \p comp over the MultiFab. * No ghost cells are used. */ [[nodiscard]] Real norm2 (int comp = 0) const; /** - * \brief Returns the L2 norm of component "comp" over the MultiFab. + * \brief Returns the L2 norm of component \p comp over the MultiFab. * No ghost cells are used. This version has no double counting for nodal data. */ [[nodiscard]] Real norm2 (int comp, const Periodicity& period) const; @@ -236,16 +242,17 @@ public: using FabArray::sum; /** - * \brief Same as sum with local=false, but for non-cell-centered data, this - * skips non-unique points that are owned by multiple boxes. + * \brief Same as sum with \p local =false, but for non-cell-centered data, this + * skips non-unique points that are owned by multiple boxes. */ [[nodiscard]] Real sum_unique (int comp = 0, bool local = false, const Periodicity& period = Periodicity::NonPeriodic()) const; /** - * \brief Adds the scalar value val to the value of each cell in the - * specified subregion of the MultiFab. The subregion consists - * of the num_comp components starting at component comp. + * \brief Adds the scalar value \p val to the value of each cell in the + * specified subregion of the MultiFab. + * + * The subregion consists of the \p num_comp components starting at component \p comp. * The value of nghost specifies the number of cells in the * boundary region of each FArrayBox in the subregion that should * be modified. diff --git a/Src/Base/AMReX_MultiFabUtil.H b/Src/Base/AMReX_MultiFabUtil.H index ad1fa669f32..5c05850f346 100644 --- a/Src/Base/AMReX_MultiFabUtil.H +++ b/Src/Base/AMReX_MultiFabUtil.H @@ -19,67 +19,76 @@ namespace amrex const MultiFab& nd, int scomp, int ncomp, int ngrow = 0); - //! Average edge-based MultiFab onto cell-centered MultiFab. This fills in - //! ngrow ghost cells in the cell-centered MultiFab. Both cell centered and - //! edge centered MultiFabs need to have ngrow ghost values + /** + * \brief Average edge-based MultiFab onto cell-centered MultiFab. + * + * This fills in \p ngrow ghost cells in the cell-centered MultiFab. Both cell centered and + * edge centered MultiFabs need to have \p ngrow ghost values. + */ void average_edge_to_cellcenter (MultiFab& cc, int dcomp, const Vector& edge, int ngrow = 0); - //! Average face-based MultiFab onto cell-centered MultiFab. void average_face_to_cellcenter (MultiFab& cc, int dcomp, const Vector& fc, int ngrow = 0); - + //! Average face-based FabArray onto cell-centered FabArray. template && IsFabArray_v, int> = 0> void average_face_to_cellcenter (CMF& cc, int dcomp, const Array& fc, int ngrow = 0); - + //! Average face-based MultiFab onto cell-centered MultiFab with geometric weighting. void average_face_to_cellcenter (MultiFab& cc, const Vector& fc, const Geometry& geom); + //! Average face-based MultiFab onto cell-centered MultiFab with geometric weighting. 
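// Illustrative sketch (not part of the patch): averaging face-centered data to
// cell centers with the declarations around this hunk. The function and
// variable names here are hypothetical, and the Array-of-pointers signature is
// reconstructed under that assumption.
#include <AMReX_Array.H>
#include <AMReX_MultiFab.H>
#include <AMReX_MultiFabUtil.H>

void faces_to_cc (amrex::MultiFab& cc,
                  amrex::Array<amrex::MultiFab const*, AMREX_SPACEDIM> const& faces)
{
    // Fill cc components 0..AMREX_SPACEDIM-1 with the average of the two
    // bounding faces in each direction; no ghost cells (ngrow = 0).
    amrex::average_face_to_cellcenter(cc, 0, faces, 0);
}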
void average_face_to_cellcenter (MultiFab& cc, const Array& fc, const Geometry& geom); - - //! Average cell-centered MultiFab onto face-based MultiFab. + //! Average cell-centered MultiFab onto face-based MultiFab with geometric weighting. void average_cellcenter_to_face (const Vector& fc, const MultiFab& cc, const Geometry& geom, int ncomp = 1, bool use_harmonic_averaging = false); + //! Average cell-centered MultiFab onto face-based MultiFab with geometric weighting. void average_cellcenter_to_face (const Array& fc, const MultiFab& cc, const Geometry& geom, int ncomp = 1, bool use_harmonic_averaging = false); - //! Average fine face-based MultiFab onto crse face-based MultiFab. + //! Average fine face-based FabArray onto crse face-based FabArray. template ::value,int> = 0> void average_down_faces (const Vector& fine, const Vector& crse, const IntVect& ratio, int ngcrse = 0); + //! Average fine face-based FabArray onto crse face-based FabArray. template ::value,int> = 0> void average_down_faces (const Vector& fine, const Vector& crse, int ratio, int ngcrse = 0); + //! Average fine face-based FabArray onto crse face-based FabArray. template ::value,int> = 0> void average_down_faces (const Array& fine, const Array& crse, const IntVect& ratio, int ngcrse = 0); + //! Average fine face-based FabArray onto crse face-based FabArray. template ::value,int> = 0> void average_down_faces (const Array& fine, const Array& crse, int ratio, int ngcrse = 0); - //! This version does average down for one direction. - //! It uses the IndexType of MultiFabs to determine the direction. - //! It is expected that one direction is nodal and the rest are cell-centered. + /** + * \brief This version does average down for one face direction. + * + * It uses the IndexType of MultiFabs to determine the direction. + * It is expected that one direction is nodal and the rest are cell-centered. + */ template void average_down_faces (const FabArray& fine, FabArray& crse, const IntVect& ratio, int ngcrse=0); @@ -117,9 +126,12 @@ namespace amrex int ngcrse = 0, bool mfiter_is_definitely_safe=false); - //! Average fine cell-based MultiFab onto crse cell-centered MultiFab using - //! volume-weighting. This routine DOES NOT assume that the crse BoxArray is - //! a coarsened version of the fine BoxArray. + /** + * \brief Volume weighted average of fine MultiFab onto coarse MultiFab. + * + * Both MultiFabs are assumed to be cell-centered. This routine DOES NOT assume that + * the crse BoxArray is a coarsened version of the fine BoxArray. + */ void average_down (const MultiFab& S_fine, MultiFab& S_crse, const Geometry& fgeom, const Geometry& cgeom, int scomp, int ncomp, const IntVect& ratio); @@ -375,7 +387,7 @@ namespace amrex void FillRandom (MultiFab& mf, int scomp, int ncomp); /** - * \brief Fill MultiFab with random numbers from nornmal distribution + * \brief Fill MultiFab with random numbers from normal distribution * * All cells including ghost cells are filled. * @@ -625,18 +637,18 @@ void average_down (const FabArray& S_fine, FabArray& S_crse, - /** - * \brief Returns part of a norm based on two MultiFabs - * The MultiFabs MUST have the same underlying BoxArray. - * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n)) - * inside the summation (subject to a valid mask entry pf(mask(i,j,k,n) - */ - +/** + * \brief Returns part of a norm based on two MultiFabs. + * + * The MultiFabs MUST have the same underlying BoxArray.
+ * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n)) + * inside the summation. + */ template Real NormHelper (const MultiFab& x, int xcomp, const MultiFab& y, int ycomp, - F && f, + F const& f, int numcomp, IntVect nghost, bool local) { BL_ASSERT(x.boxArray() == y.boxArray()); @@ -684,21 +696,21 @@ NormHelper (const MultiFab& x, int xcomp, return sm; } - /** - * \brief Returns part of a norm based on three MultiFabs - * The MultiFabs MUST have the same underlying BoxArray. - * The Predicate pf is used to test the mask - * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n)) - * inside the summation (subject to a valid mask entry pf(mask(i,j,k,n) - */ - +/** + * \brief Returns part of a norm based on three MultiFabs. + * + * The MultiFabs MUST have the same underlying BoxArray. + * The Predicate pf is used to test the mask. + * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n)) + * inside the summation (subject to a valid mask entry pf(mask(i,j,k,n))). + */ template Real NormHelper (const MMF& mask, const MultiFab& x, int xcomp, const MultiFab& y, int ycomp, - Pred && pf, - F && f, + Pred const& pf, + F const& f, int numcomp, IntVect nghost, bool local) { BL_ASSERT(x.boxArray() == y.boxArray()); @@ -1024,7 +1036,7 @@ template FOO> BaseFab -ReduceToPlane (int direction, Box const& domain, FabArray const& mf, F&& f) +ReduceToPlane (int direction, Box const& domain, FabArray const& mf, F const& f) { Box domain2d = domain; domain2d.setRange(direction, 0); diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp index 93ba453cc07..5dd97fa6536 100644 --- a/Src/Base/AMReX_MultiFabUtil.cpp +++ b/Src/Base/AMReX_MultiFabUtil.cpp @@ -46,11 +46,15 @@ namespace { boxes.push_back(is.second); slice_to_full_ba_map.push_back(is.first); } - BoxArray slice_ba(boxes.data(), static_cast(boxes.size())); - DistributionMapping slice_dmap(std::move(procs)); - - return std::make_unique(slice_ba, slice_dmap, ncomp, 0, - MFInfo(), FArrayBoxFactory()); + if (!boxes.empty()) { + BoxArray slice_ba(boxes.data(), static_cast(boxes.size())); + DistributionMapping slice_dmap(std::move(procs)); + + return std::make_unique(slice_ba, slice_dmap, ncomp, 0, + MFInfo(), FArrayBoxFactory()); + } else { + return nullptr; + } } } @@ -308,9 +312,6 @@ namespace amrex // ************************************************************************************************************* - // Average fine cell-based MultiFab onto crse cell-centered MultiFab. - // We do NOT assume that the coarse layout is a coarsened version of the fine layout. - // This version DOES use volume-weighting.
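// Illustrative sketch (not part of the patch): calling the two-MultiFab
// NormHelper documented in the header hunk above. The dot-product lambda and
// the function name are hypothetical.
#include <AMReX_MultiFab.H>
#include <AMReX_MultiFabUtil.H>

amrex::Real dot (amrex::MultiFab const& x, amrex::MultiFab const& y)
{
    // Sum of x*y over one component, valid cells only (nghost = 0);
    // local=false requests the global, all-rank result.
    return amrex::NormHelper(x, 0, y, 0,
        [=] AMREX_GPU_DEVICE (amrex::Real a, amrex::Real b) { return a*b; },
        1, amrex::IntVect(0), false);
}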
void average_down (const MultiFab& S_fine, MultiFab& S_crse, const Geometry& fgeom, const Geometry& cgeom, int scomp, int ncomp, int rr) @@ -477,7 +478,7 @@ namespace amrex auto tmptype = type; tmptype.set(dir); if (dir >= AMREX_SPACEDIM || !tmptype.nodeCentered()) { - amrex::Abort("average_down_edges: not face index type"); + amrex::Abort("average_down_edges: not edge index type"); } const int ncomp = crse.nComp(); if (isMFIterSafe(fine, crse)) @@ -563,6 +564,10 @@ namespace amrex Vector slice_to_full_ba_map; std::unique_ptr slice = allocateSlice(dir, cc, ncomp, geom, coord, slice_to_full_ba_map); + if (!slice) { + return nullptr; + } + #ifdef AMREX_USE_OMP #pragma omp parallel if (Gpu::notInLaunchRegion()) #endif diff --git a/Src/Base/AMReX_MultiFabUtil_2D_C.H b/Src/Base/AMReX_MultiFabUtil_2D_C.H index 3cb97db8e2d..0a55ba0540a 100644 --- a/Src/Base/AMReX_MultiFabUtil_2D_C.H +++ b/Src/Base/AMReX_MultiFabUtil_2D_C.H @@ -120,6 +120,7 @@ void amrex_avgdown_faces (Box const& bx, Array4 const& crse, } break; } + default: { break; } } } @@ -157,6 +158,7 @@ void amrex_avgdown_faces (int i, int j, int, int n, Array4 const& crse, crse(i,j,0,n+ccomp) = c * facInv; break; } + default: { break; } } } @@ -206,6 +208,7 @@ void amrex_avgdown_edges (Box const& bx, Array4 const& crse, } break; } + default: { break; } } } @@ -242,6 +245,7 @@ void amrex_avgdown_edges (int i, int j, int, int n, Array4 const& crse, crse(i,j,0,n+ccomp) = c * facInv; break; } + default: { break; } } } diff --git a/Src/Base/AMReX_MultiFabUtil_3D_C.H b/Src/Base/AMReX_MultiFabUtil_3D_C.H index 0cc12ff1aba..5945e7e511b 100644 --- a/Src/Base/AMReX_MultiFabUtil_3D_C.H +++ b/Src/Base/AMReX_MultiFabUtil_3D_C.H @@ -164,6 +164,7 @@ void amrex_avgdown_faces (Box const& bx, Array4 const& crse, } break; } + default: { break; } } } @@ -214,6 +215,7 @@ void amrex_avgdown_faces (int i, int j, int k, int n, Array4 const& crse, crse(i,j,k,n+ccomp) = c * facInv; break; } + default: { break; } } } @@ -287,6 +289,7 @@ void amrex_avgdown_edges (Box const& bx, Array4 const& crse, } break; } + default: { break; } } } @@ -333,6 +336,7 @@ void amrex_avgdown_edges (int i, int j, int k, int n, Array4 const& crse, crse(i,j,k,n+ccomp) = c * facInv; break; } + default: { break; } } } diff --git a/Src/Base/AMReX_MultiFabUtil_nd_C.H b/Src/Base/AMReX_MultiFabUtil_nd_C.H index 3956e818316..cffec43d2f1 100644 --- a/Src/Base/AMReX_MultiFabUtil_nd_C.H +++ b/Src/Base/AMReX_MultiFabUtil_nd_C.H @@ -39,6 +39,7 @@ void amrex_fill_slice_interp (Box const& bx, Array4 slice, break; case 2: khi = 1; + default: { break; } } } else { switch (dir) @@ -51,6 +52,7 @@ void amrex_fill_slice_interp (Box const& bx, Array4 slice, break; case 2: klo = -1; + default: { break; } } } diff --git a/Src/Base/AMReX_NFiles.H b/Src/Base/AMReX_NFiles.H index 824351b50dc..bd1518dd44c 100644 --- a/Src/Base/AMReX_NFiles.H +++ b/Src/Base/AMReX_NFiles.H @@ -23,7 +23,6 @@ namespace amrex { * nfi.Stream().write((const char *) data.dataPtr(), nChars); * } */ - class NFilesIter { public: diff --git a/Src/Base/AMReX_NFiles.cpp b/Src/Base/AMReX_NFiles.cpp index 2abd27986f7..3892cabfd07 100644 --- a/Src/Base/AMReX_NFiles.cpp +++ b/Src/Base/AMReX_NFiles.cpp @@ -125,11 +125,11 @@ void NFilesIter::SetSparseFPP(const Vector &ranksToWrite) // ---- do more error checking here // ---- ranks in range, is dynamic on already mySparseFileNumber = -1; - for(int r(0); r < ranksToWrite.size(); ++r) { - if(ranksToWrite[r] < 0 || ranksToWrite[r] >= nProcs) { + for(int r : ranksToWrite) { + if(r < 0 || r >= nProcs) { 
amrex::Abort("**** Error in NFilesIter::SetSparseFPP: rank out of range."); } - if(ranksToWrite[r] == myProc) { + if(r == myProc) { if(mySparseFileNumber == -1) { mySparseFileNumber = myProc; } else { @@ -281,8 +281,8 @@ bool NFilesIter::ReadyToWrite(bool appendFirst) { BL_PROFILE("NFI::ReadyToWrite:decider"); // ---- the first message received is the coordinator ParallelDescriptor::Recv(&coordinatorProc, 1, MPI_ANY_SOURCE, deciderTag); - for(int i(0); i < setZeroProcs.size(); ++i) { // ---- tell the set zero ranks who is coordinating - ParallelDescriptor::Send(&coordinatorProc, 1, setZeroProcs[i], coordinatorTag); + for(int setZeroProc : setZeroProcs) { // ---- tell the set zero ranks who is coordinating + ParallelDescriptor::Send(&coordinatorProc, 1, setZeroProc, coordinatorTag); } unreadMessages.push_back(std::make_pair(deciderTag, setZeroProcs.size() - 1)); } @@ -552,8 +552,7 @@ Vector NFilesIter::FileNumbersWritten() void NFilesIter::CleanUpMessages() { #ifdef BL_USE_MPI BL_PROFILE("NFI::CleanUpMessages"); - for(int i(0); i < unreadMessages.size(); ++i) { - std::pair & pii = unreadMessages[i]; + for(auto & pii : unreadMessages) { int fromProc, tag(pii.first), nMessages(pii.second); #if 0 amrex::AllPrint() << ParallelDescriptor::MyProc() << ":: cleaning up " << nMessages diff --git a/Src/Base/AMReX_NonLocalBC.H b/Src/Base/AMReX_NonLocalBC.H index fc7f4cb6583..f308aae1389 100644 --- a/Src/Base/AMReX_NonLocalBC.H +++ b/Src/Base/AMReX_NonLocalBC.H @@ -264,7 +264,7 @@ struct MultiBlockCommMetaData : FabArrayBase::CommMetaData { // [concept.FabProjection] // -//! \brief This type trait tests if a type P is a projetion for FAB. +//! \brief This type trait tests if a type P is a projection for FAB. template struct IsFabProjection : IsCallableR, Dim3, int> @@ -296,9 +296,9 @@ struct Identity { static constexpr Identity identity{}; static_assert(sizeof(Identity) == 1 ); -static_assert(std::is_trivially_default_constructible::value ); -static_assert(std::is_trivially_copy_assignable::value ); -static_assert(std::is_trivially_copy_constructible::value ); +static_assert(std::is_trivially_default_constructible_v ); +static_assert(std::is_trivially_copy_assignable_v ); +static_assert(std::is_trivially_copy_constructible_v ); static_assert(IsIndexMapping() ); // NOLINT(bugprone-throw-keyword-missing) static_assert(IsFabProjection() ); // NOLINT(bugprone-throw-keyword-missing) @@ -1129,7 +1129,7 @@ FillBoundary_finish (CommHandler handler, auto cmd = makeFillBoundaryMetaData(mf, mf.nGrowVect, geom, dtos); // The metadata cmd can be cached and reused on a MultiFab/FabArray with // the same BoxArray and DistributionMapping. 
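// Illustrative sketch (not part of the patch): the caching pattern the comment
// above describes. `mf`, `geom`, `dtos`, `proj`, and `nsteps` are assumed
// context from the surrounding example, and the exact makeFillBoundaryMetaData
// signature is an assumption.
//
//     auto cmd = makeFillBoundaryMetaData(mf, mf.nGrowVect(), geom, dtos);
//     for (int step = 0; step < nsteps; ++step) {
//         // reuse cmd as long as the BoxArray and DistributionMapping are unchanged
//         FillBoundary(mf, cmd, scomp, ncomp, dtos, proj);
//     }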
- FillBoundary_finish(mf, cmd, scomp, ncomp, dtos, proj); + FillBoundary(mf, cmd, scomp, ncomp, dtos, proj); \endverbatim * * The FillBoundary capability here is more flexible than FabArray's diff --git a/Src/Base/AMReX_NonLocalBC.cpp b/Src/Base/AMReX_NonLocalBC.cpp index ebd78f7b3d1..a716150884c 100644 --- a/Src/Base/AMReX_NonLocalBC.cpp +++ b/Src/Base/AMReX_NonLocalBC.cpp @@ -73,7 +73,7 @@ void PrepareCommBuffers(CommData& comm, nbytes += cct.sbox.numPts() * object_size * n_components; } - std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); + std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes); nbytes = amrex::aligned_size(acd, nbytes); // so that nbytes are aligned // Also need to align the offset properly diff --git a/Src/Base/AMReX_OpenMP.H b/Src/Base/AMReX_OpenMP.H index 8eb8ada4513..15d6854c926 100644 --- a/Src/Base/AMReX_OpenMP.H +++ b/Src/Base/AMReX_OpenMP.H @@ -3,6 +3,7 @@ #include #ifdef AMREX_USE_OMP +#include #include namespace amrex::OpenMP { @@ -11,10 +12,15 @@ namespace amrex::OpenMP { inline int get_max_threads () { return omp_get_max_threads(); } inline int get_thread_num () { return omp_get_thread_num(); } inline int in_parallel () { return omp_in_parallel(); } + inline void set_num_threads (int num) { omp_set_num_threads(num); } + void Initialize (); + void Finalize (); + + omp_lock_t* get_lock (int ilock); } -#else +#else // AMREX_USE_OMP namespace amrex::OpenMP { @@ -22,9 +28,15 @@ namespace amrex::OpenMP { constexpr int get_max_threads () { return 1; } constexpr int get_thread_num () { return 0; } constexpr int in_parallel () { return false; } - + constexpr void set_num_threads (int) { /* nothing */ } } -#endif +#endif // AMREX_USE_OMP + +namespace amrex { + /** ... */ + int + numUniquePhysicalCores(); +} #endif diff --git a/Src/Base/AMReX_OpenMP.cpp b/Src/Base/AMReX_OpenMP.cpp new file mode 100644 index 00000000000..03c54b5358b --- /dev/null +++ b/Src/Base/AMReX_OpenMP.cpp @@ -0,0 +1,215 @@ +#include +#include +#include +#include + +#if defined(__APPLE__) +#include +#include +#endif + +#if defined(_WIN32) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace amrex +{ + int + numUniquePhysicalCores () + { + int ncores; + +#if defined(__APPLE__) + size_t len = sizeof(ncores); + // See hw.physicalcpu and hw.physicalcpu_max + // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_system_capabilities/ + // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname + if (sysctlbyname("hw.physicalcpu", &ncores, &len, NULL, 0) == -1) { + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Error receiving hw.physicalcpu! 
" + << "Defaulting to visible cores.\n"; + } + ncores = int(std::thread::hardware_concurrency()); + } +#elif defined(__linux__) + std::set> uniqueThreadSets; + int cpuIndex = 0; + + while (true) { + // for each logical CPU in cpuIndex from 0...N-1 + std::string path = "/sys/devices/system/cpu/cpu" + std::to_string(cpuIndex) + "/topology/thread_siblings_list"; + std::ifstream file(path); + if (!file.is_open()) { + break; // no further CPUs to check + } + + // find its siblings + std::vector siblings; + std::string line; + if (std::getline(file, line)) { + std::stringstream ss(line); + std::string token; + + // Possible syntax: 0-3, 8-11, 14,17 + // https://github.com/torvalds/linux/blob/v6.5/Documentation/ABI/stable/sysfs-devices-system-cpu#L68-L72 + while (std::getline(ss, token, ',')) { + size_t dashPos = token.find('-'); + if (dashPos != std::string::npos) { + // Range detected + int start = std::stoi(token.substr(0, dashPos)); + int end = std::stoi(token.substr(dashPos + 1)); + for (int i = start; i <= end; ++i) { + siblings.push_back(i); + } + } else { + siblings.push_back(std::stoi(token)); + } + } + } + + // and record the siblings group + // (assumes: ascending and unique sets per cpuIndex) + uniqueThreadSets.insert(siblings); + cpuIndex++; + } + + if (cpuIndex == 0) { + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Error reading CPU info.\n"; + } + ncores = int(std::thread::hardware_concurrency()); + } else { + ncores = int(uniqueThreadSets.size()); + } +#elif defined(_WIN32) + DWORD length = 0; + bool result = GetLogicalProcessorInformation(NULL, &length); + + if (!result) { + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Failed to get logical processor information! " + << "Defaulting to visible cores.\n"; + } + ncores = int(std::thread::hardware_concurrency()); + } + else { + std::vector buffer(length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)); + if (!GetLogicalProcessorInformation(&buffer[0], &length)) { + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Failed to get logical processor information! " + << "Defaulting to visible cores.\n"; + } + ncores = int(std::thread::hardware_concurrency()); + } else { + ncores = 0; + for (const auto& info : buffer) { + if (info.Relationship == RelationProcessorCore) { + ncores++; + } + } + } + } +#else + // TODO: + // BSD + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Unknown system. Defaulting to visible cores.\n"; + } + ncores = int(std::thread::hardware_concurrency()); +#endif + return ncores; + } +} // namespace amrex + +#ifdef AMREX_USE_OMP +namespace amrex::OpenMP +{ + namespace { + constexpr int nlocks = 128; + omp_lock_t omp_locks[nlocks]; + unsigned int initialized = 0; + } + + void Initialize () + { + if (initialized) { + ++initialized; + return; + } + + amrex::ParmParse pp("amrex"); + std::string omp_threads = "system"; + pp.queryAdd("omp_threads", omp_threads); + + auto to_int = [](std::string const & str_omp_threads) { + std::optional num; + try { num = std::stoi(str_omp_threads); } + catch (...) 
{ /* nothing */ } + return num; + }; + + if (omp_threads == "system") { + // default or OMP_NUM_THREADS environment variable + } else if (omp_threads == "nosmt") { + char const *env_omp_num_threads = std::getenv("OMP_NUM_THREADS"); + if (env_omp_num_threads == nullptr) { + omp_set_num_threads(numUniquePhysicalCores()); + } + else if (amrex::system::verbose > 1) { + amrex::Print() << "amrex.omp_threads was set to nosmt," + << "but OMP_NUM_THREADS was set. Will keep " + << "OMP_NUM_THREADS=" << env_omp_num_threads << ".\n"; + } + } else { + std::optional num_omp_threads = to_int(omp_threads); + if (num_omp_threads.has_value()) { + omp_set_num_threads(num_omp_threads.value()); + } + else { + if (amrex::system::verbose > 0) { + amrex::Print() << "amrex.omp_threads has an unknown value: " + << omp_threads + << " (try system, nosmt, or a positive integer)\n"; + } + } + } + + for (auto& lck : omp_locks) { + omp_init_lock(&lck); + } + + ++initialized; + } + + void Finalize () + { + if (initialized) { + --initialized; + if (initialized == 0) { + for (auto& lck : omp_locks) { + omp_destroy_lock(&lck); + } + } + } + } + + omp_lock_t* get_lock (int ilock) + { + ilock = ilock % nlocks; + if (ilock < 0) { ilock += nlocks; } + return omp_locks + ilock; + } + +} // namespace amrex::OpenMP +#endif // AMREX_USE_OMP diff --git a/Src/Base/AMReX_Orientation.H b/Src/Base/AMReX_Orientation.H index 263bb84a95e..61e3622b336 100644 --- a/Src/Base/AMReX_Orientation.H +++ b/Src/Base/AMReX_Orientation.H @@ -25,7 +25,6 @@ class OrientationIter; * AMREX_SPACEDIM-1 and then the AMREX_SPACEDIM high sides from direction 0 .. * AMREX_SPACEDIM-1. */ - class Orientation { public: diff --git a/Src/Base/AMReX_PArena.H b/Src/Base/AMReX_PArena.H index cc221ba7bae..75db747fd9f 100644 --- a/Src/Base/AMReX_PArena.H +++ b/Src/Base/AMReX_PArena.H @@ -15,7 +15,6 @@ namespace amrex { * \brief This arena uses CUDA stream-ordered memory allocator if available. * If not, use The_Arena(). */ - class PArena : public Arena @@ -38,6 +37,11 @@ public: [[nodiscard]] bool isDevice () const final; [[nodiscard]] bool isPinned () const final; +#ifdef AMREX_USE_GPU + //! Is this CUDA stream ordered memory allocator? + [[nodiscard]] bool isStreamOrderedArena () const final { return true; } +#endif + #ifdef AMREX_CUDA_GE_11_2 private: cudaMemPool_t m_pool; diff --git a/Src/Base/AMReX_PODVector.H b/Src/Base/AMReX_PODVector.H index 0f10dfb94ef..7800fb145ed 100644 --- a/Src/Base/AMReX_PODVector.H +++ b/Src/Base/AMReX_PODVector.H @@ -423,7 +423,7 @@ namespace amrex iterator insert (const_iterator a_pos, T&& a_item) { // This is *POD* vector after all - return insert(a_pos, 1, a_item); + return insert(a_pos, 1, std::move(a_item)); } iterator insert (const_iterator a_pos, diff --git a/Src/Base/AMReX_ParallelDescriptor.H b/Src/Base/AMReX_ParallelDescriptor.H index 46ca0f99fc3..f949ae6f45c 100644 --- a/Src/Base/AMReX_ParallelDescriptor.H +++ b/Src/Base/AMReX_ParallelDescriptor.H @@ -447,162 +447,162 @@ while ( false ) //! Real sum reduction. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T& rvar); template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T* rvar, int cnt); // Having this for backward compatibility - void ReduceRealSum (Vector >&& rvar); + void ReduceRealSum (Vector > const& rvar); // template - typename std::enable_if::value>::type - ReduceRealSum (Vector >&& rvar); + std::enable_if_t> + ReduceRealSum (Vector > const& rvar); //! Real sum reduction to specified cpu. 
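// Illustrative sketch (not part of the patch): the OpenMP lock pool added in
// AMReX_OpenMP above (AMREX_USE_OMP builds only). The bin index and the
// histogram update are hypothetical.
#include <omp.h>
#include <AMReX_OpenMP.H>

void locked_increment (int bin, double* histogram)
{
    omp_lock_t* lck = amrex::OpenMP::get_lock(bin); // bin is wrapped into [0, nlocks)
    omp_set_lock(lck);
    histogram[bin] += 1.0; // protected update of shared state
    omp_unset_lock(lck);
}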
template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T& rvar, int cpu); template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T* rvar, int cnt, int cpu); // Having this for backward compatibility - void ReduceRealSum (Vector >&& rvar, int cpu); + void ReduceRealSum (Vector > const& rvar, int cpu); // template - typename std::enable_if::value>::type - ReduceRealSum (Vector >&& rvar, int cpu); + std::enable_if_t> + ReduceRealSum (Vector > const& rvar, int cpu); //! Real max reduction. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T& rvar); template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T* rvar, int cnt); // Having this for backward compatibility - void ReduceRealMax (Vector >&& rvar); + void ReduceRealMax (Vector > const& rvar); // template - typename std::enable_if::value>::type - ReduceRealMax (Vector >&& rvar); + std::enable_if_t> + ReduceRealMax (Vector > const& rvar); //! Real max reduction to specified cpu. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T& rvar, int cpu); template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T* rvar, int cnt, int cpu); // Having this for backward compatibility - void ReduceRealMax (Vector >&& rvar, int cpu); + void ReduceRealMax (Vector > const& rvar, int cpu); // template - typename std::enable_if::value>::type - ReduceRealMax (Vector >&& rvar, int cpu); + std::enable_if_t> + ReduceRealMax (Vector > const& rvar, int cpu); //! Real min reduction. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T& rvar); template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T* rvar, int cnt); // Having this for backward compatibility - void ReduceRealMin (Vector >&& rvar); + void ReduceRealMin (Vector > const& rvar); // template - typename std::enable_if::value>::type - ReduceRealMin (Vector >&& rvar); + std::enable_if_t> + ReduceRealMin (Vector > const& rvar); //! Real min reduction to specified cpu. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T& rvar, int cpu); template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T* rvar, int cnt, int cpu); // Having this for backward compatibility - void ReduceRealMin (Vector >&& rvar, int cpu); + void ReduceRealMin (Vector > const& rvar, int cpu); // template - typename std::enable_if::value>::type - ReduceRealMin (Vector >&& rvar, int cpu); + std::enable_if_t> + ReduceRealMin (Vector > const& rvar, int cpu); //! Integer sum reduction. void ReduceIntSum (int& rvar); void ReduceIntSum (int* rvar, int cnt); - void ReduceIntSum (Vector >&& rvar); + void ReduceIntSum (Vector > const& rvar); //! Integer sum reduction to specified cpu. void ReduceIntSum (int& rvar, int cpu); void ReduceIntSum (int* rvar, int cnt, int cpu); - void ReduceIntSum (Vector >&& rvar, int cpu); + void ReduceIntSum (Vector > const& rvar, int cpu); //! Integer max reduction. void ReduceIntMax (int& rvar); void ReduceIntMax (int* rvar, int cnt); - void ReduceIntMax (Vector >&& rvar); + void ReduceIntMax (Vector > const& rvar); //! Integer max reduction to specified cpu. void ReduceIntMax (int& rvar, int cpu); void ReduceIntMax (int* rvar, int cnt, int cpu); - void ReduceIntMax (Vector >&& rvar, int cpu); + void ReduceIntMax (Vector > const& rvar, int cpu); //! Integer min reduction. 
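// Illustrative sketch (not part of the patch): the reference_wrapper overloads
// that this hunk switches from rvalue references to const lvalue references.
// The variable names and the braced-list construction of the Vector argument
// are assumptions.
#include <functional>
#include <AMReX_ParallelDescriptor.H>

void sum_three (amrex::Real& a, amrex::Real& b, amrex::Real& c)
{
    // One collective call sums all three values across ranks, in place.
    amrex::ParallelDescriptor::ReduceRealSum({std::ref(a), std::ref(b), std::ref(c)});
}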
void ReduceIntMin (int& rvar); void ReduceIntMin (int* rvar, int cnt); - void ReduceIntMin (Vector >&& rvar); + void ReduceIntMin (Vector > const& rvar); //! Integer min reduction to specified cpu. void ReduceIntMin (int& rvar, int cpu); void ReduceIntMin (int* rvar, int cnt, int cpu); - void ReduceIntMin (Vector >&& rvar, int cpu); + void ReduceIntMin (Vector > const& rvar, int cpu); //! Long sum reduction. void ReduceLongSum (Long& rvar); void ReduceLongSum (Long* rvar, int cnt); - void ReduceLongSum (Vector >&& rvar); + void ReduceLongSum (Vector > const& rvar); //! Long sum reduction to specified cpu. void ReduceLongSum (Long& rvar, int cpu); void ReduceLongSum (Long* rvar, int cnt, int cpu); - void ReduceLongSum (Vector >&& rvar, int cpu); + void ReduceLongSum (Vector > const& rvar, int cpu); //! Long max reduction. void ReduceLongMax (Long& rvar); void ReduceLongMax (Long* rvar, int cnt); - void ReduceLongMax (Vector >&& rvar); + void ReduceLongMax (Vector > const& rvar); //! Long max reduction to specified cpu. void ReduceLongMax (Long& rvar, int cpu); void ReduceLongMax (Long* rvar, int cnt, int cpu); - void ReduceLongMax (Vector >&& rvar, int cpu); + void ReduceLongMax (Vector > const& rvar, int cpu); //! Long min reduction. void ReduceLongMin (Long& rvar); void ReduceLongMin (Long* rvar, int cnt); - void ReduceLongMin (Vector >&& rvar); + void ReduceLongMin (Vector > const& rvar); //! Long min reduction to specified cpu. void ReduceLongMin (Long& rvar, int cpu); void ReduceLongMin (Long* rvar, int cnt, int cpu); - void ReduceLongMin (Vector >&& rvar, int cpu); + void ReduceLongMin (Vector > const& rvar, int cpu); //! Long and-wise reduction. void ReduceLongAnd (Long& rvar); void ReduceLongAnd (Long* rvar, int cnt); - void ReduceLongAnd (Vector >&& rvar); + void ReduceLongAnd (Vector > const& rvar); //! Long and-wise reduction to specified cpu. void ReduceLongAnd (Long& rvar, int cpu); void ReduceLongAnd (Long* rvar, int cnt, int cpu); - void ReduceLongAnd (Vector >&& rvar, int cpu); + void ReduceLongAnd (Vector > const& rvar, int cpu); //! Parallel gather. 
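// Illustrative sketch (not part of the patch): usage of the Gather declaration
// just below. Variable names are hypothetical; recvbuf is assumed to need
// nsend * NProcs() entries on the root rank.
#include <AMReX_ParallelDescriptor.H>
#include <AMReX_Vector.H>

void gather_times (amrex::Real my_time)
{
    int root = amrex::ParallelDescriptor::IOProcessorNumber();
    amrex::Vector<amrex::Real> all(amrex::ParallelDescriptor::NProcs());
    amrex::ParallelDescriptor::Gather(&my_time, 1, all.data(), root);
    // On rank `root`, all[i] now holds rank i's value.
}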
void Gather (Real const* sendbuf, int nsend, Real* recvbuf, int root); @@ -674,7 +674,7 @@ while ( false ) #ifdef BL_USE_MPI int select_comm_data_type (std::size_t nbytes); - std::size_t alignof_comm_data (std::size_t nbytes); + std::size_t sizeof_selected_comm_data_type (std::size_t nbytes); #endif } } @@ -699,7 +699,7 @@ template Message Asend (const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm) { - static_assert(!std::is_same::value, "Asend: char version has been specialized"); + static_assert(!std::is_same_v, "Asend: char version has been specialized"); BL_PROFILE_T_S("ParallelDescriptor::Asend(TsiiM)", T); BL_COMM_PROFILE(BLProfiler::AsendTsiiM, n * sizeof(T), dst_pid, tag); @@ -740,7 +740,7 @@ template Message Send (const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm) { - static_assert(!std::is_same::value, "Send: char version has been specialized"); + static_assert(!std::is_same_v, "Send: char version has been specialized"); BL_PROFILE_T_S("ParallelDescriptor::Send(Tsii)", T); @@ -788,7 +788,7 @@ template Message Arecv (T* buf, size_t n, int src_pid, int tag, MPI_Comm comm) { - static_assert(!std::is_same::value, "Arecv: char version has been specialized"); + static_assert(!std::is_same_v, "Arecv: char version has been specialized"); BL_PROFILE_T_S("ParallelDescriptor::Arecv(TsiiM)", T); BL_COMM_PROFILE(BLProfiler::ArecvTsiiM, n * sizeof(T), src_pid, tag); @@ -829,7 +829,7 @@ template Message Recv (T* buf, size_t n, int src_pid, int tag, MPI_Comm comm) { - static_assert(!std::is_same::value, "Recv: char version has been specialized"); + static_assert(!std::is_same_v, "Recv: char version has been specialized"); BL_PROFILE_T_S("ParallelDescriptor::Recv(Tsii)", T); BL_COMM_PROFILE(BLProfiler::RecvTsii, BLProfiler::BeforeCall(), src_pid, tag); @@ -996,9 +996,9 @@ ParallelDescriptor::GatherLayoutDataToVector (const LayoutData& sendbuf, Vector recvcount(nprocs, 0); recvbuf.resize(sendbuf.size()); const Vector& old_pmap = sendbuf.DistributionMap().ProcessorMap(); - for (int i=0; i - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T& rvar) { detail::DoAllReduce(&rvar,MPI_SUM,1); } template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T* rvar, int cnt) { detail::DoAllReduce(rvar,MPI_SUM,cnt); } template - typename std::enable_if::value>::type - ReduceRealSum (Vector >&& rvar) + std::enable_if_t> + ReduceRealSum (Vector > const& rvar) { int cnt = rvar.size(); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -1296,20 +1296,20 @@ void DoReduce (T* r, MPI_Op op, int cnt, int cpu) //! Real sum reduction to specified cpu. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T& rvar, int cpu) { detail::DoReduce(&rvar,MPI_SUM,1,cpu); } template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T* rvar, int cnt, int cpu) { detail::DoReduce(rvar,MPI_SUM,cnt,cpu); } template - typename std::enable_if::value>::type - ReduceRealSum (Vector >&& rvar, int cpu) + std::enable_if_t> + ReduceRealSum (Vector > const& rvar, int cpu) { int cnt = rvar.size(); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -1321,20 +1321,20 @@ void DoReduce (T* r, MPI_Op op, int cnt, int cpu) //! Real max reduction. 
template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T& rvar) { detail::DoAllReduce(&rvar,MPI_MAX,1); } template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T* rvar, int cnt) { detail::DoAllReduce(rvar,MPI_MAX,cnt); } template - typename std::enable_if::value>::type - ReduceRealMax (Vector >&& rvar) + std::enable_if_t> + ReduceRealMax (Vector > const& rvar) { int cnt = rvar.size(); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -1346,20 +1346,20 @@ void DoReduce (T* r, MPI_Op op, int cnt, int cpu) //! Real max reduction to specified cpu. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T& rvar, int cpu) { detail::DoReduce(&rvar,MPI_MAX,1,cpu); } template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T* rvar, int cnt, int cpu) { detail::DoReduce(rvar,MPI_MAX,cnt,cpu); } template - typename std::enable_if::value>::type - ReduceRealMax (Vector >&& rvar, int cpu) + std::enable_if_t> + ReduceRealMax (Vector > const& rvar, int cpu) { int cnt = rvar.size(); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -1371,20 +1371,20 @@ void DoReduce (T* r, MPI_Op op, int cnt, int cpu) //! Real min reduction. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T& rvar) { detail::DoAllReduce(&rvar,MPI_MIN,1); } template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T* rvar, int cnt) { detail::DoAllReduce(rvar,MPI_MIN,cnt); } template - typename std::enable_if::value>::type - ReduceRealMin (Vector >&& rvar) + std::enable_if_t> + ReduceRealMin (Vector > const& rvar) { int cnt = rvar.size(); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -1396,20 +1396,20 @@ void DoReduce (T* r, MPI_Op op, int cnt, int cpu) //! Real min reduction to specified cpu. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T& rvar, int cpu) { detail::DoReduce(&rvar,MPI_MIN,1,cpu); } template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T* rvar, int cnt, int cpu) { detail::DoReduce(rvar,MPI_MIN,cnt,cpu); } template - typename std::enable_if::value>::type - ReduceRealMin (Vector >&& rvar, int cpu) + std::enable_if_t> + ReduceRealMin (Vector > const& rvar, int cpu) { int cnt = rvar.size(); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -1423,81 +1423,81 @@ void DoReduce (T* r, MPI_Op op, int cnt, int cpu) //! Real sum reduction. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T& ) {} template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T*, int) {} template - typename std::enable_if::value>::type - ReduceRealSum (Vector >&&) {} + std::enable_if_t> + ReduceRealSum (Vector > const&) {} //! Real sum reduction to specified cpu. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T&, int) {} template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealSum (T*, int, int) {} template - typename std::enable_if::value>::type - ReduceRealSum (Vector >&&, int) {} + std::enable_if_t> + ReduceRealSum (Vector > const&, int) {} //! Real max reduction. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T&) {} template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T*, int) {} template - typename std::enable_if::value>::type - ReduceRealMax (Vector >&&) {} + std::enable_if_t> + ReduceRealMax (Vector > const&) {} //! 
Real max reduction to specified cpu. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T&, int) {} template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMax (T*, int, int) {} template - typename std::enable_if::value>::type - ReduceRealMax (Vector >&&, int) {} + std::enable_if_t> + ReduceRealMax (Vector > const&, int) {} //! Real min reduction. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T&) {} template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T*, int) {} template - typename std::enable_if::value>::type - ReduceRealMin (Vector >&&) {} + std::enable_if_t> + ReduceRealMin (Vector > const&) {} //! Real min reduction to specified cpu. template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T&, int) {} template - typename std::enable_if::value>::type + std::enable_if_t> ReduceRealMin (T*, int, int) {} template - typename std::enable_if::value>::type - ReduceRealMin (Vector >&&, int) {} + std::enable_if_t> + ReduceRealMin (Vector > const&, int) {} #endif } @@ -1529,9 +1529,9 @@ struct Mpi_typemap> static MPI_Datatype mpi_type = MPI_DATATYPE_NULL; if (mpi_type == MPI_DATATYPE_NULL) { using T = ValLocPair; - static_assert(std::is_trivially_copyable::value, + static_assert(std::is_trivially_copyable_v, "To communicate with MPI, ValLocPair must be trivially copyable."); - static_assert(std::is_standard_layout::value, + static_assert(std::is_standard_layout_v, "To communicate with MPI, ValLocPair must be standard layout"); T vlp[2]; diff --git a/Src/Base/AMReX_ParallelDescriptor.cpp b/Src/Base/AMReX_ParallelDescriptor.cpp index 4e959504158..50f16c17967 100644 --- a/Src/Base/AMReX_ParallelDescriptor.cpp +++ b/Src/Base/AMReX_ParallelDescriptor.cpp @@ -551,39 +551,39 @@ Comm_dup (MPI_Comm comm, MPI_Comm& newcomm) } void -ReduceRealSum (Vector >&& rvar) +ReduceRealSum (Vector > const& rvar) { - ReduceRealSum(std::move(rvar)); + ReduceRealSum(rvar); } void -ReduceRealSum (Vector >&& rvar, int cpu) +ReduceRealSum (Vector > const& rvar, int cpu) { - ReduceRealSum(std::move(rvar), cpu); + ReduceRealSum(rvar, cpu); } void -ReduceRealMax (Vector > && rvar) +ReduceRealMax (Vector > const& rvar) { - ReduceRealMax(std::move(rvar)); + ReduceRealMax(rvar); } void -ReduceRealMax (Vector >&& rvar, int cpu) +ReduceRealMax (Vector > const& rvar, int cpu) { - ReduceRealMax(std::move(rvar), cpu); + ReduceRealMax(rvar, cpu); } void -ReduceRealMin (Vector >&& rvar) +ReduceRealMin (Vector > const& rvar) { - ReduceRealMin(std::move(rvar)); + ReduceRealMin(rvar); } void -ReduceRealMin (Vector >&& rvar, int cpu) +ReduceRealMin (Vector > const& rvar, int cpu) { - ReduceRealMin(std::move(rvar), cpu); + ReduceRealMin(rvar, cpu); } void @@ -643,7 +643,7 @@ ReduceIntSum (int* r, int cnt) } void -ReduceIntSum (Vector >&& rvar) +ReduceIntSum (Vector > const& rvar) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -666,7 +666,7 @@ ReduceIntSum (int* r, int cnt, int cpu) } void -ReduceIntSum (Vector >&& rvar, int cpu) +ReduceIntSum (Vector > const& rvar, int cpu) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -689,7 +689,7 @@ ReduceIntMax (int* r, int cnt) } void -ReduceIntMax (Vector >&& rvar) +ReduceIntMax (Vector > const& rvar) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -712,7 +712,7 @@ ReduceIntMax (int* r, int cnt, int cpu) } void -ReduceIntMax 
(Vector >&& rvar, int cpu) +ReduceIntMax (Vector > const& rvar, int cpu) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -735,7 +735,7 @@ ReduceIntMin (int* r, int cnt) } void -ReduceIntMin (Vector >&& rvar) +ReduceIntMin (Vector > const& rvar) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -758,7 +758,7 @@ ReduceIntMin (int* r, int cnt, int cpu) } void -ReduceIntMin (Vector >&& rvar, int cpu) +ReduceIntMin (Vector > const& rvar, int cpu) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -781,7 +781,7 @@ ReduceLongSum (Long* r, int cnt) } void -ReduceLongSum (Vector >&& rvar) +ReduceLongSum (Vector > const& rvar) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -804,7 +804,7 @@ ReduceLongSum (Long* r, int cnt, int cpu) } void -ReduceLongSum (Vector >&& rvar, int cpu) +ReduceLongSum (Vector > const& rvar, int cpu) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -827,7 +827,7 @@ ReduceLongMax (Long* r, int cnt) } void -ReduceLongMax (Vector >&& rvar) +ReduceLongMax (Vector > const& rvar) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -850,7 +850,7 @@ ReduceLongMax (Long* r, int cnt, int cpu) } void -ReduceLongMax (Vector >&& rvar, int cpu) +ReduceLongMax (Vector > const& rvar, int cpu) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -873,7 +873,7 @@ ReduceLongMin (Long* r, int cnt) } void -ReduceLongMin (Vector >&& rvar) +ReduceLongMin (Vector > const& rvar) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -896,7 +896,7 @@ ReduceLongMin (Long* r, int cnt, int cpu) } void -ReduceLongMin (Vector >&& rvar, int cpu) +ReduceLongMin (Vector > const& rvar, int cpu) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -919,7 +919,7 @@ ReduceLongAnd (Long* r, int cnt) } void -ReduceLongAnd (Vector >&& rvar) +ReduceLongAnd (Vector > const& rvar) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -942,7 +942,7 @@ ReduceLongAnd (Long* r, int cnt, int cpu) } void -ReduceLongAnd (Vector >&& rvar,int cpu) +ReduceLongAnd (Vector > const& rvar,int cpu) { auto cnt = static_cast(rvar.size()); Vector tmp{std::begin(rvar), std::end(rvar)}; @@ -1211,13 +1211,13 @@ void IProbe (int, int, MPI_Comm, int&, MPI_Status&) {} void Comm_dup (MPI_Comm, MPI_Comm&) {} -void ReduceRealSum (Vector >&& /*rvar*/) {} -void ReduceRealMax (Vector >&& /*rvar*/) {} -void ReduceRealMin (Vector >&& /*rvar*/) {} +void ReduceRealSum (Vector > const& /*rvar*/) {} +void ReduceRealMax (Vector > const& /*rvar*/) {} +void ReduceRealMin (Vector > const& /*rvar*/) {} -void ReduceRealSum (Vector >&& /*rvar*/, int /*cpu*/) {} -void ReduceRealMax (Vector >&& /*rvar*/, int /*cpu*/) {} -void ReduceRealMin (Vector >&& /*rvar*/, int /*cpu*/) {} +void ReduceRealSum (Vector > const& /*rvar*/, int /*cpu*/) {} +void ReduceRealMax (Vector > const& /*rvar*/, int /*cpu*/) {} +void ReduceRealMin (Vector > const& /*rvar*/, int /*cpu*/) {} void ReduceLongAnd (Long&) {} void ReduceLongSum (Long&) {} @@ -1239,15 +1239,15 @@ void ReduceLongSum (Long*,int,int) {} void ReduceLongMax (Long*,int,int) {} void ReduceLongMin (Long*,int,int) {} -void ReduceLongAnd (Vector >&& /*rvar*/) {} -void ReduceLongSum (Vector >&& /*rvar*/) {} -void ReduceLongMax (Vector >&& /*rvar*/) {} -void 
ReduceLongMin (Vector >&& /*rvar*/) {} +void ReduceLongAnd (Vector > const& /*rvar*/) {} +void ReduceLongSum (Vector > const& /*rvar*/) {} +void ReduceLongMax (Vector > const& /*rvar*/) {} +void ReduceLongMin (Vector > const& /*rvar*/) {} -void ReduceLongAnd (Vector >&& /*rvar*/, int /*cpu*/) {} -void ReduceLongSum (Vector >&& /*rvar*/, int /*cpu*/) {} -void ReduceLongMax (Vector >&& /*rvar*/, int /*cpu*/) {} -void ReduceLongMin (Vector >&& /*rvar*/, int /*cpu*/) {} +void ReduceLongAnd (Vector > const& /*rvar*/, int /*cpu*/) {} +void ReduceLongSum (Vector > const& /*rvar*/, int /*cpu*/) {} +void ReduceLongMax (Vector > const& /*rvar*/, int /*cpu*/) {} +void ReduceLongMin (Vector > const& /*rvar*/, int /*cpu*/) {} void ReduceIntSum (int&) {} void ReduceIntMax (int&) {} @@ -1265,13 +1265,13 @@ void ReduceIntSum (int*,int,int) {} void ReduceIntMax (int*,int,int) {} void ReduceIntMin (int*,int,int) {} -void ReduceIntSum (Vector >&& /*rvar*/) {} -void ReduceIntMax (Vector >&& /*rvar*/) {} -void ReduceIntMin (Vector >&& /*rvar*/) {} +void ReduceIntSum (Vector > const& /*rvar*/) {} +void ReduceIntMax (Vector > const& /*rvar*/) {} +void ReduceIntMin (Vector > const& /*rvar*/) {} -void ReduceIntSum (Vector >&& /*rvar*/, int /*cpu*/) {} -void ReduceIntMax (Vector >&& /*rvar*/, int /*cpu*/) {} -void ReduceIntMin (Vector >&& /*rvar*/, int /*cpu*/) {} +void ReduceIntSum (Vector > const& /*rvar*/, int /*cpu*/) {} +void ReduceIntMax (Vector > const& /*rvar*/, int /*cpu*/) {} +void ReduceIntMin (Vector > const& /*rvar*/, int /*cpu*/) {} void ReduceBoolAnd (bool&) {} void ReduceBoolOr (bool&) {} @@ -1376,8 +1376,8 @@ BL_FORT_PROC_DECL(BL_PD_ABORT,bl_pd_abort)() #if defined(BL_USE_MPI) && !defined(BL_AMRPROF) template <> MPI_Datatype Mpi_typemap::type() { - static_assert(std::is_trivially_copyable::value, "IntVect must be trivially copyable"); - static_assert(std::is_standard_layout::value, "IntVect must be standard layout"); + static_assert(std::is_trivially_copyable_v, "IntVect must be trivially copyable"); + static_assert(std::is_standard_layout_v, "IntVect must be standard layout"); if ( mpi_type_intvect == MPI_DATATYPE_NULL ) { @@ -1399,8 +1399,8 @@ template <> MPI_Datatype Mpi_typemap::type() template <> MPI_Datatype Mpi_typemap::type() { - static_assert(std::is_trivially_copyable::value, "IndexType must be trivially copyable"); - static_assert(std::is_standard_layout::value, "IndexType must be standard layout"); + static_assert(std::is_trivially_copyable_v, "IndexType must be trivially copyable"); + static_assert(std::is_standard_layout_v, "IndexType must be standard layout"); if ( mpi_type_indextype == MPI_DATATYPE_NULL ) { @@ -1422,8 +1422,8 @@ template <> MPI_Datatype Mpi_typemap::type() template <> MPI_Datatype Mpi_typemap::type() { - static_assert(std::is_trivially_copyable::value, "Box must be trivially copyable"); - static_assert(std::is_standard_layout::value, "Box must be standard layout"); + static_assert(std::is_trivially_copyable_v, "Box must be trivially copyable"); + static_assert(std::is_standard_layout_v, "Box must be standard layout"); if ( mpi_type_box == MPI_DATATYPE_NULL ) { @@ -1630,7 +1630,7 @@ select_comm_data_type (std::size_t nbytes) } std::size_t -alignof_comm_data (std::size_t nbytes) +sizeof_selected_comm_data_type (std::size_t nbytes) { const int t = select_comm_data_type(nbytes); if (t == 1) { diff --git a/Src/Base/AMReX_ParmParse.H b/Src/Base/AMReX_ParmParse.H index 01a0098333e..062db374029 100644 --- a/Src/Base/AMReX_ParmParse.H +++ b/Src/Base/AMReX_ParmParse.H @@ 
-267,7 +267,6 @@ class IntVect; * #endif * */ - class ParmParse { public: @@ -1063,7 +1062,7 @@ public: struct PP_entry; using Table = std::list; static void appendTable(ParmParse::Table& tab); - [[nodiscard]] const Table& table() const {return m_table;} + [[nodiscard]] const Table& table() const {return *m_table;} protected: @@ -1081,7 +1080,7 @@ protected: // //! Prefix used in keyword search. std::stack m_pstack; - Table& m_table; + Table* m_table; }; struct ParmParse::PP_entry @@ -1116,7 +1115,7 @@ public: void pop(); [[nodiscard]] std::string getPrefix() const; private: - ParmParse& m_pp; + ParmParse* m_pp; int m_np{0}; }; diff --git a/Src/Base/AMReX_ParmParse.cpp b/Src/Base/AMReX_ParmParse.cpp index c2ecfc7b37a..a2b42b64a15 100644 --- a/Src/Base/AMReX_ParmParse.cpp +++ b/Src/Base/AMReX_ParmParse.cpp @@ -29,11 +29,13 @@ extern "C" void amrex_finalize_namelist (); namespace amrex { +namespace { #ifdef AMREX_XSDK -static bool finalize_verbose = false; + bool finalize_verbose = false; #else -static bool finalize_verbose = true; + bool finalize_verbose = true; #endif +} std::string const ParmParse::FileKeyword = "FILE"; @@ -450,7 +452,6 @@ ppfound (const std::string& keyword, // except if n==-1, return the index of the last occurrence. // Return 0 if the specified occurrence does not exist. // - const ParmParse::PP_entry* ppindex (const ParmParse::Table& table, int n, @@ -1123,20 +1124,20 @@ ParmParse::getPrefix() const ParmParse::ParmParse (const std::string& prefix) : - m_table(g_table) + m_table(&g_table) { m_pstack.push(prefix); } ParmParse::ParmParse (Table& a_table) - : m_table(a_table) + : m_table(&a_table) { - m_pstack.push(""); + m_pstack.emplace(""); } ParmParse::Frame::Frame (ParmParse& pp, const std::string& pfix) : - m_pp(pp) + m_pp(&pp) { push(pfix); BL_ASSERT( m_np == 1 ); @@ -1155,7 +1156,7 @@ ParmParse::Frame::~Frame () void ParmParse::Frame::push (const std::string& str) { - m_pp.pushPrefix(str); + m_pp->pushPrefix(str); m_np++; } @@ -1163,14 +1164,14 @@ void ParmParse::Frame::pop () { BL_ASSERT( m_np > 0); - m_pp.popPrefix(); + m_pp->popPrefix(); m_np--; } std::string ParmParse::Frame::getPrefix () const { - return m_pp.getPrefix(); + return m_pp->getPrefix(); } void @@ -1179,7 +1180,8 @@ ParmParse::appendTable(ParmParse::Table& tab) g_table.splice(g_table.end(), tab); } -static +namespace { + bool unused_table_entries_q (const ParmParse::Table& table, const std::string& prefix = std::string()) { @@ -1216,7 +1218,6 @@ unused_table_entries_q (const ParmParse::Table& table, const std::string& prefix return false; } -static void finalize_table (const std::string& pfx, const ParmParse::Table& table) { @@ -1227,7 +1228,7 @@ finalize_table (const std::string& pfx, const ParmParse::Table& table) if ( !li.m_queried ) { if (finalize_verbose) { - amrex::AllPrint() << "Record " << li.m_name << std::endl; + amrex::AllPrint() << "Record " << li.m_name << '\n'; } } else @@ -1238,12 +1239,14 @@ finalize_table (const std::string& pfx, const ParmParse::Table& table) else if ( !li.m_queried ) { if (finalize_verbose) { - amrex::AllPrint() << pfx << "::" << li << std::endl; + amrex::AllPrint() << pfx << "::" << li << '\n'; } } } } +} + void ParmParse::Initialize (int argc, char** argv, @@ -1267,7 +1270,7 @@ ParmParse::QueryUnusedInputs () finalize_verbose = amrex::system::verbose; if (finalize_verbose) { amrex::OutStream() << "Unused ParmParse Variables:\n"; } finalize_table(" [TOP]", g_table); - if (finalize_verbose) { amrex::OutStream() << std::endl; } + if (finalize_verbose) { 
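// Illustrative sketch, not part of the patch: the motivation for turning the
// reference members (ParmParse::m_table, Frame::m_pp) into pointers.  A class
// with a reference member has its copy/move assignment implicitly deleted;
// storing the address instead restores assignability, while constructors that
// still take a reference preserve the "never null" invariant.  All names here
// are hypothetical.
#include <list>
#include <string>

using Table = std::list<std::string>;

struct WithRef { Table& t; };   // WithRef a{x}, b{y}; a = b;  -> ill-formed

struct WithPtr {
    explicit WithPtr (Table& a_t) : t(&a_t) {}  // callers still pass a reference
    Table& table () { return *t; }              // callers still see a reference
    Table* t;                                   // but the member is assignable
};

int main ()
{
    Table x, y;
    WithPtr a(x), b(y);
    a = b;   // fine now; with a Table& member this would not compile
}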
amrex::OutStream() << '\n'; } return true; } return false; @@ -1279,7 +1282,7 @@ ParmParse::hasUnusedInputs (const std::string& prefix) return unused_table_entries_q(g_table, prefix); } -static +namespace { void get_entries_under_prefix (std::vector& found_entries, const ParmParse::Table& table, @@ -1308,6 +1311,7 @@ get_entries_under_prefix (std::vector& found_entries, } } } +} std::vector ParmParse::getUnusedInputs (const std::string& prefix) @@ -1333,7 +1337,7 @@ ParmParse::Finalize () finalize_verbose = amrex::system::verbose; if (finalize_verbose) { amrex::OutStream() << "Unused ParmParse Variables:\n"; } finalize_table(" [TOP]", g_table); - if (finalize_verbose) { amrex::OutStream() << std::endl; } + if (finalize_verbose) { amrex::OutStream() << '\n'; } // // First loop through and delete all queried entries. // @@ -1356,10 +1360,10 @@ ParmParse::dumpTable (std::ostream& os, bool prettyPrint) for (auto const& li : g_table) { if(prettyPrint && li.m_queried) { - os << li.print() << std::endl; + os << li.print() << '\n'; } else { - os << li << std::endl; + os << li << '\n'; } } } @@ -1371,7 +1375,7 @@ ParmParse::countval (const char* name, // // First find n'th occurrence of name in table. // - const PP_entry* def = ppindex(m_table, n, prefixedName(name), false); + const PP_entry* def = ppindex(*m_table, n, prefixedName(name), false); return def == nullptr ? 0 : static_cast(def->m_vals.size()); } @@ -1382,7 +1386,7 @@ ParmParse::getkth (const char* name, bool& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void @@ -1390,7 +1394,7 @@ ParmParse::get (const char* name, bool& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int @@ -1399,7 +1403,7 @@ ParmParse::querykth (const char* name, bool& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int @@ -1407,7 +1411,7 @@ ParmParse::query (const char* name, bool& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1421,25 +1425,25 @@ ParmParse::add (const char* name, void ParmParse::getkth (const char* name, int k, int& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, int& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, int& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, int& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1452,28 +1456,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table, 
prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1487,25 +1491,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, long& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, long& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, long& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, long& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1519,28 +1523,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1553,25 +1557,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, long long& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, long long& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, long long& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, long long& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1584,28 +1588,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } void 
ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1618,25 +1622,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, float& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, float& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, float& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, float& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1649,28 +1653,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix, num_val,k); + return squeryarr(*m_table, prefixedName(name),ref,start_ix, num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1685,25 +1689,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, double& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, double& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, double& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, double& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1716,28 +1720,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, 
int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix, num_val,k); + return squeryarr(*m_table, prefixedName(name),ref,start_ix, num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1752,25 +1756,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, std::string& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, std::string& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, std::string& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, std::string& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1783,28 +1787,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix, num_val,k); + return squeryarr(*m_table, prefixedName(name),ref,start_ix, num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1819,25 +1823,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, IntVect& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, IntVect& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, IntVect& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, IntVect& ref, int ival) const { - return squeryval(m_table, 
prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1850,28 +1854,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix, num_val,k); + return squeryarr(*m_table, prefixedName(name),ref,start_ix, num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1884,25 +1888,25 @@ ParmParse::addarr (const char* name, const std::vector& ref) void ParmParse::getkth (const char* name, int k, Box& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival,k); + sgetval(*m_table, prefixedName(name),ref,ival,k); } void ParmParse::get (const char* name, Box& ref, int ival) const { - sgetval(m_table, prefixedName(name),ref,ival, LAST); + sgetval(*m_table, prefixedName(name),ref,ival, LAST); } int ParmParse::querykth (const char* name, int k, Box& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival,k); + return squeryval(*m_table, prefixedName(name),ref,ival,k); } int ParmParse::query (const char* name, Box& ref, int ival) const { - return squeryval(m_table, prefixedName(name),ref,ival, LAST); + return squeryval(*m_table, prefixedName(name),ref,ival, LAST); } void @@ -1915,28 +1919,28 @@ void ParmParse::getktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val,k); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val,k); } void ParmParse::getarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - sgetarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + sgetarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } int ParmParse::queryktharr (const char* name, int k, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix, num_val,k); + return squeryarr(*m_table, prefixedName(name),ref,start_ix, num_val,k); } int ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, int num_val) const { - return squeryarr(m_table, prefixedName(name),ref,start_ix,num_val, LAST); + return squeryarr(*m_table, prefixedName(name),ref,start_ix,num_val, LAST); } void @@ -1954,7 +1958,7 @@ int ParmParse::countname (const std::string& name) const { int cnt = 0; - for (auto const& li : m_table) + for (auto const& li : *m_table) { if ( ppfound(prefixedName(name), li, false) ) { @@ -1968,7 +1972,7 @@ int ParmParse::countRecords (const std::string& name) const { int cnt = 0; - for (auto const& li : m_table) + for (auto const& li : *m_table) { if ( ppfound(prefixedName(name), li, true) ) { @@ -1985,14 +1989,14 @@ ParmParse::countRecords (const std::string& name) const bool ParmParse::contains (const char* name) const { - for (auto const& li : m_table) + 
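// Illustrative usage sketch for the accessor family being edited above (query,
// get, getarr, queryarr, and their *kth variants).  It mirrors the ParmParse
// interface as it appears in this patch; the input names ("prob.ncell",
// "prob.lo") are made up, and the two-argument calls assume the header's usual
// default arguments for ival/start_ix/num_val.
#include <AMReX_ParmParse.H>
#include <vector>

void read_inputs ()
{
    amrex::ParmParse pp("prob");   // every lookup below is prefixed: "prob.*"
    int ncell = 64;
    pp.query("ncell", ncell);      // optional: keeps the default if absent
    std::vector<double> lo;
    pp.getarr("lo", lo);           // required: aborts if "prob.lo" is missing
}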
for (auto const& li : *m_table) { if ( ppfound(prefixedName(name), li, false)) { // // Found an entry; mark all occurrences of name as used. // - for (auto& lli : m_table) + for (auto& lli : *m_table) { if ( ppfound(prefixedName(name), lli, false) ) { @@ -2009,9 +2013,9 @@ int ParmParse::remove (const char* name) { int r = 0; - for (auto it = m_table.begin(); it != m_table.end(); ) { + for (auto it = m_table->begin(); it != m_table->end(); ) { if (ppfound(prefixedName(name), *it, false)) { - it = m_table.erase(it); + it = m_table->erase(it); ++r; } else { ++it; @@ -2023,10 +2027,10 @@ ParmParse::remove (const char* name) ParmParse::Record ParmParse::getRecord (const std::string& name, int n) const { - const PP_entry* pe = ppindex(m_table, n, prefixedName(name), true); + const PP_entry* pe = ppindex(*m_table, n, prefixedName(name), true); if ( pe == nullptr ) { - amrex::ErrorStream() << "ParmParse::getRecord: record " << name << " not found" << std::endl; + amrex::ErrorStream() << "ParmParse::getRecord: record " << name << " not found" << '\n'; amrex::Abort(); return Record(ParmParse()); } else { diff --git a/Src/Base/AMReX_PhysBCFunct.H b/Src/Base/AMReX_PhysBCFunct.H index 2d0906f27ad..ff8b73b5f0e 100644 --- a/Src/Base/AMReX_PhysBCFunct.H +++ b/Src/Base/AMReX_PhysBCFunct.H @@ -78,7 +78,7 @@ public: int dcomp, int numcomp, Geometry const& geom, Real time, const Vector& bcr, int bcomp, - int orig_comp, FF&& fillfunc); + int orig_comp, FF const& fillfunc); void nddoit (Box const& bx, FArrayBox& dest, int dcomp, int numcomp, @@ -355,7 +355,7 @@ GpuBndryFuncFab::ccfcdoit (Box const& bx, FArrayBox& dest, int dcomp, int numcomp, Geometry const& geom, Real time, const Vector& bcr, int bcomp, - int orig_comp, FF&& fillfunc) + int orig_comp, FF const& fillfunc) { const IntVect& len = bx.length(); diff --git a/Src/Base/AMReX_PlotFileDataImpl.cpp b/Src/Base/AMReX_PlotFileDataImpl.cpp index 1fbf5044a50..b85c17ad93c 100644 --- a/Src/Base/AMReX_PlotFileDataImpl.cpp +++ b/Src/Base/AMReX_PlotFileDataImpl.cpp @@ -141,7 +141,7 @@ PlotFileDataImpl::get (int level, std::string const& varname) noexcept int gid = mfi.index(); FArrayBox& dstfab = mf[mfi]; std::unique_ptr srcfab(m_vismf[level]->readFAB(gid, icomp)); - dstfab.copy(*srcfab); + dstfab.copy(*srcfab); } } return mf; diff --git a/Src/Base/AMReX_PlotFileUtil.cpp b/Src/Base/AMReX_PlotFileUtil.cpp index df8ff405a10..52324dda506 100644 --- a/Src/Base/AMReX_PlotFileUtil.cpp +++ b/Src/Base/AMReX_PlotFileUtil.cpp @@ -98,8 +98,8 @@ WriteGenericPlotfileHeader (std::ostream &HeaderFile, HeaderFile << varnames.size() << '\n'; - for (int ivar = 0; ivar < varnames.size(); ++ivar) { - HeaderFile << varnames[ivar] << "\n"; + for (const auto & varname : varnames) { + HeaderFile << varname << "\n"; } HeaderFile << AMREX_SPACEDIM << '\n'; HeaderFile << time << '\n'; diff --git a/Src/Base/AMReX_REAL.H b/Src/Base/AMReX_REAL.H index 6383808a3de..54815fa3da1 100644 --- a/Src/Base/AMReX_REAL.H +++ b/Src/Base/AMReX_REAL.H @@ -127,44 +127,6 @@ inline namespace literals { } // namespace amrex #endif -#else - -#ifndef AMREX_XSDK - -/* - The REAL_T macro specifies the precision of the floating-point - calculations in Fortran code. It will be either real*4 or - real*8 depending upon which of the symbols BL_USE_FLOAT or - BL_USE_DOUBLE, respectively, is defined during compilations. For - portability, you should write floating-point code in terms of this - macro, instead of using real*4 or real*8 directly. 
- - Note that exactly one of these macros must be defined - when compiling any module that uses floating-point. -*/ - -#ifdef BL_USE_FLOAT -# define REAL_T REAL -#if __STDC__==1 || defined(__INTEL_COMPILER) -# define BL_REAL(a) a##E0 -# define BL_REAL_E(a,b) a##E##b -#else -# define BL_REAL(a) a/**/E0 -# define BL_REAL_E(a,b) a/**/E/**/b -#endif -#else -# define REAL_T DOUBLE PRECISION -#if __STDC__==1 || defined(__INTEL_COMPILER) -# define BL_REAL(a) a##D0 -# define BL_REAL_E(a,b) a##D##b -#else -# define BL_REAL(a) a/**/D0 -# define BL_REAL_E(a,b) a/**/D/**/b -#endif -#endif - -#endif /* ndef AMREX_XSDK */ - #endif /* !BL_LANG_FORT */ #endif /*BL_REAL_H*/ diff --git a/Src/Base/AMReX_Random.H b/Src/Base/AMReX_Random.H index 675c12082d5..50b2c2693b0 100644 --- a/Src/Base/AMReX_Random.H +++ b/Src/Base/AMReX_Random.H @@ -23,24 +23,29 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Real Random (RandomEngine const& random_engine) { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) -#ifdef BL_USE_FLOAT - return 1.0f - curand_uniform(random_engine.rand_state); +#if defined (__SYCL_DEVICE_ONLY__) + mkl::rng::device::uniform distr; + return mkl::rng::device::generate(distr, *random_engine.engine); #else - return 1.0 - curand_uniform_double(random_engine.rand_state); -#endif -#elif defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) #ifdef BL_USE_FLOAT - return 1.0f - hiprand_uniform(random_engine.rand_state); + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return 1.0f - hiprand_uniform(random_engine.rand_state); , + return 1.0f - curand_uniform(random_engine.rand_state); + ) + )) #else - return 1.0 - hiprand_uniform_double(random_engine.rand_state); + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return 1.0 - hiprand_uniform_double(random_engine.rand_state); , + return 1.0 - curand_uniform_double(random_engine.rand_state); + ) + )) #endif -#elif defined (__SYCL_DEVICE_ONLY__) - mkl::rng::device::uniform distr; - return mkl::rng::device::generate(distr, *random_engine.engine); -#else - amrex::ignore_unused(random_engine); - return Random(); + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return Random(); + )) #endif } @@ -56,24 +61,29 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Real RandomNormal (Real mean, Real stddev, RandomEngine const& random_engine) { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) -#ifdef BL_USE_FLOAT - return stddev * curand_normal(random_engine.rand_state) + mean; +#if defined (__SYCL_DEVICE_ONLY__) + mkl::rng::device::gaussian distr(mean, stddev); + return mkl::rng::device::generate(distr, *random_engine.engine); #else - return stddev * curand_normal_double(random_engine.rand_state) + mean; -#endif -#elif defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) #ifdef BL_USE_FLOAT - return stddev * hiprand_normal(random_engine.rand_state) + mean; + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return stddev * hiprand_normal(random_engine.rand_state) + mean; , + return stddev * curand_normal(random_engine.rand_state) + mean; + ) + )) #else - return stddev * hiprand_normal_double(random_engine.rand_state) + mean; + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return stddev * hiprand_normal_double(random_engine.rand_state) + mean; , + return stddev * curand_normal_double(random_engine.rand_state) + mean; + ) + )) #endif -#elif defined (__SYCL_DEVICE_ONLY__) - mkl::rng::device::gaussian distr(mean, stddev); - return mkl::rng::device::generate(distr, *random_engine.engine); -#else - amrex::ignore_unused(random_engine); - 
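// Illustrative sketch, not part of the patch: the shape of the
// AMREX_IF_ON_DEVICE((...)) / AMREX_IF_ON_HOST((...)) dispatch that replaces
// the `#if defined(__CUDA_ARCH__)` blocks above.  The toy macros below are
// what such a dispatch reduces to in a host-only build: the device branch is
// discarded at preprocessing time (so its curand-style calls never need to
// compile on the host) and the host branch is kept.  The double parentheses
// let a whole statement list travel as a single macro argument.
#include <cstdio>

#define TOY_STRIP_PARENS(...) __VA_ARGS__
#define TOY_IF_ON_DEVICE(code)                        // tokens vanish on the host
#define TOY_IF_ON_HOST(code) { TOY_STRIP_PARENS code }

double uniform01 ()
{
    TOY_IF_ON_DEVICE((
        return curand_style_uniform();                // GPU-only branch
    ))
    TOY_IF_ON_HOST((
        return 0.5;                                   // stand-in for the host RNG
    ))
}

int main () { std::printf("%g\n", uniform01()); }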
return RandomNormal(mean, stddev); + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return RandomNormal(mean, stddev); + )) #endif } @@ -91,16 +101,20 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE unsigned int RandomPoisson (Real lambda, RandomEngine const& random_engine) { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) - return curand_poisson(random_engine.rand_state, lambda); -#elif defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return hiprand_poisson(random_engine.rand_state, lambda); -#elif defined (__SYCL_DEVICE_ONLY__) +#if defined (__SYCL_DEVICE_ONLY__) mkl::rng::device::poisson distr(lambda); return mkl::rng::device::generate(distr, *random_engine.engine); #else - amrex::ignore_unused(random_engine); - return RandomPoisson(lambda); + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return hiprand_poisson(random_engine.rand_state, lambda); , + return curand_poisson(random_engine.rand_state, lambda); + ) + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return RandomPoisson(lambda); + )) #endif } @@ -116,22 +130,23 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE unsigned int Random_int (unsigned int n, RandomEngine const& random_engine) { -#if AMREX_DEVICE_COMPILE #if defined(__SYCL_DEVICE_ONLY__) mkl::rng::device::uniform distr(0,n); return mkl::rng::device::generate(distr, *random_engine.engine); #else - unsigned int rand; - constexpr unsigned int RAND_M = 4294967295; // 2**32-1 - do { - AMREX_HIP_OR_CUDA( rand = hiprand(random_engine.rand_state);, - rand = curand(random_engine.rand_state) ); - } while (rand > (RAND_M - RAND_M % n)); - return rand % n; -#endif -#else - amrex::ignore_unused(random_engine); - return Random_int(n); + AMREX_IF_ON_DEVICE(( + unsigned int rand; + constexpr unsigned int RAND_M = 4294967295; // 2**32-1 + do { + AMREX_HIP_OR_CUDA( rand = hiprand(random_engine.rand_state);, + rand = curand(random_engine.rand_state) ); + } while (rand > (RAND_M - RAND_M % n)); + return rand % n; + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return Random_int(n); + )) #endif } diff --git a/Src/Base/AMReX_Random.cpp b/Src/Base/AMReX_Random.cpp index 9e1059e6798..a0b72155c50 100644 --- a/Src/Base/AMReX_Random.cpp +++ b/Src/Base/AMReX_Random.cpp @@ -19,9 +19,15 @@ namespace namespace amrex { #ifdef AMREX_USE_SYCL sycl_rng_descr* rand_engine_descr = nullptr; -//xxxxx oneapi::mkl::rng::philox4x32x10* gpu_rand_generator = nullptr; #else amrex::randState_t* gpu_rand_state = nullptr; +#endif +} + +namespace { +#ifdef AMREX_USE_SYCL + oneapi::mkl::rng::philox4x32x10* gpu_rand_generator = nullptr; +#else amrex::randGenerator_t gpu_rand_generator = nullptr; #endif } @@ -44,8 +50,8 @@ void ResizeRandomSeed (amrex::ULong gpu_seed) rand_engine_descr = new sycl_rng_descr (Gpu::Device::streamQueue(), sycl::range<1>(N), gpu_seed, 1); -//xxxxx gpu_rand_generator = new std::remove_pointer_t -// (Gpu::Device::streamQueue(), gpu_seed+1234ULL); + gpu_rand_generator = new std::remove_pointer_t + (Gpu::Device::streamQueue(), gpu_seed+1234ULL); #elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) @@ -192,7 +198,7 @@ UniqueRandomSubset (Vector &uSet, int setSize, int poolSize, uSet = uSetTemp; if(printSet) { for(int i(0); i < uSet.size(); ++i) { - AllPrint() << "uSet[" << i << "] = " << uSet[i] << std::endl; + AllPrint() << "uSet[" << i << "] = " << uSet[i] << '\n'; } } } @@ -212,11 +218,11 @@ DeallocateRandomSeedDevArray () Gpu::streamSynchronize(); rand_engine_descr = nullptr; } -//xxxxx if 
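// Illustrative sketch, not part of the patch: the rejection loop used by
// Random_int above.  Taking `rand % n` directly over-weights small results
// whenever 2^32 is not a multiple of n; discarding draws from the incomplete
// block at the top of the 32-bit range essentially removes that modulo bias.
// std::mt19937 stands in for the curand/hiprand generators; n must be > 0.
#include <cstdint>
#include <random>

unsigned int random_int_unbiased (unsigned int n, std::mt19937& gen)
{
    constexpr std::uint32_t RAND_M = 4294967295u;  // 2**32-1
    std::uint32_t r;
    do {
        r = gen();                        // uniform over [0, 2^32)
    } while (r > RAND_M - RAND_M % n);    // reject the biased tail
    return r % n;
}

int main ()
{
    std::mt19937 gen(12345);
    return int(random_int_unbiased(6, gen));  // uniform in [0,6)
}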
(gpu_rand_generator != nullptr) { -// delete gpu_rand_generator; -// Gpu::streamSynchronize(); -// gpu_rand_generator = nullptr; -// } + if (gpu_rand_generator != nullptr) { + delete gpu_rand_generator; + Gpu::streamSynchronize(); + gpu_rand_generator = nullptr; + } #else if (gpu_rand_state != nullptr) { @@ -258,15 +264,9 @@ void FillRandom (Real* p, Long N) #elif defined(AMREX_USE_SYCL) -//xxxxx oneapi::mkl::rng::uniform distr; -// auto event = oneapi::mkl::rng::generate(distr, gpu_rand_generator, N, p); -// event.wait(); - - amrex::ParallelForRNG(N, [=] AMREX_GPU_DEVICE (Long i, RandomEngine const& eng) - { - p[i] = Random(eng); - }); - Gpu::streamSynchronize(); + oneapi::mkl::rng::uniform distr; + auto event = oneapi::mkl::rng::generate(distr, *gpu_rand_generator, N, p); + event.wait(); #else std::uniform_real_distribution distribution(Real(0.0), Real(1.0)); @@ -299,15 +299,9 @@ void FillRandomNormal (Real* p, Long N, Real mean, Real stddev) #elif defined(AMREX_USE_SYCL) -//xxxxx oneapi::mkl::rng::gaussian distr(mean, stddev); -// auto event = oneapi::mkl::rng::generate(distr, gpu_rand_generator, N, p); -// event.wait(); - - amrex::ParallelForRNG(N, [=] AMREX_GPU_DEVICE (Long i, RandomEngine const& eng) - { - p[i] = RandomNormal(mean, stddev, eng); - }); - Gpu::streamSynchronize(); + oneapi::mkl::rng::gaussian distr(mean, stddev); + auto event = oneapi::mkl::rng::generate(distr, *gpu_rand_generator, N, p); + event.wait(); #else @@ -322,30 +316,6 @@ void FillRandomNormal (Real* p, Long N, Real mean, Real stddev) } // namespace amrex - -// -// Fortran entry points for amrex::Random(). -// - -#if !defined(AMREX_XSDK) && !defined(BL_NO_FORT) -BL_FORT_PROC_DECL(BLUTILINITRAND,blutilinitrand)(const int* sd) -{ - amrex::ULong seed = *sd; - amrex::InitRandom(seed); -} - -BL_FORT_PROC_DECL(BLINITRAND,blinitrand)(const int* sd) -{ - amrex::ULong seed = *sd; - amrex::InitRandom(seed); -} - -BL_FORT_PROC_DECL(BLUTILRAND,blutilrand)(amrex::Real* rn) -{ - *rn = amrex::Random(); -} -#endif - extern "C" { double amrex_random () { diff --git a/Src/Base/AMReX_RealVect.H b/Src/Base/AMReX_RealVect.H index 83cc747b030..9e1d72700f7 100644 --- a/Src/Base/AMReX_RealVect.H +++ b/Src/Base/AMReX_RealVect.H @@ -28,7 +28,6 @@ namespace amrex C++ array. In addition, the basic arithmetic operators have been overloaded to implement scaling and translation operations. */ - class RealVect { public: @@ -521,13 +520,13 @@ public: /** This is a RealVect all of whose components are equal to zero. */ - static const RealVect Zero; + static AMREX_EXPORT const RealVect Zero; /// /** This is a RealVect all of whose components are equal to one. 
*/ - static const RealVect Unit; + static AMREX_EXPORT const RealVect Unit; /*@}*/ diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index ae8ae3ca0c1..1060b91f319 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -67,14 +67,14 @@ namespace Reduce::detail { template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - void for_each_init (T& t) + constexpr void for_each_init (T& t) { P().init(amrex::get(t)); } template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - void for_each_init (T& t) + constexpr void for_each_init (T& t) { P().init(amrex::get(t)); for_each_init(t); @@ -200,11 +200,11 @@ struct ReduceOpLogicalAnd template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - std::enable_if_t::value> + std::enable_if_t> local_update (T& d, T s) const noexcept { d = d && s; } template - constexpr std::enable_if_t::value> + constexpr std::enable_if_t> init (T& t) const noexcept { t = true; } }; @@ -232,11 +232,11 @@ struct ReduceOpLogicalOr template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - std::enable_if_t::value> + std::enable_if_t> local_update (T& d, T s) const noexcept { d = d || s; } template - constexpr std::enable_if_t::value> + constexpr std::enable_if_t> init (T& t) const noexcept { t = false; } }; @@ -366,7 +366,7 @@ public: // This is public for CUDA template - void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F&&f) + void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F const& f) { using ReduceTuple = typename D::Type; const int nboxes = mf.local_size(); @@ -376,11 +376,12 @@ public: const int nblocks = par_for_blocks.first[nboxes]; const int block_0_size = par_for_blocks.first[1]; const int* dp_nblocks = par_for_blocks.second; - const Box* dp_boxes = parforinfo.getBoxes(); + const BoxIndexer* dp_boxes = parforinfo.getBoxes(); auto const& stream = Gpu::gpuStream(); auto pdst = reduce_data.devicePtr(stream); int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks()); + AMREX_ASSERT(Long(nblocks_ec)*2 <= Long(std::numeric_limits::max())); reduce_data.nBlocks(stream) = nblocks_ec; reduce_data.updateMaxStreamIndex(stream); @@ -405,25 +406,19 @@ public: dst = r; } for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) { - int ibox, icell; + int ibox; + std::uint64_t icell; if (dp_nblocks) { ibox = amrex::bisect(dp_nblocks, 0, nboxes, iblock); - icell = (iblock-dp_nblocks[ibox])*AMREX_GPU_MAX_THREADS + threadIdx.x; + icell = std::uint64_t(iblock-dp_nblocks[ibox])*AMREX_GPU_MAX_THREADS + threadIdx.x; } else { ibox = iblock / block_0_size; - icell = (iblock-ibox*block_0_size)*AMREX_GPU_MAX_THREADS + threadIdx.x; + icell = std::uint64_t(iblock-ibox*block_0_size)*AMREX_GPU_MAX_THREADS + threadIdx.x; } - Box const& b = dp_boxes[ibox]; - int ncells = b.numPts(); - if (icell < ncells) { - const auto len = amrex::length(b); - int k = icell / (len.x*len.y); - int j = (icell - k*(len.x*len.y)) / len.x; - int i = (icell - k*(len.x*len.y)) - j*len.x; - AMREX_D_TERM(i += b.smallEnd(0);, - j += b.smallEnd(1);, - k += b.smallEnd(2);); + BoxIndexer const& indexer = dp_boxes[ibox]; + if (icell < indexer.numPts()) { + auto [i, j, k] = indexer(icell); Reduce::detail::mf_call_f (f, ibox, i, j, k, ncomp, r); } @@ -496,7 +491,7 @@ public: } template - void eval (Box const& box, D & reduce_data, F&& f) + void eval (Box const& box, D & reduce_data, F const& f) { using ReduceTuple = typename D::Type; auto const& stream = Gpu::gpuStream(); @@ -556,7 +551,7 @@ public: template ::value> > - void eval (Box const& box, N 
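// Illustrative sketch, not part of the patch: the mapping the new BoxIndexer
// abstracts in eval_mf above.  A box's cells are numbered with i fastest, then
// j, then k, and the flat index is now carried as std::uint64_t so that boxes
// with more than 2^31 cells decompose correctly.  The real BoxIndexer may also
// avoid the cost of 64-bit division; this plain-division toy only shows the
// index arithmetic.
#include <array>
#include <cstdint>

struct ToyIndexer {
    int lenx, leny;        // box extents in x and y
    int lox, loy, loz;     // box smallEnd
    std::array<int,3> operator() (std::uint64_t icell) const {
        auto plane = std::uint64_t(lenx) * leny;
        int k = int(icell / plane);
        int j = int((icell - k*plane) / std::uint64_t(lenx));
        int i = int(icell - k*plane - std::uint64_t(j)*lenx);
        return {i + lox, j + loy, k + loz};
    }
};

int main ()
{
    ToyIndexer box{4, 3, 10, 20, 30};  // extents 4x3, smallEnd (10,20,30)
    auto ijk = box(13);                // plane=12: k=1, j=0, i=1 -> (11,20,31)
    return ijk[0];
}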
ncomp, D & reduce_data, F&& f) + void eval (Box const& box, N ncomp, D & reduce_data, F const& f) { using ReduceTuple = typename D::Type; auto const& stream = Gpu::gpuStream(); @@ -617,7 +612,7 @@ public: template ::value> > - void eval (N n, D & reduce_data, F&& f) + void eval (N n, D & reduce_data, F const& f) { if (n <= 0) { return; } using ReduceTuple = typename D::Type; @@ -771,7 +766,7 @@ T Sum (N n, T const* v, T init_val = 0) template ::value> > -T Sum (N n, F&& f, T init_val = 0) +T Sum (N n, F const& f, T init_val = 0) { ReduceOps reduce_op; ReduceData reduce_data(reduce_op); @@ -794,7 +789,7 @@ T Min (N n, T const* v, T init_val = std::numeric_limits::max()) template ::value> > -T Min (N n, F&& f, T init_val = std::numeric_limits::max()) +T Min (N n, F const& f, T init_val = std::numeric_limits::max()) { ReduceOps reduce_op; ReduceData reduce_data(reduce_op); @@ -817,7 +812,7 @@ T Max (N n, T const* v, T init_val = std::numeric_limits::lowest()) template ::value> > -T Max (N n, F&& f, T init_val = std::numeric_limits::lowest()) +T Max (N n, F const& f, T init_val = std::numeric_limits::lowest()) { ReduceOps reduce_op; ReduceData reduce_data(reduce_op); @@ -842,7 +837,7 @@ std::pair MinMax (N n, T const* v) template ::value> > -std::pair MinMax (N n, F&& f) +std::pair MinMax (N n, F const& f) { ReduceOps reduce_op; ReduceData reduce_data(reduce_op); @@ -856,7 +851,7 @@ std::pair MinMax (N n, F&& f) } template ::value> > -bool AnyOf (N n, T const* v, P&& pred) +bool AnyOf (N n, T const* v, P const& pred) { Gpu::LaunchSafeGuard lsg(true); Gpu::DeviceScalar ds(0); @@ -912,7 +907,7 @@ bool AnyOf (N n, T const* v, P&& pred) } template -bool AnyOf (Box const& box, P&& pred) +bool AnyOf (Box const& box, P const& pred) { Gpu::LaunchSafeGuard lsg(true); Gpu::DeviceScalar ds(0); @@ -1042,8 +1037,8 @@ private: template AMREX_FORCE_INLINE static auto call_f (Box const& box, typename D::Type & r, F const& f) - noexcept -> std::enable_if_t, - typename D::Type>::value> + noexcept -> std::enable_if_t, + typename D::Type>> { using ReduceTuple = typename D::Type; const auto lo = amrex::lbound(box); @@ -1058,8 +1053,8 @@ private: template AMREX_FORCE_INLINE static auto call_f (Box const& box, typename D::Type & r, F const& f) - noexcept -> std::enable_if_t, - typename D::Type>::value> + noexcept -> std::enable_if_t, + typename D::Type>> { using ReduceTuple = typename D::Type; Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box)); @@ -1069,7 +1064,7 @@ public: template std::enable_if_t::value && IsCallable::value> - eval (MF const& mf, IntVect const& nghost, D & reduce_data, F&& f) + eval (MF const& mf, IntVect const& nghost, D & reduce_data, F const& f) { using ReduceTuple = typename D::Type; #ifdef AMREX_USE_OMP @@ -1091,7 +1086,7 @@ public: template std::enable_if_t::value && IsCallable::value> - eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F&& f) + eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F const& f) { using ReduceTuple = typename D::Type; #ifdef AMREX_USE_OMP @@ -1116,12 +1111,12 @@ public: void eval (Box const& box, D & reduce_data, F&& f) { auto& rr = reduce_data.reference(OpenMP::get_thread_num()); - call_f(box, rr, f); + call_f(box, rr, std::forward(f)); } template ::value> > - void eval (Box const& box, N ncomp, D & reduce_data, F&& f) + typename M=std::enable_if_t> > + void eval (Box const& box, N ncomp, D & reduce_data, F const& f) { using ReduceTuple = typename D::Type; auto& rr = 
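// Illustrative sketch, not part of the patch: the host-side fallback pattern
// used by the Reduce::Sum/Min/Max overloads in this part of AMReX_Reduce.H --
// a plain loop over a callable `f(i)` that picks up an OpenMP reduction clause
// when OpenMP is enabled.  `toy_sum` is a made-up stand-in.
#include <cstdio>

template <typename T, typename N, typename F>
T toy_sum (N n, F const& f, T init_val = 0)
{
    T r = init_val;
#ifdef _OPENMP
#pragma omp parallel for reduction(+:r)
#endif
    for (N i = 0; i < n; ++i) { r += f(i); }
    return r;
}

int main ()
{
    double s = toy_sum<double>(100, [] (int i) { return 0.5 * i; });
    std::printf("%g\n", s);  // 2475
}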
reduce_data.reference(OpenMP::get_thread_num()); @@ -1136,8 +1131,8 @@ public: } template ::value> > - void eval (N n, D & reduce_data, F&& f) + typename M=std::enable_if_t> > + void eval (N n, D & reduce_data, F const& f) { using ReduceTuple = typename D::Type; auto& rr = reduce_data.reference(OpenMP::get_thread_num()); @@ -1170,8 +1165,8 @@ public: namespace Reduce { template ::value> > -T Sum (N n, F&& f, T init_val = 0) + typename M=std::enable_if_t> > +T Sum (N n, F const& f, T init_val = 0) { T r = init_val; #ifdef AMREX_USE_OMP @@ -1183,15 +1178,15 @@ T Sum (N n, F&& f, T init_val = 0) return r; } -template ::value> > +template > > T Sum (N n, T const* v, T init_val = 0) { return Sum(n, [=] (N i) -> T { return v[i]; }, init_val); } template ::value> > -T Min (N n, F&& f, T init_val = std::numeric_limits::max()) + typename M=std::enable_if_t> > +T Min (N n, F const& f, T init_val = std::numeric_limits::max()) { T r = init_val; #ifdef AMREX_USE_OMP @@ -1203,15 +1198,15 @@ T Min (N n, F&& f, T init_val = std::numeric_limits::max()) return r; } -template ::value> > +template > > T Min (N n, T const* v, T init_val = std::numeric_limits::max()) { return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val); } template ::value> > -T Max (N n, F&& f, T init_val = std::numeric_limits::lowest()) + typename M=std::enable_if_t> > +T Max (N n, F const& f, T init_val = std::numeric_limits::lowest()) { T r = init_val; #ifdef AMREX_USE_OMP @@ -1223,15 +1218,15 @@ T Max (N n, F&& f, T init_val = std::numeric_limits::lowest()) return r; } -template ::value> > +template > > T Max (N n, T const* v, T init_val = std::numeric_limits::lowest()) { return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val); } template ::value> > -std::pair Min (N n, F&& f) + typename M=std::enable_if_t> > +std::pair Min (N n, F const& f) { T r_min = std::numeric_limits::max(); T r_max = std::numeric_limits::lowest(); @@ -1246,20 +1241,20 @@ std::pair Min (N n, F&& f) return std::make_pair(r_min,r_max); } -template ::value> > +template > > std::pair MinMax (N n, T const* v) { return Reduce::MinMax(n, [=] (N i) -> T { return v[i]; }); } -template ::value> > +template > > bool AnyOf (N n, T const* v, P&& pred) { - return std::any_of(v, v+n, pred); + return std::any_of(v, v+n, std::forward
(pred)); } template -bool AnyOf (Box const& box, P&&pred) +bool AnyOf (Box const& box, P const& pred) { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -1275,6 +1270,34 @@ bool AnyOf (Box const& box, P&&pred) #endif +/** + * \brief Return a GpuTuple containing the identity element for each operation in ReduceOps. + * For example 0, +inf and -inf for ReduceOpSum, ReduceOpMin and ReduceOpMax respectively. + */ +template +AMREX_GPU_HOST_DEVICE +constexpr GpuTuple +IdentityTuple (GpuTuple, ReduceOps) noexcept +{ + GpuTuple r{}; + Reduce::detail::for_each_init<0, decltype(r), Ps...>(r); + return r; +} + +/** + * \brief Return a GpuTuple containing the identity element for each ReduceOp in TypeList. + * For example 0, +inf and -inf for ReduceOpSum, ReduceOpMin and ReduceOpMax respectively. + */ +template +AMREX_GPU_HOST_DEVICE +constexpr GpuTuple +IdentityTuple (GpuTuple, TypeList) noexcept +{ + GpuTuple r{}; + Reduce::detail::for_each_init<0, decltype(r), Ps...>(r); + return r; +} + } #endif diff --git a/Src/Base/AMReX_RungeKutta.H b/Src/Base/AMReX_RungeKutta.H index cfac0851cab..2dc8514f9c6 100644 --- a/Src/Base/AMReX_RungeKutta.H +++ b/Src/Base/AMReX_RungeKutta.H @@ -4,8 +4,6 @@ #include -namespace amrex::RungeKutta { - /** * \brief Functions for Runge-Kutta methods * @@ -48,6 +46,7 @@ namespace amrex::RungeKutta { * FillPatcher class can be useful for implementing such a callable. See * AmrLevel::RK for an example. */ +namespace amrex::RungeKutta { struct PostStageNoOp { template @@ -156,8 +155,8 @@ void rk4_update_4 (MF& Unew, MF const& Uold, Array const& rkk, Real dt6) * \param post_stage post-processing stage results */ template -void RK2 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, - P&& post_stage = PostStageNoOp()) +void RK2 (MF& Uold, MF& Unew, Real time, Real dt, F const& frhs, FB const& fillbndry, + P const& post_stage = PostStageNoOp()) { BL_PROFILE("RungeKutta2"); @@ -194,8 +193,8 @@ void RK2 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, */ template -void RK3 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, - R&& store_crse_data, P&& post_stage = PostStageNoOp()) +void RK3 (MF& Uold, MF& Unew, Real time, Real dt, F const& frhs, FB const& fillbndry, + R const& store_crse_data, P const& post_stage = PostStageNoOp()) { BL_PROFILE("RungeKutta3"); @@ -244,8 +243,8 @@ void RK3 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, */ template -void RK4 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, - R&& store_crse_data, P&& post_stage = PostStageNoOp()) +void RK4 (MF& Uold, MF& Unew, Real time, Real dt, F const& frhs, FB const& fillbndry, + R const& store_crse_data, P const& post_stage = PostStageNoOp()) { BL_PROFILE("RungeKutta4"); diff --git a/Src/Base/AMReX_SPACE.H b/Src/Base/AMReX_SPACE.H index 91c4cc3a222..5328bb40b06 100644 --- a/Src/Base/AMReX_SPACE.H +++ b/Src/Base/AMReX_SPACE.H @@ -161,29 +161,6 @@ namespace amrex # define AMREX_D_PICK(a,b,c) c #endif -#ifndef AMREX_XSDK - -/* for backward compatibility */ -#if (AMREX_SPACEDIM == 1) -# define D_DECL(a,b,c) a -# define D_EXPR(a,b,c) ((void)((a),0)) -# define D_PICK(a,b,c) a -# define D_TERM(a,b,c) a -#elif (AMREX_SPACEDIM == 2) -# define D_DECL(a,b,c) a,b -# define D_EXPR(a,b,c) ((void)((a),(b),0)) -# define D_PICK(a,b,c) b -# define D_TERM(a,b,c) a b -#elif (AMREX_SPACEDIM == 3) -# define D_DECL(a,b,c) a,b,c -# define D_EXPR(a,b,c) ((void)((a),(b),(c),0)) -# define D_PICK(a,b,c) c -# define D_TERM(a,b,c) a b 
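// Illustrative usage for the IdentityTuple helpers documented above: seed a
// running reduction with each operation's identity element instead of
// hand-writing {0, +inf, -inf}.  Hedged sketch -- the exact template
// parameters are elided in this rendering of the patch, so the call shape
// below follows the \brief comments rather than a verified signature.
#include <AMReX_Reduce.H>
#include <AMReX_TypeList.H>

void seed_running_reduction ()
{
    using RT  = amrex::GpuTuple<amrex::Real, amrex::Real, amrex::Real>;
    using Ops = amrex::TypeList<amrex::ReduceOpSum,
                                amrex::ReduceOpMin,
                                amrex::ReduceOpMax>;
    auto r = amrex::IdentityTuple(RT{}, Ops{});
    // get<0>(r) == 0; get<1>(r) == max Real; get<2>(r) == lowest Real
    amrex::ignore_unused(r);
}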
c -#endif - -#endif - - #if (AMREX_SPACEDIM == 1) # define AMREX_1D_ONLY(a) a # define AMREX_2D_ONLY(a) ((void)0) diff --git a/Src/Base/AMReX_Scan.H b/Src/Base/AMReX_Scan.H index 11fdfd8bd70..4c94960cba6 100644 --- a/Src/Base/AMReX_Scan.H +++ b/Src/Base/AMReX_Scan.H @@ -187,7 +187,7 @@ struct BlockStatus #ifndef AMREX_SYCL_NO_MULTIPASS_SCAN template -T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum) +T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum) { if (n <= 0) { return 0; } constexpr int nwarps_per_block = 8; @@ -400,15 +400,15 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum #ifndef AMREX_SYCL_NO_MULTIPASS_SCAN if (nblocks > 1) { - return PrefixSum_mp(n, std::forward(fin), std::forward(fout), type, retSum); + return PrefixSum_mp(n, std::forward(fin), std::forward(fout), type, a_ret_sum); } #endif std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block) + sizeof(int); auto stream = Gpu::gpuStream(); - using BlockStatusT = typename std::conditional) <= 8, - detail::BlockStatus, detail::BlockStatus >::type; + using BlockStatusT = std::conditional_t) <= 8, + detail::BlockStatus, detail::BlockStatus >; std::size_t nbytes_blockstatus = Arena::align(sizeof(BlockStatusT)*nblocks); std::size_t nbytes_blockid = Arena::align(sizeof(unsigned int)); @@ -627,7 +627,7 @@ template ::value && (std::is_same,Type::Inclusive>::value || std::is_same,Type::Exclusive>::value)> > -T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum) +T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum) { if (n <= 0) { return 0; } constexpr int nwarps_per_block = 4; @@ -770,7 +770,7 @@ template ::value && (std::is_same,Type::Inclusive>::value || std::is_same,Type::Exclusive>::value)> > -T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum) +T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum) { if (n <= 0) { return 0; } constexpr int nwarps_per_block = 8; @@ -903,7 +903,7 @@ template ::value && (std::is_same,Type::Inclusive>::value || std::is_same,Type::Exclusive>::value)> > -T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum) +T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum) { if (n <= 0) { return 0; } constexpr int nwarps_per_block = 4; @@ -915,8 +915,8 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum) std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block) + sizeof(int); auto stream = Gpu::gpuStream(); - using BlockStatusT = typename std::conditional) <= 8, - detail::BlockStatus, detail::BlockStatus >::type; + using BlockStatusT = std::conditional_t) <= 8, + detail::BlockStatus, detail::BlockStatus >; std::size_t nbytes_blockstatus = Arena::align(sizeof(BlockStatusT)*nblocks); std::size_t nbytes_blockid = Arena::align(sizeof(unsigned int)); @@ -1281,10 +1281,10 @@ T ExclusiveSum (N n, T const* in, T * out, RetSum a_ret_sum = retSum) #else // !defined(AMREX_USE_GPU) template ::value && - (std::is_same,Type::Inclusive>::value || - std::is_same,Type::Exclusive>::value)> > -T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum = retSum) + typename M=std::enable_if_t && + (std::is_same_v,Type::Inclusive> || + std::is_same_v,Type::Exclusive>)> > +T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum = retSum) { if (n <= 0) { return 0; } T totalsum = 0; @@ -1292,7 +1292,7 @@ T PrefixSum (N n, FIN && 
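// Illustrative sketch, not part of the patch: the dimension-dispatch idiom
// behind the AMREX_D_* macros whose old unprefixed D_* aliases are removed
// above.  The macro keeps only the first AMREX_SPACEDIM arguments, so one
// source line can declare 1-, 2-, or 3-D data.  TOY_D_DECL is a toy copy.
#define TOY_SPACEDIM 2
#if (TOY_SPACEDIM == 1)
# define TOY_D_DECL(a,b,c) a
#elif (TOY_SPACEDIM == 2)
# define TOY_D_DECL(a,b,c) a,b
#else
# define TOY_D_DECL(a,b,c) a,b,c
#endif

int main ()
{
    int lo[] = { TOY_D_DECL(0, 0, 0) };     // 2-D build: int lo[] = {0, 0};
    return int(sizeof(lo) / sizeof(lo[0])); // 2
}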
fin, FOUT && fout, TYPE, RetSum = retSum) T x = fin(i); T y = totalsum; totalsum += x; - AMREX_IF_CONSTEXPR (std::is_same,Type::Inclusive>::value) { + AMREX_IF_CONSTEXPR (std::is_same_v,Type::Inclusive>) { y += x; } fout(i, y); @@ -1301,7 +1301,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum = retSum) } // The return value is the total sum. -template ::value> > +template > > T InclusiveSum (N n, T const* in, T * out, RetSum /*a_ret_sum*/ = retSum) { #if (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10) @@ -1314,7 +1314,7 @@ T InclusiveSum (N n, T const* in, T * out, RetSum /*a_ret_sum*/ = retSum) } // The return value is the total sum. -template ::value> > +template > > T ExclusiveSum (N n, T const* in, T * out, RetSum /*a_ret_sum*/ = retSum) { if (n <= 0) { return 0; } diff --git a/Src/Base/AMReX_TableData.H b/Src/Base/AMReX_TableData.H index b7572e2a1cf..8ff5f608334 100644 --- a/Src/Base/AMReX_TableData.H +++ b/Src/Base/AMReX_TableData.H @@ -22,12 +22,11 @@ struct Table1D int begin = 1; int end = 0; - AMREX_GPU_HOST_DEVICE - constexpr Table1D () noexcept {} + constexpr Table1D () noexcept = default; - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE - constexpr Table1D (Table1D::type> const& rhs) noexcept + constexpr Table1D (Table1D> const& rhs) noexcept : p(rhs.p), begin(rhs.begin), end(rhs.end) @@ -43,7 +42,7 @@ struct Table1D AMREX_GPU_HOST_DEVICE explicit operator bool () const noexcept { return p != nullptr; } - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (int i) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -57,16 +56,17 @@ struct Table1D void index_assert (int i) const { if (i < begin || i >= end) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d) is out of bound (%d:%d)\n", - i, begin, end-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << ") is out of bound (" - << begin << ":" << end-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d) is out of bound (%d:%d)\n", + i, begin, end-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << ") is out of bound (" + << begin << ":" << end-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -80,12 +80,11 @@ struct Table2D GpuArray begin{{1,1}}; GpuArray end{{0,0}}; - AMREX_GPU_HOST_DEVICE - constexpr Table2D () noexcept {} + constexpr Table2D () noexcept = default; - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE - constexpr Table2D (Table2D::type> const& rhs) noexcept + constexpr Table2D (Table2D> const& rhs) noexcept : p(rhs.p), jstride(rhs.jstride), begin(rhs.begin), @@ -105,7 +104,7 @@ struct Table2D AMREX_GPU_HOST_DEVICE explicit operator bool () const noexcept { return p != nullptr; } - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (int i, int j) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -120,17 +119,18 @@ struct Table2D { if (i < begin[0] || i >= end[0] || j < begin[1] || j >= end[1]) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d) is out of bound (%d:%d,%d:%d)\n", - i, j, begin[0], end[0]-1, begin[1], end[1]-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << ") is out of bound (" - << begin[0] << ":" << end[0]-1 - << "," << begin[1] << ":" << end[1]-1 << ")"; - 
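// Illustrative sketch, not part of the patch: the inclusive/exclusive
// semantics of the scalar PrefixSum fallback above.  For input {3,1,4} an
// exclusive scan writes {0,3,4}; an inclusive scan writes {3,4,8}; both return
// the total 8.  The `if (inclusive)` branch mirrors the `y += x` adjustment in
// the host code.
#include <cstdio>

template <typename T, int N>
T toy_prefix_sum (const T (&in)[N], T (&out)[N], bool inclusive)
{
    T totalsum = 0;
    for (int i = 0; i < N; ++i) {
        T x = in[i];
        T y = totalsum;               // exclusive result for slot i
        totalsum += x;
        if (inclusive) { y += x; }
        out[i] = y;
    }
    return totalsum;                  // the return value is the total sum
}

int main ()
{
    int in[] = {3, 1, 4}, out[3];
    std::printf("%d\n", toy_prefix_sum(in, out, false));  // out={0,3,4}; prints 8
}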
amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d) is out of bound (%d:%d,%d:%d)\n", + i, j, begin[0], end[0]-1, begin[1], end[1]-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << ") is out of bound (" + << begin[0] << ":" << end[0]-1 + << "," << begin[1] << ":" << end[1]-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -145,12 +145,11 @@ struct Table3D GpuArray begin{{1,1,1}}; GpuArray end{{0,0,0}}; - AMREX_GPU_HOST_DEVICE - constexpr Table3D () noexcept {} + constexpr Table3D () noexcept = default; - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE - constexpr Table3D (Table3D::type> const& rhs) noexcept + constexpr Table3D (Table3D> const& rhs) noexcept : p(rhs.p), jstride(rhs.jstride), kstride(rhs.kstride), @@ -172,7 +171,7 @@ struct Table3D AMREX_GPU_HOST_DEVICE explicit operator bool () const noexcept { return p != nullptr; } - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (int i, int j, int k) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -188,19 +187,20 @@ struct Table3D if (i < begin[0] || i >= end[0] || j < begin[1] || j >= end[1] || k < begin[2] || k >= end[2]) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d)\n", - i, j, k, begin[0], end[0]-1, begin[1], end[1]-1, - begin[2], end[2]-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << "," << k << ") is out of bound (" - << begin[0] << ":" << end[0]-1 - << "," << begin[1] << ":" << end[1]-1 - << "," << begin[2] << ":" << end[2]-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d)\n", + i, j, k, begin[0], end[0]-1, begin[1], end[1]-1, + begin[2], end[2]-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << "," << k << ") is out of bound (" + << begin[0] << ":" << end[0]-1 + << "," << begin[1] << ":" << end[1]-1 + << "," << begin[2] << ":" << end[2]-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -216,12 +216,11 @@ struct Table4D GpuArray begin{{1,1,1,1}}; GpuArray end{{0,0,0,0}}; - AMREX_GPU_HOST_DEVICE - constexpr Table4D () noexcept {} + constexpr Table4D () noexcept = default; - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE - constexpr Table4D (Table4D::type> const& rhs) noexcept + constexpr Table4D (Table4D> const& rhs) noexcept : p(rhs.p), jstride(rhs.jstride), kstride(rhs.kstride), @@ -245,7 +244,7 @@ struct Table4D AMREX_GPU_HOST_DEVICE explicit operator bool () const noexcept { return p != nullptr; } - template ::value,int>::type = 0> + template ,int> = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (int i, int j, int k, int n) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -262,20 +261,21 @@ struct Table4D j < begin[1] || j >= end[1] || k < begin[2] || k >= end[2] || n < begin[3] || n >= end[3]) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,%d:%d)\n", - i, j, k, n, begin[0], end[0]-1, begin[1], end[1]-1, - begin[2], end[2]-1, begin[3], end[3]-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << "," << k << "," << n << ") is out of bound (" - << begin[0] << ":" << end[0]-1 - << "," << begin[1] << ":" << end[1]-1 - << "," << begin[2] 
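// Illustrative sketch, not part of the patch: how the TableND types above lay
// out memory.  Like the rest of AMReX, the tables are Fortran-ordered -- the
// first index varies fastest and each later dimension advances by a
// precomputed stride (jstride, kstride, ...).  The begin/end arrays allow
// arbitrary (e.g. 1-based) lower bounds, which is exactly the window that
// index_assert checks.  ToyTable2D is a made-up analogue of Table2D.
struct ToyTable2D {
    double* p = nullptr;
    long jstride = 0;       // distance in elements between consecutive j
    int begin[2] = {1, 1};
    int end[2]   = {0, 0};
    double& operator() (int i, int j) const {
        return p[(i - begin[0]) + (j - begin[1]) * jstride];
    }
};

int main ()
{
    double data[6] = {0, 1, 2, 3, 4, 5};       // 3x2 table, 1-based indices
    ToyTable2D t{data, 3, {1, 1}, {4, 3}};
    return int(t(2, 2));                       // (2-1) + (2-1)*3 = 4 -> data[4]
}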
<< ":" << end[2]-1 - << "," << begin[3] << ":" << end[3]-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,%d:%d)\n", + i, j, k, n, begin[0], end[0]-1, begin[1], end[1]-1, + begin[2], end[2]-1, begin[3], end[3]-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << "," << k << "," << n << ") is out of bound (" + << begin[0] << ":" << end[0]-1 + << "," << begin[1] << ":" << end[1]-1 + << "," << begin[2] << ":" << end[2]-1 + << "," << begin[3] << ":" << end[3]-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -333,7 +333,7 @@ public: std::conditional_t, Table4D > > >; - TableData () noexcept; + TableData () noexcept = default; explicit TableData (Arena* ar) noexcept; @@ -347,11 +347,11 @@ public: ~TableData () noexcept; - constexpr int dim () const noexcept { return N; } + [[nodiscard]] constexpr int dim () const noexcept { return N; } void resize (Array const& lo, Array const& hi, Arena* ar = nullptr); - Long size () const noexcept; + [[nodiscard]] Long size () const noexcept; Array const& lo () const noexcept { return m_lo; } @@ -376,9 +376,6 @@ private: bool m_ptr_owner = false; }; -template -TableData::TableData () noexcept {} - template TableData::TableData (Array const& lo, Array const& hi, Arena* ar) : DataAllocator{ar}, m_lo(lo), m_hi(hi) @@ -404,14 +401,17 @@ template TableData& TableData::operator= (TableData && rhs) noexcept { - m_arena = rhs.m_arena; - m_dptr = rhs.m_dptr; - m_lo = rhs.m_lo; - m_hi = rhs.m_hi; - m_truesize = rhs.m_truesize; - m_ptr_owner = rhs.m_ptr_owner; - rhs.m_dptr = nullptr; - rhs.m_ptr_owner = false; + if (this != &rhs) { + clear(); + m_arena = rhs.m_arena; + m_dptr = rhs.m_dptr; + m_lo = rhs.m_lo; + m_hi = rhs.m_hi; + m_truesize = rhs.m_truesize; + m_ptr_owner = rhs.m_ptr_owner; + rhs.m_dptr = nullptr; + rhs.m_ptr_owner = false; + } return *this; } diff --git a/Src/Base/AMReX_TagParallelFor.H b/Src/Base/AMReX_TagParallelFor.H index 5aa748a3d61..ee8d089ee73 100644 --- a/Src/Base/AMReX_TagParallelFor.H +++ b/Src/Base/AMReX_TagParallelFor.H @@ -7,6 +7,7 @@ #include #include #include +#include #include namespace amrex { @@ -74,10 +75,10 @@ struct Array4BoxValTag { template struct VectorTag { T* p; - int m_size; + Long m_size; [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - int size () const noexcept { return m_size; } + Long size () const noexcept { return m_size; } }; #ifdef AMREX_USE_GPU @@ -85,20 +86,20 @@ struct VectorTag { namespace detail { template -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::enable_if_t().box())>, Box>::value, - int> + Long> get_tag_size (T const& tag) noexcept { + AMREX_ASSERT(tag.box().numPts() < Long(std::numeric_limits::max())); return static_cast(tag.box().numPts()); } template -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::enable_if_t().size())> >::value, - int> + Long> get_tag_size (T const& tag) noexcept { + AMREX_ASSERT(tag.size() < Long(std::numeric_limits::max())); return tag.size(); } @@ -151,6 +152,7 @@ ParallelFor_doit (Vector const& tags, F && f) const int ntags = tags.size(); if (ntags == 0) { return; } + Long l_ntotwarps = 0; int ntotwarps = 0; Vector nwarps; nwarps.reserve(ntags+1); @@ -158,8 +160,9 @@ ParallelFor_doit (Vector const& tags, F && f) { auto& tag = tags[i]; nwarps.push_back(ntotwarps); - ntotwarps += static_cast((get_tag_size(tag) - + Gpu::Device::warp_size-1) / Gpu::Device::warp_size); + auto nw = (get_tag_size(tag) + Gpu::Device::warp_size-1) / 
diff --git a/Src/Base/AMReX_TinyProfiler.cpp b/Src/Base/AMReX_TinyProfiler.cpp
index 3d935589409..82f3718c735 100644
--- a/Src/Base/AMReX_TinyProfiler.cpp
+++ b/Src/Base/AMReX_TinyProfiler.cpp
@@ -93,6 +93,12 @@ TinyProfiler::start () noexcept
 #endif
 
     if (!regionstack.empty()) {
+#ifdef AMREX_USE_GPU
+        if (device_synchronize_around_region) {
+            amrex::Gpu::streamSynchronize();
+        }
+#endif
+
 #ifdef AMREX_USE_CUPTI
         if (uCUPTI) {
             cudaDeviceSynchronize();
@@ -111,12 +117,6 @@ TinyProfiler::start () noexcept
         in_parallel_region = false;
 #endif
 
-#ifdef AMREX_USE_GPU
-        if (device_synchronize_around_region) {
-            amrex::Gpu::streamSynchronize();
-        }
-#endif
-
 #ifdef AMREX_USE_CUDA
         nvtxRangePush(fname.c_str());
 #elif defined(AMREX_USE_HIP) && defined(AMREX_USE_ROCTX)
@@ -149,8 +149,14 @@ TinyProfiler::stop () noexcept
 #ifdef AMREX_USE_OMP
 #pragma omp master
 #endif
-    if (!stats.empty())
-    {
+    if (!stats.empty()) {
+
+#ifdef AMREX_USE_GPU
+        if (device_synchronize_around_region) {
+            amrex::Gpu::streamSynchronize();
+        }
+#endif
+
         double t;
         int nKernelCalls = 0;
 #ifdef AMREX_USE_CUPTI
@@ -207,12 +213,6 @@ TinyProfiler::stop () noexcept
             std::get<1>(parent) += dtin;
         }
 
-#ifdef AMREX_USE_GPU
-        if (device_synchronize_around_region) {
-            amrex::Gpu::streamSynchronize();
-        }
-#endif
-
 #ifdef AMREX_USE_CUDA
         nvtxRangePop();
 #elif defined(AMREX_USE_HIP) && defined(AMREX_USE_ROCTX)
@@ -242,8 +242,12 @@ TinyProfiler::stop (unsigned boxUintID) noexcept
 #ifdef AMREX_USE_OMP
 #pragma omp master
 #endif
-    if (!stats.empty())
-    {
+    if (!stats.empty()) {
+
+        if (device_synchronize_around_region) {
+            amrex::Gpu::streamSynchronize();
+        }
+
         double t;
         cudaDeviceSynchronize();
         cuptiActivityFlushAll(0);
@@ -293,10 +297,6 @@ TinyProfiler::stop (unsigned boxUintID) noexcept
             std::get<1>(parent) += dtin;
         }
 
-        if (device_synchronize_around_region) {
-            amrex::Gpu::streamSynchronize();
-        }
-
 #ifdef AMREX_USE_CUDA
         nvtxRangePop();
 #elif defined(AMREX_USE_HIP) && defined(AMREX_USE_ROCTX)
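[Editor's note: the TinyProfiler hunks above move the optional device synchronization so it happens before the timer starts or stops, rather than after the NVTX/rocTX range is pushed or popped; otherwise time spent waiting on still-running asynchronous kernels could be attributed to the wrong region. The synchronization is off by default and is typically enabled at run time via the tiny-profiler runtime parameter, e.g. in an inputs file:

    tiny_profiler.device_synchronize_around_region = 1

Regions themselves are created with the usual scoped macro, a sketch of which is:

    void advance ()
    {
        BL_PROFILE("advance()");   // starts a TinyProfiler region; stops at scope exit
        // ... launch GPU work ...
    }
]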
diff --git a/Src/Base/AMReX_Tuple.H b/Src/Base/AMReX_Tuple.H
index 5cdfbee634f..2aca55a0c6b 100644
--- a/Src/Base/AMReX_Tuple.H
+++ b/Src/Base/AMReX_Tuple.H
@@ -24,7 +24,7 @@ namespace detail {
 template <std::size_t I, typename T>
 struct gpu_tuple_element
 {
-    template <typename U=T, std::enable_if_t<std::is_default_constructible<U>::value,int> = 0>
+    template <typename U=T, std::enable_if_t<std::is_default_constructible_v<U>,int> = 0>
     AMREX_GPU_HOST_DEVICE
     constexpr gpu_tuple_element () {} // NOLINT
 
         : m_value(a_value)
     {}
 
-    template <typename U, std::enable_if_t<std::is_convertible<U,T>::value,int> = 0>
+    template <typename U, std::enable_if_t<std::is_convertible_v<U,T>,int> = 0>
     explicit constexpr gpu_tuple_element (U && a_value) // NOLINT(bugprone-forwarding-reference-overload)
         : m_value(std::forward<U>(a_value))
     {}
@@ -47,7 +47,7 @@ struct gpu_tuple_impl
     : public gpu_tuple_impl<I+1, Tail...>,
       public gpu_tuple_element<I, Head>
 {
-    template <typename U=Head, std::enable_if_t<std::is_default_constructible<U>::value,int> = 0>
+    template <typename U=Head, std::enable_if_t<std::is_default_constructible_v<U>,int> = 0>
     AMREX_GPU_HOST_DEVICE
     constexpr gpu_tuple_impl () {} // NOLINT
 
@@ -56,7 +56,7 @@ struct gpu_tuple_impl
           gpu_tuple_element<I, Head>(a_head)
     {}
 
-    template <typename UH, typename... UT, std::enable_if_t<std::is_convertible<UH,Head>::value,int> = 0>
+    template <typename UH, typename... UT, std::enable_if_t<std::is_convertible_v<UH,Head>,int> = 0>
     constexpr gpu_tuple_impl (UH&& a_head, UT &&... a_tail)
         : gpu_tuple_impl<I+1, Tail...>(std::forward<UT>(a_tail)...),
           gpu_tuple_element<I, Head>(std::forward<UH>(a_head))
@@ -68,7 +68,7 @@ struct gpu_tuple_impl
     : public gpu_tuple_element<I, Head>
 {
-    template <typename U=Head, std::enable_if_t<std::is_default_constructible<U>::value,int> = 0>
+    template <typename U=Head, std::enable_if_t<std::is_default_constructible_v<U>,int> = 0>
     AMREX_GPU_HOST_DEVICE
     constexpr gpu_tuple_impl () {} // NOLINT
 
@@ -76,7 +76,7 @@ struct gpu_tuple_impl
         : gpu_tuple_element<I, Head>(a_head)
     {}
 
-    template <typename U, std::enable_if_t<std::is_convertible<U,Head>::value,int> = 0>
+    template <typename U, std::enable_if_t<std::is_convertible_v<U,Head>,int> = 0>
     explicit constexpr gpu_tuple_impl (U&& a_head) // NOLINT(bugprone-forwarding-reference-overload)
         : gpu_tuple_element<I, Head>(std::forward<U>(a_head))
     {}
@@ -241,7 +241,7 @@ namespace detail {
     template <typename T> struct unwrap { using type = T; };
     template <typename T> struct unwrap<std::reference_wrapper<T> > { using type = T&; };
     template <typename T>
-    using tuple_decay_t = typename unwrap<typename std::decay<T>::type>::type;
+    using tuple_decay_t = typename unwrap<std::decay_t<T>>::type;
 }
 
 template <typename R, typename TP1, typename TP2, std::size_t... N1, std::size_t... N2>
 AMREX_GPU_HOST_DEVICE
 constexpr R
-make_tuple (TP1 && a, TP2 && b,
+make_tuple (TP1 const& a, TP2 const& b,
             std::index_sequence<N1...> const& /*n1*/, std::index_sequence<N2...> const& /*n2*/)
 {
     return R(amrex::get<N1>(a)..., amrex::get<N2>(b)...);
@@ -297,9 +297,9 @@ TupleCat (TP1 && a, TP2 && b)
     -> typename detail::tuple_cat_result<detail::tuple_decay_t<TP1>,
                                          detail::tuple_decay_t<TP2> >::type;
     return detail::make_tuple<R>
-        (a, b,
-         std::make_index_sequence<GpuTupleSize<typename std::decay<TP1>::type>::value>(),
-         std::make_index_sequence<GpuTupleSize<typename std::decay<TP2>::type>::value>());
+        (std::forward<TP1>(a), std::forward<TP2>(b),
+         std::make_index_sequence<GpuTupleSize<std::decay_t<TP1>>::value>(),
+         std::make_index_sequence<GpuTupleSize<std::decay_t<TP2>>::value>());
 }
 
@@ -345,7 +345,7 @@ namespace detail {
     apply_impl (F&& f, TP&& t, std::index_sequence<N...> /*is*/)
         -> typename detail::apply_result<F, detail::tuple_decay_t<TP> >::type
     {
-        return f(amrex::get<N>(std::forward<TP>(t))...);
+        return std::forward<F>(f)(amrex::get<N>(std::forward<TP>(t))...);
     }
 }
 
@@ -355,7 +355,7 @@ constexpr auto
 Apply (F&& f, TP&& t) -> typename detail::apply_result<F, detail::tuple_decay_t<TP> >::type
 {
     return detail::apply_impl(std::forward<F>(f), std::forward<TP>(t),
-                              std::make_index_sequence<GpuTupleSize<typename std::decay<TP>::type>::value>());
+                              std::make_index_sequence<GpuTupleSize<std::decay_t<TP>>::value>());
 }
 
 // Tie
 
@@ -378,6 +378,20 @@ ForwardAsTuple (Ts&&... args) noexcept
     return GpuTuple<Ts&&...>(std::forward<Ts>(args)...);
 }
 
+// MakeZeroTuple
+
+/**
+ * \brief Return a GpuTuple containing all zeros.
+ * Note that a default-constructed GpuTuple can have uninitialized values.
+ */
+template <typename... Ts>
+AMREX_GPU_HOST_DEVICE
+constexpr GpuTuple<Ts...>
+MakeZeroTuple (GpuTuple<Ts...>) noexcept
+{
+    return GpuTuple<Ts...>(static_cast<Ts>(0)...);
+}
+
 }
 
 #endif /*AMREX_TUPLE_H_*/
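[Editor's note on the new MakeZeroTuple above: the argument is used only to deduce the element types, so one typically passes a default-constructed GpuTuple and keeps the value-initialized result, e.g. as the identity value of a sum reduction. A minimal usage sketch:

    amrex::GpuTuple<amrex::Real,int> t =
        amrex::MakeZeroTuple(amrex::GpuTuple<amrex::Real,int>{});
    // amrex::get<0>(t) == 0.0 and amrex::get<1>(t) == 0, whereas the
    // elements of a default-constructed GpuTuple may be uninitialized.
]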
diff --git a/Src/Base/AMReX_TypeList.H b/Src/Base/AMReX_TypeList.H
index 3bd0a837069..75201087b7d 100644
--- a/Src/Base/AMReX_TypeList.H
+++ b/Src/Base/AMReX_TypeList.H
@@ -34,13 +34,13 @@ using TypeAt = typename detail::TypeListGet<I,T>::type;
 
 namespace detail {
     template <typename F, typename... Ts, std::size_t... N>
-    constexpr void for_each_impl (F&&f, std::index_sequence<N...>)
+    constexpr void for_each_impl (F const&f, std::index_sequence<N...>)
     {
         (f(TypeAt<N,TypeList<Ts...>>{}), ...);
     }
 
     template <typename F, typename... Ts, std::size_t... N>
-    constexpr bool for_each_until_impl (F&&f, std::index_sequence<N...>)
+    constexpr bool for_each_until_impl (F const&f, std::index_sequence<N...>)
     {
         return (f(TypeAt<N,TypeList<Ts...>>{}) || ...);
     }
@@ -93,7 +93,7 @@ ForEach (TypeList<Ts...>, F&& f)
     // dst and src are either MultiFab or fMultiFab
     auto tt = CartesianProduct(TypeList<MultiFab,fMultiFab>{}, TypeList<MultiFab,fMultiFab>{});
-    bool r = ForEachUtil(tt, [&] (auto t) -> bool
+    bool r = ForEachUntil(tt, [&] (auto t) -> bool
     {
         using MF0 = TypeAt<0,decltype(t)>;
         using MF1 = TypeAt<1,decltype(t)>;
@@ -151,6 +151,53 @@ constexpr auto CartesianProduct (Ls...) {
     return (TypeList<TypeList<>>{} * ... * Ls{});
 }
 
+namespace detail {
+    // return TypeList<T, T, T, ..., T> by using the fast power algorithm
+    template <class T, std::size_t N>
+    constexpr auto SingleTypeMultiplier_impl () {
+        if constexpr (N == 0) {
+            return TypeList<>{};
+        } else if constexpr (N == 1) {
+            return TypeList<T>{};
+        } else if constexpr (N % 2 == 0) {
+            return SingleTypeMultiplier_impl<T, N/2>() + SingleTypeMultiplier_impl<T, N/2>();
+        } else {
+            return SingleTypeMultiplier_impl<T, N-1>() + TypeList<T>{};
+        }
+    }
+
+    // overload of SingleTypeMultiplier for multiple types:
+    // convert T[N] to T, T, T, T, ... (N times with N >= 1)
+    template <class T, std::size_t N>
+    constexpr auto SingleTypeMultiplier (const T (&)[N]) {
+        return SingleTypeMultiplier_impl<T, N>();
+    }
+
+    // overload of SingleTypeMultiplier for one regular type
+    template <class T>
+    constexpr auto SingleTypeMultiplier (T) {
+        return TypeList<T>{};
+    }
+
+    // apply the types of the input TypeList as template arguments to TParam
+    template