diff --git a/.github/actions/fetch_bladebit_harvester.sh b/.github/actions/fetch_bladebit_harvester.sh new file mode 100755 index 000000000..d55d35a09 --- /dev/null +++ b/.github/actions/fetch_bladebit_harvester.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash +set -eo pipefail +_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +cd "$_dir/../.." + +## +# Usage: fetch_bladebit_harvester.sh +# +# Use gitbash or similar under Windows. +## +host_os=$1 +host_arch=$2 + +if [[ "${host_os}" != "linux" ]] && [[ "${host_os}" != "macos" ]] && [[ "${host_os}" != "windows" ]]; then + echo >&2 "Unknown OS '${host_os}'" + exit 1 +fi + +if [[ "${host_arch}" != "arm64" ]] && [[ "${host_arch}" != "x86-64" ]]; then + echo >&2 "Unknown Architecture '${host_arch}'" + exit 1 +fi + +## Change this if including a new bladebit release +artifact_ver="v3.1.0" +artifact_base_url="https://github.com/Chia-Network/bladebit/releases/download/${artifact_ver}" + +linux_arm_sha256="e8fc09bb5862ce3d029b78144ea46791afe14a2d640390605b6df28fb420e782" +linux_x86_sha256="e31e5226d1e4a399f4181bb2cca243d46218305a8b54912ef29c791022ac079d" +macos_arm_sha256="03958b94ad9d01de074b5a9a9d86a51bd2246c0eab5529c5886bb4bbc4168e0b" +macos_x86_sha256="14975aabfb3d906e22e04cd973be4265f9c5c61e67a92f890c8e51cf9edf0c87" +windows_sha256="ccf115ebec18413c3134f9ca37945f30f4f02d6766242c7e84a6df0d1d989a69" +## End changes + +artifact_ext="tar.gz" +sha_bin="sha256sum" +expected_sha256= + +if [[ "$OSTYPE" == "darwin"* ]]; then + sha_bin="shasum -a 256" +fi + +case "${host_os}" in +linux) + if [[ "${host_arch}" == "arm64" ]]; then + expected_sha256=$linux_arm_sha256 + else + expected_sha256=$linux_x86_sha256 + fi + ;; +macos) + if [[ "${host_arch}" == "arm64" ]]; then + expected_sha256=$macos_arm_sha256 + else + expected_sha256=$macos_x86_sha256 + fi + ;; +windows) + expected_sha256=$windows_sha256 + artifact_ext="zip" + ;; +*) + echo >&2 "Unexpected OS '${host_os}'" + exit 1 + ;; +esac + +# Download artifact 
+artifact_name="green_reaper.${artifact_ext}" +curl -L "${artifact_base_url}/green_reaper-${artifact_ver}-${host_os}-${host_arch}.${artifact_ext}" >"${artifact_name}" + +# Validate sha256, if one was given +if [ -n "${expected_sha256}" ]; then + gr_sha256="$(${sha_bin} ${artifact_name} | cut -d' ' -f1)" + + if [[ "${gr_sha256}" != "${expected_sha256}" ]]; then + echo >&2 "GreenReaper SHA256 mismatch!" + echo >&2 " Got : '${gr_sha256}'" + echo >&2 " Expected: '${expected_sha256}'" + exit 1 + fi +fi + +# Unpack artifact +dst_dir="libs/green_reaper" +mkdir -p "${dst_dir}" +if [[ "${artifact_ext}" == "zip" ]]; then + unzip -d "${dst_dir}" "${artifact_name}" +else + pushd "${dst_dir}" + tar -xzvf "../../${artifact_name}" + popd +fi diff --git a/.github/workflows/build-test-cplusplus.yml b/.github/workflows/build-test-cplusplus.yml index 6794730fc..07b590ebf 100644 --- a/.github/workflows/build-test-cplusplus.yml +++ b/.github/workflows/build-test-cplusplus.yml @@ -1,6 +1,14 @@ name: Build and Test C++ -on: [push, pull_request] +on: + push: + branches: + - main + release: + types: [published] + pull_request: + branches: + - '**' concurrency: group: ${{ github.ref }}-${{ github.workflow }}-${{ github.event_name }}--${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release/') || startsWith(github.ref, 'refs/heads/long_lived/')) && github.sha || '' }} @@ -9,10 +17,10 @@ concurrency: jobs: valgrind: name: valgrind ubuntu - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: cmake, RunTests, and valgrind on ubuntu-20.04 run: | @@ -27,31 +35,25 @@ jobs: asan: name: ASAN ubuntu - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: cmake, RunTests with address- and undefined sanitizer on Ubuntu run: | - sudo fallocate -l 16G /swapfile - sudo chmod 600 /swapfile - sudo mkswap 
/swapfile - sudo swapon /swapfile - swapon -s mkdir build-asan cd build-asan cmake -DCMAKE_BUILD_TYPE=ASAN ../ cmake --build . -- -j 6 - swapon -s ./RunTests tsan: name: TSAN ubuntu - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: cmake, RunTests with thread sanitizer on Ubuntu run: | @@ -59,14 +61,29 @@ jobs: cd build-tsan cmake -DCMAKE_BUILD_TYPE=TSAN ../ cmake --build . -- -j 6 - TSAN_OPTIONS="memory_limit_mb=6000" ./RunTests + ./RunTests + + mac: + name: MacOS + runs-on: macos-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: cmake, RunTests on Mac + run: | + mkdir build + cd build + cmake .. + cmake --build . --config Release -j 6 + ./RunTests windows: name: Windows Latest runs-on: windows-latest steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: cmake, RunTests with Windows run: | diff --git a/.github/workflows/build-test-riscv64.yml b/.github/workflows/build-test-riscv64.yml new file mode 100644 index 000000000..85bb5acb7 --- /dev/null +++ b/.github/workflows/build-test-riscv64.yml @@ -0,0 +1,126 @@ +name: Build and test riscv64 on ubuntu-latest + +on: + push: + branches: + - main + - dev + tags: + - "**" + pull_request: + branches: + - "**" + +jobs: + build_wheels: + timeout-minutes: 60 + name: ${{ matrix.os }} 📦 Build ${{ matrix.python.major-dot-minor }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python: + - major-dot-minor: "3.8" + matrix: "3.8" + - major-dot-minor: "3.9" + matrix: "3.9" + - major-dot-minor: "3.10" + matrix: "3.10" + - major-dot-minor: "3.11" + matrix: "3.11" + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: Set up QEMU on x86_64 + if: startsWith(matrix.os, 'ubuntu-latest') + id: qemu + uses: docker/setup-qemu-action@v2 + with: + platforms: riscv64 + # image: 
tonistiigi/binfmt:latest + + - name: Build and Test + run: | + docker run --rm --platform linux/riscv64 \ + -v ${{ github.workspace }}:/ws --workdir=/ws \ + chianetwork/ubuntu-22.04-risc-builder:latest \ + bash -exc '\ + pyenv global ${{ matrix.python.matrix }} && \ + cmake --version && \ + uname -a && \ + export CP_USE_GREEN_REAPER=0 && \ + pip wheel -w dist . && \ + python3 -m venv venv && \ + ./venv/bin/python -m pip install dist/*.whl && \ + ./venv/bin/python -m pip install pytest && \ + ./venv/bin/python -m pytest -k "not k_21" -v tests/ + ' + + - name: Upload artifacts + uses: actions/upload-artifact@v3 + with: + name: packages + path: ./dist + if-no-files-found: error + upload: + name: Upload to Chia PyPI + runs-on: ubuntu-latest + needs: + - build_wheels + steps: + - name: Checkout code + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set Env + uses: Chia-Network/actions/setjobenv@main + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Download artifacts + if: env.RELEASE == 'true' + uses: actions/download-artifact@v3 + with: + name: packages + path: ./dist + + - name: Configure AWS credentials + if: env.RELEASE == 'true' + uses: aws-actions/configure-aws-credentials@v2 + with: + role-to-assume: arn:aws:iam::${{ secrets.CHIA_AWS_ACCOUNT_ID }}:role/installer-upload + aws-region: us-west-2 + + - name: List existing wheels + if: env.RELEASE == 'true' + shell: sh + run: | + aws s3 ls s3://download.chia.net/simple/chiapos/ > existing_wheel_list_raw + cat existing_wheel_list_raw + cat existing_wheel_list_raw | tr -s ' ' | cut -d ' ' -f 4 > existing_wheel_list + + - name: List new wheels + if: env.RELEASE == 'true' + shell: sh + run: | + (cd dist/; ls chiapos-*.whl) > new_wheel_list + cat new_wheel_list | xargs -I % sh -c 'ls -l dist/%' + + - name: Choose wheels to upload + if: env.RELEASE == 'true' + shell: sh + run: | + grep -F -x -v -f existing_wheel_list new_wheel_list > upload_wheel_list + cat upload_wheel_list + + - 
name: Upload wheels + if: env.RELEASE == 'true' + shell: sh + run: | + cat upload_wheel_list | xargs -I % sh -c 'aws s3 cp dist/% s3://download.chia.net/simple/chiapos/' diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 8fbdfee8e..74ae4ff65 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -4,8 +4,8 @@ on: push: branches: - main - tags: - - '**' + release: + types: [published] pull_request: branches: - '**' @@ -15,10 +15,15 @@ concurrency: group: ${{ github.ref }}-${{ github.workflow }}-${{ github.event_name }}-${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release/') || startsWith(github.ref, 'refs/heads/long_lived/')) && github.sha || '' }} cancel-in-progress: true +permissions: + contents: read + id-token: write + jobs: build-wheels: name: Wheel - ${{ matrix.os.name }} ${{ matrix.python.major-dot-minor }} ${{ matrix.arch.name }} runs-on: ${{ matrix.os.runs-on[matrix.arch.matrix] }} + continue-on-error: true strategy: fail-fast: false matrix: @@ -41,36 +46,36 @@ jobs: runs-on: intel: [windows-latest] python: - - major-dot-minor: '3.7' - cibw-build: 'cp37-*' - manylinux: - arch: manylinux2014 - intel: manylinux2010 - matrix: '3.7' - major-dot-minor: '3.8' cibw-build: 'cp38-*' manylinux: - arch: manylinux2014 - intel: manylinux2010 + arm: manylinux2014 + intel: manylinux2014 matrix: '3.8' - major-dot-minor: '3.9' cibw-build: 'cp39-*' manylinux: - arch: manylinux2014 - intel: manylinux2010 + arm: manylinux2014 + intel: manylinux2014 matrix: '3.9' - major-dot-minor: '3.10' cibw-build: 'cp310-*' manylinux: - arch: manylinux2014 - intel: manylinux2010 + arm: manylinux2014 + intel: manylinux2014 matrix: '3.10' - major-dot-minor: '3.11' cibw-build: 'cp311-*' manylinux: - arch: manylinux2014 + arm: manylinux2014 intel: manylinux2014 matrix: '3.11' + - major-dot-minor: '3.12' + cibw-build: 'cp312-*' + manylinux: + arm: manylinux2014 + intel: manylinux2014 + 
matrix: '3.12' arch: - name: ARM matrix: arm @@ -96,9 +101,9 @@ jobs: arm: [macOS, ARM64] intel: [macos-latest] python: - major-dot-minor: '3.7' - cibw-build: 'cp37-*' - matrix: '3.7' + major-dot-minor: '3.8' + cibw-build: 'cp38-*' + matrix: '3.8' arch: name: ARM matrix: arm @@ -112,6 +117,12 @@ jobs: with: fetch-depth: 0 + - name: Set Env + if: env.RUNNER_ARCH != 'ARM64' + uses: Chia-Network/actions/setjobenv@main + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: Chia-Network/actions/setup-python@main with: python-version: ${{ matrix.python.major-dot-minor }} @@ -120,36 +131,23 @@ jobs: run: | pip install pipx + - name: Get Windows Bladebit Harvester Artifact + if: runner.os == 'Windows' + shell: bash + run: | + set -eo pipefail + set -x + .github/actions/fetch_bladebit_harvester.sh windows x86-64 + - name: Build and test env: CIBW_PRERELEASE_PYTHONS: True - CIBW_BUILD_VERBOSITY_MACOS: 0 - CIBW_BUILD_VERBOSITY_LINUX: 0 - CIBW_BUILD_VERBOSITY_WINDOWS: 0 CIBW_BUILD: ${{ matrix.python.cibw-build }} - CIBW_SKIP: '*-manylinux_i686 *-win32 *-musllinux_*' CIBW_MANYLINUX_AARCH64_IMAGE: ${{ matrix.python.manylinux['arm'] }} CIBW_MANYLINUX_X86_64_IMAGE: ${{ matrix.python.manylinux['intel'] }} - CIBW_ENVIRONMENT_LINUX: "PATH=/project/cmake-3.17.3-Linux-`uname -m`/bin:$PATH" - CIBW_BEFORE_ALL_LINUX: > - curl -L https://github.com/Kitware/CMake/releases/download/v3.17.3/cmake-3.17.3-Linux-`uname -m`.sh > cmake.sh - && yes | sh cmake.sh | cat - && rm -f /usr/bin/cmake - && which cmake - && cmake --version - && uname -a - CIBW_BEFORE_BUILD_LINUX: > - python -m pip install --upgrade pip CIBW_ARCHS_MACOS: ${{ matrix.os.cibw-archs-macos[matrix.arch.matrix] }} - CIBW_BEFORE_ALL_MACOS: > - brew install gmp boost cmake - CIBW_BEFORE_BUILD_MACOS: > - python -m pip install --upgrade pip - CIBW_ENVIRONMENT_MACOS: "MACOSX_DEPLOYMENT_TARGET=10.14" - CIBW_TEST_REQUIRES: pytest - CIBW_TEST_COMMAND: py.test -v {project}/tests run: - pipx run --spec='cibuildwheel==2.9.0' cibuildwheel 
--output-dir dist 2>&1 + pipx run --spec='cibuildwheel==2.16.2' cibuildwheel --output-dir dist 2>&1 - name: Upload artifacts uses: actions/upload-artifact@v3 @@ -274,6 +272,11 @@ jobs: with: fetch-depth: 0 + - name: Set Env + uses: Chia-Network/actions/setjobenv@main + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: Chia-Network/actions/setup-python@main with: python-version: ${{ matrix.python.major-dot-minor }} @@ -284,32 +287,17 @@ jobs: name: packages path: ./dist - - name: Test for secrets access - id: check_secrets - shell: bash - run: | - unset HAS_SECRET - if [ -n "$SECRET" ]; then HAS_SECRET='true' ; fi - echo "HAS_SECRET=${HAS_SECRET}" >>$GITHUB_OUTPUT - env: - SECRET: "${{ secrets.test_pypi_password }}" - - - name: Install twine - run: pip install twine - - name: Publish distribution to PyPI - if: startsWith(github.event.ref, 'refs/tags') && steps.check_secrets.outputs.HAS_SECRET - env: - TWINE_USERNAME: __token__ - TWINE_NON_INTERACTIVE: 1 - TWINE_PASSWORD: ${{ secrets.pypi_password }} - run: twine upload --non-interactive --skip-existing --verbose 'dist/*' + if: env.RELEASE == 'true' + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: dist/ + skip-existing: true - name: Publish distribution to Test PyPI - if: steps.check_secrets.outputs.HAS_SECRET - env: - TWINE_REPOSITORY_URL: https://test.pypi.org/legacy/ - TWINE_USERNAME: __token__ - TWINE_NON_INTERACTIVE: 1 - TWINE_PASSWORD: ${{ secrets.test_pypi_password }} - run: twine upload --non-interactive --skip-existing --verbose 'dist/*' + if: env.PRE_RELEASE == 'true' + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ + packages-dir: dist/ + skip-existing: true diff --git a/.github/workflows/plot-k27-no-bitfield.yaml b/.github/workflows/plot-k27-no-bitfield.yaml index 133b97c8b..943da131c 100644 --- a/.github/workflows/plot-k27-no-bitfield.yaml +++ b/.github/workflows/plot-k27-no-bitfield.yaml @@ -1,6 +1,14 @@ name: Plot k=27 
without bitfield -on: [push, pull_request] +on: + push: + branches: + - main + release: + types: [published] + pull_request: + branches: + - '**' concurrency: group: ${{ github.ref }}-${{ github.workflow }}-${{ github.event_name }}--${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release/') || startsWith(github.ref, 'refs/heads/long_lived/')) && github.sha || '' }} diff --git a/.github/workflows/plot-k27.yaml b/.github/workflows/plot-k27.yaml index 5b81906a8..bdaffdae8 100644 --- a/.github/workflows/plot-k27.yaml +++ b/.github/workflows/plot-k27.yaml @@ -1,6 +1,14 @@ name: Plot k=27 -on: [push, pull_request] +on: + push: + branches: + - main + release: + types: [published] + pull_request: + branches: + - '**' concurrency: group: ${{ github.ref }}-${{ github.workflow }}-${{ github.event_name }}--${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release/') || startsWith(github.ref, 'refs/heads/long_lived/')) && github.sha || '' }} diff --git a/.gitignore b/.gitignore index 8ba5e60ea..956d66bbf 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.idea/ ProofOfSpace RunTests HellmanAttacks @@ -25,3 +26,9 @@ build .mypy_cache *.whl venv +build-tsan +build-* +cmake-build* +*.zip +*.tar.gz +libs/ \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index a01abc8f0..77a48219c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,7 @@ else() FetchContent_Declare( pybind11-src GIT_REPOSITORY https://github.com/pybind/pybind11.git - GIT_TAG v2.10.0 + GIT_TAG v2.11.1 ) FetchContent_MakeAvailable(pybind11-src) endif() @@ -30,14 +30,33 @@ endif() FetchContent_Declare( cxxopts GIT_REPOSITORY https://github.com/jarro2783/cxxopts.git - GIT_TAG v2.2.1 + GIT_TAG v3.1.1 ) FetchContent_MakeAvailable(cxxopts) +option(CP_LINK_BLADEBIT_HARVESTER "Links libbladebit_harvester at build time instead of dynamically loading it." 
OFF) +option(CP_BUILD_BLADEBIT_HARVESTER "Pulls bladebit harvester target from git and builds it as a dependency.") + +if (${CP_BUILD_BLADEBIT_HARVESTER} AND NOT ${CP_LINK_BLADEBIT_HARVESTER}) + set(CP_LINK_BLADEBIT_HARVESTER ON) +endif() + +if (${CP_BUILD_BLADEBIT_HARVESTER}) + FetchContent_Declare( + bladebit + GIT_REPOSITORY https://github.com/Chia-Network/bladebit.git + GIT_TAG cuda-compression + ) + + set(BB_HARVESTER_ONLY ON) + set(BB_HARVESTER_STATIC ON) + FetchContent_MakeAvailable(bladebit) +endif() + FetchContent_Declare( gulrak GIT_REPOSITORY https://github.com/gulrak/filesystem.git - GIT_TAG v1.5.6 + GIT_TAG v1.5.14 ) FetchContent_MakeAvailable(gulrak) @@ -57,7 +76,11 @@ include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../lib/FiniteStateEntropy/lib ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/test - ) +) + +IF (${CP_LINK_BLADEBIT_HARVESTER}) + message ("Bladebit Harvesting Enabled") +ENDIF () add_library(fse ${FSE_FILES}) @@ -99,47 +122,35 @@ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -fno-omit-frame-pointer -fsanitize=thre set (CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} -fno-omit-frame-pointer -fsanitize=thread") ENDIF() -IF (APPLE) -# on macOS "uname -m" returns the architecture (x86_64 or arm64) -execute_process( - COMMAND uname -m - RESULT_VARIABLE result - OUTPUT_VARIABLE OSX_NATIVE_ARCHITECTURE - OUTPUT_STRIP_TRAILING_WHITESPACE) -ENDIF() - -IF (WIN32) -set(BLAKE3_SRC - src/b3/blake3.c - src/b3/blake3_portable.c - src/b3/blake3_dispatch.c - src/b3/blake3_avx2.c - src/b3/blake3_avx512.c - src/b3/blake3_sse41.c -) -ELSEIF(OSX_NATIVE_ARCHITECTURE STREQUAL "arm64") -set(BLAKE3_SRC - src/b3/blake3.c - src/b3/blake3_portable.c - src/b3/blake3_dispatch.c +pybind11_add_module(chiapos ${CMAKE_CURRENT_SOURCE_DIR}/python-bindings/chiapos.cpp src/chacha8.c) +add_executable(ProofOfSpace + src/cli.cpp + src/chacha8.c ) -ELSE() -set(BLAKE3_SRC - src/b3/blake3.c - src/b3/blake3_portable.c - src/b3/blake3_dispatch.c - src/b3/blake3_avx2_x86-64_unix.S - 
src/b3/blake3_avx512_x86-64_unix.S - src/b3/blake3_sse41_x86-64_unix.S + +FetchContent_Declare( + blake3 + GIT_REPOSITORY https://github.com/BLAKE3-team/BLAKE3.git + GIT_TAG 1.5.0 ) -ENDIF() -pybind11_add_module(chiapos ${CMAKE_CURRENT_SOURCE_DIR}/python-bindings/chiapos.cpp src/chacha8.c ${BLAKE3_SRC}) +FetchContent_GetProperties(blake3) +if(NOT blake3_POPULATED) + FetchContent_Populate(blake3) -add_executable(ProofOfSpace - src/cli.cpp - src/chacha8.c - ${BLAKE3_SRC} + # Set BLAKE3 to build as a static library + set(BUILD_SHARED_LIBS FALSE CACHE BOOL "Build static libraries" FORCE) + + add_subdirectory(${blake3_SOURCE_DIR}/c ${blake3_BINARY_DIR}) +endif() + +set(BLAKE3_SRC ${blake3_SOURCE_DIR}/c) +set(BLAKE3_INCLUDE_DIR ${blake3_SOURCE_DIR}/c) +target_link_libraries(chiapos PRIVATE blake3) +target_link_libraries(ProofOfSpace PRIVATE blake3) +include_directories( + ${INCLUDE_DIRECTORIES} + ${BLAKE3_INCLUDE_DIR} ) option(BUILD_PROOF_OF_SPACE_STATICALLY "Build ProofOfSpace target statically" OFF) @@ -151,21 +162,21 @@ ENDIF() FetchContent_Declare( Catch2 GIT_REPOSITORY https://github.com/catchorg/Catch2.git - GIT_TAG v3.2.1 + GIT_TAG v3.5.2 ) FetchContent_MakeAvailable(Catch2) add_executable(RunTests tests/test.cpp src/chacha8.c - ${BLAKE3_SRC} ) target_link_libraries(RunTests - PRIVATE + PRIVATE fse Threads::Threads Catch2::Catch2 + blake3 ) find_package(Threads REQUIRED) @@ -175,26 +186,69 @@ target_include_directories(uint128 PUBLIC uint128_t) target_compile_features(fse PUBLIC cxx_std_17) target_compile_features(chiapos PUBLIC cxx_std_17) -target_compile_features(RunTests PUBLIC cxx_std_17) - -if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") - target_link_libraries(chiapos PRIVATE fse Threads::Threads) - target_link_libraries(ProofOfSpace fse Threads::Threads) -elseif (${CMAKE_SYSTEM_NAME} MATCHES "OpenBSD") - target_link_libraries(chiapos PRIVATE fse Threads::Threads) - target_link_libraries(ProofOfSpace fse Threads::Threads) -elseif (${CMAKE_SYSTEM_NAME} MATCHES 
"FreeBSD") - target_link_libraries(chiapos PRIVATE fse Threads::Threads) - target_link_libraries(ProofOfSpace fse Threads::Threads) -elseif (MSVC) - target_link_libraries(chiapos PRIVATE fse Threads::Threads uint128) - target_link_libraries(ProofOfSpace fse Threads::Threads uint128) - target_link_libraries(RunTests PRIVATE uint128) -else() - target_link_libraries(chiapos PRIVATE fse stdc++fs Threads::Threads) - target_link_libraries(ProofOfSpace fse stdc++fs Threads::Threads) - target_link_libraries(RunTests PRIVATE stdc++fs) +# target_compile_features(RunTests PUBLIC cxx_std_17) + +target_link_libraries(chiapos PRIVATE fse Threads::Threads + $<$:uint128> + $<$>:stdc++fs> +) +target_link_libraries(ProofOfSpace PRIVATE fse Threads::Threads + $<$:uint128> + $<$>:stdc++fs> +) +target_link_libraries(RunTests PRIVATE fse Threads::Threads Catch2::Catch2WithMain + $<$:uint128> + $<$>:stdc++fs> +) + +if (${CP_LINK_BLADEBIT_HARVESTER}) + + set(bb_defs + USE_GREEN_REAPER=1 + BLADEBIT_HARVESTER_LINKED=1 + $<$:BLADEBIT_IS_PROJECT_DEPENDENCY=1> + ) + set(bb_libs + bladebit_harvester + $<$>:dl> + ) + + include_directories( + ${INCLUDE_DIRECTORIES} + ${CMAKE_CURRENT_SOURCE_DIR}/libs/green_reaper/include + ) + + link_directories( + ${LINK_DIRECTORIES} + ${CMAKE_SOURCE_DIR}/libs/green_reaper/lib + ) + + target_compile_definitions(chiapos PUBLIC ${bb_defs}) + target_compile_definitions(ProofOfSpace PUBLIC ${bb_defs}) + target_compile_definitions(RunTests PUBLIC ${bb_defs}) + + target_link_libraries(chiapos PUBLIC ${bb_libs}) + target_link_libraries(ProofOfSpace PUBLIC ${bb_libs}) + target_link_libraries(RunTests PUBLIC ${bb_libs}) + + target_link_directories(chiapos PUBLIC ${CMAKE_SOURCE_DIR}/libs/green_reaper/lib) + target_link_directories(ProofOfSpace PUBLIC ${CMAKE_SOURCE_DIR}/libs/green_reaper/lib) + target_link_directories(RunTests PUBLIC ${CMAKE_SOURCE_DIR}/libs/green_reaper/lib) + + set_property(TARGET chiapos APPEND PROPERTY BUILD_RPATH "$ORIGIN") + set_property(TARGET 
ProofOfSpace APPEND PROPERTY BUILD_RPATH "$ORIGIN") + set_property(TARGET RunTests APPEND PROPERTY BUILD_RPATH "$ORIGIN") + + if (WIN32) + add_custom_command(TARGET chiapos POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${CMAKE_SOURCE_DIR}/libs/green_reaper/lib/bladebit_harvester.dll" + "$/bladebit_harvester.dll" + ) + message("The bladebit dll was copied to: $/bladebit_harvester.dll") + endif() endif() + enable_testing() add_test(NAME RunTests COMMAND RunTests) diff --git a/pyproject.toml b/pyproject.toml index 7c1146b6b..966ac339d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,3 +4,40 @@ build-backend = "setuptools.build_meta" [tool.setuptools_scm] local_scheme = "no-local-version" + +[tool.cibuildwheel] +test-requires = "pytest" +test-command = "pytest -v {project}/tests" +skip = "*-manylinux_i686 *-win32 *-musllinux_*" + +[tool.cibuildwheel.linux] +build-verbosity = 1 +environment = {CP_USE_GREEN_REAPER=1} +before-all = ''' +set -eo pipefail && set -x && set -eo pipefail && ARCH=$(uname -m) +if [[ $ARCH == x86_64 ]]; then + .github/actions/fetch_bladebit_harvester.sh linux x86-64 +else + .github/actions/fetch_bladebit_harvester.sh linux arm64 +fi +''' +before-build = "python -m pip install --upgrade pip" + +[tool.cibuildwheel.macos] +build-verbosity = 1 +before-all = ''' +brew install gmp boost cmake +set -eo pipefail +ARCH=$(uname -m) +if [[ $ARCH == x86_64 ]]; then + .github/actions/fetch_bladebit_harvester.sh macos x86-64 +else + .github/actions/fetch_bladebit_harvester.sh macos arm64 +fi +''' +before-build = "python -m pip install --upgrade pip" +environment = {MACOSX_DEPLOYMENT_TARGET="11", SYSTEM_VERSION_COMPAT=0, CP_USE_GREEN_REAPER=1} + +[tool.cibuildwheel.windows] +build-verbosity = 1 +environment = "CP_USE_GREEN_REAPER=1" diff --git a/python-bindings/chiapos.cpp b/python-bindings/chiapos.cpp index 42431aa54..3a1a95a21 100644 --- a/python-bindings/chiapos.cpp +++ b/python-bindings/chiapos.cpp @@ -105,6 +105,7 @@ 
PYBIND11_MODULE(chiapos, m) return py::bytes(reinterpret_cast(id.data()), id.size()); }) .def("get_size", [](DiskProver &dp) { return dp.GetSize(); }) + .def("get_compression_level", [](DiskProver &dp) { return dp.GetCompressionLevel(); }) .def("get_filename", [](DiskProver &dp) { return dp.GetFilename(); }) .def( "get_qualities_for_challenge", @@ -179,6 +180,10 @@ PYBIND11_MODULE(chiapos, m) delete[] quality_buf; return stdx::optional(quality_py); }); + + py::class_(m, "ContextQueue") + .def("init", &ContextQueue::init); + m.attr("decompressor_context_queue") = &decompressor_context_queue; } #endif // PYTHON_BINDINGS_PYTHON_BINDINGS_HPP_ diff --git a/setup.py b/setup.py index 44d097dff..59689f22e 100644 --- a/setup.py +++ b/setup.py @@ -1,13 +1,11 @@ #!/usr/bin/python3 import os -import re import sys import platform import subprocess -from setuptools import setup, setuptools, Extension +from setuptools import setup, Extension from setuptools.command.build_ext import build_ext -from distutils.version import LooseVersion class CMakeExtension(Extension): @@ -19,7 +17,7 @@ def __init__(self, name, sourcedir=""): class CMakeBuild(build_ext): def run(self): try: - out = subprocess.check_output(["cmake", "--version"]) + subprocess.check_output(["cmake", "--version"]) except OSError: raise RuntimeError( "CMake must be installed to build" @@ -27,13 +25,6 @@ def run(self): + ", ".join(e.name for e in self.extensions) ) - if platform.system() == "Windows": - cmake_version = LooseVersion( - re.search(r"version\s*([\d.]+)", out.decode()).group(1) - ) - if cmake_version < "3.1.0": - raise RuntimeError("CMake >= 3.1.0 is required on Windows") - for ext in self.extensions: self.build_extension(ext) @@ -44,6 +35,9 @@ def build_extension(self, ext): "-DPYTHON_EXECUTABLE=" + sys.executable, ] + if os.getenv("CP_USE_GREEN_REAPER") == "1": + cmake_args.append("-DCP_LINK_BLADEBIT_HARVESTER=ON") + cfg = "Debug" if self.debug else "Release" build_args = ["--config", cfg] @@ -72,147 
+66,18 @@ def build_extension(self, ext): ) -class get_pybind_include(object): - """Helper class to determine the pybind11 include path - - The purpose of this class is to postpone importing pybind11 - until it is actually installed, so that the ``get_include()`` - method can be invoked.""" - - def __init__(self, user=False): - self.user = user - - def __str__(self): - import pybind11 - - return pybind11.get_include(self.user) - - -ext_modules = [ - Extension( - "chiapos", - [ - "lib/FiniteStateEntropy/lib/entropy_common.c", - "lib/FiniteStateEntropy/lib/fse_compress.c", - "lib/FiniteStateEntropy/lib/fse_decompress.c", - "lib/FiniteStateEntropy/lib/hist.c", - "python-bindings/chiapos.cpp", - "uint128_t/uint128_t.cpp", - "src/b3/blake3.c", - "src/b3/blake3_portable.c", - "src/b3/blake3_dispatch.c", - "src/b3/blake3_avx2.c", - "src/b3/blake3_avx512.c", - "src/b3/blake3_sse41.c", - "src/chacha8.c", - ], - include_dirs=[ - # Path to pybind11 headers - get_pybind_include(), - get_pybind_include(user=True), - "src", - "uint128_t", - ".", - ], - ), -] - - -# As of Python 3.6, CCompiler has a `has_flag` method. -# cf http://bugs.python.org/issue26689 -def has_flag(compiler, flagname): - """Return a boolean indicating whether a flag name is supported on - the specified compiler. - """ - import tempfile - - with tempfile.NamedTemporaryFile("w", suffix=".cpp") as f: - f.write("int main (int argc, char **argv) { return 0; }") - try: - compiler.compile([f.name], extra_postargs=[flagname]) - except setuptools.distutils.errors.CompileError: - return False - return True - - -def cpp_flag(compiler): - """Return the -std=c++[11/14/17] compiler flag. - - The newer version is prefered over c++11 (when it is available). 
- """ - flags = ["-std=c++17", "-std=c++14", "-std=c++11"] - - for flag in flags: - if has_flag(compiler, flag): - return flag - - raise RuntimeError("Unsupported compiler -- at least C++11 support " "is needed!") - - -class BuildExt(build_ext): - """A custom build extension for adding compiler-specific options.""" - - c_opts = { - "msvc": ["/EHsc", "/std:c++17", "/O2"], - "unix": [""], - } - l_opts = { - "msvc": [], - "unix": [""], - } - - if sys.platform == "darwin": - darwin_opts = ["-stdlib=libc++", "-mmacosx-version-min=10.14"] - c_opts["unix"] += darwin_opts - l_opts["unix"] += darwin_opts # type: ignore - - def build_extensions(self): - ct = self.compiler.compiler_type - opts = self.c_opts.get(ct, []) - link_opts = self.l_opts.get(ct, []) - if ct == "unix": - opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version()) - opts.append(cpp_flag(self.compiler)) - if has_flag(self.compiler, "-fvisibility=hidden"): - opts.append("-fvisibility=hidden") - elif ct == "msvc": - opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) - for ext in self.extensions: - ext.extra_compile_args = opts - ext.extra_link_args = link_opts - build_ext.build_extensions(self) - - -if platform.system() == "Windows": - setup( - name="chiapos", - author="Mariano Sorgente", - author_email="mariano@chia.net", - description="Chia proof of space plotting, proving, and verifying (wraps C++)", - license="Apache License", - python_requires=">=3.7", - long_description=open("README.md").read(), - long_description_content_type="text/markdown", - url="https://github.com/Chia-Network/chiapos", - setup_requires=["pybind11>=2.10.0"], - tests_require=["pytest"], - ext_modules=ext_modules, - cmdclass={"build_ext": BuildExt}, - zip_safe=False, - ) -else: - setup( - name="chiapos", - author="Mariano Sorgente", - author_email="mariano@chia.net", - description="Chia proof of space plotting, proving, and verifying (wraps C++)", - license="Apache License", - python_requires=">=3.7", 
- long_description=open("README.md").read(), - long_description_content_type="text/markdown", - url="https://github.com/Chia-Network/chiapos", - tests_require=["pytest"], - ext_modules=[CMakeExtension("chiapos", ".")], - cmdclass=dict(build_ext=CMakeBuild), - zip_safe=False, - ) +setup( + name="chiapos", + author="Mariano Sorgente", + author_email="mariano@chia.net", + description="Chia proof of space plotting, proving, and verifying (wraps C++)", + license="Apache License", + python_requires=">=3.7", + long_description=open("README.md").read(), + long_description_content_type="text/markdown", + url="https://github.com/Chia-Network/chiapos", + tests_require=["pytest"], + ext_modules=[CMakeExtension("chiapos", ".")], + cmdclass=dict(build_ext=CMakeBuild), + zip_safe=False, +) diff --git a/src/b3/blake3.c b/src/b3/blake3.c deleted file mode 100644 index 0acefbade..000000000 --- a/src/b3/blake3.c +++ /dev/null @@ -1,598 +0,0 @@ -#include -#include -#include - -#include "blake3.h" -#include "blake3_impl.h" - -INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8], - uint8_t flags) { - memcpy(self->cv, key, BLAKE3_KEY_LEN); - self->chunk_counter = 0; - memset(self->buf, 0, BLAKE3_BLOCK_LEN); - self->buf_len = 0; - self->blocks_compressed = 0; - self->flags = flags; -} - -INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], - uint64_t chunk_counter) { - memcpy(self->cv, key, BLAKE3_KEY_LEN); - self->chunk_counter = chunk_counter; - self->blocks_compressed = 0; - memset(self->buf, 0, BLAKE3_BLOCK_LEN); - self->buf_len = 0; -} - -INLINE size_t chunk_state_len(const blake3_chunk_state *self) { - return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + - ((size_t)self->buf_len); -} - -INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, - const uint8_t *input, size_t input_len) { - size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); - if (take > input_len) { - take = input_len; - } - uint8_t *dest = self->buf 
+ ((size_t)self->buf_len); - memcpy(dest, input, take); - self->buf_len += (uint8_t)take; - return take; -} - -INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { - if (self->blocks_compressed == 0) { - return CHUNK_START; - } else { - return 0; - } -} - -typedef struct { - uint32_t input_cv[8]; - uint64_t counter; - uint8_t block[BLAKE3_BLOCK_LEN]; - uint8_t block_len; - uint8_t flags; -} output_t; - -INLINE output_t make_output(const uint32_t input_cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - output_t ret; - memcpy(ret.input_cv, input_cv, 32); - memcpy(ret.block, block, BLAKE3_BLOCK_LEN); - ret.block_len = block_len; - ret.counter = counter; - ret.flags = flags; - return ret; -} - -// Chaining values within a given chunk (specifically the compress_in_place -// interface) are represented as words. This avoids unnecessary bytes<->words -// conversion overhead in the portable implementation. However, the hash_many -// interface handles both user input and parent node blocks, so it accepts -// bytes. For that reason, chaining values in the CV stack are represented as -// bytes. 
-INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { - uint32_t cv_words[8]; - memcpy(cv_words, self->input_cv, 32); - blake3_compress_in_place(cv_words, self->block, self->block_len, - self->counter, self->flags); - memcpy(cv, cv_words, 32); -} - -INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, - size_t out_len) { - uint64_t output_block_counter = seek / 64; - size_t offset_within_block = seek % 64; - uint8_t wide_buf[64]; - while (out_len > 0) { - blake3_compress_xof(self->input_cv, self->block, self->block_len, - output_block_counter, self->flags | ROOT, wide_buf); - size_t available_bytes = 64 - offset_within_block; - size_t memcpy_len; - if (out_len > available_bytes) { - memcpy_len = available_bytes; - } else { - memcpy_len = out_len; - } - memcpy(out, wide_buf + offset_within_block, memcpy_len); - out += memcpy_len; - out_len -= memcpy_len; - output_block_counter += 1; - offset_within_block = 0; - } -} - -INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, - size_t input_len) { - if (self->buf_len > 0) { - size_t take = chunk_state_fill_buf(self, input, input_len); - input += take; - input_len -= take; - if (input_len > 0) { - blake3_compress_in_place( - self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, - self->flags | chunk_state_maybe_start_flag(self)); - self->blocks_compressed += 1; - self->buf_len = 0; - memset(self->buf, 0, BLAKE3_BLOCK_LEN); - } - } - - while (input_len > BLAKE3_BLOCK_LEN) { - blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, - self->chunk_counter, - self->flags | chunk_state_maybe_start_flag(self)); - self->blocks_compressed += 1; - input += BLAKE3_BLOCK_LEN; - input_len -= BLAKE3_BLOCK_LEN; - } - - size_t take = chunk_state_fill_buf(self, input, input_len); - input += take; - input_len -= take; -} - -INLINE output_t chunk_state_output(const blake3_chunk_state *self) { - uint8_t block_flags = - self->flags | 
chunk_state_maybe_start_flag(self) | CHUNK_END; - return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, - block_flags); -} - -INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], - const uint32_t key[8], uint8_t flags) { - return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); -} - -// Given some input larger than one chunk, return the number of bytes that -// should go in the left subtree. This is the largest power-of-2 number of -// chunks that leaves at least 1 byte for the right subtree. -INLINE size_t left_len(size_t content_len) { - // Subtract 1 to reserve at least one byte for the right side. content_len - // should always be greater than BLAKE3_CHUNK_LEN. - size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; - return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; -} - -// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time -// on a single thread. Write out the chunk chaining values and return the -// number of chunks hashed. These chunks are never the root and never empty; -// those cases use a different codepath. -INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, - const uint32_t key[8], - uint64_t chunk_counter, uint8_t flags, - uint8_t *out) { -#if defined(BLAKE3_TESTING) - assert(0 < input_len); - assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN); -#endif - - const uint8_t *chunks_array[MAX_SIMD_DEGREE]; - size_t input_position = 0; - size_t chunks_array_len = 0; - while (input_len - input_position >= BLAKE3_CHUNK_LEN) { - chunks_array[chunks_array_len] = &input[input_position]; - input_position += BLAKE3_CHUNK_LEN; - chunks_array_len += 1; - } - - blake3_hash_many(chunks_array, chunks_array_len, - BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, - true, flags, CHUNK_START, CHUNK_END, out); - - // Hash the remaining partial chunk, if there is one. 
Note that the empty - // chunk (meaning the empty message) is a different codepath. - if (input_len > input_position) { - uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; - blake3_chunk_state chunk_state; - chunk_state_init(&chunk_state, key, flags); - chunk_state.chunk_counter = counter; - chunk_state_update(&chunk_state, &input[input_position], - input_len - input_position); - output_t output = chunk_state_output(&chunk_state); - output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); - return chunks_array_len + 1; - } else { - return chunks_array_len; - } -} - -// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time -// on a single thread. Write out the parent chaining values and return the -// number of parents hashed. (If there's an odd input chaining value left over, -// return it as an additional output.) These parents are never the root and -// never empty; those cases use a different codepath. -INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, - size_t num_chaining_values, - const uint32_t key[8], uint8_t flags, - uint8_t *out) { -#if defined(BLAKE3_TESTING) - assert(2 <= num_chaining_values); - assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2); -#endif - - const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; - size_t parents_array_len = 0; - while (num_chaining_values - (2 * parents_array_len) >= 2) { - parents_array[parents_array_len] = - &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; - parents_array_len += 1; - } - - blake3_hash_many(parents_array, parents_array_len, 1, key, - 0, // Parents always use counter 0. - false, flags | PARENT, - 0, // Parents have no start flags. - 0, // Parents have no end flags. - out); - - // If there's an odd child left over, it becomes an output. 
- if (num_chaining_values > 2 * parents_array_len) { - memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], - &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], - BLAKE3_OUT_LEN); - return parents_array_len + 1; - } else { - return parents_array_len; - } -} - -// The wide helper function returns (writes out) an array of chaining values -// and returns the length of that array. The number of chaining values returned -// is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, -// if the input is shorter than that many chunks. The reason for maintaining a -// wide array of chaining values going back up the tree, is to allow the -// implementation to hash as many parents in parallel as possible. -// -// As a special case when the SIMD degree is 1, this function will still return -// at least 2 outputs. This guarantees that this function doesn't perform the -// root compression. (If it did, it would use the wrong flags, and also we -// wouldn't be able to implement exendable ouput.) Note that this function is -// not used when the whole input is only 1 chunk long; that's a different -// codepath. -// -// Why not just have the caller split the input on the first update(), instead -// of implementing this special rule? Because we don't want to limit SIMD or -// multi-threading parallelism for that update(). -static size_t blake3_compress_subtree_wide(const uint8_t *input, - size_t input_len, - const uint32_t key[8], - uint64_t chunk_counter, - uint8_t flags, uint8_t *out) { - // Note that the single chunk case does *not* bump the SIMD degree up to 2 - // when it is 1. If this implementation adds multi-threading in the future, - // this gives us the option of multi-threading even the 2-chunk case, which - // can help performance on smaller platforms. 
- if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { - return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, - out); - } - - // With more than simd_degree chunks, we need to recurse. Start by dividing - // the input into left and right subtrees. (Note that this is only optimal - // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree - // of 3 or something, we'll need a more complicated strategy.) - size_t left_input_len = left_len(input_len); - size_t right_input_len = input_len - left_input_len; - const uint8_t *right_input = &input[left_input_len]; - uint64_t right_chunk_counter = - chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); - - // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to - // account for the special case of returning 2 outputs when the SIMD degree - // is 1. - uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; - size_t degree = blake3_simd_degree(); - if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { - // The special case: We always use a degree of at least two, to make - // sure there are two outputs. Except, as noted above, at the chunk - // level, where we allow degree=1. (Note that the 1-chunk-input case is - // a different codepath.) - degree = 2; - } - uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; - - // Recurse! If this implementation adds multi-threading support in the - // future, this is where it will go. - size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, - chunk_counter, flags, cv_array); - size_t right_n = blake3_compress_subtree_wide( - right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); - - // The special case again. If simd_degree=1, then we'll have left_n=1 and - // right_n=1. Rather than compressing them into a single output, return - // them directly, to make sure we always have at least two outputs. 
- if (left_n == 1) { - memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); - return 2; - } - - // Otherwise, do one layer of parent node compression. - size_t num_chaining_values = left_n + right_n; - return compress_parents_parallel(cv_array, num_chaining_values, key, flags, - out); -} - -// Hash a subtree with compress_subtree_wide(), and then condense the resulting -// list of chaining values down to a single parent node. Don't compress that -// last parent node, however. Instead, return its message bytes (the -// concatenated chaining values of its children). This is necessary when the -// first call to update() supplies a complete subtree, because the topmost -// parent node of that subtree could end up being the root. It's also necessary -// for extended output in the general case. -// -// As with compress_subtree_wide(), this function is not used on inputs of 1 -// chunk or less. That's a different codepath. -INLINE void compress_subtree_to_parent_node( - const uint8_t *input, size_t input_len, const uint32_t key[8], - uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { -#if defined(BLAKE3_TESTING) - assert(input_len > BLAKE3_CHUNK_LEN); -#endif - - uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; - size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, - chunk_counter, flags, cv_array); - - // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, - // compress_subtree_wide() returns more than 2 chaining values. Condense - // them into 2 by forming parent nodes repeatedly. 
- uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; - while (num_cvs > 2) { - num_cvs = - compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); - memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); - } - memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); -} - -INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], - uint8_t flags) { - memcpy(self->key, key, BLAKE3_KEY_LEN); - chunk_state_init(&self->chunk, key, flags); - self->cv_stack_len = 0; -} - -void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } - -void blake3_hasher_init_keyed(blake3_hasher *self, - const uint8_t key[BLAKE3_KEY_LEN]) { - uint32_t key_words[8]; - load_key_words(key, key_words); - hasher_init_base(self, key_words, KEYED_HASH); -} - -void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) { - blake3_hasher context_hasher; - hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); - blake3_hasher_update(&context_hasher, context, strlen(context)); - uint8_t context_key[BLAKE3_KEY_LEN]; - blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); - uint32_t context_key_words[8]; - load_key_words(context_key, context_key_words); - hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); -} - -// As described in hasher_push_cv() below, we do "lazy merging", delaying -// merges until right before the next CV is about to be added. This is -// different from the reference implementation. Another difference is that we -// aren't always merging 1 chunk at a time. Instead, each CV might represent -// any power-of-two number of chunks, as long as the smaller-above-larger stack -// order is maintained. Instead of the "count the trailing 0-bits" algorithm -// described in the spec, we use a "count the total number of 1-bits" variant -// that doesn't require us to retain the subtree size of the CV on top of the -// stack. 
The principle is the same: each CV that should remain in the stack is -// represented by a 1-bit in the total number of chunks (or bytes) so far. -INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { - size_t post_merge_stack_len = (size_t)popcnt(total_len); - while (self->cv_stack_len > post_merge_stack_len) { - uint8_t *parent_node = - &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; - output_t output = parent_output(parent_node, self->key, self->chunk.flags); - output_chaining_value(&output, parent_node); - self->cv_stack_len -= 1; - } -} - -// In reference_impl.rs, we merge the new CV with existing CVs from the stack -// before pushing it. We can do that because we know more input is coming, so -// we know none of the merges are root. -// -// This setting is different. We want to feed as much input as possible to -// compress_subtree_wide(), without setting aside anything for the chunk_state. -// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once -// as a single subtree, if at all possible. -// -// This leads to two problems: -// 1) This 64 KiB input might be the only call that ever gets made to update. -// In this case, the root node of the 64 KiB subtree would be the root node -// of the whole tree, and it would need to be ROOT finalized. We can't -// compress it until we know. -// 2) This 64 KiB input might complete a larger tree, whose root node is -// similarly going to be the the root of the whole tree. For example, maybe -// we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the -// node at the root of the 256 KiB subtree until we know how to finalize it. -// -// The second problem is solved with "lazy merging". That is, when we're about -// to add a CV to the stack, we don't merge it with anything first, as the -// reference impl does. 
Instead we do merges using the *previous* CV that was -// added, which is sitting on top of the stack, and we put the new CV -// (unmerged) on top of the stack afterwards. This guarantees that we never -// merge the root node until finalize(). -// -// Solving the first problem requires an additional tool, -// compress_subtree_to_parent_node(). That function always returns the top -// *two* chaining values of the subtree it's compressing. We then do lazy -// merging with each of them separately, so that the second CV will always -// remain unmerged. (That also helps us support extendable output when we're -// hashing an input all-at-once.) -INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], - uint64_t chunk_counter) { - hasher_merge_cv_stack(self, chunk_counter); - memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, - BLAKE3_OUT_LEN); - self->cv_stack_len += 1; -} - -void blake3_hasher_update(blake3_hasher *self, const void *input, - size_t input_len) { - // Explicitly checking for zero avoids causing UB by passing a null pointer - // to memcpy. This comes up in practice with things like: - // std::vector v; - // blake3_hasher_update(&hasher, v.data(), v.size()); - if (input_len == 0) { - return; - } - - const uint8_t *input_bytes = (const uint8_t *)input; - - // If we have some partial chunk bytes in the internal chunk_state, we need - // to finish that chunk first. - if (chunk_state_len(&self->chunk) > 0) { - size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); - if (take > input_len) { - take = input_len; - } - chunk_state_update(&self->chunk, input_bytes, take); - input_bytes += take; - input_len -= take; - // If we've filled the current chunk and there's more coming, finalize this - // chunk and proceed. In this case we know it's not the root. 
- if (input_len > 0) { - output_t output = chunk_state_output(&self->chunk); - uint8_t chunk_cv[32]; - output_chaining_value(&output, chunk_cv); - hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); - chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); - } else { - return; - } - } - - // Now the chunk_state is clear, and we have more input. If there's more than - // a single chunk (so, definitely not the root chunk), hash the largest whole - // subtree we can, with the full benefits of SIMD (and maybe in the future, - // multi-threading) parallelism. Two restrictions: - // - The subtree has to be a power-of-2 number of chunks. Only subtrees along - // the right edge can be incomplete, and we don't know where the right edge - // is going to be until we get to finalize(). - // - The subtree must evenly divide the total number of chunks up until this - // point (if total is not 0). If the current incomplete subtree is only - // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have - // to complete the current subtree first. - // Because we might need to break up the input to form powers of 2, or to - // evenly divide what we already have, this part runs in a loop. - while (input_len > BLAKE3_CHUNK_LEN) { - size_t subtree_len = round_down_to_power_of_2(input_len); - uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; - // Shrink the subtree_len until it evenly divides the count so far. We know - // that subtree_len itself is a power of 2, so we can use a bitmasking - // trick instead of an actual remainder operation. (Note that if the caller - // consistently passes power-of-2 inputs of the same size, as is hopefully - // typical, this loop condition will always fail, and subtree_len will - // always be the full length of the input.) - // - // An aside: We don't have to shrink subtree_len quite this much. 
For - // example, if count_so_far is 1, we could pass 2 chunks to - // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still - // get the right answer in the end, and we might get to use 2-way SIMD - // parallelism. The problem with this optimization, is that it gets us - // stuck always hashing 2 chunks. The total number of chunks will remain - // odd, and we'll never graduate to higher degrees of parallelism. See - // https://github.com/BLAKE3-team/BLAKE3/issues/69. - while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { - subtree_len /= 2; - } - // The shrunken subtree_len might now be 1 chunk long. If so, hash that one - // chunk by itself. Otherwise, compress the subtree into a pair of CVs. - uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; - if (subtree_len <= BLAKE3_CHUNK_LEN) { - blake3_chunk_state chunk_state; - chunk_state_init(&chunk_state, self->key, self->chunk.flags); - chunk_state.chunk_counter = self->chunk.chunk_counter; - chunk_state_update(&chunk_state, input_bytes, subtree_len); - output_t output = chunk_state_output(&chunk_state); - uint8_t cv[BLAKE3_OUT_LEN]; - output_chaining_value(&output, cv); - hasher_push_cv(self, cv, chunk_state.chunk_counter); - } else { - // This is the high-performance happy path, though getting here depends - // on the caller giving us a long enough input. - uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; - compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, - self->chunk.chunk_counter, - self->chunk.flags, cv_pair); - hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); - hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], - self->chunk.chunk_counter + (subtree_chunks / 2)); - } - self->chunk.chunk_counter += subtree_chunks; - input_bytes += subtree_len; - input_len -= subtree_len; - } - - // If there's any remaining input less than a full chunk, add it to the chunk - // state. 
In that case, also do a final merge loop to make sure the subtree - // stack doesn't contain any unmerged pairs. The remaining input means we - // know these merges are non-root. This merge loop isn't strictly necessary - // here, because hasher_push_chunk_cv already does its own merge loop, but it - // simplifies blake3_hasher_finalize below. - if (input_len > 0) { - chunk_state_update(&self->chunk, input_bytes, input_len); - hasher_merge_cv_stack(self, self->chunk.chunk_counter); - } -} - -void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, - size_t out_len) { - blake3_hasher_finalize_seek(self, 0, out, out_len); -} - -void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, - uint8_t *out, size_t out_len) { - // Explicitly checking for zero avoids causing UB by passing a null pointer - // to memcpy. This comes up in practice with things like: - // std::vector v; - // blake3_hasher_finalize(&hasher, v.data(), v.size()); - if (out_len == 0) { - return; - } - - // If the subtree stack is empty, then the current chunk is the root. - if (self->cv_stack_len == 0) { - output_t output = chunk_state_output(&self->chunk); - output_root_bytes(&output, seek, out, out_len); - return; - } - // If there are any bytes in the chunk state, finalize that chunk and do a - // roll-up merge between that chunk hash and every subtree in the stack. In - // this case, the extra merge loop at the end of blake3_hasher_update - // guarantees that none of the subtrees in the stack need to be merged with - // each other first. Otherwise, if there are no bytes in the chunk state, - // then the top of the stack is a chunk hash, and we start the merge from - // that. - output_t output; - size_t cvs_remaining; - if (chunk_state_len(&self->chunk) > 0) { - cvs_remaining = self->cv_stack_len; - output = chunk_state_output(&self->chunk); - } else { - // There are always at least 2 CVs in the stack in this case. 
- cvs_remaining = self->cv_stack_len - 2; - output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, - self->chunk.flags); - } - while (cvs_remaining > 0) { - cvs_remaining -= 1; - uint8_t parent_block[BLAKE3_BLOCK_LEN]; - memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); - output_chaining_value(&output, &parent_block[32]); - output = parent_output(parent_block, self->key, self->chunk.flags); - } - output_root_bytes(&output, seek, out, out_len); -} diff --git a/src/b3/blake3.h b/src/b3/blake3.h deleted file mode 100644 index 5060e38b7..000000000 --- a/src/b3/blake3.h +++ /dev/null @@ -1,56 +0,0 @@ -#ifndef BLAKE3_H -#define BLAKE3_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define BLAKE3_KEY_LEN 32 -#define BLAKE3_OUT_LEN 32 -#define BLAKE3_BLOCK_LEN 64 -#define BLAKE3_CHUNK_LEN 1024 -#define BLAKE3_MAX_DEPTH 54 -#define BLAKE3_MAX_SIMD_DEGREE 16 - -// This struct is a private implementation detail. It has to be here because -// it's part of blake3_hasher below. -typedef struct { - uint32_t cv[8]; - uint64_t chunk_counter; - uint8_t buf[BLAKE3_BLOCK_LEN]; - uint8_t buf_len; - uint8_t blocks_compressed; - uint8_t flags; -} blake3_chunk_state; - -typedef struct { - uint32_t key[8]; - blake3_chunk_state chunk; - uint8_t cv_stack_len; - // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, - // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk - // requires a 4th entry, rather than merging everything down to 1, because we - // don't know whether more input is coming. This is different from how the - // reference implementation does things. 
- uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; -} blake3_hasher; - -void blake3_hasher_init(blake3_hasher *self); -void blake3_hasher_init_keyed(blake3_hasher *self, - const uint8_t key[BLAKE3_KEY_LEN]); -void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); -void blake3_hasher_update(blake3_hasher *self, const void *input, - size_t input_len); -void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, - size_t out_len); -void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, - uint8_t *out, size_t out_len); - -#ifdef __cplusplus -} -#endif - -#endif /* BLAKE3_H */ diff --git a/src/b3/blake3_avx2.c b/src/b3/blake3_avx2.c deleted file mode 100644 index c5a2ce9e2..000000000 --- a/src/b3/blake3_avx2.c +++ /dev/null @@ -1,325 +0,0 @@ -#include "blake3_impl.h" - -#include - -#define DEGREE 8 - -INLINE __m256i loadu(const uint8_t src[32]) { - return _mm256_loadu_si256((const __m256i *)src); -} - -INLINE void storeu(__m256i src, uint8_t dest[16]) { - _mm256_storeu_si256((__m256i *)dest, src); -} - -INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } - -// Note that clang-format doesn't like the name "xor" for some reason. 
-INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } - -INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } - -INLINE __m256i rot16(__m256i x) { - return _mm256_shuffle_epi8( - x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, - 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); -} - -INLINE __m256i rot12(__m256i x) { - return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)); -} - -INLINE __m256i rot8(__m256i x) { - return _mm256_shuffle_epi8( - x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, - 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); -} - -INLINE __m256i rot7(__m256i x) { - return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)); -} - -INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) { - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); 
- v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], 
v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -INLINE void transpose_vecs(__m256i vecs[DEGREE]) { - // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high - // is 22/33/66/77. - __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); - __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); - __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); - __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); - __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); - __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); - __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); - __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); - - // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is - // 11/33. - __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); - __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); - __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); - __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); - __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); - __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); - __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); - __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); - - // Interleave 128-bit lanes. 
- vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); - vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); - vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); - vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); - vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); - vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); - vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); - vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); -} - -INLINE void transpose_msg_vecs(const uint8_t *const *inputs, - size_t block_offset, __m256i out[16]) { - out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]); - out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]); - out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]); - out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]); - out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]); - out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]); - out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]); - out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]); - out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]); - out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]); - out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]); - out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]); - out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]); - out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]); - out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]); - out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]); - for (size_t i = 0; i < 8; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs(&out[0]); - transpose_vecs(&out[8]); -} - -INLINE void load_counters(uint64_t counter, bool increment_counter, - __m256i *out_lo, __m256i *out_hi) { - const __m256i mask = 
_mm256_set1_epi32(-(int32_t)increment_counter); - const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - const __m256i add1 = _mm256_and_si256(mask, add0); - __m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1); - __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)), - _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000))); - __m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry); - *out_lo = l; - *out_hi = h; -} - -void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m256i h_vecs[8] = { - set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), - set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), - }; - __m256i counter_low_vec, counter_high_vec; - load_counters(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN); - __m256i block_flags_vec = set1(block_flags); - __m256i msg_vecs[16]; - transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m256i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn(v, msg_vecs, 0); - round_fn(v, msg_vecs, 1); - round_fn(v, msg_vecs, 2); - round_fn(v, msg_vecs, 3); - round_fn(v, msg_vecs, 4); - round_fn(v, msg_vecs, 5); - round_fn(v, msg_vecs, 6); - h_vecs[0] = xorv(v[0], v[8]); - h_vecs[1] = xorv(v[1], v[9]); - h_vecs[2] = xorv(v[2], v[10]); - h_vecs[3] = xorv(v[3], v[11]); - h_vecs[4] = xorv(v[4], v[12]); - h_vecs[5] = xorv(v[5], v[13]); - h_vecs[6] = xorv(v[6], v[14]); - h_vecs[7] 
= xorv(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs(h_vecs); - storeu(h_vecs[0], &out[0 * sizeof(__m256i)]); - storeu(h_vecs[1], &out[1 * sizeof(__m256i)]); - storeu(h_vecs[2], &out[2 * sizeof(__m256i)]); - storeu(h_vecs[3], &out[3 * sizeof(__m256i)]); - storeu(h_vecs[4], &out[4 * sizeof(__m256i)]); - storeu(h_vecs[5], &out[5 * sizeof(__m256i)]); - storeu(h_vecs[6], &out[6 * sizeof(__m256i)]); - storeu(h_vecs[7], &out[7 * sizeof(__m256i)]); -} - -#if !defined(BLAKE3_NO_SSE41) -void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#else -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif - -void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs >= DEGREE) { - blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += DEGREE; - } - inputs += DEGREE; - num_inputs -= DEGREE; - out = &out[DEGREE * BLAKE3_OUT_LEN]; - } -#if !defined(BLAKE3_NO_SSE41) - blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); -#else - blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); -#endif -} diff --git a/src/b3/blake3_avx2_x86-64_unix.S b/src/b3/blake3_avx2_x86-64_unix.S deleted file mode 100644 index d2b14d440..000000000 --- a/src/b3/blake3_avx2_x86-64_unix.S +++ /dev/null @@ 
-1,1802 +0,0 @@ -#ifdef __x86_64__ -.intel_syntax noprefix -.global _blake3_hash_many_avx2 -.global blake3_hash_many_avx2 -#ifdef __APPLE__ -.text -#else -.section .text -#endif - .p2align 6 -_blake3_hash_many_avx2: -blake3_hash_many_avx2: - push r15 - push r14 - push r13 - push r12 - push rbx - push rbp - mov rbp, rsp - sub rsp, 680 - and rsp, 0xFFFFFFFFFFFFFFC0 - neg r9d - vmovd xmm0, r9d - vpbroadcastd ymm0, xmm0 - vmovdqa ymmword ptr [rsp+0x280], ymm0 - vpand ymm1, ymm0, ymmword ptr [ADD0+rip] - vpand ymm2, ymm0, ymmword ptr [ADD1+rip] - vmovdqa ymmword ptr [rsp+0x220], ymm2 - vmovd xmm2, r8d - vpbroadcastd ymm2, xmm2 - vpaddd ymm2, ymm2, ymm1 - vmovdqa ymmword ptr [rsp+0x240], ymm2 - vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] - vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] - vpcmpgtd ymm2, ymm1, ymm2 - shr r8, 32 - vmovd xmm3, r8d - vpbroadcastd ymm3, xmm3 - vpsubd ymm3, ymm3, ymm2 - vmovdqa ymmword ptr [rsp+0x260], ymm3 - shl rdx, 6 - mov qword ptr [rsp+0x2A0], rdx - cmp rsi, 8 - jc 3f -2: - vpbroadcastd ymm0, dword ptr [rcx] - vpbroadcastd ymm1, dword ptr [rcx+0x4] - vpbroadcastd ymm2, dword ptr [rcx+0x8] - vpbroadcastd ymm3, dword ptr [rcx+0xC] - vpbroadcastd ymm4, dword ptr [rcx+0x10] - vpbroadcastd ymm5, dword ptr [rcx+0x14] - vpbroadcastd ymm6, dword ptr [rcx+0x18] - vpbroadcastd ymm7, dword ptr [rcx+0x1C] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x20] - mov r13, qword ptr [rdi+0x28] - mov r14, qword ptr [rdi+0x30] - mov r15, qword ptr [rdi+0x38] - movzx eax, byte ptr [rbp+0x38] - movzx ebx, byte ptr [rbp+0x40] - or eax, ebx - xor edx, edx -.p2align 5 -9: - movzx ebx, byte ptr [rbp+0x48] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+0x2A0] - cmove eax, ebx - mov dword ptr [rsp+0x200], eax - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x40] - 
vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x40] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x40] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0x20], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0x40], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0x60], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-0x30] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x30] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x30] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp+0x80], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0xA0], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0xC0], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0xE0], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x20] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x20] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x20] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 - vunpcklpd 
ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp+0x100], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0x120], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0x140], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0x160], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-0x10] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x10] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x10] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp+0x180], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0x1A0], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0x1C0], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0x1E0], ymm11 - vpbroadcastd ymm15, dword ptr [rsp+0x200] - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r12+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r13+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r14+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - prefetcht0 [r15+rdx+0x80] - vpaddd ymm0, ymm0, ymmword ptr [rsp] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm0, ymmword ptr [rsp+0x240] - vpxor ymm13, ymm1, ymmword ptr [rsp+0x260] - vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] - vpxor ymm15, ymm3, ymm15 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, 
ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] - vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] - vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] - vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] - 
vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - 
vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 
7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd 
ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - 
vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] - vpaddd ymm3, ymm3, ymmword ptr 
[rsp+0x20] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld 
ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - 
vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd 
ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] - vpaddd ymm3, ymm3, ymmword ptr 
[rsp+0x1E0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld 
ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - 
vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd 
ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, 
ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vpxor ymm0, ymm0, ymm8 - vpxor ymm1, ymm1, ymm9 - vpxor ymm2, ymm2, ymm10 - vpxor ymm3, ymm3, ymm11 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpxor ymm4, ymm4, ymm12 - vpxor ymm5, ymm5, ymm13 - vpxor ymm6, ymm6, ymm14 - vpxor ymm7, ymm7, ymm15 - movzx eax, byte ptr [rbp+0x38] - jne 9b - mov rbx, qword ptr [rbp+0x50] - vunpcklps ymm8, ymm0, ymm1 - vunpcklps ymm9, ymm2, ymm3 - vunpckhps ymm10, ymm0, ymm1 - vunpcklps ymm11, ymm4, ymm5 - vunpcklps ymm0, ymm6, ymm7 - vshufps ymm12, ymm8, ymm9, 78 - vblendps ymm1, ymm8, ymm12, 0xCC - vshufps ymm8, ymm11, ymm0, 78 - vunpckhps ymm13, ymm2, ymm3 - vblendps ymm2, ymm11, ymm8, 0xCC - vblendps ymm3, ymm12, ymm9, 0xCC - vperm2f128 ymm12, ymm1, ymm2, 0x20 - vmovups ymmword ptr [rbx], ymm12 - vunpckhps ymm14, ymm4, ymm5 - vblendps ymm4, ymm8, ymm0, 0xCC - vunpckhps ymm15, ymm6, ymm7 - vperm2f128 ymm7, ymm3, ymm4, 0x20 - vmovups ymmword ptr [rbx+0x20], ymm7 - vshufps ymm5, ymm10, ymm13, 78 - vblendps ymm6, ymm5, ymm13, 0xCC - vshufps ymm13, ymm14, ymm15, 78 - vblendps ymm10, ymm10, ymm5, 0xCC - vblendps ymm14, ymm14, ymm13, 0xCC - vperm2f128 ymm8, ymm10, ymm14, 0x20 - vmovups ymmword ptr [rbx+0x40], ymm8 - vblendps ymm15, ymm13, ymm15, 0xCC - vperm2f128 ymm13, ymm6, ymm15, 0x20 - vmovups ymmword ptr [rbx+0x60], ymm13 - vperm2f128 ymm9, ymm1, ymm2, 0x31 - vperm2f128 ymm11, ymm3, ymm4, 0x31 - 
vmovups ymmword ptr [rbx+0x80], ymm9 - vperm2f128 ymm14, ymm10, ymm14, 0x31 - vperm2f128 ymm15, ymm6, ymm15, 0x31 - vmovups ymmword ptr [rbx+0xA0], ymm11 - vmovups ymmword ptr [rbx+0xC0], ymm14 - vmovups ymmword ptr [rbx+0xE0], ymm15 - vmovdqa ymm0, ymmword ptr [rsp+0x220] - vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240] - vmovdqa ymmword ptr [rsp+0x240], ymm1 - vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] - vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] - vpcmpgtd ymm2, ymm0, ymm2 - vmovdqa ymm0, ymmword ptr [rsp+0x260] - vpsubd ymm2, ymm0, ymm2 - vmovdqa ymmword ptr [rsp+0x260], ymm2 - add rdi, 64 - add rbx, 256 - mov qword ptr [rbp+0x50], rbx - sub rsi, 8 - cmp rsi, 8 - jnc 2b - test rsi, rsi - jnz 3f -4: - vzeroupper - mov rsp, rbp - pop rbp - pop rbx - pop r12 - pop r13 - pop r14 - pop r15 - ret -.p2align 5 -3: - mov rbx, qword ptr [rbp+0x50] - mov r15, qword ptr [rsp+0x2A0] - movzx r13d, byte ptr [rbp+0x38] - movzx r12d, byte ptr [rbp+0x48] - test rsi, 0x4 - je 3f - vbroadcasti128 ymm0, xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] - vmovdqa ymm8, ymm0 - vmovdqa ymm9, ymm1 - vbroadcasti128 ymm12, xmmword ptr [rsp+0x240] - vbroadcasti128 ymm13, xmmword ptr [rsp+0x260] - vpunpckldq ymm14, ymm12, ymm13 - vpunpckhdq ymm15, ymm12, ymm13 - vpermq ymm14, ymm14, 0x50 - vpermq ymm15, ymm15, 0x50 - vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] - vpblendd ymm14, ymm14, ymm12, 0x44 - vpblendd ymm15, ymm15, ymm12, 0x44 - vmovdqa ymmword ptr [rsp], ymm14 - vmovdqa ymmword ptr [rsp+0x20], ymm15 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x200], eax - vmovups ymm2, ymmword ptr [r8+rdx-0x40] - vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 - vmovups ymm3, ymmword ptr [r8+rdx-0x30] - 
vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 - vshufps ymm4, ymm2, ymm3, 136 - vshufps ymm5, ymm2, ymm3, 221 - vmovups ymm2, ymmword ptr [r8+rdx-0x20] - vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 - vmovups ymm3, ymmword ptr [r8+rdx-0x10] - vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 - vshufps ymm6, ymm2, ymm3, 136 - vshufps ymm7, ymm2, ymm3, 221 - vpshufd ymm6, ymm6, 0x93 - vpshufd ymm7, ymm7, 0x93 - vmovups ymm10, ymmword ptr [r10+rdx-0x40] - vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 - vmovups ymm11, ymmword ptr [r10+rdx-0x30] - vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 - vshufps ymm12, ymm10, ymm11, 136 - vshufps ymm13, ymm10, ymm11, 221 - vmovups ymm10, ymmword ptr [r10+rdx-0x20] - vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 - vmovups ymm11, ymmword ptr [r10+rdx-0x10] - vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 - vshufps ymm14, ymm10, ymm11, 136 - vshufps ymm15, ymm10, ymm11, 221 - vpshufd ymm14, ymm14, 0x93 - vpshufd ymm15, ymm15, 0x93 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - vpbroadcastd ymm2, dword ptr [rsp+0x200] - vmovdqa ymm3, ymmword ptr [rsp] - vmovdqa ymm11, ymmword ptr [rsp+0x20] - vpblendd ymm3, ymm3, ymm2, 0x88 - vpblendd ymm11, ymm11, ymm2, 0x88 - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] - vmovdqa ymm10, ymm2 - mov al, 7 -9: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm8, ymm8, ymm12 - vmovdqa ymmword ptr [rsp+0x40], ymm4 - nop - vmovdqa ymmword ptr [rsp+0x60], ymm12 - nop - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 12 - 
vpslld ymm9, ymm9, 20 - vpor ymm9, ymm9, ymm4 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vmovdqa ymmword ptr [rsp+0x80], ymm5 - vmovdqa ymmword ptr [rsp+0xA0], ymm13 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 7 - vpslld ymm9, ymm9, 25 - vpor ymm9, ymm9, ymm4 - vpshufd ymm0, ymm0, 0x93 - vpshufd ymm8, ymm8, 0x93 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm11, ymm11, 0x4E - vpshufd ymm2, ymm2, 0x39 - vpshufd ymm10, ymm10, 0x39 - vpaddd ymm0, ymm0, ymm6 - vpaddd ymm8, ymm8, ymm14 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 12 - vpslld ymm9, ymm9, 20 - vpor ymm9, ymm9, ymm4 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm8, ymm8, ymm15 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 7 - vpslld ymm9, ymm9, 25 - vpor ymm9, ymm9, ymm4 - vpshufd ymm0, ymm0, 0x39 - vpshufd ymm8, ymm8, 0x39 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm11, ymm11, 0x4E - vpshufd ymm2, ymm2, 0x93 - vpshufd ymm10, ymm10, 0x93 - dec al - je 9f - 
vmovdqa ymm4, ymmword ptr [rsp+0x40] - vmovdqa ymm5, ymmword ptr [rsp+0x80] - vshufps ymm12, ymm4, ymm5, 214 - vpshufd ymm13, ymm4, 0x0F - vpshufd ymm4, ymm12, 0x39 - vshufps ymm12, ymm6, ymm7, 250 - vpblendd ymm13, ymm13, ymm12, 0xAA - vpunpcklqdq ymm12, ymm7, ymm5 - vpblendd ymm12, ymm12, ymm6, 0x88 - vpshufd ymm12, ymm12, 0x78 - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 0x1E - vmovdqa ymmword ptr [rsp+0x40], ymm13 - vmovdqa ymmword ptr [rsp+0x80], ymm12 - vmovdqa ymm12, ymmword ptr [rsp+0x60] - vmovdqa ymm13, ymmword ptr [rsp+0xA0] - vshufps ymm5, ymm12, ymm13, 214 - vpshufd ymm6, ymm12, 0x0F - vpshufd ymm12, ymm5, 0x39 - vshufps ymm5, ymm14, ymm15, 250 - vpblendd ymm6, ymm6, ymm5, 0xAA - vpunpcklqdq ymm5, ymm15, ymm13 - vpblendd ymm5, ymm5, ymm14, 0x88 - vpshufd ymm5, ymm5, 0x78 - vpunpckhdq ymm13, ymm13, ymm15 - vpunpckldq ymm14, ymm14, ymm13 - vpshufd ymm15, ymm14, 0x1E - vmovdqa ymm13, ymm6 - vmovdqa ymm14, ymm5 - vmovdqa ymm5, ymmword ptr [rsp+0x40] - vmovdqa ymm6, ymmword ptr [rsp+0x80] - jmp 9b -9: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - vpxor ymm8, ymm8, ymm10 - vpxor ymm9, ymm9, ymm11 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vmovdqu xmmword ptr [rbx+0x40], xmm8 - vmovdqu xmmword ptr [rbx+0x50], xmm9 - vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 - vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 - vmovaps xmm8, xmmword ptr [rsp+0x280] - vmovaps xmm0, xmmword ptr [rsp+0x240] - vmovaps xmm1, xmmword ptr [rsp+0x250] - vmovaps xmm2, xmmword ptr [rsp+0x260] - vmovaps xmm3, xmmword ptr [rsp+0x270] - vblendvps xmm0, xmm0, xmm1, xmm8 - vblendvps xmm2, xmm2, xmm3, xmm8 - vmovaps xmmword ptr [rsp+0x240], xmm0 - vmovaps xmmword ptr [rsp+0x260], xmm2 - add rbx, 128 - add rdi, 32 - sub rsi, 4 -3: - test rsi, 0x2 - je 3f - vbroadcasti128 ymm0, 
xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] - vmovd xmm13, dword ptr [rsp+0x240] - vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1 - vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovd xmm14, dword ptr [rsp+0x244] - vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vinserti128 ymm13, ymm13, xmm14, 0x01 - vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] - vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x200], eax - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] - vpbroadcastd ymm8, dword ptr [rsp+0x200] - vpblendd ymm3, ymm13, ymm8, 0x88 - vmovups ymm8, ymmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 - vshufps ymm4, ymm8, ymm9, 136 - vshufps ymm5, ymm8, ymm9, 221 - vmovups ymm8, ymmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 - vshufps ymm6, ymm8, ymm9, 136 - vshufps ymm7, ymm8, ymm9, 221 - vpshufd ymm6, ymm6, 0x93 - vpshufd ymm7, ymm7, 0x93 - mov al, 7 -9: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm14 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm8 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm15 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm8 - vpshufd ymm0, ymm0, 0x93 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 
0x39 - vpaddd ymm0, ymm0, ymm6 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm14 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm8 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm15 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm8 - vpshufd ymm0, ymm0, 0x39 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x93 - dec al - jz 9f - vshufps ymm8, ymm4, ymm5, 214 - vpshufd ymm9, ymm4, 0x0F - vpshufd ymm4, ymm8, 0x39 - vshufps ymm8, ymm6, ymm7, 250 - vpblendd ymm9, ymm9, ymm8, 0xAA - vpunpcklqdq ymm8, ymm7, ymm5 - vpblendd ymm8, ymm8, ymm6, 0x88 - vpshufd ymm8, ymm8, 0x78 - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 0x1E - vmovdqa ymm5, ymm9 - vmovdqa ymm6, ymm8 - jmp 9b -9: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vmovaps ymm8, ymmword ptr [rsp+0x280] - vmovaps ymm0, ymmword ptr [rsp+0x240] - vmovups ymm1, ymmword ptr [rsp+0x248] - vmovaps ymm2, ymmword ptr [rsp+0x260] - vmovups ymm3, ymmword ptr [rsp+0x268] - vblendvps ymm0, ymm0, ymm1, ymm8 - vblendvps ymm2, ymm2, ymm3, ymm8 - vmovaps ymmword ptr [rsp+0x240], ymm0 - vmovaps ymmword ptr [rsp+0x260], ymm2 - add rbx, 64 - add rdi, 16 - sub rsi, 2 -3: - test rsi, 0x1 - je 4b - vmovdqu xmm0, xmmword ptr [rcx] - vmovdqu xmm1, xmmword ptr [rcx+0x10] - vmovd xmm3, dword ptr [rsp+0x240] - vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1 - vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovdqa xmm14, xmmword ptr [ROT16+rip] - vmovdqa xmm15, xmmword ptr [ROT8+rip] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - 
xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] - vmovdqa xmm3, xmm13 - vpinsrd xmm3, xmm3, eax, 3 - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vmovups xmm9, xmmword ptr [r8+rdx-0x30] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vmovups xmm9, xmmword ptr [r8+rdx-0x10] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm14 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 12 - vpslld xmm1, xmm1, 20 - vpor xmm1, xmm1, xmm8 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm15 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 7 - vpslld xmm1, xmm1, 25 - vpor xmm1, xmm1, xmm8 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm14 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 12 - vpslld xmm1, xmm1, 20 - vpor xmm1, xmm1, xmm8 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm15 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 7 - vpslld xmm1, xmm1, 25 - vpor xmm1, xmm1, xmm8 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa 
xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - jmp 4b - - -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif -.p2align 6 -ADD0: - .long 0, 1, 2, 3, 4, 5, 6, 7 -ADD1: - .long 8, 8, 8, 8, 8, 8, 8, 8 -BLAKE3_IV_0: - .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 - .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 -BLAKE3_IV_1: - .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 - .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 -BLAKE3_IV_2: - .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 - .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 -BLAKE3_IV_3: - .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A - .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A -BLAKE3_BLOCK_LEN: - .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 - .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 -ROT16: - .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 -ROT8: - .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 -CMP_MSB_MASK: - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 -BLAKE3_IV: - .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A - -#endif // __x86_64__ diff --git a/src/b3/blake3_avx512.c b/src/b3/blake3_avx512.c deleted file mode 100644 index 77a5c385c..000000000 --- a/src/b3/blake3_avx512.c +++ /dev/null @@ -1,1204 +0,0 @@ -#include "blake3_impl.h" - -#include - -#define _mm_shuffle_ps2(a, b, c) \ - (_mm_castps_si128( \ - _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) - -INLINE __m128i loadu_128(const uint8_t src[16]) { - return _mm_loadu_si128((const __m128i *)src); -} - -INLINE __m256i loadu_256(const uint8_t src[32]) { - return _mm256_loadu_si256((const __m256i *)src); -} - -INLINE __m512i loadu_512(const uint8_t src[64]) { - return _mm512_loadu_si512((const __m512i 
*)src); -} - -INLINE void storeu_128(__m128i src, uint8_t dest[16]) { - _mm_storeu_si128((__m128i *)dest, src); -} - -INLINE void storeu_256(__m256i src, uint8_t dest[16]) { - _mm256_storeu_si256((__m256i *)dest, src); -} - -INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } - -INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } - -INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); } - -INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } - -INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } - -INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); } - -INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); } - -INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } - -INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); } - -INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); -} - -INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); } - -INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); } - -INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); } - -INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); } - -INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); } - -INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); } - -INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); } - -INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); } - -INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); } - -INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); } - -INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); } - -INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); } - -/* - * 
---------------------------------------------------------------------------- - * compress_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = add_128(add_128(*row0, m), *row1); - *row3 = xor_128(*row3, *row0); - *row3 = rot16_128(*row3); - *row2 = add_128(*row2, *row3); - *row1 = xor_128(*row1, *row2); - *row1 = rot12_128(*row1); -} - -INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = add_128(add_128(*row0, m), *row1); - *row3 = xor_128(*row3, *row0); - *row3 = rot8_128(*row3); - *row2 = add_128(*row2, *row3); - *row1 = xor_128(*row1, *row2); - *row1 = rot7_128(*row1); -} - -// Note the optimization here of leaving row1 as the unrotated row, rather than -// row0. All the message loads below are adjusted to compensate for this. See -// discussion at https://github.com/sneves/blake2-avx2/pull/4 -INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); -} - -INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); -} - -INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) { - rows[0] = loadu_128((uint8_t *)&cv[0]); - rows[1] = loadu_128((uint8_t *)&cv[4]); - rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); - rows[3] = set4(counter_low(counter), counter_high(counter), - (uint32_t)block_len, (uint32_t)flags); - - __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]); - __m128i m1 = 
loadu_128(&block[sizeof(__m128i) * 1]); - __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]); - __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]); - - __m128i t0, t1, t2, t3, tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. 
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 
3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 
0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); -} - -void blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu_128(xor_128(rows[0], rows[2]), &out[0]); - storeu_128(xor_128(rows[1], rows[3]), &out[16]); - storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]); - storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]); -} - -void blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]); - storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]); 
-} - -/* - * ---------------------------------------------------------------------------- - * hash4_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) { - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = add_128(v[0], v[4]); - v[1] = add_128(v[1], v[5]); - v[2] = add_128(v[2], v[6]); - v[3] = add_128(v[3], v[7]); - v[12] = xor_128(v[12], v[0]); - v[13] = xor_128(v[13], v[1]); - v[14] = xor_128(v[14], v[2]); - v[15] = xor_128(v[15], v[3]); - v[12] = rot16_128(v[12]); - v[13] = rot16_128(v[13]); - v[14] = rot16_128(v[14]); - v[15] = rot16_128(v[15]); - v[8] = add_128(v[8], v[12]); - v[9] = add_128(v[9], v[13]); - v[10] = add_128(v[10], v[14]); - v[11] = add_128(v[11], v[15]); - v[4] = xor_128(v[4], v[8]); - v[5] = xor_128(v[5], v[9]); - v[6] = xor_128(v[6], v[10]); - v[7] = xor_128(v[7], v[11]); - v[4] = rot12_128(v[4]); - v[5] = rot12_128(v[5]); - v[6] = rot12_128(v[6]); - v[7] = rot12_128(v[7]); - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = add_128(v[0], v[4]); - v[1] = add_128(v[1], v[5]); - v[2] = add_128(v[2], v[6]); - v[3] = add_128(v[3], v[7]); - v[12] = xor_128(v[12], v[0]); - v[13] = xor_128(v[13], v[1]); - v[14] = xor_128(v[14], v[2]); - v[15] = xor_128(v[15], v[3]); - v[12] = rot8_128(v[12]); - v[13] = rot8_128(v[13]); - v[14] = rot8_128(v[14]); - v[15] = rot8_128(v[15]); - v[8] = add_128(v[8], v[12]); - v[9] = add_128(v[9], v[13]); - v[10] = add_128(v[10], v[14]); - v[11] = add_128(v[11], v[15]); - v[4] = xor_128(v[4], v[8]); - v[5] = xor_128(v[5], v[9]); - v[6] = xor_128(v[6], v[10]); - v[7] = 
xor_128(v[7], v[11]); - v[4] = rot7_128(v[4]); - v[5] = rot7_128(v[5]); - v[6] = rot7_128(v[6]); - v[7] = rot7_128(v[7]); - - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = add_128(v[0], v[5]); - v[1] = add_128(v[1], v[6]); - v[2] = add_128(v[2], v[7]); - v[3] = add_128(v[3], v[4]); - v[15] = xor_128(v[15], v[0]); - v[12] = xor_128(v[12], v[1]); - v[13] = xor_128(v[13], v[2]); - v[14] = xor_128(v[14], v[3]); - v[15] = rot16_128(v[15]); - v[12] = rot16_128(v[12]); - v[13] = rot16_128(v[13]); - v[14] = rot16_128(v[14]); - v[10] = add_128(v[10], v[15]); - v[11] = add_128(v[11], v[12]); - v[8] = add_128(v[8], v[13]); - v[9] = add_128(v[9], v[14]); - v[5] = xor_128(v[5], v[10]); - v[6] = xor_128(v[6], v[11]); - v[7] = xor_128(v[7], v[8]); - v[4] = xor_128(v[4], v[9]); - v[5] = rot12_128(v[5]); - v[6] = rot12_128(v[6]); - v[7] = rot12_128(v[7]); - v[4] = rot12_128(v[4]); - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = add_128(v[0], v[5]); - v[1] = add_128(v[1], v[6]); - v[2] = add_128(v[2], v[7]); - v[3] = add_128(v[3], v[4]); - v[15] = xor_128(v[15], v[0]); - v[12] = xor_128(v[12], v[1]); - v[13] = xor_128(v[13], v[2]); - v[14] = xor_128(v[14], v[3]); - v[15] = rot8_128(v[15]); - v[12] = rot8_128(v[12]); - v[13] = rot8_128(v[13]); - v[14] = rot8_128(v[14]); - v[10] = add_128(v[10], v[15]); - v[11] = add_128(v[11], v[12]); - v[8] = add_128(v[8], v[13]); - v[9] = add_128(v[9], v[14]); - v[5] = xor_128(v[5], v[10]); - v[6] = xor_128(v[6], v[11]); - v[7] = xor_128(v[7], v[8]); - v[4] = xor_128(v[4], v[9]); - v[5] = rot7_128(v[5]); - v[6] = rot7_128(v[6]); - v[7] = rot7_128(v[7]); - v[4] = rot7_128(v[4]); -} - 
-INLINE void transpose_vecs_128(__m128i vecs[4]) { - // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. - __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. - __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; -} - -INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, - size_t block_offset, __m128i out[16]) { - out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]); - out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]); - out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]); - out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]); - out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]); - out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]); - out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]); - out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]); - out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]); - out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]); - out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]); - out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]); - out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]); - out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]); - out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]); - out[15] = loadu_128(&inputs[3][block_offset + 3 * 
sizeof(__m128i)]); - for (size_t i = 0; i < 4; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs_128(&out[0]); - transpose_vecs_128(&out[4]); - transpose_vecs_128(&out[8]); - transpose_vecs_128(&out[12]); -} - -INLINE void load_counters4(uint64_t counter, bool increment_counter, - __m128i *out_lo, __m128i *out_hi) { - uint64_t mask = (increment_counter ? ~0 : 0); - __m256i mask_vec = _mm256_set1_epi64x(mask); - __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3); - deltas = _mm256_and_si256(mask_vec, deltas); - __m256i counters = - _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas); - *out_lo = _mm256_cvtepi64_epi32(counters); - *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32)); -} - -void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m128i h_vecs[8] = { - set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), - set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), - }; - __m128i counter_low_vec, counter_high_vec; - load_counters4(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN); - __m128i block_flags_vec = set1_128(block_flags); - __m128i msg_vecs[16]; - transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m128i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn4(v, msg_vecs, 0); - round_fn4(v, msg_vecs, 1); - round_fn4(v, msg_vecs, 2); - round_fn4(v, 
msg_vecs, 3); - round_fn4(v, msg_vecs, 4); - round_fn4(v, msg_vecs, 5); - round_fn4(v, msg_vecs, 6); - h_vecs[0] = xor_128(v[0], v[8]); - h_vecs[1] = xor_128(v[1], v[9]); - h_vecs[2] = xor_128(v[2], v[10]); - h_vecs[3] = xor_128(v[3], v[11]); - h_vecs[4] = xor_128(v[4], v[12]); - h_vecs[5] = xor_128(v[5], v[13]); - h_vecs[6] = xor_128(v[6], v[14]); - h_vecs[7] = xor_128(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs_128(&h_vecs[0]); - transpose_vecs_128(&h_vecs[4]); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. - storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]); - storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]); - storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]); - storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]); - storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]); - storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]); - storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]); - storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]); -} - -/* - * ---------------------------------------------------------------------------- - * hash8_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) { - v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = add_256(v[0], v[4]); - v[1] = add_256(v[1], v[5]); - v[2] = add_256(v[2], v[6]); - v[3] = add_256(v[3], v[7]); - v[12] = xor_256(v[12], v[0]); - v[13] = xor_256(v[13], v[1]); - v[14] = xor_256(v[14], v[2]); - v[15] = xor_256(v[15], v[3]); - v[12] = rot16_256(v[12]); - v[13] = rot16_256(v[13]); - v[14] = rot16_256(v[14]); - v[15] = rot16_256(v[15]); - v[8] = add_256(v[8], v[12]); - v[9] = add_256(v[9], v[13]); - v[10] = add_256(v[10], v[14]); - v[11] = add_256(v[11], 
v[15]); - v[4] = xor_256(v[4], v[8]); - v[5] = xor_256(v[5], v[9]); - v[6] = xor_256(v[6], v[10]); - v[7] = xor_256(v[7], v[11]); - v[4] = rot12_256(v[4]); - v[5] = rot12_256(v[5]); - v[6] = rot12_256(v[6]); - v[7] = rot12_256(v[7]); - v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = add_256(v[0], v[4]); - v[1] = add_256(v[1], v[5]); - v[2] = add_256(v[2], v[6]); - v[3] = add_256(v[3], v[7]); - v[12] = xor_256(v[12], v[0]); - v[13] = xor_256(v[13], v[1]); - v[14] = xor_256(v[14], v[2]); - v[15] = xor_256(v[15], v[3]); - v[12] = rot8_256(v[12]); - v[13] = rot8_256(v[13]); - v[14] = rot8_256(v[14]); - v[15] = rot8_256(v[15]); - v[8] = add_256(v[8], v[12]); - v[9] = add_256(v[9], v[13]); - v[10] = add_256(v[10], v[14]); - v[11] = add_256(v[11], v[15]); - v[4] = xor_256(v[4], v[8]); - v[5] = xor_256(v[5], v[9]); - v[6] = xor_256(v[6], v[10]); - v[7] = xor_256(v[7], v[11]); - v[4] = rot7_256(v[4]); - v[5] = rot7_256(v[5]); - v[6] = rot7_256(v[6]); - v[7] = rot7_256(v[7]); - - v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = add_256(v[0], v[5]); - v[1] = add_256(v[1], v[6]); - v[2] = add_256(v[2], v[7]); - v[3] = add_256(v[3], v[4]); - v[15] = xor_256(v[15], v[0]); - v[12] = xor_256(v[12], v[1]); - v[13] = xor_256(v[13], v[2]); - v[14] = xor_256(v[14], v[3]); - v[15] = rot16_256(v[15]); - v[12] = rot16_256(v[12]); - v[13] = rot16_256(v[13]); - v[14] = rot16_256(v[14]); - v[10] = add_256(v[10], v[15]); - v[11] = add_256(v[11], v[12]); - v[8] = add_256(v[8], v[13]); - v[9] = add_256(v[9], v[14]); - v[5] = xor_256(v[5], v[10]); - v[6] = xor_256(v[6], v[11]); - v[7] = xor_256(v[7], v[8]); - v[4] = xor_256(v[4], v[9]); - 
v[5] = rot12_256(v[5]); - v[6] = rot12_256(v[6]); - v[7] = rot12_256(v[7]); - v[4] = rot12_256(v[4]); - v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = add_256(v[0], v[5]); - v[1] = add_256(v[1], v[6]); - v[2] = add_256(v[2], v[7]); - v[3] = add_256(v[3], v[4]); - v[15] = xor_256(v[15], v[0]); - v[12] = xor_256(v[12], v[1]); - v[13] = xor_256(v[13], v[2]); - v[14] = xor_256(v[14], v[3]); - v[15] = rot8_256(v[15]); - v[12] = rot8_256(v[12]); - v[13] = rot8_256(v[13]); - v[14] = rot8_256(v[14]); - v[10] = add_256(v[10], v[15]); - v[11] = add_256(v[11], v[12]); - v[8] = add_256(v[8], v[13]); - v[9] = add_256(v[9], v[14]); - v[5] = xor_256(v[5], v[10]); - v[6] = xor_256(v[6], v[11]); - v[7] = xor_256(v[7], v[8]); - v[4] = xor_256(v[4], v[9]); - v[5] = rot7_256(v[5]); - v[6] = rot7_256(v[6]); - v[7] = rot7_256(v[7]); - v[4] = rot7_256(v[4]); -} - -INLINE void transpose_vecs_256(__m256i vecs[8]) { - // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high - // is 22/33/66/77. - __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); - __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); - __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); - __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); - __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); - __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); - __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); - __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); - - // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is - // 11/33. 
- __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); - __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); - __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); - __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); - __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); - __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); - __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); - __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); - - // Interleave 128-bit lanes. - vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); - vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); - vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); - vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); - vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); - vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); - vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); - vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); -} - -INLINE void transpose_msg_vecs8(const uint8_t *const *inputs, - size_t block_offset, __m256i out[16]) { - out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]); - out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]); - out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]); - out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]); - out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]); - out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]); - out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]); - out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]); - out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]); - out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]); - out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]); - out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]); - out[12] = 
loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]); - out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]); - out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]); - out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]); - for (size_t i = 0; i < 8; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs_256(&out[0]); - transpose_vecs_256(&out[8]); -} - -INLINE void load_counters8(uint64_t counter, bool increment_counter, - __m256i *out_lo, __m256i *out_hi) { - uint64_t mask = (increment_counter ? ~0 : 0); - __m512i mask_vec = _mm512_set1_epi64(mask); - __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); - deltas = _mm512_and_si512(mask_vec, deltas); - __m512i counters = - _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas); - *out_lo = _mm512_cvtepi64_epi32(counters); - *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32)); -} - -void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m256i h_vecs[8] = { - set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]), - set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]), - }; - __m256i counter_low_vec, counter_high_vec; - load_counters8(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN); - __m256i block_flags_vec = set1_256(block_flags); - __m256i msg_vecs[16]; - transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m256i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), 
set1_256(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn8(v, msg_vecs, 0); - round_fn8(v, msg_vecs, 1); - round_fn8(v, msg_vecs, 2); - round_fn8(v, msg_vecs, 3); - round_fn8(v, msg_vecs, 4); - round_fn8(v, msg_vecs, 5); - round_fn8(v, msg_vecs, 6); - h_vecs[0] = xor_256(v[0], v[8]); - h_vecs[1] = xor_256(v[1], v[9]); - h_vecs[2] = xor_256(v[2], v[10]); - h_vecs[3] = xor_256(v[3], v[11]); - h_vecs[4] = xor_256(v[4], v[12]); - h_vecs[5] = xor_256(v[5], v[13]); - h_vecs[6] = xor_256(v[6], v[14]); - h_vecs[7] = xor_256(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs_256(h_vecs); - storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]); - storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]); - storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]); - storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]); - storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]); - storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]); - storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]); - storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]); -} - -/* - * ---------------------------------------------------------------------------- - * hash16_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) { - v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = add_512(v[0], v[4]); - v[1] = add_512(v[1], v[5]); - v[2] = add_512(v[2], v[6]); - v[3] = add_512(v[3], v[7]); - v[12] = xor_512(v[12], v[0]); - v[13] = xor_512(v[13], v[1]); - v[14] = xor_512(v[14], v[2]); - v[15] = xor_512(v[15], v[3]); - v[12] = rot16_512(v[12]); - v[13] = rot16_512(v[13]); - v[14] = rot16_512(v[14]); - v[15] = rot16_512(v[15]); - v[8] = add_512(v[8], v[12]); - v[9] = add_512(v[9], v[13]); - v[10] = add_512(v[10], v[14]); - 
v[11] = add_512(v[11], v[15]); - v[4] = xor_512(v[4], v[8]); - v[5] = xor_512(v[5], v[9]); - v[6] = xor_512(v[6], v[10]); - v[7] = xor_512(v[7], v[11]); - v[4] = rot12_512(v[4]); - v[5] = rot12_512(v[5]); - v[6] = rot12_512(v[6]); - v[7] = rot12_512(v[7]); - v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = add_512(v[0], v[4]); - v[1] = add_512(v[1], v[5]); - v[2] = add_512(v[2], v[6]); - v[3] = add_512(v[3], v[7]); - v[12] = xor_512(v[12], v[0]); - v[13] = xor_512(v[13], v[1]); - v[14] = xor_512(v[14], v[2]); - v[15] = xor_512(v[15], v[3]); - v[12] = rot8_512(v[12]); - v[13] = rot8_512(v[13]); - v[14] = rot8_512(v[14]); - v[15] = rot8_512(v[15]); - v[8] = add_512(v[8], v[12]); - v[9] = add_512(v[9], v[13]); - v[10] = add_512(v[10], v[14]); - v[11] = add_512(v[11], v[15]); - v[4] = xor_512(v[4], v[8]); - v[5] = xor_512(v[5], v[9]); - v[6] = xor_512(v[6], v[10]); - v[7] = xor_512(v[7], v[11]); - v[4] = rot7_512(v[4]); - v[5] = rot7_512(v[5]); - v[6] = rot7_512(v[6]); - v[7] = rot7_512(v[7]); - - v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = add_512(v[0], v[5]); - v[1] = add_512(v[1], v[6]); - v[2] = add_512(v[2], v[7]); - v[3] = add_512(v[3], v[4]); - v[15] = xor_512(v[15], v[0]); - v[12] = xor_512(v[12], v[1]); - v[13] = xor_512(v[13], v[2]); - v[14] = xor_512(v[14], v[3]); - v[15] = rot16_512(v[15]); - v[12] = rot16_512(v[12]); - v[13] = rot16_512(v[13]); - v[14] = rot16_512(v[14]); - v[10] = add_512(v[10], v[15]); - v[11] = add_512(v[11], v[12]); - v[8] = add_512(v[8], v[13]); - v[9] = add_512(v[9], v[14]); - v[5] = xor_512(v[5], v[10]); - v[6] = xor_512(v[6], v[11]); - v[7] = xor_512(v[7], v[8]); - v[4] = 
xor_512(v[4], v[9]); - v[5] = rot12_512(v[5]); - v[6] = rot12_512(v[6]); - v[7] = rot12_512(v[7]); - v[4] = rot12_512(v[4]); - v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = add_512(v[0], v[5]); - v[1] = add_512(v[1], v[6]); - v[2] = add_512(v[2], v[7]); - v[3] = add_512(v[3], v[4]); - v[15] = xor_512(v[15], v[0]); - v[12] = xor_512(v[12], v[1]); - v[13] = xor_512(v[13], v[2]); - v[14] = xor_512(v[14], v[3]); - v[15] = rot8_512(v[15]); - v[12] = rot8_512(v[12]); - v[13] = rot8_512(v[13]); - v[14] = rot8_512(v[14]); - v[10] = add_512(v[10], v[15]); - v[11] = add_512(v[11], v[12]); - v[8] = add_512(v[8], v[13]); - v[9] = add_512(v[9], v[14]); - v[5] = xor_512(v[5], v[10]); - v[6] = xor_512(v[6], v[11]); - v[7] = xor_512(v[7], v[8]); - v[4] = xor_512(v[4], v[9]); - v[5] = rot7_512(v[5]); - v[6] = rot7_512(v[6]); - v[7] = rot7_512(v[7]); - v[4] = rot7_512(v[4]); -} - -// 0b10001000, or lanes a0/a2/b0/b2 in little-endian order -#define LO_IMM8 0x88 - -INLINE __m512i unpack_lo_128(__m512i a, __m512i b) { - return _mm512_shuffle_i32x4(a, b, LO_IMM8); -} - -// 0b11011101, or lanes a1/a3/b1/b3 in little-endian order -#define HI_IMM8 0xdd - -INLINE __m512i unpack_hi_128(__m512i a, __m512i b) { - return _mm512_shuffle_i32x4(a, b, HI_IMM8); -} - -INLINE void transpose_vecs_512(__m512i vecs[16]) { - // Interleave 32-bit lanes. The _0 unpack is lanes - // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes - // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15. 
- __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]); - __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]); - __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]); - __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]); - __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]); - __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]); - __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]); - __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]); - __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]); - __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]); - __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]); - __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]); - __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]); - __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]); - __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]); - __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]); - - // Interleave 64-bit lates. The _0 unpack is lanes - // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes - // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes - // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes - // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15. 
- __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0); - __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0); - __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2); - __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2); - __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0); - __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0); - __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2); - __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2); - __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0); - __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0); - __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2); - __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2); - __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0); - __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0); - __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2); - __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2); - - // Interleave 128-bit lanes. The _0 unpack is - // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is - // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on. - __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0); - __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1); - __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2); - __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3); - __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0); - __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1); - __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2); - __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3); - __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0); - __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1); - __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2); - __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3); - __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0); - __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1); - __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2); - __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3); - - // Interleave 128-bit lanes again for the final outputs. 
- vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0); - vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1); - vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2); - vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3); - vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4); - vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5); - vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6); - vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7); - vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0); - vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1); - vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2); - vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3); - vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4); - vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5); - vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6); - vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7); -} - -INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, - size_t block_offset, __m512i out[16]) { - out[0] = loadu_512(&inputs[0][block_offset]); - out[1] = loadu_512(&inputs[1][block_offset]); - out[2] = loadu_512(&inputs[2][block_offset]); - out[3] = loadu_512(&inputs[3][block_offset]); - out[4] = loadu_512(&inputs[4][block_offset]); - out[5] = loadu_512(&inputs[5][block_offset]); - out[6] = loadu_512(&inputs[6][block_offset]); - out[7] = loadu_512(&inputs[7][block_offset]); - out[8] = loadu_512(&inputs[8][block_offset]); - out[9] = loadu_512(&inputs[9][block_offset]); - out[10] = loadu_512(&inputs[10][block_offset]); - out[11] = loadu_512(&inputs[11][block_offset]); - out[12] = loadu_512(&inputs[12][block_offset]); - out[13] = loadu_512(&inputs[13][block_offset]); - out[14] = loadu_512(&inputs[14][block_offset]); - out[15] = loadu_512(&inputs[15][block_offset]); - for (size_t i = 0; i < 16; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs_512(out); -} - -INLINE void load_counters16(uint64_t counter, bool increment_counter, - __m512i *out_lo, __m512i *out_hi) { - const __m512i mask = 
_mm512_set1_epi32(-(int32_t)increment_counter); - const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - const __m512i add1 = _mm512_and_si512(mask, add0); - __m512i l = _mm512_add_epi32(_mm512_set1_epi32(counter), add1); - __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT); - __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32(counter >> 32), carry, _mm512_set1_epi32(counter >> 32), _mm512_set1_epi32(1)); - *out_lo = l; - *out_hi = h; -} - -void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, - uint8_t *out) { - __m512i h_vecs[8] = { - set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), - set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]), - }; - __m512i counter_low_vec, counter_high_vec; - load_counters16(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN); - __m512i block_flags_vec = set1_512(block_flags); - __m512i msg_vecs[16]; - transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m512i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn16(v, msg_vecs, 0); - round_fn16(v, msg_vecs, 1); - round_fn16(v, msg_vecs, 2); - round_fn16(v, msg_vecs, 3); - round_fn16(v, msg_vecs, 4); - round_fn16(v, msg_vecs, 5); - round_fn16(v, msg_vecs, 6); - h_vecs[0] = xor_512(v[0], v[8]); - h_vecs[1] = xor_512(v[1], v[9]); - h_vecs[2] = xor_512(v[2], v[10]); - h_vecs[3] = xor_512(v[3], v[11]); - 
h_vecs[4] = xor_512(v[4], v[12]); - h_vecs[5] = xor_512(v[5], v[13]); - h_vecs[6] = xor_512(v[6], v[14]); - h_vecs[7] = xor_512(v[7], v[15]); - - block_flags = flags; - } - - // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8 - // state vectors. Pad the matrix with zeros. After transposition, store the - // lower half of each vector. - __m512i padded[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_512(0), set1_512(0), set1_512(0), set1_512(0), - set1_512(0), set1_512(0), set1_512(0), set1_512(0), - }; - transpose_vecs_512(padded); - _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0])); - _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1])); - _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2])); - _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3])); - _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4])); - _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5])); - _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6])); - _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7])); - _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8])); - _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9])); - _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10])); - _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11])); - _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12])); - 
_mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13])); - _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14])); - _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15])); -} - -/* - * ---------------------------------------------------------------------------- - * hash_many_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; - } - blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - memcpy(out, cv, BLAKE3_OUT_LEN); -} - -void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs >= 16) { - blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += 16; - } - inputs += 16; - num_inputs -= 16; - out = &out[16 * BLAKE3_OUT_LEN]; - } - while (num_inputs >= 8) { - blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += 8; - } - inputs += 8; - num_inputs -= 8; - out = &out[8 * BLAKE3_OUT_LEN]; - } - while (num_inputs >= 4) { - blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if 
(increment_counter) { - counter += 4; - } - inputs += 4; - num_inputs -= 4; - out = &out[4 * BLAKE3_OUT_LEN]; - } - while (num_inputs > 0) { - hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start, - flags_end, out); - if (increment_counter) { - counter += 1; - } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } -} diff --git a/src/b3/blake3_avx512_x86-64_unix.S b/src/b3/blake3_avx512_x86-64_unix.S deleted file mode 100644 index 621e1aa6d..000000000 --- a/src/b3/blake3_avx512_x86-64_unix.S +++ /dev/null @@ -1,2572 +0,0 @@ -#ifdef __x86_64__ -.intel_syntax noprefix - -.global _blake3_hash_many_avx512 -.global blake3_hash_many_avx512 -.global blake3_compress_in_place_avx512 -.global _blake3_compress_in_place_avx512 -.global blake3_compress_xof_avx512 -.global _blake3_compress_xof_avx512 - -#ifdef __APPLE__ -.text -#else -.section .text -#endif -.p2align 6 -_blake3_hash_many_avx512: -blake3_hash_many_avx512: - push r15 - push r14 - push r13 - push r12 - push rbx - push rbp - mov rbp, rsp - sub rsp, 144 - and rsp, 0xFFFFFFFFFFFFFFC0 - neg r9 - kmovw k1, r9d - vmovd xmm0, r8d - vpbroadcastd ymm0, xmm0 - shr r8, 32 - vmovd xmm1, r8d - vpbroadcastd ymm1, xmm1 - vmovdqa ymm4, ymm1 - vmovdqa ymm5, ymm1 - vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] - vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] - vpcmpltud k2, ymm2, ymm0 - vpcmpltud k3, ymm3, ymm0 - vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} - vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} - knotw k2, k1 - vmovdqa32 ymm2 {k2}, ymm0 - vmovdqa32 ymm3 {k2}, ymm0 - vmovdqa32 ymm4 {k2}, ymm1 - vmovdqa32 ymm5 {k2}, ymm1 - vmovdqa ymmword ptr [rsp], ymm2 - vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3 - vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4 - vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5 - shl rdx, 6 - mov qword ptr [rsp+0x80], rdx - cmp rsi, 16 - jc 3f -2: - vpbroadcastd zmm0, dword ptr [rcx] - vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] - vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] - 
vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] - vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] - vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] - vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] - vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] - movzx eax, byte ptr [rbp+0x38] - movzx ebx, byte ptr [rbp+0x40] - or eax, ebx - xor edx, edx -.p2align 5 -9: - movzx ebx, byte ptr [rbp+0x48] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+0x80] - cmove eax, ebx - mov dword ptr [rsp+0x88], eax - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x40] - mov r13, qword ptr [rdi+0x48] - mov r14, qword ptr [rdi+0x50] - mov r15, qword ptr [rdi+0x58] - vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] - vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 - vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] - vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 - vpunpcklqdq zmm8, zmm16, zmm17 - vpunpckhqdq zmm9, zmm16, zmm17 - vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] - vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 - vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] - vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 - vpunpcklqdq zmm10, zmm18, zmm19 - vpunpckhqdq zmm11, zmm18, zmm19 - mov r8, qword ptr [rdi+0x20] - mov r9, qword ptr [rdi+0x28] - mov r10, qword ptr [rdi+0x30] - mov r11, qword ptr [rdi+0x38] - mov r12, qword ptr [rdi+0x60] - mov r13, qword ptr [rdi+0x68] - mov r14, qword ptr [rdi+0x70] - mov r15, qword ptr [rdi+0x78] - vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] - vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 - vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] - vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 - vpunpcklqdq zmm12, zmm16, zmm17 - vpunpckhqdq zmm13, zmm16, zmm17 - vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] - vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 - vmovdqu32 
ymm19, ymmword ptr [rdx+r11-0x2*0x20] - vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 - vpunpcklqdq zmm14, zmm18, zmm19 - vpunpckhqdq zmm15, zmm18, zmm19 - vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] - vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] - vshufps zmm16, zmm8, zmm10, 136 - vshufps zmm17, zmm12, zmm14, 136 - vmovdqa32 zmm20, zmm16 - vpermt2d zmm16, zmm27, zmm17 - vpermt2d zmm20, zmm31, zmm17 - vshufps zmm17, zmm8, zmm10, 221 - vshufps zmm30, zmm12, zmm14, 221 - vmovdqa32 zmm21, zmm17 - vpermt2d zmm17, zmm27, zmm30 - vpermt2d zmm21, zmm31, zmm30 - vshufps zmm18, zmm9, zmm11, 136 - vshufps zmm8, zmm13, zmm15, 136 - vmovdqa32 zmm22, zmm18 - vpermt2d zmm18, zmm27, zmm8 - vpermt2d zmm22, zmm31, zmm8 - vshufps zmm19, zmm9, zmm11, 221 - vshufps zmm8, zmm13, zmm15, 221 - vmovdqa32 zmm23, zmm19 - vpermt2d zmm19, zmm27, zmm8 - vpermt2d zmm23, zmm31, zmm8 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x40] - mov r13, qword ptr [rdi+0x48] - mov r14, qword ptr [rdi+0x50] - mov r15, qword ptr [rdi+0x58] - vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm8, zmm24, zmm25 - vpunpckhqdq zmm9, zmm24, zmm25 - vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm10, zmm24, zmm25 - vpunpckhqdq zmm11, zmm24, zmm25 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r12+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r13+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r14+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - prefetcht0 [r15+rdx+0x80] - mov r8, qword ptr [rdi+0x20] - mov r9, qword ptr 
[rdi+0x28] - mov r10, qword ptr [rdi+0x30] - mov r11, qword ptr [rdi+0x38] - mov r12, qword ptr [rdi+0x60] - mov r13, qword ptr [rdi+0x68] - mov r14, qword ptr [rdi+0x70] - mov r15, qword ptr [rdi+0x78] - vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm12, zmm24, zmm25 - vpunpckhqdq zmm13, zmm24, zmm25 - vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm14, zmm24, zmm25 - vpunpckhqdq zmm15, zmm24, zmm25 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r12+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r13+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r14+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - prefetcht0 [r15+rdx+0x80] - vshufps zmm24, zmm8, zmm10, 136 - vshufps zmm30, zmm12, zmm14, 136 - vmovdqa32 zmm28, zmm24 - vpermt2d zmm24, zmm27, zmm30 - vpermt2d zmm28, zmm31, zmm30 - vshufps zmm25, zmm8, zmm10, 221 - vshufps zmm30, zmm12, zmm14, 221 - vmovdqa32 zmm29, zmm25 - vpermt2d zmm25, zmm27, zmm30 - vpermt2d zmm29, zmm31, zmm30 - vshufps zmm26, zmm9, zmm11, 136 - vshufps zmm8, zmm13, zmm15, 136 - vmovdqa32 zmm30, zmm26 - vpermt2d zmm26, zmm27, zmm8 - vpermt2d zmm30, zmm31, zmm8 - vshufps zmm8, zmm9, zmm11, 221 - vshufps zmm10, zmm13, zmm15, 221 - vpermi2d zmm27, zmm8, zmm10 - vpermi2d zmm31, zmm8, zmm10 - vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] - vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] - vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] - vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] - vmovdqa32 zmm12, zmmword ptr [rsp] - vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] - vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] - vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] 
- vpaddd zmm0, zmm0, zmm16 - vpaddd zmm1, zmm1, zmm18 - vpaddd zmm2, zmm2, zmm20 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm17 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm21 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm24 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm28 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd 
zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm25 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm29 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm18 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm23 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm22 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm16 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - 
vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm17 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm25 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm27 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm30 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm19 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm29 - 
vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm20 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm18 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm22 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, zmm27 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord 
zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm21 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm31 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm26 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm30 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm23 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, zmm19 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord 
zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm20 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm21 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm16 - vpaddd zmm1, zmm1, zmm18 - vpaddd zmm2, zmm2, zmm24 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm28 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, zmm31 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd 
zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm29 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm26 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm23 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm16 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 
12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm18 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm17 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm25 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm24 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm30 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm28 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, 
zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm29 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm18 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm19 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm22 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm27 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm17 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, 
zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm31 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm25 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm30 - vpaddd zmm1, zmm1, zmm18 - vpaddd zmm2, zmm2, zmm19 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm26 - vpaddd 
zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm20 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpxord zmm0, zmm0, zmm8 - vpxord zmm1, zmm1, zmm9 - vpxord zmm2, zmm2, zmm10 - vpxord zmm3, zmm3, zmm11 - vpxord zmm4, zmm4, zmm12 - vpxord zmm5, zmm5, zmm13 - vpxord zmm6, zmm6, zmm14 - vpxord zmm7, zmm7, zmm15 - movzx eax, byte ptr [rbp+0x38] - jne 9b - mov rbx, qword ptr [rbp+0x50] - vpunpckldq zmm16, zmm0, zmm1 - vpunpckhdq zmm17, zmm0, zmm1 - vpunpckldq zmm18, zmm2, zmm3 - vpunpckhdq zmm19, zmm2, zmm3 - vpunpckldq zmm20, zmm4, zmm5 - vpunpckhdq zmm21, zmm4, zmm5 - vpunpckldq zmm22, zmm6, zmm7 - vpunpckhdq zmm23, zmm6, zmm7 - vpunpcklqdq zmm0, zmm16, zmm18 - vpunpckhqdq zmm1, zmm16, zmm18 - vpunpcklqdq zmm2, zmm17, zmm19 - vpunpckhqdq zmm3, zmm17, zmm19 - vpunpcklqdq zmm4, zmm20, zmm22 - vpunpckhqdq zmm5, zmm20, zmm22 - vpunpcklqdq zmm6, zmm21, zmm23 - vpunpckhqdq zmm7, zmm21, zmm23 - vshufi32x4 zmm16, zmm0, zmm4, 0x88 - vshufi32x4 zmm17, zmm1, zmm5, 0x88 - vshufi32x4 zmm18, zmm2, zmm6, 0x88 - vshufi32x4 zmm19, zmm3, zmm7, 0x88 - vshufi32x4 zmm20, zmm0, zmm4, 0xDD - vshufi32x4 zmm21, zmm1, zmm5, 0xDD - vshufi32x4 zmm22, zmm2, zmm6, 0xDD - vshufi32x4 zmm23, zmm3, zmm7, 0xDD - vshufi32x4 zmm0, zmm16, zmm17, 0x88 - vshufi32x4 zmm1, zmm18, zmm19, 0x88 - vshufi32x4 zmm2, zmm20, zmm21, 0x88 - vshufi32x4 zmm3, zmm22, zmm23, 0x88 - vshufi32x4 zmm4, zmm16, zmm17, 0xDD - vshufi32x4 
zmm5, zmm18, zmm19, 0xDD - vshufi32x4 zmm6, zmm20, zmm21, 0xDD - vshufi32x4 zmm7, zmm22, zmm23, 0xDD - vmovdqu32 zmmword ptr [rbx], zmm0 - vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 - vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 - vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 - vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 - vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 - vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 - vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 - vmovdqa32 zmm0, zmmword ptr [rsp] - vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] - vmovdqa32 zmm2, zmm0 - vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} - vpcmpltud k2, zmm2, zmm0 - vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} - vmovdqa32 zmmword ptr [rsp], zmm2 - vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 - add rdi, 128 - add rbx, 512 - mov qword ptr [rbp+0x50], rbx - sub rsi, 16 - cmp rsi, 16 - jnc 2b - test rsi, rsi - jnz 3f -4: - vzeroupper - mov rsp, rbp - pop rbp - pop rbx - pop r12 - pop r13 - pop r14 - pop r15 - ret -.p2align 6 -3: - test esi, 0x8 - je 3f - vpbroadcastd ymm0, dword ptr [rcx] - vpbroadcastd ymm1, dword ptr [rcx+0x4] - vpbroadcastd ymm2, dword ptr [rcx+0x8] - vpbroadcastd ymm3, dword ptr [rcx+0xC] - vpbroadcastd ymm4, dword ptr [rcx+0x10] - vpbroadcastd ymm5, dword ptr [rcx+0x14] - vpbroadcastd ymm6, dword ptr [rcx+0x18] - vpbroadcastd ymm7, dword ptr [rcx+0x1C] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x20] - mov r13, qword ptr [rdi+0x28] - mov r14, qword ptr [rdi+0x30] - mov r15, qword ptr [rdi+0x38] - movzx eax, byte ptr [rbp+0x38] - movzx ebx, byte ptr [rbp+0x40] - or eax, ebx - xor edx, edx -2: - movzx ebx, byte ptr [rbp+0x48] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+0x80] - cmove eax, ebx - mov dword ptr [rsp+0x88], eax - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x40] - 
vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x40] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x40] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm16, ymm12, ymm14, 136 - vshufps ymm17, ymm12, ymm14, 221 - vshufps ymm18, ymm13, ymm15, 136 - vshufps ymm19, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x30] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x30] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x30] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm20, ymm12, ymm14, 136 - vshufps ymm21, ymm12, ymm14, 221 - vshufps ymm22, ymm13, ymm15, 136 - vshufps ymm23, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x20] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x20] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x20] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm24, ymm12, ymm14, 136 - vshufps ymm25, ymm12, ymm14, 221 - vshufps ymm26, ymm13, ymm15, 136 - vshufps ymm27, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x10] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 - vmovups xmm9, 
xmmword ptr [r9+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x10] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x10] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm28, ymm12, ymm14, 136 - vshufps ymm29, ymm12, ymm14, 221 - vshufps ymm30, ymm13, ymm15, 136 - vshufps ymm31, ymm13, ymm15, 221 - vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] - vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] - vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] - vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] - vmovdqa ymm12, ymmword ptr [rsp] - vmovdqa ymm13, ymmword ptr [rsp+0x40] - vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] - vpbroadcastd ymm15, dword ptr [rsp+0x88] - vpaddd ymm0, ymm0, ymm16 - vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm20 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm17 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm21 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - 
vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm24 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm28 - vpaddd ymm3, ymm3, ymm30 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm25 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm29 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm18 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm23 - vpaddd ymm3, ymm3, ymm20 - 
vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm22 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm16 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm17 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm25 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord 
ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm27 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm30 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm19 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm29 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm20 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm18 - vpaddd ymm3, ymm3, ymm30 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord 
ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm22 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm27 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm21 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm31 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm26 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm30 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd 
ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm23 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm19 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm20 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm21 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 
- vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm16 - vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm24 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm28 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm31 - vpaddd ymm3, ymm3, ymm30 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm29 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm26 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, 
ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm23 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm16 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm18 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm17 - vpaddd ymm3, ymm3, ymm20 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm25 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm24 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, 
ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm30 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm28 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm29 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm18 - vpaddd ymm3, ymm3, ymm20 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm19 - vpaddd ymm1, ymm1, ymm26 - vpaddd 
ymm2, ymm2, ymm22 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm27 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm17 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm31 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm25 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, 
ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm30 - vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm19 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm26 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm20 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpxor ymm0, ymm0, ymm8 - vpxor ymm1, ymm1, ymm9 - vpxor ymm2, ymm2, ymm10 - vpxor ymm3, ymm3, ymm11 - vpxor ymm4, ymm4, ymm12 - vpxor ymm5, ymm5, ymm13 - vpxor ymm6, ymm6, ymm14 - vpxor ymm7, ymm7, ymm15 - movzx eax, byte ptr [rbp+0x38] - jne 2b - mov rbx, qword ptr [rbp+0x50] - vunpcklps ymm8, ymm0, ymm1 - vunpcklps ymm9, ymm2, ymm3 - vunpckhps ymm10, ymm0, ymm1 - 
vunpcklps ymm11, ymm4, ymm5 - vunpcklps ymm0, ymm6, ymm7 - vshufps ymm12, ymm8, ymm9, 78 - vblendps ymm1, ymm8, ymm12, 0xCC - vshufps ymm8, ymm11, ymm0, 78 - vunpckhps ymm13, ymm2, ymm3 - vblendps ymm2, ymm11, ymm8, 0xCC - vblendps ymm3, ymm12, ymm9, 0xCC - vperm2f128 ymm12, ymm1, ymm2, 0x20 - vmovups ymmword ptr [rbx], ymm12 - vunpckhps ymm14, ymm4, ymm5 - vblendps ymm4, ymm8, ymm0, 0xCC - vunpckhps ymm15, ymm6, ymm7 - vperm2f128 ymm7, ymm3, ymm4, 0x20 - vmovups ymmword ptr [rbx+0x20], ymm7 - vshufps ymm5, ymm10, ymm13, 78 - vblendps ymm6, ymm5, ymm13, 0xCC - vshufps ymm13, ymm14, ymm15, 78 - vblendps ymm10, ymm10, ymm5, 0xCC - vblendps ymm14, ymm14, ymm13, 0xCC - vperm2f128 ymm8, ymm10, ymm14, 0x20 - vmovups ymmword ptr [rbx+0x40], ymm8 - vblendps ymm15, ymm13, ymm15, 0xCC - vperm2f128 ymm13, ymm6, ymm15, 0x20 - vmovups ymmword ptr [rbx+0x60], ymm13 - vperm2f128 ymm9, ymm1, ymm2, 0x31 - vperm2f128 ymm11, ymm3, ymm4, 0x31 - vmovups ymmword ptr [rbx+0x80], ymm9 - vperm2f128 ymm14, ymm10, ymm14, 0x31 - vperm2f128 ymm15, ymm6, ymm15, 0x31 - vmovups ymmword ptr [rbx+0xA0], ymm11 - vmovups ymmword ptr [rbx+0xC0], ymm14 - vmovups ymmword ptr [rbx+0xE0], ymm15 - vmovdqa ymm0, ymmword ptr [rsp] - vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20] - vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] - vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] - vmovdqa ymmword ptr [rsp], ymm0 - vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2 - add rbx, 256 - mov qword ptr [rbp+0x50], rbx - add rdi, 64 - sub rsi, 8 -3: - mov rbx, qword ptr [rbp+0x50] - mov r15, qword ptr [rsp+0x80] - movzx r13, byte ptr [rbp+0x38] - movzx r12, byte ptr [rbp+0x48] - test esi, 0x4 - je 3f - vbroadcasti32x4 zmm0, xmmword ptr [rcx] - vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] - vmovdqa xmm12, xmmword ptr [rsp] - vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10] - vpunpckldq xmm14, xmm12, xmm13 - vpunpckhdq xmm15, xmm12, xmm13 - vpermq ymm14, ymm14, 0xDC - vpermq ymm15, ymm15, 0xDC - vpbroadcastd zmm12, dword ptr 
[BLAKE3_BLOCK_LEN+rip] - vinserti64x4 zmm13, zmm14, ymm15, 0x01 - mov eax, 17476 - kmovw k2, eax - vpblendmd zmm13 {k2}, zmm13, zmm12 - vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov eax, 43690 - kmovw k3, eax - mov eax, 34952 - kmovw k4, eax - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x88], eax - vmovdqa32 zmm2, zmm15 - vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] - vpblendmd zmm3 {k4}, zmm13, zmm8 - vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] - vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 - vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 - vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 - vmovups zmm9, zmmword ptr [r8+rdx-0x30] - vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 - vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 - vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 - vshufps zmm4, zmm8, zmm9, 136 - vshufps zmm5, zmm8, zmm9, 221 - vmovups zmm8, zmmword ptr [r8+rdx-0x20] - vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 - vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 - vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 - vmovups zmm9, zmmword ptr [r8+rdx-0x10] - vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 - vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 - vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 - vshufps zmm6, zmm8, zmm9, 136 - vshufps zmm7, zmm8, zmm9, 221 - vpshufd zmm6, zmm6, 0x93 - vpshufd zmm7, zmm7, 0x93 - mov al, 7 -9: - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 16 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 12 - vpaddd zmm0, 
zmm0, zmm5 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 8 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 7 - vpshufd zmm0, zmm0, 0x93 - vpshufd zmm3, zmm3, 0x4E - vpshufd zmm2, zmm2, 0x39 - vpaddd zmm0, zmm0, zmm6 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 16 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 12 - vpaddd zmm0, zmm0, zmm7 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 8 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 7 - vpshufd zmm0, zmm0, 0x39 - vpshufd zmm3, zmm3, 0x4E - vpshufd zmm2, zmm2, 0x93 - dec al - jz 9f - vshufps zmm8, zmm4, zmm5, 214 - vpshufd zmm9, zmm4, 0x0F - vpshufd zmm4, zmm8, 0x39 - vshufps zmm8, zmm6, zmm7, 250 - vpblendmd zmm9 {k3}, zmm9, zmm8 - vpunpcklqdq zmm8, zmm7, zmm5 - vpblendmd zmm8 {k4}, zmm8, zmm6 - vpshufd zmm8, zmm8, 0x78 - vpunpckhdq zmm5, zmm5, zmm7 - vpunpckldq zmm6, zmm6, zmm5 - vpshufd zmm7, zmm6, 0x1E - vmovdqa32 zmm5, zmm9 - vmovdqa32 zmm6, zmm8 - jmp 9b -9: - vpxord zmm0, zmm0, zmm2 - vpxord zmm1, zmm1, zmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 - vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 - vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 - vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 - vmovdqa xmm0, xmmword ptr [rsp] - vmovdqa xmm2, xmmword ptr [rsp+0x40] - vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] - vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] - vmovdqa xmmword ptr [rsp], xmm0 - vmovdqa xmmword ptr [rsp+0x40], xmm2 - add rbx, 128 - add rdi, 32 - sub rsi, 4 -3: - test esi, 0x2 - je 3f - vbroadcasti128 ymm0, xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] - vmovd xmm13, dword ptr 
[rsp] - vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 - vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovd xmm14, dword ptr [rsp+0x4] - vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vinserti128 ymm13, ymm13, xmm14, 0x01 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x88], eax - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] - vpbroadcastd ymm8, dword ptr [rsp+0x88] - vpblendd ymm3, ymm13, ymm8, 0x88 - vmovups ymm8, ymmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 - vshufps ymm4, ymm8, ymm9, 136 - vshufps ymm5, ymm8, ymm9, 221 - vmovups ymm8, ymmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 - vshufps ymm6, ymm8, ymm9, 136 - vshufps ymm7, ymm8, ymm9, 221 - vpshufd ymm6, ymm6, 0x93 - vpshufd ymm7, ymm7, 0x93 - mov al, 7 -9: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 16 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 12 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 8 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 7 - vpshufd ymm0, ymm0, 0x93 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x39 - vpaddd ymm0, ymm0, ymm6 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 16 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 12 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 8 - vpaddd ymm2, 
ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 7 - vpshufd ymm0, ymm0, 0x39 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x93 - dec al - jz 9f - vshufps ymm8, ymm4, ymm5, 214 - vpshufd ymm9, ymm4, 0x0F - vpshufd ymm4, ymm8, 0x39 - vshufps ymm8, ymm6, ymm7, 250 - vpblendd ymm9, ymm9, ymm8, 0xAA - vpunpcklqdq ymm8, ymm7, ymm5 - vpblendd ymm8, ymm8, ymm6, 0x88 - vpshufd ymm8, ymm8, 0x78 - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 0x1E - vmovdqa ymm5, ymm9 - vmovdqa ymm6, ymm8 - jmp 9b -9: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vmovdqa xmm0, xmmword ptr [rsp] - vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10] - vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] - vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] - vmovdqa xmmword ptr [rsp], xmm0 - vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2 - add rbx, 64 - add rdi, 16 - sub rsi, 2 -3: - test esi, 0x1 - je 4b - vmovdqu xmm0, xmmword ptr [rcx] - vmovdqu xmm1, xmmword ptr [rcx+0x10] - vmovd xmm14, dword ptr [rsp] - vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - vpinsrd xmm3, xmm14, eax, 3 - vmovdqa xmm2, xmm15 - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vmovups xmm9, xmmword ptr [r8+rdx-0x30] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vmovups xmm9, xmmword ptr [r8+rdx-0x10] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, 
xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - jmp 4b -.p2align 6 -_blake3_compress_in_place_avx512: -blake3_compress_in_place_avx512: - vmovdqu xmm0, xmmword ptr [rdi] - vmovdqu xmm1, xmmword ptr [rdi+0x10] - movzx eax, r8b - movzx edx, dl - shl rax, 32 - add rdx, rax - vmovq xmm3, rcx - vmovq xmm4, rdx - vpunpcklqdq xmm3, xmm3, xmm4 - vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] - vmovups xmm8, xmmword ptr [rsi] - vmovups xmm9, xmmword ptr [rsi+0x10] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [rsi+0x20] - vmovups xmm9, xmmword ptr [rsi+0x30] - vshufps xmm6, xmm8, xmm9, 
136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - vmovdqu xmmword ptr [rdi], xmm0 - vmovdqu xmmword ptr [rdi+0x10], xmm1 - ret - -.p2align 6 -_blake3_compress_xof_avx512: -blake3_compress_xof_avx512: - vmovdqu xmm0, xmmword ptr [rdi] - vmovdqu xmm1, xmmword ptr [rdi+0x10] - movzx eax, r8b - movzx edx, dl - shl rax, 32 - add rdx, rax - vmovq xmm3, rcx - vmovq xmm4, rdx - vpunpcklqdq xmm3, xmm3, xmm4 - vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] - vmovups xmm8, xmmword ptr [rsi] - vmovups xmm9, xmmword ptr [rsi+0x10] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr 
[rsi+0x20] - vmovups xmm9, xmmword ptr [rsi+0x30] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - vpxor xmm2, xmm2, [rdi] - vpxor xmm3, xmm3, [rdi+0x10] - vmovdqu xmmword ptr [r9], xmm0 - vmovdqu xmmword ptr [r9+0x10], xmm1 - vmovdqu xmmword ptr [r9+0x20], xmm2 - vmovdqu xmmword ptr [r9+0x30], xmm3 - ret - -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif -.p2align 6 -INDEX0: - .long 0, 1, 2, 3, 16, 17, 18, 19 - .long 8, 9, 10, 11, 24, 25, 26, 27 -INDEX1: - .long 4, 5, 6, 7, 20, 21, 22, 23 - .long 12, 13, 14, 15, 28, 29, 30, 31 -ADD0: - .long 0, 1, 2, 3, 4, 5, 6, 7 - 
.long 8, 9, 10, 11, 12, 13, 14, 15 -ADD1: .long 1 - -ADD16: .long 16 -BLAKE3_BLOCK_LEN: - .long 64 -.p2align 6 -BLAKE3_IV: -BLAKE3_IV_0: - .long 0x6A09E667 -BLAKE3_IV_1: - .long 0xBB67AE85 -BLAKE3_IV_2: - .long 0x3C6EF372 -BLAKE3_IV_3: - .long 0xA54FF53A - -#endif // __x86_64__ diff --git a/src/b3/blake3_dispatch.c b/src/b3/blake3_dispatch.c deleted file mode 100644 index 684772564..000000000 --- a/src/b3/blake3_dispatch.c +++ /dev/null @@ -1,245 +0,0 @@ -#include -#include -#include - -#include "blake3_impl.h" - -#if defined(IS_X86) -#if defined(_MSC_VER) -#include -#elif defined(__GNUC__) -#include -#else -#error "Unimplemented!" -#endif -#endif - -#if defined(IS_X86) -static uint64_t xgetbv() { -#if defined(_MSC_VER) - return _xgetbv(0); -#else - uint32_t eax = 0, edx = 0; - __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); - return ((uint64_t)edx << 32) | eax; -#endif -} - -static void cpuid(uint32_t out[4], uint32_t id) { -#if defined(_MSC_VER) - __cpuid((int *)out, id); -#elif defined(__i386__) || defined(_M_IX86) - __asm__ __volatile__("movl %%ebx, %1\n" - "cpuid\n" - "xchgl %1, %%ebx\n" - : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id)); -#else - __asm__ __volatile__("cpuid\n" - : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id)); -#endif -} - -static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { -#if defined(_MSC_VER) - __cpuidex((int *)out, id, sid); -#elif defined(__i386__) || defined(_M_IX86) - __asm__ __volatile__("movl %%ebx, %1\n" - "cpuid\n" - "xchgl %1, %%ebx\n" - : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id), "c"(sid)); -#else - __asm__ __volatile__("cpuid\n" - : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id), "c"(sid)); -#endif -} - -#endif - -enum cpu_feature { - SSE2 = 1 << 0, - SSSE3 = 1 << 1, - SSE41 = 1 << 2, - AVX = 1 << 3, - AVX2 = 1 << 4, - AVX512F = 1 << 5, - AVX512VL = 1 << 6, - /* ... 
*/ - UNDEFINED = 1 << 30 -}; - -#if !defined(BLAKE3_TESTING) -static /* Allow the variable to be controlled manually for testing */ -#endif - enum cpu_feature g_cpu_features = UNDEFINED; - -#if !defined(BLAKE3_TESTING) -static -#endif - enum cpu_feature - get_cpu_features() { - - if (g_cpu_features != UNDEFINED) { - return g_cpu_features; - } else { -#if defined(IS_X86) - uint32_t regs[4] = {0}; - uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; - (void)edx; - enum cpu_feature features = 0; - cpuid(regs, 0); - const int max_id = *eax; - cpuid(regs, 1); -#if defined(__amd64__) || defined(_M_X64) - features |= SSE2; -#else - if (*edx & (1UL << 26)) - features |= SSE2; -#endif - if (*ecx & (1UL << 0)) - features |= SSSE3; - if (*ecx & (1UL << 19)) - features |= SSE41; - - if (*ecx & (1UL << 27)) { // OSXSAVE - const uint64_t mask = xgetbv(); - if ((mask & 6) == 6) { // SSE and AVX states - if (*ecx & (1UL << 28)) - features |= AVX; - if (max_id >= 7) { - cpuidex(regs, 7, 0); - if (*ebx & (1UL << 5)) - features |= AVX2; - if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm - if (*ebx & (1UL << 31)) - features |= AVX512VL; - if (*ebx & (1UL << 16)) - features |= AVX512F; - } - } - } - } - g_cpu_features = features; - return features; -#else - /* How to detect NEON? 
*/ - return 0; -#endif - } -} - -void blake3_compress_in_place(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { -#if defined(IS_X86) - const enum cpu_feature features = get_cpu_features(); -#if !defined(BLAKE3_NO_AVX512) - if (features & AVX512VL) { - blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); - return; - } -#endif -#if !defined(BLAKE3_NO_SSE41) - if (features & SSE41) { - blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); - return; - } -#endif -#endif - blake3_compress_in_place_portable(cv, block, block_len, counter, flags); -} - -void blake3_compress_xof(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags, - uint8_t out[64]) { -#if defined(IS_X86) - const enum cpu_feature features = get_cpu_features(); -#if !defined(BLAKE3_NO_AVX512) - if (features & AVX512VL) { - blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); - return; - } -#endif -#if !defined(BLAKE3_NO_SSE41) - if (features & SSE41) { - blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); - return; - } -#endif -#endif - blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); -} - -void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { -#if defined(IS_X86) - const enum cpu_feature features = get_cpu_features(); -#if !defined(BLAKE3_NO_AVX512) - if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { - blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); - return; - } -#endif -#if !defined(BLAKE3_NO_AVX2) - if (features & AVX2) { - blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, 
flags_end, - out); - return; - } -#endif -#if !defined(BLAKE3_NO_SSE41) - if (features & SSE41) { - blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); - return; - } -#endif -#endif - -#if defined(BLAKE3_USE_NEON) - blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); - return; -#endif - - blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); -} - -// The dynamically detected SIMD degree of the current platform. -size_t blake3_simd_degree(void) { -#if defined(IS_X86) - const enum cpu_feature features = get_cpu_features(); -#if !defined(BLAKE3_NO_AVX512) - if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { - return 16; - } -#endif -#if !defined(BLAKE3_NO_AVX2) - if (features & AVX2) { - return 8; - } -#endif -#if !defined(BLAKE3_NO_SSE41) - if (features & SSE41) { - return 4; - } -#endif -#endif -#if defined(BLAKE3_USE_NEON) - return 4; -#endif - return 1; -} diff --git a/src/b3/blake3_impl.h b/src/b3/blake3_impl.h deleted file mode 100644 index c384671f0..000000000 --- a/src/b3/blake3_impl.h +++ /dev/null @@ -1,235 +0,0 @@ -#ifndef BLAKE3_IMPL_H -#define BLAKE3_IMPL_H - -#include -#include -#include -#include -#include - -#include "blake3.h" - -// internal flags -enum blake3_flags { - CHUNK_START = 1 << 0, - CHUNK_END = 1 << 1, - PARENT = 1 << 2, - ROOT = 1 << 3, - KEYED_HASH = 1 << 4, - DERIVE_KEY_CONTEXT = 1 << 5, - DERIVE_KEY_MATERIAL = 1 << 6, -}; - -// This C implementation tries to support recent versions of GCC, Clang, and -// MSVC. 
-#if defined(_MSC_VER) -#define INLINE static __forceinline -#else -#define INLINE static inline __attribute__((always_inline)) -#endif - -#if defined(__x86_64__) || defined(_M_X64) -#define IS_X86 -#define IS_X86_64 -#endif - -#if defined(__i386__) || defined(_M_IX86) -#define IS_X86 -#define IS_X86_32 -#endif - -#if defined(IS_X86) -#if defined(_MSC_VER) -#include -#endif -#include -#endif - -#if defined(IS_X86) -#define MAX_SIMD_DEGREE 16 -#elif defined(BLAKE3_USE_NEON) -#define MAX_SIMD_DEGREE 4 -#else -#define MAX_SIMD_DEGREE 1 -#endif - -// There are some places where we want a static size that's equal to the -// MAX_SIMD_DEGREE, but also at least 2. -#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2) - -static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, - 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, - 0x1F83D9ABUL, 0x5BE0CD19UL}; - -static const uint8_t MSG_SCHEDULE[7][16] = { - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, - {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, - {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, - {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, - {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, - {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, -}; - -/* Find index of the highest set bit */ -/* x is assumed to be nonzero. 
*/ -static unsigned int highest_one(uint64_t x) { -#if defined(__GNUC__) || defined(__clang__) - return 63 ^ __builtin_clzll(x); -#elif defined(_MSC_VER) && defined(IS_X86_64) - unsigned long index; - _BitScanReverse64(&index, x); - return index; -#elif defined(_MSC_VER) && defined(IS_X86_32) - if(x >> 32) { - unsigned long index; - _BitScanReverse(&index, x >> 32); - return 32 + index; - } else { - unsigned long index; - _BitScanReverse(&index, x); - return index; - } -#else - unsigned int c = 0; - if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } - if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } - if(x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } - if(x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } - if(x & 0x000000000000000cULL) { x >>= 2; c += 2; } - if(x & 0x0000000000000002ULL) { c += 1; } - return c; -#endif -} - -// Count the number of 1 bits. -INLINE unsigned int popcnt(uint64_t x) { -#if defined(__GNUC__) || defined(__clang__) - return __builtin_popcountll(x); -#else - unsigned int count = 0; - while (x != 0) { - count += 1; - x &= x - 1; - } - return count; -#endif -} - -// Largest power of two less than or equal to x. As a special case, returns 1 -// when x is 0. 
-INLINE uint64_t round_down_to_power_of_2(uint64_t x) { - return 1ULL << highest_one(x | 1); -} - -INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } - -INLINE uint32_t counter_high(uint64_t counter) { - return (uint32_t)(counter >> 32); -} - -INLINE uint32_t load32(const void *src) { - const uint8_t *p = (const uint8_t *)src; - return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | - ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); -} - -INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], - uint32_t key_words[8]) { - key_words[0] = load32(&key[0 * 4]); - key_words[1] = load32(&key[1 * 4]); - key_words[2] = load32(&key[2 * 4]); - key_words[3] = load32(&key[3 * 4]); - key_words[4] = load32(&key[4 * 4]); - key_words[5] = load32(&key[5 * 4]); - key_words[6] = load32(&key[6 * 4]); - key_words[7] = load32(&key[7 * 4]); -} - -void blake3_compress_in_place(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); - -void blake3_compress_xof(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags, - uint8_t out[64]); - -void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out); - -size_t blake3_simd_degree(void); - - -// Declarations for implementation-specific functions. 
-void blake3_compress_in_place_portable(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); - -void blake3_compress_xof_portable(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); - -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); - -#if defined(IS_X86) -#if !defined(BLAKE3_NO_SSE41) -void blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); -void blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); -void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif -#if !defined(BLAKE3_NO_AVX2) -void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif -#if !defined(BLAKE3_NO_AVX512) -void blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); - -void blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); - -void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, 
uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif -#endif - -#if defined(BLAKE3_USE_NEON) -void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif - - -#endif /* BLAKE3_IMPL_H */ diff --git a/src/b3/blake3_portable.c b/src/b3/blake3_portable.c deleted file mode 100644 index 9ee2f4a42..000000000 --- a/src/b3/blake3_portable.c +++ /dev/null @@ -1,168 +0,0 @@ -#include "blake3_impl.h" -#include - -INLINE void store32(void *dst, uint32_t w) { - uint8_t *p = (uint8_t *)dst; - p[0] = (uint8_t)(w >> 0); - p[1] = (uint8_t)(w >> 8); - p[2] = (uint8_t)(w >> 16); - p[3] = (uint8_t)(w >> 24); -} - -INLINE uint32_t rotr32(uint32_t w, uint32_t c) { - return (w >> c) | (w << (32 - c)); -} - -INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, - uint32_t x, uint32_t y) { - state[a] = state[a] + state[b] + x; - state[d] = rotr32(state[d] ^ state[a], 16); - state[c] = state[c] + state[d]; - state[b] = rotr32(state[b] ^ state[c], 12); - state[a] = state[a] + state[b] + y; - state[d] = rotr32(state[d] ^ state[a], 8); - state[c] = state[c] + state[d]; - state[b] = rotr32(state[b] ^ state[c], 7); -} - -INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) { - // Select the message schedule based on the round. - const uint8_t *schedule = MSG_SCHEDULE[round]; - - // Mix the columns. - g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); - g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); - g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); - g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); - - // Mix the rows. 
- g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); - g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); - g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); - g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); -} - -INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) { - uint32_t block_words[16]; - block_words[0] = load32(block + 4 * 0); - block_words[1] = load32(block + 4 * 1); - block_words[2] = load32(block + 4 * 2); - block_words[3] = load32(block + 4 * 3); - block_words[4] = load32(block + 4 * 4); - block_words[5] = load32(block + 4 * 5); - block_words[6] = load32(block + 4 * 6); - block_words[7] = load32(block + 4 * 7); - block_words[8] = load32(block + 4 * 8); - block_words[9] = load32(block + 4 * 9); - block_words[10] = load32(block + 4 * 10); - block_words[11] = load32(block + 4 * 11); - block_words[12] = load32(block + 4 * 12); - block_words[13] = load32(block + 4 * 13); - block_words[14] = load32(block + 4 * 14); - block_words[15] = load32(block + 4 * 15); - - state[0] = cv[0]; - state[1] = cv[1]; - state[2] = cv[2]; - state[3] = cv[3]; - state[4] = cv[4]; - state[5] = cv[5]; - state[6] = cv[6]; - state[7] = cv[7]; - state[8] = IV[0]; - state[9] = IV[1]; - state[10] = IV[2]; - state[11] = IV[3]; - state[12] = counter_low(counter); - state[13] = counter_high(counter); - state[14] = (uint32_t)block_len; - state[15] = (uint32_t)flags; - - round_fn(state, &block_words[0], 0); - round_fn(state, &block_words[0], 1); - round_fn(state, &block_words[0], 2); - round_fn(state, &block_words[0], 3); - round_fn(state, &block_words[0], 4); - round_fn(state, &block_words[0], 5); - round_fn(state, &block_words[0], 6); -} - -void blake3_compress_in_place_portable(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - uint32_t state[16]; - compress_pre(state, cv, 
block, block_len, counter, flags); - cv[0] = state[0] ^ state[8]; - cv[1] = state[1] ^ state[9]; - cv[2] = state[2] ^ state[10]; - cv[3] = state[3] ^ state[11]; - cv[4] = state[4] ^ state[12]; - cv[5] = state[5] ^ state[13]; - cv[6] = state[6] ^ state[14]; - cv[7] = state[7] ^ state[15]; -} - -void blake3_compress_xof_portable(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { - uint32_t state[16]; - compress_pre(state, cv, block, block_len, counter, flags); - - store32(&out[0 * 4], state[0] ^ state[8]); - store32(&out[1 * 4], state[1] ^ state[9]); - store32(&out[2 * 4], state[2] ^ state[10]); - store32(&out[3 * 4], state[3] ^ state[11]); - store32(&out[4 * 4], state[4] ^ state[12]); - store32(&out[5 * 4], state[5] ^ state[13]); - store32(&out[6 * 4], state[6] ^ state[14]); - store32(&out[7 * 4], state[7] ^ state[15]); - store32(&out[8 * 4], state[8] ^ cv[0]); - store32(&out[9 * 4], state[9] ^ cv[1]); - store32(&out[10 * 4], state[10] ^ cv[2]); - store32(&out[11 * 4], state[11] ^ cv[3]); - store32(&out[12 * 4], state[12] ^ cv[4]); - store32(&out[13 * 4], state[13] ^ cv[5]); - store32(&out[14 * 4], state[14] ^ cv[6]); - store32(&out[15 * 4], state[15] ^ cv[7]); -} - -INLINE void hash_one_portable(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; - } - blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - memcpy(out, cv, 32); -} - -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool 
increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs > 0) { - hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, - flags_end, out); - if (increment_counter) { - counter += 1; - } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } -} diff --git a/src/b3/blake3_sse41.c b/src/b3/blake3_sse41.c deleted file mode 100644 index b31122533..000000000 --- a/src/b3/blake3_sse41.c +++ /dev/null @@ -1,559 +0,0 @@ -#include "blake3_impl.h" - -#include - -#define DEGREE 4 - -#define _mm_shuffle_ps2(a, b, c) \ - (_mm_castps_si128( \ - _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) - -INLINE __m128i loadu(const uint8_t src[16]) { - return _mm_loadu_si128((const __m128i *)src); -} - -INLINE void storeu(__m128i src, uint8_t dest[16]) { - _mm_storeu_si128((__m128i *)dest, src); -} - -INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } - -// Note that clang-format doesn't like the name "xor" for some reason. 
-INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } - -INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } - -INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); -} - -INLINE __m128i rot16(__m128i x) { - return _mm_shuffle_epi8( - x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); -} - -INLINE __m128i rot12(__m128i x) { - return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); -} - -INLINE __m128i rot8(__m128i x) { - return _mm_shuffle_epi8( - x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); -} - -INLINE __m128i rot7(__m128i x) { - return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); -} - -INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = addv(addv(*row0, m), *row1); - *row3 = xorv(*row3, *row0); - *row3 = rot16(*row3); - *row2 = addv(*row2, *row3); - *row1 = xorv(*row1, *row2); - *row1 = rot12(*row1); -} - -INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = addv(addv(*row0, m), *row1); - *row3 = xorv(*row3, *row0); - *row3 = rot8(*row3); - *row2 = addv(*row2, *row3); - *row1 = xorv(*row1, *row2); - *row1 = rot7(*row1); -} - -// Note the optimization here of leaving row1 as the unrotated row, rather than -// row0. All the message loads below are adjusted to compensate for this. 
See -// discussion at https://github.com/sneves/blake2-avx2/pull/4 -INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); -} - -INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); -} - -INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) { - rows[0] = loadu((uint8_t *)&cv[0]); - rows[1] = loadu((uint8_t *)&cv[4]); - rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); - rows[3] = set4(counter_low(counter), counter_high(counter), - (uint32_t)block_len, (uint32_t)flags); - - __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); - __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); - __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); - __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); - - __m128i t0, t1, t2, t3, tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. 
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, 
_MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - 
undiagonalize(&rows[0], &rows[2], &rows[3]); -} - -void blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); - storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); -} - -void blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu(xorv(rows[0], rows[2]), &out[0]); - storeu(xorv(rows[1], rows[3]), &out[16]); - storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); - storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); -} - -INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = addv(v[0], v[4]); - 
v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = addv(v[10], 
v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -INLINE void transpose_vecs(__m128i vecs[DEGREE]) { - // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. - __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. - __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; -} - -INLINE void transpose_msg_vecs(const uint8_t *const *inputs, - size_t block_offset, __m128i out[16]) { - out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); - out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); - out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); - out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); - out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); - out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); - out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); - out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); - out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); - out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); - out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); - out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); - out[12] = 
loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); - out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); - out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); - out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); - for (size_t i = 0; i < 4; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs(&out[0]); - transpose_vecs(&out[4]); - transpose_vecs(&out[8]); - transpose_vecs(&out[12]); -} - -INLINE void load_counters(uint64_t counter, bool increment_counter, - __m128i *out_lo, __m128i *out_hi) { - const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); - const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); - const __m128i add1 = _mm_and_si128(mask, add0); - __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1); - __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), - _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); - __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry); - *out_lo = l; - *out_hi = h; -} - -void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m128i h_vecs[8] = { - set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), - set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), - }; - __m128i counter_low_vec, counter_high_vec; - load_counters(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); - __m128i block_flags_vec = set1(block_flags); - __m128i msg_vecs[16]; - transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m128i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - 
set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn(v, msg_vecs, 0); - round_fn(v, msg_vecs, 1); - round_fn(v, msg_vecs, 2); - round_fn(v, msg_vecs, 3); - round_fn(v, msg_vecs, 4); - round_fn(v, msg_vecs, 5); - round_fn(v, msg_vecs, 6); - h_vecs[0] = xorv(v[0], v[8]); - h_vecs[1] = xorv(v[1], v[9]); - h_vecs[2] = xorv(v[2], v[10]); - h_vecs[3] = xorv(v[3], v[11]); - h_vecs[4] = xorv(v[4], v[12]); - h_vecs[5] = xorv(v[5], v[13]); - h_vecs[6] = xorv(v[6], v[14]); - h_vecs[7] = xorv(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs(&h_vecs[0]); - transpose_vecs(&h_vecs[4]); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. - storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); - storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); - storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); - storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); - storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); - storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); - storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); - storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); -} - -INLINE void hash_one_sse41(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; - } - blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - memcpy(out, cv, BLAKE3_OUT_LEN); -} - -void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, 
uint8_t *out) { - while (num_inputs >= DEGREE) { - blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += DEGREE; - } - inputs += DEGREE; - num_inputs -= DEGREE; - out = &out[DEGREE * BLAKE3_OUT_LEN]; - } - while (num_inputs > 0) { - hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start, - flags_end, out); - if (increment_counter) { - counter += 1; - } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } -} diff --git a/src/b3/blake3_sse41_x86-64_unix.S b/src/b3/blake3_sse41_x86-64_unix.S deleted file mode 100644 index 024a8290f..000000000 --- a/src/b3/blake3_sse41_x86-64_unix.S +++ /dev/null @@ -1,2014 +0,0 @@ -#ifdef __x86_64__ -.intel_syntax noprefix -.global blake3_hash_many_sse41 -.global _blake3_hash_many_sse41 -.global blake3_compress_in_place_sse41 -.global _blake3_compress_in_place_sse41 -.global blake3_compress_xof_sse41 -.global _blake3_compress_xof_sse41 -#ifdef __APPLE__ -.text -#else -.section .text -#endif - .p2align 6 -_blake3_hash_many_sse41: -blake3_hash_many_sse41: - push r15 - push r14 - push r13 - push r12 - push rbx - push rbp - mov rbp, rsp - sub rsp, 360 - and rsp, 0xFFFFFFFFFFFFFFC0 - neg r9d - movd xmm0, r9d - pshufd xmm0, xmm0, 0x00 - movdqa xmmword ptr [rsp+0x130], xmm0 - movdqa xmm1, xmm0 - pand xmm1, xmmword ptr [ADD0+rip] - pand xmm0, xmmword ptr [ADD1+rip] - movdqa xmmword ptr [rsp+0x150], xmm0 - movd xmm0, r8d - pshufd xmm0, xmm0, 0x00 - paddd xmm0, xmm1 - movdqa xmmword ptr [rsp+0x110], xmm0 - pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] - pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] - pcmpgtd xmm1, xmm0 - shr r8, 32 - movd xmm2, r8d - pshufd xmm2, xmm2, 0x00 - psubd xmm2, xmm1 - movdqa xmmword ptr [rsp+0x120], xmm2 - mov rbx, qword ptr [rbp+0x50] - mov r15, rdx - shl r15, 6 - movzx r13d, byte ptr [rbp+0x38] - movzx r12d, byte ptr [rbp+0x48] - cmp rsi, 4 - jc 3f -2: - movdqu xmm3, xmmword ptr [rcx] - pshufd xmm0, xmm3, 
0x00 - pshufd xmm1, xmm3, 0x55 - pshufd xmm2, xmm3, 0xAA - pshufd xmm3, xmm3, 0xFF - movdqu xmm7, xmmword ptr [rcx+0x10] - pshufd xmm4, xmm7, 0x00 - pshufd xmm5, xmm7, 0x55 - pshufd xmm6, xmm7, 0xAA - pshufd xmm7, xmm7, 0xFF - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -9: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movdqu xmm8, xmmword ptr [r8+rdx-0x40] - movdqu xmm9, xmmword ptr [r9+rdx-0x40] - movdqu xmm10, xmmword ptr [r10+rdx-0x40] - movdqu xmm11, xmmword ptr [r11+rdx-0x40] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp], xmm8 - movdqa xmmword ptr [rsp+0x10], xmm9 - movdqa xmmword ptr [rsp+0x20], xmm12 - movdqa xmmword ptr [rsp+0x30], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x30] - movdqu xmm9, xmmword ptr [r9+rdx-0x30] - movdqu xmm10, xmmword ptr [r10+rdx-0x30] - movdqu xmm11, xmmword ptr [r11+rdx-0x30] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0x40], xmm8 - movdqa xmmword ptr [rsp+0x50], xmm9 - movdqa xmmword ptr [rsp+0x60], xmm12 - movdqa xmmword ptr [rsp+0x70], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x20] - movdqu xmm9, xmmword ptr [r9+rdx-0x20] - movdqu xmm10, xmmword ptr [r10+rdx-0x20] - movdqu xmm11, xmmword ptr [r11+rdx-0x20] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, 
xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0x80], xmm8 - movdqa xmmword ptr [rsp+0x90], xmm9 - movdqa xmmword ptr [rsp+0xA0], xmm12 - movdqa xmmword ptr [rsp+0xB0], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x10] - movdqu xmm9, xmmword ptr [r9+rdx-0x10] - movdqu xmm10, xmmword ptr [r10+rdx-0x10] - movdqu xmm11, xmmword ptr [r11+rdx-0x10] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0xC0], xmm8 - movdqa xmmword ptr [rsp+0xD0], xmm9 - movdqa xmmword ptr [rsp+0xE0], xmm12 - movdqa xmmword ptr [rsp+0xF0], xmm13 - movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] - movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] - movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] - movdqa xmm12, xmmword ptr [rsp+0x110] - movdqa xmm13, xmmword ptr [rsp+0x120] - movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] - movd xmm15, eax - pshufd xmm15, xmm15, 0x00 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - paddd xmm0, xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x40] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - 
movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x10] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x50] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x80] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0xC0] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 
- por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x90] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0xD0] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x20] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x70] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 
12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x60] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x10] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x90] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xB0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, 
xmmword ptr [rsp+0xE0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x30] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0xD0] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x40] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x20] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - 
paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x60] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0xB0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x50] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0xF0] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr 
[ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xA0] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0xE0] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x70] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0x30] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr 
[rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x40] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0x50] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x80] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, 
xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xC0] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0xF0] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xD0] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0xA0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - 
pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x70] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x20] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x10] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - 
psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x90] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0x80] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xE0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0xC0] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - 
paddd xmm0, xmmword ptr [rsp+0xD0] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0x20] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x30] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0x60] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xB0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0x10] - paddd xmm3, 
xmmword ptr [rsp+0x80] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xF0] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0x90] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xE0] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x30] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor 
xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xA0] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x40] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - pxor xmm0, xmm8 - pxor xmm1, xmm9 - pxor xmm2, xmm10 - pxor xmm3, xmm11 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - pxor xmm4, xmm12 - pxor xmm5, xmm13 - pxor xmm6, xmm14 - pxor xmm7, xmm15 - mov eax, r13d - jne 9b - movdqa xmm9, xmm0 - punpckldq xmm0, xmm1 - punpckhdq xmm9, xmm1 - movdqa xmm11, xmm2 - punpckldq xmm2, xmm3 - punpckhdq xmm11, xmm3 - movdqa xmm1, xmm0 - punpcklqdq xmm0, xmm2 - punpckhqdq xmm1, xmm2 - movdqa xmm3, xmm9 - punpcklqdq 
xmm9, xmm11 - punpckhqdq xmm3, xmm11 - movdqu xmmword ptr [rbx], xmm0 - movdqu xmmword ptr [rbx+0x20], xmm1 - movdqu xmmword ptr [rbx+0x40], xmm9 - movdqu xmmword ptr [rbx+0x60], xmm3 - movdqa xmm9, xmm4 - punpckldq xmm4, xmm5 - punpckhdq xmm9, xmm5 - movdqa xmm11, xmm6 - punpckldq xmm6, xmm7 - punpckhdq xmm11, xmm7 - movdqa xmm5, xmm4 - punpcklqdq xmm4, xmm6 - punpckhqdq xmm5, xmm6 - movdqa xmm7, xmm9 - punpcklqdq xmm9, xmm11 - punpckhqdq xmm7, xmm11 - movdqu xmmword ptr [rbx+0x10], xmm4 - movdqu xmmword ptr [rbx+0x30], xmm5 - movdqu xmmword ptr [rbx+0x50], xmm9 - movdqu xmmword ptr [rbx+0x70], xmm7 - movdqa xmm1, xmmword ptr [rsp+0x110] - movdqa xmm0, xmm1 - paddd xmm1, xmmword ptr [rsp+0x150] - movdqa xmmword ptr [rsp+0x110], xmm1 - pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] - pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] - pcmpgtd xmm0, xmm1 - movdqa xmm1, xmmword ptr [rsp+0x120] - psubd xmm1, xmm0 - movdqa xmmword ptr [rsp+0x120], xmm1 - add rbx, 128 - add rdi, 32 - sub rsi, 4 - cmp rsi, 4 - jnc 2b - test rsi, rsi - jnz 3f -4: - mov rsp, rbp - pop rbp - pop rbx - pop r12 - pop r13 - pop r14 - pop r15 - ret -.p2align 5 -3: - test esi, 0x2 - je 3f - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movaps xmm8, xmm0 - movaps xmm9, xmm1 - movd xmm13, dword ptr [rsp+0x110] - pinsrd xmm13, dword ptr [rsp+0x120], 1 - pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - movaps xmmword ptr [rsp], xmm13 - movd xmm14, dword ptr [rsp+0x114] - pinsrd xmm14, dword ptr [rsp+0x124], 1 - pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - movaps xmmword ptr [rsp+0x10], xmm14 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movaps xmm10, xmm2 - movups xmm4, xmmword ptr [r8+rdx-0x40] - movups xmm5, xmmword ptr [r8+rdx-0x30] - movaps xmm3, xmm4 - shufps xmm4, xmm5, 136 
- shufps xmm3, xmm5, 221 - movaps xmm5, xmm3 - movups xmm6, xmmword ptr [r8+rdx-0x20] - movups xmm7, xmmword ptr [r8+rdx-0x10] - movaps xmm3, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm3, xmm7, 221 - pshufd xmm7, xmm3, 0x93 - movups xmm12, xmmword ptr [r9+rdx-0x40] - movups xmm13, xmmword ptr [r9+rdx-0x30] - movaps xmm11, xmm12 - shufps xmm12, xmm13, 136 - shufps xmm11, xmm13, 221 - movaps xmm13, xmm11 - movups xmm14, xmmword ptr [r9+rdx-0x20] - movups xmm15, xmmword ptr [r9+rdx-0x10] - movaps xmm11, xmm14 - shufps xmm14, xmm15, 136 - pshufd xmm14, xmm14, 0x93 - shufps xmm11, xmm15, 221 - pshufd xmm15, xmm11, 0x93 - movaps xmm3, xmmword ptr [rsp] - movaps xmm11, xmmword ptr [rsp+0x10] - pinsrd xmm3, eax, 3 - pinsrd xmm11, eax, 3 - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm8, xmm12 - movaps xmmword ptr [rsp+0x20], xmm4 - movaps xmmword ptr [rsp+0x30], xmm12 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - movaps xmm12, xmmword ptr [ROT16+rip] - pshufb xmm3, xmm12 - pshufb xmm11, xmm12 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm5 - paddd xmm8, xmm13 - movaps xmmword ptr [rsp+0x40], xmm5 - movaps xmmword ptr [rsp+0x50], xmm13 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - movaps xmm13, xmmword ptr [ROT8+rip] - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 0x93 - pshufd xmm8, xmm8, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm11, xmm11, 0x4E - pshufd xmm2, xmm2, 0x39 - pshufd xmm10, xmm10, 0x39 - paddd xmm0, xmm6 - paddd xmm8, xmm14 - paddd xmm0, xmm1 - paddd xmm8, xmm9 
- pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshufb xmm3, xmm12 - pshufb xmm11, xmm12 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm7 - paddd xmm8, xmm15 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 0x39 - pshufd xmm8, xmm8, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm11, xmm11, 0x4E - pshufd xmm2, xmm2, 0x93 - pshufd xmm10, xmm10, 0x93 - dec al - je 9f - movdqa xmm12, xmmword ptr [rsp+0x20] - movdqa xmm5, xmmword ptr [rsp+0x40] - pshufd xmm13, xmm12, 0x0F - shufps xmm12, xmm5, 214 - pshufd xmm4, xmm12, 0x39 - movdqa xmm12, xmm6 - shufps xmm12, xmm7, 250 - pblendw xmm13, xmm12, 0xCC - movdqa xmm12, xmm7 - punpcklqdq xmm12, xmm5 - pblendw xmm12, xmm6, 0xC0 - pshufd xmm12, xmm12, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmmword ptr [rsp+0x20], xmm13 - movdqa xmmword ptr [rsp+0x40], xmm12 - movdqa xmm5, xmmword ptr [rsp+0x30] - movdqa xmm13, xmmword ptr [rsp+0x50] - pshufd xmm6, xmm5, 0x0F - shufps xmm5, xmm13, 214 - pshufd xmm12, xmm5, 0x39 - movdqa xmm5, xmm14 - shufps xmm5, xmm15, 250 - pblendw xmm6, xmm5, 0xCC - movdqa xmm5, xmm15 - punpcklqdq xmm5, xmm13 - pblendw xmm5, xmm14, 0xC0 - pshufd xmm5, xmm5, 0x78 - punpckhdq xmm13, xmm15 - punpckldq xmm14, xmm13 - pshufd xmm15, xmm14, 0x1E - movdqa xmm13, xmm6 - movdqa xmm14, xmm5 - movdqa xmm5, xmmword ptr [rsp+0x20] - movdqa xmm6, xmmword ptr [rsp+0x40] - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm8, xmm10 - pxor xmm9, xmm11 - mov eax, r13d - cmp rdx, r15 - jne 2b - movups 
xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+0x10], xmm1 - movups xmmword ptr [rbx+0x20], xmm8 - movups xmmword ptr [rbx+0x30], xmm9 - movdqa xmm0, xmmword ptr [rsp+0x130] - movdqa xmm1, xmmword ptr [rsp+0x110] - movdqa xmm2, xmmword ptr [rsp+0x120] - movdqu xmm3, xmmword ptr [rsp+0x118] - movdqu xmm4, xmmword ptr [rsp+0x128] - blendvps xmm1, xmm3, xmm0 - blendvps xmm2, xmm4, xmm0 - movdqa xmmword ptr [rsp+0x110], xmm1 - movdqa xmmword ptr [rsp+0x120], xmm2 - add rdi, 16 - add rbx, 64 - sub rsi, 2 -3: - test esi, 0x1 - je 4b - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movd xmm13, dword ptr [rsp+0x110] - pinsrd xmm13, dword ptr [rsp+0x120], 1 - pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movaps xmm3, xmm13 - pinsrd xmm3, eax, 3 - movups xmm4, xmmword ptr [r8+rdx-0x40] - movups xmm5, xmmword ptr [r8+rdx-0x30] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [r8+rdx-0x20] - movups xmm7, xmmword ptr [r8+rdx-0x10] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - 
paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - movups xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+0x10], xmm1 - jmp 4b - -.p2align 6 -blake3_compress_in_place_sse41: -_blake3_compress_in_place_sse41: - movups xmm0, xmmword ptr [rdi] - movups xmm1, xmmword ptr [rdi+0x10] - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - shl r8, 32 - add rdx, r8 - movq xmm3, rcx - movq xmm4, rdx - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rsi] - movups xmm5, xmmword ptr [rsi+0x10] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rsi+0x20] - movups xmm7, xmmword ptr [rsi+0x30] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - 
pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - movups xmmword ptr [rdi], xmm0 - movups xmmword ptr [rdi+0x10], xmm1 - ret - -.p2align 6 -blake3_compress_xof_sse41: -_blake3_compress_xof_sse41: - movups xmm0, xmmword ptr [rdi] - movups xmm1, xmmword ptr [rdi+0x10] - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movzx eax, r8b - movzx edx, dl - shl rax, 32 - add rdx, rax - movq xmm3, rcx - movq xmm4, rdx - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rsi] - movups xmm5, xmmword ptr [rsi+0x10] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rsi+0x20] - movups xmm7, xmmword ptr [rsi+0x30] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - 
pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - movdqu xmm4, xmmword ptr [rdi] - movdqu xmm5, xmmword ptr [rdi+0x10] - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm2, xmm4 - pxor xmm3, xmm5 - movups xmmword ptr [r9], xmm0 - movups xmmword ptr [r9+0x10], xmm1 - movups xmmword ptr [r9+0x20], xmm2 - movups xmmword ptr [r9+0x30], xmm3 - ret - - -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif -.p2align 6 -BLAKE3_IV: - .long 0x6A09E667, 0xBB67AE85 - .long 0x3C6EF372, 0xA54FF53A -ROT16: - .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 -ROT8: - .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 -ADD0: - .long 0, 1, 2, 3 -ADD1: - .long 4, 4, 4, 4 -BLAKE3_IV_0: - .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 -BLAKE3_IV_1: - .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 -BLAKE3_IV_2: - .long 0x3C6EF372, 0x3C6EF372, 
0x3C6EF372, 0x3C6EF372 -BLAKE3_IV_3: - .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A -BLAKE3_BLOCK_LEN: - .long 64, 64, 64, 64 -CMP_MSB_MASK: - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 - -#endif // __x86_64__ diff --git a/src/calculate_bucket.hpp b/src/calculate_bucket.hpp index 8d1cbcdac..11d736928 100644 --- a/src/calculate_bucket.hpp +++ b/src/calculate_bucket.hpp @@ -25,7 +25,7 @@ #include #include -#include "b3/blake3.h" +#include "blake3.h" #include "bits.hpp" #include "chacha8.h" #include "pos_constants.hpp" diff --git a/src/cli.cpp b/src/cli.cpp index 6156dce4a..3044c9785 100644 --- a/src/cli.cpp +++ b/src/cli.cpp @@ -63,6 +63,17 @@ void HelpAndQuit(cxxopts::Options options) exit(0); } +// Not thread safe +inline void InitDecompressorQueueDefault(bool no_cuda = false) +{ + static bool initialized = false; + if (initialized) { + return; + } + decompressor_context_queue.init(1, (uint32_t)std::thread::hardware_concurrency(), false, 9, !no_cuda, 0, false, 30); + initialized = true; +} + int main(int argc, char *argv[]) try { cxxopts::Options options( "ProofOfSpace", "Utility for plotting, generating and verifying proofs of space."); @@ -161,6 +172,8 @@ int main(int argc, char *argv[]) try { num_threads, phases_flags); } else if (operation == "prove") { + InitDecompressorQueueDefault(); + if (argc < 3) { HelpAndQuit(options); } @@ -238,6 +251,8 @@ int main(int argc, char *argv[]) try { } delete[] proof_bytes; } else if (operation == "check") { + InitDecompressorQueueDefault(); + uint32_t iterations = 1000; if (argc == 3) { iterations = std::stoi(argv[2]); @@ -247,6 +262,8 @@ int main(int argc, char *argv[]) try { Verifier verifier = Verifier(); uint32_t success = 0; + uint32_t failures = 0; + uint32_t exceptions = 0; std::vector id_bytes = prover.GetId(); k = prover.GetSize(); @@ -257,10 +274,10 @@ int main(int argc, char *argv[]) try { vector hash(picosha2::k_digest_size); picosha2::hash256(hash_input.begin(), hash_input.end(), hash.begin(), 
hash.end()); - try { - vector qualities = prover.GetQualitiesForChallenge(hash.data()); + vector qualities = prover.GetQualitiesForChallenge(hash.data()); - for (uint32_t i = 0; i < qualities.size(); i++) { + for (uint32_t i = 0; i < qualities.size(); i++) { + try { LargeBits proof = prover.GetFullProof(hash.data(), i, parallel_read); uint8_t *proof_data = new uint8_t[proof.GetSize() / 8]; proof.ToBytes(proof_data); @@ -275,22 +292,25 @@ int main(int argc, char *argv[]) try { success++; } else { cout << "Proof verification failed." << endl; + failures += 1; } delete[] proof_data; + } catch (const std::exception& error) { + cout << "Threw: " << error.what() << endl; + exceptions += 1; } - } catch (const std::exception& error) { - cout << "Threw: " << error.what() << endl; - continue; } } std::cout << "Total success: " << success << "/" << iterations << ", " << (success * 100 / static_cast(iterations)) << "%." << std::endl; + std::cout << "Total failures: " << failures << std::endl; + std::cout << "Exceptions: " << exceptions << std::endl; if (show_progress) { progress(4, 1, 1); } } else { - cout << "Invalid operation. Use create/prove/verify/check" << endl; + cout << "Invalid operation '" << operation << "'. 
Use create/prove/verify/check" << endl; } return 0; -} catch (const cxxopts::OptionException &e) { +} catch (const cxxopts::exceptions::exception &e) { cout << "error parsing options: " << e.what() << endl; return 1; } catch (const std::exception &e) { diff --git a/src/prover_disk.hpp b/src/prover_disk.hpp index 68808abc2..6326e3480 100644 --- a/src/prover_disk.hpp +++ b/src/prover_disk.hpp @@ -24,10 +24,12 @@ #include #include #include -#include #include #include #include +#include +#include +#include #include "../lib/include/picosha2.hpp" #include "calculate_bucket.hpp" @@ -36,6 +38,14 @@ #include "serialize.hpp" #include "util.hpp" +#if USE_GREEN_REAPER + #include "GreenReaperPortable.h" +#endif + +#define CHIA_PLOT_V2_MAGIC 0x544F4C50ul // "PLOT" +#define CHIA_PLOT_VERSION_2_0_0 2 + + struct plot_header { uint8_t magic[19]; uint8_t id[32]; @@ -44,6 +54,249 @@ struct plot_header { uint8_t fmt_desc[50]; }; +#if USE_GREEN_REAPER +static GRApi _grApi{}; +static bool _dcompressor_queue_initialized = false; +class ContextQueue { +public: + ContextQueue() {} + + bool init( + uint32_t context_count, + uint32_t thread_count, + bool no_cpu_affinity, + const uint32_t max_compression_level, + bool use_gpu_harvesting, + uint32_t gpu_index, + bool enforce_gpu_index, + uint16_t context_queue_timeout + ) { + assert(!_dcompressor_queue_initialized); + _dcompressor_queue_initialized = true; + + // Populate the API + #if _WIN32 + #define GR_LIB_PREFIX "" + #define GR_LIB_EXT ".dll" + #else + #define GR_LIB_PREFIX "lib" + + #if __APPLE__ + #define GR_LIB_EXT ".dylib" + #else + #define GR_LIB_EXT ".so" + #endif + #endif + + // void* lib = grLoadModule(GR_LIB_PREFIX "bladebit_harvester" GR_LIB_EXT); + + // if (lib == nullptr) { + // int code; + // #if _WIN32 + // code = (int)::GetLastError(); + // #else + // code = (int)dlerror(); + // #endif + + // std::stringstream err; err << "Failed to load bladebit_harvester with error: '" << code << "'"; + // throw 
std::runtime_error(err.str()); + // } + // #undef GR_LIB_PREFIX + // #undef GR_LIB_EXT + + // // Init GR API + // { + // const auto r = grPopulateApiFromModule(lib, &_grApi, sizeof(GRApi), GR_API_VERSION); + // if (r != GRResult_OK) { + // std::stringstream err; err << "Failed to initialize GR API with error " << r; + // throw std::runtime_error(err.str()); + // } + // } + + GreenReaperConfig cfg = {}; + cfg.apiVersion = GR_API_VERSION; + cfg.threadCount = thread_count; + cfg.disableCpuAffinity = no_cpu_affinity; + if (!use_gpu_harvesting) { + cfg.gpuRequest = GRGpuRequestKind_None; + } else { + if (enforce_gpu_index) { + cfg.gpuRequest = GRGpuRequestKind_ExactDevice; + } else { + cfg.gpuRequest = GRGpuRequestKind_FirstAvailable; + } + } + cfg.gpuDeviceIndex = gpu_index; + this->context_queue_timeout = context_queue_timeout; + + for (uint32_t i = 0; i < context_count; i++) { + + cfg.cpuOffset = i * thread_count; + GreenReaperContext* gr = nullptr; + auto result = grCreateContext(&gr, &cfg, sizeof(cfg)); + + std::string error_msg; + + if (result == GRResult_OK) { + assert(gr); + queue.push(gr); + + // Preallocate memory required fot the maximum compression level we are supporting initially + result = grPreallocateForCompressionLevel(gr, 32, max_compression_level); + if (result != GRResult_OK) { + std::stringstream err; err << "Failed to preallocate memory for contexts with result " << result; + error_msg = err.str(); + } + } + if (result != GRResult_OK) { + // Destroy contexts that were already created + while (!queue.empty()) { + grDestroyContext( queue.front() ); + queue.pop(); + } + if (error_msg.length() < 1) { + std::stringstream err; err << "Failed to create GRContext with result " << result; + error_msg = err.str(); + } + throw std::runtime_error(error_msg); + } + + if (i == 0 && use_gpu_harvesting) { + if (grHasGpuDecompressor(gr) == GR_TRUE) { + return true; + } else { + // default to CPU + cfg.gpuRequest = GRGpuRequestKind_None; + } + } + } + + return 
false; + } + + void push(GreenReaperContext* gr) { + std::unique_lock lock(mutex); + queue.push(gr); + lock.unlock(); + condition.notify_one(); + } + + GreenReaperContext* pop() { + std::unique_lock lock(mutex); + + std::chrono::duration wait_time = std::chrono::seconds(context_queue_timeout); + + while (queue.empty() && wait_time.count() > 0) { + auto before_wait = std::chrono::steady_clock::now(); + + if (condition.wait_for(lock, wait_time) == std::cv_status::timeout) { + break; + } + + auto elapsed = std::chrono::duration_cast>(std::chrono::steady_clock::now() - before_wait); + wait_time -= elapsed; + } + + if (queue.empty()) { + throw std::runtime_error("Timeout waiting for context queue."); + } + + GreenReaperContext* gr = queue.front(); + queue.pop(); + return gr; + } + +private: + std::queue queue; + std::mutex mutex; + std::condition_variable condition; + uint16_t context_queue_timeout; +}; + +class ProofCache { + static constexpr uint32_t MAX_ENTRIES = 64; + + struct Entry { + alignas(16) uint8_t challenge[32]; + uint32_t index; + }; + + uint32_t cache_entry_proof_position = 0; + std::vector challenges; + std::vector full_proofs; + mutable std::mutex lock; + +public: + inline ProofCache() = default; + inline ProofCache(ProofCache&& other) + : cache_entry_proof_position(other.cache_entry_proof_position) + , challenges(std::move(other.challenges)) + , full_proofs(std::move(other.full_proofs)) + { + other.cache_entry_proof_position = 0; + } + + inline ProofCache(ProofCache const& other) = delete; + + inline bool FoundCachedProof(const uint32_t index, const uint8_t* challenge, LargeBits& out_full_proof) { + std::lock_guard l(lock); + + Entry entry; + memcpy(entry.challenge, challenge, sizeof(entry.challenge)); + entry.index = index; + + for (uint32_t i = 0; i < challenges.size(); i++) { + if (memcmp(&challenges[i], &entry, sizeof(Entry)) == 0) { + out_full_proof = full_proofs[i]; + return true; + } + } + return false; + } + + inline void CacheProof(const 
uint32_t index, const uint8_t* challenge, const LargeBits& full_proof) { + std::lock_guard l(lock); + + Entry entry; + memcpy(entry.challenge, challenge, sizeof(entry.challenge)); + entry.index = index; + + if (challenges.size() < MAX_ENTRIES) { + challenges.emplace_back(entry); + full_proofs.emplace_back(full_proof); + } else { + challenges[cache_entry_proof_position] = entry; + full_proofs[cache_entry_proof_position] = full_proof; + } + cache_entry_proof_position = (cache_entry_proof_position + 1) % MAX_ENTRIES; + } + + static_assert(alignof(ProofCache::Entry) == 16); +}; +#else +// Dummy one for python +class ContextQueue { +public: + inline ContextQueue() {} + + inline bool init( + uint32_t context_count, + uint32_t thread_count, + bool no_cpu_affinity, + const uint32_t max_compression_level, + bool use_gpu_harvesting, + uint32_t gpu_index, + bool enforce_gpu_index, + uint16_t context_queue_timeout + ) + { + return false; + } +}; +#endif // USE_GREEN_REAPER + +ContextQueue decompressor_context_queue; + // The DiskProver, given a correctly formatted plot file, can efficiently generate valid proofs // of space, for a given challenge. @@ -55,6 +308,7 @@ class DiskProver { explicit DiskProver(const std::string& filename) : id(kIdLen) { struct plot_header header{}; + this->compression_level = 0; this->filename = filename; std::ifstream disk_file(filename, std::ios::in | std::ios::binary); @@ -70,27 +324,76 @@ class DiskProver { // 2 bytes - memo length // x bytes - memo - SafeRead(disk_file, (uint8_t*)&header, sizeof(header)); - if (memcmp(header.magic, "Proof of Space Plot", sizeof(header.magic)) != 0) - throw std::invalid_argument("Invalid plot header magic"); + // Check for V2 Magic. 
+ uint8_t magic_2_bytes[4]; + SafeRead(disk_file, magic_2_bytes, 4); + uint32_t magic_2_result; + memcpy(&magic_2_result, magic_2_bytes, sizeof(magic_2_result)); + if (magic_2_result == CHIA_PLOT_V2_MAGIC) { + uint8_t version_bytes[4]; + SafeRead(disk_file, version_bytes, 4); + uint32_t version_result; + memcpy(&version_result, version_bytes, sizeof(version_result)); + if (version_result == CHIA_PLOT_VERSION_2_0_0) { + version = 2; + } + else { + throw std::invalid_argument("Unsupported version."); + } + } else { + // V1 + version = 1; + memcpy(header.magic, magic_2_bytes, sizeof(magic_2_bytes)); + uint8_t tmp_magic_buff[15]; + SafeRead(disk_file, tmp_magic_buff, sizeof(header.magic) - 4); + memcpy(header.magic + 4, tmp_magic_buff, sizeof(tmp_magic_buff)); + if (memcmp(header.magic, "Proof of Space Plot", sizeof(header.magic)) != 0) { + throw std::invalid_argument("Invalid plot header magic: " + Util::HexStr(header.magic, 19)); + } + } + + SafeRead(disk_file, (uint8_t*)&header.id, sizeof(header.id)); + SafeRead(disk_file, (uint8_t*)&header.k, sizeof(header.k)); - uint16_t fmt_desc_len = Util::TwoBytesToInt(header.fmt_desc_len); + if (version == 1) { + SafeRead(disk_file, (uint8_t*)&header.fmt_desc_len, sizeof(header.fmt_desc_len)); + SafeRead(disk_file, (uint8_t*)&header.fmt_desc, sizeof(header.fmt_desc)); - if (fmt_desc_len == kFormatDescription.size() && - !memcmp(header.fmt_desc, kFormatDescription.c_str(), fmt_desc_len)) { - // OK - } else { - throw std::invalid_argument("Invalid plot file format"); + uint16_t fmt_desc_len = Util::TwoBytesToInt(header.fmt_desc_len); + + if (fmt_desc_len == kFormatDescription.size() && + !memcmp(header.fmt_desc, kFormatDescription.c_str(), fmt_desc_len)) { + // OK + } else { + throw std::invalid_argument("Invalid plot file format"); + } + SafeSeek(disk_file, offsetof(struct plot_header, fmt_desc) + fmt_desc_len); } + memcpy(id.data(), header.id, sizeof(header.id)); this->k = header.k; - SafeSeek(disk_file, offsetof(struct 
plot_header, fmt_desc) + fmt_desc_len); uint8_t size_buf[2]; SafeRead(disk_file, size_buf, 2); memo.resize(Util::TwoBytesToInt(size_buf)); SafeRead(disk_file, memo.data(), memo.size()); + if (version == 2) { + uint8_t flags_bytes[4]; + SafeRead(disk_file, flags_bytes, sizeof(flags_bytes)); + uint32_t flags; + memcpy(&flags, flags_bytes, sizeof(flags)); + if (flags & 1) { + uint8_t compression_level; + SafeRead(disk_file, &compression_level, sizeof(compression_level)); + this->compression_level = compression_level; + } + } + #if !defined( USE_GREEN_REAPER ) + if (this->compression_level > 0) + throw std::logic_error("Harvester does not support compressed plots."); + #endif + this->table_begin_pointers = std::vector(11, 0); this->C2 = std::vector(); @@ -134,7 +437,7 @@ class DiskProver { { Deserializer deserializer(vecBytes); deserializer >> version; - if (version != VERSION) { + if (version != 1 && version != 2) { // TODO: Migrate to new version if we change something related to the data structure throw std::invalid_argument("DiskProver: Invalid version."); } @@ -144,16 +447,30 @@ class DiskProver { deserializer >> k; deserializer >> table_begin_pointers; deserializer >> C2; + if (version == 2) { + deserializer >> compression_level; + } else { + compression_level = 0; + } + + #if !defined( USE_GREEN_REAPER ) + if (compression_level > 0) + throw std::runtime_error("Harvester does not support compressed plots."); + #endif } DiskProver(DiskProver const&) = delete; DiskProver(DiskProver&& other) noexcept + #if USE_GREEN_REAPER + : cached_proofs(std::move(other.cached_proofs)) + #endif { filename = std::move(other.filename); memo = std::move(other.memo); id = std::move(other.id); k = other.k; + compression_level = other.compression_level; table_begin_pointers = std::move(other.table_begin_pointers); C2 = std::move(other.C2); version = std::move(other.version); @@ -180,6 +497,56 @@ class DiskProver { uint8_t GetSize() const noexcept { return k; } + uint8_t 
GetCompressionLevel() const noexcept { return compression_level; } + + bool CompareProofBits(const LargeBits& left, const LargeBits& right, uint8_t k) + { + uint16_t size = left.GetSize() / k; + assert(left.GetSize() == right.GetSize()); + for (int16_t i = size - 1; i >= 0; i--) { + LargeBits left_val = left.Slice(k * i, k * (i + 1)); + LargeBits right_val = right.Slice(k * i, k * (i + 1)); + if (left_val < right_val) { + return true; + } + if (left_val > right_val) { + return false; + } + } + return false; + } + + LargeBits GetQualityStringFromProof( + LargeBits proof, + const uint8_t* challenge) + { + Bits challenge_bits = Bits(challenge, 256 / 8, 256); + uint16_t quality_index = challenge_bits.Slice(256 - 5).GetValue() << 1; + + // Converts the proof from proof ordering to plot ordering + for (uint8_t table_index = 1; table_index < 7; table_index++) { + LargeBits new_proof; + uint16_t size = k * (1 << (table_index - 1)); + for (int j = 0; j < (1 << (7 - table_index)); j += 2) { + LargeBits L = proof.Slice(j * size, (j + 1) * size); + LargeBits R = proof.Slice((j + 1) * size, (j + 2) * size); + if (CompareProofBits(L, R, k)) { + new_proof += (L + R); + } else { + new_proof += (R + L); + } + } + proof = new_proof; + } + // Hashes two of the x values, based on the quality index + std::vector hash_input(32 + Util::ByteAlign(2 * k) / 8, 0); + memcpy(hash_input.data(), challenge, 32); + proof.Slice(k * quality_index, k * (quality_index + 2)).ToBytes(hash_input.data() + 32); + std::vector hash(picosha2::k_digest_size); + picosha2::hash256(hash_input.begin(), hash_input.end(), hash.begin(), hash.end()); + return LargeBits(hash.data(), 32, 256); + } + // Given a challenge, returns a quality string, which is sha256(challenge + 2 adjecent x // values), from the 64 value proof. Note that this is more efficient than fetching all 64 x // values, which are in different parts of the disk. 
@@ -187,9 +554,10 @@ class DiskProver { { std::vector qualities; - std::lock_guard l(_mtx); + uint32_t p7_entries_size = 0; { + std::lock_guard l(_mtx); std::ifstream disk_file(filename, std::ios::in | std::ios::binary); if (!disk_file.is_open()) { @@ -203,15 +571,22 @@ class DiskProver { if (p7_entries.empty()) { return std::vector(); } + p7_entries_size = p7_entries.size(); // The last 5 bits of the challenge determine which route we take to get to // our two x values in the leaves. uint8_t last_5_bits = challenge[31] & 0x1f; for (uint64_t position : p7_entries) { + #if USE_GREEN_REAPER + if (compression_level >= 9) { + break; + } + #endif // This inner loop goes from table 6 to table 1, getting the two backpointers, // and following one of them. - for (uint8_t table_index = 6; table_index > 1; table_index--) { + uint64_t alt_position; + for (uint8_t table_index = 6; table_index > GetEndTable(); table_index--) { uint128_t line_point = ReadLinePoint(disk_file, table_index, position); auto xy = Encoding::LinePointToSquare(line_point); @@ -219,13 +594,47 @@ class DiskProver { if (((last_5_bits >> (table_index - 2)) & 1) == 0) { position = xy.second; + alt_position = xy.first; } else { position = xy.first; + alt_position = xy.second; } } - uint128_t new_line_point = ReadLinePoint(disk_file, 1, position); - auto x1x2 = Encoding::LinePointToSquare(new_line_point); - + uint128_t new_line_point = ReadLinePoint(disk_file, GetEndTable(), position); + std::pair x1x2; + + #if USE_GREEN_REAPER + if (compression_level > 0) { + GRCompressedQualitiesRequest req; + req.compressionLevel = compression_level; + req.plotId = id.data(); + req.challenge = challenge; + req.xLinePoints[0].hi = (uint64_t)(new_line_point >> 64); + req.xLinePoints[0].lo = (uint64_t)new_line_point; + if (compression_level >= 6) { + uint128_t alt_line_point = ReadLinePoint(disk_file, GetEndTable(), alt_position); + req.xLinePoints[1].hi = (uint64_t)(alt_line_point >> 64); + req.xLinePoints[1].lo = 
(uint64_t)alt_line_point; + } + + GreenReaperContext* gr = decompressor_context_queue.pop(); + assert(gr); + + auto res = grGetFetchQualitiesXPair(gr, &req); + decompressor_context_queue.push(gr); + + if (res != GRResult_OK) { + // Expect this will result in failure in a later step. + x1x2.first = x1x2.second = 0; + } else { + x1x2.first = req.x1; + x1x2.second = req.x2; + } + } else + #endif // #if USE_GREEN_REAPER + { + x1x2 = Encoding::LinePointToSquare(new_line_point); + } // The final two x values (which are stored in the same location) are hashed std::vector hash_input(32 + Util::ByteAlign(2 * k) / 8, 0); memcpy(hash_input.data(), challenge, 32); @@ -236,6 +645,23 @@ class DiskProver { qualities.emplace_back(hash.data(), 32, 256); } } // Scope for disk_file + + #if USE_GREEN_REAPER + if (compression_level >= 9) { + uint8_t failure_bytes[32]; + for (int i = 0; i < 32; i++) { + failure_bytes[i] = 255; + } + for (uint32_t i = 0; i < p7_entries_size; i++) { + try { + auto proof = GetFullProof(challenge, i); + qualities.push_back(GetQualityStringFromProof(proof, challenge)); + } catch (const std::exception& error) { + qualities.emplace_back(failure_bytes, 32, 256); + } + } + } + #endif return qualities; } @@ -245,6 +671,12 @@ class DiskProver { LargeBits GetFullProof(const uint8_t* challenge, uint32_t index, bool parallel_read = true) { LargeBits full_proof; + + #if USE_GREEN_REAPER + if (compression_level >= 9 && cached_proofs.FoundCachedProof(index, challenge, full_proof)) { + return full_proof; + } + #endif std::lock_guard l(_mtx); { @@ -262,11 +694,55 @@ class DiskProver { // Gets the 64 leaf x values, concatenated together into a k*64 bit string. 
std::vector xs; if (parallel_read) { - xs = GetInputs(p7_entries[index], 6); + xs = GetInputs(p7_entries[index], 6, nullptr); } else { xs = GetInputs(p7_entries[index], 6, &disk_file); // Passing in a disk_file disabled the parallel reads } + #if USE_GREEN_REAPER + if (compression_level > 0) { + auto gr = decompressor_context_queue.pop(); + + GRCompressedProofRequest req{}; + req.compressionLevel = compression_level; + req.plotId = id.data(); + + uint8_t compressed_proof_size = (compression_level <= 8 ? GR_POST_PROOF_CMP_X_COUNT : (GR_POST_PROOF_CMP_X_COUNT / 2)); + for (int i = 0; i < compressed_proof_size; i++) { + req.compressedProof[i] = xs[i].GetValue(); + } + + GRResult res = grFetchProofForChallenge(gr, &req); + decompressor_context_queue.push(gr); + + if (res != GRResult_OK) { + if (res == GRResult_NoProof) { + throw std::runtime_error("GRResult_NoProof received"); + } + if (res == GRResult_Failed) { + throw std::runtime_error("GRResult is not GRResult_OK, received GRResult_Failed"); + } + if (res == GRResult_OutOfMemory) { + throw std::runtime_error("GRResult is not GRResult_OK, received GRResult_OutOfMemory"); + } + if (res == GRResult_WrongVersion) { + throw std::runtime_error("GRResult is not GRResult_OK, received GRResult_WrongVersion"); + } + if (res == GRResult_InvalidGPU) { + throw std::runtime_error("GRResult is not GRResult_OK, received GRResult_InvalidGPU"); + } + if (res == GRResult_InvalidArg) { + throw std::runtime_error("GRResult is not GRResult_OK, received GRResult_InvalidArg"); + } + } + std::vector uncompressed_xs; + for (int i = 0; i < GR_POST_PROOF_X_COUNT; i++) { + uncompressed_xs.push_back(Bits(req.fullProof[i], k)); + } + xs = uncompressed_xs; + } + #endif + // Sorts them according to proof ordering, where // f1(x0) m= f1(x1), f2(x0, x1) m= f2(x2, x3), etc. 
On disk, they are not stored in // proof ordering, they're stored in plot ordering, due to the sorting in the Compress @@ -276,6 +752,12 @@ class DiskProver { full_proof += x; } } // Scope for disk_file + + #if USE_GREEN_REAPER + if (compression_level >= 9) { + cached_proofs.CacheProof(index, challenge, full_proof); + } + #endif return full_proof; } @@ -283,6 +765,9 @@ class DiskProver { { Serializer serializer; serializer << version << filename << memo << id << k << table_begin_pointers << C2; + if (version == 2) { + serializer << compression_level; + } return serializer.Data(); } @@ -293,8 +778,12 @@ class DiskProver { std::vector memo; std::vector id; // Unique plot id uint8_t k; + uint8_t compression_level; std::vector table_begin_pointers; std::vector C2; + #if USE_GREEN_REAPER + ProofCache cached_proofs; + #endif // Using this method instead of simply seeking will prevent segfaults that would arise when // continuing the process of looking up qualities. @@ -328,14 +817,53 @@ class DiskProver { } } + uint8_t GetEndTable() { + if (compression_level == 0) { + return 1; + } + if (compression_level <= 8) { + return 2; + } + return 3; + } + // Reads exactly one line point (pair of two k bit back-pointers) from the given table. // The entry at index "position" is read. First, the park index is calculated, then // the park is read, and finally, entry deltas are added up to the position that we // are looking for. 
- uint128_t ReadLinePoint(std::ifstream& disk_file, uint8_t table_index, uint64_t position) - { + uint128_t ReadLinePoint( + std::ifstream& disk_file, + uint8_t table_index, + uint64_t position + ) { + size_t compressed_park_size = 0; + uint32_t compressed_stub_size_bits = 0; + double compressed_ans_r_value = 0; + + const bool is_compressed = compression_level > 0 && table_index == GetEndTable(); + (void)is_compressed; + + #if USE_GREEN_REAPER + if (is_compressed) { + GRCompressionInfo info{}; + const auto r = grGetCompressionInfo(&info, sizeof(info), k, compression_level); + if (r != GRResult_OK) { + std::stringstream err; err << "Failed to obtain compression info with error " << r; + throw std::runtime_error(err.str()); + } + + compressed_park_size = info.tableParkSize;; + compressed_stub_size_bits = info.stubSizeBits; + compressed_ans_r_value = info.ansRValue; + } + #else + (void)compressed_stub_size_bits; + (void)compressed_ans_r_value; + (void)compressed_park_size; + #endif + uint64_t park_index = position / kEntriesPerPark; - uint32_t park_size_bits = EntrySizes::CalculateParkSize(k, table_index) * 8; + uint32_t park_size_bits = (is_compressed ? compressed_park_size : EntrySizes::CalculateParkSize(k, table_index)) * 8; SafeSeek(disk_file, table_begin_pointers[table_index] + (park_size_bits / 8) * park_index); @@ -346,12 +874,12 @@ class DiskProver { uint128_t line_point = Util::SliceInt128FromBytes(line_point_bin, 0, k * 2); // Reads EPP stubs - uint32_t stubs_size_bits = EntrySizes::CalculateStubsSize(k) * 8; + uint32_t stubs_size_bits = (is_compressed ? (Util::ByteAlign((kEntriesPerPark - 1) * compressed_stub_size_bits) / 8) : EntrySizes::CalculateStubsSize(k)) * 8; auto* stubs_bin = new uint8_t[stubs_size_bits / 8 + 7]; SafeRead(disk_file, stubs_bin, stubs_size_bits / 8); - // Reads EPP deltas - uint32_t max_deltas_size_bits = EntrySizes::CalculateMaxDeltasSize(k, table_index) * 8; + // Reads EPP deltas + uint32_t max_deltas_size_bits = (is_compressed ? 
compressed_park_size - (line_point_size + stubs_size_bits) : EntrySizes::CalculateMaxDeltasSize(k, table_index)) * 8; auto* deltas_bin = new uint8_t[max_deltas_size_bits / 8]; // Reads the size of the encoded deltas object @@ -374,13 +902,13 @@ class DiskProver { SafeRead(disk_file, deltas_bin, encoded_deltas_size); // Decodes the deltas - double R = kRValues[table_index - 1]; + double R = (is_compressed ? compressed_ans_r_value : kRValues[table_index - 1]); deltas = Encoding::ANSDecodeDeltas(deltas_bin, encoded_deltas_size, kEntriesPerPark - 1, R); } uint32_t start_bit = 0; - uint8_t stub_size = k - kStubMinusBits; + uint8_t stub_size = (uint8_t)(is_compressed ? compressed_stub_size_bits : k - kStubMinusBits); uint64_t sum_deltas = 0; uint64_t sum_stubs = 0; for (uint32_t i = 0; @@ -724,7 +1252,7 @@ class DiskProver { } std::pair xy = Encoding::LinePointToSquare(line_point); - if (depth == 1) { + if (depth == GetEndTable()) { // For table P1, the line point represents two concatenated x values. std::vector ret; ret.emplace_back(xy.second, k); // y diff --git a/tests/test.cpp b/tests/test.cpp index e2d976df4..a64057ed2 100644 --- a/tests/test.cpp +++ b/tests/test.cpp @@ -621,7 +621,7 @@ void TestProofOfSpace( picosha2::hash256(hash_input.begin(), hash_input.end(), hash.begin(), hash.end()); vector qualities = prover.GetQualitiesForChallenge(hash.data()); Verifier verifier = Verifier(); - + for (uint32_t index = 0; index < qualities.size(); index++) { LargeBits proof = prover.GetFullProof(hash.data(), index); proof.ToBytes(proof_data); @@ -1082,7 +1082,7 @@ TEST_CASE("DiskProver") { SECTION("Move constructor") { - std::string filename = "prover_test.plot"; + std::string filename = "prover_test_with_a_long_name_to_avoid_sso.plot"; DiskPlotter plotter = DiskPlotter(); std::vector memo{1, 2, 3}; plotter.CreatePlotDisk(