diff --git a/.asf.yaml b/.asf.yaml index ba325c2abf231..f3a8ed9fee90f 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -23,7 +23,6 @@ github: - benibus - jbonofre - js8544 - - laurentgo - vibhatha - ZhangHuiGui diff --git a/.env b/.env index 252f59ccfb2e6..a94803cc3277c 100644 --- a/.env +++ b/.env @@ -62,7 +62,7 @@ HDFS=3.2.1 JDK=11 KARTOTHEK=latest # LLVM 12 and GCC 11 reports -Wmismatched-new-delete. -LLVM=14 +LLVM=18 MAVEN=3.8.7 NODE=18 NUMBA=latest @@ -89,17 +89,17 @@ TZ=UTC # Used through docker-compose.yml and serves as the default version for the # ci/scripts/install_vcpkg.sh script. Prefer to use short SHAs to keep the # docker tags more readable. -VCPKG="943c5ef1c8f6b5e6ced092b242c8299caae2ff01" # 2024.04.26 Release +VCPKG="f7423ee180c4b7f40d43402c2feb3859161ef625" # 2024.06.15 Release # This must be updated when we update -# ci/docker/python-*-windows-*.dockerfile. +# ci/docker/python-*-windows-*.dockerfile or the vcpkg config. # This is a workaround for our CI problem that "archery docker build" doesn't # use pulled built images in dev/tasks/python-wheels/github.windows.yml. -PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2025-01-08 -PYTHON_WHEEL_WINDOWS_TEST_IMAGE_REVISION=2025-01-08 +PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2025-02-03 +PYTHON_WHEEL_WINDOWS_TEST_IMAGE_REVISION=2025-01-27 # Use conanio/${CONAN_BASE}:{CONAN_VERSION} for "docker compose run --rm conan". # See https://github.com/conan-io/conan-docker-tools#readme and # https://hub.docker.com/u/conanio for available images. 
-CONAN_BASE=gcc10 -CONAN_VERSION=1.62.0 +CONAN_BASE=gcc11-ubuntu16.04 +CONAN_VERSION=2.12.1 diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml index 6dc4da306a1ea..e486ef0e16e59 100644 --- a/.github/workflows/archery.yml +++ b/.github/workflows/archery.yml @@ -65,7 +65,7 @@ jobs: shell: bash run: git branch $ARCHERY_DEFAULT_BRANCH origin/$ARCHERY_DEFAULT_BRANCH || true - name: Setup Python - uses: actions/setup-python@v5.3.0 + uses: actions/setup-python@v5.4.0 with: python-version: '3.9' - name: Install pygit2 binary wheel diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml index 83b6f6e31ffc3..578b47361b71e 100644 --- a/.github/workflows/comment_bot.yml +++ b/.github/workflows/comment_bot.yml @@ -42,7 +42,7 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 5ccefa32725f3..b0adbbfa0a4d6 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -69,65 +69,37 @@ env: DOCKER_VOLUME_PREFIX: ".docker/" jobs: - docker-targets: - name: Docker targets - runs-on: ubuntu-latest - outputs: - targets: ${{ steps.detect-targets.outputs.targets }} - steps: - - name: Detect targets - id: detect-targets - run: | - echo "targets<> "$GITHUB_OUTPUT" - echo "[" >> "$GITHUB_OUTPUT" - cat <> "$GITHUB_OUTPUT" - { - "arch": "amd64", - "clang-tools": "14", - "image": "conda-cpp", - "llvm": "14", - "runs-on": "ubuntu-latest", - "simd-level": "AVX2", - "title": "AMD64 Conda C++ AVX2", - "ubuntu": "22.04" - }, - { - "arch": "amd64", - "clang-tools": "14", - "image": "ubuntu-cpp-sanitizer", - "llvm": "14", - "runs-on": "ubuntu-latest", - "title": "AMD64 Ubuntu 22.04 C++ ASAN 
UBSAN", - "ubuntu": "22.04" - } - JSON - if [ "$GITHUB_REPOSITORY_OWNER" = "apache" ]; then - echo "," >> "$GITHUB_OUTPUT" - cat <> "$GITHUB_OUTPUT" - { - "arch": "arm64v8", - "clang-tools": "10", - "image": "ubuntu-cpp", - "llvm": "10", - "runs-on": ["self-hosted", "arm", "linux"], - "title": "ARM64 Ubuntu 20.04 C++", - "ubuntu": "20.04" - } - JSON - fi - echo "]" >> "$GITHUB_OUTPUT" - echo "JSON" >> "$GITHUB_OUTPUT" - docker: name: ${{ matrix.title }} - needs: docker-targets runs-on: ${{ matrix.runs-on }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 75 strategy: fail-fast: false matrix: - include: ${{ fromJson(needs.docker-targets.outputs.targets) }} + include: + - arch: amd64 + clang-tools: 14 + image: conda-cpp + llvm: 14 + runs-on: ubuntu-latest + simd-level: AVX2 + title: AMD64 Conda C++ AVX2 + ubuntu: 22.04 + - arch: amd64 + clang-tools: 14 + image: ubuntu-cpp-sanitizer + llvm: 14 + runs-on: ubuntu-latest + title: AMD64 Ubuntu 22.04 C++ ASAN UBSAN + ubuntu: 22.04 + - arch: arm64v8 + clang-tools: 10 + image: ubuntu-cpp + llvm: 10 + runs-on: ubuntu-24.04-arm + title: ARM64 Ubuntu 20.04 C++ + ubuntu: 20.04 env: ARCH: ${{ matrix.arch }} ARROW_SIMD_LEVEL: ${{ matrix.simd-level }} @@ -149,7 +121,7 @@ jobs: - name: Setup Python on hosted runner if: | matrix.runs-on == 'ubuntu-latest' - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3 - name: Setup Python on self-hosted runner @@ -262,7 +234,7 @@ jobs: $(brew --prefix bash)/bin/bash \ ci/scripts/install_minio.sh latest ${ARROW_HOME} - name: Set up Python - uses: actions/setup-python@v5.3.0 + uses: actions/setup-python@v5.4.0 with: python-version: 3.12 - name: Install Google Cloud Storage Testbench @@ -481,7 +453,7 @@ jobs: https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2024-09-13T20-26-02Z chmod +x /usr/local/bin/minio.exe - 
name: Set up Python - uses: actions/setup-python@v5.3.0 + uses: actions/setup-python@v5.4.0 id: python-install with: python-version: 3.9 diff --git a/.github/workflows/csharp.yml b/.github/workflows/csharp.yml index 0607c537d1b7f..adf6e63ff56db 100644 --- a/.github/workflows/csharp.yml +++ b/.github/workflows/csharp.yml @@ -54,11 +54,11 @@ jobs: dotnet: ['8.0.x'] steps: - name: Install C# - uses: actions/setup-dotnet@v4.2.0 + uses: actions/setup-dotnet@v4.3.0 with: dotnet-version: ${{ matrix.dotnet }} - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3 - name: Checkout Arrow @@ -86,7 +86,7 @@ jobs: dotnet: ['8.0.x'] steps: - name: Install C# - uses: actions/setup-dotnet@v4.2.0 + uses: actions/setup-dotnet@v4.3.0 with: dotnet-version: ${{ matrix.dotnet }} - name: Checkout Arrow @@ -113,11 +113,11 @@ jobs: dotnet: ['8.0.x'] steps: - name: Install C# - uses: actions/setup-dotnet@v4.2.0 + uses: actions/setup-dotnet@v4.3.0 with: dotnet-version: ${{ matrix.dotnet }} - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Checkout Arrow @@ -133,3 +133,88 @@ jobs: - name: Test shell: bash run: ci/scripts/csharp_test.sh $(pwd) + + package: + name: Package + # Branch or RC tag + if: github.ref_type != 'tag' || contains(github.ref_name, 'rc') + runs-on: ubuntu-latest + timeout-minutes: 15 + permissions: + contents: write + steps: + - name: Checkout for utilities + if: github.ref_type == 'tag' + uses: actions/checkout@v4 + with: + path: arrow + - name: Download source archive + if: github.ref_type == 'tag' + run: | + arrow/dev/release/utils-watch-gh-workflow.sh \ + ${GITHUB_REF_NAME} \ + release_candidate.yml + gh release download ${GITHUB_REF_NAME} \ + --pattern "*.tar.gz" 
\ + --repo ${GITHUB_REPOSITORY} + tar -xf *.tar.gz --strip-components=1 + mv csharp/dummy.git .git + env: + GH_TOKEN: ${{ github.token }} + - name: Checkout + if: github.ref_type != 'tag' + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Prepare version + if: github.ref_type != 'tag' + run: | + # apache-arrow-20.0.0.dev-9-g758867f907 -> + # 20.0.0.dev-9-g758867f907 -> + # 20.0.0.dev-9 -> + # 20.0.0-dev-9 + semver="$(git describe --tags | \ + sed -E \ + -e 's/^apache-arrow-//' \ + -e 's/-[^-]*$//' \ + -e 's/^([0-9]*\.[0-9]*\.[0-9])\./\1-/')" + sed -i'' -E -e \ + "s/^ .+<\/Version>/ ${semver}<\/Version>/" \ + csharp/Directory.Build.props + - name: Setup Python + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + with: + python-version: 3 + - name: Setup Archery + run: | + python3 -m pip install -e 'dev/archery[docker]' + - name: Build + run: | + archery docker run ubuntu-csharp + - name: Prepare artifacts + run: | + shopt -s globstar + cp csharp/artifacts/**/*.{,s}nupkg ./ + for artifact in *.{,s}nupkg; do + dev/release/utils-generate-checksum.sh "${artifact}" + done + - name: Upload + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + with: + name: nuget + path: | + *.nupkg + *.sha256 + *.sha512 + *.snupkg + - name: Publish + if: github.ref_type == 'tag' + run: | + gh release upload ${GITHUB_REF_NAME} \ + --repo ${GITHUB_REPOSITORY} \ + *.nupkg \ + *.sha256 \ + *.sha512 \ + *.snupkg + env: + GH_TOKEN: ${{ github.token }} diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index d59da447612a6..6a8cc05ca686a 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -50,7 +50,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Install pre-commit @@ -109,7 +109,7 @@ jobs: with: 
fetch-depth: 0 - name: Install Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: '3.12' - name: Install Ruby @@ -117,14 +117,14 @@ jobs: with: ruby-version: ruby - name: Install .NET - uses: actions/setup-dotnet@87b7050bc53ea08284295505d98d2aa94301e852 # v4.2.0 + uses: actions/setup-dotnet@3951f0dfe7a07e2313ec93c75700083e2005cbab # v4.3.0 with: dotnet-version: '8.0.x' - name: Install Dependencies shell: bash run: | gem install test-unit - pip install "cython>=0.29.31" setuptools pytest requests setuptools-scm + pip install "cython>=3" setuptools pytest requests setuptools-scm - name: Run Release Test env: ARROW_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 83f835d588af2..f9d8b72cbe0b4 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -52,7 +52,7 @@ jobs: key: debian-docs-${{ hashFiles('cpp/**') }} restore-keys: debian-docs- - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index 0e23394e8a453..252c08a460fc6 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -59,7 +59,7 @@ jobs: key: conda-docs-${{ hashFiles('cpp/**') }} restore-keys: conda-docs- - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index aef81df074888..2e8e5d9a0c8d0 100644 --- a/.github/workflows/integration.yml +++ 
b/.github/workflows/integration.yml @@ -106,7 +106,7 @@ jobs: key: conda-${{ hashFiles('cpp/**') }} restore-keys: conda- - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index 5ef5b37c98815..e100e26a05d50 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -61,7 +61,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index 2bdfd0743a547..101724b3e2cd3 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -46,24 +46,8 @@ permissions: jobs: ubuntu: - name: AMD64 Ubuntu 20.04 MATLAB - # Explicitly pin the Ubuntu version to 20.04 for the time being because: - # - # 1. The version of GLIBCXX shipped with Ubuntu 22.04 is not binary compatible - # with the GLIBCXX bundled with MATLAB R2023a. This is a relatively common - # issue. - # - # For example, see: - # - # https://www.mathworks.com/matlabcentral/answers/1907290-how-to-manually-select-the-libstdc-library-to-use-to-resolve-a-version-glibcxx_-not-found - # - # 2. The version of GLIBCXX shipped with Ubuntu 22.04 is not binary compatible with - # the version of GLIBCXX shipped with Debian 11. Several of the Arrow community - # members who work on the MATLAB bindings use Debian 11 locally for qualification. - # Using Ubuntu 20.04 eases development workflows for these community members. - # - # In the future, we can investigate adding support for building against more Linux (e.g. `ubuntu-22.04`) and MATLAB versions (e.g. R2023b). 
- runs-on: ubuntu-20.04 + name: AMD64 Ubuntu 22.04 MATLAB + runs-on: ubuntu-22.04 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: - name: Check out repository @@ -155,7 +139,7 @@ jobs: runs-on: windows-2022 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: - - name: Check out repository + - name: Check out repository uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/pr_bot.yml b/.github/workflows/pr_bot.yml index 1eedacf1abf31..a9ccdad0c52cb 100644 --- a/.github/workflows/pr_bot.yml +++ b/.github/workflows/pr_bot.yml @@ -82,7 +82,7 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies diff --git a/.github/workflows/pr_review_trigger.yml b/.github/workflows/pr_review_trigger.yml index 2c840e95c8db6..a6dd5f1275331 100644 --- a/.github/workflows/pr_review_trigger.yml +++ b/.github/workflows/pr_review_trigger.yml @@ -29,7 +29,7 @@ jobs: runs-on: ubuntu-latest steps: - name: "Upload PR review Payload" - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: path: "${{ github.event_path }}" name: "pr_review_payload" diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index ba05fab65ada2..bc8f670f13587 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -114,7 +114,7 @@ jobs: key: ${{ matrix.cache }}-${{ hashFiles('cpp/**') }} restore-keys: ${{ matrix.cache }}- - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Setup 
Archery @@ -183,7 +183,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@v5.3.0 + uses: actions/setup-python@v5.4.0 with: python-version: '3.11' - name: Install Dependencies diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index bc7db519b64f7..407cc775c3949 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -155,7 +155,7 @@ jobs: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}- ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}- - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Setup Archery @@ -177,7 +177,7 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: test-output-${{ matrix.ubuntu }}-${{ matrix.r }} path: r/check/arrow.Rcheck/tests/testthat.Rout* @@ -214,7 +214,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Setup Archery @@ -237,7 +237,7 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: test-output-bundled path: r/check/arrow.Rcheck/tests/testthat.Rout* @@ -299,7 +299,7 @@ jobs: # So that they're unique when multiple are downloaded in the next step shell: bash run: mv libarrow.zip libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip - - uses: 
actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip path: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip diff --git a/.github/workflows/r_nightly.yml b/.github/workflows/r_nightly.yml index 4fcb399c91fc6..2da69bd7053db 100644 --- a/.github/workflows/r_nightly.yml +++ b/.github/workflows/r_nightly.yml @@ -60,7 +60,7 @@ jobs: repository: ursacomputing/crossbow ref: main - name: Set up Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: cache: 'pip' python-version: 3.12 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8d54979502430..655d67df69efd 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -32,13 +32,13 @@ env: jobs: publish: - name: Publish + name: Publish runs-on: ubuntu-latest timeout-minutes: 5 steps: - name: Get Tag Name of Latest Release Candidate run: | - rc_tag=$(gh release list --repo apache/arrow | \ + rc_tag=$(gh release list --repo ${GITHUB_REPOSITORY} | \ cut -f3 | \ grep -F "${GITHUB_REF_NAME}-rc" | \ head -n1) @@ -52,23 +52,25 @@ jobs: echo "VERSION_WITH_RC=${version_with_rc}" >> ${GITHUB_ENV} echo "VERSION=${version}" >> ${GITHUB_ENV} echo "RC_NUM=${rc_num}" >> ${GITHUB_ENV} - - name: Download Release Candidate Artifacts + - name: Download Release Candidate Artifacts run: | mkdir release_candidate_artifacts - gh release download ${RELEASE_CANDIDATE_TAG_NAME} --repo apache/arrow --dir release_candidate_artifacts + gh release download ${RELEASE_CANDIDATE_TAG_NAME} \ + --dir release_candidate_artifacts \ + --repo ${GITHUB_REPOSITORY} - name: Create Release Title run: | title="Apache Arrow ${VERSION}" echo "RELEASE_TITLE=${title}" >> ${GITHUB_ENV} # Set the 
release notes to "TODO" temporarily. After the release notes page - # (https://arrow.apache.org/release/{VERSION}.html) is published, use - # gh release edit to update the release notes to refer to the newly + # (https://arrow.apache.org/release/{VERSION}.html) is published, use + # gh release edit to update the release notes to refer to the newly # pushed web page. See dev/post/post-05-update-gh-release-notes.sh - name: Create GitHub Release run: | gh release create ${GITHUB_REF_NAME} \ - --repo apache/arrow \ - --verify-tag \ - --title "${RELEASE_TITLE}" \ --notes "TODO" \ - release_candidate_artifacts/* \ No newline at end of file + --repo ${GITHUB_REPOSITORY} \ + --title "${RELEASE_TITLE}" \ + --verify-tag \ + release_candidate_artifacts/* diff --git a/.github/workflows/release_candidate.yml b/.github/workflows/release_candidate.yml index ec732f0eb33e0..5e222be06aa89 100644 --- a/.github/workflows/release_candidate.yml +++ b/.github/workflows/release_candidate.yml @@ -32,10 +32,10 @@ env: jobs: publish: - name: Publish + name: Publish runs-on: ubuntu-latest timeout-minutes: 5 - steps: + steps: - name: Checkout Arrow uses: actions/checkout@v4 with: @@ -58,8 +58,9 @@ jobs: echo "RELEASE_CANDIDATE_NOTES=${release_notes}" >> ${GITHUB_ENV} - name: Create Release tarball run: | - cd dev/release/ && ./utils-create-release-tarball.sh ${VERSION} ${RC_NUM} + dev/release/utils-create-release-tarball.sh ${VERSION} ${RC_NUM} echo "RELEASE_TARBALL=apache-arrow-${VERSION}.tar.gz" >> ${GITHUB_ENV} + dev/release/utils-generate-checksum.sh "apache-arrow-${VERSION}.tar.gz" - name: Create GitHub Release run: | gh release create ${GITHUB_REF_NAME} \ @@ -67,4 +68,4 @@ jobs: --prerelease \ --title "${RELEASE_CANDIDATE_TITLE}" \ --notes "Release Notes: ${RELEASE_CANDIDATE_NOTES}" \ - dev/release/${RELEASE_TARBALL} + ${RELEASE_TARBALL}* diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index 589b74cd687fd..fe8e8e048693a 100644 --- a/.github/workflows/ruby.yml +++ 
b/.github/workflows/ruby.yml @@ -90,7 +90,7 @@ jobs: key: ubuntu-${{ matrix.ubuntu }}-ruby-${{ hashFiles('cpp/**') }} restore-keys: ubuntu-${{ matrix.ubuntu }}-ruby- - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/swift.yml b/.github/workflows/swift.yml index 2fd55d457c208..971c2590c5af4 100644 --- a/.github/workflows/swift.yml +++ b/.github/workflows/swift.yml @@ -64,7 +64,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python on hosted runner - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3 - name: Setup Archery diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e54fe2393cf3d..e386b87be6e92 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -141,6 +141,18 @@ repos: ( ?^r/src/arrowExports\.cpp$| ) + - repo: https://github.com/rubocop/rubocop + rev: "v1.71.0" + hooks: + - id: rubocop + name: Ruby Format + alias: ruby-format + args: + - "--autocorrect" + exclude: >- + ( + ?^dev/tasks/homebrew-formulae/.*\.rb$| + ) - repo: https://github.com/cheshirekow/cmake-format-precommit rev: v0.6.13 hooks: @@ -183,4 +195,5 @@ repos: ?^ci/scripts/c_glib_build\.sh$| ?^ci/scripts/c_glib_test\.sh$| ?^c_glib/test/run-test\.sh$| + ?^dev/release/utils-generate-checksum\.sh$| ) diff --git a/ci/docker/conda-python-cython2.dockerfile b/.rubocop.yml similarity index 78% rename from ci/docker/conda-python-cython2.dockerfile rename to .rubocop.yml index 859ad868b0c71..0f4f4847e1905 100644 --- a/ci/docker/conda-python-cython2.dockerfile +++ b/.rubocop.yml @@ -15,10 +15,16 @@ # specific language governing permissions and limitations # under the License. 
-ARG repo -ARG arch -ARG python=3.9 -FROM ${repo}:${arch}-conda-python-${python} +# Ruby lint begins minimal. +# All of checkings changed to disable by default. +AllCops: + DisabledByDefault: true -RUN mamba install -q -y "cython<3" && \ - mamba clean --all +Lint: + Enabled: false + +Layout/LineLength: + Max: 100 + +Layout/ArgumentAlignment: + Enabled: true diff --git a/README.md b/README.md index f49ec4b8d98ee..c557716a4a88b 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ Major components of the project include: - [Gandiva](https://github.com/apache/arrow/tree/main/cpp/src/gandiva): an [LLVM](https://llvm.org)-based Arrow expression compiler, part of the C++ codebase - [Go libraries](https://github.com/apache/arrow-go) - - [Java libraries](https://github.com/apache/arrow/tree/main/java) + - [Java libraries](https://github.com/apache/arrow-java) - [JavaScript libraries](https://github.com/apache/arrow/tree/main/js) - [Python libraries](https://github.com/apache/arrow/tree/main/python) - [R libraries](https://github.com/apache/arrow/tree/main/r) diff --git a/c_glib/arrow-glib/basic-array.cpp b/c_glib/arrow-glib/basic-array.cpp index 2169f8a05c77b..0704687fed56a 100644 --- a/c_glib/arrow-glib/basic-array.cpp +++ b/c_glib/arrow-glib/basic-array.cpp @@ -1017,6 +1017,38 @@ garrow_array_concatenate(GArrowArray *array, GList *other_arrays, GError **error } } +/** + * garrow_array_validate: + * @array: A #GArrowArray. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 20.0.0 + */ +gboolean +garrow_array_validate(GArrowArray *array, GError **error) +{ + const auto arrow_array = garrow_array_get_raw(array); + return garrow::check(error, arrow_array->Validate(), "[array][validate]"); +} + +/** + * garrow_array_validate_full: + * @array: A #GArrowArray. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. 
+ * + * Since: 20.0.0 + */ +gboolean +garrow_array_validate_full(GArrowArray *array, GError **error) +{ + const auto arrow_array = garrow_array_get_raw(array); + return garrow::check(error, arrow_array->ValidateFull(), "[array][validate-full]"); +} + G_DEFINE_TYPE(GArrowNullArray, garrow_null_array, GARROW_TYPE_ARRAY) static void diff --git a/c_glib/arrow-glib/basic-array.h b/c_glib/arrow-glib/basic-array.h index dbffedde28164..bc597a8a93104 100644 --- a/c_glib/arrow-glib/basic-array.h +++ b/c_glib/arrow-glib/basic-array.h @@ -126,6 +126,14 @@ GARROW_AVAILABLE_IN_4_0 GArrowArray * garrow_array_concatenate(GArrowArray *array, GList *other_arrays, GError **error); +GARROW_AVAILABLE_IN_20_0 +gboolean +garrow_array_validate(GArrowArray *array, GError **error); + +GARROW_AVAILABLE_IN_20_0 +gboolean +garrow_array_validate_full(GArrowArray *array, GError **error); + #define GARROW_TYPE_NULL_ARRAY (garrow_null_array_get_type()) GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( diff --git a/c_glib/arrow-glib/record-batch.cpp b/c_glib/arrow-glib/record-batch.cpp index be9b361ae0397..2c94919d96609 100644 --- a/c_glib/arrow-glib/record-batch.cpp +++ b/c_glib/arrow-glib/record-batch.cpp @@ -191,12 +191,7 @@ garrow_record_batch_new(GArrowSchema *schema, } auto arrow_record_batch = arrow::RecordBatch::Make(arrow_schema, n_rows, arrow_columns); - auto status = arrow_record_batch->Validate(); - if (garrow_error_check(error, status, tag)) { - return garrow_record_batch_new_raw(&arrow_record_batch); - } else { - return NULL; - } + return garrow_record_batch_new_raw(&arrow_record_batch); } /** @@ -505,6 +500,40 @@ garrow_record_batch_serialize(GArrowRecordBatch *record_batch, } } +/** + * garrow_record_batch_validate + * @record_batch: A #GArrowRecordBatch + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. 
+ * + * Since: 20.0.0 + */ +gboolean +garrow_record_batch_validate(GArrowRecordBatch *record_batch, GError **error) +{ + const auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + return garrow::check(error, arrow_record_batch->Validate(), "[record-batch][validate]"); +} + +/** + * garrow_record_batch_validate_full + * @record_batch: A #GArrowRecordBatch + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 20.0.0 + */ +gboolean +garrow_record_batch_validate_full(GArrowRecordBatch *record_batch, GError **error) +{ + const auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + return garrow::check(error, + arrow_record_batch->ValidateFull(), + "[record-batch][validate-full]"); +} + typedef struct GArrowRecordBatchIteratorPrivate_ { arrow::RecordBatchIterator iterator; diff --git a/c_glib/arrow-glib/record-batch.h b/c_glib/arrow-glib/record-batch.h index e7ffd83795ed4..5a51ad983bbee 100644 --- a/c_glib/arrow-glib/record-batch.h +++ b/c_glib/arrow-glib/record-batch.h @@ -109,6 +109,14 @@ garrow_record_batch_serialize(GArrowRecordBatch *record_batch, GArrowWriteOptions *options, GError **error); +GARROW_AVAILABLE_IN_20_0 +gboolean +garrow_record_batch_validate(GArrowRecordBatch *record_batch, GError **error); + +GARROW_AVAILABLE_IN_20_0 +gboolean +garrow_record_batch_validate_full(GArrowRecordBatch *record_batch, GError **error); + #define GARROW_TYPE_RECORD_BATCH_ITERATOR (garrow_record_batch_iterator_get_type()) GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE(GArrowRecordBatchIterator, diff --git a/c_glib/arrow-glib/table.cpp b/c_glib/arrow-glib/table.cpp index f8569366685a2..7b11ae342cdfb 100644 --- a/c_glib/arrow-glib/table.cpp +++ b/c_glib/arrow-glib/table.cpp @@ -339,20 +339,10 @@ garrow_table_new_values(GArrowSchema *schema, GList *values, GError **error) if (!arrow_chunked_arrays.empty()) { auto arrow_table = arrow::Table::Make(arrow_schema, 
std::move(arrow_chunked_arrays)); - auto status = arrow_table->Validate(); - if (garrow_error_check(error, status, context)) { - return garrow_table_new_raw(&arrow_table); - } else { - return NULL; - } + return garrow_table_new_raw(&arrow_table); } else if (!arrow_arrays.empty()) { auto arrow_table = arrow::Table::Make(arrow_schema, std::move(arrow_arrays)); - auto status = arrow_table->Validate(); - if (garrow_error_check(error, status, context)) { - return garrow_table_new_raw(&arrow_table); - } else { - return NULL; - } + return garrow_table_new_raw(&arrow_table); } else { auto maybe_table = arrow::Table::FromRecordBatches(arrow_schema, std::move(arrow_record_batches)); @@ -390,12 +380,7 @@ garrow_table_new_chunked_arrays(GArrowSchema *schema, } auto arrow_table = arrow::Table::Make(arrow_schema, arrow_chunked_arrays); - auto status = arrow_table->Validate(); - if (garrow_error_check(error, status, "[table][new][chunked-arrays]")) { - return garrow_table_new_raw(&arrow_table); - } else { - return NULL; - } + return garrow_table_new_raw(&arrow_table); } /** @@ -422,12 +407,7 @@ garrow_table_new_arrays(GArrowSchema *schema, } auto arrow_table = arrow::Table::Make(arrow_schema, arrow_arrays); - auto status = arrow_table->Validate(); - if (garrow_error_check(error, status, "[table][new][arrays]")) { - return garrow_table_new_raw(&arrow_table); - } else { - return NULL; - } + return garrow_table_new_raw(&arrow_table); } /** @@ -756,6 +736,24 @@ garrow_table_combine_chunks(GArrowTable *table, GError **error) } } +/** + * garrow_table_validate + * @table: A #GArrowTable + * @error: (nullable): Return location for a #GError or %NULL. + * + * Validate the given table. This is a cheap validation. + * + * Returns: %TRUE on success, %FALSE on error. 
+ * + * Since: 20.0.0 + */ +gboolean +garrow_table_validate(GArrowTable *table, GError **error) +{ + const auto arrow_table = garrow_table_get_raw(table); + return garrow::check(error, arrow_table->Validate(), "[table][validate]"); +} + typedef struct GArrowFeatherWritePropertiesPrivate_ { arrow::ipc::feather::WriteProperties properties; diff --git a/c_glib/arrow-glib/table.h b/c_glib/arrow-glib/table.h index d790e413df5fc..184c7c8f38811 100644 --- a/c_glib/arrow-glib/table.h +++ b/c_glib/arrow-glib/table.h @@ -142,6 +142,10 @@ GARROW_AVAILABLE_IN_0_16 GArrowTable * garrow_table_combine_chunks(GArrowTable *table, GError **error); +GARROW_AVAILABLE_IN_20_0 +gboolean +garrow_table_validate(GArrowTable *table, GError **error); + #define GARROW_TYPE_FEATHER_WRITE_PROPERTIES (garrow_feather_write_properties_get_type()) GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE(GArrowFeatherWriteProperties, diff --git a/c_glib/meson.build b/c_glib/meson.build index bd7843d8bc362..017765cd14626 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -35,7 +35,7 @@ project('arrow-glib', 'c', 'cpp', # * 22.04: 0.61.2 meson_version: '>=0.53.2') -version = '19.0.0-SNAPSHOT' +version = '20.0.0-SNAPSHOT' if version.endswith('-SNAPSHOT') version_numbers = version.split('-')[0].split('.') version_tag = version.split('-')[1] diff --git a/c_glib/parquet-glib/arrow-file-writer.cpp b/c_glib/parquet-glib/arrow-file-writer.cpp index 2b8e2bdeac026..738fb4fd824c8 100644 --- a/c_glib/parquet-glib/arrow-file-writer.cpp +++ b/c_glib/parquet-glib/arrow-file-writer.cpp @@ -574,7 +574,6 @@ gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, /** * gparquet_arrow_file_writer_new_row_group: * @writer: A #GParquetArrowFileWriter. - * @chunk_size: The max number of rows in a row group. * @error: (nullable): Return location for a #GError or %NULL. * * Start a new row group. 
@@ -584,13 +583,11 @@ gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, * Since: 18.0.0 */ gboolean -gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, - gsize chunk_size, - GError **error) +gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, GError **error) { auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); return garrow::check(error, - parquet_arrow_file_writer->NewRowGroup(chunk_size), + parquet_arrow_file_writer->NewRowGroup(), "[parquet][arrow][file-writer][new-row-group]"); } diff --git a/c_glib/parquet-glib/arrow-file-writer.h b/c_glib/parquet-glib/arrow-file-writer.h index 2c82f7c1f87de..4986430c951d0 100644 --- a/c_glib/parquet-glib/arrow-file-writer.h +++ b/c_glib/parquet-glib/arrow-file-writer.h @@ -135,9 +135,7 @@ gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, GPARQUET_AVAILABLE_IN_18_0 gboolean -gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, - gsize chunk_size, - GError **error); +gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, GError **error); GPARQUET_AVAILABLE_IN_18_0 gboolean diff --git a/c_glib/test/dataset/test-file-system-dataset.rb b/c_glib/test/dataset/test-file-system-dataset.rb index 96deedf6b4eb0..25c50ef9e4ff9 100644 --- a/c_glib/test/dataset/test-file-system-dataset.rb +++ b/c_glib/test/dataset/test-file-system-dataset.rb @@ -91,15 +91,15 @@ def create_dataset dataset = @factory.finish expected_table = build_table(count: [ - build_int32_array([1, 10]), - build_int32_array([2]), - build_int32_array([3]), - ], - label: [ - build_string_array(["a", "a"]), - build_string_array(["b"]), - build_string_array(["c"]), - ]) + build_int32_array([1, 10]), + build_int32_array([2]), + build_int32_array([3]), + ], + label: [ + build_string_array(["a", "a"]), + build_string_array(["b"]), + build_string_array(["c"]), + ]) return dataset, expected_table end diff --git 
a/c_glib/test/parquet/test-arrow-file-writer.rb b/c_glib/test/parquet/test-arrow-file-writer.rb index d8344bf1c50b0..418de4782d0b0 100644 --- a/c_glib/test/parquet/test-arrow-file-writer.rb +++ b/c_glib/test/parquet/test-arrow-file-writer.rb @@ -89,10 +89,10 @@ def test_write_table def test_write_chunked_array schema = build_schema("enabled" => :boolean) writer = Parquet::ArrowFileWriter.new(schema, @file.path) - writer.new_row_group(2) + writer.new_row_group chunked_array = Arrow::ChunkedArray.new([build_boolean_array([true, nil])]) writer.write_chunked_array(chunked_array) - writer.new_row_group(1) + writer.new_row_group chunked_array = Arrow::ChunkedArray.new([build_boolean_array([false])]) writer.write_chunked_array(chunked_array) writer.close diff --git a/c_glib/test/test-array.rb b/c_glib/test/test-array.rb index 4da641b203f9c..18a54a2963134 100644 --- a/c_glib/test/test-array.rb +++ b/c_glib/test/test-array.rb @@ -118,8 +118,9 @@ def test_to_s sub_test_case("#view") do def test_valid + int32_array = build_int32_array([0, 1069547520, -1071644672, nil]) assert_equal(build_float_array([0.0, 1.5, -2.5, nil]), - build_int32_array([0, 1069547520, -1071644672, nil]).view(Arrow::FloatDataType.new)) + int32_array.view(Arrow::FloatDataType.new)) end def test_invalid @@ -185,4 +186,47 @@ def test_mixed_type end end end + + sub_test_case("#validate") do + def test_valid + array = build_int32_array([1, 2, 3, 4, 5]) + assert do + array.validate + end + end + + def test_invalid + message = "[array][validate]: Invalid: Array length is negative" + array = Arrow::Int8Array.new(-1, Arrow::Buffer.new(""), Arrow::Buffer.new(""), -1) + assert_raise(Arrow::Error::Invalid.new(message)) do + array.validate + end + end + end + + sub_test_case("#validate_full") do + def test_valid + array = build_int32_array([1, 2, 3, 4, 5]) + assert do + array.validate_full + end + end + + def test_invalid + message = "[array][validate-full]: Invalid: Invalid UTF8 sequence at string index 0" + + # 
U+3042 HIRAGANA LETTER A, U+3044 HIRAGANA LETTER I + data = "\u3042\u3044".b[0..-2] + value_offsets = Arrow::Buffer.new([0, data.size].pack("l*")) + array = Arrow::StringArray.new(1, + value_offsets, + Arrow::Buffer.new(data), + Arrow::Buffer.new([0b01].pack("C*")), + -1) + + assert_raise(Arrow::Error::Invalid.new(message)) do + array.validate_full + end + end + end end diff --git a/c_glib/test/test-chunked-array-datum.rb b/c_glib/test/test-chunked-array-datum.rb index b82f3eed8a7af..99e35fc57b085 100644 --- a/c_glib/test/test-chunked-array-datum.rb +++ b/c_glib/test/test-chunked-array-datum.rb @@ -49,7 +49,14 @@ def test_false end def test_to_string - assert_equal("ChunkedArray([\n" + " [\n" + " true,\n" + " false\n" + " ]\n" + "])", @datum.to_s) + assert_equal(<<-DATUM.chomp, @datum.to_s) +ChunkedArray([ + [ + true, + false + ] +]) + DATUM end def test_value diff --git a/c_glib/test/test-large-list-array.rb b/c_glib/test/test-large-list-array.rb index 2f7efab5a074a..fa9c92ec87d0c 100644 --- a/c_glib/test/test-large-list-array.rb +++ b/c_glib/test/test-large-list-array.rb @@ -88,10 +88,10 @@ def test_value_length def test_value_offsets array = build_large_list_array(Arrow::Int8DataType.new, - [ - [-29, 29], - [-1, 0, 1], - ]) + [ + [-29, 29], + [-1, 0, 1], + ]) assert_equal([0, 2, 5], array.value_offsets) end diff --git a/c_glib/test/test-record-batch-datum.rb b/c_glib/test/test-record-batch-datum.rb index ec572e0f13023..c50e50f9029e8 100644 --- a/c_glib/test/test-record-batch-datum.rb +++ b/c_glib/test/test-record-batch-datum.rb @@ -49,7 +49,13 @@ def test_false end def test_to_string - assert_equal("RecordBatch(visible: [\n" + " true,\n" + " false\n" + " ]\n" + ")", @datum.to_s) + assert_equal(<<-DATUM.chomp, @datum.to_s) +RecordBatch(visible: [ + true, + false + ] +) + DATUM end def test_value diff --git a/c_glib/test/test-record-batch.rb b/c_glib/test/test-record-batch.rb index bbdbf82d07689..ff821ddf028fe 100644 --- a/c_glib/test/test-record-batch.rb +++ 
b/c_glib/test/test-record-batch.rb @@ -189,5 +189,78 @@ def test_serialize assert_equal(@record_batch, input_stream.read_record_batch(@record_batch.schema)) end + + sub_test_case("#validate") do + def setup + @id_field = Arrow::Field.new("id", Arrow::UInt8DataType.new) + @name_field = Arrow::Field.new("name", Arrow::StringDataType.new) + @schema = Arrow::Schema.new([@id_field, @name_field]) + + @id_value = build_uint_array([1]) + @name_value = build_string_array(["abc"]) + @values = [@id_value, @name_value] + end + + def test_valid + n_rows = @id_value.length + record_batch = Arrow::RecordBatch.new(@schema, n_rows, @values) + + assert do + record_batch.validate + end + end + + def test_invalid + message = "[record-batch][validate]: Invalid: " + + "Number of rows in column 0 did not match batch: 1 vs 2" + n_rows = @id_value.length + 1 # incorrect number of rows + + record_batch = Arrow::RecordBatch.new(@schema, n_rows, @values) + assert_raise(Arrow::Error::Invalid.new(message)) do + record_batch.validate + end + end + end + + sub_test_case("#validate_full") do + def setup + @id_field = Arrow::Field.new("uint8", Arrow::UInt8DataType.new) + @name_field = Arrow::Field.new("string", Arrow::StringDataType.new) + @schema = Arrow::Schema.new([@id_field, @name_field]) + + @uint8_value = build_uint_array([1]) + @valid_name_value = build_string_array(["abc"]) + @n_rows = @uint8_value.length + + # U+3042 HIRAGANA LETTER A, U+3044 HIRAGANA LETTER I + data = "\u3042\u3044".b[0..-2] + value_offsets = Arrow::Buffer.new([0,data.size].pack("l*")) + @invalid_name_value = Arrow::StringArray.new(1, + value_offsets, + Arrow::Buffer.new(data), + nil, + -1) + end + + def test_valid + columns = [@uint8_value, @valid_name_value] + record_batch = Arrow::RecordBatch.new(@schema, @n_rows, columns) + + assert do + record_batch.validate_full + end + end + + def test_invalid + message = "[record-batch][validate-full]: Invalid: " + + "In column 1: Invalid: Invalid UTF8 sequence at string index 0" 
+ columns = [@uint8_value, @invalid_name_value] + record_batch = Arrow::RecordBatch.new(@schema, @n_rows, columns) + + assert_raise(Arrow::Error::Invalid.new(message)) do + record_batch.validate_full + end + end + end end end diff --git a/c_glib/test/test-struct-field-options.rb b/c_glib/test/test-struct-field-options.rb index 4a614de6df6e7..f9b492fc4da94 100644 --- a/c_glib/test/test-struct-field-options.rb +++ b/c_glib/test/test-struct-field-options.rb @@ -42,7 +42,8 @@ def test_set_dot_path end def test_set_invalid - message = "[struct-field-options][set-field-ref]: Invalid: Dot path '[foo]' contained an unterminated index" + message = "[struct-field-options][set-field-ref]: " + + "Invalid: Dot path '[foo]' contained an unterminated index" assert_raise(Arrow::Error::Invalid.new(message)) do @options.field_ref = "[foo]" end diff --git a/c_glib/test/test-table.rb b/c_glib/test/test-table.rb index 615a90c2f0baf..8e11340094696 100644 --- a/c_glib/test/test-table.rb +++ b/c_glib/test/test-table.rb @@ -243,6 +243,37 @@ def test_combine_chunks all_values) end + sub_test_case("#validate") do + def setup + @id_field = Arrow::Field.new("id", Arrow::UInt8DataType.new) + @name_field = Arrow::Field.new("name", Arrow::StringDataType.new) + @schema = Arrow::Schema.new([@id_field, @name_field]) + + @id_array = build_uint_array([1]) + @name_array = build_string_array(["abc"]) + @arrays = [@id_array, @name_array] + end + + def test_valid + table = Arrow::Table.new(@schema, @arrays) + + assert do + table.validate + end + end + + def test_invalid + message = "[table][validate]: Invalid: " + + "Column 1 named name expected length 1 but got length 2" + + invalid_values = [@id_array, build_string_array(["abc", "def"])] + table = Arrow::Table.new(@schema, invalid_values) + assert_raise(Arrow::Error::Invalid.new(message)) do + table.validate + end + end + end + sub_test_case("#write_as_feather") do def setup super diff --git a/c_glib/test/test-uint-array-builder.rb 
b/c_glib/test/test-uint-array-builder.rb index 89621189b4571..3aa3a1c488d83 100644 --- a/c_glib/test/test-uint-array-builder.rb +++ b/c_glib/test/test-uint-array-builder.rb @@ -32,9 +32,9 @@ def test_uint16 values = [0, border_value] assert_equal(build_uint_array([*values, nil]), Arrow::UInt16Array.new(3, - Arrow::Buffer.new(values.pack("S*")), - Arrow::Buffer.new([0b011].pack("C*")), - -1)) + Arrow::Buffer.new(values.pack("S*")), + Arrow::Buffer.new([0b011].pack("C*")), + -1)) end def test_uint32 @@ -42,9 +42,9 @@ def test_uint32 values = [0, border_value] assert_equal(build_uint_array([*values, nil]), Arrow::UInt32Array.new(3, - Arrow::Buffer.new(values.pack("L*")), - Arrow::Buffer.new([0b011].pack("C*")), - -1)) + Arrow::Buffer.new(values.pack("L*")), + Arrow::Buffer.new([0b011].pack("C*")), + -1)) end def test_uint64 diff --git a/c_glib/tool/generate-version-header.py b/c_glib/tool/generate-version-header.py index 4995ce570aeb0..6a8976204c05a 100755 --- a/c_glib/tool/generate-version-header.py +++ b/c_glib/tool/generate-version-header.py @@ -140,6 +140,7 @@ def generate_availability_macros(library: str) -> str: ALL_VERSIONS = [ + (20, 0), (19, 0), (18, 0), (17, 0), diff --git a/c_glib/vcpkg.json b/c_glib/vcpkg.json index f2717f7e27cf2..5873fd9f28ec2 100644 --- a/c_glib/vcpkg.json +++ b/c_glib/vcpkg.json @@ -1,6 +1,6 @@ { "name": "arrow-glib", - "version-string": "19.0.0-SNAPSHOT", + "version-string": "20.0.0-SNAPSHOT", "dependencies": [ "glib", "gobject-introspection", diff --git a/ci/appveyor-cpp-setup.bat b/ci/appveyor-cpp-setup.bat index 912b130acff45..ff159bd0b4b59 100644 --- a/ci/appveyor-cpp-setup.bat +++ b/ci/appveyor-cpp-setup.bat @@ -70,7 +70,6 @@ conda create -n arrow ^ "ninja" ^ "nomkl" ^ "pandas" ^ - "fsspec" ^ "python=%PYTHON%" ^ || exit /B conda list -n arrow @@ -86,7 +85,7 @@ set CXX=cl.exe @rem Download Minio somewhere on PATH, for unit tests @rem if "%ARROW_S3%" == "ON" ( - appveyor DownloadFile 
https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2024-09-13T20-26-02Z -FileName C:\Windows\Minio.exe || exit /B + appveyor DownloadFile https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2025-01-20T14-49-07Z -FileName C:\Windows\Minio.exe || exit /B ) @rem diff --git a/ci/conan/all/conandata.yml b/ci/conan/all/conandata.yml index fb75f3995c62e..a13b31c2e82df 100644 --- a/ci/conan/all/conandata.yml +++ b/ci/conan/all/conandata.yml @@ -21,64 +21,47 @@ # SOFTWARE. sources: + "18.1.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-18.1.0/apache-arrow-18.1.0.tar.gz?action=download" + sha256: "2dc8da5f8796afe213ecc5e5aba85bb82d91520eff3cf315784a52d0fa61d7fc" + "18.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-18.0.0/apache-arrow-18.0.0.tar.gz?action=download" + sha256: "abcf1934cd0cdddd33664e9f2d9a251d6c55239d1122ad0ed223b13a583c82a9" + "17.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-17.0.0/apache-arrow-17.0.0.tar.gz?action=download" + sha256: "9d280d8042e7cf526f8c28d170d93bfab65e50f94569f6a790982a878d8d898d" + "16.1.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-16.1.0/apache-arrow-16.1.0.tar.gz?action=download" + sha256: "c9e60c7e87e59383d21b20dc874b17153729ee153264af6d21654b7dff2c60d7" "15.0.0": url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-15.0.0/apache-arrow-15.0.0.tar.gz?action=download" sha256: "01dd3f70e85d9b5b933ec92c0db8a4ef504a5105f78d2d8622e84279fb45c25d" "14.0.2": url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-14.0.2/apache-arrow-14.0.2.tar.gz?action=download" sha256: "1304dedb41896008b89fe0738c71a95d9b81752efc77fa70f264cb1da15d9bc2" - "14.0.1": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-14.0.1/apache-arrow-14.0.1.tar.gz?action=download" - sha256: "5c70eafb1011f9d124bafb328afe54f62cc5b9280b7080e1e3d668f78c0e407e" - "14.0.0": - url: 
"https://www.apache.org/dyn/closer.lua/arrow/arrow-14.0.0/apache-arrow-14.0.0.tar.gz?action=download" - sha256: "4eb0da50ec071baf15fc163cb48058931e006f1c862c8def0e180fd07d531021" - "13.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-13.0.0/apache-arrow-13.0.0.tar.gz?action=download" - sha256: "35dfda191262a756be934eef8afee8d09762cad25021daa626eb249e251ac9e6" - "12.0.1": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-12.0.1/apache-arrow-12.0.1.tar.gz?action=download" - sha256: "3481c411393aa15c75e88d93cf8315faf7f43e180fe0790128d3840d417de858" - "12.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-12.0.0/apache-arrow-12.0.0.tar.gz?action=download" - sha256: "ddd8347882775e53af7d0965a1902b7d8fcd0a030fd14f783d4f85e821352d52" - "11.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-11.0.0/apache-arrow-11.0.0.tar.gz?action=download" - sha256: "2dd8f0ea0848a58785628ee3a57675548d509e17213a2f5d72b0d900b43f5430" - "10.0.1": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-10.0.1/apache-arrow-10.0.1.tar.gz?action=download" - sha256: "c814e0670112a22c1a6ec03ab420a52ae236a9a42e9e438c3cbd37f37e658fb3" - "10.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-10.0.0/apache-arrow-10.0.0.tar.gz?action=download" - sha256: "5b46fa4c54f53e5df0019fe0f9d421e93fc906b625ebe8e89eed010d561f1f12" - "8.0.1": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-8.0.1/apache-arrow-8.0.1.tar.gz?action=download" - sha256: "82d46929f7574715551da21700f100b39f99c3c4d6790f26cac86d869d64e94e" - "8.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-8.0.0/apache-arrow-8.0.0.tar.gz?action=download" - sha256: "ad9a05705117c989c116bae9ac70492fe015050e1b80fb0e38fde4b5d863aaa3" - "7.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-7.0.0/apache-arrow-7.0.0.tar.gz?action=download" - sha256: "e8f49b149a15ecef4e40fcfab1b87c113c6b1ee186005c169e5cdf95d31a99de" patches: - "8.0.1": - - patch_file: 
"patches/8.0.0-0005-install-utils.patch" - patch_description: "enable utils installation" + "18.1.0": + - patch_file: "patches/18.0.0-0001-fix-cmake.patch" + patch_description: "use cci package" patch_type: "conan" - - patch_file: "patches/8.0.0-0006-fix-cmake.patch" + "18.0.0": + - patch_file: "patches/18.0.0-0001-fix-cmake.patch" patch_description: "use cci package" patch_type: "conan" - "8.0.0": - - patch_file: "patches/8.0.0-0005-install-utils.patch" - patch_description: "enable utils installation" + "17.0.0": + - patch_file: "patches/16.0.0-0001-fix-cmake.patch" + patch_description: "use cci package" patch_type: "conan" - - patch_file: "patches/8.0.0-0006-fix-cmake.patch" + "16.1.0": + - patch_file: "patches/16.0.0-0001-fix-cmake.patch" patch_description: "use cci package" patch_type: "conan" - "7.0.0": - - patch_file: "patches/7.0.0-0006-install-utils.patch" - patch_description: "enable utils installation" + "15.0.0": + - patch_file: "patches/11.0.0-0001-fix-cmake.patch" + patch_description: "use cci package" patch_type: "conan" - - patch_file: "patches/7.0.0-0007-fix-cmake.patch" + "14.0.2": + - patch_file: "patches/11.0.0-0001-fix-cmake.patch" patch_description: "use cci package" patch_type: "conan" + \ No newline at end of file diff --git a/ci/conan/all/conanfile.py b/ci/conan/all/conanfile.py index 178cd03da1555..5db9fe356726a 100644 --- a/ci/conan/all/conanfile.py +++ b/ci/conan/all/conanfile.py @@ -31,7 +31,7 @@ import os import glob -required_conan_version = ">=1.53.0" +required_conan_version = ">=2.1.0" class ArrowConan(ConanFile): name = "arrow" @@ -93,7 +93,7 @@ class ArrowConan(ConanFile): "shared": False, "fPIC": True, "gandiva": False, - "parquet": False, + "parquet": True, "skyhook": False, "substrait": False, "acero": False, @@ -108,7 +108,7 @@ class ArrowConan(ConanFile): "simd_level": "default", "runtime_simd_level": "max", "with_backtrace": False, - "with_boost": False, + "with_boost": True, "with_brotli": False, "with_bz2": False, 
"with_csv": False, @@ -122,7 +122,7 @@ class ArrowConan(ConanFile): "with_glog": False, "with_grpc": False, "with_json": False, - "with_thrift": False, + "with_thrift": True, "with_llvm": False, "with_openssl": False, "with_opentelemetry": False, @@ -133,7 +133,7 @@ class ArrowConan(ConanFile): "with_utf8proc": False, "with_lz4": False, "with_snappy": False, - "with_zlib": False, + "with_zlib": True, "with_zstd": False, } short_paths = True @@ -144,21 +144,6 @@ def _min_cppstd(self): # https://github.com/apache/arrow/pull/13991 return "11" if Version(self.version) < "10.0.0" else "17" - @property - def _compilers_minimum_version(self): - return { - "11": { - "clang": "3.9", - }, - "17": { - "gcc": "8", - "clang": "7", - "apple-clang": "10", - "Visual Studio": "15", - "msvc": "191", - }, - }.get(self._min_cppstd, {}) - def export_sources(self): export_conandata_patches(self) copy(self, "conan_cmake_project_include.cmake", self.recipe_folder, os.path.join(self.export_sources_folder, "src")) @@ -183,15 +168,15 @@ def _requires_rapidjson(self): def requirements(self): if self.options.with_thrift: - self.requires("thrift/0.17.0") + self.requires("thrift/0.20.0") if self.options.with_protobuf: - self.requires("protobuf/3.21.9") + self.requires("protobuf/3.21.12") if self.options.with_jemalloc: self.requires("jemalloc/5.3.0") if self.options.with_mimalloc: self.requires("mimalloc/1.7.6") if self.options.with_boost: - self.requires("boost/1.84.0") + self.requires("boost/1.85.0") if self.options.with_gflags: self.requires("gflags/2.2.2") if self.options.with_glog: @@ -223,18 +208,23 @@ def requirements(self): if self.options.with_snappy: self.requires("snappy/1.1.9") if self.options.get_safe("simd_level") != None or \ - self.options.get_safe("runtime_simd_level") != None: - self.requires("xsimd/9.0.1") + self.options.get_safe("runtime_simd_level") != None: + if Version(self.version) < 8: + self.requires("xsimd/9.0.1") + else: + self.requires("xsimd/13.0.0") if 
self.options.with_zlib: self.requires("zlib/[>=1.2.11 <2]") if self.options.with_zstd: - self.requires("zstd/1.5.5") + self.requires("zstd/[>=1.5 <1.6]") if self.options.with_re2: self.requires("re2/20230301") if self.options.with_utf8proc: self.requires("utf8proc/2.8.0") if self.options.with_backtrace: self.requires("libbacktrace/cci.20210118") + if self.options.with_orc: + self.requires("orc/2.0.0") def validate(self): # Do not allow options with 'auto' value @@ -247,27 +237,35 @@ def validate(self): # From https://github.com/conan-io/conan-center-index/pull/23163#issuecomment-2039808851 if self.options.gandiva: if not self.options.with_re2: - raise ConanException("'with_re2' option should be True when'gandiva=True'") + raise ConanException("'with_re2' option should be True when 'gandiva=True'") if not self.options.with_boost: - raise ConanException("'with_boost' option should be True when'gandiva=True'") + raise ConanException("'with_boost' option should be True when 'gandiva=True'") if not self.options.with_utf8proc: - raise ConanException("'with_utf8proc' option should be True when'gandiva=True'") + raise ConanException("'with_utf8proc' option should be True when 'gandiva=True'") + if self.options.with_thrift and not self.options.with_boost: + raise ConanException("'with_boost' option should be True when 'thrift=True'") + if self.options.parquet: + if not self.options.with_thrift: + raise ConanException("'with_thrift' option should be True when 'parquet=True'") + if self.options.with_flight_rpc and not self.options.with_protobuf: + raise ConanException("'with_protobuf' option should be True when 'with_flight_rpc=True'") if self.settings.compiler.get_safe("cppstd"): check_min_cppstd(self, self._min_cppstd) - minimum_version = self._compilers_minimum_version.get(str(self.settings.compiler), False) - if minimum_version and Version(self.settings.compiler.version) < minimum_version: + if ( + Version(self.version) < "10.0.0" + and self.settings.compiler == "clang" + 
and Version(self.settings.compiler.version) < "3.9" + ): raise ConanInvalidConfiguration( - f"{self.ref} requires C++{self._min_cppstd}, which your compiler does not support." + f"{self.ref} requires C++11, which needs at least clang-3.9" ) if self.options.get_safe("skyhook", False): raise ConanInvalidConfiguration("CCI has no librados recipe (yet)") if self.options.with_cuda: raise ConanInvalidConfiguration("CCI has no cuda recipe (yet)") - if self.options.with_orc: - raise ConanInvalidConfiguration("CCI has no orc recipe (yet)") if self.options.with_s3 and not self.dependencies["aws-sdk-cpp"].options.config: raise ConanInvalidConfiguration("arrow:with_s3 requires aws-sdk-cpp:config is True.") @@ -275,6 +273,11 @@ def validate(self): if self.dependencies["jemalloc"].options.enable_cxx: raise ConanInvalidConfiguration("jemmalloc.enable_cxx of a static jemalloc must be disabled") + if self.options.with_thrift and not self.options.with_zlib: + raise ConanInvalidConfiguration("arrow:with_thrift requires arrow:with_zlib") + + if self.options.parquet and not self.options.with_thrift: + raise ConanInvalidConfiguration("arrow:parquet requires arrow:with_thrift") def build_requirements(self): if Version(self.version) >= "13.0.0": @@ -352,6 +355,7 @@ def generate(self): tc.variables["GLOG_SOURCE"] = "SYSTEM" tc.variables["ARROW_WITH_BACKTRACE"] = bool(self.options.with_backtrace) tc.variables["ARROW_WITH_BROTLI"] = bool(self.options.with_brotli) + tc.variables["ARROW_WITH_RE2"] = bool(self.options.with_re2) tc.variables["brotli_SOURCE"] = "SYSTEM" if self.options.with_brotli: tc.variables["ARROW_BROTLI_USE_SHARED"] = bool(self.dependencies["brotli"].options.shared) @@ -383,8 +387,10 @@ def generate(self): tc.variables["ARROW_ZSTD_USE_SHARED"] = bool(self.dependencies["zstd"].options.shared) tc.variables["ORC_SOURCE"] = "SYSTEM" tc.variables["ARROW_WITH_THRIFT"] = bool(self.options.with_thrift) + tc.variables["ARROW_THRIFT"] = bool(self.options.with_thrift) 
tc.variables["Thrift_SOURCE"] = "SYSTEM" if self.options.with_thrift: + tc.variables["ARROW_THRIFT"] = True tc.variables["THRIFT_VERSION"] = bool(self.dependencies["thrift"].ref.version) # a recent thrift does not require boost tc.variables["ARROW_THRIFT_USE_SHARED"] = bool(self.dependencies["thrift"].options.shared) tc.variables["ARROW_USE_OPENSSL"] = self.options.with_openssl @@ -444,28 +450,53 @@ def _patch_sources(self): def build(self): self._patch_sources() - cmake =CMake(self) + cmake = CMake(self) cmake.configure(build_script_folder=os.path.join(self.source_folder, "cpp")) cmake.build() def package(self): copy(self, pattern="LICENSE.txt", dst=os.path.join(self.package_folder, "licenses"), src=self.source_folder) copy(self, pattern="NOTICE.txt", dst=os.path.join(self.package_folder, "licenses"), src=self.source_folder) - cmake =CMake(self) + cmake = CMake(self) cmake.install() rmdir(self, os.path.join(self.package_folder, "lib", "cmake")) rmdir(self, os.path.join(self.package_folder, "lib", "pkgconfig")) rmdir(self, os.path.join(self.package_folder, "share")) + cmake_suffix = "shared" if self.options.shared else "static" + + alias_map = { f"Arrow::arrow_{cmake_suffix}": f"arrow::arrow_{cmake_suffix}" } + + if self.options.parquet: + alias_map[f"Parquet::parquet_{cmake_suffix}"] = f"arrow::parquet_{cmake_suffix}" + + if self.options.get_safe("substrait"): + alias_map[f"Arrow::arrow_substrait_{cmake_suffix}"] = f"arrow::arrow_substrait_{cmake_suffix}" + + if self.options.acero: + alias_map[f"Arrow::arrow_acero_{cmake_suffix}"] = f"arrow::arrow_acero_{cmake_suffix}" + + if self.options.gandiva: + alias_map[f"Gandiva::gandiva_{cmake_suffix}"] = f"arrow::gandiva_{cmake_suffix}" + + if self.options.with_flight_rpc: + alias_map[f"ArrowFlight::arrow_flight_sql_{cmake_suffix}"] = f"arrow::arrow_flight_sql_{cmake_suffix}" + + @property + def _module_subfolder(self): + return os.path.join("lib", "cmake") + def package_info(self): # FIXME: fix CMake targets of 
components self.cpp_info.set_property("cmake_file_name", "Arrow") suffix = "_static" if is_msvc(self) and not self.options.shared else "" + cmake_suffix = "shared" if self.options.shared else "static" self.cpp_info.components["libarrow"].set_property("pkg_config_name", "arrow") + self.cpp_info.components["libarrow"].set_property("cmake_target_name", f"Arrow::arrow_{cmake_suffix}") self.cpp_info.components["libarrow"].libs = [f"arrow{suffix}"] if not self.options.shared: self.cpp_info.components["libarrow"].defines = ["ARROW_STATIC"] @@ -474,6 +505,7 @@ def package_info(self): if self.options.parquet: self.cpp_info.components["libparquet"].set_property("pkg_config_name", "parquet") + self.cpp_info.components["libparquet"].set_property("cmake_target_name", f"Parquet::parquet_{cmake_suffix}") self.cpp_info.components["libparquet"].libs = [f"parquet{suffix}"] self.cpp_info.components["libparquet"].requires = ["libarrow"] if not self.options.shared: @@ -481,6 +513,7 @@ def package_info(self): if self.options.get_safe("substrait"): self.cpp_info.components["libarrow_substrait"].set_property("pkg_config_name", "arrow_substrait") + self.cpp_info.components["libarrow_substrait"].set_property("cmake_target_name", f"Arrow::arrow_substrait_{cmake_suffix}") self.cpp_info.components["libarrow_substrait"].libs = [f"arrow_substrait{suffix}"] self.cpp_info.components["libarrow_substrait"].requires = ["libparquet", "dataset"] @@ -488,6 +521,8 @@ def package_info(self): del self.options.plasma if self.options.acero: + self.cpp_info.components["libacero"].set_property("pkg_config_name", "acero") + self.cpp_info.components["libacero"].set_property("cmake_target_name", f"Acero::arrow_acero_{cmake_suffix}") self.cpp_info.components["libacero"].libs = [f"arrow_acero{suffix}"] self.cpp_info.components["libacero"].names["cmake_find_package"] = "acero" self.cpp_info.components["libacero"].names["cmake_find_package_multi"] = "acero" @@ -496,6 +531,7 @@ def package_info(self): if 
self.options.gandiva: self.cpp_info.components["libgandiva"].set_property("pkg_config_name", "gandiva") + self.cpp_info.components["libgandiva"].set_property("cmake_target_name", f"Gandiva::gandiva_{cmake_suffix}") self.cpp_info.components["libgandiva"].libs = [f"gandiva{suffix}"] self.cpp_info.components["libgandiva"].requires = ["libarrow"] if not self.options.shared: @@ -503,11 +539,16 @@ def package_info(self): if self.options.with_flight_rpc: self.cpp_info.components["libarrow_flight"].set_property("pkg_config_name", "flight_rpc") + self.cpp_info.components["libarrow_flight"].set_property("cmake_target_name", f"ArrowFlight::arrow_flight_{cmake_suffix}") self.cpp_info.components["libarrow_flight"].libs = [f"arrow_flight{suffix}"] self.cpp_info.components["libarrow_flight"].requires = ["libarrow"] + # https://github.com/apache/arrow/pull/43137#pullrequestreview-2267476893 + if Version(self.version) >= "18.0.0" and self.options.with_openssl: + self.cpp_info.components["libarrow_flight"].requires.append("openssl::openssl") if self.options.get_safe("with_flight_sql"): self.cpp_info.components["libarrow_flight_sql"].set_property("pkg_config_name", "flight_sql") + self.cpp_info.components["libarrow_flight_sql"].set_property("cmake_target_name", f"ArrowFlightSql::arrow_flight_sql_{cmake_suffix}") self.cpp_info.components["libarrow_flight_sql"].libs = [f"arrow_flight_sql{suffix}"] self.cpp_info.components["libarrow_flight_sql"].requires = ["libarrow", "libarrow_flight"] @@ -560,7 +601,8 @@ def package_info(self): if self._requires_rapidjson(): self.cpp_info.components["libarrow"].requires.append("rapidjson::rapidjson") if self.options.with_s3: - self.cpp_info.components["libarrow"].requires.append("aws-sdk-cpp::s3") + # https://github.com/apache/arrow/blob/6b268f62a8a172249ef35f093009c740c32e1f36/cpp/src/arrow/CMakeLists.txt#L98 + self.cpp_info.components["libarrow"].requires.extend([f"aws-sdk-cpp::{x}" for x in ["cognito-identity", "core", "identity-management", "s3", 
"sts"]]) if self.options.get_safe("with_gcs"): self.cpp_info.components["libarrow"].requires.append("google-cloud-cpp::storage") if self.options.with_orc: @@ -581,32 +623,7 @@ def package_info(self): self.cpp_info.components["libarrow"].requires.append("zlib::zlib") if self.options.with_zstd: self.cpp_info.components["libarrow"].requires.append("zstd::zstd") - if self.options.with_boost: - self.cpp_info.components["libarrow"].requires.append("boost::boost") if self.options.with_grpc: self.cpp_info.components["libarrow"].requires.append("grpc::grpc") if self.options.with_flight_rpc: self.cpp_info.components["libarrow_flight"].requires.append("protobuf::protobuf") - - # TODO: to remove in conan v2 - self.cpp_info.filenames["cmake_find_package"] = "Arrow" - self.cpp_info.filenames["cmake_find_package_multi"] = "Arrow" - self.cpp_info.components["libarrow"].names["cmake_find_package"] = "arrow" - self.cpp_info.components["libarrow"].names["cmake_find_package_multi"] = "arrow" - if self.options.parquet: - self.cpp_info.components["libparquet"].names["cmake_find_package"] = "parquet" - self.cpp_info.components["libparquet"].names["cmake_find_package_multi"] = "parquet" - if self.options.get_safe("substrait"): - self.cpp_info.components["libarrow_substrait"].names["cmake_find_package"] = "arrow_substrait" - self.cpp_info.components["libarrow_substrait"].names["cmake_find_package_multi"] = "arrow_substrait" - if self.options.gandiva: - self.cpp_info.components["libgandiva"].names["cmake_find_package"] = "gandiva" - self.cpp_info.components["libgandiva"].names["cmake_find_package_multi"] = "gandiva" - if self.options.with_flight_rpc: - self.cpp_info.components["libarrow_flight"].names["cmake_find_package"] = "flight_rpc" - self.cpp_info.components["libarrow_flight"].names["cmake_find_package_multi"] = "flight_rpc" - if self.options.get_safe("with_flight_sql"): - self.cpp_info.components["libarrow_flight_sql"].names["cmake_find_package"] = "flight_sql" - 
self.cpp_info.components["libarrow_flight_sql"].names["cmake_find_package_multi"] = "flight_sql" - if self.options.cli and (self.options.with_cuda or self.options.with_flight_rpc or self.options.parquet): - self.env_info.PATH.append(os.path.join(self.package_folder, "bin")) diff --git a/ci/conan/all/patches/1.0.0-0003-fix-shared-msvc.patch b/ci/conan/all/patches/1.0.0-0003-fix-shared-msvc.patch deleted file mode 100644 index 45210d1b8cc51..0000000000000 --- a/ci/conan/all/patches/1.0.0-0003-fix-shared-msvc.patch +++ /dev/null @@ -1,35 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- ---- cpp/src/arrow/CMakeLists.txt -+++ cpp/src/arrow/CMakeLists.txt -@@ -490,6 +490,10 @@ - target_compile_definitions(arrow_static PUBLIC ARROW_STATIC) - endif() - -+if(ARROW_BUILD_SHARED AND WIN32) -+target_compile_definitions(arrow_shared PRIVATE ARROW_EXPORTING) -+endif() -+ - if(ARROW_WITH_BACKTRACE) - find_package(Backtrace) - diff --git a/ci/conan/all/patches/1.0.0-0005-fix-make12-namespace.patch b/ci/conan/all/patches/1.0.0-0005-fix-make12-namespace.patch deleted file mode 100644 index 199804bff00ab..0000000000000 --- a/ci/conan/all/patches/1.0.0-0005-fix-make12-namespace.patch +++ /dev/null @@ -1,44 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -diff --git a/cpp/src/arrow/vendored/datetime/date.h b/cpp/src/arrow/vendored/datetime/date.h -index 02a4909..2b168d2 100644 ---- a/cpp/src/arrow/vendored/datetime/date.h -+++ b/cpp/src/arrow/vendored/datetime/date.h -@@ -5152,7 +5152,7 @@ to_stream(std::basic_ostream& os, const CharT* fmt, - if (modified == CharT{}) - #endif - { -- auto h = *fmt == CharT{'I'} ? make12(hms.hours()) : hms.hours(); -+ auto h = *fmt == CharT{'I'} ? arrow_vendored::date::make12(hms.hours()) : hms.hours(); - if (h < hours{10}) - os << CharT{'0'}; - os << h.count(); -@@ -5366,7 +5366,7 @@ to_stream(std::basic_ostream& os, const CharT* fmt, - save_ostream _(os); - os.fill('0'); - os.width(2); -- os << make12(tod.hours()).count() << CharT{':'}; -+ os << arrow_vendored::date::make12(tod.hours()).count() << CharT{':'}; - os.width(2); - os << tod.minutes().count() << CharT{':'}; - os.width(2); diff --git a/ci/conan/all/patches/1.0.0-0006-fix-cmake.patch b/ci/conan/all/patches/1.0.0-0006-fix-cmake.patch deleted file mode 100644 index 3ecd0bf9f3968..0000000000000 --- a/ci/conan/all/patches/1.0.0-0006-fix-cmake.patch +++ /dev/null @@ -1,355 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt -index 300f043..0127a7a 100644 ---- a/cpp/CMakeLists.txt -+++ b/cpp/CMakeLists.txt -@@ -654,7 +654,7 @@ endif() - - if(ARROW_WITH_BROTLI) - # Order is important for static linking -- set(ARROW_BROTLI_LIBS Brotli::brotlienc Brotli::brotlidec Brotli::brotlicommon) -+ set(ARROW_BROTLI_LIBS brotli::brotlienc brotli::brotlidec brotli::brotlicommon) - list(APPEND ARROW_LINK_LIBS ${ARROW_BROTLI_LIBS}) - list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_BROTLI_LIBS}) - endif() -@@ -664,7 +664,7 @@ if(ARROW_WITH_BZ2) - endif() - - if(ARROW_WITH_LZ4) -- list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4) -+ list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4_static) - endif() - - if(ARROW_WITH_SNAPPY) -@@ -800,8 +800,11 @@ endif() - - if(ARROW_MIMALLOC) - add_definitions(-DARROW_MIMALLOC) -- list(APPEND ARROW_LINK_LIBS mimalloc::mimalloc) -- list(APPEND ARROW_STATIC_LINK_LIBS mimalloc::mimalloc) -+ if (TARGET mimalloc-static) -+ list(APPEND ARROW_LINK_LIBS mimalloc-static) -+ else() -+ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc) -+ endif() - endif() - - # ---------------------------------------------------------------------- -diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake -index eb10ebe..9c81017 100644 ---- a/cpp/cmake_modules/BuildUtils.cmake -+++ b/cpp/cmake_modules/BuildUtils.cmake -@@ -165,10 +165,10 @@ function(create_merged_static_lib output_target) - set(ar_script_path ${CMAKE_BINARY_DIR}/${ARG_NAME}.ar) - - file(WRITE ${ar_script_path}.in "CREATE ${output_lib_path}\n") -- file(APPEND ${ar_script_path}.in "ADDLIB $\n") -+ file(APPEND ${ar_script_path}.in "ADDLIB $\n") - - foreach(lib ${ARG_TO_MERGE}) -- file(APPEND ${ar_script_path}.in 
"ADDLIB $\n") -+ file(APPEND ${ar_script_path}.in "ADDLIB $\n") - endforeach() - - file(APPEND ${ar_script_path}.in "SAVE\nEND\n") -diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake -index 807e2b9..016c8db 100644 ---- a/cpp/cmake_modules/ThirdpartyToolchain.cmake -+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake -@@ -154,16 +154,7 @@ macro(build_dependency DEPENDENCY_NAME) - endmacro() - - macro(resolve_dependency DEPENDENCY_NAME) -- if(${DEPENDENCY_NAME}_SOURCE STREQUAL "AUTO") -- find_package(${DEPENDENCY_NAME} MODULE) -- if(NOT ${${DEPENDENCY_NAME}_FOUND}) -- build_dependency(${DEPENDENCY_NAME}) -- endif() -- elseif(${DEPENDENCY_NAME}_SOURCE STREQUAL "BUNDLED") -- build_dependency(${DEPENDENCY_NAME}) -- elseif(${DEPENDENCY_NAME}_SOURCE STREQUAL "SYSTEM") -- find_package(${DEPENDENCY_NAME} REQUIRED) -- endif() -+ find_package(${DEPENDENCY_NAME} REQUIRED) - endmacro() - - macro(resolve_dependency_with_version DEPENDENCY_NAME REQUIRED_VERSION) -@@ -765,6 +756,7 @@ endif() - # - Tests need Boost at runtime. - # - S3FS and Flight benchmarks need Boost at runtime. 
- if(ARROW_BUILD_INTEGRATION -+ OR ARROW_BOOST_REQUIRED - OR ARROW_BUILD_TESTS - OR ARROW_GANDIVA - OR (ARROW_FLIGHT AND ARROW_BUILD_BENCHMARKS) -@@ -785,7 +777,7 @@ if(ARROW_BOOST_REQUIRED) - elseif(BOOST_SOURCE STREQUAL "BUNDLED") - build_boost() - elseif(BOOST_SOURCE STREQUAL "SYSTEM") -- find_package(BoostAlt ${ARROW_BOOST_REQUIRED_VERSION} REQUIRED) -+ find_package(Boost ${ARROW_BOOST_REQUIRED_VERSION} REQUIRED) - endif() - - if(TARGET Boost::system) -@@ -936,11 +928,11 @@ macro(build_brotli) - endmacro() - - if(ARROW_WITH_BROTLI) -- resolve_dependency(Brotli) -+ resolve_dependency(brotli) - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(BROTLI_INCLUDE_DIR Brotli::brotlicommon -+ get_target_property(BROTLI_INCLUDE_DIR brotli::brotlicommon - INTERFACE_INCLUDE_DIRECTORIES) -- include_directories(SYSTEM ${BROTLI_INCLUDE_DIR}) -+ include_directories(SYSTEM ${brotli_INCLUDE_DIR}) - endif() - - if(PARQUET_REQUIRE_ENCRYPTION AND NOT ARROW_PARQUET) -@@ -1146,9 +1138,10 @@ if(ARROW_NEED_GFLAGS) - endif() - endif() - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${GFLAGS_INCLUDE_DIR}) -+ include_directories(SYSTEM ${gflags_INCLUDE_DIR}) -+ set(GFLAGS_LIBRARIES ${gflags_LIBRARIES}) - -- if(NOT TARGET ${GFLAGS_LIBRARIES}) -+ if(0) - if(TARGET gflags-shared) - set(GFLAGS_LIBRARIES gflags-shared) - elseif(TARGET gflags_shared) -@@ -1237,12 +1230,13 @@ endmacro() - if(ARROW_WITH_THRIFT) - # We already may have looked for Thrift earlier, when considering whether - # to build Boost, so don't look again if already found. 
-- if(NOT Thrift_FOUND AND NOT THRIFT_FOUND) -+ if(0) - # Thrift c++ code generated by 0.13 requires 0.11 or greater - resolve_dependency_with_version(Thrift 0.11.0) - endif() -+ find_package(Thrift CONFIG REQUIRED) - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${THRIFT_INCLUDE_DIR}) -+ include_directories(SYSTEM ${Thrift_INCLUDE_DIR}) - endif() - - # ---------------------------------------------------------------------- -@@ -1407,6 +1401,7 @@ endif() - # jemalloc - Unix-only high-performance allocator - - if(ARROW_JEMALLOC) -+if(0) - message(STATUS "Building (vendored) jemalloc from source") - # We only use a vendored jemalloc as we want to control its version. - # Also our build of jemalloc is specially prefixed so that it will not -@@ -1465,12 +1460,18 @@ if(ARROW_JEMALLOC) - add_dependencies(jemalloc::jemalloc jemalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) -+else() -+ find_package(jemalloc REQUIRED CONFIG) -+ include_directories(SYSTEM "${jemalloc_INCLUDE_DIR}") -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${jemalloc_LIBRARIES_TARGETS} ) -+endif() - endif() - - # ---------------------------------------------------------------------- - # mimalloc - Cross-platform high-performance allocator, from Microsoft - - if(ARROW_MIMALLOC) -+if(0) - message(STATUS "Building (vendored) mimalloc from source") - # We only use a vendored mimalloc as we want to control its build options. 
- -@@ -1518,6 +1519,11 @@ if(ARROW_MIMALLOC) - add_dependencies(toolchain mimalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS mimalloc::mimalloc) -+else() -+ find_package(mimalloc REQUIRED CONFIG) -+ include_directories(SYSTEM "${mimalloc_INCLUDE_DIR}") -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${mimalloc_LIBRARIES_TARGETS} ) -+endif() - endif() - - # ---------------------------------------------------------------------- -@@ -1918,11 +1924,16 @@ macro(build_lz4) - endmacro() - - if(ARROW_WITH_LZ4) -- resolve_dependency(Lz4) -+ resolve_dependency(lz4) - - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) -- include_directories(SYSTEM ${LZ4_INCLUDE_DIR}) -+ if(TARGET LZ4::lz4_static) -+ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_static INTERFACE_INCLUDE_DIRECTORIES) -+ else() -+ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_shared INTERFACE_INCLUDE_DIRECTORIES) -+ endif() -+ include_directories(SYSTEM ${lz4_INCLUDE_DIR}) -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${lz4_LIBRARIES_TARGETS} ) - endif() - - macro(build_zstd) -@@ -2037,10 +2048,10 @@ macro(build_re2) - endmacro() - - if(ARROW_GANDIVA) -- resolve_dependency(RE2) -+ resolve_dependency(re2) - - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(RE2_INCLUDE_DIR RE2::re2 INTERFACE_INCLUDE_DIRECTORIES) -+ get_target_property(RE2_INCLUDE_DIR re2::re2 INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${RE2_INCLUDE_DIR}) - endif() - -@@ -2480,17 +2491,24 @@ if(ARROW_WITH_GRPC) - endif() - - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(GRPC_INCLUDE_DIR gRPC::grpc INTERFACE_INCLUDE_DIRECTORIES) -+ # get_target_property(GRPC_INCLUDE_DIR gRPC::grpc INTERFACE_INCLUDE_DIRECTORIES) -+ if(grpc_INCLUDE_DIRS_RELEASE) -+ set(GRPC_INCLUDE_DIR ${grpc_INCLUDE_DIRS_RELEASE}) -+ elseif(grpc_INCLUDE_DIRS_DEBUG) -+ 
set(GRPC_INCLUDE_DIR ${grpc_INCLUDE_DIRS_DEBUG}) -+ endif() - include_directories(SYSTEM ${GRPC_INCLUDE_DIR}) -+ include_directories(SYSTEM ${absl_INCLUDE_DIR}) -+ include_directories(SYSTEM ${protobuf_INCLUDE_DIR}) - - if(GRPC_VENDORED) - set(GRPCPP_PP_INCLUDE TRUE) - else() - # grpc++ headers may reside in ${GRPC_INCLUDE_DIR}/grpc++ or ${GRPC_INCLUDE_DIR}/grpcpp - # depending on the gRPC version. -- if(EXISTS "${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h") -+ if(EXISTS ${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h) - set(GRPCPP_PP_INCLUDE TRUE) -- elseif(EXISTS "${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h") -+ elseif(EXISTS ${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h) - set(GRPCPP_PP_INCLUDE FALSE) - else() - message(FATAL_ERROR "Cannot find grpc++ headers in ${GRPC_INCLUDE_DIR}") -diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt -index 5797a78..da6bd4d 100644 ---- a/cpp/src/arrow/CMakeLists.txt -+++ b/cpp/src/arrow/CMakeLists.txt -@@ -292,10 +292,15 @@ set(ARROW_TESTING_SRCS - - set(_allocator_dependencies "") # Empty list - if(ARROW_JEMALLOC) -- list(APPEND _allocator_dependencies jemalloc_ep) -+ list(APPEND _allocator_dependencies jemalloc::jemalloc) - endif() -+ - if(ARROW_MIMALLOC) -- list(APPEND _allocator_dependencies mimalloc_ep) -+ if (TARGET mimalloc-static) -+ list(APPEND _allocator_dependencies mimalloc-static) -+ else() -+ list(APPEND _allocator_dependencies mimalloc) -+ endif() - endif() - - if(_allocator_dependencies) -diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc -index 784bf7b..8f005a5 100644 ---- a/cpp/src/arrow/memory_pool.cc -+++ b/cpp/src/arrow/memory_pool.cc -@@ -31,7 +31,7 @@ - // Needed to support jemalloc 3 and 4 - #define JEMALLOC_MANGLE - // Explicitly link to our version of jemalloc --#include "jemalloc_ep/dist/include/jemalloc/jemalloc.h" -+#include "jemalloc/jemalloc.h" - #endif - - #ifdef ARROW_MIMALLOC -diff --git 
a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt -index 85e8db6..cd70c63 100644 ---- a/cpp/src/gandiva/CMakeLists.txt -+++ b/cpp/src/gandiva/CMakeLists.txt -@@ -25,7 +25,7 @@ add_custom_target(gandiva-benchmarks) - - add_dependencies(gandiva-all gandiva gandiva-tests gandiva-benchmarks) - --find_package(LLVMAlt REQUIRED) -+find_package(LLVM REQUIRED) - - if(LLVM_VERSION_MAJOR LESS "10") - set(GANDIVA_CXX_STANDARD ${CMAKE_CXX_STANDARD}) -@@ -88,9 +88,16 @@ set(SRC_FILES - random_generator_holder.cc - ${GANDIVA_PRECOMPILED_CC_PATH}) - --set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared LLVM::LLVM_INTERFACE RE2::re2) - --set(GANDIVA_STATIC_LINK_LIBS arrow_static LLVM::LLVM_INTERFACE RE2::re2) -+ function(get_all_targets var) -+ set(targets) -+ get_all_targets_recursive(targets ${CMAKE_CURRENT_SOURCE_DIR}) -+ set(${var} ${targets} PARENT_SCOPE) -+endfunction() -+ -+set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared llvm-core::llvm-core re2::re2) -+ -+set(GANDIVA_STATIC_LINK_LIBS arrow_static llvm-core::llvm-core re2::re2) - - if(ARROW_GANDIVA_STATIC_LIBSTDCPP - AND (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX)) -@@ -131,7 +138,7 @@ add_arrow_lib(gandiva - arrow_dependencies - precompiled - EXTRA_INCLUDES -- $ -+ $ - SHARED_LINK_FLAGS - ${GANDIVA_SHARED_LINK_FLAGS} - SHARED_LINK_LIBS -@@ -203,7 +210,7 @@ endfunction() - - set(GANDIVA_INTERNALS_TEST_ARGUMENTS) - if(WIN32) -- list(APPEND GANDIVA_INTERNALS_TEST_ARGUMENTS EXTRA_LINK_LIBS LLVM::LLVM_INTERFACE) -+ list(APPEND GANDIVA_INTERNALS_TEST_ARGUMENTS EXTRA_LINK_LIBS llvm-core::llvm-core) - endif() - add_gandiva_test(internals-test - SOURCES -@@ -225,9 +232,9 @@ add_gandiva_test(internals-test - decimal_type_util_test.cc - random_generator_holder_test.cc - EXTRA_DEPENDENCIES -- LLVM::LLVM_INTERFACE -+ llvm-core::llvm-core - EXTRA_INCLUDES -- $ -+ $ - ${GANDIVA_INTERNALS_TEST_ARGUMENTS}) - - if(ARROW_GANDIVA_JAVA) diff --git a/ci/conan/all/patches/11.0.0-0001-fix-cmake.patch 
b/ci/conan/all/patches/11.0.0-0001-fix-cmake.patch new file mode 100644 index 0000000000000..37f36f99a0c33 --- /dev/null +++ b/ci/conan/all/patches/11.0.0-0001-fix-cmake.patch @@ -0,0 +1,64 @@ +MIT License + +Copyright (c) 2019 Conan.io + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +diff --git a/cpp/cmake_modules/FindThriftAlt.cmake b/cpp/cmake_modules/FindThriftAlt.cmake +index f3e49021d..95177c2a6 100644 +--- a/cpp/cmake_modules/FindThriftAlt.cmake ++++ b/cpp/cmake_modules/FindThriftAlt.cmake +@@ -45,22 +45,21 @@ endif() + # * https://github.com/apache/thrift/pull/2725 + # * https://github.com/apache/thrift/pull/2726 + # * https://github.com/conda-forge/thrift-cpp-feedstock/issues/68 +-if(NOT WIN32) +- set(find_package_args "") +- if(ThriftAlt_FIND_VERSION) +- list(APPEND find_package_args ${ThriftAlt_FIND_VERSION}) +- endif() +- if(ThriftAlt_FIND_QUIETLY) +- list(APPEND find_package_args QUIET) +- endif() +- find_package(Thrift ${find_package_args}) +- if(Thrift_FOUND) +- set(ThriftAlt_FOUND TRUE) +- add_executable(thrift::compiler IMPORTED) +- set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION +- "${THRIFT_COMPILER}") +- return() +- endif() ++ ++set(find_package_args "") ++if(ThriftAlt_FIND_VERSION) ++ list(APPEND find_package_args ${ThriftAlt_FIND_VERSION}) ++endif() ++if(ThriftAlt_FIND_QUIETLY) ++ list(APPEND find_package_args QUIET) ++endif() ++find_package(Thrift ${find_package_args}) ++if(Thrift_FOUND) ++ set(ThriftAlt_FOUND TRUE) ++ add_executable(thrift::compiler IMPORTED) ++ set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION ++ "${THRIFT_COMPILER}") ++ return() + endif() + + function(extract_thrift_version) diff --git a/ci/conan/all/patches/16.0.0-0001-fix-cmake.patch b/ci/conan/all/patches/16.0.0-0001-fix-cmake.patch new file mode 100644 index 0000000000000..6077237139d49 --- /dev/null +++ b/ci/conan/all/patches/16.0.0-0001-fix-cmake.patch @@ -0,0 +1,84 @@ +MIT License + +Copyright (c) 2019 Conan.io + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, 
and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +diff --git a/cpp/cmake_modules/FindThriftAlt.cmake b/cpp/cmake_modules/FindThriftAlt.cmake +index f3e49021d..3e63f1edf 100644 +--- a/cpp/cmake_modules/FindThriftAlt.cmake ++++ b/cpp/cmake_modules/FindThriftAlt.cmake +@@ -45,23 +45,23 @@ endif() + # * https://github.com/apache/thrift/pull/2725 + # * https://github.com/apache/thrift/pull/2726 + # * https://github.com/conda-forge/thrift-cpp-feedstock/issues/68 +-if(NOT WIN32) +- set(find_package_args "") +- if(ThriftAlt_FIND_VERSION) +- list(APPEND find_package_args ${ThriftAlt_FIND_VERSION}) +- endif() +- if(ThriftAlt_FIND_QUIETLY) +- list(APPEND find_package_args QUIET) +- endif() +- find_package(Thrift ${find_package_args}) +- if(Thrift_FOUND) +- set(ThriftAlt_FOUND TRUE) +- add_executable(thrift::compiler IMPORTED) +- set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION +- "${THRIFT_COMPILER}") +- return() +- endif() ++ ++set(find_package_args "") ++if(ThriftAlt_FIND_VERSION) ++ list(APPEND find_package_args ${ThriftAlt_FIND_VERSION}) ++endif() ++if(ThriftAlt_FIND_QUIETLY) ++ list(APPEND find_package_args QUIET) + endif() ++find_package(Thrift ${find_package_args}) ++if(Thrift_FOUND) ++ set(ThriftAlt_FOUND TRUE) ++ add_executable(thrift::compiler IMPORTED) ++ 
set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION ++ "${THRIFT_COMPILER}") ++ return() ++endif() ++ + + function(extract_thrift_version) + if(ThriftAlt_INCLUDE_DIR) +diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt +index 93f2e72d8..e00f73f7d 100644 +--- a/cpp/src/parquet/CMakeLists.txt ++++ b/cpp/src/parquet/CMakeLists.txt +@@ -262,11 +262,11 @@ if(NOT PARQUET_MINIMAL_DEPENDENCY) + + # These are libraries that we will link privately with parquet_shared (as they + # do not need to be linked transitively by other linkers) +- list(APPEND PARQUET_SHARED_PRIVATE_LINK_LIBS thrift::thrift) ++ list(APPEND PARQUET_SHARED_PRIVATE_LINK_LIBS Boost::headers thrift::thrift) + + # Link publicly with parquet_static (because internal users need to + # transitively link all dependencies) +- list(APPEND PARQUET_STATIC_LINK_LIBS thrift::thrift) ++ list(APPEND PARQUET_STATIC_LINK_LIBS Boost::headers thrift::thrift) + if(NOT THRIFT_VENDORED) + list(APPEND PARQUET_STATIC_INSTALL_INTERFACE_LIBS thrift::thrift) + endif() diff --git a/ci/conan/all/patches/18.0.0-0001-fix-cmake.patch b/ci/conan/all/patches/18.0.0-0001-fix-cmake.patch new file mode 100644 index 0000000000000..9abff332e4b6d --- /dev/null +++ b/ci/conan/all/patches/18.0.0-0001-fix-cmake.patch @@ -0,0 +1,81 @@ +MIT License + +Copyright (c) 2019 Conan.io + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +diff --git a/cpp/cmake_modules/FindThriftAlt.cmake b/cpp/cmake_modules/FindThriftAlt.cmake +index 98a706d..edf195e 100644 +--- a/cpp/cmake_modules/FindThriftAlt.cmake ++++ b/cpp/cmake_modules/FindThriftAlt.cmake +@@ -45,22 +45,20 @@ endif() + # * https://github.com/apache/thrift/pull/2725 + # * https://github.com/apache/thrift/pull/2726 + # * https://github.com/conda-forge/thrift-cpp-feedstock/issues/68 +-if(NOT WIN32) +- set(find_package_args "") +- if(ThriftAlt_FIND_VERSION) +- list(APPEND find_package_args ${ThriftAlt_FIND_VERSION}) +- endif() +- if(ThriftAlt_FIND_QUIETLY) +- list(APPEND find_package_args QUIET) +- endif() +- find_package(Thrift ${find_package_args}) +- if(Thrift_FOUND) +- set(ThriftAlt_FOUND TRUE) +- add_executable(thrift::compiler IMPORTED) +- set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION +- "${THRIFT_COMPILER}") +- return() +- endif() ++set(find_package_args "") ++if(ThriftAlt_FIND_VERSION) ++ list(APPEND find_package_args ${ThriftAlt_FIND_VERSION}) ++endif() ++if(ThriftAlt_FIND_QUIETLY) ++ list(APPEND find_package_args QUIET) ++endif() ++find_package(Thrift ${find_package_args}) ++if(Thrift_FOUND) ++ set(ThriftAlt_FOUND TRUE) ++ add_executable(thrift::compiler IMPORTED) ++ set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION ++ "${THRIFT_COMPILER}") ++ return() + endif() + + function(extract_thrift_version) +diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt +index b984ef7..429fc6d 100644 +--- 
a/cpp/src/parquet/CMakeLists.txt ++++ b/cpp/src/parquet/CMakeLists.txt +@@ -263,11 +263,11 @@ if(NOT PARQUET_MINIMAL_DEPENDENCY) + + # These are libraries that we will link privately with parquet_shared (as they + # do not need to be linked transitively by other linkers) +- list(APPEND PARQUET_SHARED_PRIVATE_LINK_LIBS thrift::thrift) ++ list(APPEND PARQUET_SHARED_PRIVATE_LINK_LIBS Boost::headers thrift::thrift) + + # Link publicly with parquet_static (because internal users need to + # transitively link all dependencies) +- list(APPEND PARQUET_STATIC_LINK_LIBS thrift::thrift) ++ list(APPEND PARQUET_STATIC_LINK_LIBS Boost::headers thrift::thrift) + if(NOT THRIFT_VENDORED) + list(APPEND PARQUET_STATIC_INSTALL_INTERFACE_LIBS thrift::thrift) + endif() diff --git a/ci/conan/all/patches/2.0.0-0003-fix-shared-msvc.patch b/ci/conan/all/patches/2.0.0-0003-fix-shared-msvc.patch deleted file mode 100644 index 3583e5c221707..0000000000000 --- a/ci/conan/all/patches/2.0.0-0003-fix-shared-msvc.patch +++ /dev/null @@ -1,35 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - ---- cpp/src/arrow/CMakeLists.txt -+++ cpp/src/arrow/CMakeLists.txt -@@ -504,6 +504,10 @@ - target_compile_definitions(arrow_static PUBLIC ARROW_STATIC) - endif() - -+if(ARROW_BUILD_SHARED AND WIN32) -+target_compile_definitions(arrow_shared PRIVATE ARROW_EXPORTING) -+endif() -+ - if(ARROW_WITH_BACKTRACE) - find_package(Backtrace) - diff --git a/ci/conan/all/patches/2.0.0-0005-gandiva-engine.patch b/ci/conan/all/patches/2.0.0-0005-gandiva-engine.patch deleted file mode 100644 index 6dc0c7947a5e0..0000000000000 --- a/ci/conan/all/patches/2.0.0-0005-gandiva-engine.patch +++ /dev/null @@ -1,35 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- ---- cpp/src/gandiva/engine.cc -+++ cpp/src/gandiva/engine.cc -@@ -64,6 +64,10 @@ - #include - #include - -+#if GANDIVA_LLVM_VERSION >= 11 -+#include -+#endif -+ - #if defined(_MSC_VER) - #pragma warning(pop) - #endif diff --git a/ci/conan/all/patches/2.0.0-0008-fix-cmake.patch b/ci/conan/all/patches/2.0.0-0008-fix-cmake.patch deleted file mode 100644 index abdcf7a0fa36a..0000000000000 --- a/ci/conan/all/patches/2.0.0-0008-fix-cmake.patch +++ /dev/null @@ -1,295 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt -index 515e6af..7488161 100644 ---- a/cpp/CMakeLists.txt -+++ b/cpp/CMakeLists.txt -@@ -109,7 +109,7 @@ set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support") - set(ARROW_CMAKE_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}") - set(ARROW_DOC_DIR "share/doc/${PROJECT_NAME}") - --set(ARROW_LLVM_VERSIONS "10" "9" "8" "7") -+set(ARROW_LLVM_VERSIONS "13" "12" "11" "10" "9" "8" "7") - list(GET ARROW_LLVM_VERSIONS 0 ARROW_LLVM_VERSION_PRIMARY) - string(REGEX - REPLACE "^([0-9]+)(\\..+)?" "\\1" ARROW_LLVM_VERSION_PRIMARY_MAJOR -@@ -667,7 +667,7 @@ endif() - - if(ARROW_WITH_BROTLI) - # Order is important for static linking -- set(ARROW_BROTLI_LIBS Brotli::brotlienc Brotli::brotlidec Brotli::brotlicommon) -+ set(ARROW_BROTLI_LIBS brotli::brotlienc brotli::brotlidec brotli::brotlicommon) - list(APPEND ARROW_LINK_LIBS ${ARROW_BROTLI_LIBS}) - list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_BROTLI_LIBS}) - if(Brotli_SOURCE STREQUAL "SYSTEM") -@@ -683,9 +683,9 @@ if(ARROW_WITH_BZ2) - endif() - - if(ARROW_WITH_LZ4) -- list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4) -+ list(APPEND ARROW_STATIC_LINK_LIBS lz4::lz4) - if(Lz4_SOURCE STREQUAL "SYSTEM") -- list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4) -+ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS lz4::lz4) - endif() - endif() - -@@ -842,8 +842,14 @@ endif() - - if(ARROW_MIMALLOC) - add_definitions(-DARROW_MIMALLOC) -- list(APPEND ARROW_LINK_LIBS mimalloc::mimalloc) -- list(APPEND ARROW_STATIC_LINK_LIBS mimalloc::mimalloc) -+ if (TARGET mimalloc-static) -+ list(APPEND ARROW_LINK_LIBS mimalloc-static) -+ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc-static) -+ else() -+ list(APPEND ARROW_LINK_LIBS mimalloc) -+ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc) -+ endif() -+ - endif() - - # ---------------------------------------------------------------------- -diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake -index 
cc37a3c..8fe6db9 100644 ---- a/cpp/cmake_modules/ThirdpartyToolchain.cmake -+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake -@@ -171,6 +171,7 @@ macro(provide_find_module DEPENDENCY_NAME) - endmacro() - - macro(resolve_dependency DEPENDENCY_NAME) -+if(0) - set(options) - set(one_value_args REQUIRED_VERSION) - cmake_parse_arguments(ARG -@@ -207,6 +208,14 @@ macro(resolve_dependency DEPENDENCY_NAME) - provide_find_module(${DEPENDENCY_NAME}) - list(APPEND ARROW_SYSTEM_DEPENDENCIES ${DEPENDENCY_NAME}) - endif() -+else() -+ if(ARG_REQUIRED_VERSION) -+ find_package(${DEPENDENCY_NAME} ${ARG_REQUIRED_VERSION} REQUIRED) -+ else() -+ find_package(${DEPENDENCY_NAME} REQUIRED) -+ endif() -+ list(APPEND ARROW_SYSTEM_DEPENDENCIES ${DEPENDENCY_NAME}) -+endif() - endmacro() - - # ---------------------------------------------------------------------- -@@ -826,6 +835,7 @@ endif() - # - Tests need Boost at runtime. - # - S3FS and Flight benchmarks need Boost at runtime. - if(ARROW_BUILD_INTEGRATION -+ OR ARROW_BOOST_REQUIRED - OR ARROW_BUILD_TESTS - OR ARROW_GANDIVA - OR (ARROW_FLIGHT AND ARROW_BUILD_BENCHMARKS) -@@ -846,7 +856,7 @@ if(ARROW_BOOST_REQUIRED) - elseif(BOOST_SOURCE STREQUAL "BUNDLED") - build_boost() - elseif(BOOST_SOURCE STREQUAL "SYSTEM") -- find_package(BoostAlt ${ARROW_BOOST_REQUIRED_VERSION} REQUIRED) -+ find_package(Boost ${ARROW_BOOST_REQUIRED_VERSION} REQUIRED) - endif() - - if(TARGET Boost::system) -@@ -973,11 +983,11 @@ macro(build_brotli) - endmacro() - - if(ARROW_WITH_BROTLI) -- resolve_dependency(Brotli) -+ resolve_dependency(brotli) - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(BROTLI_INCLUDE_DIR Brotli::brotlicommon -+ get_target_property(BROTLI_INCLUDE_DIR brotli::brotlicommon - INTERFACE_INCLUDE_DIRECTORIES) -- include_directories(SYSTEM ${BROTLI_INCLUDE_DIR}) -+ include_directories(SYSTEM ${brotli_INCLUDE_DIR}) - endif() - - if(PARQUET_REQUIRE_ENCRYPTION AND NOT ARROW_PARQUET) -@@ -1200,9 +1210,10 @@ 
if(ARROW_NEED_GFLAGS) - endif() - endif() - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${GFLAGS_INCLUDE_DIR}) -+ include_directories(SYSTEM ${gflags_INCLUDE_DIR}) -+ set(GFLAGS_LIBRARIES ${gflags_LIBRARIES}) - -- if(NOT TARGET ${GFLAGS_LIBRARIES}) -+ if(0) - if(TARGET gflags-shared) - set(GFLAGS_LIBRARIES gflags-shared) - elseif(TARGET gflags_shared) -@@ -1291,12 +1302,13 @@ endmacro() - if(ARROW_WITH_THRIFT) - # We already may have looked for Thrift earlier, when considering whether - # to build Boost, so don't look again if already found. -- if(NOT Thrift_FOUND AND NOT THRIFT_FOUND) -+ if(0) - # Thrift c++ code generated by 0.13 requires 0.11 or greater - resolve_dependency(Thrift REQUIRED_VERSION 0.11.0) - endif() -+ find_package(Thrift CONFIG REQUIRED) - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${THRIFT_INCLUDE_DIR}) -+ include_directories(SYSTEM ${Thrift_INCLUDE_DIR}) - endif() - - # ---------------------------------------------------------------------- -@@ -1461,6 +1473,7 @@ endif() - # jemalloc - Unix-only high-performance allocator - - if(ARROW_JEMALLOC) -+if(0) - message(STATUS "Building (vendored) jemalloc from source") - # We only use a vendored jemalloc as we want to control its version. 
- # Also our build of jemalloc is specially prefixed so that it will not -@@ -1519,12 +1532,18 @@ if(ARROW_JEMALLOC) - add_dependencies(jemalloc::jemalloc jemalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) -+else() -+ find_package(jemalloc REQUIRED CONFIG) -+ include_directories(SYSTEM "${jemalloc_INCLUDE_DIR}") -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${jemalloc_LIBRARIES_TARGETS} ) -+endif() - endif() - - # ---------------------------------------------------------------------- - # mimalloc - Cross-platform high-performance allocator, from Microsoft - - if(ARROW_MIMALLOC) -+if(0) - message(STATUS "Building (vendored) mimalloc from source") - # We only use a vendored mimalloc as we want to control its build options. - -@@ -1572,6 +1591,11 @@ if(ARROW_MIMALLOC) - add_dependencies(toolchain mimalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS mimalloc::mimalloc) -+else() -+ find_package(mimalloc REQUIRED CONFIG) -+ include_directories(SYSTEM "${mimalloc_INCLUDE_DIR}") -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${mimalloc_LIBRARIES_TARGETS} ) -+endif() - endif() - - # ---------------------------------------------------------------------- -@@ -1971,11 +1995,16 @@ macro(build_lz4) - endmacro() - - if(ARROW_WITH_LZ4) -- resolve_dependency(Lz4) -+ resolve_dependency(lz4) - - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) -- include_directories(SYSTEM ${LZ4_INCLUDE_DIR}) -+ if(TARGET LZ4::lz4_static) -+ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_static INTERFACE_INCLUDE_DIRECTORIES) -+ else() -+ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_shared INTERFACE_INCLUDE_DIRECTORIES) -+ endif() -+ include_directories(SYSTEM ${lz4_INCLUDE_DIR}) -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${lz4_LIBRARIES_TARGETS} ) - endif() - - macro(build_zstd) -@@ -2090,10 +2119,10 @@ macro(build_re2) - endmacro() - - if(ARROW_GANDIVA) -- resolve_dependency(RE2) -+ 
resolve_dependency(re2) - - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(RE2_INCLUDE_DIR RE2::re2 INTERFACE_INCLUDE_DIRECTORIES) -+ get_target_property(RE2_INCLUDE_DIR re2::re2 INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${RE2_INCLUDE_DIR}) - endif() - -@@ -2541,17 +2570,24 @@ if(ARROW_WITH_GRPC) - endif() - - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(GRPC_INCLUDE_DIR gRPC::grpc INTERFACE_INCLUDE_DIRECTORIES) -+ if(grpc_INCLUDE_DIRS_RELEASE) -+ set(GRPC_INCLUDE_DIR ${grpc_INCLUDE_DIRS_RELEASE}) -+ elseif(grpc_INCLUDE_DIRS_DEBUG) -+ set(GRPC_INCLUDE_DIR ${grpc_INCLUDE_DIRS_DEBUG}) -+ endif() -+ - include_directories(SYSTEM ${GRPC_INCLUDE_DIR}) -+ include_directories(SYSTEM ${absl_INCLUDE_DIR}) -+ include_directories(SYSTEM ${protobuf_INCLUDE_DIR}) - - if(GRPC_VENDORED) - set(GRPCPP_PP_INCLUDE TRUE) - else() - # grpc++ headers may reside in ${GRPC_INCLUDE_DIR}/grpc++ or ${GRPC_INCLUDE_DIR}/grpcpp - # depending on the gRPC version. 
-- if(EXISTS "${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h") -+ if(EXISTS ${gRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h) - set(GRPCPP_PP_INCLUDE TRUE) -- elseif(EXISTS "${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h") -+ elseif(EXISTS ${gRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h) - set(GRPCPP_PP_INCLUDE FALSE) - else() - message(FATAL_ERROR "Cannot find grpc++ headers in ${GRPC_INCLUDE_DIR}") -diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt -index 2751254..842fc9e 100644 ---- a/cpp/src/arrow/CMakeLists.txt -+++ b/cpp/src/arrow/CMakeLists.txt -@@ -307,10 +307,14 @@ set(ARROW_TESTING_SRCS - - set(_allocator_dependencies "") # Empty list - if(ARROW_JEMALLOC) -- list(APPEND _allocator_dependencies jemalloc_ep) -+ list(APPEND _allocator_dependencies jemalloc::jemalloc) - endif() - if(ARROW_MIMALLOC) -- list(APPEND _allocator_dependencies mimalloc_ep) -+ if (TARGET mimalloc-static) -+ list(APPEND _allocator_dependencies mimalloc-static) -+ else() -+ list(APPEND _allocator_dependencies mimalloc) -+ endif() - endif() - - if(_allocator_dependencies) -diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc -index 784bf7b..8f005a5 100644 ---- a/cpp/src/arrow/memory_pool.cc -+++ b/cpp/src/arrow/memory_pool.cc -@@ -31,7 +31,7 @@ - // Needed to support jemalloc 3 and 4 - #define JEMALLOC_MANGLE - // Explicitly link to our version of jemalloc --#include "jemalloc_ep/dist/include/jemalloc/jemalloc.h" -+#include "jemalloc/jemalloc.h" - #endif - - #ifdef ARROW_MIMALLOC diff --git a/ci/conan/all/patches/7.0.0-0006-install-utils.patch b/ci/conan/all/patches/7.0.0-0006-install-utils.patch deleted file mode 100644 index 7674174c8e254..0000000000000 --- a/ci/conan/all/patches/7.0.0-0006-install-utils.patch +++ /dev/null @@ -1,39 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated 
documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt -index 495018e..f6cee6f 100644 ---- a/cpp/src/arrow/ipc/CMakeLists.txt -+++ b/cpp/src/arrow/ipc/CMakeLists.txt -@@ -61,8 +61,12 @@ endif() - if(ARROW_BUILD_UTILITIES OR ARROW_BUILD_INTEGRATION) - add_executable(arrow-file-to-stream file_to_stream.cc) - target_link_libraries(arrow-file-to-stream ${ARROW_UTIL_LIB}) -+ install(TARGETS arrow-file-to-stream ${INSTALL_IS_OPTIONAL} -+ DESTINATION ${CMAKE_INSTALL_BINDIR}) - add_executable(arrow-stream-to-file stream_to_file.cc) - target_link_libraries(arrow-stream-to-file ${ARROW_UTIL_LIB}) -+ install(TARGETS arrow-stream-to-file ${INSTALL_IS_OPTIONAL} -+ DESTINATION ${CMAKE_INSTALL_BINDIR}) - - if(ARROW_BUILD_INTEGRATION) - add_dependencies(arrow-integration arrow-file-to-stream) diff --git a/ci/conan/all/patches/7.0.0-0007-fix-cmake.patch b/ci/conan/all/patches/7.0.0-0007-fix-cmake.patch deleted file mode 100644 index eb2acb1523fc3..0000000000000 --- a/ci/conan/all/patches/7.0.0-0007-fix-cmake.patch +++ /dev/null @@ -1,369 +0,0 
@@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt -index 2d7baf1..dff5b1a 100644 ---- a/cpp/CMakeLists.txt -+++ b/cpp/CMakeLists.txt -@@ -699,7 +699,7 @@ endif() - - if(ARROW_WITH_BROTLI) - # Order is important for static linking -- set(ARROW_BROTLI_LIBS Brotli::brotlienc Brotli::brotlidec Brotli::brotlicommon) -+ set(ARROW_BROTLI_LIBS brotli::brotlienc brotli::brotlidec brotli::brotlicommon) - list(APPEND ARROW_LINK_LIBS ${ARROW_BROTLI_LIBS}) - list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_BROTLI_LIBS}) - if(Brotli_SOURCE STREQUAL "SYSTEM") -@@ -715,10 +715,17 @@ if(ARROW_WITH_BZ2) - endif() - - if(ARROW_WITH_LZ4) -- list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4) -- if(Lz4_SOURCE STREQUAL "SYSTEM") -- list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4) -- endif() -+ if (TARGET LZ4::lz4_static) -+ list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4_static) -+ if(Lz4_SOURCE STREQUAL "SYSTEM") -+ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4_static) -+ endif() -+ else() -+ list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4_shared) -+ if(Lz4_SOURCE STREQUAL "SYSTEM") -+ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4_shared) -+ endif() -+endif() - endif() - - if(ARROW_WITH_SNAPPY) -@@ -907,8 +914,13 @@ endif() - - if(ARROW_MIMALLOC) - add_definitions(-DARROW_MIMALLOC) -- list(APPEND ARROW_LINK_LIBS mimalloc::mimalloc) -- list(APPEND ARROW_STATIC_LINK_LIBS mimalloc::mimalloc) -+ if (TARGET mimalloc-static) -+ list(APPEND ARROW_LINK_LIBS mimalloc-static) -+ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc-static) -+ else() -+ list(APPEND ARROW_LINK_LIBS mimalloc) -+ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc) -+ endif() - endif() - - # ---------------------------------------------------------------------- -diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake -index bc38952..62bf314 100644 ---- a/cpp/cmake_modules/ThirdpartyToolchain.cmake -+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake -@@ -954,7 +954,7 @@ 
endif() - - if(ARROW_BOOST_REQUIRED) - resolve_dependency(Boost -- HAVE_ALT -+ USE_CONFIG - TRUE - REQUIRED_VERSION - ${ARROW_BOOST_REQUIRED_VERSION} -@@ -965,7 +965,7 @@ if(ARROW_BOOST_REQUIRED) - if(TARGET Boost::system) - set(BOOST_SYSTEM_LIBRARY Boost::system) - set(BOOST_FILESYSTEM_LIBRARY Boost::filesystem) -- elseif(BoostAlt_FOUND) -+ elseif(Boost_FOUND) - set(BOOST_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY}) - set(BOOST_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY}) - else() -@@ -1108,9 +1108,9 @@ macro(build_brotli) - endmacro() - - if(ARROW_WITH_BROTLI) -- resolve_dependency(Brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) -+ resolve_dependency(brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(BROTLI_INCLUDE_DIR Brotli::brotlicommon -+ get_target_property(BROTLI_INCLUDE_DIR brotli::brotlicommon - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${BROTLI_INCLUDE_DIR}) - endif() -@@ -1302,22 +1302,17 @@ endmacro() - if(ARROW_NEED_GFLAGS) - set(ARROW_GFLAGS_REQUIRED_VERSION "2.1.0") - resolve_dependency(gflags -- HAVE_ALT -+ USE_CONFIG - TRUE - REQUIRED_VERSION - ${ARROW_GFLAGS_REQUIRED_VERSION} - IS_RUNTIME_DEPENDENCY - FALSE) - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${GFLAGS_INCLUDE_DIR}) -+ include_directories(SYSTEM ${gflags_INCLUDE_DIR}) - -- if(NOT TARGET ${GFLAGS_LIBRARIES}) -- if(TARGET gflags-shared) -- set(GFLAGS_LIBRARIES gflags-shared) -- elseif(TARGET gflags_shared) -- set(GFLAGS_LIBRARIES gflags_shared) -- endif() -- endif() -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${gflags_LIBRARIES_TARGETS}) -+ set(GFLAGS_LIBRARIES gflags::gflags) - endif() - - # ---------------------------------------------------------------------- -@@ -1411,9 +1406,9 @@ if(ARROW_WITH_THRIFT) - thrift) - endif() - # TODO: Don't use global includes but rather target_include_directories -- 
include_directories(SYSTEM ${THRIFT_INCLUDE_DIR}) -+ include_directories(SYSTEM ${Thrift_INCLUDE_DIR}) - -- string(REPLACE "." ";" VERSION_LIST ${THRIFT_VERSION}) -+ string(REPLACE "." ";" VERSION_LIST ${Thrift_VERSION}) - list(GET VERSION_LIST 0 THRIFT_VERSION_MAJOR) - list(GET VERSION_LIST 1 THRIFT_VERSION_MINOR) - list(GET VERSION_LIST 2 THRIFT_VERSION_PATCH) -@@ -1528,6 +1523,7 @@ if(ARROW_WITH_PROTOBUF) - set(ARROW_PROTOBUF_REQUIRED_VERSION "2.6.1") - endif() - resolve_dependency(Protobuf -+ USE_CONFIG - REQUIRED_VERSION - ${ARROW_PROTOBUF_REQUIRED_VERSION} - PC_PACKAGE_NAMES -@@ -1538,7 +1534,7 @@ if(ARROW_WITH_PROTOBUF) - endif() - - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${PROTOBUF_INCLUDE_DIR}) -+ include_directories(SYSTEM ${protobuf_INCLUDE_DIR}) - - if(TARGET arrow::protobuf::libprotobuf) - set(ARROW_PROTOBUF_LIBPROTOBUF arrow::protobuf::libprotobuf) -@@ -1547,9 +1543,9 @@ if(ARROW_WITH_PROTOBUF) - if(NOT TARGET protobuf::libprotobuf) - add_library(protobuf::libprotobuf UNKNOWN IMPORTED) - set_target_properties(protobuf::libprotobuf -- PROPERTIES IMPORTED_LOCATION "${PROTOBUF_LIBRARY}" -+ PROPERTIES IMPORTED_LOCATION "${Protobuf_LIBRARY}" - INTERFACE_INCLUDE_DIRECTORIES -- "${PROTOBUF_INCLUDE_DIR}") -+ "${Protobuf_INCLUDE_DIR}") - endif() - set(ARROW_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf) - endif() -@@ -1569,7 +1565,7 @@ if(ARROW_WITH_PROTOBUF) - set_target_properties(protobuf::libprotoc - PROPERTIES IMPORTED_LOCATION "${Protobuf_PROTOC_LIBRARY}" - INTERFACE_INCLUDE_DIRECTORIES -- "${PROTOBUF_INCLUDE_DIR}") -+ "${Protobuf_INCLUDE_DIR}") - endif() - set(ARROW_PROTOBUF_LIBPROTOC protobuf::libprotoc) - endif() -@@ -1600,6 +1596,7 @@ endif() - # jemalloc - Unix-only high-performance allocator - - if(ARROW_JEMALLOC) -+if(0) - message(STATUS "Building (vendored) jemalloc from source") - # We only use a vendored jemalloc as we want to control its version. 
- # Also our build of jemalloc is specially prefixed so that it will not -@@ -1665,12 +1662,18 @@ if(ARROW_JEMALLOC) - add_dependencies(jemalloc::jemalloc jemalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) -+else() -+ find_package(jemalloc REQUIRED CONFIG) -+ include_directories(SYSTEM "${jemalloc_INCLUDE_DIR}") -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${jemalloc_LIBRARIES_TARGETS}) -+endif() - endif() - - # ---------------------------------------------------------------------- - # mimalloc - Cross-platform high-performance allocator, from Microsoft - - if(ARROW_MIMALLOC) -+if(0) - message(STATUS "Building (vendored) mimalloc from source") - # We only use a vendored mimalloc as we want to control its build options. - -@@ -1716,6 +1719,11 @@ if(ARROW_MIMALLOC) - add_dependencies(toolchain mimalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS mimalloc::mimalloc) -+else() -+ find_package(mimalloc REQUIRED CONFIG) -+ include_directories(SYSTEM "${mimalloc_INCLUDE_DIR}") -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${mimalloc_LIBRARIES_TARGETS} ) -+endif() - endif() - - # ---------------------------------------------------------------------- -@@ -2001,7 +2009,7 @@ endmacro() - if(ARROW_WITH_RAPIDJSON) - set(ARROW_RAPIDJSON_REQUIRED_VERSION "1.1.0") - resolve_dependency(RapidJSON -- HAVE_ALT -+ USE_CONFIG - TRUE - REQUIRED_VERSION - ${ARROW_RAPIDJSON_REQUIRED_VERSION} -@@ -2038,10 +2046,9 @@ endmacro() - - if((NOT ARROW_SIMD_LEVEL STREQUAL "NONE") OR (NOT ARROW_RUNTIME_SIMD_LEVEL STREQUAL "NONE" - )) -- set(xsimd_SOURCE "BUNDLED") - resolve_dependency(xsimd) - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${XSIMD_INCLUDE_DIR}) -+ include_directories(SYSTEM ${xsimd_INCLUDE_DIR}) - endif() - - macro(build_zlib) -@@ -2140,10 +2147,14 @@ macro(build_lz4) - endmacro() - - if(ARROW_WITH_LZ4) -- resolve_dependency(Lz4 PC_PACKAGE_NAMES liblz4) -+ resolve_dependency(lz4) - - # TODO: Don't use global 
includes but rather target_include_directories -- get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) -+ if (TARGET LZ4::lz4_static) -+ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_static INTERFACE_INCLUDE_DIRECTORIES) -+ else() -+ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_shared INTERFACE_INCLUDE_DIRECTORIES) -+ endif() - include_directories(SYSTEM ${LZ4_INCLUDE_DIR}) - endif() - -@@ -2274,7 +2285,7 @@ if(ARROW_WITH_RE2) - # Don't specify "PC_PACKAGE_NAMES re2" here because re2.pc may - # include -std=c++11. It's not compatible with C source and C++ - # source not uses C++ 11. -- resolve_dependency(re2 HAVE_ALT TRUE) -+ resolve_dependency(re2 USE_CONFIG TRUE) - if(${re2_SOURCE} STREQUAL "SYSTEM") - get_target_property(RE2_LIB re2::re2 IMPORTED_LOCATION) - string(APPEND ARROW_PC_LIBS_PRIVATE " ${RE2_LIB}") -@@ -2337,7 +2348,7 @@ endmacro() - if(ARROW_WITH_BZ2) - resolve_dependency(BZip2) - if(${BZip2_SOURCE} STREQUAL "SYSTEM") -- string(APPEND ARROW_PC_LIBS_PRIVATE " ${BZIP2_LIBRARIES}") -+ string(APPEND ARROW_PC_LIBS_PRIVATE " ${BZip2_LIBRARIES}") - endif() - - if(NOT TARGET BZip2::BZip2) -@@ -2346,7 +2357,7 @@ if(ARROW_WITH_BZ2) - PROPERTIES IMPORTED_LOCATION "${BZIP2_LIBRARIES}" - INTERFACE_INCLUDE_DIRECTORIES "${BZIP2_INCLUDE_DIR}") - endif() -- include_directories(SYSTEM "${BZIP2_INCLUDE_DIR}") -+ include_directories(SYSTEM "${BZip2_INCLUDE_DIR}") - endif() - - macro(build_utf8proc) -@@ -3555,7 +3566,7 @@ if(ARROW_WITH_GRPC) - set(gRPC_SOURCE "${Protobuf_SOURCE}") - endif() - resolve_dependency(gRPC -- HAVE_ALT -+ USE_CONFIG - TRUE - REQUIRED_VERSION - ${ARROW_GRPC_REQUIRED_VERSION} -@@ -3573,9 +3584,9 @@ if(ARROW_WITH_GRPC) - else() - # grpc++ headers may reside in ${GRPC_INCLUDE_DIR}/grpc++ or ${GRPC_INCLUDE_DIR}/grpcpp - # depending on the gRPC version. 
-- if(EXISTS "${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h") -+ if(EXISTS ${gRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h) - set(GRPCPP_PP_INCLUDE TRUE) -- elseif(EXISTS "${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h") -+ elseif(EXISTS ${gPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h) - set(GRPCPP_PP_INCLUDE FALSE) - else() - message(FATAL_ERROR "Cannot find grpc++ headers in ${GRPC_INCLUDE_DIR}") -@@ -4097,9 +4108,9 @@ macro(build_opentelemetry) - endmacro() - - if(ARROW_WITH_OPENTELEMETRY) -- set(opentelemetry-cpp_SOURCE "AUTO") -+ set(opentelemetry-cpp_SOURCE "SYSTEM") - resolve_dependency(opentelemetry-cpp) -- get_target_property(OPENTELEMETRY_INCLUDE_DIR opentelemetry-cpp::api -+ get_target_property(OPENTELEMETRY_INCLUDE_DIR opentelemetry-cpp::opentelemetry_common - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${OPENTELEMETRY_INCLUDE_DIR}) - message(STATUS "Found OpenTelemetry headers: ${OPENTELEMETRY_INCLUDE_DIR}") -diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt -index b984bc1..2c78cd9 100644 ---- a/cpp/src/arrow/CMakeLists.txt -+++ b/cpp/src/arrow/CMakeLists.txt -@@ -323,10 +323,14 @@ set(ARROW_TESTING_SRCS - - set(_allocator_dependencies "") # Empty list - if(ARROW_JEMALLOC) -- list(APPEND _allocator_dependencies jemalloc_ep) -+ list(APPEND _allocator_dependencies jemalloc::jemalloc) - endif() - if(ARROW_MIMALLOC) -- list(APPEND _allocator_dependencies mimalloc_ep) -+ if (TARGET mimalloc-static) -+ list(APPEND _allocator_dependencies mimalloc-static) -+ else() -+ list(APPEND _allocator_dependencies mimalloc) -+ endif() - endif() - - if(_allocator_dependencies) -diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt -index 2cf8c99..90ebb9a 100644 ---- a/cpp/src/arrow/flight/CMakeLists.txt -+++ b/cpp/src/arrow/flight/CMakeLists.txt -@@ -17,6 +17,9 @@ - - add_custom_target(arrow_flight) - -+# TODO: This is a temporary workaround. 
absl should be LINKED as TARGET. -+include_directories(SYSTEM ${absl_INCLUDE_DIR}) -+ - arrow_install_all_headers("arrow/flight") - - set(ARROW_FLIGHT_LINK_LIBS gRPC::grpc++ ${ARROW_PROTOBUF_LIBPROTOBUF}) -diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc -index 2dcfb01..0394c01 100644 ---- a/cpp/src/arrow/memory_pool.cc -+++ b/cpp/src/arrow/memory_pool.cc -@@ -48,7 +48,7 @@ - // Needed to support jemalloc 3 and 4 - #define JEMALLOC_MANGLE - // Explicitly link to our version of jemalloc --#include "jemalloc_ep/dist/include/jemalloc/jemalloc.h" -+#include "jemalloc/jemalloc.h" - #endif - - #ifdef ARROW_MIMALLOC diff --git a/ci/conan/all/patches/8.0.0-0005-install-utils.patch b/ci/conan/all/patches/8.0.0-0005-install-utils.patch deleted file mode 100644 index 98075913ed109..0000000000000 --- a/ci/conan/all/patches/8.0.0-0005-install-utils.patch +++ /dev/null @@ -1,65 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt -index aba18c8..bb463d0 100644 ---- a/cpp/CMakeLists.txt -+++ b/cpp/CMakeLists.txt -@@ -721,7 +721,7 @@ if(ARROW_WITH_BZ2) - endif() - - if(ARROW_WITH_LZ4) -- list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4) -+ list(APPEND ARROW_STATIC_LINK_LIBS lz4::lz4) - if(Lz4_SOURCE STREQUAL "SYSTEM") - list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4) - endif() -@@ -907,8 +907,8 @@ endif() - if(ARROW_JEMALLOC) - add_definitions(-DARROW_JEMALLOC) - add_definitions(-DARROW_JEMALLOC_INCLUDE_DIR=${JEMALLOC_INCLUDE_DIR}) -- list(APPEND ARROW_LINK_LIBS jemalloc::jemalloc) -- list(APPEND ARROW_STATIC_LINK_LIBS jemalloc::jemalloc) -+ list(APPEND ARROW_LINK_LIBS jemalloc) -+ list(APPEND ARROW_STATIC_LINK_LIBS jemalloc) - endif() - - if(ARROW_MIMALLOC) -diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt -index 495018e..3dcb35d 100644 ---- a/cpp/src/arrow/ipc/CMakeLists.txt -+++ b/cpp/src/arrow/ipc/CMakeLists.txt -@@ -61,9 +61,13 @@ endif() - if(ARROW_BUILD_UTILITIES OR ARROW_BUILD_INTEGRATION) - add_executable(arrow-file-to-stream file_to_stream.cc) - target_link_libraries(arrow-file-to-stream ${ARROW_UTIL_LIB}) -+ install(TARGETS arrow-file-to-stream ${INSTALL_IS_OPTIONAL} -+ DESTINATION ${CMAKE_INSTALL_BINDIR}) - add_executable(arrow-stream-to-file stream_to_file.cc) - target_link_libraries(arrow-stream-to-file ${ARROW_UTIL_LIB}) -- -+ install(TARGETS arrow-stream-to-file ${INSTALL_IS_OPTIONAL} -+ DESTINATION ${CMAKE_INSTALL_BINDIR}) -+ - if(ARROW_BUILD_INTEGRATION) - add_dependencies(arrow-integration arrow-file-to-stream) - add_dependencies(arrow-integration arrow-stream-to-file) diff --git a/ci/conan/all/patches/8.0.0-0006-fix-cmake.patch b/ci/conan/all/patches/8.0.0-0006-fix-cmake.patch deleted file mode 100644 index 7be516e1b4855..0000000000000 --- a/ci/conan/all/patches/8.0.0-0006-fix-cmake.patch +++ /dev/null @@ -1,447 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is 
hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt -index bb463d0..ce2d1df 100644 ---- a/cpp/CMakeLists.txt -+++ b/cpp/CMakeLists.txt -@@ -705,7 +705,7 @@ endif() - - if(ARROW_WITH_BROTLI) - # Order is important for static linking -- set(ARROW_BROTLI_LIBS Brotli::brotlienc Brotli::brotlidec Brotli::brotlicommon) -+ set(ARROW_BROTLI_LIBS brotli::brotlienc brotli::brotlidec brotli::brotlicommon) - list(APPEND ARROW_LINK_LIBS ${ARROW_BROTLI_LIBS}) - list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_BROTLI_LIBS}) - if(Brotli_SOURCE STREQUAL "SYSTEM") -@@ -721,11 +721,18 @@ if(ARROW_WITH_BZ2) - endif() - - if(ARROW_WITH_LZ4) -- list(APPEND ARROW_STATIC_LINK_LIBS lz4::lz4) -- if(Lz4_SOURCE STREQUAL "SYSTEM") -- list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4) -+ if (TARGET LZ4::lz4_static) -+ list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4_static) -+ if(Lz4_SOURCE STREQUAL "SYSTEM") -+ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4_static) -+ endif() -+ else() -+ list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4_shared) -+ if(Lz4_SOURCE STREQUAL "SYSTEM") -+ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4_shared) - endif() - endif() -+endif() - - if(ARROW_WITH_SNAPPY) - list(APPEND ARROW_STATIC_LINK_LIBS Snappy::snappy) -@@ -913,8 +920,13 @@ endif() - - if(ARROW_MIMALLOC) - add_definitions(-DARROW_MIMALLOC) -- list(APPEND ARROW_LINK_LIBS mimalloc::mimalloc) -- list(APPEND ARROW_STATIC_LINK_LIBS mimalloc::mimalloc) -+ if (TARGET mimalloc-static) -+ list(APPEND ARROW_LINK_LIBS mimalloc-static) -+ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc-static) -+ else() -+ list(APPEND ARROW_LINK_LIBS mimalloc) -+ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc) -+ endif() - endif() - - # ---------------------------------------------------------------------- -diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake -index f070323..16faf73 100644 ---- a/cpp/cmake_modules/ThirdpartyToolchain.cmake -+++ 
b/cpp/cmake_modules/ThirdpartyToolchain.cmake -@@ -959,6 +959,7 @@ endif() - # - Tests need Boost at runtime. - # - S3FS and Flight benchmarks need Boost at runtime. - if(ARROW_BUILD_INTEGRATION -+ OR ARROW_BOOST_REQUIRED - OR ARROW_BUILD_TESTS - OR (ARROW_FLIGHT AND ARROW_BUILD_BENCHMARKS) - OR (ARROW_S3 AND ARROW_BUILD_BENCHMARKS)) -@@ -975,7 +976,7 @@ endif() - - if(ARROW_BOOST_REQUIRED) - resolve_dependency(Boost -- HAVE_ALT -+ USE_CONFIG - TRUE - REQUIRED_VERSION - ${ARROW_BOOST_REQUIRED_VERSION} -@@ -986,7 +987,7 @@ if(ARROW_BOOST_REQUIRED) - if(TARGET Boost::system) - set(BOOST_SYSTEM_LIBRARY Boost::system) - set(BOOST_FILESYSTEM_LIBRARY Boost::filesystem) -- elseif(BoostAlt_FOUND) -+ elseif(Boost_FOUND) - set(BOOST_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY}) - set(BOOST_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY}) - else() -@@ -1129,9 +1130,9 @@ macro(build_brotli) - endmacro() - - if(ARROW_WITH_BROTLI) -- resolve_dependency(Brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) -+ resolve_dependency(brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(BROTLI_INCLUDE_DIR Brotli::brotlicommon -+ get_target_property(BROTLI_INCLUDE_DIR brotli::brotlicommon - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${BROTLI_INCLUDE_DIR}) - endif() -@@ -1323,22 +1324,16 @@ endmacro() - if(ARROW_NEED_GFLAGS) - set(ARROW_GFLAGS_REQUIRED_VERSION "2.1.0") - resolve_dependency(gflags -- HAVE_ALT -+ USE_CONFIG - TRUE - REQUIRED_VERSION - ${ARROW_GFLAGS_REQUIRED_VERSION} - IS_RUNTIME_DEPENDENCY - FALSE) - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${GFLAGS_INCLUDE_DIR}) -- -- if(NOT TARGET ${GFLAGS_LIBRARIES}) -- if(TARGET gflags-shared) -- set(GFLAGS_LIBRARIES gflags-shared) -- elseif(TARGET gflags_shared) -- set(GFLAGS_LIBRARIES gflags_shared) -- endif() -- endif() -+ include_directories(SYSTEM ${gflags_INCLUDE_DIR}) 
-+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${gflags_LIBRARIES_TARGETS}) -+ set(GFLAGS_LIBRARIES gflags::gflags) - endif() - - # ---------------------------------------------------------------------- -@@ -1432,9 +1427,9 @@ if(ARROW_WITH_THRIFT) - thrift) - endif() - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${THRIFT_INCLUDE_DIR}) -+ include_directories(SYSTEM ${Thrift_INCLUDE_DIR}) - -- string(REPLACE "." ";" VERSION_LIST ${THRIFT_VERSION}) -+ string(REPLACE "." ";" VERSION_LIST ${Thrift_VERSION}) - list(GET VERSION_LIST 0 THRIFT_VERSION_MAJOR) - list(GET VERSION_LIST 1 THRIFT_VERSION_MINOR) - list(GET VERSION_LIST 2 THRIFT_VERSION_PATCH) -@@ -1557,6 +1552,7 @@ if(ARROW_WITH_PROTOBUF) - set(ARROW_PROTOBUF_REQUIRED_VERSION "2.6.1") - endif() - resolve_dependency(Protobuf -+ USE_CONFIG - REQUIRED_VERSION - ${ARROW_PROTOBUF_REQUIRED_VERSION} - PC_PACKAGE_NAMES -@@ -1567,7 +1563,7 @@ if(ARROW_WITH_PROTOBUF) - endif() - - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${PROTOBUF_INCLUDE_DIR}) -+ include_directories(SYSTEM ${protobuf_INCLUDE_DIR}) - - if(TARGET arrow::protobuf::libprotobuf) - set(ARROW_PROTOBUF_LIBPROTOBUF arrow::protobuf::libprotobuf) -@@ -1576,9 +1572,9 @@ if(ARROW_WITH_PROTOBUF) - if(NOT TARGET protobuf::libprotobuf) - add_library(protobuf::libprotobuf UNKNOWN IMPORTED) - set_target_properties(protobuf::libprotobuf -- PROPERTIES IMPORTED_LOCATION "${PROTOBUF_LIBRARY}" -+ PROPERTIES IMPORTED_LOCATION "${Protobuf_LIBRARY}" - INTERFACE_INCLUDE_DIRECTORIES -- "${PROTOBUF_INCLUDE_DIR}") -+ "${Protobuf_INCLUDE_DIR}") - endif() - set(ARROW_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf) - endif() -@@ -1598,7 +1594,7 @@ if(ARROW_WITH_PROTOBUF) - set_target_properties(protobuf::libprotoc - PROPERTIES IMPORTED_LOCATION "${Protobuf_PROTOC_LIBRARY}" - INTERFACE_INCLUDE_DIRECTORIES -- "${PROTOBUF_INCLUDE_DIR}") -+ "${Protobuf_INCLUDE_DIR}") - endif() - 
set(ARROW_PROTOBUF_LIBPROTOC protobuf::libprotoc) - endif() -@@ -1690,11 +1686,12 @@ macro(build_substrait) - - add_custom_target(substrait_gen ALL DEPENDS ${SUBSTRAIT_PROTO_GEN_ALL}) - -- set(SUBSTRAIT_INCLUDES ${SUBSTRAIT_CPP_DIR} ${PROTOBUF_INCLUDE_DIR}) -+ set(SUBSTRAIT_INCLUDES ${SUBSTRAIT_CPP_DIR} ${protobuf_INCLUDE_DIR}) - - add_library(substrait STATIC ${SUBSTRAIT_SOURCES}) - set_target_properties(substrait PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_include_directories(substrait PUBLIC ${SUBSTRAIT_INCLUDES}) -+ target_include_directories(substrait PUBLIC ${PROTOBUF_INCLUDE_DIR}) - target_link_libraries(substrait INTERFACE ${ARROW_PROTOBUF_LIBPROTOBUF}) - add_dependencies(substrait substrait_gen) - -@@ -1711,6 +1708,7 @@ endif() - # jemalloc - Unix-only high-performance allocator - - if(ARROW_JEMALLOC) -+if(0) - message(STATUS "Building (vendored) jemalloc from source") - # We only use a vendored jemalloc as we want to control its version. - # Also our build of jemalloc is specially prefixed so that it will not -@@ -1780,12 +1778,18 @@ if(ARROW_JEMALLOC) - add_dependencies(jemalloc::jemalloc jemalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) -+else() -+ find_package(jemalloc REQUIRED CONFIG) -+ include_directories(SYSTEM "${jemalloc_INCLUDE_DIR}") -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${jemalloc_LIBRARIES_TARGETS}) -+endif() - endif() - - # ---------------------------------------------------------------------- - # mimalloc - Cross-platform high-performance allocator, from Microsoft - - if(ARROW_MIMALLOC) -+if(0) - message(STATUS "Building (vendored) mimalloc from source") - # We only use a vendored mimalloc as we want to control its build options. 
- -@@ -1836,6 +1840,11 @@ if(ARROW_MIMALLOC) - add_dependencies(toolchain mimalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS mimalloc::mimalloc) -+else() -+ find_package(mimalloc REQUIRED CONFIG) -+ include_directories(SYSTEM "${mimalloc_INCLUDE_DIR}") -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${mimalloc_LIBRARIES_TARGETS} ) -+endif() - endif() - - # ---------------------------------------------------------------------- -@@ -2121,7 +2130,7 @@ endmacro() - if(ARROW_WITH_RAPIDJSON) - set(ARROW_RAPIDJSON_REQUIRED_VERSION "1.1.0") - resolve_dependency(RapidJSON -- HAVE_ALT -+ USE_CONFIG - TRUE - REQUIRED_VERSION - ${ARROW_RAPIDJSON_REQUIRED_VERSION} -@@ -2158,10 +2167,10 @@ endmacro() - - if((NOT ARROW_SIMD_LEVEL STREQUAL "NONE") OR (NOT ARROW_RUNTIME_SIMD_LEVEL STREQUAL "NONE" - )) -- set(xsimd_SOURCE "BUNDLED") -+ set(xsimd_SOURCE "SYSTEM") - resolve_dependency(xsimd) - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${XSIMD_INCLUDE_DIR}) -+ include_directories(SYSTEM ${xsimd_INCLUDE_DIR}) - endif() - - macro(build_zlib) -@@ -2260,10 +2269,14 @@ macro(build_lz4) - endmacro() - - if(ARROW_WITH_LZ4) -- resolve_dependency(Lz4 PC_PACKAGE_NAMES liblz4) -+ resolve_dependency(Lz4) - - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) -+ if (TARGET LZ4::lz4_static) -+ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_static INTERFACE_INCLUDE_DIRECTORIES) -+ else() -+ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_shared INTERFACE_INCLUDE_DIRECTORIES) -+ endif() - include_directories(SYSTEM ${LZ4_INCLUDE_DIR}) - endif() - -@@ -2394,7 +2407,7 @@ if(ARROW_WITH_RE2) - # Don't specify "PC_PACKAGE_NAMES re2" here because re2.pc may - # include -std=c++11. It's not compatible with C source and C++ - # source not uses C++ 11. 
-- resolve_dependency(re2 HAVE_ALT TRUE) -+ resolve_dependency(re2 USE_CONFIG TRUE) - if(${re2_SOURCE} STREQUAL "SYSTEM") - get_target_property(RE2_LIB re2::re2 IMPORTED_LOCATION_${UPPERCASE_BUILD_TYPE}) - if(NOT RE2_LIB) -@@ -2464,7 +2477,7 @@ endmacro() - if(ARROW_WITH_BZ2) - resolve_dependency(BZip2) - if(${BZip2_SOURCE} STREQUAL "SYSTEM") -- string(APPEND ARROW_PC_LIBS_PRIVATE " ${BZIP2_LIBRARIES}") -+ string(APPEND ARROW_PC_LIBS_PRIVATE " ${BZip2_LIBRARIES}") - endif() - - if(NOT TARGET BZip2::BZip2) -@@ -2473,7 +2486,7 @@ if(ARROW_WITH_BZ2) - PROPERTIES IMPORTED_LOCATION "${BZIP2_LIBRARIES}" - INTERFACE_INCLUDE_DIRECTORIES "${BZIP2_INCLUDE_DIR}") - endif() -- include_directories(SYSTEM "${BZIP2_INCLUDE_DIR}") -+ include_directories(SYSTEM "${BZip2_INCLUDE_DIR}") - endif() - - macro(build_utf8proc) -@@ -3709,7 +3722,7 @@ if(ARROW_WITH_GRPC) - set(gRPC_SOURCE "${Protobuf_SOURCE}") - endif() - resolve_dependency(gRPC -- HAVE_ALT -+ USE_CONFIG - TRUE - REQUIRED_VERSION - ${ARROW_GRPC_REQUIRED_VERSION} -@@ -3727,9 +3740,9 @@ if(ARROW_WITH_GRPC) - else() - # grpc++ headers may reside in ${GRPC_INCLUDE_DIR}/grpc++ or ${GRPC_INCLUDE_DIR}/grpcpp - # depending on the gRPC version. 
-- if(EXISTS "${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h") -+ if(EXISTS ${gRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h) - set(GRPCPP_PP_INCLUDE TRUE) -- elseif(EXISTS "${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h") -+ elseif(EXISTS ${gRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h) - set(GRPCPP_PP_INCLUDE FALSE) - else() - message(FATAL_ERROR "Cannot find grpc++ headers in ${GRPC_INCLUDE_DIR}") -@@ -3937,7 +3950,7 @@ macro(build_google_cloud_cpp_storage) - endmacro() - - if(ARROW_WITH_GOOGLE_CLOUD_CPP) -- resolve_dependency(google_cloud_cpp_storage) -+ resolve_dependency(google_cloud_cpp) - get_target_property(google_cloud_cpp_storage_INCLUDE_DIR google-cloud-cpp::storage - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${google_cloud_cpp_storage_INCLUDE_DIR}) -@@ -4264,9 +4277,9 @@ if(ARROW_WITH_OPENTELEMETRY) - # cURL is required whether we build from source or use an existing installation - # (OTel's cmake files do not call find_curl for you) - find_curl() -- set(opentelemetry-cpp_SOURCE "AUTO") -+ set(opentelemetry-cpp_SOURCE "SYSTEM") - resolve_dependency(opentelemetry-cpp) -- get_target_property(OPENTELEMETRY_INCLUDE_DIR opentelemetry-cpp::api -+ get_target_property(OPENTELEMETRY_INCLUDE_DIR opentelemetry-cpp::opentelemetry_common - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${OPENTELEMETRY_INCLUDE_DIR}) - message(STATUS "Found OpenTelemetry headers: ${OPENTELEMETRY_INCLUDE_DIR}") -diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt -index 690c51a..752f3b9 100644 ---- a/cpp/src/arrow/CMakeLists.txt -+++ b/cpp/src/arrow/CMakeLists.txt -@@ -326,10 +326,14 @@ set(ARROW_TESTING_SRCS - - set(_allocator_dependencies "") # Empty list - if(ARROW_JEMALLOC) -- list(APPEND _allocator_dependencies jemalloc_ep) -+ list(APPEND _allocator_dependencies jemalloc::jemalloc) - endif() - if(ARROW_MIMALLOC) -- list(APPEND _allocator_dependencies mimalloc_ep) -+ if (TARGET 
mimalloc-static) -+ list(APPEND _allocator_dependencies mimalloc-static) -+ else() -+ list(APPEND _allocator_dependencies mimalloc) -+ endif() - endif() - - if(_allocator_dependencies) -diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt -index f9d1356..c9bcf79 100644 ---- a/cpp/src/arrow/flight/CMakeLists.txt -+++ b/cpp/src/arrow/flight/CMakeLists.txt -@@ -17,6 +17,9 @@ - - add_custom_target(arrow_flight) - -+# TODO: This is a temporary workaround. absl should be LINKED as TARGET. -+include_directories(SYSTEM ${absl_INCLUDE_DIR}) -+ - arrow_install_all_headers("arrow/flight") - - set(ARROW_FLIGHT_LINK_LIBS gRPC::grpc++ ${ARROW_PROTOBUF_LIBPROTOBUF}) -diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc -index ed1c2d8..37a89da 100644 ---- a/cpp/src/arrow/memory_pool.cc -+++ b/cpp/src/arrow/memory_pool.cc -@@ -52,7 +52,7 @@ - // Needed to support jemalloc 3 and 4 - #define JEMALLOC_MANGLE - // Explicitly link to our version of jemalloc --#include "jemalloc_ep/dist/include/jemalloc/jemalloc.h" -+#include "jemalloc/jemalloc.h" - #endif - - #ifdef ARROW_MIMALLOC -diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt -index 71faf9a..3aabea1 100644 ---- a/cpp/src/gandiva/CMakeLists.txt -+++ b/cpp/src/gandiva/CMakeLists.txt -@@ -25,7 +25,7 @@ add_custom_target(gandiva-benchmarks) - - add_dependencies(gandiva-all gandiva gandiva-tests gandiva-benchmarks) - --find_package(LLVMAlt REQUIRED) -+find_package(LLVM REQUIRED) - - if(LLVM_VERSION_MAJOR LESS "10") - set(GANDIVA_CXX_STANDARD ${CMAKE_CXX_STANDARD}) -@@ -40,7 +40,7 @@ endif() - - add_definitions(-DGANDIVA_LLVM_VERSION=${LLVM_VERSION_MAJOR}) - --find_package(OpenSSLAlt REQUIRED) -+find_package(OpenSSL REQUIRED) - - # Set the path where the bitcode file generated, see precompiled/CMakeLists.txt - set(GANDIVA_PRECOMPILED_BC_PATH "${CMAKE_CURRENT_BINARY_DIR}/irhelpers.bc") -@@ -98,10 +98,11 @@ set(SRC_FILES - random_generator_holder.cc - 
${GANDIVA_PRECOMPILED_CC_PATH}) - --set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared LLVM::LLVM_INTERFACE -- ${GANDIVA_OPENSSL_LIBS}) -+set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared llvm-core::llvm-core NTERFACE -+ ${GANDIVA_OPENSSL_LIBS}) -+ -+set(GANDIVA_STATIC_LINK_LIBS arrow_static llvm-core::llvm-core ${GANDIVA_OPENSSL_LIBS}) - --set(GANDIVA_STATIC_LINK_LIBS arrow_static LLVM::LLVM_INTERFACE ${GANDIVA_OPENSSL_LIBS}) - - if(ARROW_GANDIVA_STATIC_LIBSTDCPP AND (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX - )) -@@ -139,7 +140,7 @@ add_arrow_lib(gandiva - arrow_dependencies - precompiled - EXTRA_INCLUDES -- $ -+ $ - ${GANDIVA_OPENSSL_INCLUDE_DIR} - ${UTF8PROC_INCLUDE_DIR} - SHARED_LINK_FLAGS diff --git a/ci/conan/all/test_package/CMakeLists.txt b/ci/conan/all/test_package/CMakeLists.txt index 18761d0f52c21..b25c8e889cb84 100644 --- a/ci/conan/all/test_package/CMakeLists.txt +++ b/ci/conan/all/test_package/CMakeLists.txt @@ -26,7 +26,13 @@ project(test_package LANGUAGES CXX) find_package(Arrow REQUIRED CONFIG) add_executable(${PROJECT_NAME} test_package.cpp) -target_link_libraries(${PROJECT_NAME} PRIVATE arrow::arrow) + +if (TARGET Arrow::arrow_shared) + target_link_libraries(${PROJECT_NAME} PRIVATE Arrow::arrow_shared) +else() + target_link_libraries(${PROJECT_NAME} PRIVATE Arrow::arrow_static) +endif() + if (${Arrow_VERSION} VERSION_LESS "10.0.0") target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_11) else() diff --git a/ci/conan/all/test_v1_package/CMakeLists.txt b/ci/conan/all/test_v1_package/CMakeLists.txt deleted file mode 100644 index faf547dec70c2..0000000000000 --- a/ci/conan/all/test_v1_package/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -# MIT License -# -# Copyright (c) 2019 Conan.io -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, 
copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -cmake_minimum_required(VERSION 3.1) - -project(test_package) - -include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake) -conan_basic_setup(TARGETS) - -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../test_package/ - ${CMAKE_CURRENT_BINARY_DIR}/test_package/) diff --git a/ci/conan/all/test_v1_package/conanfile.py b/ci/conan/all/test_v1_package/conanfile.py deleted file mode 100644 index 4f5cc2b61011b..0000000000000 --- a/ci/conan/all/test_v1_package/conanfile.py +++ /dev/null @@ -1,40 +0,0 @@ -# MIT License -# -# Copyright (c) 2019 Conan.io -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -from conans import ConanFile, CMake -from conan.tools.build import cross_building -import os - - -class TestPackageV1Conan(ConanFile): - settings = "os", "arch", "compiler", "build_type" - generators = "cmake", "cmake_find_package_multi" - - def build(self): - cmake = CMake(self) - cmake.configure() - cmake.build() - - def test(self): - if not cross_building(self): - bin_path = os.path.join("bin", "test_package") - self.run(bin_path, run_environment=True) diff --git a/ci/conan/config.yml b/ci/conan/config.yml index 3fa90be6f669a..cbb2fce054738 100644 --- a/ci/conan/config.yml +++ b/ci/conan/config.yml @@ -21,29 +21,15 @@ # SOFTWARE. versions: - "15.0.0": - folder: all - "14.0.2": - folder: all - "14.0.1": - folder: all - "14.0.0": - folder: all - "13.0.0": + "18.1.0": folder: all - "12.0.1": + "18.0.0": folder: all - "12.0.0": + "17.0.0": folder: all - "11.0.0": + "16.1.0": folder: all - "10.0.1": - folder: all - "10.0.0": - folder: all - "8.0.1": - folder: all - "8.0.0": - folder: all - "7.0.0": + "15.0.0": folder: all + "14.0.2": + folder: all \ No newline at end of file diff --git a/ci/conan/merge_status.sh b/ci/conan/merge_status.sh index bd99c22def1c9..600385c0e1770 100644 --- a/ci/conan/merge_status.sh +++ b/ci/conan/merge_status.sh @@ -15,4 +15,4 @@ # specific language governing permissions and limitations # under the License. 
-UPSTREAM_REVISION=681a40adca5f83c80581814fe92316d6298ed96f +UPSTREAM_REVISION=a9b270f9d2052e193ce3c0a6c4e2fda0b0ac5ade diff --git a/ci/conda_env_python.txt b/ci/conda_env_python.txt index bf915493de302..9a48f26b79c6e 100644 --- a/ci/conda_env_python.txt +++ b/ci/conda_env_python.txt @@ -15,10 +15,12 @@ # specific language governing permissions and limitations # under the License. -# don't add pandas here, because it is not a mandatory test dependency -boto3 # not a direct dependency of s3fs, but needed for our s3fs fixture +# Don't add pandas here, because it is not a mandatory test dependency + +# Not a direct dependency of s3fs, but needed for our s3fs fixture +boto3 cffi -cython>=0.29.31 +cython>=3 cloudpickle fsspec hypothesis diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index 4665a32e24bbe..751df9b2f3c01 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -30,9 +30,5 @@ sphinx-lint sphinxcontrib-jquery sphinxcontrib-mermaid sphinx==6.2 -# Requirement for doctest-cython -# Needs upper pin of 0.3.0, see: -# https://github.com/lgpage/pytest-cython/issues/67 -# With 0.3.* bug fix release, the pin can be removed -pytest-cython==0.2.2 +pytest-cython pandas diff --git a/ci/docker/conda.dockerfile b/ci/docker/conda.dockerfile index fbd81903b0a3a..0d48fb3ef83d0 100644 --- a/ci/docker/conda.dockerfile +++ b/ci/docker/conda.dockerfile @@ -21,9 +21,15 @@ FROM ${arch}/ubuntu:22.04 # install build essentials RUN export DEBIAN_FRONTEND=noninteractive && \ apt-get update -y -q && \ - apt-get install -y -q curl wget tzdata libc6-dbg gdb \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* + apt-get install -y -q \ + curl \ + gdb \ + libc6-dbg \ + patch \ + tzdata \ + wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* # install conda and mamba via miniforge COPY ci/scripts/install_conda.sh /arrow/ci/scripts/ diff --git a/ci/docker/debian-12-cpp.dockerfile b/ci/docker/debian-12-cpp.dockerfile index f486d07ff8894..fe3976248cc86 
100644 --- a/ci/docker/debian-12-cpp.dockerfile +++ b/ci/docker/debian-12-cpp.dockerfile @@ -84,6 +84,7 @@ RUN apt-get update -y -q && \ ninja-build \ nlohmann-json3-dev \ npm \ + patch \ pkg-config \ protobuf-compiler-grpc \ python3-dev \ diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index 8dc778d544a6d..259c5fb77fa41 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -106,6 +106,7 @@ RUN apt-get update -y -q && \ ninja-build \ nlohmann-json3-dev \ npm \ + patch \ pkg-config \ protobuf-compiler \ python3-dev \ diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index 28cef2946385c..721b37dcae842 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -111,6 +111,7 @@ RUN apt-get update -y -q && \ ninja-build \ nlohmann-json3-dev \ npm \ + patch \ pkg-config \ protobuf-compiler \ protobuf-compiler-grpc \ diff --git a/ci/docker/ubuntu-24.04-cpp.dockerfile b/ci/docker/ubuntu-24.04-cpp.dockerfile index 3f486b09f95ff..592a9a6a232e5 100644 --- a/ci/docker/ubuntu-24.04-cpp.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp.dockerfile @@ -111,6 +111,7 @@ RUN apt-get update -y -q && \ ninja-build \ nlohmann-json3-dev \ npm \ + patch \ pkg-config \ protobuf-compiler \ protobuf-compiler-grpc \ diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 4c567d550b92a..efeed954006c1 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=18.1.0.9000 +pkgver=19.0.0.9000 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") diff --git a/ci/scripts/conan_build.sh b/ci/scripts/conan_build.sh index 0ea3fc29192dd..03e5cab8426c6 100755 --- a/ci/scripts/conan_build.sh +++ b/ci/scripts/conan_build.sh @@ -25,7 +25,6 @@ build_dir=${1} 
shift export ARROW_HOME=${source_dir} -export CONAN_HOOK_ERROR_LEVEL=40 conan_args=() conan_args+=(--build=missing) @@ -67,6 +66,7 @@ fi version=$(grep '^set(ARROW_VERSION ' ${ARROW_HOME}/cpp/CMakeLists.txt | \ grep -E -o '([0-9.]*)') +conan_args+=(--version ${version}) rm -rf ~/.conan/data/arrow/ rm -rf ${build_dir}/conan || sudo rm -rf ${build_dir}/conan @@ -78,4 +78,4 @@ else sudo chown -R $(id -u):$(id -g) ${build_dir}/conan/ fi cd ${build_dir}/conan/all -conan create . arrow/${version}@ "${conan_args[@]}" "$@" +conan create . "${conan_args[@]}" "$@" diff --git a/ci/scripts/conan_setup.sh b/ci/scripts/conan_setup.sh index bc56ee296a234..d665ce5436b2b 100755 --- a/ci/scripts/conan_setup.sh +++ b/ci/scripts/conan_setup.sh @@ -19,5 +19,4 @@ set -eux -conan config install https://github.com/conan-io/hooks.git -sf hooks -tf hooks -conan config set hooks.conan-center +conan profile detect diff --git a/ci/scripts/install_minio.sh b/ci/scripts/install_minio.sh index 6f9701ab5a150..8685ced0bd1ab 100755 --- a/ci/scripts/install_minio.sh +++ b/ci/scripts/install_minio.sh @@ -63,7 +63,7 @@ if [ "${version}" != "latest" ]; then fi # Use specific versions for minio server and client to avoid CI failures on new releases. 
-minio_version="minio.RELEASE.2024-09-13T20-26-02Z" +minio_version="minio.RELEASE.2025-01-20T14-49-07Z" mc_version="mc.RELEASE.2024-09-16T17-43-14Z" download() diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py index 22b3a890f036b..84fcaba42e699 100644 --- a/ci/scripts/python_wheel_validate_contents.py +++ b/ci/scripts/python_wheel_validate_contents.py @@ -29,7 +29,7 @@ def validate_wheel(path): f = zipfile.ZipFile(wheels[0]) outliers = [ info.filename for info in f.filelist if not re.match( - r'(pyarrow/|pyarrow-[-.\w\d]+\.dist-info/)', info.filename + r'(pyarrow/|pyarrow-[-.\w\d]+\.dist-info/|pyarrow\.libs/)', info.filename ) ] assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}" diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index 9287d2471deec..833d31059c710 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -27,11 +27,6 @@ py -0p call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat" @echo on -@REM Install a more recent msvcp140.dll in C:\Windows\System32 -choco install -r -y --no-progress vcredist140 -choco upgrade -r -y --no-progress vcredist140 -dir C:\Windows\System32\msvcp140.dll - echo "=== (%PYTHON%) Clear output directories and leftovers ===" del /s /q C:\arrow-build del /s /q C:\arrow-dist @@ -133,9 +128,6 @@ set CMAKE_PREFIX_PATH=C:\arrow-dist pushd C:\arrow\python -@REM Bundle the C++ runtime -cp C:\Windows\System32\msvcp140.dll pyarrow\ - @REM Build wheel %PYTHON_CMD% setup.py bdist_wheel || exit /B 1 @@ -144,16 +136,13 @@ cp C:\Windows\System32\msvcp140.dll pyarrow\ @REM Since we bundled the Arrow C++ libraries ourselves, we only need to @REM mangle msvcp140.dll so as to avoid ABI issues when msvcp140.dll is @REM required by multiple Python libraries in the same process. 
-@REM -@REM For now this requires a custom version of delvewheel: -@REM https://github.com/adang1345/delvewheel/pull/59 -%PYTHON_CMD% -m pip install https://github.com/pitrou/delvewheel/archive/refs/heads/fixes-for-arrow.zip || exit /B 1 +%PYTHON_CMD% -m pip install delvewheel || exit /B 1 for /f %%i in ('dir dist\pyarrow-*.whl /B') do (set WHEEL_NAME=%cd%\dist\%%i) || exit /B 1 echo "Wheel name: %WHEEL_NAME%" %PYTHON_CMD% -m delvewheel repair -vv ^ - --mangle-only=msvcp140.dll --no-patch ^ + --ignore-existing --with-mangle ^ -w repaired_wheels %WHEEL_NAME% || exit /B 1 popd diff --git a/ci/vcpkg/ports.patch b/ci/vcpkg/ports.patch index 67fb2a4a3ea76..39b51874b1c0e 100644 --- a/ci/vcpkg/ports.patch +++ b/ci/vcpkg/ports.patch @@ -29,17 +29,16 @@ index a79c72a59..6b7fa6a66 100644 vcpkg_cmake_install(ADD_BIN_TO_PATH) diff --git a/ports/snappy/portfile.cmake b/ports/snappy/portfile.cmake -index 0c7098082..c603c3653 100644 +index 0312b2ae1..fdb576b5f 100644 --- a/ports/snappy/portfile.cmake +++ b/ports/snappy/portfile.cmake -@@ -10,6 +10,7 @@ vcpkg_from_github( - PATCHES +@@ -8,5 +8,6 @@ vcpkg_from_github( fix_clang-cl_build.patch no-werror.patch + pkgconfig.diff + "snappy-disable-bmi.patch" ) - - vcpkg_cmake_configure( + file(COPY "${CURRENT_PORT_DIR}/snappy.pc.in" DESTINATION "${SOURCE_PATH}") diff --git a/ports/snappy/snappy-disable-bmi.patch b/ports/snappy/snappy-disable-bmi.patch new file mode 100644 index 000000000..e839c93a4 diff --git a/ci/vcpkg/vcpkg.json b/ci/vcpkg/vcpkg.json index 58b1382d1ca88..5dfe61a0c6062 100644 --- a/ci/vcpkg/vcpkg.json +++ b/ci/vcpkg/vcpkg.json @@ -5,15 +5,10 @@ "homepage": "https://arrow.apache.org", "supports": "x64 | (arm64 & !windows)", "dependencies": [ - "boost-filesystem", { "name": "boost-multiprecision", "platform": "windows" }, - { - "name": "boost-system", - "platform": "windows" - }, "brotli", "bzip2", "curl", @@ -42,6 +37,8 @@ "description": "Development dependencies", "dependencies": [ "benchmark", + "boost-crc", + 
"boost-filesystem", "boost-process", "gtest" ] @@ -78,6 +75,7 @@ { "name": "llvm", "default-features": false, + "version>=": "18.1", "features": [ "clang", "default-targets", diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 103e0f08445d9..a7d80c2e96c23 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -79,7 +79,7 @@ if(POLICY CMP0170) cmake_policy(SET CMP0170 NEW) endif() -set(ARROW_VERSION "19.0.0-SNAPSHOT") +set(ARROW_VERSION "20.0.0-SNAPSHOT") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index abfe6d274f7b8..e08e1cb2c6119 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -988,9 +988,11 @@ endif() # Enable s/ccache if set by parent. if(CMAKE_C_COMPILER_LAUNCHER AND CMAKE_CXX_COMPILER_LAUNCHER) + file(TO_CMAKE_PATH "${CMAKE_C_COMPILER_LAUNCHER}" EP_CMAKE_C_COMPILER_LAUNCHER) + file(TO_CMAKE_PATH "${CMAKE_CXX_COMPILER_LAUNCHER}" EP_CMAKE_CXX_COMPILER_LAUNCHER) list(APPEND EP_COMMON_CMAKE_ARGS - -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} - -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER}) + -DCMAKE_C_COMPILER_LAUNCHER=${EP_CMAKE_C_COMPILER_LAUNCHER} + -DCMAKE_CXX_COMPILER_LAUNCHER=${EP_CMAKE_CXX_COMPILER_LAUNCHER}) endif() if(NOT ARROW_VERBOSE_THIRDPARTY_BUILD) @@ -1256,6 +1258,7 @@ endif() # - S3FS and Flight benchmarks need Boost at runtime. 
if(ARROW_BUILD_INTEGRATION OR ARROW_BUILD_TESTS + OR ARROW_FUZZING OR (ARROW_FLIGHT AND (ARROW_TESTING OR ARROW_BUILD_BENCHMARKS)) OR (ARROW_S3 AND ARROW_BUILD_BENCHMARKS)) set(ARROW_USE_BOOST TRUE) @@ -4573,11 +4576,16 @@ target_include_directories(arrow::hadoop INTERFACE "${HADOOP_HOME}/include") function(build_orc) message(STATUS "Building Apache ORC from source") + # Remove this and "patch" in "ci/docker/{debian,ubuntu}-*.dockerfile" once we have a patch for ORC 2.1.1 + find_program(PATCH patch REQUIRED) + set(ORC_PATCH_COMMAND ${PATCH} -p1 -i ${CMAKE_CURRENT_LIST_DIR}/orc.diff) + if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.29) fetchcontent_declare(orc ${FC_DECLARE_COMMON_OPTIONS} URL ${ORC_SOURCE_URL} - URL_HASH "SHA256=${ARROW_ORC_BUILD_SHA256_CHECKSUM}") + URL_HASH "SHA256=${ARROW_ORC_BUILD_SHA256_CHECKSUM}" + PATCH_COMMAND ${ORC_PATCH_COMMAND}) prepare_fetchcontent() set(CMAKE_UNITY_BUILD FALSE) @@ -4667,16 +4675,10 @@ function(build_orc) OFF CACHE BOOL "" FORCE) - # We can remove this with ORC 2.0.2 or later. 
- list(PREPEND CMAKE_MODULE_PATH - ${CMAKE_CURRENT_BINARY_DIR}/_deps/orc-src/cmake_modules) - fetchcontent_makeavailable(orc) add_library(orc::orc INTERFACE IMPORTED) target_link_libraries(orc::orc INTERFACE orc) - target_include_directories(orc::orc INTERFACE "${orc_BINARY_DIR}/c++/include" - "${orc_SOURCE_DIR}/c++/include") list(APPEND ARROW_BUNDLED_STATIC_LIBS orc) else() @@ -4701,6 +4703,9 @@ function(build_orc) get_target_property(ORC_ZSTD_ROOT ${ARROW_ZSTD_LIBZSTD} INTERFACE_INCLUDE_DIRECTORIES) get_filename_component(ORC_ZSTD_ROOT "${ORC_ZSTD_ROOT}" DIRECTORY) + get_target_property(ORC_ZLIB_ROOT ZLIB::ZLIB INTERFACE_INCLUDE_DIRECTORIES) + get_filename_component(ORC_ZLIB_ROOT "${ORC_ZLIB_ROOT}" DIRECTORY) + set(ORC_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${ORC_PREFIX}" @@ -4710,7 +4715,6 @@ function(build_orc) -DBUILD_TOOLS=OFF -DBUILD_CPP_TESTS=OFF -DINSTALL_VENDORED_LIBS=OFF - "-DLZ4_HOME=${ORC_LZ4_ROOT}" "-DPROTOBUF_EXECUTABLE=$" "-DPROTOBUF_HOME=${ORC_PROTOBUF_ROOT}" "-DPROTOBUF_INCLUDE_DIR=$" @@ -4718,16 +4722,17 @@ function(build_orc) "-DPROTOC_LIBRARY=$" "-DSNAPPY_HOME=${ORC_SNAPPY_ROOT}" "-DSNAPPY_LIBRARY=$" + "-DLZ4_HOME=${ORC_LZ4_ROOT}" "-DLZ4_LIBRARY=$" "-DLZ4_STATIC_LIB=$" "-DLZ4_INCLUDE_DIR=${ORC_LZ4_ROOT}/include" "-DSNAPPY_INCLUDE_DIR=${ORC_SNAPPY_INCLUDE_DIR}" "-DZSTD_HOME=${ORC_ZSTD_ROOT}" "-DZSTD_INCLUDE_DIR=$" - "-DZSTD_LIBRARY=$") - if(ZLIB_ROOT) - set(ORC_CMAKE_ARGS ${ORC_CMAKE_ARGS} "-DZLIB_HOME=${ZLIB_ROOT}") - endif() + "-DZSTD_LIBRARY=$" + "-DZLIB_HOME=${ORC_ZLIB_ROOT}" + "-DZLIB_INCLUDE_DIR=$" + "-DZLIB_LIBRARY=$") # Work around CMake bug file(MAKE_DIRECTORY ${ORC_INCLUDE_DIR}) @@ -4743,7 +4748,8 @@ function(build_orc) ${ARROW_ZSTD_LIBZSTD} ${Snappy_TARGET} LZ4::lz4 - ZLIB::ZLIB) + ZLIB::ZLIB + PATCH_COMMAND ${ORC_PATCH_COMMAND}) add_library(orc::orc STATIC IMPORTED) set_target_properties(orc::orc PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}") target_include_directories(orc::orc BEFORE INTERFACE 
"${ORC_INCLUDE_DIR}") diff --git a/cpp/cmake_modules/orc.diff b/cpp/cmake_modules/orc.diff new file mode 100644 index 0000000000000..7bdbfa1cf5d33 --- /dev/null +++ b/cpp/cmake_modules/orc.diff @@ -0,0 +1,289 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 1f8931508..f8e57bf5f 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -30,8 +30,8 @@ SET(CPACK_PACKAGE_VERSION_MAJOR "2") + SET(CPACK_PACKAGE_VERSION_MINOR "1") + SET(CPACK_PACKAGE_VERSION_PATCH "0") + SET(ORC_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") +-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules") + set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # For clang-tidy. 
++list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules") + + option (BUILD_JAVA + "Include ORC Java library in the build process" +@@ -225,5 +225,3 @@ if (BUILD_CPP_TESTS) + ) + endif () + endif () +- +-INCLUDE(CheckFormat) +diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt +index 694667c06..af13a94aa 100644 +--- a/c++/src/CMakeLists.txt ++++ b/c++/src/CMakeLists.txt +@@ -218,8 +218,8 @@ target_include_directories (orc + INTERFACE + $ + PUBLIC +- $ +- $ ++ $ ++ $ + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR} +diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake +index 017e6c5b8..fe376ed16 100644 +--- a/cmake_modules/ThirdpartyToolchain.cmake ++++ b/cmake_modules/ThirdpartyToolchain.cmake +@@ -103,13 +103,13 @@ endif () + + # ---------------------------------------------------------------------- + # Macros for adding third-party libraries +-macro (add_resolved_library target_name link_lib include_dir) +- add_library (${target_name} INTERFACE IMPORTED) ++macro (orc_add_resolved_library target_name link_lib include_dir) ++ add_library (${target_name} INTERFACE IMPORTED GLOBAL) + target_link_libraries (${target_name} INTERFACE ${link_lib}) + target_include_directories (${target_name} SYSTEM INTERFACE ${include_dir}) + endmacro () + +-macro (add_built_library external_project_name target_name link_lib include_dir) ++macro (orc_add_built_library external_project_name target_name link_lib include_dir) + file (MAKE_DIRECTORY "${include_dir}") + + add_library (${target_name} STATIC IMPORTED) +@@ -122,7 +122,7 @@ macro (add_built_library external_project_name target_name link_lib include_dir) + endif () + endmacro () + +-function(provide_cmake_module MODULE_NAME) ++function(orc_provide_cmake_module MODULE_NAME) + set(module "${CMAKE_SOURCE_DIR}/cmake_modules/${MODULE_NAME}.cmake") + if(EXISTS "${module}") + message(STATUS "Providing CMake module for ${MODULE_NAME} as part of 
CMake package") +@@ -130,8 +130,8 @@ function(provide_cmake_module MODULE_NAME) + endif() + endfunction() + +-function(provide_find_module PACKAGE_NAME) +- provide_cmake_module("Find${PACKAGE_NAME}") ++function(orc_provide_find_module PACKAGE_NAME) ++ orc_provide_cmake_module("Find${PACKAGE_NAME}") + endfunction() + + # ---------------------------------------------------------------------- +@@ -156,7 +156,7 @@ ExternalProject_Add (orc-format_ep + # Snappy + if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (Snappy REQUIRED CONFIG) +- add_resolved_library (orc_snappy ${Snappy_LIBRARIES} ${Snappy_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_snappy ${Snappy_LIBRARIES} ${Snappy_INCLUDE_DIR}) + list (APPEND ORC_SYSTEM_DEPENDENCIES Snappy) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") +@@ -168,13 +168,13 @@ elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + elseif (NOT "${SNAPPY_HOME}" STREQUAL "") + find_package (Snappy REQUIRED) + if (ORC_PREFER_STATIC_SNAPPY AND SNAPPY_STATIC_LIB) +- add_resolved_library (orc_snappy ${SNAPPY_STATIC_LIB} ${SNAPPY_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_snappy ${SNAPPY_STATIC_LIB} ${SNAPPY_INCLUDE_DIR}) + else () +- add_resolved_library (orc_snappy ${SNAPPY_LIBRARY} ${SNAPPY_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_snappy ${SNAPPY_LIBRARY} ${SNAPPY_INCLUDE_DIR}) + endif () + list (APPEND ORC_SYSTEM_DEPENDENCIES Snappy) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +- provide_find_module (Snappy) ++ orc_provide_find_module (Snappy) + else () + set(SNAPPY_HOME "${THIRDPARTY_DIR}/snappy_ep-install") + set(SNAPPY_INCLUDE_DIR "${SNAPPY_HOME}/include") +@@ -194,7 +194,7 @@ else () + ${THIRDPARTY_LOG_OPTIONS} + BUILD_BYPRODUCTS "${SNAPPY_STATIC_LIB}") + +- add_built_library (snappy_ep orc_snappy ${SNAPPY_STATIC_LIB} ${SNAPPY_INCLUDE_DIR}) ++ orc_add_built_library (snappy_ep orc_snappy ${SNAPPY_STATIC_LIB} ${SNAPPY_INCLUDE_DIR}) + + list (APPEND ORC_VENDOR_DEPENDENCIES 
"orc::vendored_snappy|${SNAPPY_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +@@ -207,7 +207,7 @@ add_library (orc::snappy ALIAS orc_snappy) + + if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (ZLIB REQUIRED CONFIG) +- add_resolved_library (orc_zlib ${ZLIB_LIBRARIES} ${ZLIB_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_zlib ${ZLIB_LIBRARIES} ${ZLIB_INCLUDE_DIR}) + list (APPEND ORC_SYSTEM_DEPENDENCIES ZLIB) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") +@@ -219,13 +219,13 @@ elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + elseif (NOT "${ZLIB_HOME}" STREQUAL "") + find_package (ZLIB REQUIRED) + if (ORC_PREFER_STATIC_ZLIB AND ZLIB_STATIC_LIB) +- add_resolved_library (orc_zlib ${ZLIB_STATIC_LIB} ${ZLIB_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_zlib ${ZLIB_STATIC_LIB} ${ZLIB_INCLUDE_DIR}) + else () +- add_resolved_library (orc_zlib ${ZLIB_LIBRARY} ${ZLIB_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_zlib ${ZLIB_LIBRARY} ${ZLIB_INCLUDE_DIR}) + endif () + list (APPEND ORC_SYSTEM_DEPENDENCIES ZLIB) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +- provide_find_module (ZLIB) ++ orc_provide_find_module (ZLIB) + else () + set(ZLIB_PREFIX "${THIRDPARTY_DIR}/zlib_ep-install") + set(ZLIB_INCLUDE_DIR "${ZLIB_PREFIX}/include") +@@ -252,7 +252,7 @@ else () + ${THIRDPARTY_LOG_OPTIONS} + BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}") + +- add_built_library (zlib_ep orc_zlib ${ZLIB_STATIC_LIB} ${ZLIB_INCLUDE_DIR}) ++ orc_add_built_library (zlib_ep orc_zlib ${ZLIB_STATIC_LIB} ${ZLIB_INCLUDE_DIR}) + + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_zlib|${ZLIB_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +@@ -265,7 +265,7 @@ add_library (orc::zlib ALIAS orc_zlib) + + if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (ZSTD REQUIRED CONFIG) +- add_resolved_library (orc_zstd ${zstd_LIBRARIES} ${zstd_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_zstd ${zstd_LIBRARIES} 
${zstd_INCLUDE_DIR}) + list (APPEND ORC_SYSTEM_DEPENDENCIES ZSTD) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$,zstd::libzstd_shared,zstd::libzstd_static>>") + elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") +@@ -277,14 +277,14 @@ elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + elseif (NOT "${ZSTD_HOME}" STREQUAL "") + find_package (ZSTD REQUIRED) + if (ORC_PREFER_STATIC_ZSTD AND ZSTD_STATIC_LIB) +- add_resolved_library (orc_zstd ${ZSTD_STATIC_LIB} ${ZSTD_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_zstd ${ZSTD_STATIC_LIB} ${ZSTD_INCLUDE_DIR}) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + else () +- add_resolved_library (orc_zstd ${ZSTD_LIBRARY} ${ZSTD_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_zstd ${ZSTD_LIBRARY} ${ZSTD_INCLUDE_DIR}) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$,zstd::libzstd_shared,zstd::libzstd_static>>") + endif () + list (APPEND ORC_SYSTEM_DEPENDENCIES ZSTD) +- provide_find_module (ZSTD) ++ orc_provide_find_module (ZSTD) + else () + set(ZSTD_HOME "${THIRDPARTY_DIR}/zstd_ep-install") + set(ZSTD_INCLUDE_DIR "${ZSTD_HOME}/include") +@@ -318,7 +318,7 @@ else () + ${THIRDPARTY_LOG_OPTIONS} + BUILD_BYPRODUCTS ${ZSTD_STATIC_LIB}) + +- add_built_library (zstd_ep orc_zstd ${ZSTD_STATIC_LIB} ${ZSTD_INCLUDE_DIR}) ++ orc_add_built_library (zstd_ep orc_zstd ${ZSTD_STATIC_LIB} ${ZSTD_INCLUDE_DIR}) + + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_zstd|${ZSTD_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +@@ -330,7 +330,7 @@ add_library (orc::zstd ALIAS orc_zstd) + # LZ4 + if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (LZ4 REQUIRED CONFIG) +- add_resolved_library (orc_lz4 ${lz4_LIBRARIES} ${lz4_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_lz4 ${lz4_LIBRARIES} ${lz4_INCLUDE_DIR}) + list (APPEND ORC_SYSTEM_DEPENDENCIES LZ4) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") +@@ -342,13 +342,13 @@ elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + elseif (NOT "${LZ4_HOME}" 
STREQUAL "") + find_package (LZ4 REQUIRED) + if (ORC_PREFER_STATIC_LZ4 AND LZ4_STATIC_LIB) +- add_resolved_library (orc_lz4 ${LZ4_STATIC_LIB} ${LZ4_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_lz4 ${LZ4_STATIC_LIB} ${LZ4_INCLUDE_DIR}) + else () +- add_resolved_library (orc_lz4 ${LZ4_LIBRARY} ${LZ4_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_lz4 ${LZ4_LIBRARY} ${LZ4_INCLUDE_DIR}) + endif () + list (APPEND ORC_SYSTEM_DEPENDENCIES LZ4) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +- provide_find_module (LZ4) ++ orc_provide_find_module (LZ4) + else () + set(LZ4_PREFIX "${THIRDPARTY_DIR}/lz4_ep-install") + set(LZ4_INCLUDE_DIR "${LZ4_PREFIX}/include") +@@ -375,7 +375,7 @@ else () + ${THIRDPARTY_LOG_OPTIONS} + BUILD_BYPRODUCTS ${LZ4_STATIC_LIB}) + +- add_built_library (lz4_ep orc_lz4 ${LZ4_STATIC_LIB} ${LZ4_INCLUDE_DIR}) ++ orc_add_built_library (lz4_ep orc_lz4 ${LZ4_STATIC_LIB} ${LZ4_INCLUDE_DIR}) + + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_lz4|${LZ4_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +@@ -491,7 +491,7 @@ endif () + + if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (Protobuf REQUIRED CONFIG) +- add_resolved_library (orc_protobuf ${protobuf_LIBRARIES} ${protobuf_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_protobuf ${protobuf_LIBRARIES} ${protobuf_INCLUDE_DIR}) + list (APPEND ORC_SYSTEM_DEPENDENCIES Protobuf) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") +@@ -505,20 +505,20 @@ elseif (NOT "${PROTOBUF_HOME}" STREQUAL "") + find_package (Protobuf REQUIRED) + + if (ORC_PREFER_STATIC_PROTOBUF AND PROTOBUF_STATIC_LIB) +- add_resolved_library (orc_protobuf ${PROTOBUF_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_protobuf ${PROTOBUF_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) + else () +- add_resolved_library (orc_protobuf ${PROTOBUF_LIBRARY} ${PROTOBUF_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_protobuf ${PROTOBUF_LIBRARY} 
${PROTOBUF_INCLUDE_DIR}) + endif () + + if (ORC_PREFER_STATIC_PROTOBUF AND PROTOC_STATIC_LIB) +- add_resolved_library (orc_protoc ${PROTOC_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_protoc ${PROTOC_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) + else () +- add_resolved_library (orc_protoc ${PROTOC_LIBRARY} ${PROTOBUF_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_protoc ${PROTOC_LIBRARY} ${PROTOBUF_INCLUDE_DIR}) + endif () + + list (APPEND ORC_SYSTEM_DEPENDENCIES Protobuf) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +- provide_find_module (Protobuf) ++ orc_provide_find_module (Protobuf) + else () + set(PROTOBUF_PREFIX "${THIRDPARTY_DIR}/protobuf_ep-install") + set(PROTOBUF_INCLUDE_DIR "${PROTOBUF_PREFIX}/include") +@@ -556,8 +556,8 @@ else () + ${THIRDPARTY_LOG_OPTIONS} + BUILD_BYPRODUCTS "${PROTOBUF_STATIC_LIB}" "${PROTOC_STATIC_LIB}") + +- add_built_library (protobuf_ep orc_protobuf ${PROTOBUF_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) +- add_built_library (protobuf_ep orc_protoc ${PROTOC_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) ++ orc_add_built_library (protobuf_ep orc_protobuf ${PROTOBUF_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) ++ orc_add_built_library (protobuf_ep orc_protoc ${PROTOC_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) + + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_protobuf|${PROTOBUF_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +@@ -610,7 +610,7 @@ if(BUILD_LIBHDFSPP) + BUILD_BYPRODUCTS "${LIBHDFSPP_STATIC_LIB}" + CMAKE_ARGS ${LIBHDFSPP_CMAKE_ARGS}) + +- add_built_library(libhdfspp_ep libhdfspp ${LIBHDFSPP_STATIC_LIB} ${LIBHDFSPP_INCLUDE_DIR}) ++ orc_add_built_library(libhdfspp_ep libhdfspp ${LIBHDFSPP_STATIC_LIB} ${LIBHDFSPP_INCLUDE_DIR}) + + set (LIBHDFSPP_LIBRARIES + libhdfspp diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 6e2294371e7a6..eb9860b240f16 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -771,13 +771,14 @@ if(ARROW_COMPUTE) 
compute/kernels/scalar_validity.cc compute/kernels/vector_array_sort.cc compute/kernels/vector_cumulative_ops.cc - compute/kernels/vector_pairwise.cc compute/kernels/vector_nested.cc + compute/kernels/vector_pairwise.cc compute/kernels/vector_rank.cc compute/kernels/vector_replace.cc compute/kernels/vector_run_end_encode.cc compute/kernels/vector_select_k.cc compute/kernels/vector_sort.cc + compute/kernels/vector_swizzle.cc compute/key_hash_internal.cc compute/key_map_internal.cc compute/light_array_internal.cc diff --git a/cpp/src/arrow/acero/CMakeLists.txt b/cpp/src/arrow/acero/CMakeLists.txt index 0a2536b11e33c..54269f1df0eb6 100644 --- a/cpp/src/arrow/acero/CMakeLists.txt +++ b/cpp/src/arrow/acero/CMakeLists.txt @@ -117,7 +117,7 @@ if(ARROW_TESTING) if(ARROW_WITH_OPENTELEMETRY) target_link_libraries(arrow_acero_testing PRIVATE ${ARROW_OPENTELEMETRY_LIBS}) endif() - list(APPEND ARROW_ACERO_TEST_LINK_LIBS arrow_acero_testing) + list(APPEND ARROW_ACERO_TEST_LINK_LIBS arrow_acero_testing arrow_compute_testing) endif() # Only for hash_aggregate_test.cc. if(ARROW_USE_BOOST) diff --git a/cpp/src/arrow/acero/accumulation_queue.h b/cpp/src/arrow/acero/accumulation_queue.h index a173f9840388f..92d62d5d99d16 100644 --- a/cpp/src/arrow/acero/accumulation_queue.h +++ b/cpp/src/arrow/acero/accumulation_queue.h @@ -22,6 +22,7 @@ #include #include +#include "arrow/acero/visibility.h" #include "arrow/compute/exec.h" #include "arrow/result.h" @@ -70,7 +71,7 @@ class AccumulationQueue { /// For example, in a top-n node, the process callback should determine how many /// rows need to be delivered for the given batch, and then return a task to actually /// deliver those rows. 
-class SequencingQueue { +class ARROW_ACERO_EXPORT SequencingQueue { public: using Task = std::function; @@ -123,7 +124,7 @@ class SequencingQueue { /// /// It can be helpful to think of this as if a dedicated thread is running Process as /// batches arrive -class SerialSequencingQueue { +class ARROW_ACERO_EXPORT SerialSequencingQueue { public: /// Strategy that describes how to handle items class Processor { diff --git a/cpp/src/arrow/acero/aggregate_node.h b/cpp/src/arrow/acero/aggregate_node.h index 790264b208305..0c6fea16a8acc 100644 --- a/cpp/src/arrow/acero/aggregate_node.h +++ b/cpp/src/arrow/acero/aggregate_node.h @@ -24,6 +24,7 @@ #include "arrow/acero/visibility.h" #include "arrow/compute/api_aggregate.h" +#include "arrow/compute/test_util_internal.h" #include "arrow/compute/type_fwd.h" #include "arrow/result.h" #include "arrow/type_fwd.h" diff --git a/cpp/src/arrow/acero/aggregate_node_test.cc b/cpp/src/arrow/acero/aggregate_node_test.cc index c623271db9fb4..f980496d527d1 100644 --- a/cpp/src/arrow/acero/aggregate_node_test.cc +++ b/cpp/src/arrow/acero/aggregate_node_test.cc @@ -24,6 +24,7 @@ #include "arrow/acero/test_util_internal.h" #include "arrow/compute/api_aggregate.h" +#include "arrow/compute/test_util_internal.h" #include "arrow/result.h" #include "arrow/table.h" #include "arrow/testing/gtest_util.h" @@ -32,6 +33,8 @@ namespace arrow { +using compute::ExecBatchFromJSON; + namespace acero { Result> TableGroupBy( diff --git a/cpp/src/arrow/acero/asof_join_node_test.cc b/cpp/src/arrow/acero/asof_join_node_test.cc index 64d41ccb1ab20..c726ac7c821a7 100644 --- a/cpp/src/arrow/acero/asof_join_node_test.cc +++ b/cpp/src/arrow/acero/asof_join_node_test.cc @@ -41,8 +41,9 @@ #include "arrow/acero/util.h" #include "arrow/api.h" #include "arrow/compute/api_scalar.h" -#include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/cast.h" #include "arrow/compute/row/row_encoder_internal.h" +#include "arrow/compute/test_util_internal.h" #include 
"arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" #include "arrow/testing/random.h" @@ -67,6 +68,7 @@ namespace arrow { using compute::Cast; using compute::Divide; +using compute::ExecBatchFromJSON; using compute::Multiply; using compute::Subtract; diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc b/cpp/src/arrow/acero/hash_aggregate_test.cc index 1e2975afc91b3..7f4b6dd75272f 100644 --- a/cpp/src/arrow/acero/hash_aggregate_test.cc +++ b/cpp/src/arrow/acero/hash_aggregate_test.cc @@ -42,7 +42,6 @@ #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/registry.h" #include "arrow/compute/row/grouper.h" -#include "arrow/compute/row/grouper_internal.h" #include "arrow/table.h" #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" @@ -70,9 +69,11 @@ using internal::checked_cast; using internal::checked_pointer_cast; using internal::ToChars; +using compute::ArgShape; using compute::CallFunction; using compute::CountOptions; using compute::default_exec_context; +using compute::ExecBatchFromJSON; using compute::ExecSpan; using compute::FunctionOptions; using compute::Grouper; @@ -84,6 +85,7 @@ using compute::SortKey; using compute::SortOrder; using compute::Take; using compute::TDigestOptions; +using compute::ValidateOutput; using compute::VarianceOptions; namespace acero { @@ -159,8 +161,6 @@ TEST(AggregateSchema, SingleKeyAndSegmentKey) { output_schema); } -namespace { - using GroupByFunction = std::function( const std::vector&, const std::vector&, const std::vector&, const std::vector&, bool, bool)>; @@ -538,930 +538,6 @@ Result GroupByTest(GroupByFunction group_by, const std::vector& ar return GroupByTest(group_by, arguments, keys, {}, aggregates, use_threads); } -template -void TestGroupClassSupportedKeys( - std::function>(const std::vector&)> - make_func) { - ASSERT_OK(make_func({boolean()})); - - ASSERT_OK(make_func({int8(), uint16(), int32(), uint64()})); - - ASSERT_OK(make_func({dictionary(int64(), 
utf8())})); - - ASSERT_OK(make_func({float16(), float32(), float64()})); - - ASSERT_OK(make_func({utf8(), binary(), large_utf8(), large_binary()})); - - ASSERT_OK(make_func({fixed_size_binary(16), fixed_size_binary(32)})); - - ASSERT_OK(make_func({decimal128(32, 10), decimal256(76, 20)})); - - ASSERT_OK(make_func({date32(), date64()})); - - for (auto unit : { - TimeUnit::SECOND, - TimeUnit::MILLI, - TimeUnit::MICRO, - TimeUnit::NANO, - }) { - ASSERT_OK(make_func({timestamp(unit), duration(unit)})); - } - - ASSERT_OK( - make_func({day_time_interval(), month_interval(), month_day_nano_interval()})); - - ASSERT_OK(make_func({null()})); - - ASSERT_RAISES(NotImplemented, make_func({struct_({field("", int64())})})); - - ASSERT_RAISES(NotImplemented, make_func({struct_({})})); - - ASSERT_RAISES(NotImplemented, make_func({list(int32())})); - - ASSERT_RAISES(NotImplemented, make_func({fixed_size_list(int32(), 5)})); - - ASSERT_RAISES(NotImplemented, make_func({dense_union({field("", int32())})})); -} - -void TestSegments(std::unique_ptr& segmenter, const ExecSpan& batch, - std::vector expected_segments) { - ASSERT_OK_AND_ASSIGN(auto actual_segments, segmenter->GetSegments(batch)); - ASSERT_EQ(actual_segments.size(), expected_segments.size()); - for (size_t i = 0; i < actual_segments.size(); ++i) { - SCOPED_TRACE("segment #" + ToChars(i)); - ASSERT_EQ(actual_segments[i], expected_segments[i]); - } -} - -Result> MakeGrouper(const std::vector& key_types) { - return Grouper::Make(key_types, default_exec_context()); -} - -Result> MakeRowSegmenter( - const std::vector& key_types) { - return RowSegmenter::Make(key_types, /*nullable_leys=*/false, default_exec_context()); -} - -Result> MakeGenericSegmenter( - const std::vector& key_types) { - return MakeAnyKeysSegmenter(key_types, default_exec_context()); -} - -} // namespace - -TEST(RowSegmenter, SupportedKeys) { - TestGroupClassSupportedKeys(MakeRowSegmenter); -} - -TEST(RowSegmenter, Basics) { - std::vector bad_types2 = {int32(), 
float32()}; - std::vector types2 = {int32(), int32()}; - std::vector bad_types1 = {float32()}; - std::vector types1 = {int32()}; - std::vector types0 = {}; - auto batch2 = ExecBatchFromJSON(types2, "[[1, 1], [1, 2], [2, 2]]"); - auto batch1 = ExecBatchFromJSON(types1, "[[1], [1], [2]]"); - ExecBatch batch0({}, 3); - { - SCOPED_TRACE("types0 segmenting of batch2"); - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types0)); - ExecSpan span2(batch2); - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("expected batch size 0 "), - segmenter->GetSegments(span2)); - ExecSpan span0(batch0); - TestSegments(segmenter, span0, {{0, 3, true, true}}); - } - { - SCOPED_TRACE("bad_types1 segmenting of batch1"); - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(bad_types1)); - ExecSpan span1(batch1); - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("expected batch value 0 of type "), - segmenter->GetSegments(span1)); - } - { - SCOPED_TRACE("types1 segmenting of batch2"); - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types1)); - ExecSpan span2(batch2); - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("expected batch size 1 "), - segmenter->GetSegments(span2)); - ExecSpan span1(batch1); - TestSegments(segmenter, span1, {{0, 2, false, true}, {2, 1, true, false}}); - } - { - SCOPED_TRACE("bad_types2 segmenting of batch2"); - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(bad_types2)); - ExecSpan span2(batch2); - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("expected batch value 1 of type "), - segmenter->GetSegments(span2)); - } - { - SCOPED_TRACE("types2 segmenting of batch1"); - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types2)); - ExecSpan span1(batch1); - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("expected batch size 2 "), - segmenter->GetSegments(span1)); - ExecSpan span2(batch2); - TestSegments(segmenter, span2, - {{0, 1, false, true}, {1, 1, false, false}, {2, 1, true, false}}); - } -} - -TEST(RowSegmenter, 
NonOrdered) { - for (int num_keys = 1; num_keys <= 2; ++num_keys) { - SCOPED_TRACE("non-ordered " + ToChars(num_keys) + " int32(s)"); - std::vector types(num_keys, int32()); - std::vector values(num_keys, ArrayFromJSON(int32(), "[1, 1, 2, 1, 2]")); - ExecBatch batch(std::move(values), 5); - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); - TestSegments(segmenter, ExecSpan(batch), - {{0, 2, false, true}, - {2, 1, false, false}, - {3, 1, false, false}, - {4, 1, true, false}}); - } -} - -TEST(RowSegmenter, EmptyBatches) { - { - SCOPED_TRACE("empty batches {int32}"); - std::vector types = {int32()}; - std::vector batches = { - ExecBatchFromJSON(types, "[]"), ExecBatchFromJSON(types, "[]"), - ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[]"), - ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[]"), - ExecBatchFromJSON(types, "[[2], [2]]"), ExecBatchFromJSON(types, "[]"), - }; - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); - TestSegments(segmenter, ExecSpan(batches[0]), {}); - TestSegments(segmenter, ExecSpan(batches[1]), {}); - TestSegments(segmenter, ExecSpan(batches[2]), {{0, 1, true, true}}); - TestSegments(segmenter, ExecSpan(batches[3]), {}); - TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, true}}); - TestSegments(segmenter, ExecSpan(batches[5]), {}); - TestSegments(segmenter, ExecSpan(batches[6]), {{0, 2, true, false}}); - TestSegments(segmenter, ExecSpan(batches[7]), {}); - } - { - SCOPED_TRACE("empty batches {int32, int32}"); - std::vector types = {int32(), int32()}; - std::vector batches = { - ExecBatchFromJSON(types, "[]"), - ExecBatchFromJSON(types, "[]"), - ExecBatchFromJSON(types, "[[1, 1]]"), - ExecBatchFromJSON(types, "[]"), - ExecBatchFromJSON(types, "[[1, 1]]"), - ExecBatchFromJSON(types, "[]"), - ExecBatchFromJSON(types, "[[2, 2], [2, 2]]"), - ExecBatchFromJSON(types, "[]"), - }; - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); - TestSegments(segmenter, 
ExecSpan(batches[0]), {}); - TestSegments(segmenter, ExecSpan(batches[1]), {}); - TestSegments(segmenter, ExecSpan(batches[2]), {{0, 1, true, true}}); - TestSegments(segmenter, ExecSpan(batches[3]), {}); - TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, true}}); - TestSegments(segmenter, ExecSpan(batches[5]), {}); - TestSegments(segmenter, ExecSpan(batches[6]), {{0, 2, true, false}}); - TestSegments(segmenter, ExecSpan(batches[7]), {}); - } -} - -TEST(RowSegmenter, MultipleSegments) { - auto test_with_keys = [](int num_keys, const std::shared_ptr& key) { - SCOPED_TRACE("multiple segments " + ToChars(num_keys) + " " + - key->type()->ToString()); - std::vector types(num_keys, key->type()); - std::vector values(num_keys, key); - ExecBatch batch(std::move(values), key->length()); - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); - TestSegments(segmenter, ExecSpan(batch), - {{0, 2, false, true}, - {2, 1, false, false}, - {3, 1, false, false}, - {4, 2, false, false}, - {6, 2, false, false}, - {8, 1, true, false}}); - }; - for (int num_keys = 1; num_keys <= 2; ++num_keys) { - test_with_keys(num_keys, ArrayFromJSON(int32(), "[1, 1, 2, 5, 3, 3, 5, 5, 4]")); - test_with_keys( - num_keys, - ArrayFromJSON(fixed_size_binary(2), - R"(["aa", "aa", "bb", "ee", "cc", "cc", "ee", "ee", "dd"])")); - test_with_keys(num_keys, DictArrayFromJSON(dictionary(int8(), utf8()), - "[0, 0, 1, 4, 2, 2, 4, 4, 3]", - R"(["a", "b", "c", "d", "e"])")); - } -} - -TEST(RowSegmenter, MultipleSegmentsMultipleBatches) { - { - SCOPED_TRACE("multiple segments multiple batches {int32}"); - std::vector types = {int32()}; - std::vector batches = { - ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[[1], [2]]"), - ExecBatchFromJSON(types, "[[5], [3]]"), - ExecBatchFromJSON(types, "[[3], [5], [5]]"), ExecBatchFromJSON(types, "[[4]]")}; - - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); - TestSegments(segmenter, ExecSpan(batches[0]), {{0, 1, true, true}}); - 
TestSegments(segmenter, ExecSpan(batches[1]), - {{0, 1, false, true}, {1, 1, true, false}}); - TestSegments(segmenter, ExecSpan(batches[2]), - {{0, 1, false, false}, {1, 1, true, false}}); - TestSegments(segmenter, ExecSpan(batches[3]), - {{0, 1, false, true}, {1, 2, true, false}}); - TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, false}}); - } - { - SCOPED_TRACE("multiple segments multiple batches {int32, int32}"); - std::vector types = {int32(), int32()}; - std::vector batches = { - ExecBatchFromJSON(types, "[[1, 1]]"), - ExecBatchFromJSON(types, "[[1, 1], [2, 2]]"), - ExecBatchFromJSON(types, "[[5, 5], [3, 3]]"), - ExecBatchFromJSON(types, "[[3, 3], [5, 5], [5, 5]]"), - ExecBatchFromJSON(types, "[[4, 4]]")}; - - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); - TestSegments(segmenter, ExecSpan(batches[0]), {{0, 1, true, true}}); - TestSegments(segmenter, ExecSpan(batches[1]), - {{0, 1, false, true}, {1, 1, true, false}}); - TestSegments(segmenter, ExecSpan(batches[2]), - {{0, 1, false, false}, {1, 1, true, false}}); - TestSegments(segmenter, ExecSpan(batches[3]), - {{0, 1, false, true}, {1, 2, true, false}}); - TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, false}}); - } -} - -namespace { - -void TestRowSegmenterConstantBatch( - const std::shared_ptr& type, - std::function shape_func, - std::function>(int64_t key)> value_func, - std::function>(const std::vector&)> - make_segmenter) { - constexpr int64_t n_keys = 3, n_rows = 3, repetitions = 3; - std::vector types(n_keys, type); - std::vector full_values(n_keys); - for (int64_t i = 0; i < n_keys; i++) { - auto shape = shape_func(i); - ASSERT_OK_AND_ASSIGN(auto scalar, value_func(i)); - if (shape == ArgShape::SCALAR) { - full_values[i] = std::move(scalar); - } else { - ASSERT_OK_AND_ASSIGN(full_values[i], MakeArrayFromScalar(*scalar, n_rows)); - } - } - auto test_with_keys = [&](int64_t keys) -> Status { - SCOPED_TRACE("constant-batch with " + ToChars(keys) + " key(s)"); - 
std::vector values(full_values.begin(), full_values.begin() + keys); - ExecBatch batch(values, n_rows); - std::vector key_types(types.begin(), types.begin() + keys); - ARROW_ASSIGN_OR_RAISE(auto segmenter, make_segmenter(key_types)); - for (int64_t i = 0; i < repetitions; i++) { - TestSegments(segmenter, ExecSpan(batch), {{0, n_rows, true, true}}); - ARROW_RETURN_NOT_OK(segmenter->Reset()); - } - return Status::OK(); - }; - for (int64_t i = 0; i <= n_keys; i++) { - ASSERT_OK(test_with_keys(i)); - } -} - -} // namespace - -TEST(RowSegmenter, ConstantArrayBatch) { - TestRowSegmenterConstantBatch( - int32(), [](int64_t key) { return ArgShape::ARRAY; }, - [](int64_t key) { return MakeScalar(1); }, MakeRowSegmenter); -} - -TEST(RowSegmenter, ConstantScalarBatch) { - TestRowSegmenterConstantBatch( - int32(), [](int64_t key) { return ArgShape::SCALAR; }, - [](int64_t key) { return MakeScalar(1); }, MakeRowSegmenter); -} - -TEST(RowSegmenter, ConstantMixedBatch) { - TestRowSegmenterConstantBatch( - int32(), - [](int64_t key) { return key % 2 == 0 ? ArgShape::SCALAR : ArgShape::ARRAY; }, - [](int64_t key) { return MakeScalar(1); }, MakeRowSegmenter); -} - -TEST(RowSegmenter, ConstantArrayBatchWithAnyKeysSegmenter) { - TestRowSegmenterConstantBatch( - int32(), [](int64_t key) { return ArgShape::ARRAY; }, - [](int64_t key) { return MakeScalar(1); }, MakeGenericSegmenter); -} - -TEST(RowSegmenter, ConstantScalarBatchWithAnyKeysSegmenter) { - TestRowSegmenterConstantBatch( - int32(), [](int64_t key) { return ArgShape::SCALAR; }, - [](int64_t key) { return MakeScalar(1); }, MakeGenericSegmenter); -} - -TEST(RowSegmenter, ConstantMixedBatchWithAnyKeysSegmenter) { - TestRowSegmenterConstantBatch( - int32(), - [](int64_t key) { return key % 2 == 0 ? 
ArgShape::SCALAR : ArgShape::ARRAY; }, - [](int64_t key) { return MakeScalar(1); }, MakeGenericSegmenter); -} - -TEST(RowSegmenter, ConstantFixedSizeBinaryArrayBatch) { - constexpr int fsb = 8; - auto type = fixed_size_binary(fsb); - ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X'))); - TestRowSegmenterConstantBatch( - type, [](int64_t key) { return ArgShape::ARRAY; }, - [&](int64_t key) { return value; }, MakeRowSegmenter); -} - -TEST(RowSegmenter, ConstantFixedSizeBinaryScalarBatch) { - constexpr int fsb = 8; - auto type = fixed_size_binary(fsb); - ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X'))); - TestRowSegmenterConstantBatch( - fixed_size_binary(8), [](int64_t key) { return ArgShape::SCALAR; }, - [&](int64_t key) { return value; }, MakeRowSegmenter); -} - -TEST(RowSegmenter, ConstantFixedSizeBinaryMixedBatch) { - constexpr int fsb = 8; - auto type = fixed_size_binary(fsb); - ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X'))); - TestRowSegmenterConstantBatch( - fixed_size_binary(8), - [](int64_t key) { return key % 2 == 0 ? 
ArgShape::SCALAR : ArgShape::ARRAY; }, - [&](int64_t key) { return value; }, MakeRowSegmenter); -} - -TEST(RowSegmenter, ConstantFixedSizeBinaryArrayBatchWithAnyKeysSegmenter) { - constexpr int fsb = 8; - auto type = fixed_size_binary(fsb); - ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X'))); - TestRowSegmenterConstantBatch( - type, [](int64_t key) { return ArgShape::ARRAY; }, - [&](int64_t key) { return value; }, MakeGenericSegmenter); -} - -TEST(RowSegmenter, ConstantFixedSizeBinaryScalarBatchWithAnyKeysSegmenter) { - constexpr int fsb = 8; - auto type = fixed_size_binary(fsb); - ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X'))); - TestRowSegmenterConstantBatch( - fixed_size_binary(8), [](int64_t key) { return ArgShape::SCALAR; }, - [&](int64_t key) { return value; }, MakeGenericSegmenter); -} - -TEST(RowSegmenter, ConstantFixedSizeBinaryMixedBatchWithAnyKeysSegmenter) { - constexpr int fsb = 8; - auto type = fixed_size_binary(fsb); - ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X'))); - TestRowSegmenterConstantBatch( - fixed_size_binary(8), - [](int64_t key) { return key % 2 == 0 ? 
ArgShape::SCALAR : ArgShape::ARRAY; }, - [&](int64_t key) { return value; }, MakeGenericSegmenter); -} - -TEST(RowSegmenter, ConstantDictionaryArrayBatch) { - auto index_type = int32(); - auto value_type = utf8(); - auto dict_type = dictionary(index_type, value_type); - auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])"); - ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0)); - auto dict_value = DictionaryScalar::Make(std::move(index_value), dict); - TestRowSegmenterConstantBatch( - dict_type, [](int64_t key) { return ArgShape::ARRAY; }, - [&](int64_t key) { return dict_value; }, MakeRowSegmenter); -} - -TEST(RowSegmenter, ConstantDictionaryScalarBatch) { - auto index_type = int32(); - auto value_type = utf8(); - auto dict_type = dictionary(index_type, value_type); - auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])"); - ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0)); - auto dict_value = DictionaryScalar::Make(std::move(index_value), dict); - TestRowSegmenterConstantBatch( - dict_type, [](int64_t key) { return ArgShape::SCALAR; }, - [&](int64_t key) { return dict_value; }, MakeRowSegmenter); -} - -TEST(RowSegmenter, ConstantDictionaryMixedBatch) { - auto index_type = int32(); - auto value_type = utf8(); - auto dict_type = dictionary(index_type, value_type); - auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])"); - ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0)); - auto dict_value = DictionaryScalar::Make(std::move(index_value), dict); - TestRowSegmenterConstantBatch( - dict_type, - [](int64_t key) { return key % 2 == 0 ? 
ArgShape::SCALAR : ArgShape::ARRAY; }, - [&](int64_t key) { return dict_value; }, MakeRowSegmenter); -} - -TEST(RowSegmenter, ConstantDictionaryArrayBatchWithAnyKeysSegmenter) { - auto index_type = int32(); - auto value_type = utf8(); - auto dict_type = dictionary(index_type, value_type); - auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])"); - ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0)); - auto dict_value = DictionaryScalar::Make(std::move(index_value), dict); - TestRowSegmenterConstantBatch( - dict_type, [](int64_t key) { return ArgShape::ARRAY; }, - [&](int64_t key) { return dict_value; }, MakeGenericSegmenter); -} - -TEST(RowSegmenter, ConstantDictionaryScalarBatchWithAnyKeysSegmenter) { - auto index_type = int32(); - auto value_type = utf8(); - auto dict_type = dictionary(index_type, value_type); - auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])"); - ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0)); - auto dict_value = DictionaryScalar::Make(std::move(index_value), dict); - TestRowSegmenterConstantBatch( - dict_type, [](int64_t key) { return ArgShape::SCALAR; }, - [&](int64_t key) { return dict_value; }, MakeGenericSegmenter); -} - -TEST(RowSegmenter, ConstantDictionaryMixedBatchWithAnyKeysSegmenter) { - auto index_type = int32(); - auto value_type = utf8(); - auto dict_type = dictionary(index_type, value_type); - auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])"); - ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0)); - auto dict_value = DictionaryScalar::Make(std::move(index_value), dict); - TestRowSegmenterConstantBatch( - dict_type, - [](int64_t key) { return key % 2 == 0 ? 
ArgShape::SCALAR : ArgShape::ARRAY; }, - [&](int64_t key) { return dict_value; }, MakeGenericSegmenter); -} - -TEST(RowSegmenter, RowConstantBatch) { - constexpr size_t n = 3; - std::vector types = {int32(), int32(), int32()}; - auto full_batch = ExecBatchFromJSON(types, "[[1, 1, 1], [2, 2, 2], [3, 3, 3]]"); - std::vector expected_segments_for_size_0 = {{0, 3, true, true}}; - std::vector expected_segments = { - {0, 1, false, true}, {1, 1, false, false}, {2, 1, true, false}}; - auto test_by_size = [&](size_t size) -> Status { - SCOPED_TRACE("constant-batch with " + ToChars(size) + " key(s)"); - std::vector values(full_batch.values.begin(), - full_batch.values.begin() + size); - ExecBatch batch(values, full_batch.length); - std::vector key_types(types.begin(), types.begin() + size); - ARROW_ASSIGN_OR_RAISE(auto segmenter, MakeRowSegmenter(key_types)); - TestSegments(segmenter, ExecSpan(batch), - size == 0 ? expected_segments_for_size_0 : expected_segments); - return Status::OK(); - }; - for (size_t i = 0; i <= n; i++) { - ASSERT_OK(test_by_size(i)); - } -} - -TEST(Grouper, SupportedKeys) { TestGroupClassSupportedKeys(MakeGrouper); } - -struct TestGrouper { - explicit TestGrouper(std::vector types, std::vector shapes = {}) - : types_(std::move(types)), shapes_(std::move(shapes)) { - grouper_ = Grouper::Make(types_).ValueOrDie(); - - FieldVector fields; - for (const auto& type : types_) { - fields.push_back(field("", type.GetSharedPtr())); - } - key_schema_ = schema(std::move(fields)); - } - - void ExpectConsume(const std::string& key_json, const std::string& expected) { - auto expected_arr = ArrayFromJSON(uint32(), expected); - if (shapes_.size() > 0) { - ExpectConsume(ExecBatchFromJSON(types_, shapes_, key_json), expected_arr); - } else { - ExpectConsume(ExecBatchFromJSON(types_, key_json), expected_arr); - } - } - - void ExpectConsume(const std::vector& key_values, Datum expected) { - ASSERT_OK_AND_ASSIGN(auto key_batch, ExecBatch::Make(key_values)); - 
ExpectConsume(key_batch, expected); - } - - void ExpectConsume(const ExecBatch& key_batch, Datum expected) { - Datum ids; - ConsumeAndValidate(key_batch, &ids); - AssertEquivalentIds(expected, ids); - } - - void ExpectUniques(const ExecBatch& uniques) { - EXPECT_THAT(grouper_->GetUniques(), ResultWith(Eq(uniques))); - } - - void ExpectUniques(const std::string& uniques_json) { - if (shapes_.size() > 0) { - ExpectUniques(ExecBatchFromJSON(types_, shapes_, uniques_json)); - } else { - ExpectUniques(ExecBatchFromJSON(types_, uniques_json)); - } - } - - void AssertEquivalentIds(const Datum& expected, const Datum& actual) { - auto left = expected.make_array(); - auto right = actual.make_array(); - ASSERT_EQ(left->length(), right->length()) << "#ids unequal"; - int64_t num_ids = left->length(); - auto left_data = left->data(); - auto right_data = right->data(); - auto left_ids = reinterpret_cast(left_data->buffers[1]->data()); - auto right_ids = reinterpret_cast(right_data->buffers[1]->data()); - uint32_t max_left_id = 0; - uint32_t max_right_id = 0; - for (int64_t i = 0; i < num_ids; ++i) { - if (left_ids[i] > max_left_id) { - max_left_id = left_ids[i]; - } - if (right_ids[i] > max_right_id) { - max_right_id = right_ids[i]; - } - } - std::vector right_to_left_present(max_right_id + 1, false); - std::vector left_to_right_present(max_left_id + 1, false); - std::vector right_to_left(max_right_id + 1); - std::vector left_to_right(max_left_id + 1); - for (int64_t i = 0; i < num_ids; ++i) { - uint32_t left_id = left_ids[i]; - uint32_t right_id = right_ids[i]; - if (!left_to_right_present[left_id]) { - left_to_right[left_id] = right_id; - left_to_right_present[left_id] = true; - } - if (!right_to_left_present[right_id]) { - right_to_left[right_id] = left_id; - right_to_left_present[right_id] = true; - } - ASSERT_EQ(left_id, right_to_left[right_id]); - ASSERT_EQ(right_id, left_to_right[left_id]); - } - } - - void ConsumeAndValidate(const ExecBatch& key_batch, Datum* ids = 
nullptr) { - ASSERT_OK_AND_ASSIGN(Datum id_batch, grouper_->Consume(ExecSpan(key_batch))); - - ValidateConsume(key_batch, id_batch); - - if (ids) { - *ids = std::move(id_batch); - } - } - - void ValidateConsume(const ExecBatch& key_batch, const Datum& id_batch) { - if (uniques_.length == -1) { - ASSERT_OK_AND_ASSIGN(uniques_, grouper_->GetUniques()); - } else if (static_cast(grouper_->num_groups()) > uniques_.length) { - ASSERT_OK_AND_ASSIGN(ExecBatch new_uniques, grouper_->GetUniques()); - - // check that uniques_ are prefixes of new_uniques - for (int i = 0; i < uniques_.num_values(); ++i) { - auto new_unique = new_uniques[i].make_array(); - ValidateOutput(*new_unique); - - AssertDatumsEqual(uniques_[i], new_unique->Slice(0, uniques_.length), - /*verbose=*/true); - } - - uniques_ = std::move(new_uniques); - } - - // check that the ids encode an equivalent key sequence - auto ids = id_batch.make_array(); - ValidateOutput(*ids); - - for (int i = 0; i < key_batch.num_values(); ++i) { - SCOPED_TRACE(ToChars(i) + "th key array"); - auto original = - key_batch[i].is_array() - ? 
key_batch[i].make_array() - : *MakeArrayFromScalar(*key_batch[i].scalar(), key_batch.length); - ASSERT_OK_AND_ASSIGN(auto encoded, Take(*uniques_[i].make_array(), *ids)); - AssertArraysEqual(*original, *encoded, /*verbose=*/true, - EqualOptions().nans_equal(true)); - } - } - - std::vector types_; - std::vector shapes_; - std::shared_ptr key_schema_; - std::unique_ptr grouper_; - ExecBatch uniques_ = ExecBatch({}, -1); -}; - -TEST(Grouper, BooleanKey) { - TestGrouper g({boolean()}); - - g.ExpectConsume("[[true], [true]]", "[0, 0]"); - - g.ExpectConsume("[[true], [true]]", "[0, 0]"); - - g.ExpectConsume("[[false], [null]]", "[1, 2]"); - - g.ExpectConsume("[[true], [false], [true], [false], [null], [false], [null]]", - "[0, 1, 0, 1, 2, 1, 2]"); -} - -TEST(Grouper, NumericKey) { - for (auto ty : { - uint8(), - int8(), - uint16(), - int16(), - uint32(), - int32(), - uint64(), - int64(), - float16(), - float32(), - float64(), - }) { - SCOPED_TRACE("key type: " + ty->ToString()); - - TestGrouper g({ty}); - - g.ExpectConsume("[[3], [3]]", "[0, 0]"); - g.ExpectUniques("[[3]]"); - - g.ExpectConsume("[[3], [3]]", "[0, 0]"); - g.ExpectUniques("[[3]]"); - - g.ExpectConsume("[[27], [81], [81]]", "[1, 2, 2]"); - g.ExpectUniques("[[3], [27], [81]]"); - - g.ExpectConsume("[[3], [27], [3], [27], [null], [81], [27], [81]]", - "[0, 1, 0, 1, 3, 2, 1, 2]"); - g.ExpectUniques("[[3], [27], [81], [null]]"); - } -} - -TEST(Grouper, FloatingPointKey) { - TestGrouper g({float32()}); - - // -0.0 hashes differently from 0.0 - g.ExpectConsume("[[0.0], [-0.0]]", "[0, 1]"); - - g.ExpectConsume("[[Inf], [-Inf]]", "[2, 3]"); - - // assert(!(NaN == NaN)) does not cause spurious new groups - g.ExpectConsume("[[NaN], [NaN]]", "[4, 4]"); - - // TODO(bkietz) test denormal numbers, more NaNs -} - -TEST(Grouper, StringKey) { - for (auto ty : {utf8(), large_utf8(), fixed_size_binary(2)}) { - SCOPED_TRACE("key type: " + ty->ToString()); - - TestGrouper g({ty}); - - g.ExpectConsume(R"([["eh"], ["eh"]])", "[0, 
0]"); - - g.ExpectConsume(R"([["eh"], ["eh"]])", "[0, 0]"); - - g.ExpectConsume(R"([["be"], [null]])", "[1, 2]"); - } -} - -TEST(Grouper, DictKey) { - TestGrouper g({dictionary(int32(), utf8())}); - - // For dictionary keys, all batches must share a single dictionary. - // Eventually, differing dictionaries will be unified and indices transposed - // during encoding to relieve this restriction. - const auto dict = ArrayFromJSON(utf8(), R"(["ex", "why", "zee", null])"); - - auto WithIndices = [&](const std::string& indices) { - return Datum(*DictionaryArray::FromArrays(ArrayFromJSON(int32(), indices), dict)); - }; - - // NB: null index is not considered equivalent to index=3 (which encodes null in dict) - g.ExpectConsume({WithIndices(" [3, 1, null, 0, 2]")}, - ArrayFromJSON(uint32(), "[0, 1, 2, 3, 4]")); - - g = TestGrouper({dictionary(int32(), utf8())}); - - g.ExpectConsume({WithIndices(" [0, 1, 2, 3, null]")}, - ArrayFromJSON(uint32(), "[0, 1, 2, 3, 4]")); - - g.ExpectConsume({WithIndices(" [3, 1, null, 0, 2]")}, - ArrayFromJSON(uint32(), "[3, 1, 4, 0, 2]")); - - auto dict_arr = *DictionaryArray::FromArrays( - ArrayFromJSON(int32(), "[0, 1]"), - ArrayFromJSON(utf8(), R"(["different", "dictionary"])")); - ExecSpan dict_span({*dict_arr->data()}, 2); - EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented, - HasSubstr("Unifying differing dictionaries"), - g.grouper_->Consume(dict_span)); -} - -TEST(Grouper, StringInt64Key) { - TestGrouper g({utf8(), int64()}); - - g.ExpectConsume(R"([["eh", 0], ["eh", 0]])", "[0, 0]"); - - g.ExpectConsume(R"([["eh", 0], ["eh", null]])", "[0, 1]"); - - g.ExpectConsume(R"([["eh", 1], ["bee", 1]])", "[2, 3]"); - - g.ExpectConsume(R"([["eh", null], ["bee", 1]])", "[1, 3]"); - - g = TestGrouper({utf8(), int64()}); - - g.ExpectConsume(R"([ - ["ex", 0], - ["ex", 0], - ["why", 0], - ["ex", 1], - ["why", 0], - ["ex", 1], - ["ex", 0], - ["why", 1] - ])", - "[0, 0, 1, 2, 1, 2, 0, 3]"); - - g.ExpectConsume(R"([ - ["ex", 0], - [null, 0], - [null, 0], - 
["ex", 1], - [null, null], - ["ex", 1], - ["ex", 0], - ["why", null] - ])", - "[0, 4, 4, 2, 5, 2, 0, 6]"); -} - -TEST(Grouper, DoubleStringInt64Key) { - TestGrouper g({float64(), utf8(), int64()}); - - g.ExpectConsume(R"([[1.5, "eh", 0], [1.5, "eh", 0]])", "[0, 0]"); - - g.ExpectConsume(R"([[1.5, "eh", 0], [1.5, "eh", 0]])", "[0, 0]"); - - g.ExpectConsume(R"([[1.0, "eh", 0], [1.0, "be", null]])", "[1, 2]"); - - // note: -0 and +0 hash differently - g.ExpectConsume(R"([[-0.0, "be", 7], [0.0, "be", 7]])", "[3, 4]"); -} - -TEST(Grouper, RandomInt64Keys) { - TestGrouper g({int64()}); - for (int i = 0; i < 4; ++i) { - SCOPED_TRACE(ToChars(i) + "th key batch"); - - ExecBatch key_batch{ - *random::GenerateBatch(g.key_schema_->fields(), 1 << 12, 0xDEADBEEF)}; - g.ConsumeAndValidate(key_batch); - } -} - -TEST(Grouper, RandomStringInt64Keys) { - TestGrouper g({utf8(), int64()}); - for (int i = 0; i < 4; ++i) { - SCOPED_TRACE(ToChars(i) + "th key batch"); - - ExecBatch key_batch{ - *random::GenerateBatch(g.key_schema_->fields(), 1 << 12, 0xDEADBEEF)}; - g.ConsumeAndValidate(key_batch); - } -} - -TEST(Grouper, RandomStringInt64DoubleInt32Keys) { - TestGrouper g({utf8(), int64(), float64(), int32()}); - for (int i = 0; i < 4; ++i) { - SCOPED_TRACE(ToChars(i) + "th key batch"); - - ExecBatch key_batch{ - *random::GenerateBatch(g.key_schema_->fields(), 1 << 12, 0xDEADBEEF)}; - g.ConsumeAndValidate(key_batch); - } -} - -TEST(Grouper, NullKeys) { - TestGrouper g({null()}); - g.ExpectConsume("[[null], [null]]", "[0, 0]"); -} - -TEST(Grouper, MultipleNullKeys) { - TestGrouper g({null(), null(), null(), null()}); - g.ExpectConsume("[[null, null, null, null], [null, null, null, null]]", "[0, 0]"); -} - -TEST(Grouper, Int64NullKeys) { - TestGrouper g({int64(), null()}); - g.ExpectConsume("[[1, null], [2, null], [1, null]]", "[0, 1, 0]"); -} - -TEST(Grouper, StringNullKeys) { - TestGrouper g({utf8(), null()}); - g.ExpectConsume(R"([["be", null], ["eh", null]])", "[0, 1]"); -} - 
-TEST(Grouper, DoubleNullStringKey) { - TestGrouper g({float64(), null(), utf8()}); - - g.ExpectConsume(R"([[1.5, null, "eh"], [1.5, null, "eh"]])", "[0, 0]"); - g.ExpectConsume(R"([[null, null, "eh"], [1.0, null, null]])", "[1, 2]"); - g.ExpectConsume(R"([ - [1.0, null, "wh"], - [4.4, null, null], - [5.2, null, "eh"], - [6.5, null, "be"], - [7.3, null, null], - [1.0, null, "wh"], - [9.1, null, "eh"], - [10.2, null, "be"], - [1.0, null, null] - ])", - "[3, 4, 5, 6, 7, 3, 8, 9, 2]"); -} - -TEST(Grouper, EmptyNullKeys) { - TestGrouper g({null()}); - g.ExpectConsume("[]", "[]"); -} - -TEST(Grouper, MakeGroupings) { - auto ExpectGroupings = [](std::string ids_json, std::string expected_json) { - auto ids = checked_pointer_cast(ArrayFromJSON(uint32(), ids_json)); - auto expected = ArrayFromJSON(list(int32()), expected_json); - - auto num_groups = static_cast(expected->length()); - ASSERT_OK_AND_ASSIGN(auto actual, Grouper::MakeGroupings(*ids, num_groups)); - AssertArraysEqual(*expected, *actual, /*verbose=*/true); - - // validate ApplyGroupings - ASSERT_OK_AND_ASSIGN(auto grouped_ids, Grouper::ApplyGroupings(*actual, *ids)); - - for (uint32_t group = 0; group < num_groups; ++group) { - auto ids_slice = checked_pointer_cast(grouped_ids->value_slice(group)); - for (auto slot : *ids_slice) { - EXPECT_EQ(slot, group); - } - } - }; - - ExpectGroupings("[]", "[[]]"); - - ExpectGroupings("[0, 0, 0]", "[[0, 1, 2]]"); - - ExpectGroupings("[0, 0, 0, 1, 1, 2]", "[[0, 1, 2], [3, 4], [5], []]"); - - ExpectGroupings("[2, 1, 2, 1, 1, 2]", "[[], [1, 3, 4], [0, 2, 5], [], []]"); - - ExpectGroupings("[2, 2, 5, 5, 2, 3]", "[[], [], [0, 1, 4], [5], [], [2, 3], [], []]"); - - auto ids = checked_pointer_cast(ArrayFromJSON(uint32(), "[0, null, 1]")); - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("MakeGroupings with null ids"), - Grouper::MakeGroupings(*ids, 5)); -} - -TEST(Grouper, ScalarValues) { - // large_utf8 forces GrouperImpl over GrouperFastImpl - for (const auto& str_type : 
{utf8(), large_utf8()}) { - { - TestGrouper g( - {boolean(), int32(), decimal128(3, 2), decimal256(3, 2), fixed_size_binary(2), - str_type, int32()}, - {ArgShape::SCALAR, ArgShape::SCALAR, ArgShape::SCALAR, ArgShape::SCALAR, - ArgShape::SCALAR, ArgShape::SCALAR, ArgShape::ARRAY}); - g.ExpectConsume( - R"([ -[true, 1, "1.00", "2.00", "ab", "foo", 2], -[true, 1, "1.00", "2.00", "ab", "foo", 2], -[true, 1, "1.00", "2.00", "ab", "foo", 3] -])", - "[0, 0, 1]"); - } - { - auto dict_type = dictionary(int32(), utf8()); - TestGrouper g({dict_type, str_type}, {ArgShape::SCALAR, ArgShape::SCALAR}); - const auto dict = R"(["foo", null])"; - g.ExpectConsume( - {DictScalarFromJSON(dict_type, "0", dict), ScalarFromJSON(str_type, R"("")")}, - ArrayFromJSON(uint32(), "[0]")); - g.ExpectConsume( - {DictScalarFromJSON(dict_type, "1", dict), ScalarFromJSON(str_type, R"("")")}, - ArrayFromJSON(uint32(), "[1]")); - } - } -} - void TestSegmentKey(GroupByFunction group_by, const std::shared_ptr& table, Datum output, const std::vector& segment_keys); diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 76ad9c7d650eb..654fd59c45d5a 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -26,9 +26,9 @@ #include "arrow/acero/test_util_internal.h" #include "arrow/acero/util.h" #include "arrow/api.h" -#include "arrow/compute/kernels/test_util.h" #include "arrow/compute/light_array_internal.h" #include "arrow/compute/row/row_encoder_internal.h" +#include "arrow/compute/test_util_internal.h" #include "arrow/extension/uuid.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/generator.h" @@ -49,6 +49,7 @@ using compute::and_; using compute::call; using compute::default_exec_context; using compute::ExecBatchBuilder; +using compute::ExecBatchFromJSON; using compute::ExecSpan; using compute::field_ref; using compute::SortIndices; @@ -2350,7 +2351,7 @@ TEST(HashJoin, 
FineGrainedResidualFilter) { auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ [null, null, "r_payload"], [null, 0, "r_payload"], - [null, 42, "r_payload"], + [null, 42, "r_payload"], ["both1", null, "r_payload"], ["both2", null, "r_payload"], ["right_only", null, "r_payload"], @@ -2519,7 +2520,7 @@ TEST(HashJoin, FineGrainedResidualFilter) { auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ [null, null, "r_payload"], [null, 0, "r_payload"], - [null, 42, "r_payload"], + [null, 42, "r_payload"], ["both1", null, "r_payload"], ["both1", 0, "r_payload"], ["both1", 42, "r_payload"], @@ -2704,7 +2705,7 @@ TEST(HashJoin, FineGrainedResidualFilter) { auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ [null, null, "r_payload"], [null, 0, "r_payload"], - [null, 42, "r_payload"], + [null, 42, "r_payload"], ["right_only", null, "r_payload"], ["right_only", 0, "r_payload"], ["right_only", 42, "r_payload"]])"); @@ -2879,7 +2880,7 @@ TEST(HashJoin, FineGrainedResidualFilter) { auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ [null, null, "r_payload"], [null, 0, "r_payload"], - [null, 42, "r_payload"], + [null, 42, "r_payload"], ["both1", null, "r_payload"], ["both1", 0, "r_payload"], ["both2", null, "r_payload"], @@ -3054,7 +3055,7 @@ TEST(HashJoin, FineGrainedResidualFilter) { auto expected = ExecBatchFromJSON({utf8(), int32(), utf8()}, R"([ [null, null, "r_payload"], [null, 0, "r_payload"], - [null, 42, "r_payload"], + [null, 42, "r_payload"], ["both1", null, "r_payload"], ["both2", null, "r_payload"], ["right_only", null, "r_payload"], @@ -3370,8 +3371,10 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBVarLength)) { constexpr int value_no_match_length_min = 128; constexpr int value_no_match_length_max = 129; constexpr int value_match_length = 130; + // The value "DDD..." will be hashed to the partition over 4GB of the hash table. + // Matching at this area gives us more coverage. 
const auto value_match = - std::make_shared(std::string(value_match_length, 'X')); + std::make_shared(std::string(value_match_length, 'D')); constexpr int16_t num_rows_per_batch_left = 128; constexpr int16_t num_rows_per_batch_right = 4096; const int64_t num_batches_left = 8; @@ -3446,5 +3449,104 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBVarLength)) { num_batches_left * num_rows_per_batch_left * num_batches_right); } +// GH-45334: The row ids of the matching rows on the right side (the build side) are very +// big, causing the index calculation overflow. +TEST(HashJoin, BuildSideLargeRowIds) { + GTEST_SKIP() << "Test disabled due to excessively time and resource consuming, " + "for local debugging only."; + + // A fair amount of match rows to trigger both SIMD and non-SIMD code paths. + const int64_t num_match_rows = 35; + const int64_t num_rows_per_match_batch = 35; + const int64_t num_match_batches = num_match_rows / num_rows_per_match_batch; + + const int64_t num_unmatch_rows_large = 720898048; + const int64_t num_rows_per_unmatch_batch_large = 352001; + const int64_t num_unmatch_batches_large = + num_unmatch_rows_large / num_rows_per_unmatch_batch_large; + + auto schema_small = + schema({field("small_key", int64()), field("small_payload", int64())}); + auto schema_large = + schema({field("large_key", int64()), field("large_payload", int64())}); + + // A carefully chosen key value which hashes to 0xFFFFFFFE, making the match rows to be + // placed at higher address of the row table. + const int64_t match_key = 289339070; + const int64_t match_payload = 42; + + // Match arrays of length num_rows_per_match_batch. + ASSERT_OK_AND_ASSIGN( + auto match_key_arr, + Constant(MakeScalar(match_key))->Generate(num_rows_per_match_batch)); + ASSERT_OK_AND_ASSIGN( + auto match_payload_arr, + Constant(MakeScalar(match_payload))->Generate(num_rows_per_match_batch)); + // Append 1 row of null to trigger null processing code paths. 
+ ASSERT_OK_AND_ASSIGN(auto null_arr, MakeArrayOfNull(int64(), 1)); + ASSERT_OK_AND_ASSIGN(match_key_arr, Concatenate({match_key_arr, null_arr})); + ASSERT_OK_AND_ASSIGN(match_payload_arr, Concatenate({match_payload_arr, null_arr})); + // Match batch. + ExecBatch match_batch({match_key_arr, match_payload_arr}, num_rows_per_match_batch + 1); + + // Small batch. + ExecBatch batch_small = match_batch; + + // Large unmatch batches. + const int64_t seed = 42; + std::vector unmatch_batches_large; + unmatch_batches_large.reserve(num_unmatch_batches_large); + ASSERT_OK_AND_ASSIGN(auto unmatch_payload_arr_large, + MakeArrayOfNull(int64(), num_rows_per_unmatch_batch_large)); + int64_t unmatch_range_per_batch = + (std::numeric_limits::max() - match_key) / num_unmatch_batches_large; + for (int i = 0; i < num_unmatch_batches_large; ++i) { + auto unmatch_key_arr_large = RandomArrayGenerator(seed).Int64( + num_rows_per_unmatch_batch_large, + /*min=*/match_key + 1 + i * unmatch_range_per_batch, + /*max=*/match_key + 1 + (i + 1) * unmatch_range_per_batch); + unmatch_batches_large.push_back( + ExecBatch({unmatch_key_arr_large, unmatch_payload_arr_large}, + num_rows_per_unmatch_batch_large)); + } + // Large match batch. + ExecBatch match_batch_large = match_batch; + + // Batches with schemas. 
+ auto batches_small = BatchesWithSchema{ + std::vector(num_match_batches, batch_small), schema_small}; + auto batches_large = BatchesWithSchema{std::move(unmatch_batches_large), schema_large}; + for (int i = 0; i < num_match_batches; i++) { + batches_large.batches.push_back(match_batch_large); + } + + Declaration source_small{ + "exec_batch_source", + ExecBatchSourceNodeOptions(batches_small.schema, batches_small.batches)}; + Declaration source_large{ + "exec_batch_source", + ExecBatchSourceNodeOptions(batches_large.schema, batches_large.batches)}; + + HashJoinNodeOptions join_opts(JoinType::INNER, /*left_keys=*/{"small_key"}, + /*right_keys=*/{"large_key"}); + Declaration join{ + "hashjoin", {std::move(source_small), std::move(source_large)}, join_opts}; + + // Join should emit num_match_rows * num_match_rows rows. + ASSERT_OK_AND_ASSIGN(auto batches_result, DeclarationToExecBatches(std::move(join))); + Declaration result{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_result.schema), + std::move(batches_result.batches))}; + AssertRowCountEq(result, num_match_rows * num_match_rows); + + // All rows should be match_key/payload. 
+ auto predicate = and_({equal(field_ref("small_key"), literal(match_key)), + equal(field_ref("small_payload"), literal(match_payload)), + equal(field_ref("large_key"), literal(match_key)), + equal(field_ref("large_payload"), literal(match_payload))}); + Declaration filter{"filter", {result}, FilterNodeOptions{std::move(predicate)}}; + AssertRowCountEq(std::move(filter), num_match_rows * num_match_rows); +} + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/plan_test.cc b/cpp/src/arrow/acero/plan_test.cc index e74ad6a6665a4..61ab09f6674d9 100644 --- a/cpp/src/arrow/acero/plan_test.cc +++ b/cpp/src/arrow/acero/plan_test.cc @@ -27,6 +27,7 @@ #include "arrow/acero/util.h" #include "arrow/compute/exec.h" #include "arrow/compute/expression.h" +#include "arrow/compute/test_util_internal.h" #include "arrow/io/util_internal.h" #include "arrow/record_batch.h" #include "arrow/table.h" @@ -51,8 +52,10 @@ using testing::UnorderedElementsAreArray; namespace arrow { +using compute::ArgShape; using compute::call; using compute::CountOptions; +using compute::ExecBatchFromJSON; using compute::field_ref; using compute::ScalarAggregateOptions; using compute::SortKey; diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index c068eeb50ff0a..85e14ac469ce7 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -439,11 +439,11 @@ Status RowArrayMerge::PrepareForMerge(RowArray* target, num_rows = 0; num_bytes = 0; for (size_t i = 0; i < sources.size(); ++i) { - target->rows_.mutable_offsets()[num_rows] = static_cast(num_bytes); + target->rows_.mutable_offsets()[num_rows] = num_bytes; num_rows += sources[i]->rows_.length(); num_bytes += sources[i]->rows_.offsets()[sources[i]->rows_.length()]; } - target->rows_.mutable_offsets()[num_rows] = static_cast(num_bytes); + target->rows_.mutable_offsets()[num_rows] = num_bytes; } return Status::OK(); @@ -477,14 +477,15 @@ void 
RowArrayMerge::CopyFixedLength(RowTableImpl* target, const RowTableImpl& so const int64_t* source_rows_permutation) { int64_t num_source_rows = source.length(); - int64_t fixed_length = target->metadata().fixed_length; + uint32_t fixed_length = target->metadata().fixed_length; // Permutation of source rows is optional. Without permutation all that is // needed is memcpy. // if (!source_rows_permutation) { - memcpy(target->mutable_data(1) + fixed_length * first_target_row_id, source.data(1), - fixed_length * num_source_rows); + DCHECK_LE(first_target_row_id, std::numeric_limits::max()); + memcpy(target->mutable_fixed_length_rows(static_cast(first_target_row_id)), + source.fixed_length_rows(/*row_id=*/0), fixed_length * num_source_rows); } else { // Row length must be a multiple of 64-bits due to enforced alignment. // Loop for each output row copying a fixed number of 64-bit words. @@ -494,10 +495,13 @@ void RowArrayMerge::CopyFixedLength(RowTableImpl* target, const RowTableImpl& so int64_t num_words_per_row = fixed_length / sizeof(uint64_t); for (int64_t i = 0; i < num_source_rows; ++i) { int64_t source_row_id = source_rows_permutation[i]; + DCHECK_LE(source_row_id, std::numeric_limits::max()); const uint64_t* source_row_ptr = reinterpret_cast( - source.data(1) + fixed_length * source_row_id); + source.fixed_length_rows(static_cast(source_row_id))); + int64_t target_row_id = first_target_row_id + i; + DCHECK_LE(target_row_id, std::numeric_limits::max()); uint64_t* target_row_ptr = reinterpret_cast( - target->mutable_data(1) + fixed_length * (first_target_row_id + i)); + target->mutable_fixed_length_rows(static_cast(target_row_id))); for (int64_t word = 0; word < num_words_per_row; ++word) { target_row_ptr[word] = source_row_ptr[word]; @@ -529,16 +533,16 @@ void RowArrayMerge::CopyVaryingLength(RowTableImpl* target, const RowTableImpl& // We can simply memcpy bytes of rows if their order has not changed. 
// - memcpy(target->mutable_data(2) + target_offsets[first_target_row_id], source.data(2), - source_offsets[num_source_rows] - source_offsets[0]); + memcpy(target->mutable_var_length_rows() + target_offsets[first_target_row_id], + source.var_length_rows(), source_offsets[num_source_rows] - source_offsets[0]); } else { int64_t target_row_offset = first_target_row_offset; - uint64_t* target_row_ptr = - reinterpret_cast(target->mutable_data(2) + target_row_offset); + uint64_t* target_row_ptr = reinterpret_cast( + target->mutable_var_length_rows() + target_row_offset); for (int64_t i = 0; i < num_source_rows; ++i) { int64_t source_row_id = source_rows_permutation[i]; const uint64_t* source_row_ptr = reinterpret_cast( - source.data(2) + source_offsets[source_row_id]); + source.var_length_rows() + source_offsets[source_row_id]); int64_t length = source_offsets[source_row_id + 1] - source_offsets[source_row_id]; // Though the row offset is 64-bit, the length of a single row must be 32-bit as // required by current row table implementation. 
@@ -564,14 +568,18 @@ void RowArrayMerge::CopyNulls(RowTableImpl* target, const RowTableImpl& source, const int64_t* source_rows_permutation) { int64_t num_source_rows = source.length(); int num_bytes_per_row = target->metadata().null_masks_bytes_per_row; - uint8_t* target_nulls = target->null_masks() + num_bytes_per_row * first_target_row_id; + DCHECK_LE(first_target_row_id, std::numeric_limits::max()); + uint8_t* target_nulls = + target->mutable_null_masks(static_cast(first_target_row_id)); if (!source_rows_permutation) { - memcpy(target_nulls, source.null_masks(), num_bytes_per_row * num_source_rows); + memcpy(target_nulls, source.null_masks(/*row_id=*/0), + num_bytes_per_row * num_source_rows); } else { - for (int64_t i = 0; i < num_source_rows; ++i) { + for (uint32_t i = 0; i < num_source_rows; ++i) { int64_t source_row_id = source_rows_permutation[i]; + DCHECK_LE(source_row_id, std::numeric_limits::max()); const uint8_t* source_nulls = - source.null_masks() + num_bytes_per_row * source_row_id; + source.null_masks(static_cast(source_row_id)); for (int64_t byte = 0; byte < num_bytes_per_row; ++byte) { *target_nulls++ = *source_nulls++; } diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index 1d6b7eda6e6a0..deeee2a4e110d 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -16,6 +16,7 @@ // under the License. 
#include "arrow/acero/swiss_join_internal.h" +#include "arrow/compute/row/row_util_avx2_internal.h" #include "arrow/util/bit_util.h" #include "arrow/util/simd.h" @@ -46,7 +47,7 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu if (!is_fixed_length_column) { int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); - const uint8_t* row_ptr_base = rows.data(2); + const uint8_t* row_ptr_base = rows.var_length_rows(); const RowTableImpl::offset_type* row_offsets = rows.offsets(); auto row_offsets_i64 = reinterpret_cast(row_offsets); @@ -172,7 +173,7 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu if (is_fixed_length_row) { // Case 3: This is a fixed length column in fixed length row // - const uint8_t* row_ptr_base = rows.data(1); + const uint8_t* row_ptr_base = rows.fixed_length_rows(/*row_id=*/0); for (int i = 0; i < num_rows / kUnroll; ++i) { // Load 8 32-bit row ids. __m256i row_id = @@ -197,7 +198,7 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu } else { // Case 4: This is a fixed length column in varying length row // - const uint8_t* row_ptr_base = rows.data(2); + const uint8_t* row_ptr_base = rows.var_length_rows(); const RowTableImpl::offset_type* row_offsets = rows.offsets(); auto row_offsets_i64 = reinterpret_cast(row_offsets); @@ -237,31 +238,12 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id, // constexpr int kUnroll = 8; - const uint8_t* null_masks = rows.null_masks(); - __m256i null_bits_per_row = - _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row); - __m256i pos_after_encoding = - _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id)); + uint32_t pos_after_encoding = rows.metadata().pos_after_encoding(column_id); for (int i = 0; i < num_rows / kUnroll; ++i) { __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); - __m256i bit_id = _mm256_mullo_epi32(row_id, 
null_bits_per_row); - bit_id = _mm256_add_epi32(bit_id, pos_after_encoding); - __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast(null_masks), - _mm256_srli_epi32(bit_id, 3), 1); - __m256i bit_in_word = _mm256_sllv_epi32( - _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7))); - // `result` will contain one 32-bit word per tested null bit, either 0xffffffff if the - // null bit was set or 0 if it was unset. - __m256i result = - _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word); - // NB: Be careful about sign-extension when casting the return value of - // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the - // higher bits of the following OR. - uint32_t null_bytes_lo = static_cast( - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result)))); - uint64_t null_bytes_hi = - _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))); - uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32); + __m256i null32 = GetNullBitInt32(rows, pos_after_encoding, row_id); + null32 = _mm256_cmpeq_epi32(null32, _mm256_set1_epi32(1)); + uint64_t null_bytes = arrow::compute::Cmp32To8(null32); process_8_values_fn(i * kUnroll, null_bytes); } diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index f2f3ac5b1bf93..85f443b0323c7 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -72,7 +72,7 @@ class RowArrayAccessor { if (!is_fixed_length_column) { int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); - const uint8_t* row_ptr_base = rows.data(2); + const uint8_t* row_ptr_base = rows.var_length_rows(); const RowTableImpl::offset_type* row_offsets = rows.offsets(); uint32_t field_offset_within_row, field_length; @@ -108,22 +108,21 @@ class RowArrayAccessor { if (field_length == 0) { field_length = 1; } - uint32_t row_length = rows.metadata().fixed_length; 
bool is_fixed_length_row = rows.metadata().is_fixed_length; if (is_fixed_length_row) { // Case 3: This is a fixed length column in a fixed length row // - const uint8_t* row_ptr_base = rows.data(1) + field_offset_within_row; for (int i = 0; i < num_rows; ++i) { uint32_t row_id = row_ids[i]; - const uint8_t* row_ptr = row_ptr_base + row_length * row_id; + const uint8_t* row_ptr = + rows.fixed_length_rows(row_id) + field_offset_within_row; process_value_fn(i, row_ptr, field_length); } } else { // Case 4: This is a fixed length column in a varying length row // - const uint8_t* row_ptr_base = rows.data(2) + field_offset_within_row; + const uint8_t* row_ptr_base = rows.var_length_rows() + field_offset_within_row; const RowTableImpl::offset_type* row_offsets = rows.offsets(); for (int i = 0; i < num_rows; ++i) { uint32_t row_id = row_ids[i]; @@ -142,13 +141,10 @@ class RowArrayAccessor { template static void VisitNulls(const RowTableImpl& rows, int column_id, int num_rows, const uint32_t* row_ids, PROCESS_VALUE_FN process_value_fn) { - const uint8_t* null_masks = rows.null_masks(); - uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; uint32_t pos_after_encoding = rows.metadata().pos_after_encoding(column_id); for (int i = 0; i < num_rows; ++i) { uint32_t row_id = row_ids[i]; - int64_t bit_id = row_id * null_mask_num_bytes * 8 + pos_after_encoding; - process_value_fn(i, bit_util::GetBit(null_masks, bit_id) ? 0xff : 0); + process_value_fn(i, rows.is_null(row_id, pos_after_encoding) ? 
0xff : 0); } } diff --git a/cpp/src/arrow/acero/test_util_internal.cc b/cpp/src/arrow/acero/test_util_internal.cc index 107a20354c0e7..2748d4107ed36 100644 --- a/cpp/src/arrow/acero/test_util_internal.cc +++ b/cpp/src/arrow/acero/test_util_internal.cc @@ -38,6 +38,7 @@ #include "arrow/compute/api_vector.h" #include "arrow/compute/exec.h" #include "arrow/compute/function_internal.h" +#include "arrow/compute/test_util_internal.h" #include "arrow/datum.h" #include "arrow/io/interfaces.h" #include "arrow/record_batch.h" @@ -59,67 +60,12 @@ namespace arrow { using arrow::internal::CpuInfo; using arrow::internal::Executor; +using compute::ExecBatchFromJSON; using compute::SortKey; using compute::Take; namespace acero { -namespace { - -void ValidateOutputImpl(const ArrayData& output) { - ASSERT_OK(::arrow::internal::ValidateArrayFull(output)); - TestInitialized(output); -} - -void ValidateOutputImpl(const ChunkedArray& output) { - ASSERT_OK(output.ValidateFull()); - for (const auto& chunk : output.chunks()) { - TestInitialized(*chunk); - } -} - -void ValidateOutputImpl(const RecordBatch& output) { - ASSERT_OK(output.ValidateFull()); - for (const auto& column : output.column_data()) { - TestInitialized(*column); - } -} - -void ValidateOutputImpl(const Table& output) { - ASSERT_OK(output.ValidateFull()); - for (const auto& column : output.columns()) { - for (const auto& chunk : column->chunks()) { - TestInitialized(*chunk); - } - } -} - -void ValidateOutputImpl(const Scalar& output) { ASSERT_OK(output.ValidateFull()); } - -} // namespace - -void ValidateOutput(const Datum& output) { - switch (output.kind()) { - case Datum::ARRAY: - ValidateOutputImpl(*output.array()); - break; - case Datum::CHUNKED_ARRAY: - ValidateOutputImpl(*output.chunked_array()); - break; - case Datum::RECORD_BATCH: - ValidateOutputImpl(*output.record_batch()); - break; - case Datum::TABLE: - ValidateOutputImpl(*output.table()); - break; - case Datum::SCALAR: - ValidateOutputImpl(*output.scalar()); - 
break; - default: - break; - } -} - std::vector HardwareFlagsForTesting() { // Acero currently only has AVX2 optimizations return arrow::GetSupportedHardwareFlags({CpuInfo::AVX2}); @@ -199,36 +145,6 @@ ExecNode* MakeDummyNode(ExecPlan* plan, std::string label, std::vector& types, std::string_view json) { - auto fields = ::arrow::internal::MapVector( - [](const TypeHolder& th) { return field("", th.GetSharedPtr()); }, types); - - ExecBatch batch{*RecordBatchFromJSON(schema(std::move(fields)), json)}; - - return batch; -} - -ExecBatch ExecBatchFromJSON(const std::vector& types, - const std::vector& shapes, std::string_view json) { - DCHECK_EQ(types.size(), shapes.size()); - - ExecBatch batch = ExecBatchFromJSON(types, json); - - auto value_it = batch.values.begin(); - for (ArgShape shape : shapes) { - if (shape == ArgShape::SCALAR) { - if (batch.length == 0) { - *value_it = MakeNullScalar(value_it->type()); - } else { - *value_it = value_it->make_array()->GetScalar(0).ValueOrDie(); - } - } - ++value_it; - } - - return batch; -} - Future<> StartAndFinish(ExecPlan* plan) { RETURN_NOT_OK(plan->Validate()); plan->StartProducing(); diff --git a/cpp/src/arrow/acero/test_util_internal.h b/cpp/src/arrow/acero/test_util_internal.h index 569fb1254db4a..2367524a5600c 100644 --- a/cpp/src/arrow/acero/test_util_internal.h +++ b/cpp/src/arrow/acero/test_util_internal.h @@ -36,8 +36,6 @@ namespace arrow::acero { -void ValidateOutput(const Datum& output); - // Enumerate all hardware flags that can be tested on this platform // and would lead to different code paths being tested in Acero. std::vector HardwareFlagsForTesting(); @@ -50,16 +48,6 @@ ExecNode* MakeDummyNode(ExecPlan* plan, std::string label, std::vector& types, std::string_view json); - -/// \brief Shape qualifier for value types. In certain instances -/// (e.g. 
"map_lookup" kernel), an argument may only be a scalar, where in -/// other kernels arguments can be arrays or scalars -enum class ArgShape { ANY, ARRAY, SCALAR }; - -ExecBatch ExecBatchFromJSON(const std::vector& types, - const std::vector& shapes, std::string_view json); - struct BatchesWithSchema { std::vector batches; std::shared_ptr schema; diff --git a/cpp/src/arrow/acero/tpch_node_test.cc b/cpp/src/arrow/acero/tpch_node_test.cc index 17fb43452bc58..f484d6c9d523e 100644 --- a/cpp/src/arrow/acero/tpch_node_test.cc +++ b/cpp/src/arrow/acero/tpch_node_test.cc @@ -27,7 +27,6 @@ #include "arrow/acero/test_util_internal.h" #include "arrow/acero/tpch_node.h" #include "arrow/acero/util.h" -#include "arrow/compute/kernels/test_util.h" #include "arrow/compute/row/row_encoder_internal.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index b9d6c53215b41..b3c314fccc0b3 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -235,7 +235,7 @@ void AssertTableWriteReadEqual(const std::vector>& input_ write_options.compression = Compression::UNCOMPRESSED; #endif write_options.file_version = adapters::orc::FileVersion(0, 11); - write_options.compression_block_size = 32768; + write_options.compression_block_size = 64 * 1024; write_options.row_index_stride = 5000; EXPECT_OK_AND_ASSIGN(auto writer, adapters::orc::ORCFileWriter::Open( buffer_output_stream.get(), write_options)); @@ -272,7 +272,7 @@ void AssertBatchWriteReadEqual( write_options.compression = Compression::UNCOMPRESSED; #endif write_options.file_version = adapters::orc::FileVersion(0, 11); - write_options.compression_block_size = 32768; + write_options.compression_block_size = 64 * 1024; write_options.row_index_stride = 5000; EXPECT_OK_AND_ASSIGN(auto writer, adapters::orc::ORCFileWriter::Open( buffer_output_stream.get(), write_options)); 
@@ -330,7 +330,7 @@ std::unique_ptr CreateWriter(uint64_t stripe_size, liborc::OutputStream* stream) { liborc::WriterOptions options; options.setStripeSize(stripe_size); - options.setCompressionBlockSize(1024); + options.setCompressionBlockSize(64 * 1024); options.setMemoryPool(liborc::getDefaultPool()); options.setRowIndexStride(0); return liborc::createWriter(type, stream, options); @@ -668,7 +668,7 @@ TEST_F(TestORCWriterTrivialNoWrite, noWrite) { write_options.compression = Compression::UNCOMPRESSED; #endif write_options.file_version = adapters::orc::FileVersion(0, 11); - write_options.compression_block_size = 32768; + write_options.compression_block_size = 64 * 1024; write_options.row_index_stride = 5000; EXPECT_OK_AND_ASSIGN(auto writer, adapters::orc::ORCFileWriter::Open( buffer_output_stream.get(), write_options)); diff --git a/cpp/src/arrow/array/array_union_test.cc b/cpp/src/arrow/array/array_union_test.cc index 545425c264619..77ba2477791bb 100644 --- a/cpp/src/arrow/array/array_union_test.cc +++ b/cpp/src/arrow/array/array_union_test.cc @@ -166,6 +166,36 @@ TEST(TestSparseUnionArray, Validate) { ASSERT_RAISES(Invalid, arr->ValidateFull()); } +TEST(TestSparseUnionArray, Comparison) { + auto ints1 = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6]"); + auto ints2 = ArrayFromJSON(int32(), "[1, 2, -3, 4, -5, 6]"); + auto strs1 = ArrayFromJSON(utf8(), R"(["a", "b", "c", "d", "e", "f"])"); + auto strs2 = ArrayFromJSON(utf8(), R"(["a", "*", "c", "d", "e", "*"])"); + std::vector type_codes{8, 42}; + + auto check_equality = [&](const std::string& type_ids_json1, + const std::string& type_ids_json2, bool expected_equals) { + auto type_ids1 = ArrayFromJSON(int8(), type_ids_json1); + auto type_ids2 = ArrayFromJSON(int8(), type_ids_json2); + ASSERT_OK_AND_ASSIGN(auto arr1, + SparseUnionArray::Make(*type_ids1, {ints1, strs1}, type_codes)); + ASSERT_OK_AND_ASSIGN(auto arr2, + SparseUnionArray::Make(*type_ids2, {ints2, strs2}, type_codes)); + ASSERT_EQ(arr1->Equals(arr2), 
expected_equals); + ASSERT_EQ(arr2->Equals(arr1), expected_equals); + }; + + // Same type ids + check_equality("[8, 8, 42, 42, 42, 8]", "[8, 8, 42, 42, 42, 8]", true); + check_equality("[8, 8, 42, 42, 42, 42]", "[8, 8, 42, 42, 42, 42]", false); + check_equality("[8, 8, 8, 42, 42, 8]", "[8, 8, 8, 42, 42, 8]", false); + check_equality("[8, 42, 42, 42, 42, 8]", "[8, 42, 42, 42, 42, 8]", false); + + // Different type ids + check_equality("[42, 8, 42, 42, 42, 8]", "[8, 8, 42, 42, 42, 8]", false); + check_equality("[8, 8, 42, 42, 42, 8]", "[8, 8, 42, 42, 42, 42]", false); +} + // ------------------------------------------------------------------------- // Tests for MakeDense and MakeSparse diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 23a921cc5a0a4..e0e6d183393a7 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -381,21 +381,49 @@ class RangeDataEqualsImpl { const int8_t* right_codes = right_.GetValues(1); // Unions don't have a null bitmap + int64_t run_start = 0; // Start index of the current run + for (int64_t i = 0; i < range_length_; ++i) { - const auto type_id = left_codes[left_start_idx_ + i]; - if (type_id != right_codes[right_start_idx_ + i]) { + const auto current_type_id = left_codes[left_start_idx_ + i]; + + if (current_type_id != right_codes[right_start_idx_ + i]) { result_ = false; break; } - const auto child_num = child_ids[type_id]; - // XXX can we instead detect runs of same-child union values? 
+ // Check if the current element breaks the run + if (i > 0 && current_type_id != left_codes[left_start_idx_ + i - 1]) { + // Compare the previous run + const auto previous_child_num = child_ids[left_codes[left_start_idx_ + i - 1]]; + int64_t run_length = i - run_start; + + RangeDataEqualsImpl impl( + options_, floating_approximate_, *left_.child_data[previous_child_num], + *right_.child_data[previous_child_num], + left_start_idx_ + left_.offset + run_start, + right_start_idx_ + right_.offset + run_start, run_length); + + if (!impl.Compare()) { + result_ = false; + break; + } + + // Start a new run + run_start = i; + } + } + + // Handle the final run + if (result_) { + const auto final_child_num = child_ids[left_codes[left_start_idx_ + run_start]]; + int64_t final_run_length = range_length_ - run_start; + RangeDataEqualsImpl impl( - options_, floating_approximate_, *left_.child_data[child_num], - *right_.child_data[child_num], left_start_idx_ + left_.offset + i, - right_start_idx_ + right_.offset + i, 1); + options_, floating_approximate_, *left_.child_data[final_child_num], + *right_.child_data[final_child_num], left_start_idx_ + left_.offset + run_start, + right_start_idx_ + right_.offset + run_start, final_run_length); + if (!impl.Compare()) { result_ = false; - break; } } return Status::OK(); diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index ca811dac041fe..6deb2cbad8cb3 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -28,6 +28,14 @@ endif() # Unit tests # +# Define arrow_compute_testing object library for common test files +if(ARROW_TESTING) + add_library(arrow_compute_testing OBJECT test_util_internal.cc) + # Even though this is still just an object library we still need to "link" our + # dependencies so that include paths are configured correctly + target_link_libraries(arrow_compute_testing PUBLIC ${ARROW_GTEST_GMOCK}) +endif() + set(ARROW_COMPUTE_TEST_PREFIX 
"arrow-compute") set(ARROW_COMPUTE_TEST_LABELS "arrow-compute-tests") set(ARROW_COMPUTE_TEST_ARGS PREFIX ${ARROW_COMPUTE_TEST_PREFIX} LABELS @@ -87,9 +95,16 @@ add_arrow_test(internals_test function_test.cc exec_test.cc kernel_test.cc - registry_test.cc) + registry_test.cc + EXTRA_LINK_LIBS + arrow_compute_testing) + +add_arrow_compute_test(expression_test + SOURCES + expression_test.cc + EXTRA_LINK_LIBS + arrow_compute_testing) -add_arrow_compute_test(expression_test SOURCES expression_test.cc) add_arrow_compute_test(row_test SOURCES key_hash_test.cc @@ -98,7 +113,9 @@ add_arrow_compute_test(row_test row/grouper_test.cc row/row_encoder_internal_test.cc row/row_test.cc - util_internal_test.cc) + util_internal_test.cc + EXTRA_LINK_LIBS + arrow_compute_testing) add_arrow_benchmark(function_benchmark PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index f0d5c0fcc3d72..61335de6ac09a 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -48,6 +48,7 @@ using compute::DictionaryEncodeOptions; using compute::FilterOptions; using compute::NullPlacement; using compute::RankOptions; +using compute::RankQuantileOptions; template <> struct EnumTraits @@ -151,10 +152,19 @@ static auto kRankOptionsType = GetFunctionOptionsType( DataMember("sort_keys", &RankOptions::sort_keys), DataMember("null_placement", &RankOptions::null_placement), DataMember("tiebreaker", &RankOptions::tiebreaker)); +static auto kRankQuantileOptionsType = GetFunctionOptionsType( + DataMember("sort_keys", &RankQuantileOptions::sort_keys), + DataMember("null_placement", &RankQuantileOptions::null_placement)); static auto kPairwiseOptionsType = GetFunctionOptionsType( DataMember("periods", &PairwiseOptions::periods)); static auto kListFlattenOptionsType = GetFunctionOptionsType( DataMember("recursive", &ListFlattenOptions::recursive)); +static auto kInversePermutationOptionsType = + GetFunctionOptionsType( + 
DataMember("max_index", &InversePermutationOptions::max_index), + DataMember("output_type", &InversePermutationOptions::output_type)); +static auto kScatterOptionsType = GetFunctionOptionsType( + DataMember("max_index", &ScatterOptions::max_index)); } // namespace } // namespace internal @@ -222,6 +232,13 @@ RankOptions::RankOptions(std::vector sort_keys, NullPlacement null_plac tiebreaker(tiebreaker) {} constexpr char RankOptions::kTypeName[]; +RankQuantileOptions::RankQuantileOptions(std::vector sort_keys, + NullPlacement null_placement) + : FunctionOptions(internal::kRankQuantileOptionsType), + sort_keys(std::move(sort_keys)), + null_placement(null_placement) {} +constexpr char RankQuantileOptions::kTypeName[]; + PairwiseOptions::PairwiseOptions(int64_t periods) : FunctionOptions(internal::kPairwiseOptionsType), periods(periods) {} constexpr char PairwiseOptions::kTypeName[]; @@ -230,6 +247,17 @@ ListFlattenOptions::ListFlattenOptions(bool recursive) : FunctionOptions(internal::kListFlattenOptionsType), recursive(recursive) {} constexpr char ListFlattenOptions::kTypeName[]; +InversePermutationOptions::InversePermutationOptions( + int64_t max_index, std::shared_ptr output_type) + : FunctionOptions(internal::kInversePermutationOptionsType), + max_index(max_index), + output_type(std::move(output_type)) {} +constexpr char InversePermutationOptions::kTypeName[]; + +ScatterOptions::ScatterOptions(int64_t max_index) + : FunctionOptions(internal::kScatterOptionsType), max_index(max_index) {} +constexpr char ScatterOptions::kTypeName[]; + namespace internal { void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kFilterOptionsType)); @@ -244,6 +272,8 @@ void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kRankOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPairwiseOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kListFlattenOptionsType)); + 
DCHECK_OK(registry->AddFunctionOptionsType(kInversePermutationOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kScatterOptionsType)); } } // namespace internal @@ -429,5 +459,19 @@ Result CumulativeMean(const Datum& values, const CumulativeOptions& optio return CallFunction("cumulative_mean", {Datum(values)}, &options, ctx); } +// ---------------------------------------------------------------------- +// Swizzle functions + +Result InversePermutation(const Datum& indices, + const InversePermutationOptions& options, + ExecContext* ctx) { + return CallFunction("inverse_permutation", {indices}, &options, ctx); +} + +Result Scatter(const Datum& values, const Datum& indices, + const ScatterOptions& options, ExecContext* ctx) { + return CallFunction("scatter", {values, indices}, &options, ctx); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index e5bcc37329661..22bb164719756 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -195,6 +195,25 @@ class ARROW_EXPORT RankOptions : public FunctionOptions { Tiebreaker tiebreaker; }; +/// \brief Quantile rank options +class ARROW_EXPORT RankQuantileOptions : public FunctionOptions { + public: + explicit RankQuantileOptions(std::vector sort_keys = {}, + NullPlacement null_placement = NullPlacement::AtEnd); + /// Convenience constructor for array inputs + explicit RankQuantileOptions(SortOrder order, + NullPlacement null_placement = NullPlacement::AtEnd) + : RankQuantileOptions({SortKey("", order)}, null_placement) {} + + static constexpr char const kTypeName[] = "RankQuantileOptions"; + static RankQuantileOptions Defaults() { return RankQuantileOptions(); } + + /// Column key(s) to order by and how to order by these sort keys. 
+ std::vector sort_keys; + /// Whether nulls and NaNs are placed at the start or at the end + NullPlacement null_placement; +}; + /// \brief Partitioning options for NthToIndices class ARROW_EXPORT PartitionNthOptions : public FunctionOptions { public: @@ -257,6 +276,40 @@ class ARROW_EXPORT ListFlattenOptions : public FunctionOptions { bool recursive = false; }; +/// \brief Options for inverse_permutation function +class ARROW_EXPORT InversePermutationOptions : public FunctionOptions { + public: + explicit InversePermutationOptions(int64_t max_index = -1, + std::shared_ptr output_type = NULLPTR); + static constexpr char const kTypeName[] = "InversePermutationOptions"; + static InversePermutationOptions Defaults() { return InversePermutationOptions(); } + + /// \brief The max value in the input indices to allow. The length of the function's + /// output will be this value plus 1. If negative, this value will be set to the length + /// of the input indices minus 1 and the length of the function's output will be the + /// length of the input indices. + int64_t max_index = -1; + /// \brief The type of the output inverse permutation. If null, the output will be of + /// the same type as the input indices, otherwise must be signed integer type. An + /// invalid error will be reported if this type is not able to store the length of the + /// input indices. + std::shared_ptr output_type = NULLPTR; +}; + +/// \brief Options for scatter function +class ARROW_EXPORT ScatterOptions : public FunctionOptions { + public: + explicit ScatterOptions(int64_t max_index = -1); + static constexpr char const kTypeName[] = "ScatterOptions"; + static ScatterOptions Defaults() { return ScatterOptions(); } + + /// \brief The max value in the input indices to allow. The length of the function's + /// output will be this value plus 1. 
If negative, this value will be set to the length + /// of the input indices minus 1 and the length of the function's output will be the + /// length of the input indices. + int64_t max_index = -1; +}; + /// @} /// \brief Filter with a boolean selection filter @@ -705,5 +758,58 @@ Result> PairwiseDiff(const Array& array, bool check_overflow = false, ExecContext* ctx = NULLPTR); +/// \brief Return the inverse permutation of the given indices. +/// +/// For indices[i] = x, inverse_permutation[x] = i. And inverse_permutation[x] = null if x +/// does not appear in the input indices. Indices must be in the range of [0, max_index], +/// or null, which will be ignored. If multiple indices point to the same value, the last +/// one is used. +/// +/// For example, with +/// indices = [null, 0, null, 2, 4, 1, 1] +/// the inverse permutation is +/// [1, 6, 3, null, 4, null, null] +/// if max_index = 6. +/// +/// \param[in] indices array-like indices +/// \param[in] options configures the max index and the output type +/// \param[in] ctx the function execution context, optional +/// \return the resulting inverse permutation +/// +/// \since 20.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result InversePermutation( + const Datum& indices, + const InversePermutationOptions& options = InversePermutationOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Scatter the values into specified positions according to the indices. +/// +/// For indices[i] = x, output[x] = values[i]. And output[x] = null if x does not appear +/// in the input indices. Indices must be in the range of [0, max_index], or null, in +/// which case the corresponding value will be ignored. If multiple indices point to the +/// same value, the last one is used. +/// +/// For example, with +/// values = [a, b, c, d, e, f, g] +/// indices = [null, 0, null, 2, 4, 1, 1] +/// the output is +/// [b, g, d, null, e, null, null] +/// if max_index = 6. 
+/// +/// \param[in] values datum to scatter +/// \param[in] indices array-like indices +/// \param[in] options configures the max index to scatter to +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 20.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Scatter(const Datum& values, const Datum& indices, + const ScatterOptions& options = ScatterOptions::Defaults(), + ExecContext* ctx = NULLPTR); + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index c269de0763217..b7d017d482013 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -136,6 +136,10 @@ TEST(FunctionOptions, Equality) { options.emplace_back(new SelectKOptions(5, {{SortKey("key", SortOrder::Ascending)}})); options.emplace_back(new Utf8NormalizeOptions()); options.emplace_back(new Utf8NormalizeOptions(Utf8NormalizeOptions::NFD)); + options.emplace_back( + new InversePermutationOptions(/*max_index=*/42, /*output_type=*/int32())); + options.emplace_back(new ScatterOptions()); + options.emplace_back(new ScatterOptions(/*max_index=*/42)); for (size_t i = 0; i < options.size(); i++) { const size_t prev_i = i == 0 ?
options.size() - 1 : i - 1; diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 7c7b9c8b68d45..4dedd1f23e090 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -18,9 +18,9 @@ # ---------------------------------------------------------------------- # Tests that don't require the full kernel library -# Define arrow_compute_testing object library for common test files +# Define arrow_compute_kernels_testing object library for common test files if(ARROW_TESTING) - add_library(arrow_compute_kernels_testing OBJECT test_util.cc) + add_library(arrow_compute_kernels_testing OBJECT test_util_internal.cc) # Even though this is still just an object library we still need to "link" our # dependencies so that include paths are configured correctly target_link_libraries(arrow_compute_kernels_testing PUBLIC ${ARROW_GTEST_GMOCK}) @@ -31,12 +31,14 @@ add_arrow_test(scalar_cast_test SOURCES scalar_cast_test.cc EXTRA_LINK_LIBS - arrow_compute_kernels_testing) + arrow_compute_kernels_testing + arrow_compute_testing) # ---------------------------------------------------------------------- # Scalar kernels -set(ARROW_COMPUTE_SCALAR_TYPE_TEST_LINK_LIBS arrow_compute_kernels_testing) +set(ARROW_COMPUTE_SCALAR_TYPE_TEST_LINK_LIBS arrow_compute_kernels_testing + arrow_compute_testing) if(ARROW_WITH_UTF8PROC) list(APPEND ARROW_COMPUTE_SCALAR_TYPE_TEST_LINK_LIBS utf8proc::utf8proc) endif() @@ -52,13 +54,15 @@ add_arrow_compute_test(scalar_if_else_test SOURCES scalar_if_else_test.cc EXTRA_LINK_LIBS - arrow_compute_kernels_testing) + arrow_compute_kernels_testing + arrow_compute_testing) add_arrow_compute_test(scalar_temporal_test SOURCES scalar_temporal_test.cc EXTRA_LINK_LIBS - arrow_compute_kernels_testing) + arrow_compute_kernels_testing + arrow_compute_testing) add_arrow_compute_test(scalar_math_test SOURCES @@ -66,7 +70,8 @@ add_arrow_compute_test(scalar_math_test 
scalar_compare_test.cc scalar_round_arithmetic_test.cc EXTRA_LINK_LIBS - arrow_compute_kernels_testing) + arrow_compute_kernels_testing + arrow_compute_testing) add_arrow_compute_test(scalar_utility_test SOURCES @@ -74,7 +79,8 @@ add_arrow_compute_test(scalar_utility_test scalar_set_lookup_test.cc scalar_validity_test.cc EXTRA_LINK_LIBS - arrow_compute_kernels_testing) + arrow_compute_kernels_testing + arrow_compute_testing) add_arrow_benchmark(scalar_arithmetic_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_boolean_benchmark PREFIX "arrow-compute") @@ -101,19 +107,29 @@ add_arrow_compute_test(vector_test vector_run_end_encode_test.cc select_k_test.cc EXTRA_LINK_LIBS - arrow_compute_kernels_testing) + arrow_compute_kernels_testing + arrow_compute_testing) add_arrow_compute_test(vector_sort_test SOURCES vector_sort_test.cc EXTRA_LINK_LIBS - arrow_compute_kernels_testing) + arrow_compute_kernels_testing + arrow_compute_testing) add_arrow_compute_test(vector_selection_test SOURCES vector_selection_test.cc EXTRA_LINK_LIBS - arrow_compute_kernels_testing) + arrow_compute_kernels_testing + arrow_compute_testing) + +add_arrow_compute_test(vector_swizzle_test + SOURCES + vector_swizzle_test.cc + EXTRA_LINK_LIBS + arrow_compute_kernels_testing + arrow_compute_testing) add_arrow_benchmark(vector_hash_benchmark PREFIX "arrow-compute") add_arrow_benchmark(vector_sort_benchmark PREFIX "arrow-compute") @@ -132,6 +148,7 @@ add_arrow_compute_test(aggregate_test aggregate_test.cc EXTRA_LINK_LIBS arrow_compute_kernels_testing + arrow_compute_testing Boost::headers) # ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 65439af2748b5..e6ad915fd5667 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -32,7 +32,7 @@ #include "arrow/compute/api_vector.h" #include 
"arrow/compute/cast.h" #include "arrow/compute/kernels/aggregate_internal.h" -#include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/kernels/test_util_internal.h" #include "arrow/compute/registry.h" #include "arrow/type.h" #include "arrow/type_traits.h" diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 594bd1fce0b84..2a492f581f53b 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -1037,8 +1037,9 @@ ArrayKernelExec GenerateFloatingPoint(detail::GetTypeId get_id) { // Generate a kernel given a templated functor for integer types // // See "Numeric" above for description of the generator functor -template