diff --git a/.asf.yaml b/.asf.yaml index 40b961dc6e885..a1c6434587703 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -21,10 +21,12 @@ github: collaborators: - anjakefala - benibus - - danepitkin - davisusanibar + - jbonofre - js8544 - vibhatha + - zanmato1984 + - ZhangHuiGui notifications: commits: commits@arrow.apache.org diff --git a/.env b/.env index 298c100c094b0..be35921f94c3a 100644 --- a/.env +++ b/.env @@ -56,7 +56,7 @@ UBUNTU=20.04 CLANG_TOOLS=14 CUDA=11.2.2 DASK=latest -DOTNET=7.0 +DOTNET=8.0 GCC_VERSION="" GO=1.21.8 STATICCHECK=v0.4.7 @@ -71,12 +71,12 @@ NUMBA=latest NUMPY=latest PANDAS=latest PYTHON=3.8 -R=4.2 +R=4.4 SPARK=master TURBODBC=latest -# These correspond to images on Docker Hub that contain R, e.g. rhub/ubuntu-gcc-release:latest -R_IMAGE=ubuntu-gcc-release +# These correspond to images on Docker Hub that contain R, e.g. rhub/ubuntu-release:latest +R_IMAGE=ubuntu-release R_ORG=rhub R_TAG=latest @@ -86,21 +86,19 @@ ARROW_R_DEV=TRUE R_PRUNE_DEPS=FALSE TZ=UTC -# Any non-empty string will install devtoolset-${DEVTOOLSET_VERSION} -DEVTOOLSET_VERSION= - # Used through docker-compose.yml and serves as the default version for the # ci/scripts/install_vcpkg.sh script. Prefer to use short SHAs to keep the # docker tags more readable. -VCPKG="a42af01b72c28a8e1d7b48107b33e4f286a55ef6" # 2023.11.20 Release +VCPKG="943c5ef1c8f6b5e6ced092b242c8299caae2ff01" # 2024.04.26 Release # This must be updated when we update # ci/docker/python-wheel-windows-vs2019.dockerfile. # This is a workaround for our CI problem that "archery docker build" doesn't # use pulled built images in dev/tasks/python-wheels/github.windows.yml. -PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-03-19 +PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-06-18 -# Use conanio/${CONAN} for "docker-compose run --rm conan". See -# https://github.com/conan-io/conan-docker-tools#readme for available -# images. -CONAN=gcc10 +# Use conanio/${CONAN_BASE}:${CONAN_VERSION} for "docker-compose run --rm conan". 
+# See https://github.com/conan-io/conan-docker-tools#readme and +# https://hub.docker.com/u/conanio for available images. +CONAN_BASE=gcc10 +CONAN_VERSION=1.62.0 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index e7e544c2b0e62..e495bfd147de6 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -37,7 +37,7 @@ /go/ @zeroshade /java/ @lidavidm /js/ @domoritz @trxcllnt -/matlab/ @kevingurney @kou +/matlab/ @kevingurney @kou @sgilmore10 /python/pyarrow/_flight.pyx @lidavidm /python/pyarrow/**/*gandiva* @wjones127 /r/ @paleolimbot @thisisnic diff --git a/.github/dependabot.yml b/.github/dependabot.yml index e96cb8d2eb1e3..7d9ff2f42e887 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -23,30 +23,35 @@ updates: interval: "weekly" commit-message: prefix: "MINOR: [CI] " + open-pull-requests-limit: 10 - package-ecosystem: "gomod" directory: "/go/" schedule: interval: "weekly" commit-message: prefix: "MINOR: [Go] " + open-pull-requests-limit: 10 - package-ecosystem: "maven" directory: "/java/" schedule: interval: "weekly" commit-message: prefix: "MINOR: [Java] " + open-pull-requests-limit: 10 - package-ecosystem: "npm" directory: "/js/" schedule: interval: "monthly" commit-message: prefix: "MINOR: [JS] " + open-pull-requests-limit: 10 - package-ecosystem: "nuget" directory: "/csharp/" schedule: interval: "weekly" commit-message: prefix: "MINOR: [C#] " + open-pull-requests-limit: 10 ignore: - dependency-name: "Microsoft.Extensions.*" update-types: diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml index dbd24796db52b..c698baba2c816 100644 --- a/.github/workflows/archery.yml +++ b/.github/workflows/archery.yml @@ -32,7 +32,9 @@ on: - 'docker-compose.yml' env: + ARCHERY_DEBUG: 1 ARCHERY_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + ARCHERY_USE_DOCKER_CLI: 1 concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} @@ -57,9 +59,9 @@ jobs: shell: bash run: git 
branch $ARCHERY_DEFAULT_BRANCH origin/$ARCHERY_DEFAULT_BRANCH || true - name: Setup Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v5.1.0 with: - python-version: '3.12' + python-version: '3.9' - name: Install pygit2 binary wheel run: pip install pygit2 --only-binary pygit2 - name: Install Archery, Crossbow- and Test Dependencies diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml index 038a468a81276..a34856d2dc81a 100644 --- a/.github/workflows/comment_bot.yml +++ b/.github/workflows/comment_bot.yml @@ -41,7 +41,7 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 3036d06d5d7b2..e539fadb859fe 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -21,6 +21,7 @@ on: push: paths: - '.github/workflows/cpp.yml' + - 'ci/conda_env_*' - 'ci/docker/**' - 'ci/scripts/cpp_*' - 'ci/scripts/install_azurite.sh' @@ -31,9 +32,11 @@ on: - 'cpp/**' - 'docker-compose.yml' - 'format/Flight.proto' + - 'testing' pull_request: paths: - '.github/workflows/cpp.yml' + - 'ci/conda_env_*' - 'ci/docker/**' - 'ci/scripts/cpp_*' - 'ci/scripts/install_azurite.sh' @@ -44,6 +47,7 @@ on: - 'cpp/**' - 'docker-compose.yml' - 'format/Flight.proto' + - 'testing' concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} @@ -53,6 +57,7 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 ARROW_ENABLE_TIMING_TESTS: OFF DOCKER_VOLUME_PREFIX: ".docker/" @@ -94,6 +99,7 @@ jobs: cat <> "$GITHUB_OUTPUT" { "arch": "arm64v8", + "archery-use-docker-cli": "0", "clang-tools": "10", "image": "ubuntu-cpp", "llvm": "10", @@ -118,6 +124,9 @@ jobs: include: 
${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} + # By default, use Docker CLI because docker-compose v1 is obsolete, + # except where the Docker client version is too old. + ARCHERY_USE_DOCKER_CLI: ${{ matrix.archery-use-docker-cli || '1' }} ARROW_SIMD_LEVEL: ${{ matrix.simd-level }} CLANG_TOOLS: ${{ matrix.clang-tools }} LLVM: ${{ matrix.llvm }} @@ -237,7 +246,7 @@ jobs: $(brew --prefix bash)/bin/bash \ ci/scripts/install_minio.sh latest ${ARROW_HOME} - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v5.1.0 with: python-version: 3.12 - name: Install Google Cloud Storage Testbench @@ -458,7 +467,7 @@ jobs: https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z chmod +x /usr/local/bin/minio.exe - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v5.1.0 with: python-version: 3.9 - name: Install Google Cloud Storage Testbench diff --git a/.github/workflows/csharp.yml b/.github/workflows/csharp.yml index 17ef2de81088f..e4db9f482e206 100644 --- a/.github/workflows/csharp.yml +++ b/.github/workflows/csharp.yml @@ -46,7 +46,7 @@ jobs: strategy: fail-fast: false matrix: - dotnet: ['7.0.x'] + dotnet: ['8.0.x'] steps: - name: Install C# uses: actions/setup-dotnet@v4 @@ -74,7 +74,7 @@ jobs: strategy: fail-fast: false matrix: - dotnet: ['7.0.x'] + dotnet: ['8.0.x'] steps: - name: Install C# uses: actions/setup-dotnet@v4 @@ -94,19 +94,23 @@ jobs: run: ci/scripts/csharp_test.sh $(pwd) macos: - name: AMD64 macOS 11 C# ${{ matrix.dotnet }} - runs-on: macos-latest + name: AMD64 macOS 13 C# ${{ matrix.dotnet }} + runs-on: macos-13 # Pending https://github.com/pythonnet/pythonnet/issues/2396 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 15 strategy: fail-fast: false matrix: - dotnet: ['7.0.x'] + dotnet: ['8.0.x'] steps: - name: Install C# uses: actions/setup-dotnet@v4 with: dotnet-version: ${{ matrix.dotnet }} + 
- name: Setup Python + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + with: + python-version: 3.12 - name: Checkout Arrow uses: actions/checkout@v4 with: diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 3a48270a97c9a..5aec3638a8967 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -29,6 +29,10 @@ concurrency: permissions: contents: read +env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 + jobs: lint: @@ -42,7 +46,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.12 - name: Install pre-commit @@ -101,22 +105,22 @@ jobs: with: fetch-depth: 0 - name: Install Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: '3.12' - name: Install Ruby - uses: ruby/setup-ruby@250fcd6a742febb1123a77a841497ccaa8b9e939 # v1.152.0 + uses: ruby/setup-ruby@v1 with: - ruby-version: '2.7' + ruby-version: ruby - name: Install .NET uses: actions/setup-dotnet@4d6c8fcf3c8f7a60068d26b594648e99df24cee3 # v4.0.0 with: - dotnet-version: '7.0.x' + dotnet-version: '8.0.x' - name: Install Dependencies shell: bash run: | gem install test-unit - pip install "cython>=0.29.31" setuptools six pytest jira + pip install "cython>=0.29.31" setuptools pytest jira setuptools-scm - name: Run Release Test env: ARROW_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 82b43ee2363b5..36a0dc014db8d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -24,18 +24,20 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 ARROW_ENABLE_TIMING_TESTS: OFF DOCKER_VOLUME_PREFIX: ".docker/" jobs: complete: - name: 
AMD64 Ubuntu 22.04 Complete Documentation + name: AMD64 Debian 12 Complete Documentation runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 150 env: - UBUNTU: "22.04" + JDK: 17 steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 @@ -48,10 +50,10 @@ jobs: uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 with: path: .docker - key: ubuntu-docs-${{ hashFiles('cpp/**') }} - restore-keys: ubuntu-docs- + key: debian-docs-${{ hashFiles('cpp/**') }} + restore-keys: debian-docs- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.12 - name: Setup Archery @@ -60,7 +62,8 @@ jobs: env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: archery docker run ubuntu-docs + JDK: 17 + run: archery docker run debian-docs - name: Docker Push if: >- success() && @@ -71,4 +74,4 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true - run: archery docker push ubuntu-docs + run: archery docker push debian-docs diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index 306fc5135073d..947e2ac21b83c 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -31,8 +31,10 @@ on: permissions: contents: read - + env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 ARROW_ENABLE_TIMING_TESTS: OFF DOCKER_VOLUME_PREFIX: ".docker/" @@ -57,7 +59,7 @@ jobs: key: conda-docs-${{ hashFiles('cpp/**') }} restore-keys: conda-docs- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.12 - name: 
Setup Archery diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 7ff781d35e8ec..0d32628859fa0 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -41,6 +41,10 @@ concurrency: permissions: contents: read +env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 + jobs: docker-targets: @@ -75,12 +79,14 @@ jobs: { "arch-label": "ARM64", "arch": "arm64v8", + "archery-use-docker-cli": "0", "go": "1.21", "runs-on": ["self-hosted", "arm", "linux"] }, { "arch-label": "ARM64", "arch": "arm64v8", + "archery-use-docker-cli": "0", "go": "1.22", "runs-on": ["self-hosted", "arm", "linux"] } @@ -101,6 +107,9 @@ jobs: include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} + # By default, use Docker CLI because docker-compose v1 is obsolete, + # except where the Docker client version is too old. + ARCHERY_USE_DOCKER_CLI: ${{ matrix.archery-use-docker-cli || '1' }} GO: ${{ matrix.go }} steps: - name: Checkout Arrow @@ -201,7 +210,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -241,7 +250,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -297,8 +306,8 @@ jobs: run: ci/scripts/go_test.sh $(pwd) macos: - name: AMD64 macOS 11 Go ${{ matrix.go }} - runs-on: macos-latest + name: AMD64 macOS 12 Go ${{ matrix.go }} + runs-on: macos-12 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 60 strategy: @@ -333,7 +342,7 @@ jobs: github.event_name == 'push' && github.repository == 'apache/arrow' && github.ref_name == 'main' - uses: 
actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: '3.10' - name: Run Benchmarks @@ -355,8 +364,8 @@ jobs: macos-cgo: - name: AMD64 macOS 11 Go ${{ matrix.go }} - CGO - runs-on: macos-latest + name: AMD64 macOS 12 Go ${{ matrix.go }} - CGO + runs-on: macos-12 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 60 strategy: diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index adb6fb2b57c75..f53f4aeb505d2 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -29,6 +29,7 @@ on: - 'js/**' - 'cpp/**' - 'java/**' + - 'csharp/**' - 'format/**' pull_request: paths: @@ -40,6 +41,7 @@ on: - 'integration/**' - 'js/**' - 'cpp/**' + - 'csharp/**' - 'java/**' - 'format/**' @@ -51,6 +53,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: @@ -71,6 +75,11 @@ jobs: with: repository: apache/arrow-rs path: rust + - name: Checkout Arrow nanoarrow + uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + with: + repository: apache/arrow-nanoarrow + path: nanoarrow - name: Free up disk space run: | ci/scripts/util_free_space.sh @@ -81,7 +90,7 @@ jobs: key: conda-${{ hashFiles('cpp/**') }} restore-keys: conda- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -93,6 +102,7 @@ jobs: run: > archery docker run \ -e ARCHERY_DEFAULT_BRANCH=${{ github.event.repository.default_branch }} \ + -e ARCHERY_INTEGRATION_WITH_NANOARROW=1 \ -e ARCHERY_INTEGRATION_WITH_RUST=1 \ conda-integration - name: Docker Push diff --git a/.github/workflows/issue_bot.yml b/.github/workflows/issue_bot.yml index 
ec614ca1e7c56..2725825b56952 100644 --- a/.github/workflows/issue_bot.yml +++ b/.github/workflows/issue_bot.yml @@ -21,7 +21,6 @@ on: issues: types: - opened - - edited permissions: contents: read diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index a14977525b6c6..08dbe7c8068c0 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -45,6 +45,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: @@ -56,8 +58,8 @@ jobs: strategy: fail-fast: false matrix: - jdk: [8, 11, 17, 21] - maven: [3.9.5] + jdk: [8, 11, 17, 21, 22] + maven: [3.9.6] image: [java] env: JDK: ${{ matrix.jdk }} @@ -75,7 +77,7 @@ jobs: key: maven-${{ hashFiles('java/**') }} restore-keys: maven- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -84,11 +86,11 @@ jobs: env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} run: | archery docker run \ -e CI=true \ - -e "GRADLE_ENTERPRISE_ACCESS_KEY=$GRADLE_ENTERPRISE_ACCESS_KEY" \ + -e "DEVELOCITY_ACCESS_KEY=$DEVELOCITY_ACCESS_KEY" \ ${{ matrix.image }} - name: Docker Push if: >- @@ -103,9 +105,9 @@ jobs: run: archery docker push ${{ matrix.image }} macos: - name: AMD64 macOS 11 Java JDK ${{ matrix.jdk }} - runs-on: macos-latest - if: github.event_name == 'push' + name: AMD64 macOS 12 Java JDK ${{ matrix.jdk }} + runs-on: macos-12 + if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 30 strategy: fail-fast: false @@ -125,12 +127,12 @@ jobs: - name: Build shell: bash env: - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ 
secrets.GE_ACCESS_TOKEN }} run: ci/scripts/java_build.sh $(pwd) $(pwd)/build - name: Test shell: bash env: - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} run: ci/scripts/java_test.sh $(pwd) $(pwd)/build windows: @@ -156,10 +158,10 @@ jobs: - name: Build shell: bash env: - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} run: ci/scripts/java_build.sh $(pwd) $(pwd)/build - name: Test shell: bash env: - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} run: ci/scripts/java_test.sh $(pwd) $(pwd)/build diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml index 46f3381ed0e8f..ea5f8d694a9c6 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -45,6 +45,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: @@ -52,7 +54,7 @@ jobs: name: AMD64 manylinux2014 Java JNI runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 90 + timeout-minutes: 240 steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 @@ -69,7 +71,7 @@ jobs: key: java-jni-manylinux-2014-${{ hashFiles('cpp/**', 'java/**') }} restore-keys: java-jni-manylinux-2014- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -109,7 +111,7 @@ jobs: key: maven-${{ hashFiles('java/**') }} restore-keys: maven- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup 
Archery @@ -118,11 +120,11 @@ jobs: env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} run: | archery docker run \ -e CI=true \ - -e "GRADLE_ENTERPRISE_ACCESS_KEY=$GRADLE_ENTERPRISE_ACCESS_KEY" \ + -e "DEVELOCITY_ACCESS_KEY=$DEVELOCITY_ACCESS_KEY" \ conda-python-java-integration - name: Docker Push if: >- diff --git a/.github/workflows/java_nightly.yml b/.github/workflows/java_nightly.yml index c535dc4a07de3..f40d4ce5b42d6 100644 --- a/.github/workflows/java_nightly.yml +++ b/.github/workflows/java_nightly.yml @@ -58,7 +58,7 @@ jobs: repository: ursacomputing/crossbow ref: main - name: Set up Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: cache: 'pip' python-version: 3.12 diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index 304eba41e4d37..c11c8254011f6 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -38,6 +38,10 @@ concurrency: permissions: contents: read +env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 + jobs: docker: @@ -51,7 +55,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -77,10 +81,10 @@ jobs: run: archery docker push debian-js macos: - name: AMD64 macOS 11 NodeJS ${{ matrix.node }} - runs-on: macos-latest - if: github.event_name == 'push' - timeout-minutes: 90 + name: AMD64 macOS 12 NodeJS ${{ matrix.node }} + runs-on: macos-12 + if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 30 strategy: fail-fast: false matrix: @@ -102,15 +106,16 @@ jobs: node-version: ${{ matrix.node 
}} - name: Build shell: bash - run: ci/scripts/js_build.sh $(pwd) + run: ci/scripts/js_build.sh $(pwd) build - name: Test shell: bash - run: ci/scripts/js_test.sh $(pwd) + run: ci/scripts/js_test.sh $(pwd) build windows: name: AMD64 Windows NodeJS ${{ matrix.node }} runs-on: windows-latest - if: github.event_name == 'push' + if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 45 strategy: fail-fast: false matrix: @@ -132,7 +137,7 @@ jobs: node-version: ${{ matrix.node }} - name: Build shell: bash - run: ci/scripts/js_build.sh $(pwd) + run: ci/scripts/js_build.sh $(pwd) build - name: Test shell: bash - run: ci/scripts/js_test.sh $(pwd) + run: ci/scripts/js_test.sh $(pwd) build diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index eceeb551a0653..ca8280927f4a5 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -42,7 +42,23 @@ jobs: ubuntu: name: AMD64 Ubuntu 20.04 MATLAB - runs-on: ubuntu-latest + # Explicitly pin the Ubuntu version to 20.04 for the time being because: + # + # 1. The version of GLIBCXX shipped with Ubuntu 22.04 is not binary compatible + # with the GLIBCXX bundled with MATLAB R2023a. This is a relatively common + # issue. + # + # For example, see: + # + # https://www.mathworks.com/matlabcentral/answers/1907290-how-to-manually-select-the-libstdc-library-to-use-to-resolve-a-version-glibcxx_-not-found + # + # 2. The version of GLIBCXX shipped with Ubuntu 22.04 is not binary compatible with + # the version of GLIBCXX shipped with Debian 11. Several of the Arrow community + # members who work on the MATLAB bindings use Debian 11 locally for qualification. + # Using Ubuntu 20.04 eases development workflows for these community members. + # + # In the future, we can investigate adding support for building against more Linux (e.g. `ubuntu-22.04`) and MATLAB versions (e.g. R2023b). 
+ runs-on: ubuntu-20.04 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: - name: Check out repository @@ -54,7 +70,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2023a + release: R2024a - name: Install ccache run: sudo apt-get install ccache - name: Setup ccache @@ -74,24 +90,24 @@ jobs: run: ci/scripts/matlab_build.sh $(pwd) - name: Run MATLAB Tests env: - # libarrow.so requires a more recent version of libstdc++.so - # than is bundled with MATLAB under /sys/os/glnxa64. - # Therefore, if a MEX function that depends on libarrow.so - # is executed within the MATLAB address space, runtime linking - # errors will occur. To work around this issue, we can explicitly - # force MATLAB to use the system libstdc++.so via LD_PRELOAD. - LD_PRELOAD: /usr/lib/x86_64-linux-gnu/libstdc++.so.6 - # Add the installation directory to the MATLAB Search Path by # setting the MATLABPATH environment variable. MATLABPATH: matlab/install/arrow_matlab uses: matlab-actions/run-tests@v2 with: select-by-folder: matlab/test + strict: true macos: - name: AMD64 macOS 11 MATLAB - runs-on: macos-latest + name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} MATLAB + runs-on: macos-${{ matrix.macos-version }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + strategy: + matrix: + include: + - architecture: AMD64 + macos-version: "12" + - architecture: ARM64 + macos-version: "14" steps: - name: Check out repository uses: actions/checkout@v4 @@ -102,7 +118,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2023a + release: R2024a - name: Install ccache run: brew install ccache - name: Setup ccache @@ -127,7 +143,8 @@ jobs: MATLABPATH: matlab/install/arrow_matlab uses: matlab-actions/run-tests@v2 with: - select-by-folder: matlab/test + select-by-folder: matlab/test + strict: true windows: name: AMD64 Windows 2022 MATLAB runs-on: windows-2022 @@ -140,7 +157,7 @@ jobs: - name: 
Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2023a + release: R2024a - name: Download Timezone Database shell: bash run: ci/scripts/download_tz_database.sh @@ -173,4 +190,5 @@ jobs: MATLABPATH: matlab/install/arrow_matlab uses: matlab-actions/run-tests@v2 with: - select-by-folder: matlab/test + select-by-folder: matlab/test + strict: true diff --git a/.github/workflows/pr_bot.yml b/.github/workflows/pr_bot.yml index 6af7dbe7680f5..e589610f536b3 100644 --- a/.github/workflows/pr_bot.yml +++ b/.github/workflows/pr_bot.yml @@ -82,7 +82,7 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 25d918bcc25aa..a568f8346e7fc 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -41,6 +41,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: @@ -100,7 +102,7 @@ jobs: key: ${{ matrix.cache }}-${{ hashFiles('cpp/**') }} restore-keys: ${{ matrix.cache }}- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -126,12 +128,20 @@ jobs: run: archery docker push ${{ matrix.image }} macos: - name: AMD64 macOS 12 Python 3 - runs-on: macos-latest + name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} Python 3 + runs-on: macos-${{ matrix.macos-version }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + include: + - architecture: AMD64 + macos-version: "12" + - 
architecture: ARM64 + macos-version: "14" env: - ARROW_HOME: /usr/local + ARROW_HOME: /tmp/local ARROW_AZURE: ON ARROW_DATASET: ON ARROW_FLIGHT: ON @@ -162,7 +172,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v5.1.0 with: python-version: '3.11' - name: Install Dependencies @@ -172,6 +182,19 @@ jobs: python -m pip install \ -r python/requirements-build.txt \ -r python/requirements-test.txt + - name: Setup ccache + shell: bash + run: ci/scripts/ccache_setup.sh + - name: ccache info + id: ccache-info + shell: bash + run: echo "cache-dir=$(ccache --get-config cache_dir)" >> $GITHUB_OUTPUT + - name: Cache ccache + uses: actions/cache@v4 + with: + path: ${{ steps.ccache-info.outputs.cache-dir }} + key: python-ccache-macos-${{ matrix.macos-version }}-${{ hashFiles('cpp/**', 'python/**') }} + restore-keys: python-ccache-macos-${{ matrix.macos-version }}- - name: Build shell: bash run: | diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 8c47915b7b6d3..6bd940f806775 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -51,6 +51,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: @@ -119,7 +121,7 @@ jobs: strategy: fail-fast: false matrix: - r: ["4.3"] + r: ["4.4"] ubuntu: [20.04] force-tests: ["true"] env: @@ -142,7 +144,7 @@ jobs: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}- ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -190,12 +192,11 @@ jobs: fail-fast: false matrix: config: - - { org: "rhub", image: "debian-gcc-devel", tag: "latest", devtoolset: "" } + - { org: "rhub", image: "ubuntu-gcc12", tag: 
"latest" } env: R_ORG: ${{ matrix.config.org }} R_IMAGE: ${{ matrix.config.image }} R_TAG: ${{ matrix.config.tag }} - DEVTOOLSET_VERSION: ${{ matrix.config.devtoolset }} steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 @@ -203,7 +204,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -369,11 +370,12 @@ jobs: MAKEFLAGS = paste0("-j", parallel::detectCores()), ARROW_R_DEV = TRUE, "_R_CHECK_FORCE_SUGGESTS_" = FALSE, - "_R_CHECK_STOP_ON_INVALID_NUMERIC_VERSION_INPUTS_" = TRUE + "_R_CHECK_STOP_ON_INVALID_NUMERIC_VERSION_INPUTS_" = TRUE, + "_R_CHECK_DONTTEST_EXAMPLES_" = TRUE ) rcmdcheck::rcmdcheck(".", build_args = '--no-build-vignettes', - args = c('--no-manual', '--as-cran', '--ignore-vignettes', '--run-donttest'), + args = c('--no-manual', '--as-cran', '--ignore-vignettes'), error_on = 'warning', check_dir = 'check', timeout = 3600 diff --git a/.github/workflows/r_nightly.yml b/.github/workflows/r_nightly.yml index 6629b5c8a5673..af5382f90834c 100644 --- a/.github/workflows/r_nightly.yml +++ b/.github/workflows/r_nightly.yml @@ -60,7 +60,7 @@ jobs: repository: ursacomputing/crossbow ref: main - name: Set up Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: cache: 'pip' python-version: 3.12 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000000000..8d54979502430 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Release + +on: + push: + tags: + # Trigger workflow when a tag whose name matches the pattern + # "apache-arrow-{MAJOR}.{MINOR}.{PATCH}" is pushed. + - "apache-arrow-[0-9]+.[0-9]+.[0-9]+" + +permissions: + contents: write + +env: + GH_TOKEN: ${{ github.token }} + +jobs: + publish: + name: Publish + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - name: Get Tag Name of Latest Release Candidate + run: | + rc_tag=$(gh release list --repo apache/arrow | \ + cut -f3 | \ + grep -F "${GITHUB_REF_NAME}-rc" | \ + head -n1) + echo "Latest Release Candidate Tag: ${rc_tag}" + echo "RELEASE_CANDIDATE_TAG_NAME=${rc_tag}" >> ${GITHUB_ENV} + - name: Store Version and Release Candidate Number + run: | + version_with_rc=${RELEASE_CANDIDATE_TAG_NAME#apache-arrow-} + version=${version_with_rc%-rc*} + rc_num=${version_with_rc#${version}-rc} + echo "VERSION_WITH_RC=${version_with_rc}" >> ${GITHUB_ENV} + echo "VERSION=${version}" >> ${GITHUB_ENV} + echo "RC_NUM=${rc_num}" >> ${GITHUB_ENV} + - name: Download Release Candidate Artifacts + run: | + mkdir release_candidate_artifacts + gh release download ${RELEASE_CANDIDATE_TAG_NAME} --repo apache/arrow --dir release_candidate_artifacts + - name: Create Release Title + run: | + title="Apache Arrow ${VERSION}" + echo "RELEASE_TITLE=${title}" 
>> ${GITHUB_ENV} + # Set the release notes to "TODO" temporarily. After the release notes page + # (https://arrow.apache.org/release/{VERSION}.html) is published, use + # gh release edit to update the release notes to refer to the newly + # pushed web page. See dev/post/post-05-update-gh-release-notes.sh + - name: Create GitHub Release + run: | + gh release create ${GITHUB_REF_NAME} \ + --repo apache/arrow \ + --verify-tag \ + --title "${RELEASE_TITLE}" \ + --notes "TODO" \ + release_candidate_artifacts/* \ No newline at end of file diff --git a/.github/workflows/release_candidate.yml b/.github/workflows/release_candidate.yml new file mode 100644 index 0000000000000..ec732f0eb33e0 --- /dev/null +++ b/.github/workflows/release_candidate.yml @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Release + +on: + push: + tags: + # Trigger workflow when a tag whose name matches the pattern + # "apache-arrow-{MAJOR}.{MINOR}.{PATCH}-rc{RC_NUM}" is pushed. 
+ - "apache-arrow-[0-9]+.[0-9]+.[0-9]+-rc[0-9]+" + +permissions: + contents: write + +env: + GH_TOKEN: ${{ github.token }} + +jobs: + publish: + name: Publish + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - name: Checkout Arrow + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Store Version and Release Candidate Number + run: | + version_with_rc=${GITHUB_REF_NAME#apache-arrow-} + version=${version_with_rc%-rc*} + rc_num=${version_with_rc#${version}-rc} + echo "VERSION_WITH_RC=${version_with_rc}" >> ${GITHUB_ENV} + echo "VERSION=${version}" >> ${GITHUB_ENV} + echo "RC_NUM=${rc_num}" >> ${GITHUB_ENV} + - name: Create Release Candidate Title + run: | + title="Apache Arrow ${VERSION} RC${RC_NUM}" + echo "RELEASE_CANDIDATE_TITLE=${title}" >> ${GITHUB_ENV} + - name: Create Release Candidate Notes + run: | + release_notes="Release Candidate: ${VERSION} RC${RC_NUM}" + echo "RELEASE_CANDIDATE_NOTES=${release_notes}" >> ${GITHUB_ENV} + - name: Create Release tarball + run: | + cd dev/release/ && ./utils-create-release-tarball.sh ${VERSION} ${RC_NUM} + echo "RELEASE_TARBALL=apache-arrow-${VERSION}.tar.gz" >> ${GITHUB_ENV} + - name: Create GitHub Release + run: | + gh release create ${GITHUB_REF_NAME} \ + --verify-tag \ + --prerelease \ + --title "${RELEASE_CANDIDATE_TITLE}" \ + --notes "Release Notes: ${RELEASE_CANDIDATE_NOTES}" \ + dev/release/${RELEASE_TARBALL} diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index 74d56895f4c34..6a29ec8e72cab 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -53,6 +53,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: @@ -82,7 +84,7 @@ jobs: key: ubuntu-${{ matrix.ubuntu }}-ruby-${{ hashFiles('cpp/**') }} restore-keys: ubuntu-${{ matrix.ubuntu }}-ruby- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: 
actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -115,7 +117,7 @@ jobs: run: archery docker push ubuntu-ruby macos: - name: AMD64 macOS 12 GLib & Ruby + name: AMD64 macOS 14 GLib & Ruby runs-on: macos-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 60 @@ -130,7 +132,7 @@ jobs: ARROW_GCS: ON ARROW_GLIB_GTK_DOC: true ARROW_GLIB_WERROR: true - ARROW_HOME: /usr/local + ARROW_HOME: /tmp/local ARROW_JEMALLOC: OFF ARROW_ORC: OFF ARROW_PARQUET: ON @@ -139,7 +141,6 @@ jobs: ARROW_WITH_SNAPPY: ON ARROW_WITH_ZLIB: ON ARROW_WITH_ZSTD: ON - XML_CATALOG_FILES: /usr/local/etc/xml/catalog steps: - name: Checkout Arrow uses: actions/checkout@v4 @@ -185,7 +186,7 @@ jobs: shell: bash run: ci/scripts/ruby_test.sh $(pwd) $(pwd)/build - windows: + windows-mingw: name: AMD64 Windows MinGW ${{ matrix.mingw-n-bits }} GLib & Ruby runs-on: windows-2019 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} @@ -196,9 +197,7 @@ jobs: mingw-n-bits: - 64 ruby-version: - # TODO: Use the latest Ruby again when we fix GH-39130. 
- # - ruby - - "3.1" + - ruby env: ARROW_BUILD_STATIC: OFF ARROW_BUILD_TESTS: OFF @@ -266,7 +265,6 @@ jobs: ridk exec bash ci\scripts\cpp_build.sh "${source_dir}" "${build_dir}" - name: Build GLib run: | - $Env:CMAKE_BUILD_PARALLEL_LEVEL = $Env:NUMBER_OF_PROCESSORS $source_dir = "$(ridk exec cygpath --unix "$(Get-Location)")" $build_dir = "$(ridk exec cygpath --unix "$(Get-Location)\build")" $ErrorActionPreference = "Continue" @@ -304,3 +302,119 @@ jobs: $Env:MAKE = "ridk exec make" $ErrorActionPreference = "Continue" rake -f ruby\Rakefile + + windows-msvc: + name: AMD64 Windows MSVC GLib + runs-on: windows-2019 + if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 90 + strategy: + fail-fast: false + env: + ARROW_ACERO: ON + ARROW_BOOST_USE_SHARED: OFF + ARROW_BUILD_BENCHMARKS: OFF + ARROW_BUILD_SHARED: ON + ARROW_BUILD_STATIC: OFF + ARROW_BUILD_TESTS: OFF + ARROW_DATASET: ON + ARROW_DEPENDENCY_SOURCE: VCPKG + ARROW_DEPENDENCY_USE_SHARED: OFF + ARROW_FLIGHT: ON + ARROW_FLIGHT_SQL: ON + ARROW_GANDIVA: OFF + ARROW_GLIB_VAPI: "false" + ARROW_HDFS: OFF + ARROW_HOME: "${{ github.workspace }}/dist" + ARROW_JEMALLOC: OFF + ARROW_MIMALLOC: ON + ARROW_ORC: OFF + ARROW_PARQUET: ON + ARROW_SUBSTRAIT: OFF + ARROW_USE_GLOG: OFF + ARROW_VERBOSE_THIRDPARTY_BUILD: OFF + ARROW_WITH_BROTLI: OFF + ARROW_WITH_BZ2: OFF + ARROW_WITH_LZ4: OFF + ARROW_WITH_OPENTELEMETRY: OFF + ARROW_WITH_SNAPPY: ON + ARROW_WITH_ZLIB: ON + ARROW_WITH_ZSTD: ON + CMAKE_CXX_STANDARD: "17" + CMAKE_GENERATOR: Ninja + CMAKE_INSTALL_PREFIX: "${{ github.workspace }}/dist" + CMAKE_UNITY_BUILD: ON + VCPKG_BINARY_SOURCES: 'clear;nuget,GitHub,readwrite' + VCPKG_ROOT: "${{ github.workspace }}/vcpkg" + VCPKG_TRIPLET: x64-windows + permissions: + packages: write + steps: + - name: Disable Crash Dialogs + run: | + reg add ` + "HKCU\SOFTWARE\Microsoft\Windows\Windows Error Reporting" ` + /v DontShowUI ` + /t REG_DWORD ` + /d 1 ` + /f + - name: Checkout Arrow + uses: actions/checkout@v4 + with: + 
fetch-depth: 0 + submodules: recursive + - name: Install vcpkg + shell: bash + run: | + ci/scripts/install_vcpkg.sh "${VCPKG_ROOT}" + - name: Install meson + run: | + python -m pip install meson + - name: Install ccache + shell: bash + run: | + ci/scripts/install_ccache.sh 4.6.3 /usr + - name: Setup ccache + shell: bash + run: | + ci/scripts/ccache_setup.sh + - name: ccache info + id: ccache-info + shell: bash + run: | + echo "cache-dir=$(ccache --get-config cache_dir)" >> $GITHUB_OUTPUT + - name: Cache ccache + uses: actions/cache@v4 + with: + path: ${{ steps.ccache-info.outputs.cache-dir }} + key: glib-ccache-msvc-${{ env.CACHE_VERSION }}-${{ hashFiles('cpp/**') }} + restore-keys: glib-ccache-msvc-${{ env.CACHE_VERSION }}- + env: + # We can invalidate the current cache by updating this. + CACHE_VERSION: "2024-05-09" + - name: Setup NuGet credentials for vcpkg caching + shell: bash + run: | + $(vcpkg/vcpkg.exe fetch nuget | tail -n 1) \ + sources add \ + -source "https://nuget.pkg.github.com/$GITHUB_REPOSITORY_OWNER/index.json" \ + -storepasswordincleartext \ + -name "GitHub" \ + -username "$GITHUB_REPOSITORY_OWNER" \ + -password "${{ secrets.GITHUB_TOKEN }}" + $(vcpkg/vcpkg.exe fetch nuget | tail -n 1) \ + setapikey "${{ secrets.GITHUB_TOKEN }}" \ + -source "https://nuget.pkg.github.com/$GITHUB_REPOSITORY_OWNER/index.json" + - name: Build C++ vcpkg dependencies + run: | + vcpkg\vcpkg.exe install --triplet $env:VCPKG_TRIPLET --x-manifest-root cpp --x-install-root build\cpp\vcpkg_installed + - name: Build C++ + shell: cmd + run: | + call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + bash -c "ci/scripts/cpp_build.sh $(pwd) $(pwd)/build" + - name: Build GLib + shell: cmd + run: | + call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + bash -c "ci/scripts/c_glib_build.sh $(pwd) $(pwd)/build" diff --git a/.github/workflows/swift.yml 
b/.github/workflows/swift.yml index f55e9e77503c0..3f039315b505a 100644 --- a/.github/workflows/swift.yml +++ b/.github/workflows/swift.yml @@ -41,6 +41,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: diff --git a/.gitignore b/.gitignore index c7f5aa90e18e6..52ffa6c6124c2 100644 --- a/.gitignore +++ b/.gitignore @@ -102,4 +102,9 @@ __debug_bin .envrc # Develocity -.mvn/.gradle-enterprise/ +java/.mvn/.gradle-enterprise/ +java/.mvn/.develocity/ + +# rat +filtered_rat.txt +rat.txt diff --git a/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye-arm64/from b/.golangci.yaml similarity index 77% rename from dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye-arm64/from rename to .golangci.yaml index 34187b2af5a74..7d486a9e85a0a 100644 --- a/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye-arm64/from +++ b/.golangci.yaml @@ -15,4 +15,15 @@ # specific language governing permissions and limitations # under the License. -arm64v8/debian:bullseye +linters: + # Disable all linters. + # Default: false + disable-all: true + # Enable specific linter + # https://golangci-lint.run/usage/linters/#enabled-by-default + enable: + - gofmt + - goimports + +issues: + fix: true \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2e598e0a95064..9bdd4f487bdec 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -34,13 +34,19 @@ repos: hooks: - id: hadolint-docker name: Docker Format - exclude: ^dev/.*$ + # We can enable this after we fix all existing lint failures. 
+ # files: (/Dockerfile|\.dockerfile)$ + files: >- + ( + ?^ci/docker/python-wheel-windows-test-vs2019\.dockerfile$| + ) + types: [] - repo: https://github.com/pycqa/flake8 rev: 6.1.0 hooks: - id: flake8 name: Python Format - files: ^(python|dev|integration)/ + files: ^(python|dev|c_glib|integration)/ types: - file - python @@ -116,19 +122,42 @@ repos: name: CMake Format files: >- ( + ?.*CMakeLists\.txt$| ?^ci/.*/.*\.cmake$| ?^cpp/.*/.*\.cmake\.in$| ?^cpp/.*/.*\.cmake$| - ?^cpp/.*/CMakeLists\.txt$| - ?^go/.*/CMakeLists\.txt$| - ?^java/.*/CMakeLists\.txt$| - ?^matlab/.*/CMakeLists\.txt$| - ?^python/.*/CMakeLists\.txt$| ) exclude: >- ( + ?^ci/conan/all/.*CMakeLists\.txt$| ?^cpp/cmake_modules/FindNumPy\.cmake$| ?^cpp/cmake_modules/FindPythonLibsNew\.cmake$| ?^cpp/cmake_modules/UseCython\.cmake$| - ?^cpp/src/arrow/util/config\.h\.cmake$| + ?^cpp/src/arrow/util/.*\.h\.cmake$| ) + - repo: https://github.com/sphinx-contrib/sphinx-lint + rev: v0.9.1 + hooks: + - id: sphinx-lint + files: ^docs/source + exclude: ^docs/source/python/generated + args: [ + '--enable', + 'all', + '--disable', + 'dangling-hyphen,line-too-long', + ] + - repo: https://github.com/golangci/golangci-lint + rev: v1.59.0 + hooks: + # no built-in support for multiple go.mod + # https://github.com/golangci/golangci-lint/issues/828 + - id: golangci-lint-full + name: golangci-lint-full-arrow + entry: bash -c 'cd go/arrow && golangci-lint run' + - id: golangci-lint-full + name: golangci-lint-full-parquet + entry: bash -c 'cd go/parquet && golangci-lint run' + - id: golangci-lint-full + name: golangci-lint-full-internal + entry: bash -c 'cd go/internal && golangci-lint run' diff --git a/LICENSE.txt b/LICENSE.txt index 0423854567b26..7bb1330a1002b 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -2252,3 +2252,10 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
OR OTHER DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- +java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectHashMap.java +java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectMap.java + +These file are derived from code from Netty, which is made available under the +Apache License 2.0. diff --git a/c_glib/arrow-cuda-glib/arrow-cuda-glib.h b/c_glib/arrow-cuda-glib/arrow-cuda-glib.h index b3c7f21087669..572ff92ed9b43 100644 --- a/c_glib/arrow-cuda-glib/arrow-cuda-glib.h +++ b/c_glib/arrow-cuda-glib/arrow-cuda-glib.h @@ -21,4 +21,6 @@ #include +#include + #include diff --git a/c_glib/arrow-cuda-glib/cuda.h b/c_glib/arrow-cuda-glib/cuda.h index 863743a620bf8..f04a3381259bb 100644 --- a/c_glib/arrow-cuda-glib/cuda.h +++ b/c_glib/arrow-cuda-glib/cuda.h @@ -21,9 +21,12 @@ #include +#include + G_BEGIN_DECLS #define GARROW_CUDA_TYPE_DEVICE_MANAGER (garrow_cuda_device_manager_get_type()) +GARROW_CUDA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GArrowCUDADeviceManager, garrow_cuda_device_manager, GARROW_CUDA, @@ -35,6 +38,7 @@ struct _GArrowCUDADeviceManagerClass }; #define GARROW_CUDA_TYPE_CONTEXT (garrow_cuda_context_get_type()) +GARROW_CUDA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE( GArrowCUDAContext, garrow_cuda_context, GARROW_CUDA, CONTEXT, GObject) struct _GArrowCUDAContextClass @@ -43,6 +47,7 @@ struct _GArrowCUDAContextClass }; #define GARROW_CUDA_TYPE_BUFFER (garrow_cuda_buffer_get_type()) +GARROW_CUDA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE( GArrowCUDABuffer, garrow_cuda_buffer, GARROW_CUDA, BUFFER, GArrowBuffer) struct _GArrowCUDABufferClass @@ -51,6 +56,7 @@ struct _GArrowCUDABufferClass }; #define GARROW_CUDA_TYPE_HOST_BUFFER (garrow_cuda_host_buffer_get_type()) +GARROW_CUDA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GArrowCUDAHostBuffer, garrow_cuda_host_buffer, GARROW_CUDA, @@ -62,6 +68,7 @@ struct _GArrowCUDAHostBufferClass }; #define GARROW_CUDA_TYPE_IPC_MEMORY_HANDLE 
(garrow_cuda_ipc_memory_handle_get_type()) +GARROW_CUDA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GArrowCUDAIPCMemoryHandle, garrow_cuda_ipc_memory_handle, GARROW_CUDA, @@ -73,6 +80,7 @@ struct _GArrowCUDAIPCMemoryHandleClass }; #define GARROW_CUDA_TYPE_BUFFER_INPUT_STREAM (garrow_cuda_buffer_input_stream_get_type()) +GARROW_CUDA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GArrowCUDABufferInputStream, garrow_cuda_buffer_input_stream, GARROW_CUDA, @@ -85,6 +93,7 @@ struct _GArrowCUDABufferInputStreamClass #define GARROW_CUDA_TYPE_BUFFER_OUTPUT_STREAM \ (garrow_cuda_buffer_output_stream_get_type()) +GARROW_CUDA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GArrowCUDABufferOutputStream, garrow_cuda_buffer_output_stream, GARROW_CUDA, @@ -95,71 +104,100 @@ struct _GArrowCUDABufferOutputStreamClass GArrowOutputStreamClass parent_class; }; +GARROW_CUDA_AVAILABLE_IN_0_12 GArrowCUDADeviceManager * garrow_cuda_device_manager_new(GError **error); +GARROW_CUDA_AVAILABLE_IN_0_12 GArrowCUDAContext * garrow_cuda_device_manager_get_context(GArrowCUDADeviceManager *manager, gint gpu_number, GError **error); + +GARROW_CUDA_AVAILABLE_IN_0_12 gsize garrow_cuda_device_manager_get_n_devices(GArrowCUDADeviceManager *manager); +GARROW_CUDA_AVAILABLE_IN_0_12 gint64 garrow_cuda_context_get_allocated_size(GArrowCUDAContext *context); +GARROW_CUDA_AVAILABLE_IN_0_12 GArrowCUDABuffer * garrow_cuda_buffer_new(GArrowCUDAContext *context, gint64 size, GError **error); + +GARROW_CUDA_AVAILABLE_IN_0_12 GArrowCUDABuffer * garrow_cuda_buffer_new_ipc(GArrowCUDAContext *context, GArrowCUDAIPCMemoryHandle *handle, GError **error); + +GARROW_CUDA_AVAILABLE_IN_0_12 GArrowCUDABuffer * garrow_cuda_buffer_new_record_batch(GArrowCUDAContext *context, GArrowRecordBatch *record_batch, GError **error); + +GARROW_CUDA_AVAILABLE_IN_0_12 GBytes * garrow_cuda_buffer_copy_to_host(GArrowCUDABuffer *buffer, gint64 position, gint64 size, GError **error); + +GARROW_CUDA_AVAILABLE_IN_0_12 gboolean 
garrow_cuda_buffer_copy_from_host(GArrowCUDABuffer *buffer, const guint8 *data, gint64 size, GError **error); + +GARROW_CUDA_AVAILABLE_IN_0_12 GArrowCUDAIPCMemoryHandle * garrow_cuda_buffer_export(GArrowCUDABuffer *buffer, GError **error); + +GARROW_CUDA_AVAILABLE_IN_0_12 GArrowCUDAContext * garrow_cuda_buffer_get_context(GArrowCUDABuffer *buffer); + +GARROW_CUDA_AVAILABLE_IN_0_12 GArrowRecordBatch * garrow_cuda_buffer_read_record_batch(GArrowCUDABuffer *buffer, GArrowSchema *schema, GArrowReadOptions *options, GError **error); +GARROW_CUDA_AVAILABLE_IN_0_12 GArrowCUDAHostBuffer * garrow_cuda_host_buffer_new(gint gpu_number, gint64 size, GError **error); +GARROW_CUDA_AVAILABLE_IN_0_12 GArrowCUDAIPCMemoryHandle * garrow_cuda_ipc_memory_handle_new(const guint8 *data, gsize size, GError **error); +GARROW_CUDA_AVAILABLE_IN_0_12 GArrowBuffer * garrow_cuda_ipc_memory_handle_serialize(GArrowCUDAIPCMemoryHandle *handle, GError **error); +GARROW_CUDA_AVAILABLE_IN_0_12 GArrowCUDABufferInputStream * garrow_cuda_buffer_input_stream_new(GArrowCUDABuffer *buffer); +GARROW_CUDA_AVAILABLE_IN_0_12 GArrowCUDABufferOutputStream * garrow_cuda_buffer_output_stream_new(GArrowCUDABuffer *buffer); +GARROW_CUDA_AVAILABLE_IN_0_12 gboolean garrow_cuda_buffer_output_stream_set_buffer_size(GArrowCUDABufferOutputStream *stream, gint64 size, GError **error); +GARROW_CUDA_AVAILABLE_IN_0_12 gint64 garrow_cuda_buffer_output_stream_get_buffer_size(GArrowCUDABufferOutputStream *stream); + +GARROW_CUDA_AVAILABLE_IN_0_12 gint64 garrow_cuda_buffer_output_stream_get_buffered_size(GArrowCUDABufferOutputStream *stream); diff --git a/c_glib/arrow-cuda-glib/meson.build b/c_glib/arrow-cuda-glib/meson.build index 88029e6dc2073..47bed70f03b60 100644 --- a/c_glib/arrow-cuda-glib/meson.build +++ b/c_glib/arrow-cuda-glib/meson.build @@ -31,10 +31,17 @@ cpp_headers = files( 'cuda.hpp', ) +version_h = configure_file( + input: 'version.h.in', + output: 'version.h', + command: [python3, generate_version_header_py, 
'--library', 'GARROW_CUDA', '--version', version, '--input', '@INPUT@', '--output', '@OUTPUT@'], +) + +c_headers += version_h + headers = c_headers + cpp_headers install_headers(headers, subdir: 'arrow-cuda-glib') - dependencies = [ arrow_cuda, arrow_glib, @@ -45,6 +52,7 @@ libarrow_cuda_glib = library('arrow-cuda-glib', dependencies: dependencies, implicit_include_directories: false, include_directories: base_include_directories, + cpp_args: ['-DGARROW_CUDA_COMPILATION'], soversion: so_version, version: library_version) arrow_cuda_glib = declare_dependency(link_with: libarrow_cuda_glib, diff --git a/c_glib/arrow-cuda-glib/version.h.in b/c_glib/arrow-cuda-glib/version.h.in new file mode 100644 index 0000000000000..0ab5bfd562b41 --- /dev/null +++ b/c_glib/arrow-cuda-glib/version.h.in @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +/** + * SECTION: version + * @section_id: version-macros + * @title: Version related macros + * @include: arrow-cuda-glib/arrow-cuda-glib.h + * + * Apache Arrow CUDA GLib provides macros that can be used by C pre-processor. + * They are useful to check version related things at compile time. 
+ */ + +/** + * GARROW_CUDA_VERSION_MAJOR: + * + * The major version. + * + * Since: 17.0.0 + */ +#define GARROW_CUDA_VERSION_MAJOR (@VERSION_MAJOR@) + +/** + * GARROW_CUDA_VERSION_MINOR: + * + * The minor version. + * + * Since: 17.0.0 + */ +#define GARROW_CUDA_VERSION_MINOR (@VERSION_MINOR@) + +/** + * GARROW_CUDA_VERSION_MICRO: + * + * The micro version. + * + * Since: 17.0.0 + */ +#define GARROW_CUDA_VERSION_MICRO (@VERSION_MICRO@) + +/** + * GARROW_CUDA_VERSION_TAG: + * + * The version tag. Normally, it's an empty string. It's "SNAPSHOT" + * for snapshot version. + * + * Since: 17.0.0 + */ +#define GARROW_CUDA_VERSION_TAG "@VERSION_TAG@" + +/** + * GARROW_CUDA_VERSION_CHECK: + * @major: A major version to check for. + * @minor: A minor version to check for. + * @micro: A micro version to check for. + * + * You can use this macro in C pre-processor. + * + * Returns: %TRUE if the compile time Apache Arrow GLib version is the + * same as or newer than the passed version, %FALSE otherwise. + * + * Since: 17.0.0 + */ +#define GARROW_CUDA_VERSION_CHECK(major, minor, micro) \ + (GARROW_CUDA_VERSION_MAJOR > (major) || \ + (GARROW_CUDA_VERSION_MAJOR == (major) && \ + GARROW_CUDA_VERSION_MINOR > (minor)) || \ + (GARROW_CUDA_VERSION_MAJOR == (major) && \ + GARROW_CUDA_VERSION_MINOR == (minor) && \ + GARROW_CUDA_VERSION_MICRO >= (micro))) + +/** + * GARROW_CUDA_DISABLE_DEPRECATION_WARNINGS: + * + * If this macro is defined, no deprecated warnings are produced. + * + * You must define this macro before including the + * arrow-glib/arrow-glib.h header. 
+ * + * Since: 17.0.0 + */ + +#ifdef GARROW_CUDA_DISABLE_DEPRECATION_WARNINGS +# define GARROW_CUDA_DEPRECATED +# define GARROW_CUDA_DEPRECATED_FOR(function) +# define GARROW_CUDA_UNAVAILABLE(major, minor) +#else +# define GARROW_CUDA_DEPRECATED G_DEPRECATED +# define GARROW_CUDA_DEPRECATED_FOR(function) G_DEPRECATED_FOR(function) +# define GARROW_CUDA_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) +#endif + +@ENCODED_VERSIONS@ + +/** + * GARROW_CUDA_VERSION_MIN_REQUIRED: + * + * You can use this macro for compile time API version check. + * + * This macro value must be one of the predefined version macros such + * as %GARROW_CUDA_VERSION_0_10. + * + * If you use any functions that is defined by newer version than + * %GARROW_CUDA_VERSION_MIN_REQUIRED, deprecated warnings are produced at + * compile time. + * + * You must define this macro before including the + * arrow-cuda-glib/arrow-cuda-glib.h header. + * + * Since: 17.0.0 + */ +#ifndef GARROW_CUDA_VERSION_MIN_REQUIRED +# define GARROW_CUDA_VERSION_MIN_REQUIRED GARROW_VERSION_MIN_REQUIRED +#endif + +/** + * GARROW_CUDA_VERSION_MAX_ALLOWED: + * + * You can use this macro for compile time API version check. + * + * This macro value must be one of the predefined version macros such + * as %GARROW_CUDA_VERSION_0_10. + * + * If you use any functions that is defined by newer version than + * %GARROW_CUDA_VERSION_MAX_ALLOWED, deprecated warnings are produced at + * compile time. + * + * You must define this macro before including the + * arrow-cuda-glib/arrow-cuda-glib.h header. 
+ * + * Since: 17.0.0 + */ +#ifndef GARROW_CUDA_VERSION_MAX_ALLOWED +# define GARROW_CUDA_VERSION_MAX_ALLOWED GARROW_VERSION_MAX_ALLOWED +#endif + +@VISIBILITY_MACROS@ + +@AVAILABILITY_MACROS@ diff --git a/c_glib/arrow-dataset-glib/arrow-dataset-glib.h b/c_glib/arrow-dataset-glib/arrow-dataset-glib.h index 58f4e216cc715..7ebf36ddd2b78 100644 --- a/c_glib/arrow-dataset-glib/arrow-dataset-glib.h +++ b/c_glib/arrow-dataset-glib/arrow-dataset-glib.h @@ -21,6 +21,8 @@ #include +#include + #include #include #include diff --git a/c_glib/arrow-dataset-glib/dataset-definition.h b/c_glib/arrow-dataset-glib/dataset-definition.h index f278b05a135f5..bc52d6d3663a3 100644 --- a/c_glib/arrow-dataset-glib/dataset-definition.h +++ b/c_glib/arrow-dataset-glib/dataset-definition.h @@ -21,9 +21,12 @@ #include +#include + G_BEGIN_DECLS #define GADATASET_TYPE_DATASET (gadataset_dataset_get_type()) +GADATASET_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GADatasetDataset, gadataset_dataset, GADATASET, DATASET, GObject) struct _GADatasetDatasetClass { diff --git a/c_glib/arrow-dataset-glib/dataset-factory.h b/c_glib/arrow-dataset-glib/dataset-factory.h index 1dc875837fe21..e7d3bc27aea8f 100644 --- a/c_glib/arrow-dataset-glib/dataset-factory.h +++ b/c_glib/arrow-dataset-glib/dataset-factory.h @@ -24,6 +24,7 @@ G_BEGIN_DECLS #define GADATASET_TYPE_FINISH_OPTIONS (gadataset_finish_options_get_type()) +GADATASET_AVAILABLE_IN_11_0 G_DECLARE_DERIVABLE_TYPE( GADatasetFinishOptions, gadataset_finish_options, GADATASET, FINISH_OPTIONS, GObject) struct _GADatasetFinishOptionsClass @@ -31,11 +32,12 @@ struct _GADatasetFinishOptionsClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_11_0 +GADATASET_AVAILABLE_IN_11_0 GADatasetFinishOptions * gadataset_finish_options_new(void); #define GADATASET_TYPE_DATASET_FACTORY (gadataset_dataset_factory_get_type()) +GADATASET_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GADatasetDatasetFactory, gadataset_dataset_factory, GADATASET, DATASET_FACTORY, GObject) struct 
_GADatasetDatasetFactoryClass @@ -43,7 +45,7 @@ struct _GADatasetDatasetFactoryClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_5_0 +GADATASET_AVAILABLE_IN_5_0 GADatasetDataset * gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory, GADatasetFinishOptions *options, @@ -51,6 +53,7 @@ gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory, #define GADATASET_TYPE_FILE_SYSTEM_DATASET_FACTORY \ (gadataset_file_system_dataset_factory_get_type()) +GADATASET_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GADatasetFileSystemDatasetFactory, gadataset_file_system_dataset_factory, GADATASET, @@ -61,32 +64,33 @@ struct _GADatasetFileSystemDatasetFactoryClass GADatasetDatasetFactoryClass parent_class; }; -GARROW_AVAILABLE_IN_5_0 +GADATASET_AVAILABLE_IN_5_0 GADatasetFileSystemDatasetFactory * gadataset_file_system_dataset_factory_new(GADatasetFileFormat *file_format); -GARROW_AVAILABLE_IN_5_0 +GADATASET_AVAILABLE_IN_5_0 gboolean gadataset_file_system_dataset_factory_set_file_system( GADatasetFileSystemDatasetFactory *factory, GArrowFileSystem *file_system, GError **error); +GADATASET_AVAILABLE_IN_5_0 gboolean gadataset_file_system_dataset_factory_set_file_system_uri( GADatasetFileSystemDatasetFactory *factory, const gchar *uri, GError **error); -GARROW_AVAILABLE_IN_5_0 +GADATASET_AVAILABLE_IN_5_0 gboolean gadataset_file_system_dataset_factory_add_path(GADatasetFileSystemDatasetFactory *factory, const gchar *path, GError **error); /* -GARROW_AVAILABLE_IN_5_0 +GADATASET_AVAILABLE_IN_5_0 gboolean gadataset_file_system_dataset_factory_add_file( GADatasetFileSystemDatasetFactory *factory, GArrowFileInfo *file, GError **error); -GARROW_AVAILABLE_IN_5_0 +GADATASET_AVAILABLE_IN_5_0 gboolean gadataset_file_system_dataset_factory_add_selector( GADatasetFileSystemDatasetFactory *factory, @@ -94,7 +98,7 @@ gadataset_file_system_dataset_factory_add_selector( GError **error); */ -GARROW_AVAILABLE_IN_5_0 +GADATASET_AVAILABLE_IN_5_0 GADatasetFileSystemDataset * 
gadataset_file_system_dataset_factory_finish(GADatasetFileSystemDatasetFactory *factory, GADatasetFinishOptions *options, diff --git a/c_glib/arrow-dataset-glib/dataset.cpp b/c_glib/arrow-dataset-glib/dataset.cpp index 704d6b589ee94..f84e4e3db380a 100644 --- a/c_glib/arrow-dataset-glib/dataset.cpp +++ b/c_glib/arrow-dataset-glib/dataset.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -152,12 +153,46 @@ gadataset_dataset_to_table(GADatasetDataset *dataset, GError **error) } auto arrow_scanner = *arrow_scanner_result; auto arrow_table_result = arrow_scanner->ToTable(); - if (!garrow::check(error, arrow_scanner_result, "[dataset][to-table]")) { + if (!garrow::check(error, arrow_table_result, "[dataset][to-table]")) { return NULL; } return garrow_table_new_raw(&(*arrow_table_result)); } +/** + * gadataset_dataset_to_record_batch_reader: + * @dataset: A #GADatasetDataset. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A #GArrowRecordBatchReader on success, %NULL on error. 
+ * + * Since: 17.0.0 + */ +GArrowRecordBatchReader * +gadataset_dataset_to_record_batch_reader(GADatasetDataset *dataset, GError **error) +{ + auto arrow_dataset = gadataset_dataset_get_raw(dataset); + auto arrow_scanner_builder_result = arrow_dataset->NewScan(); + if (!garrow::check(error, + arrow_scanner_builder_result, + "[dataset][to-record-batch-reader]")) { + return nullptr; + } + auto arrow_scanner_builder = *arrow_scanner_builder_result; + auto arrow_scanner_result = arrow_scanner_builder->Finish(); + if (!garrow::check(error, arrow_scanner_result, "[dataset][to-record-batch-reader]")) { + return nullptr; + } + auto arrow_scanner = *arrow_scanner_result; + auto arrow_reader_result = arrow_scanner->ToRecordBatchReader(); + if (!garrow::check(error, arrow_reader_result, "[dataset][to-record-batch-reader]")) { + return nullptr; + } + auto sources = g_list_prepend(nullptr, dataset); + return garrow_record_batch_reader_new_raw(&(*arrow_reader_result), sources); +} + /** * gadataset_dataset_get_type_name: * @dataset: A #GADatasetDataset. 
diff --git a/c_glib/arrow-dataset-glib/dataset.h b/c_glib/arrow-dataset-glib/dataset.h index 57f6c7729f073..5b957f0538a2a 100644 --- a/c_glib/arrow-dataset-glib/dataset.h +++ b/c_glib/arrow-dataset-glib/dataset.h @@ -25,18 +25,22 @@ G_BEGIN_DECLS -GARROW_AVAILABLE_IN_5_0 +GADATASET_AVAILABLE_IN_5_0 GADatasetScannerBuilder * gadataset_dataset_begin_scan(GADatasetDataset *dataset, GError **error); -GARROW_AVAILABLE_IN_5_0 +GADATASET_AVAILABLE_IN_5_0 GArrowTable * gadataset_dataset_to_table(GADatasetDataset *dataset, GError **error); -GARROW_AVAILABLE_IN_5_0 +GADATASET_AVAILABLE_IN_5_0 gchar * gadataset_dataset_get_type_name(GADatasetDataset *dataset); +GADATASET_AVAILABLE_IN_17_0 +GArrowRecordBatchReader * +gadataset_dataset_to_record_batch_reader(GADatasetDataset *dataset, GError **error); #define GADATASET_TYPE_FILE_SYSTEM_DATASET_WRITE_OPTIONS \ (gadataset_file_system_dataset_write_options_get_type()) +GADATASET_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE(GADatasetFileSystemDatasetWriteOptions, gadataset_file_system_dataset_write_options, GADATASET, @@ -47,11 +51,12 @@ struct _GADatasetFileSystemDatasetWriteOptionsClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_6_0 +GADATASET_AVAILABLE_IN_6_0 GADatasetFileSystemDatasetWriteOptions * gadataset_file_system_dataset_write_options_new(void); #define GADATASET_TYPE_FILE_SYSTEM_DATASET (gadataset_file_system_dataset_get_type()) +GADATASET_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GADatasetFileSystemDataset, gadataset_file_system_dataset, GADATASET, @@ -62,7 +67,7 @@ struct _GADatasetFileSystemDatasetClass GADatasetDatasetClass parent_class; }; -GARROW_AVAILABLE_IN_6_0 +GADATASET_AVAILABLE_IN_6_0 gboolean gadataset_file_system_dataset_write_scanner( GADatasetScanner *scanner, diff --git a/c_glib/arrow-dataset-glib/enums.h.template b/c_glib/arrow-dataset-glib/enums.h.template index b7d3c99c0bef8..8b89a8b031bdc 100644 --- a/c_glib/arrow-dataset-glib/enums.h.template +++ b/c_glib/arrow-dataset-glib/enums.h.template @@ 
-22,6 +22,8 @@ #include +#include + G_BEGIN_DECLS /*** END file-header ***/ @@ -31,6 +33,7 @@ G_BEGIN_DECLS /*** END file-production ***/ /*** BEGIN value-header ***/ +GADATASET_AVAILABLE_IN_ALL GType @enum_name@_get_type(void) G_GNUC_CONST; #define @ENUMPREFIX@_TYPE_@ENUMSHORT@ (@enum_name@_get_type()) /*** END value-header ***/ diff --git a/c_glib/arrow-dataset-glib/file-format.h b/c_glib/arrow-dataset-glib/file-format.h index 29487e59d70dd..f70523597e7c6 100644 --- a/c_glib/arrow-dataset-glib/file-format.h +++ b/c_glib/arrow-dataset-glib/file-format.h @@ -21,9 +21,12 @@ #include +#include + G_BEGIN_DECLS #define GADATASET_TYPE_FILE_WRITE_OPTIONS (gadataset_file_write_options_get_type()) +GADATASET_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE(GADatasetFileWriteOptions, gadataset_file_write_options, GADATASET, @@ -35,6 +38,7 @@ struct _GADatasetFileWriteOptionsClass }; #define GADATASET_TYPE_FILE_WRITER (gadataset_file_writer_get_type()) +GADATASET_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE( GADatasetFileWriter, gadataset_file_writer, GADATASET, FILE_WRITER, GObject) struct _GADatasetFileWriterClass @@ -42,21 +46,22 @@ struct _GADatasetFileWriterClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_6_0 +GADATASET_AVAILABLE_IN_6_0 gboolean gadataset_file_writer_write_record_batch(GADatasetFileWriter *writer, GArrowRecordBatch *record_batch, GError **error); -GARROW_AVAILABLE_IN_6_0 +GADATASET_AVAILABLE_IN_6_0 gboolean gadataset_file_writer_write_record_batch_reader(GADatasetFileWriter *writer, GArrowRecordBatchReader *reader, GError **error); -GARROW_AVAILABLE_IN_6_0 +GADATASET_AVAILABLE_IN_6_0 gboolean gadataset_file_writer_finish(GADatasetFileWriter *writer, GError **error); #define GADATASET_TYPE_FILE_FORMAT (gadataset_file_format_get_type()) +GADATASET_AVAILABLE_IN_3_0 G_DECLARE_DERIVABLE_TYPE( GADatasetFileFormat, gadataset_file_format, GADATASET, FILE_FORMAT, GObject) struct _GADatasetFileFormatClass @@ -64,13 +69,13 @@ struct _GADatasetFileFormatClass 
GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_3_0 +GADATASET_AVAILABLE_IN_3_0 gchar * gadataset_file_format_get_type_name(GADatasetFileFormat *format); -GARROW_AVAILABLE_IN_6_0 +GADATASET_AVAILABLE_IN_6_0 GADatasetFileWriteOptions * gadataset_file_format_get_default_write_options(GADatasetFileFormat *format); -GARROW_AVAILABLE_IN_6_0 +GADATASET_AVAILABLE_IN_6_0 GADatasetFileWriter * gadataset_file_format_open_writer(GADatasetFileFormat *format, GArrowOutputStream *destination, @@ -80,12 +85,13 @@ gadataset_file_format_open_writer(GADatasetFileFormat *format, GADatasetFileWriteOptions *options, GError **error); -GARROW_AVAILABLE_IN_3_0 +GADATASET_AVAILABLE_IN_3_0 gboolean gadataset_file_format_equal(GADatasetFileFormat *format, GADatasetFileFormat *other_format); #define GADATASET_TYPE_CSV_FILE_FORMAT (gadataset_csv_file_format_get_type()) +GADATASET_AVAILABLE_IN_3_0 G_DECLARE_DERIVABLE_TYPE(GADatasetCSVFileFormat, gadataset_csv_file_format, GADATASET, @@ -96,11 +102,12 @@ struct _GADatasetCSVFileFormatClass GADatasetFileFormatClass parent_class; }; -GARROW_AVAILABLE_IN_3_0 +GADATASET_AVAILABLE_IN_3_0 GADatasetCSVFileFormat * gadataset_csv_file_format_new(void); #define GADATASET_TYPE_IPC_FILE_FORMAT (gadataset_ipc_file_format_get_type()) +GADATASET_AVAILABLE_IN_3_0 G_DECLARE_DERIVABLE_TYPE(GADatasetIPCFileFormat, gadataset_ipc_file_format, GADATASET, @@ -111,11 +118,12 @@ struct _GADatasetIPCFileFormatClass GADatasetFileFormatClass parent_class; }; -GARROW_AVAILABLE_IN_3_0 +GADATASET_AVAILABLE_IN_3_0 GADatasetIPCFileFormat * gadataset_ipc_file_format_new(void); #define GADATASET_TYPE_PARQUET_FILE_FORMAT (gadataset_parquet_file_format_get_type()) +GADATASET_AVAILABLE_IN_3_0 G_DECLARE_DERIVABLE_TYPE(GADatasetParquetFileFormat, gadataset_parquet_file_format, GADATASET, @@ -126,7 +134,7 @@ struct _GADatasetParquetFileFormatClass GADatasetFileFormatClass parent_class; }; -GARROW_AVAILABLE_IN_3_0 +GADATASET_AVAILABLE_IN_3_0 GADatasetParquetFileFormat * 
gadataset_parquet_file_format_new(void); diff --git a/c_glib/arrow-dataset-glib/fragment.h b/c_glib/arrow-dataset-glib/fragment.h index 49acc360a3679..80eb9e19df3cc 100644 --- a/c_glib/arrow-dataset-glib/fragment.h +++ b/c_glib/arrow-dataset-glib/fragment.h @@ -21,11 +21,14 @@ #include +#include + G_BEGIN_DECLS /* arrow::dataset::Fragment */ #define GADATASET_TYPE_FRAGMENT (gadataset_fragment_get_type()) +GADATASET_AVAILABLE_IN_4_0 G_DECLARE_DERIVABLE_TYPE( GADatasetFragment, gadataset_fragment, GADATASET, FRAGMENT, GObject) struct _GADatasetFragmentClass @@ -36,6 +39,7 @@ struct _GADatasetFragmentClass /* arrow::dataset::InMemoryFragment */ #define GADATASET_TYPE_IN_MEMORY_FRAGMENT (gadataset_in_memory_fragment_get_type()) +GADATASET_AVAILABLE_IN_4_0 G_DECLARE_DERIVABLE_TYPE(GADatasetInMemoryFragment, gadataset_in_memory_fragment, GADATASET, @@ -46,7 +50,7 @@ struct _GADatasetInMemoryFragmentClass GADatasetFragmentClass parent_class; }; -GARROW_AVAILABLE_IN_4_0 +GADATASET_AVAILABLE_IN_4_0 GADatasetInMemoryFragment * gadataset_in_memory_fragment_new(GArrowSchema *schema, GArrowRecordBatch **record_batches, diff --git a/c_glib/arrow-dataset-glib/meson.build b/c_glib/arrow-dataset-glib/meson.build index 0c869a4183efa..3425efc5555c8 100644 --- a/c_glib/arrow-dataset-glib/meson.build +++ b/c_glib/arrow-dataset-glib/meson.build @@ -17,6 +17,8 @@ # specific language governing permissions and limitations # under the License. 
+project_name = 'arrow-dataset-glib' + sources = files( 'dataset-factory.cpp', 'dataset.cpp', @@ -47,20 +49,27 @@ cpp_headers = files( 'scanner.hpp', ) +version_h = configure_file( + input: 'version.h.in', + output: 'version.h', + command: [python3, generate_version_header_py, '--library', 'GADATASET', '--version', version, '--input', '@INPUT@', '--output', '@OUTPUT@'], +) + +c_headers += version_h + enums = gnome.mkenums('enums', sources: c_headers, identifier_prefix: 'GADataset', symbol_prefix: 'gadataset', c_template: 'enums.c.template', h_template: 'enums.h.template', - install_dir: join_paths(include_dir, 'arrow-dataset-glib'), + install_dir: join_paths(include_dir, project_name), install_header: true) enums_source = enums[0] enums_header = enums[1] - headers = c_headers + cpp_headers -install_headers(headers, subdir: 'arrow-dataset-glib') +install_headers(headers, subdir: project_name) dependencies = [ arrow_dataset, @@ -72,6 +81,8 @@ libarrow_dataset_glib = library('arrow-dataset-glib', dependencies: dependencies, implicit_include_directories: false, include_directories: base_include_directories, + cpp_args: ['-DGADATASET_COMPILATION'], + c_args: ['-DGADATASET_COMPILATION'], soversion: so_version, version: library_version) arrow_dataset_glib = declare_dependency(link_with: libarrow_dataset_glib, diff --git a/c_glib/arrow-dataset-glib/partitioning.h b/c_glib/arrow-dataset-glib/partitioning.h index ccf476272e429..7671958d88e61 100644 --- a/c_glib/arrow-dataset-glib/partitioning.h +++ b/c_glib/arrow-dataset-glib/partitioning.h @@ -21,6 +21,8 @@ #include +#include + G_BEGIN_DECLS /** @@ -39,6 +41,7 @@ typedef enum { #define GADATASET_TYPE_PARTITIONING_FACTORY_OPTIONS \ (gadataset_partitioning_factory_options_get_type()) +GADATASET_AVAILABLE_IN_11_0 G_DECLARE_DERIVABLE_TYPE(GADatasetPartitioningFactoryOptions, gadataset_partitioning_factory_options, GADATASET, @@ -49,11 +52,12 @@ struct _GADatasetPartitioningFactoryOptionsClass GObjectClass parent_class; }; 
-GARROW_AVAILABLE_IN_11_0 +GADATASET_AVAILABLE_IN_11_0 GADatasetPartitioningFactoryOptions * gadataset_partitioning_factory_options_new(void); #define GADATASET_TYPE_PARTITIONING (gadataset_partitioning_get_type()) +GADATASET_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE( GADatasetPartitioning, gadataset_partitioning, GADATASET, PARTITIONING, GObject) struct _GADatasetPartitioningClass @@ -61,16 +65,17 @@ struct _GADatasetPartitioningClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_6_0 +GADATASET_AVAILABLE_IN_6_0 gchar * gadataset_partitioning_get_type_name(GADatasetPartitioning *partitioning); -GARROW_AVAILABLE_IN_12_0 +GADATASET_AVAILABLE_IN_12_0 GADatasetPartitioning * gadataset_partitioning_create_default(void); #define GADATASET_TYPE_KEY_VALUE_PARTITIONING_OPTIONS \ (gadataset_key_value_partitioning_options_get_type()) +GADATASET_AVAILABLE_IN_11_0 G_DECLARE_DERIVABLE_TYPE(GADatasetKeyValuePartitioningOptions, gadataset_key_value_partitioning_options, GADATASET, @@ -81,12 +86,13 @@ struct _GADatasetKeyValuePartitioningOptionsClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_11_0 +GADATASET_AVAILABLE_IN_11_0 GADatasetKeyValuePartitioningOptions * gadataset_key_value_partitioning_options_new(void); #define GADATASET_TYPE_KEY_VALUE_PARTITIONING \ (gadataset_key_value_partitioning_get_type()) +GADATASET_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE(GADatasetKeyValuePartitioning, gadataset_key_value_partitioning, GADATASET, @@ -99,6 +105,7 @@ struct _GADatasetKeyValuePartitioningClass #define GADATASET_TYPE_DIRECTORY_PARTITIONING \ (gadataset_directory_partitioning_get_type()) +GADATASET_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE(GADatasetDirectoryPartitioning, gadataset_directory_partitioning, GADATASET, @@ -109,7 +116,7 @@ struct _GADatasetDirectoryPartitioningClass GADatasetKeyValuePartitioningClass parent_class; }; -GARROW_AVAILABLE_IN_6_0 +GADATASET_AVAILABLE_IN_6_0 GADatasetDirectoryPartitioning * gadataset_directory_partitioning_new(GArrowSchema 
*schema, GList *dictionaries, @@ -118,6 +125,7 @@ gadataset_directory_partitioning_new(GArrowSchema *schema, #define GADATASET_TYPE_HIVE_PARTITIONING_OPTIONS \ (gadataset_hive_partitioning_options_get_type()) +GADATASET_AVAILABLE_IN_11_0 G_DECLARE_DERIVABLE_TYPE(GADatasetHivePartitioningOptions, gadataset_hive_partitioning_options, GADATASET, @@ -128,11 +136,12 @@ struct _GADatasetHivePartitioningOptionsClass GADatasetKeyValuePartitioningOptionsClass parent_class; }; -GARROW_AVAILABLE_IN_11_0 +GADATASET_AVAILABLE_IN_11_0 GADatasetHivePartitioningOptions * gadataset_hive_partitioning_options_new(void); #define GADATASET_TYPE_HIVE_PARTITIONING (gadataset_hive_partitioning_get_type()) +GADATASET_AVAILABLE_IN_11_0 G_DECLARE_DERIVABLE_TYPE(GADatasetHivePartitioning, gadataset_hive_partitioning, GADATASET, @@ -143,13 +152,13 @@ struct _GADatasetHivePartitioningClass GADatasetKeyValuePartitioningClass parent_class; }; -GARROW_AVAILABLE_IN_11_0 +GADATASET_AVAILABLE_IN_11_0 GADatasetHivePartitioning * gadataset_hive_partitioning_new(GArrowSchema *schema, GList *dictionaries, GADatasetHivePartitioningOptions *options, GError **error); -GARROW_AVAILABLE_IN_11_0 +GADATASET_AVAILABLE_IN_11_0 gchar * gadataset_hive_partitioning_get_null_fallback(GADatasetHivePartitioning *partitioning); diff --git a/c_glib/arrow-dataset-glib/scanner.cpp b/c_glib/arrow-dataset-glib/scanner.cpp index 717532db9220f..28af1f16e5968 100644 --- a/c_glib/arrow-dataset-glib/scanner.cpp +++ b/c_glib/arrow-dataset-glib/scanner.cpp @@ -128,6 +128,28 @@ gadataset_scanner_to_table(GADatasetScanner *scanner, GError **error) } } +/** + * gadataset_scanner_to_record_batch_reader: + * @scanner: A #GADatasetScanner. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A #GArrowRecordBatchReader on success, %NULL on error. 
+ * + * Since: 17.0.0 + */ +GArrowRecordBatchReader * +gadataset_scanner_to_record_batch_reader(GADatasetScanner *scanner, GError **error) +{ + auto arrow_scanner = gadataset_scanner_get_raw(scanner); + auto arrow_reader_result = arrow_scanner->ToRecordBatchReader(); + if (!garrow::check(error, arrow_reader_result, "[scanner][to-record-batch-reader]")) { + return nullptr; + } + auto sources = g_list_prepend(nullptr, scanner); + return garrow_record_batch_reader_new_raw(&(*arrow_reader_result), sources); +} + typedef struct GADatasetScannerBuilderPrivate_ { std::shared_ptr scanner_builder; diff --git a/c_glib/arrow-dataset-glib/scanner.h b/c_glib/arrow-dataset-glib/scanner.h index 3c7432fb268e4..d92eca5ab8420 100644 --- a/c_glib/arrow-dataset-glib/scanner.h +++ b/c_glib/arrow-dataset-glib/scanner.h @@ -21,21 +21,28 @@ #include #include +#include G_BEGIN_DECLS #define GADATASET_TYPE_SCANNER (gadataset_scanner_get_type()) +GADATASET_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GADatasetScanner, gadataset_scanner, GADATASET, SCANNER, GObject) struct _GADatasetScannerClass { GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_5_0 +GADATASET_AVAILABLE_IN_5_0 GArrowTable * gadataset_scanner_to_table(GADatasetScanner *scanner, GError **error); +GADATASET_AVAILABLE_IN_17_0 +GArrowRecordBatchReader * +gadataset_scanner_to_record_batch_reader(GADatasetScanner *scanner, GError **error); + #define GADATASET_TYPE_SCANNER_BUILDER (gadataset_scanner_builder_get_type()) +GADATASET_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GADatasetScannerBuilder, gadataset_scanner_builder, GADATASET, SCANNER_BUILDER, GObject) struct _GADatasetScannerBuilderClass @@ -43,20 +50,20 @@ struct _GADatasetScannerBuilderClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_5_0 +GADATASET_AVAILABLE_IN_5_0 GADatasetScannerBuilder * gadataset_scanner_builder_new(GADatasetDataset *dataset, GError **error); -GARROW_AVAILABLE_IN_6_0 +GADATASET_AVAILABLE_IN_6_0 GADatasetScannerBuilder * 
gadataset_scanner_builder_new_record_batch_reader(GArrowRecordBatchReader *reader); -GARROW_AVAILABLE_IN_6_0 +GADATASET_AVAILABLE_IN_6_0 gboolean gadataset_scanner_builder_set_filter(GADatasetScannerBuilder *builder, GArrowExpression *expression, GError **error); -GARROW_AVAILABLE_IN_5_0 +GADATASET_AVAILABLE_IN_5_0 GADatasetScanner * gadataset_scanner_builder_finish(GADatasetScannerBuilder *builder, GError **error); diff --git a/c_glib/arrow-dataset-glib/version.h.in b/c_glib/arrow-dataset-glib/version.h.in new file mode 100644 index 0000000000000..7e678bda3a875 --- /dev/null +++ b/c_glib/arrow-dataset-glib/version.h.in @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +/** + * SECTION: version + * @section_id: version-macros + * @title: Version related macros + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * Apache Arrow Dataset GLib provides macros that can be used by C pre-processor. + * They are useful to check version related things at compile time. + */ + +/** + * GADATASET_VERSION_MAJOR: + * + * The major version. 
+ * + * Since: 17.0.0 + */ +#define GADATASET_VERSION_MAJOR (@VERSION_MAJOR@) + +/** + * GADATASET_VERSION_MINOR: + * + * The minor version. + * + * Since: 17.0.0 + */ +#define GADATASET_VERSION_MINOR (@VERSION_MINOR@) + +/** + * GADATASET_VERSION_MICRO: + * + * The micro version. + * + * Since: 17.0.0 + */ +#define GADATASET_VERSION_MICRO (@VERSION_MICRO@) + +/** + * GADATASET_VERSION_TAG: + * + * The version tag. Normally, it's an empty string. It's "SNAPSHOT" + * for snapshot version. + * + * Since: 17.0.0 + */ +#define GADATASET_VERSION_TAG "@VERSION_TAG@" + +/** + * GADATASET_VERSION_CHECK: + * @major: A major version to check for. + * @minor: A minor version to check for. + * @micro: A micro version to check for. + * + * You can use this macro in C pre-processor. + * + * Returns: %TRUE if the compile time Apache Arrow GLib version is the + * same as or newer than the passed version, %FALSE otherwise. + * + * Since: 17.0.0 + */ +#define GADATASET_VERSION_CHECK(major, minor, micro) \ + (GADATASET_VERSION_MAJOR > (major) || \ + (GADATASET_VERSION_MAJOR == (major) && \ + GADATASET_VERSION_MINOR > (minor)) || \ + (GADATASET_VERSION_MAJOR == (major) && \ + GADATASET_VERSION_MINOR == (minor) && \ + GADATASET_VERSION_MICRO >= (micro))) + +/** + * GADATASET_DISABLE_DEPRECATION_WARNINGS: + * + * If this macro is defined, no deprecated warnings are produced. + * + * You must define this macro before including the + * arrow-glib/arrow-glib.h header. 
+ * + * Since: 17.0.0 + */ + +#ifdef GADATASET_DISABLE_DEPRECATION_WARNINGS +# define GADATASET_DEPRECATED +# define GADATASET_DEPRECATED_FOR(function) +# define GADATASET_UNAVAILABLE(major, minor) +#else +# define GADATASET_DEPRECATED G_DEPRECATED +# define GADATASET_DEPRECATED_FOR(function) G_DEPRECATED_FOR(function) +# define GADATASET_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) +#endif + +@ENCODED_VERSIONS@ + +/** + * GADATASET_VERSION_MIN_REQUIRED: + * + * You can use this macro for compile time API version check. + * + * This macro value must be one of the predefined version macros such + * as %GADATASET_VERSION_0_10. + * + * If you use any functions that is defined by newer version than + * %GADATASET_VERSION_MIN_REQUIRED, deprecated warnings are produced at + * compile time. + * + * You must define this macro before including the + * arrow-dataset-glib/arrow-dataset-glib.h header. + * + * Since: 17.0.0 + */ +#ifndef GADATASET_VERSION_MIN_REQUIRED +# define GADATASET_VERSION_MIN_REQUIRED GARROW_VERSION_MIN_REQUIRED +#endif + +/** + * GADATASET_VERSION_MAX_ALLOWED: + * + * You can use this macro for compile time API version check. + * + * This macro value must be one of the predefined version macros such + * as %GADATASET_VERSION_0_10. + * + * If you use any functions that is defined by newer version than + * %GADATASET_VERSION_MAX_ALLOWED, deprecated warnings are produced at + * compile time. + * + * You must define this macro before including the + * arrow-dataset-glib/arrow-dataset-glib.h header. 
+ * + * Since: 17.0.0 + */ +#ifndef GADATASET_VERSION_MAX_ALLOWED +# define GADATASET_VERSION_MAX_ALLOWED GARROW_VERSION_MAX_ALLOWED +#endif + +@VISIBILITY_MACROS@ + +@AVAILABILITY_MACROS@ diff --git a/c_glib/arrow-flight-glib/arrow-flight-glib.h b/c_glib/arrow-flight-glib/arrow-flight-glib.h index 6fc8f43d8406e..7e973dd125dd4 100644 --- a/c_glib/arrow-flight-glib/arrow-flight-glib.h +++ b/c_glib/arrow-flight-glib/arrow-flight-glib.h @@ -19,6 +19,8 @@ #pragma once +#include + #include #include #include diff --git a/c_glib/arrow-flight-glib/client.h b/c_glib/arrow-flight-glib/client.h index f67d58371d583..a91bbe55e3c04 100644 --- a/c_glib/arrow-flight-glib/client.h +++ b/c_glib/arrow-flight-glib/client.h @@ -24,6 +24,7 @@ G_BEGIN_DECLS #define GAFLIGHT_TYPE_STREAM_READER (gaflight_stream_reader_get_type()) +GAFLIGHT_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE(GAFlightStreamReader, gaflight_stream_reader, GAFLIGHT, @@ -35,6 +36,7 @@ struct _GAFlightStreamReaderClass }; #define GAFLIGHT_TYPE_CALL_OPTIONS (gaflight_call_options_get_type()) +GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GAFlightCallOptions, gaflight_call_options, GAFLIGHT, CALL_OPTIONS, GObject) struct _GAFlightCallOptionsClass @@ -42,25 +44,26 @@ struct _GAFlightCallOptionsClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 GAFlightCallOptions * gaflight_call_options_new(void); -GARROW_AVAILABLE_IN_9_0 +GAFLIGHT_AVAILABLE_IN_9_0 void gaflight_call_options_add_header(GAFlightCallOptions *options, const gchar *name, const gchar *value); -GARROW_AVAILABLE_IN_9_0 +GAFLIGHT_AVAILABLE_IN_9_0 void gaflight_call_options_clear_headers(GAFlightCallOptions *options); -GARROW_AVAILABLE_IN_9_0 +GAFLIGHT_AVAILABLE_IN_9_0 void gaflight_call_options_foreach_header(GAFlightCallOptions *options, GAFlightHeaderFunc func, gpointer user_data); #define GAFLIGHT_TYPE_CLIENT_OPTIONS (gaflight_client_options_get_type()) +GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( 
GAFlightClientOptions, gaflight_client_options, GAFLIGHT, CLIENT_OPTIONS, GObject) struct _GAFlightClientOptionsClass @@ -68,28 +71,29 @@ struct _GAFlightClientOptionsClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 GAFlightClientOptions * gaflight_client_options_new(void); #define GAFLIGHT_TYPE_CLIENT (gaflight_client_get_type()) +GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GAFlightClient, gaflight_client, GAFLIGHT, CLIENT, GObject) struct _GAFlightClientClass { GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 GAFlightClient * gaflight_client_new(GAFlightLocation *location, GAFlightClientOptions *options, GError **error); -GARROW_AVAILABLE_IN_8_0 +GAFLIGHT_AVAILABLE_IN_8_0 gboolean gaflight_client_close(GAFlightClient *client, GError **error); -GARROW_AVAILABLE_IN_12_0 +GAFLIGHT_AVAILABLE_IN_12_0 gboolean gaflight_client_authenticate_basic_token(GAFlightClient *client, const gchar *user, @@ -99,21 +103,21 @@ gaflight_client_authenticate_basic_token(GAFlightClient *client, gchar **bearer_value, GError **error); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 GList * gaflight_client_list_flights(GAFlightClient *client, GAFlightCriteria *criteria, GAFlightCallOptions *options, GError **error); -GARROW_AVAILABLE_IN_9_0 +GAFLIGHT_AVAILABLE_IN_9_0 GAFlightInfo * gaflight_client_get_flight_info(GAFlightClient *client, GAFlightDescriptor *descriptor, GAFlightCallOptions *options, GError **error); -GARROW_AVAILABLE_IN_6_0 +GAFLIGHT_AVAILABLE_IN_6_0 GAFlightStreamReader * gaflight_client_do_get(GAFlightClient *client, GAFlightTicket *ticket, diff --git a/c_glib/arrow-flight-glib/client.hpp b/c_glib/arrow-flight-glib/client.hpp index 6d7bdcecf3006..185a28e6dc4bd 100644 --- a/c_glib/arrow-flight-glib/client.hpp +++ b/c_glib/arrow-flight-glib/client.hpp @@ -23,17 +23,23 @@ #include +GAFLIGHT_EXTERN GAFlightStreamReader * gaflight_stream_reader_new_raw(arrow::flight::FlightStreamReader 
*flight_reader, gboolean is_owner); +GAFLIGHT_EXTERN arrow::flight::FlightCallOptions * gaflight_call_options_get_raw(GAFlightCallOptions *options); +GAFLIGHT_EXTERN arrow::flight::FlightClientOptions * gaflight_client_options_get_raw(GAFlightClientOptions *options); +GAFLIGHT_EXTERN std::shared_ptr gaflight_client_get_raw(GAFlightClient *client); + +GAFLIGHT_EXTERN GAFlightClient * gaflight_client_new_raw(std::shared_ptr *flight_client); diff --git a/c_glib/arrow-flight-glib/common.h b/c_glib/arrow-flight-glib/common.h index fcb23b1885ea7..b1d89f79c357e 100644 --- a/c_glib/arrow-flight-glib/common.h +++ b/c_glib/arrow-flight-glib/common.h @@ -21,6 +21,8 @@ #include +#include + G_BEGIN_DECLS typedef void (*GAFlightHeaderFunc)(const gchar *name, @@ -28,40 +30,43 @@ typedef void (*GAFlightHeaderFunc)(const gchar *name, gpointer user_data); #define GAFLIGHT_TYPE_CRITERIA (gaflight_criteria_get_type()) +GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GAFlightCriteria, gaflight_criteria, GAFLIGHT, CRITERIA, GObject) struct _GAFlightCriteriaClass { GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 GAFlightCriteria * gaflight_criteria_new(GBytes *expression); #define GAFLIGHT_TYPE_LOCATION (gaflight_location_get_type()) +GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GAFlightLocation, gaflight_location, GAFLIGHT, LOCATION, GObject) struct _GAFlightLocationClass { GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 GAFlightLocation * gaflight_location_new(const gchar *uri, GError **error); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 gchar * gaflight_location_to_string(GAFlightLocation *location); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 gchar * gaflight_location_get_scheme(GAFlightLocation *location); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 gboolean gaflight_location_equal(GAFlightLocation *location, GAFlightLocation *other_location); #define GAFLIGHT_TYPE_DESCRIPTOR 
(gaflight_descriptor_get_type()) +GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GAFlightDescriptor, gaflight_descriptor, GAFLIGHT, DESCRIPTOR, GObject) struct _GAFlightDescriptorClass @@ -69,16 +74,17 @@ struct _GAFlightDescriptorClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 gchar * gaflight_descriptor_to_string(GAFlightDescriptor *descriptor); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 gboolean gaflight_descriptor_equal(GAFlightDescriptor *descriptor, GAFlightDescriptor *other_descriptor); #define GAFLIGHT_TYPE_PATH_DESCRIPTOR (gaflight_path_descriptor_get_type()) +GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GAFlightPathDescriptor, gaflight_path_descriptor, GAFLIGHT, @@ -89,15 +95,16 @@ struct _GAFlightPathDescriptorClass GAFlightDescriptorClass parent_class; }; -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 GAFlightPathDescriptor * gaflight_path_descriptor_new(const gchar **paths, gsize n_paths); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 gchar ** gaflight_path_descriptor_get_paths(GAFlightPathDescriptor *descriptor); #define GAFLIGHT_TYPE_COMMAND_DESCRIPTOR (gaflight_command_descriptor_get_type()) +GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GAFlightCommandDescriptor, gaflight_command_descriptor, GAFLIGHT, @@ -108,56 +115,59 @@ struct _GAFlightCommandDescriptorClass GAFlightDescriptorClass parent_class; }; -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 GAFlightCommandDescriptor * gaflight_command_descriptor_new(const gchar *command); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 gchar * gaflight_command_descriptor_get_command(GAFlightCommandDescriptor *descriptor); #define GAFLIGHT_TYPE_TICKET (gaflight_ticket_get_type()) +GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GAFlightTicket, gaflight_ticket, GAFLIGHT, TICKET, GObject) struct _GAFlightTicketClass { GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 GAFlightTicket * 
gaflight_ticket_new(GBytes *data); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 gboolean gaflight_ticket_equal(GAFlightTicket *ticket, GAFlightTicket *other_ticket); #define GAFLIGHT_TYPE_ENDPOINT (gaflight_endpoint_get_type()) +GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GAFlightEndpoint, gaflight_endpoint, GAFLIGHT, ENDPOINT, GObject) struct _GAFlightEndpointClass { GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 GAFlightEndpoint * gaflight_endpoint_new(GAFlightTicket *ticket, GList *locations); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 gboolean gaflight_endpoint_equal(GAFlightEndpoint *endpoint, GAFlightEndpoint *other_endpoint); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 GList * gaflight_endpoint_get_locations(GAFlightEndpoint *endpoint); #define GAFLIGHT_TYPE_INFO (gaflight_info_get_type()) +GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GAFlightInfo, gaflight_info, GAFLIGHT, INFO, GObject) struct _GAFlightInfoClass { GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 GAFlightInfo * gaflight_info_new(GArrowSchema *schema, GAFlightDescriptor *descriptor, @@ -166,27 +176,28 @@ gaflight_info_new(GArrowSchema *schema, gint64 total_bytes, GError **error); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 gboolean gaflight_info_equal(GAFlightInfo *info, GAFlightInfo *other_info); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 GArrowSchema * gaflight_info_get_schema(GAFlightInfo *info, GArrowReadOptions *options, GError **error); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 GAFlightDescriptor * gaflight_info_get_descriptor(GAFlightInfo *info); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 GList * gaflight_info_get_endpoints(GAFlightInfo *info); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 gint64 gaflight_info_get_total_records(GAFlightInfo *info); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 gint64 
gaflight_info_get_total_bytes(GAFlightInfo *info); #define GAFLIGHT_TYPE_STREAM_CHUNK (gaflight_stream_chunk_get_type()) +GAFLIGHT_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE( GAFlightStreamChunk, gaflight_stream_chunk, GAFLIGHT, STREAM_CHUNK, GObject) struct _GAFlightStreamChunkClass @@ -194,14 +205,15 @@ struct _GAFlightStreamChunkClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_6_0 +GAFLIGHT_AVAILABLE_IN_6_0 GArrowRecordBatch * gaflight_stream_chunk_get_data(GAFlightStreamChunk *chunk); -GARROW_AVAILABLE_IN_6_0 +GAFLIGHT_AVAILABLE_IN_6_0 GArrowBuffer * gaflight_stream_chunk_get_metadata(GAFlightStreamChunk *chunk); #define GAFLIGHT_TYPE_RECORD_BATCH_READER (gaflight_record_batch_reader_get_type()) +GAFLIGHT_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE(GAFlightRecordBatchReader, gaflight_record_batch_reader, GAFLIGHT, @@ -212,11 +224,11 @@ struct _GAFlightRecordBatchReaderClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_6_0 +GAFLIGHT_AVAILABLE_IN_6_0 GAFlightStreamChunk * gaflight_record_batch_reader_read_next(GAFlightRecordBatchReader *reader, GError **error); -GARROW_AVAILABLE_IN_6_0 +GAFLIGHT_AVAILABLE_IN_6_0 GArrowTable * gaflight_record_batch_reader_read_all(GAFlightRecordBatchReader *reader, GError **error); diff --git a/c_glib/arrow-flight-glib/common.hpp b/c_glib/arrow-flight-glib/common.hpp index b748d6f382184..db56fff579baf 100644 --- a/c_glib/arrow-flight-glib/common.hpp +++ b/c_glib/arrow-flight-glib/common.hpp @@ -23,39 +23,59 @@ #include +GAFLIGHT_EXTERN GAFlightCriteria * gaflight_criteria_new_raw(const arrow::flight::Criteria *flight_criteria); + +GAFLIGHT_EXTERN arrow::flight::Criteria * gaflight_criteria_get_raw(GAFlightCriteria *criteria); +GAFLIGHT_EXTERN arrow::flight::Location * gaflight_location_get_raw(GAFlightLocation *location); +GAFLIGHT_EXTERN GAFlightDescriptor * gaflight_descriptor_new_raw(const arrow::flight::FlightDescriptor *flight_descriptor); + +GAFLIGHT_EXTERN arrow::flight::FlightDescriptor * 
gaflight_descriptor_get_raw(GAFlightDescriptor *descriptor); +GAFLIGHT_EXTERN GAFlightTicket * gaflight_ticket_new_raw(const arrow::flight::Ticket *flight_ticket); + +GAFLIGHT_EXTERN arrow::flight::Ticket * gaflight_ticket_get_raw(GAFlightTicket *ticket); +GAFLIGHT_EXTERN GAFlightEndpoint * gaflight_endpoint_new_raw(const arrow::flight::FlightEndpoint *flight_endpoint, GAFlightTicket *ticket); + +GAFLIGHT_EXTERN arrow::flight::FlightEndpoint * gaflight_endpoint_get_raw(GAFlightEndpoint *endpoint); +GAFLIGHT_EXTERN GAFlightInfo * gaflight_info_new_raw(arrow::flight::FlightInfo *flight_info); + +GAFLIGHT_EXTERN arrow::flight::FlightInfo * gaflight_info_get_raw(GAFlightInfo *info); +GAFLIGHT_EXTERN GAFlightStreamChunk * gaflight_stream_chunk_new_raw(arrow::flight::FlightStreamChunk *flight_chunk); + +GAFLIGHT_EXTERN arrow::flight::FlightStreamChunk * gaflight_stream_chunk_get_raw(GAFlightStreamChunk *chunk); +GAFLIGHT_EXTERN arrow::flight::MetadataRecordBatchReader * gaflight_record_batch_reader_get_raw(GAFlightRecordBatchReader *reader); diff --git a/c_glib/arrow-flight-glib/meson.build b/c_glib/arrow-flight-glib/meson.build index 70db7400b124a..c1422e0d10a7d 100644 --- a/c_glib/arrow-flight-glib/meson.build +++ b/c_glib/arrow-flight-glib/meson.build @@ -37,6 +37,14 @@ cpp_headers = files( 'server.hpp', ) +version_h = configure_file( + input: 'version.h.in', + output: 'version.h', + command: [python3, generate_version_header_py, '--library', 'GAFLIGHT', '--version', version, '--input', '@INPUT@', '--output', '@OUTPUT@'], +) + +c_headers += version_h + headers = c_headers + cpp_headers install_headers(headers, subdir: 'arrow-flight-glib') @@ -50,6 +58,7 @@ libarrow_flight_glib = library('arrow-flight-glib', dependencies: dependencies, implicit_include_directories: false, include_directories: base_include_directories, + cpp_args: ['-DGAFLIGHT_COMPILATION'], soversion: so_version, version: library_version) arrow_flight_glib = declare_dependency(link_with: 
libarrow_flight_glib, diff --git a/c_glib/arrow-flight-glib/server.h b/c_glib/arrow-flight-glib/server.h index 89f5a0a596e9e..7e594febb172f 100644 --- a/c_glib/arrow-flight-glib/server.h +++ b/c_glib/arrow-flight-glib/server.h @@ -24,6 +24,7 @@ G_BEGIN_DECLS #define GAFLIGHT_TYPE_DATA_STREAM (gaflight_data_stream_get_type()) +GAFLIGHT_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE( GAFlightDataStream, gaflight_data_stream, GAFLIGHT, DATA_STREAM, GObject) struct _GAFlightDataStreamClass @@ -32,6 +33,7 @@ struct _GAFlightDataStreamClass }; #define GAFLIGHT_TYPE_RECORD_BATCH_STREAM (gaflight_record_batch_stream_get_type()) +GAFLIGHT_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE(GAFlightRecordBatchStream, gaflight_record_batch_stream, GAFLIGHT, @@ -42,12 +44,13 @@ struct _GAFlightRecordBatchStreamClass GAFlightDataStreamClass parent_class; }; -GARROW_AVAILABLE_IN_6_0 +GAFLIGHT_AVAILABLE_IN_6_0 GAFlightRecordBatchStream * gaflight_record_batch_stream_new(GArrowRecordBatchReader *reader, GArrowWriteOptions *options); #define GAFLIGHT_TYPE_MESSAGE_READER (gaflight_message_reader_get_type()) +GAFLIGHT_AVAILABLE_IN_14_0 G_DECLARE_DERIVABLE_TYPE(GAFlightMessageReader, gaflight_message_reader, GAFLIGHT, @@ -58,11 +61,12 @@ struct _GAFlightMessageReaderClass GAFlightRecordBatchReaderClass parent_class; }; -GARROW_AVAILABLE_IN_14_0 +GAFLIGHT_AVAILABLE_IN_14_0 GAFlightDescriptor * gaflight_message_reader_get_descriptor(GAFlightMessageReader *reader); #define GAFLIGHT_TYPE_SERVER_CALL_CONTEXT (gaflight_server_call_context_get_type()) +GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GAFlightServerCallContext, gaflight_server_call_context, GAFLIGHT, @@ -73,13 +77,14 @@ struct _GAFlightServerCallContextClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_14_0 +GAFLIGHT_AVAILABLE_IN_14_0 void gaflight_server_call_context_foreach_incoming_header(GAFlightServerCallContext *context, GAFlightHeaderFunc func, gpointer user_data); #define GAFLIGHT_TYPE_SERVER_AUTH_SENDER 
(gaflight_server_auth_sender_get_type()) +GAFLIGHT_AVAILABLE_IN_12_0 G_DECLARE_DERIVABLE_TYPE(GAFlightServerAuthSender, gaflight_server_auth_sender, GAFLIGHT, @@ -90,13 +95,14 @@ struct _GAFlightServerAuthSenderClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_12_0 +GAFLIGHT_AVAILABLE_IN_12_0 gboolean gaflight_server_auth_sender_write(GAFlightServerAuthSender *sender, GBytes *message, GError **error); #define GAFLIGHT_TYPE_SERVER_AUTH_READER (gaflight_server_auth_reader_get_type()) +GAFLIGHT_AVAILABLE_IN_12_0 G_DECLARE_DERIVABLE_TYPE(GAFlightServerAuthReader, gaflight_server_auth_reader, GAFLIGHT, @@ -107,11 +113,12 @@ struct _GAFlightServerAuthReaderClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_12_0 +GAFLIGHT_AVAILABLE_IN_12_0 GBytes * gaflight_server_auth_reader_read(GAFlightServerAuthReader *reader, GError **error); #define GAFLIGHT_TYPE_SERVER_AUTH_HANDLER (gaflight_server_auth_handler_get_type()) +GAFLIGHT_AVAILABLE_IN_12_0 G_DECLARE_DERIVABLE_TYPE(GAFlightServerAuthHandler, gaflight_server_auth_handler, GAFLIGHT, @@ -124,6 +131,7 @@ struct _GAFlightServerAuthHandlerClass #define GAFLIGHT_TYPE_SERVER_CUSTOM_AUTH_HANDLER \ (gaflight_server_custom_auth_handler_get_type()) +GAFLIGHT_AVAILABLE_IN_12_0 G_DECLARE_DERIVABLE_TYPE(GAFlightServerCustomAuthHandler, gaflight_server_custom_auth_handler, GAFLIGHT, @@ -152,7 +160,7 @@ struct _GAFlightServerCustomAuthHandlerClass GError **error); }; -GARROW_AVAILABLE_IN_12_0 +GAFLIGHT_AVAILABLE_IN_12_0 void gaflight_server_custom_auth_handler_authenticate(GAFlightServerCustomAuthHandler *handler, GAFlightServerCallContext *context, @@ -160,7 +168,7 @@ gaflight_server_custom_auth_handler_authenticate(GAFlightServerCustomAuthHandler GAFlightServerAuthReader *reader, GError **error); -GARROW_AVAILABLE_IN_12_0 +GAFLIGHT_AVAILABLE_IN_12_0 GBytes * gaflight_server_custom_auth_handler_is_valid(GAFlightServerCustomAuthHandler *handler, GAFlightServerCallContext *context, @@ -168,6 +176,7 @@ 
gaflight_server_custom_auth_handler_is_valid(GAFlightServerCustomAuthHandler *ha GError **error); #define GAFLIGHT_TYPE_SERVER_OPTIONS (gaflight_server_options_get_type()) +GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GAFlightServerOptions, gaflight_server_options, GAFLIGHT, SERVER_OPTIONS, GObject) struct _GAFlightServerOptionsClass @@ -175,14 +184,16 @@ struct _GAFlightServerOptionsClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 GAFlightServerOptions * gaflight_server_options_new(GAFlightLocation *location); #define GAFLIGHT_TYPE_SERVABLE (gaflight_servable_get_type()) +GAFLIGHT_AVAILABLE_IN_9_0 G_DECLARE_INTERFACE(GAFlightServable, gaflight_servable, GAFLIGHT, SERVABLE, GObject) #define GAFLIGHT_TYPE_SERVER (gaflight_server_get_type()) +GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GAFlightServer, gaflight_server, GAFLIGHT, SERVER, GObject) /** * GAFlightServerClass: @@ -209,34 +220,34 @@ struct _GAFlightServerClass GError **error); }; -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 gboolean gaflight_server_listen(GAFlightServer *server, GAFlightServerOptions *options, GError **error); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 gint gaflight_server_get_port(GAFlightServer *server); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 gboolean gaflight_server_shutdown(GAFlightServer *server, GError **error); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 gboolean gaflight_server_wait(GAFlightServer *server, GError **error); -GARROW_AVAILABLE_IN_5_0 +GAFLIGHT_AVAILABLE_IN_5_0 GList * gaflight_server_list_flights(GAFlightServer *server, GAFlightServerCallContext *context, GAFlightCriteria *criteria, GError **error); -GARROW_AVAILABLE_IN_9_0 +GAFLIGHT_AVAILABLE_IN_9_0 GAFlightInfo * gaflight_server_get_flight_info(GAFlightServer *server, GAFlightServerCallContext *context, GAFlightDescriptor *request, GError **error); -GARROW_AVAILABLE_IN_6_0 +GAFLIGHT_AVAILABLE_IN_6_0 GAFlightDataStream * 
gaflight_server_do_get(GAFlightServer *server, GAFlightServerCallContext *context, diff --git a/c_glib/arrow-flight-glib/server.hpp b/c_glib/arrow-flight-glib/server.hpp index 70da867d5b0e9..ec4815751c8d8 100644 --- a/c_glib/arrow-flight-glib/server.hpp +++ b/c_glib/arrow-flight-glib/server.hpp @@ -23,34 +23,49 @@ #include +GAFLIGHT_EXTERN arrow::flight::FlightDataStream * gaflight_data_stream_get_raw(GAFlightDataStream *stream); +GAFLIGHT_EXTERN GAFlightMessageReader * gaflight_message_reader_new_raw(arrow::flight::FlightMessageReader *flight_reader, gboolean is_owner); + +GAFLIGHT_EXTERN arrow::flight::FlightMessageReader * gaflight_message_reader_get_raw(GAFlightMessageReader *reader); +GAFLIGHT_EXTERN GAFlightServerCallContext * gaflight_server_call_context_new_raw( const arrow::flight::ServerCallContext *flight_call_context); + +GAFLIGHT_EXTERN const arrow::flight::ServerCallContext * gaflight_server_call_context_get_raw(GAFlightServerCallContext *call_context); +GAFLIGHT_EXTERN GAFlightServerAuthSender * gaflight_server_auth_sender_new_raw(arrow::flight::ServerAuthSender *flight_sender); + +GAFLIGHT_EXTERN arrow::flight::ServerAuthSender * gaflight_server_auth_sender_get_raw(GAFlightServerAuthSender *sender); +GAFLIGHT_EXTERN GAFlightServerAuthReader * gaflight_server_auth_reader_new_raw(arrow::flight::ServerAuthReader *flight_reader); + +GAFLIGHT_EXTERN arrow::flight::ServerAuthReader * gaflight_server_auth_reader_get_raw(GAFlightServerAuthReader *reader); +GAFLIGHT_EXTERN std::shared_ptr gaflight_server_auth_handler_get_raw(GAFlightServerAuthHandler *handler); +GAFLIGHT_EXTERN arrow::flight::FlightServerOptions * gaflight_server_options_get_raw(GAFlightServerOptions *options); @@ -61,6 +76,7 @@ struct _GAFlightServableInterface arrow::flight::FlightServerBase *(*get_raw)(GAFlightServable *servable); }; +GAFLIGHT_EXTERN arrow::flight::FlightServerBase * gaflight_servable_get_raw(GAFlightServable *servable); diff --git a/c_glib/arrow-flight-glib/version.h.in 
b/c_glib/arrow-flight-glib/version.h.in new file mode 100644 index 0000000000000..4a42c7f5aa91e --- /dev/null +++ b/c_glib/arrow-flight-glib/version.h.in @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +/** + * SECTION: version + * @section_id: version-macros + * @title: Version related macros + * @include: arrow-flight-glib/arrow-flight-glib.h + * + * Apache Arrow Flight GLib provides macros that can be used by C pre-processor. + * They are useful to check version related things at compile time. + */ + +/** + * GAFLIGHT_VERSION_MAJOR: + * + * The major version. + * + * Since: 17.0.0 + */ +#define GAFLIGHT_VERSION_MAJOR (@VERSION_MAJOR@) + +/** + * GAFLIGHT_VERSION_MINOR: + * + * The minor version. + * + * Since: 17.0.0 + */ +#define GAFLIGHT_VERSION_MINOR (@VERSION_MINOR@) + +/** + * GAFLIGHT_VERSION_MICRO: + * + * The micro version. + * + * Since: 17.0.0 + */ +#define GAFLIGHT_VERSION_MICRO (@VERSION_MICRO@) + +/** + * GAFLIGHT_VERSION_TAG: + * + * The version tag. Normally, it's an empty string. It's "SNAPSHOT" + * for snapshot version. 
+ * + * Since: 17.0.0 + */ +#define GAFLIGHT_VERSION_TAG "@VERSION_TAG@" + +/** + * GAFLIGHT_VERSION_CHECK: + * @major: A major version to check for. + * @minor: A minor version to check for. + * @micro: A micro version to check for. + * + * You can use this macro in the C pre-processor. + * + * Returns: %TRUE if the compile time Apache Arrow Flight GLib version is + * the same as or newer than the passed version, %FALSE otherwise. + * + * Since: 17.0.0 + */ +#define GAFLIGHT_VERSION_CHECK(major, minor, micro) \ + (GAFLIGHT_VERSION_MAJOR > (major) || \ + (GAFLIGHT_VERSION_MAJOR == (major) && \ + GAFLIGHT_VERSION_MINOR > (minor)) || \ + (GAFLIGHT_VERSION_MAJOR == (major) && \ + GAFLIGHT_VERSION_MINOR == (minor) && \ + GAFLIGHT_VERSION_MICRO >= (micro))) + +/** + * GAFLIGHT_DISABLE_DEPRECATION_WARNINGS: + * + * If this macro is defined, no deprecation warnings are produced. + * + * You must define this macro before including the + * arrow-flight-glib/arrow-flight-glib.h header. + * + * Since: 17.0.0 + */ + +#ifdef GAFLIGHT_DISABLE_DEPRECATION_WARNINGS +# define GAFLIGHT_DEPRECATED +# define GAFLIGHT_DEPRECATED_FOR(function) +# define GAFLIGHT_UNAVAILABLE(major, minor) +#else +# define GAFLIGHT_DEPRECATED G_DEPRECATED +# define GAFLIGHT_DEPRECATED_FOR(function) G_DEPRECATED_FOR(function) +# define GAFLIGHT_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) +#endif + +@ENCODED_VERSIONS@ + +/** + * GAFLIGHT_VERSION_MIN_REQUIRED: + * + * You can use this macro for compile time API version check. + * + * This macro value must be one of the predefined version macros such + * as %GAFLIGHT_VERSION_0_10. + * + * If you use any function that is defined by a newer version than + * %GAFLIGHT_VERSION_MIN_REQUIRED, deprecation warnings are produced at + * compile time. + * + * You must define this macro before including the + * arrow-flight-glib/arrow-flight-glib.h header.
+ * + * Since: 17.0.0 + */ +#ifndef GAFLIGHT_VERSION_MIN_REQUIRED +# define GAFLIGHT_VERSION_MIN_REQUIRED GARROW_VERSION_MIN_REQUIRED +#endif + +/** + * GAFLIGHT_VERSION_MAX_ALLOWED: + * + * You can use this macro for compile time API version check. + * + * This macro value must be one of the predefined version macros such + * as %GAFLIGHT_VERSION_0_10. + * + * If you use any functions that is defined by newer version than + * %GAFLIGHT_VERSION_MAX_ALLOWED, deprecated warnings are produced at + * compile time. + * + * You must define this macro before including the + * arrow-flight-glib/arrow-flight-glib.h header. + * + * Since: 17.0.0 + */ +#ifndef GAFLIGHT_VERSION_MAX_ALLOWED +# define GAFLIGHT_VERSION_MAX_ALLOWED GARROW_VERSION_MAX_ALLOWED +#endif + +@VISIBILITY_MACROS@ + +@AVAILABILITY_MACROS@ diff --git a/c_glib/arrow-flight-sql-glib/arrow-flight-sql-glib.h b/c_glib/arrow-flight-sql-glib/arrow-flight-sql-glib.h index 8ebe39aee57a8..94e72d06f2b47 100644 --- a/c_glib/arrow-flight-sql-glib/arrow-flight-sql-glib.h +++ b/c_glib/arrow-flight-sql-glib/arrow-flight-sql-glib.h @@ -19,5 +19,7 @@ #pragma once +#include + #include #include diff --git a/c_glib/arrow-flight-sql-glib/client.h b/c_glib/arrow-flight-sql-glib/client.h index 9a5a8987f7195..b9e9baf41a59f 100644 --- a/c_glib/arrow-flight-sql-glib/client.h +++ b/c_glib/arrow-flight-sql-glib/client.h @@ -21,9 +21,12 @@ #include +#include + G_BEGIN_DECLS #define GAFLIGHTSQL_TYPE_PREPARED_STATEMENT (gaflightsql_prepared_statement_get_type()) +GAFLIGHTSQL_AVAILABLE_IN_14_0 G_DECLARE_DERIVABLE_TYPE(GAFlightSQLPreparedStatement, gaflightsql_prepared_statement, GAFLIGHTSQL, @@ -34,52 +37,53 @@ struct _GAFlightSQLPreparedStatementClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 GAFlightInfo * gaflightsql_prepared_statement_execute(GAFlightSQLPreparedStatement *statement, GAFlightCallOptions *options, GError **error); -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 gint64 
gaflightsql_prepared_statement_execute_update(GAFlightSQLPreparedStatement *statement, GAFlightCallOptions *options, GError **error); -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 GArrowSchema * gaflightsql_prepared_statement_get_parameter_schema( GAFlightSQLPreparedStatement *statement); -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 GArrowSchema * gaflightsql_prepared_statement_get_dataset_schema( GAFlightSQLPreparedStatement *statement); -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 gboolean gaflightsql_prepared_statement_set_record_batch(GAFlightSQLPreparedStatement *statement, GArrowRecordBatch *record_batch, GError **error); -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 gboolean gaflightsql_prepared_statement_set_record_batch_reader( GAFlightSQLPreparedStatement *statement, GArrowRecordBatchReader *reader, GError **error); -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 gboolean gaflightsql_prepared_statement_close(GAFlightSQLPreparedStatement *statement, GAFlightCallOptions *options, GError **error); -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 gboolean gaflightsql_prepared_statement_is_closed(GAFlightSQLPreparedStatement *statement); #define GAFLIGHTSQL_TYPE_CLIENT (gaflightsql_client_get_type()) +GAFLIGHTSQL_AVAILABLE_IN_9_0 G_DECLARE_DERIVABLE_TYPE( GAFlightSQLClient, gaflightsql_client, GAFLIGHTSQL, CLIENT, GObject) struct _GAFlightSQLClientClass @@ -87,32 +91,32 @@ struct _GAFlightSQLClientClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_9_0 +GAFLIGHTSQL_AVAILABLE_IN_9_0 GAFlightSQLClient * gaflightsql_client_new(GAFlightClient *client); -GARROW_AVAILABLE_IN_9_0 +GAFLIGHTSQL_AVAILABLE_IN_9_0 GAFlightInfo * gaflightsql_client_execute(GAFlightSQLClient *client, const gchar *query, GAFlightCallOptions *options, GError **error); -GARROW_AVAILABLE_IN_13_0 +GAFLIGHTSQL_AVAILABLE_IN_13_0 gint64 gaflightsql_client_execute_update(GAFlightSQLClient *client, const gchar *query, 
GAFlightCallOptions *options, GError **error); -GARROW_AVAILABLE_IN_9_0 +GAFLIGHTSQL_AVAILABLE_IN_9_0 GAFlightStreamReader * gaflightsql_client_do_get(GAFlightSQLClient *client, GAFlightTicket *ticket, GAFlightCallOptions *options, GError **error); -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 GAFlightSQLPreparedStatement * gaflightsql_client_prepare(GAFlightSQLClient *client, const gchar *query, diff --git a/c_glib/arrow-flight-sql-glib/meson.build b/c_glib/arrow-flight-sql-glib/meson.build index e7abc605bb819..d588ba4917c76 100644 --- a/c_glib/arrow-flight-sql-glib/meson.build +++ b/c_glib/arrow-flight-sql-glib/meson.build @@ -34,6 +34,14 @@ cpp_headers = files( 'server.hpp', ) +version_h = configure_file( + input: 'version.h.in', + output: 'version.h', + command: [python3, generate_version_header_py, '--library', 'GAFLIGHTSQL', '--version', version, '--input', '@INPUT@', '--output', '@OUTPUT@'], +) + +c_headers += version_h + headers = c_headers + cpp_headers install_headers(headers, subdir: 'arrow-flight-sql-glib') @@ -47,6 +55,7 @@ libarrow_flight_sql_glib = library('arrow-flight-sql-glib', dependencies: dependencies, implicit_include_directories: false, include_directories: base_include_directories, + cpp_args: ['-DGAFLIGHTSQL_COMPILATION'], soversion: so_version, version: library_version) arrow_flight_sql_glib = \ diff --git a/c_glib/arrow-flight-sql-glib/server.h b/c_glib/arrow-flight-sql-glib/server.h index d6fd7e4d10394..8cf0aace77644 100644 --- a/c_glib/arrow-flight-sql-glib/server.h +++ b/c_glib/arrow-flight-sql-glib/server.h @@ -21,9 +21,12 @@ #include +#include + G_BEGIN_DECLS #define GAFLIGHTSQL_TYPE_COMMAND (gaflightsql_command_get_type()) +GAFLIGHTSQL_AVAILABLE_IN_9_0 G_DECLARE_DERIVABLE_TYPE( GAFlightSQLCommand, gaflightsql_command, GAFLIGHTSQL, COMMAND, GObject) struct _GAFlightSQLCommandClass @@ -32,6 +35,7 @@ struct _GAFlightSQLCommandClass }; #define GAFLIGHTSQL_TYPE_STATEMENT_QUERY (gaflightsql_statement_query_get_type()) 
+GAFLIGHTSQL_AVAILABLE_IN_9_0 G_DECLARE_DERIVABLE_TYPE(GAFlightSQLStatementQuery, gaflightsql_statement_query, GAFLIGHTSQL, @@ -42,11 +46,12 @@ struct _GAFlightSQLStatementQueryClass GAFlightSQLCommandClass parent_class; }; -GARROW_AVAILABLE_IN_9_0 +GAFLIGHTSQL_AVAILABLE_IN_9_0 const gchar * gaflightsql_statement_query_get_query(GAFlightSQLStatementQuery *command); #define GAFLIGHTSQL_TYPE_STATEMENT_UPDATE (gaflightsql_statement_update_get_type()) +GAFLIGHTSQL_AVAILABLE_IN_13_0 G_DECLARE_DERIVABLE_TYPE(GAFlightSQLStatementUpdate, gaflightsql_statement_update, GAFLIGHTSQL, @@ -57,12 +62,13 @@ struct _GAFlightSQLStatementUpdateClass GAFlightSQLCommandClass parent_class; }; -GARROW_AVAILABLE_IN_13_0 +GAFLIGHTSQL_AVAILABLE_IN_13_0 const gchar * gaflightsql_statement_update_get_query(GAFlightSQLStatementUpdate *command); #define GAFLIGHTSQL_TYPE_PREPARED_STATEMENT_UPDATE \ (gaflightsql_prepared_statement_update_get_type()) +GAFLIGHTSQL_AVAILABLE_IN_14_0 G_DECLARE_DERIVABLE_TYPE(GAFlightSQLPreparedStatementUpdate, gaflightsql_prepared_statement_update, GAFLIGHTSQL, @@ -73,13 +79,14 @@ struct _GAFlightSQLPreparedStatementUpdateClass GAFlightSQLCommandClass parent_class; }; -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 GBytes * gaflightsql_prepared_statement_update_get_handle( GAFlightSQLPreparedStatementUpdate *command); #define GAFLIGHTSQL_TYPE_STATEMENT_QUERY_TICKET \ (gaflightsql_statement_query_ticket_get_type()) +GAFLIGHTSQL_AVAILABLE_IN_9_0 G_DECLARE_DERIVABLE_TYPE(GAFlightSQLStatementQueryTicket, gaflightsql_statement_query_ticket, GAFLIGHTSQL, @@ -90,15 +97,16 @@ struct _GAFlightSQLStatementQueryTicketClass GAFlightSQLCommandClass parent_class; }; -GARROW_AVAILABLE_IN_9_0 +GAFLIGHTSQL_AVAILABLE_IN_9_0 GBytes * gaflightsql_statement_query_ticket_generate_handle(const gchar *query, GError **error); -GARROW_AVAILABLE_IN_9_0 +GAFLIGHTSQL_AVAILABLE_IN_9_0 GBytes * gaflightsql_statement_query_ticket_get_handle(GAFlightSQLStatementQueryTicket *command); 
#define GAFLIGHTSQL_TYPE_CREATE_PREPARED_STATEMENT_REQUEST \ (gaflightsql_create_prepared_statement_request_get_type()) +GAFLIGHTSQL_AVAILABLE_IN_14_0 G_DECLARE_DERIVABLE_TYPE(GAFlightSQLCreatePreparedStatementRequest, gaflightsql_create_prepared_statement_request, GAFLIGHTSQL, @@ -109,18 +117,19 @@ struct _GAFlightSQLCreatePreparedStatementRequestClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 const gchar * gaflightsql_create_prepared_statement_request_get_query( GAFlightSQLCreatePreparedStatementRequest *request); -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 const gchar * gaflightsql_create_prepared_statement_request_get_transaction_id( GAFlightSQLCreatePreparedStatementRequest *request); #define GAFLIGHTSQL_TYPE_CREATE_PREPARED_STATEMENT_RESULT \ (gaflightsql_create_prepared_statement_result_get_type()) +GAFLIGHTSQL_AVAILABLE_IN_14_0 G_DECLARE_DERIVABLE_TYPE(GAFlightSQLCreatePreparedStatementResult, gaflightsql_create_prepared_statement_result, GAFLIGHTSQL, @@ -131,36 +140,37 @@ struct _GAFlightSQLCreatePreparedStatementResultClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 GAFlightSQLCreatePreparedStatementResult * gaflightsql_create_prepared_statement_result_new(void); -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 void gaflightsql_create_prepared_statement_result_set_dataset_schema( GAFlightSQLCreatePreparedStatementResult *result, GArrowSchema *schema); -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 GArrowSchema * gaflightsql_create_prepared_statement_result_get_dataset_schema( GAFlightSQLCreatePreparedStatementResult *result); -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 void gaflightsql_create_prepared_statement_result_set_parameter_schema( GAFlightSQLCreatePreparedStatementResult *result, GArrowSchema *schema); -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 GArrowSchema * 
gaflightsql_create_prepared_statement_result_get_parameter_schema( GAFlightSQLCreatePreparedStatementResult *result); -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 void gaflightsql_create_prepared_statement_result_set_handle( GAFlightSQLCreatePreparedStatementResult *result, GBytes *handle); -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 GBytes * gaflightsql_create_prepared_statement_result_get_handle( GAFlightSQLCreatePreparedStatementResult *result); #define GAFLIGHTSQL_TYPE_CLOSE_PREPARED_STATEMENT_REQUEST \ (gaflightsql_close_prepared_statement_request_get_type()) +GAFLIGHTSQL_AVAILABLE_IN_14_0 G_DECLARE_DERIVABLE_TYPE(GAFlightSQLClosePreparedStatementRequest, gaflightsql_close_prepared_statement_request, GAFLIGHTSQL, @@ -171,12 +181,13 @@ struct _GAFlightSQLClosePreparedStatementRequestClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_14_0 +GAFLIGHTSQL_AVAILABLE_IN_14_0 GBytes * gaflightsql_close_prepared_statement_request_get_handle( GAFlightSQLClosePreparedStatementRequest *request); #define GAFLIGHTSQL_TYPE_SERVER (gaflightsql_server_get_type()) +GAFLIGHTSQL_AVAILABLE_IN_9_0 G_DECLARE_DERIVABLE_TYPE( GAFlightSQLServer, gaflightsql_server, GAFLIGHTSQL, SERVER, GAFlightServer) /** @@ -231,27 +242,27 @@ struct _GAFlightSQLServerClass GError **error); }; -GARROW_AVAILABLE_IN_9_0 +GAFLIGHTSQL_AVAILABLE_IN_9_0 GAFlightInfo * gaflightsql_server_get_flight_info_statement(GAFlightSQLServer *server, GAFlightServerCallContext *context, GAFlightSQLStatementQuery *command, GAFlightDescriptor *descriptor, GError **error); -GARROW_AVAILABLE_IN_9_0 +GAFLIGHTSQL_AVAILABLE_IN_9_0 GAFlightDataStream * gaflightsql_server_do_get_statement(GAFlightSQLServer *server, GAFlightServerCallContext *context, GAFlightSQLStatementQueryTicket *ticket, GError **error); -GARROW_AVAILABLE_IN_13_0 +GAFLIGHTSQL_AVAILABLE_IN_13_0 gint64 gaflightsql_server_do_put_command_statement_update(GAFlightSQLServer *server, GAFlightServerCallContext *context, 
GAFlightSQLStatementUpdate *command, GError **error); -/* We can restore this after we bump version to 14.0.0-SNAPSHOT. */ -/* GARROW_AVAILABLE_IN_14_0 */ + +GAFLIGHTSQL_AVAILABLE_IN_14_0 gint64 gaflightsql_server_do_put_prepared_statement_update( GAFlightSQLServer *server, @@ -259,16 +270,16 @@ gaflightsql_server_do_put_prepared_statement_update( GAFlightSQLPreparedStatementUpdate *command, GAFlightMessageReader *reader, GError **error); -/* We can restore this after we bump version to 14.0.0-SNAPSHOT. */ -/* GARROW_AVAILABLE_IN_14_0 */ + +GAFLIGHTSQL_AVAILABLE_IN_14_0 GAFlightSQLCreatePreparedStatementResult * gaflightsql_server_create_prepared_statement( GAFlightSQLServer *server, GAFlightServerCallContext *context, GAFlightSQLCreatePreparedStatementRequest *request, GError **error); -/* We can restore this after we bump version to 14.0.0-SNAPSHOT. */ -/* GARROW_AVAILABLE_IN_14_0 */ + +GAFLIGHTSQL_AVAILABLE_IN_14_0 void gaflightsql_server_close_prepared_statement( GAFlightSQLServer *server, diff --git a/c_glib/arrow-flight-sql-glib/version.h.in b/c_glib/arrow-flight-sql-glib/version.h.in new file mode 100644 index 0000000000000..e4373109b9008 --- /dev/null +++ b/c_glib/arrow-flight-sql-glib/version.h.in @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +/** + * SECTION: version + * @section_id: version-macros + * @title: Version related macros + * @include: arrow-flight-sql-glib/arrow-flight-sql-glib.h + * + * Apache Arrow Flight SQL GLib provides macros that can be used by the C pre-processor. + * They are useful to check version related things at compile time. + */ + +/** + * GAFLIGHTSQL_VERSION_MAJOR: + * + * The major version. + * + * Since: 17.0.0 + */ +#define GAFLIGHTSQL_VERSION_MAJOR (@VERSION_MAJOR@) + +/** + * GAFLIGHTSQL_VERSION_MINOR: + * + * The minor version. + * + * Since: 17.0.0 + */ +#define GAFLIGHTSQL_VERSION_MINOR (@VERSION_MINOR@) + +/** + * GAFLIGHTSQL_VERSION_MICRO: + * + * The micro version. + * + * Since: 17.0.0 + */ +#define GAFLIGHTSQL_VERSION_MICRO (@VERSION_MICRO@) + +/** + * GAFLIGHTSQL_VERSION_TAG: + * + * The version tag. Normally, it's an empty string. It's "SNAPSHOT" + * for snapshot version. + * + * Since: 17.0.0 + */ +#define GAFLIGHTSQL_VERSION_TAG "@VERSION_TAG@" + +/** + * GAFLIGHTSQL_VERSION_CHECK: + * @major: A major version to check for. + * @minor: A minor version to check for. + * @micro: A micro version to check for. + * + * You can use this macro in the C pre-processor. + * + * Returns: %TRUE if the compile time Apache Arrow Flight SQL GLib version is + * the same as or newer than the passed version, %FALSE otherwise. + * + * Since: 17.0.0 + */ +#define GAFLIGHTSQL_VERSION_CHECK(major, minor, micro) \ + (GAFLIGHTSQL_VERSION_MAJOR > (major) || \ + (GAFLIGHTSQL_VERSION_MAJOR == (major) && \ + GAFLIGHTSQL_VERSION_MINOR > (minor)) || \ + (GAFLIGHTSQL_VERSION_MAJOR == (major) && \ + GAFLIGHTSQL_VERSION_MINOR == (minor) && \ + GAFLIGHTSQL_VERSION_MICRO >= (micro))) + +/** + * GAFLIGHTSQL_DISABLE_DEPRECATION_WARNINGS: + * + * If this macro is defined, no deprecation warnings are produced.
+ * + * You must define this macro before including the + * arrow-flight-sql-glib/arrow-flight-sql-glib.h header. + * + * Since: 17.0.0 + */ + +#ifdef GAFLIGHTSQL_DISABLE_DEPRECATION_WARNINGS +# define GAFLIGHTSQL_DEPRECATED +# define GAFLIGHTSQL_DEPRECATED_FOR(function) +# define GAFLIGHTSQL_UNAVAILABLE(major, minor) +#else +# define GAFLIGHTSQL_DEPRECATED G_DEPRECATED +# define GAFLIGHTSQL_DEPRECATED_FOR(function) G_DEPRECATED_FOR(function) +# define GAFLIGHTSQL_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) +#endif + +@ENCODED_VERSIONS@ + +/** + * GAFLIGHTSQL_VERSION_MIN_REQUIRED: + * + * You can use this macro for compile time API version check. + * + * This macro value must be one of the predefined version macros such + * as %GAFLIGHTSQL_VERSION_0_10. + * + * If you use any function that is defined by a newer version than + * %GAFLIGHTSQL_VERSION_MIN_REQUIRED, deprecation warnings are produced at + * compile time. + * + * You must define this macro before including the + * arrow-flight-sql-glib/arrow-flight-sql-glib.h header. + * + * Since: 17.0.0 + */ +#ifndef GAFLIGHTSQL_VERSION_MIN_REQUIRED +# define GAFLIGHTSQL_VERSION_MIN_REQUIRED GARROW_VERSION_MIN_REQUIRED +#endif + +/** + * GAFLIGHTSQL_VERSION_MAX_ALLOWED: + * + * You can use this macro for compile time API version check. + * + * This macro value must be one of the predefined version macros such + * as %GAFLIGHTSQL_VERSION_0_10. + * + * If you use any function that is defined by a newer version than + * %GAFLIGHTSQL_VERSION_MAX_ALLOWED, deprecation warnings are produced at + * compile time. + * + * You must define this macro before including the + * arrow-flight-sql-glib/arrow-flight-sql-glib.h header.
+ * + * Since: 17.0.0 + */ +#ifndef GAFLIGHTSQL_VERSION_MAX_ALLOWED +# define GAFLIGHTSQL_VERSION_MAX_ALLOWED GARROW_VERSION_MAX_ALLOWED +#endif + +@VISIBILITY_MACROS@ + +@AVAILABILITY_MACROS@ diff --git a/c_glib/arrow-glib/array-builder.cpp b/c_glib/arrow-glib/array-builder.cpp index b498ecb51cedb..9b7c608ca8a5b 100644 --- a/c_glib/arrow-glib/array-builder.cpp +++ b/c_glib/arrow-glib/array-builder.cpp @@ -231,8 +231,8 @@ garrow_array_builder_append_values(GArrowArrayBuilder *builder, if (n_remains > 0) { ++n_loops; } + std::vector data(value_size * chunk_size); for (gint64 i = 0; i < n_loops; ++i) { - uint8_t data[value_size * chunk_size]; uint8_t *valid_bytes = nullptr; uint8_t valid_bytes_buffer[chunk_size]; if (is_valids_length > 0) { @@ -255,7 +255,7 @@ garrow_array_builder_append_values(GArrowArrayBuilder *builder, value = values[offset + j]; } if (value) { - get_value_function(data + (value_size * j), value, value_size); + get_value_function(data.data() + (value_size * j), value, value_size); } else { is_valid = false; if (!valid_bytes) { @@ -267,7 +267,7 @@ garrow_array_builder_append_values(GArrowArrayBuilder *builder, valid_bytes_buffer[j] = is_valid; } } - auto status = arrow_builder->AppendValues(data, n_values, valid_bytes); + auto status = arrow_builder->AppendValues(data.data(), n_values, valid_bytes); if (!garrow_error_check(error, status, context)) { return FALSE; } @@ -1035,13 +1035,13 @@ garrow_boolean_array_builder_append_values(GArrowBooleanArrayBuilder *builder, gint64 is_valids_length, GError **error) { - guint8 arrow_values[values_length]; + std::vector arrow_values(values_length); for (gint64 i = 0; i < values_length; ++i) { arrow_values[i] = values[i]; } return garrow_array_builder_append_values( GARROW_ARRAY_BUILDER(builder), - arrow_values, + arrow_values.data(), values_length, is_valids, is_valids_length, diff --git a/c_glib/arrow-glib/array-builder.h b/c_glib/arrow-glib/array-builder.h index 8a1385b9b8c1b..6a0d0154833a7 100644 --- 
a/c_glib/arrow-glib/array-builder.h +++ b/c_glib/arrow-glib/array-builder.h @@ -26,6 +26,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_ARRAY_BUILDER (garrow_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowArrayBuilder, garrow_array_builder, GARROW, ARRAY_BUILDER, GObject) struct _GArrowArrayBuilderClass @@ -33,11 +34,15 @@ struct _GArrowArrayBuilderClass GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDataType * garrow_array_builder_get_value_data_type(GArrowArrayBuilder *builder); + +GARROW_AVAILABLE_IN_ALL GArrowType garrow_array_builder_get_value_type(GArrowArrayBuilder *builder); +GARROW_AVAILABLE_IN_ALL GArrowArray * garrow_array_builder_finish(GArrowArrayBuilder *builder, GError **error); @@ -86,6 +91,7 @@ garrow_array_builder_append_empty_values(GArrowArrayBuilder *builder, GError **error); #define GARROW_TYPE_NULL_ARRAY_BUILDER (garrow_null_array_builder_get_type()) +GARROW_AVAILABLE_IN_0_13 G_DECLARE_DERIVABLE_TYPE(GArrowNullArrayBuilder, garrow_null_array_builder, GARROW, @@ -114,6 +120,7 @@ garrow_null_array_builder_append_nulls(GArrowNullArrayBuilder *builder, #endif #define GARROW_TYPE_BOOLEAN_ARRAY_BUILDER (garrow_boolean_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowBooleanArrayBuilder, garrow_boolean_array_builder, GARROW, @@ -124,10 +131,12 @@ struct _GArrowBooleanArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowBooleanArrayBuilder * garrow_boolean_array_builder_new(void); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_boolean_array_builder_append_value) gboolean garrow_boolean_array_builder_append(GArrowBooleanArrayBuilder *builder, @@ -139,6 +148,8 @@ gboolean garrow_boolean_array_builder_append_value(GArrowBooleanArrayBuilder *builder, gboolean value, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_boolean_array_builder_append_values(GArrowBooleanArrayBuilder *builder, 
const gboolean *values, @@ -159,6 +170,7 @@ garrow_boolean_array_builder_append_nulls(GArrowBooleanArrayBuilder *builder, #endif #define GARROW_TYPE_INT_ARRAY_BUILDER (garrow_int_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowIntArrayBuilder, garrow_int_array_builder, GARROW, @@ -169,10 +181,12 @@ struct _GArrowIntArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowIntArrayBuilder * garrow_int_array_builder_new(void); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_int_array_builder_append_value) gboolean garrow_int_array_builder_append(GArrowIntArrayBuilder *builder, @@ -184,6 +198,7 @@ gboolean garrow_int_array_builder_append_value(GArrowIntArrayBuilder *builder, gint64 value, GError **error); +GARROW_AVAILABLE_IN_ALL gboolean garrow_int_array_builder_append_values(GArrowIntArrayBuilder *builder, const gint64 *values, @@ -192,9 +207,12 @@ garrow_int_array_builder_append_values(GArrowIntArrayBuilder *builder, gint64 is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_int_array_builder_append_null(GArrowIntArrayBuilder *builder, GError **error); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) gboolean garrow_int_array_builder_append_nulls(GArrowIntArrayBuilder *builder, @@ -203,6 +221,7 @@ garrow_int_array_builder_append_nulls(GArrowIntArrayBuilder *builder, #endif #define GARROW_TYPE_UINT_ARRAY_BUILDER (garrow_uint_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowUIntArrayBuilder, garrow_uint_array_builder, GARROW, @@ -213,10 +232,12 @@ struct _GArrowUIntArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowUIntArrayBuilder * garrow_uint_array_builder_new(void); #ifndef GARROW_DISABLE_DEPRECATED 
+GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_uint_array_builder_append_value) gboolean garrow_uint_array_builder_append(GArrowUIntArrayBuilder *builder, @@ -228,6 +249,8 @@ gboolean garrow_uint_array_builder_append_value(GArrowUIntArrayBuilder *builder, guint64 value, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_uint_array_builder_append_values(GArrowUIntArrayBuilder *builder, const guint64 *values, @@ -236,9 +259,12 @@ garrow_uint_array_builder_append_values(GArrowUIntArrayBuilder *builder, gint64 is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_uint_array_builder_append_null(GArrowUIntArrayBuilder *builder, GError **error); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) gboolean garrow_uint_array_builder_append_nulls(GArrowUIntArrayBuilder *builder, @@ -247,6 +273,7 @@ garrow_uint_array_builder_append_nulls(GArrowUIntArrayBuilder *builder, #endif #define GARROW_TYPE_INT8_ARRAY_BUILDER (garrow_int8_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowInt8ArrayBuilder, garrow_int8_array_builder, GARROW, @@ -257,10 +284,12 @@ struct _GArrowInt8ArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowInt8ArrayBuilder * garrow_int8_array_builder_new(void); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_int8_array_builder_append_value) gboolean garrow_int8_array_builder_append(GArrowInt8ArrayBuilder *builder, @@ -272,6 +301,8 @@ gboolean garrow_int8_array_builder_append_value(GArrowInt8ArrayBuilder *builder, gint8 value, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_int8_array_builder_append_values(GArrowInt8ArrayBuilder *builder, const gint8 *values, @@ -280,9 +311,12 @@ garrow_int8_array_builder_append_values(GArrowInt8ArrayBuilder *builder, gint64 
is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_int8_array_builder_append_null(GArrowInt8ArrayBuilder *builder, GError **error); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) gboolean garrow_int8_array_builder_append_nulls(GArrowInt8ArrayBuilder *builder, @@ -291,6 +325,7 @@ garrow_int8_array_builder_append_nulls(GArrowInt8ArrayBuilder *builder, #endif #define GARROW_TYPE_UINT8_ARRAY_BUILDER (garrow_uint8_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowUInt8ArrayBuilder, garrow_uint8_array_builder, GARROW, @@ -301,10 +336,12 @@ struct _GArrowUInt8ArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowUInt8ArrayBuilder * garrow_uint8_array_builder_new(void); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_uint8_array_builder_append_value) gboolean garrow_uint8_array_builder_append(GArrowUInt8ArrayBuilder *builder, @@ -316,6 +353,8 @@ gboolean garrow_uint8_array_builder_append_value(GArrowUInt8ArrayBuilder *builder, guint8 value, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_uint8_array_builder_append_values(GArrowUInt8ArrayBuilder *builder, const guint8 *values, @@ -324,9 +363,12 @@ garrow_uint8_array_builder_append_values(GArrowUInt8ArrayBuilder *builder, gint64 is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_uint8_array_builder_append_null(GArrowUInt8ArrayBuilder *builder, GError **error); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) gboolean garrow_uint8_array_builder_append_nulls(GArrowUInt8ArrayBuilder *builder, @@ -335,6 +377,7 @@ garrow_uint8_array_builder_append_nulls(GArrowUInt8ArrayBuilder 
*builder, #endif #define GARROW_TYPE_INT16_ARRAY_BUILDER (garrow_int16_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowInt16ArrayBuilder, garrow_int16_array_builder, GARROW, @@ -345,6 +388,7 @@ struct _GArrowInt16ArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowInt16ArrayBuilder * garrow_int16_array_builder_new(void); @@ -360,6 +404,7 @@ gboolean garrow_int16_array_builder_append_value(GArrowInt16ArrayBuilder *builder, gint16 value, GError **error); +GARROW_AVAILABLE_IN_ALL gboolean garrow_int16_array_builder_append_values(GArrowInt16ArrayBuilder *builder, const gint16 *values, @@ -368,9 +413,12 @@ garrow_int16_array_builder_append_values(GArrowInt16ArrayBuilder *builder, gint64 is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_int16_array_builder_append_null(GArrowInt16ArrayBuilder *builder, GError **error); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) gboolean garrow_int16_array_builder_append_nulls(GArrowInt16ArrayBuilder *builder, @@ -379,6 +427,7 @@ garrow_int16_array_builder_append_nulls(GArrowInt16ArrayBuilder *builder, #endif #define GARROW_TYPE_UINT16_ARRAY_BUILDER (garrow_uint16_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowUInt16ArrayBuilder, garrow_uint16_array_builder, GARROW, @@ -389,10 +438,12 @@ struct _GArrowUInt16ArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowUInt16ArrayBuilder * garrow_uint16_array_builder_new(void); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_uint16_array_builder_append_value) gboolean garrow_uint16_array_builder_append(GArrowUInt16ArrayBuilder *builder, @@ -404,6 +455,8 @@ gboolean garrow_uint16_array_builder_append_value(GArrowUInt16ArrayBuilder *builder, 
guint16 value, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_uint16_array_builder_append_values(GArrowUInt16ArrayBuilder *builder, const guint16 *values, @@ -412,10 +465,13 @@ garrow_uint16_array_builder_append_values(GArrowUInt16ArrayBuilder *builder, gint64 is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_uint16_array_builder_append_null(GArrowUInt16ArrayBuilder *builder, GError **error); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) gboolean garrow_uint16_array_builder_append_nulls(GArrowUInt16ArrayBuilder *builder, @@ -424,6 +480,7 @@ garrow_uint16_array_builder_append_nulls(GArrowUInt16ArrayBuilder *builder, #endif #define GARROW_TYPE_INT32_ARRAY_BUILDER (garrow_int32_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowInt32ArrayBuilder, garrow_int32_array_builder, GARROW, @@ -434,10 +491,12 @@ struct _GArrowInt32ArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowInt32ArrayBuilder * garrow_int32_array_builder_new(void); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_int32_array_builder_append_value) gboolean garrow_int32_array_builder_append(GArrowInt32ArrayBuilder *builder, @@ -449,6 +508,8 @@ gboolean garrow_int32_array_builder_append_value(GArrowInt32ArrayBuilder *builder, gint32 value, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_int32_array_builder_append_values(GArrowInt32ArrayBuilder *builder, const gint32 *values, @@ -457,9 +518,12 @@ garrow_int32_array_builder_append_values(GArrowInt32ArrayBuilder *builder, gint64 is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_int32_array_builder_append_null(GArrowInt32ArrayBuilder 
*builder, GError **error); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) gboolean garrow_int32_array_builder_append_nulls(GArrowInt32ArrayBuilder *builder, @@ -468,6 +532,7 @@ garrow_int32_array_builder_append_nulls(GArrowInt32ArrayBuilder *builder, #endif #define GARROW_TYPE_UINT32_ARRAY_BUILDER (garrow_uint32_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowUInt32ArrayBuilder, garrow_uint32_array_builder, GARROW, @@ -478,10 +543,12 @@ struct _GArrowUInt32ArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowUInt32ArrayBuilder * garrow_uint32_array_builder_new(void); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_uint32_array_builder_append_value) gboolean garrow_uint32_array_builder_append(GArrowUInt32ArrayBuilder *builder, @@ -493,6 +560,8 @@ gboolean garrow_uint32_array_builder_append_value(GArrowUInt32ArrayBuilder *builder, guint32 value, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_uint32_array_builder_append_values(GArrowUInt32ArrayBuilder *builder, const guint32 *values, @@ -501,10 +570,13 @@ garrow_uint32_array_builder_append_values(GArrowUInt32ArrayBuilder *builder, gint64 is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_uint32_array_builder_append_null(GArrowUInt32ArrayBuilder *builder, GError **error); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) gboolean garrow_uint32_array_builder_append_nulls(GArrowUInt32ArrayBuilder *builder, @@ -513,6 +585,7 @@ garrow_uint32_array_builder_append_nulls(GArrowUInt32ArrayBuilder *builder, #endif #define GARROW_TYPE_INT64_ARRAY_BUILDER (garrow_int64_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowInt64ArrayBuilder, garrow_int64_array_builder, 
GARROW, @@ -523,10 +596,12 @@ struct _GArrowInt64ArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowInt64ArrayBuilder * garrow_int64_array_builder_new(void); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_int64_array_builder_append_value) gboolean garrow_int64_array_builder_append(GArrowInt64ArrayBuilder *builder, @@ -538,6 +613,8 @@ gboolean garrow_int64_array_builder_append_value(GArrowInt64ArrayBuilder *builder, gint64 value, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_int64_array_builder_append_values(GArrowInt64ArrayBuilder *builder, const gint64 *values, @@ -546,9 +623,12 @@ garrow_int64_array_builder_append_values(GArrowInt64ArrayBuilder *builder, gint64 is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_int64_array_builder_append_null(GArrowInt64ArrayBuilder *builder, GError **error); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) gboolean garrow_int64_array_builder_append_nulls(GArrowInt64ArrayBuilder *builder, @@ -557,6 +637,7 @@ garrow_int64_array_builder_append_nulls(GArrowInt64ArrayBuilder *builder, #endif #define GARROW_TYPE_UINT64_ARRAY_BUILDER (garrow_uint64_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowUInt64ArrayBuilder, garrow_uint64_array_builder, GARROW, @@ -567,10 +648,12 @@ struct _GArrowUInt64ArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowUInt64ArrayBuilder * garrow_uint64_array_builder_new(void); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_uint64_array_builder_append_value) gboolean garrow_uint64_array_builder_append(GArrowUInt64ArrayBuilder *builder, @@ -582,6 +665,8 @@ gboolean garrow_uint64_array_builder_append_value(GArrowUInt64ArrayBuilder 
*builder, guint64 value, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_uint64_array_builder_append_values(GArrowUInt64ArrayBuilder *builder, const guint64 *values, @@ -590,10 +675,13 @@ garrow_uint64_array_builder_append_values(GArrowUInt64ArrayBuilder *builder, gint64 is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_uint64_array_builder_append_null(GArrowUInt64ArrayBuilder *builder, GError **error); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) gboolean garrow_uint64_array_builder_append_nulls(GArrowUInt64ArrayBuilder *builder, @@ -602,6 +690,7 @@ garrow_uint64_array_builder_append_nulls(GArrowUInt64ArrayBuilder *builder, #endif #define GARROW_TYPE_HALF_FLOAT_ARRAY_BUILDER (garrow_half_float_array_builder_get_type()) +GARROW_AVAILABLE_IN_11_0 G_DECLARE_DERIVABLE_TYPE(GArrowHalfFloatArrayBuilder, garrow_half_float_array_builder, GARROW, @@ -631,6 +720,7 @@ garrow_half_float_array_builder_append_values(GArrowHalfFloatArrayBuilder *build GError **error); #define GARROW_TYPE_FLOAT_ARRAY_BUILDER (garrow_float_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowFloatArrayBuilder, garrow_float_array_builder, GARROW, @@ -641,10 +731,12 @@ struct _GArrowFloatArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowFloatArrayBuilder * garrow_float_array_builder_new(void); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_float_array_builder_append_value) gboolean garrow_float_array_builder_append(GArrowFloatArrayBuilder *builder, @@ -656,6 +748,8 @@ gboolean garrow_float_array_builder_append_value(GArrowFloatArrayBuilder *builder, gfloat value, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_float_array_builder_append_values(GArrowFloatArrayBuilder *builder, const gfloat 
*values, @@ -664,9 +758,12 @@ garrow_float_array_builder_append_values(GArrowFloatArrayBuilder *builder, gint64 is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_float_array_builder_append_null(GArrowFloatArrayBuilder *builder, GError **error); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) gboolean garrow_float_array_builder_append_nulls(GArrowFloatArrayBuilder *builder, @@ -675,6 +772,7 @@ garrow_float_array_builder_append_nulls(GArrowFloatArrayBuilder *builder, #endif #define GARROW_TYPE_DOUBLE_ARRAY_BUILDER (garrow_double_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDoubleArrayBuilder, garrow_double_array_builder, GARROW, @@ -685,10 +783,12 @@ struct _GArrowDoubleArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDoubleArrayBuilder * garrow_double_array_builder_new(void); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_double_array_builder_append_value) gboolean garrow_double_array_builder_append(GArrowDoubleArrayBuilder *builder, @@ -700,6 +800,8 @@ gboolean garrow_double_array_builder_append_value(GArrowDoubleArrayBuilder *builder, gdouble value, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_double_array_builder_append_values(GArrowDoubleArrayBuilder *builder, const gdouble *values, @@ -708,10 +810,13 @@ garrow_double_array_builder_append_values(GArrowDoubleArrayBuilder *builder, gint64 is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_double_array_builder_append_null(GArrowDoubleArrayBuilder *builder, GError **error); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) gboolean 
garrow_double_array_builder_append_nulls(GArrowDoubleArrayBuilder *builder, @@ -720,6 +825,7 @@ garrow_double_array_builder_append_nulls(GArrowDoubleArrayBuilder *builder, #endif #define GARROW_TYPE_BINARY_ARRAY_BUILDER (garrow_binary_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowBinaryArrayBuilder, garrow_binary_array_builder, GARROW, @@ -730,10 +836,12 @@ struct _GArrowBinaryArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowBinaryArrayBuilder * garrow_binary_array_builder_new(void); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_binary_array_builder_append_value) gboolean garrow_binary_array_builder_append(GArrowBinaryArrayBuilder *builder, @@ -761,10 +869,12 @@ garrow_binary_array_builder_append_values(GArrowBinaryArrayBuilder *builder, gint64 is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_binary_array_builder_append_null(GArrowBinaryArrayBuilder *builder, GError **error); + GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) GARROW_AVAILABLE_IN_0_16 gboolean @@ -775,6 +885,7 @@ garrow_binary_array_builder_append_nulls(GArrowBinaryArrayBuilder *builder, #define GARROW_TYPE_LARGE_BINARY_ARRAY_BUILDER \ (garrow_large_binary_array_builder_get_type()) +GARROW_AVAILABLE_IN_0_16 G_DECLARE_DERIVABLE_TYPE(GArrowLargeBinaryArrayBuilder, garrow_large_binary_array_builder, GARROW, @@ -821,6 +932,7 @@ garrow_large_binary_array_builder_append_nulls(GArrowLargeBinaryArrayBuilder *bu #endif #define GARROW_TYPE_STRING_ARRAY_BUILDER (garrow_string_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowStringArrayBuilder, garrow_string_array_builder, GARROW, @@ -831,10 +943,12 @@ struct _GArrowStringArrayBuilderClass GArrowBinaryArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL 
GArrowStringArrayBuilder * garrow_string_array_builder_new(void); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_string_array_builder_append_value) gboolean garrow_string_array_builder_append(GArrowStringArrayBuilder *builder, @@ -863,6 +977,7 @@ garrow_string_array_builder_append_string_len(GArrowStringArrayBuilder *builder, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_16_FOR(garrow_string_array_builder_append_strings) gboolean garrow_string_array_builder_append_values(GArrowStringArrayBuilder *builder, @@ -883,6 +998,7 @@ garrow_string_array_builder_append_strings(GArrowStringArrayBuilder *builder, #define GARROW_TYPE_LARGE_STRING_ARRAY_BUILDER \ (garrow_large_string_array_builder_get_type()) +GARROW_AVAILABLE_IN_0_16 G_DECLARE_DERIVABLE_TYPE(GArrowLargeStringArrayBuilder, garrow_large_string_array_builder, GARROW, @@ -919,6 +1035,7 @@ garrow_large_string_array_builder_append_strings(GArrowLargeStringArrayBuilder * #define GARROW_TYPE_FIXED_SIZE_BINARY_ARRAY_BUILDER \ (garrow_fixed_size_binary_array_builder_get_type()) +GARROW_AVAILABLE_IN_3_0 G_DECLARE_DERIVABLE_TYPE(GArrowFixedSizeBinaryArrayBuilder, garrow_fixed_size_binary_array_builder, GARROW, @@ -963,6 +1080,7 @@ garrow_fixed_size_binary_array_builder_append_values_packed( GError **error); #define GARROW_TYPE_DATE32_ARRAY_BUILDER (garrow_date32_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDate32ArrayBuilder, garrow_date32_array_builder, GARROW, @@ -973,10 +1091,12 @@ struct _GArrowDate32ArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDate32ArrayBuilder * garrow_date32_array_builder_new(void); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_date32_array_builder_append_value) gboolean garrow_date32_array_builder_append(GArrowDate32ArrayBuilder *builder, @@ -988,6 +1108,8 @@ gboolean 
garrow_date32_array_builder_append_value(GArrowDate32ArrayBuilder *builder, gint32 value, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_date32_array_builder_append_values(GArrowDate32ArrayBuilder *builder, const gint32 *values, @@ -996,10 +1118,13 @@ garrow_date32_array_builder_append_values(GArrowDate32ArrayBuilder *builder, gint64 is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_date32_array_builder_append_null(GArrowDate32ArrayBuilder *builder, GError **error); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) gboolean garrow_date32_array_builder_append_nulls(GArrowDate32ArrayBuilder *builder, @@ -1008,6 +1133,7 @@ garrow_date32_array_builder_append_nulls(GArrowDate32ArrayBuilder *builder, #endif #define GARROW_TYPE_DATE64_ARRAY_BUILDER (garrow_date64_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDate64ArrayBuilder, garrow_date64_array_builder, GARROW, @@ -1018,10 +1144,12 @@ struct _GArrowDate64ArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDate64ArrayBuilder * garrow_date64_array_builder_new(void); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_date64_array_builder_append_value) gboolean garrow_date64_array_builder_append(GArrowDate64ArrayBuilder *builder, @@ -1033,6 +1161,8 @@ gboolean garrow_date64_array_builder_append_value(GArrowDate64ArrayBuilder *builder, gint64 value, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_date64_array_builder_append_values(GArrowDate64ArrayBuilder *builder, const gint64 *values, @@ -1041,10 +1171,13 @@ garrow_date64_array_builder_append_values(GArrowDate64ArrayBuilder *builder, gint64 is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL 
GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_date64_array_builder_append_null(GArrowDate64ArrayBuilder *builder, GError **error); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) gboolean garrow_date64_array_builder_append_nulls(GArrowDate64ArrayBuilder *builder, @@ -1053,6 +1186,7 @@ garrow_date64_array_builder_append_nulls(GArrowDate64ArrayBuilder *builder, #endif #define GARROW_TYPE_TIMESTAMP_ARRAY_BUILDER (garrow_timestamp_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowTimestampArrayBuilder, garrow_timestamp_array_builder, GARROW, @@ -1063,10 +1197,12 @@ struct _GArrowTimestampArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowTimestampArrayBuilder * garrow_timestamp_array_builder_new(GArrowTimestampDataType *data_type); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_timestamp_array_builder_append_value) gboolean garrow_timestamp_array_builder_append(GArrowTimestampArrayBuilder *builder, @@ -1078,6 +1214,8 @@ gboolean garrow_timestamp_array_builder_append_value(GArrowTimestampArrayBuilder *builder, gint64 value, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_timestamp_array_builder_append_values(GArrowTimestampArrayBuilder *builder, const gint64 *values, @@ -1086,10 +1224,13 @@ garrow_timestamp_array_builder_append_values(GArrowTimestampArrayBuilder *builde gint64 is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_timestamp_array_builder_append_null(GArrowTimestampArrayBuilder *builder, GError **error); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) gboolean garrow_timestamp_array_builder_append_nulls(GArrowTimestampArrayBuilder *builder, @@ -1098,6 +1239,7 @@ 
garrow_timestamp_array_builder_append_nulls(GArrowTimestampArrayBuilder *builder #endif #define GARROW_TYPE_TIME32_ARRAY_BUILDER (garrow_time32_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowTime32ArrayBuilder, garrow_time32_array_builder, GARROW, @@ -1108,10 +1250,12 @@ struct _GArrowTime32ArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowTime32ArrayBuilder * garrow_time32_array_builder_new(GArrowTime32DataType *data_type); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_time32_array_builder_append_value) gboolean garrow_time32_array_builder_append(GArrowTime32ArrayBuilder *builder, @@ -1123,6 +1267,8 @@ gboolean garrow_time32_array_builder_append_value(GArrowTime32ArrayBuilder *builder, gint32 value, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_time32_array_builder_append_values(GArrowTime32ArrayBuilder *builder, const gint32 *values, @@ -1131,10 +1277,13 @@ garrow_time32_array_builder_append_values(GArrowTime32ArrayBuilder *builder, gint64 is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_time32_array_builder_append_null(GArrowTime32ArrayBuilder *builder, GError **error); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) gboolean garrow_time32_array_builder_append_nulls(GArrowTime32ArrayBuilder *builder, @@ -1143,6 +1292,7 @@ garrow_time32_array_builder_append_nulls(GArrowTime32ArrayBuilder *builder, #endif #define GARROW_TYPE_TIME64_ARRAY_BUILDER (garrow_time64_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowTime64ArrayBuilder, garrow_time64_array_builder, GARROW, @@ -1153,10 +1303,12 @@ struct _GArrowTime64ArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowTime64ArrayBuilder * 
garrow_time64_array_builder_new(GArrowTime64DataType *data_type); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_time64_array_builder_append_value) gboolean garrow_time64_array_builder_append(GArrowTime64ArrayBuilder *builder, @@ -1168,6 +1320,8 @@ gboolean garrow_time64_array_builder_append_value(GArrowTime64ArrayBuilder *builder, gint64 value, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_time64_array_builder_append_values(GArrowTime64ArrayBuilder *builder, const gint64 *values, @@ -1176,10 +1330,13 @@ garrow_time64_array_builder_append_values(GArrowTime64ArrayBuilder *builder, gint64 is_valids_length, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_time64_array_builder_append_null(GArrowTime64ArrayBuilder *builder, GError **error); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_nulls) gboolean garrow_time64_array_builder_append_nulls(GArrowTime64ArrayBuilder *builder, @@ -1189,6 +1346,7 @@ garrow_time64_array_builder_append_nulls(GArrowTime64ArrayBuilder *builder, #define GARROW_TYPE_MONTH_INTERVAL_ARRAY_BUILDER \ (garrow_month_interval_array_builder_get_type()) +GARROW_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE(GArrowMonthIntervalArrayBuilder, garrow_month_interval_array_builder, GARROW, @@ -1220,6 +1378,7 @@ garrow_month_interval_array_builder_append_values( #define GARROW_TYPE_DAY_TIME_INTERVAL_ARRAY_BUILDER \ (garrow_day_time_interval_array_builder_get_type()) +GARROW_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE(GArrowDayTimeIntervalArrayBuilder, garrow_day_time_interval_array_builder, GARROW, @@ -1252,6 +1411,7 @@ garrow_day_time_interval_array_builder_append_values( #define GARROW_TYPE_MONTH_DAY_NANO_INTERVAL_ARRAY_BUILDER \ (garrow_month_day_nano_interval_array_builder_get_type()) +GARROW_AVAILABLE_IN_8_0 
G_DECLARE_DERIVABLE_TYPE(GArrowMonthDayNanoIntervalArrayBuilder, garrow_month_day_nano_interval_array_builder, GARROW, @@ -1284,6 +1444,7 @@ garrow_month_day_nano_interval_array_builder_append_values( #define GARROW_TYPE_BINARY_DICTIONARY_ARRAY_BUILDER \ (garrow_binary_dictionary_array_builder_get_type()) +GARROW_AVAILABLE_IN_2_0 G_DECLARE_DERIVABLE_TYPE(GArrowBinaryDictionaryArrayBuilder, garrow_binary_dictionary_array_builder, GARROW, @@ -1350,6 +1511,7 @@ garrow_binary_dictionary_array_builder_reset_full( #define GARROW_TYPE_STRING_DICTIONARY_ARRAY_BUILDER \ (garrow_string_dictionary_array_builder_get_type()) +GARROW_AVAILABLE_IN_2_0 G_DECLARE_DERIVABLE_TYPE(GArrowStringDictionaryArrayBuilder, garrow_string_dictionary_array_builder, GARROW, @@ -1408,6 +1570,7 @@ garrow_string_dictionary_array_builder_reset_full( GArrowStringDictionaryArrayBuilder *builder); #define GARROW_TYPE_LIST_ARRAY_BUILDER (garrow_list_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowListArrayBuilder, garrow_list_array_builder, GARROW, @@ -1418,10 +1581,12 @@ struct _GArrowListArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowListArrayBuilder * garrow_list_array_builder_new(GArrowListDataType *data_type, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_list_array_builder_append_value) gboolean garrow_list_array_builder_append(GArrowListArrayBuilder *builder, GError **error); @@ -1430,15 +1595,18 @@ GARROW_AVAILABLE_IN_0_12 gboolean garrow_list_array_builder_append_value(GArrowListArrayBuilder *builder, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_list_array_builder_append_null(GArrowListArrayBuilder *builder, GError **error); #endif +GARROW_AVAILABLE_IN_ALL GArrowArrayBuilder * garrow_list_array_builder_get_value_builder(GArrowListArrayBuilder 
*builder); #define GARROW_TYPE_LARGE_LIST_ARRAY_BUILDER (garrow_large_list_array_builder_get_type()) +GARROW_AVAILABLE_IN_0_16 G_DECLARE_DERIVABLE_TYPE(GArrowLargeListArrayBuilder, garrow_large_list_array_builder, GARROW, @@ -1468,6 +1636,7 @@ GArrowArrayBuilder * garrow_large_list_array_builder_get_value_builder(GArrowLargeListArrayBuilder *builder); #define GARROW_TYPE_STRUCT_ARRAY_BUILDER (garrow_struct_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowStructArrayBuilder, garrow_struct_array_builder, GARROW, @@ -1478,10 +1647,12 @@ struct _GArrowStructArrayBuilderClass GArrowArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowStructArrayBuilder * garrow_struct_array_builder_new(GArrowStructDataType *data_type, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_struct_array_builder_append_value) gboolean garrow_struct_array_builder_append(GArrowStructArrayBuilder *builder, GError **error); @@ -1491,6 +1662,7 @@ gboolean garrow_struct_array_builder_append_value(GArrowStructArrayBuilder *builder, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_builder_append_null) gboolean garrow_struct_array_builder_append_null(GArrowStructArrayBuilder *builder, @@ -1498,15 +1670,19 @@ garrow_struct_array_builder_append_null(GArrowStructArrayBuilder *builder, #endif #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_12_0_FOR(garrow_array_builder_get_child) GArrowArrayBuilder * garrow_struct_array_builder_get_field_builder(GArrowStructArrayBuilder *builder, gint i); + +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_12_0_FOR(garrow_array_builder_get_children) GList * garrow_struct_array_builder_get_field_builders(GArrowStructArrayBuilder *builder); #endif #define GARROW_TYPE_MAP_ARRAY_BUILDER (garrow_map_array_builder_get_type()) +GARROW_AVAILABLE_IN_0_17 
G_DECLARE_DERIVABLE_TYPE(GArrowMapArrayBuilder, garrow_map_array_builder, GARROW, @@ -1554,6 +1730,7 @@ GArrowArrayBuilder * garrow_map_array_builder_get_value_builder(GArrowMapArrayBuilder *builder); #define GARROW_TYPE_DECIMAL128_ARRAY_BUILDER (garrow_decimal128_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDecimal128ArrayBuilder, garrow_decimal128_array_builder, GARROW, @@ -1564,10 +1741,12 @@ struct _GArrowDecimal128ArrayBuilderClass GArrowFixedSizeBinaryArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDecimal128ArrayBuilder * garrow_decimal128_array_builder_new(GArrowDecimal128DataType *data_type); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_decimal128_array_builder_append_value) gboolean garrow_decimal128_array_builder_append(GArrowDecimal128ArrayBuilder *builder, @@ -1596,6 +1775,7 @@ garrow_decimal128_array_builder_append_null(GArrowDecimal128ArrayBuilder *builde #endif #define GARROW_TYPE_DECIMAL256_ARRAY_BUILDER (garrow_decimal256_array_builder_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDecimal256ArrayBuilder, garrow_decimal256_array_builder, GARROW, @@ -1606,6 +1786,7 @@ struct _GArrowDecimal256ArrayBuilderClass GArrowFixedSizeBinaryArrayBuilderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDecimal256ArrayBuilder * garrow_decimal256_array_builder_new(GArrowDecimal256DataType *data_type); @@ -1624,6 +1805,7 @@ garrow_decimal256_array_builder_append_values(GArrowDecimal256ArrayBuilder *buil GError **error); #define GARROW_TYPE_UNION_ARRAY_BUILDER (garrow_union_array_builder_get_type()) +GARROW_AVAILABLE_IN_12_0 G_DECLARE_DERIVABLE_TYPE(GArrowUnionArrayBuilder, garrow_union_array_builder, GARROW, @@ -1648,6 +1830,7 @@ garrow_union_array_builder_append_value(GArrowUnionArrayBuilder *builder, #define GARROW_TYPE_DENSE_UNION_ARRAY_BUILDER \ (garrow_dense_union_array_builder_get_type()) +GARROW_AVAILABLE_IN_12_0 
G_DECLARE_DERIVABLE_TYPE(GArrowDenseUnionArrayBuilder, garrow_dense_union_array_builder, GARROW, @@ -1664,6 +1847,7 @@ garrow_dense_union_array_builder_new(GArrowDenseUnionDataType *data_type, GError #define GARROW_TYPE_SPARSE_UNION_ARRAY_BUILDER \ (garrow_sparse_union_array_builder_get_type()) +GARROW_AVAILABLE_IN_12_0 G_DECLARE_DERIVABLE_TYPE(GArrowSparseUnionArrayBuilder, garrow_sparse_union_array_builder, GARROW, diff --git a/c_glib/arrow-glib/basic-array-definition.h b/c_glib/arrow-glib/basic-array-definition.h index 54642dae018ec..2fa67c09c1cc4 100644 --- a/c_glib/arrow-glib/basic-array-definition.h +++ b/c_glib/arrow-glib/basic-array-definition.h @@ -21,9 +21,12 @@ #include +#include + G_BEGIN_DECLS #define GARROW_TYPE_ARRAY (garrow_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowArray, garrow_array, GARROW, ARRAY, GObject) struct _GArrowArrayClass { @@ -31,6 +34,7 @@ struct _GArrowArrayClass }; #define GARROW_TYPE_EXTENSION_ARRAY (garrow_extension_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowExtensionArray, garrow_extension_array, GARROW, EXTENSION_ARRAY, GArrowArray) struct _GArrowExtensionArrayClass diff --git a/c_glib/arrow-glib/basic-array.h b/c_glib/arrow-glib/basic-array.h index ee6f40b1ddc24..95679aa37c57a 100644 --- a/c_glib/arrow-glib/basic-array.h +++ b/c_glib/arrow-glib/basic-array.h @@ -27,6 +27,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_EQUAL_OPTIONS (garrow_equal_options_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowEqualOptions, garrow_equal_options, GARROW, EQUAL_OPTIONS, GObject) struct _GArrowEqualOptionsClass @@ -52,6 +53,7 @@ garrow_array_export(GArrowArray *array, gpointer *c_abi_schema, GError **error); +GARROW_AVAILABLE_IN_ALL gboolean garrow_array_equal(GArrowArray *array, GArrowArray *other_array); GARROW_AVAILABLE_IN_5_0 @@ -59,8 +61,11 @@ gboolean garrow_array_equal_options(GArrowArray *array, GArrowArray *other_array, GArrowEqualOptions *options); 
+GARROW_AVAILABLE_IN_ALL gboolean garrow_array_equal_approx(GArrowArray *array, GArrowArray *other_array); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_array_equal_range(GArrowArray *array, gint64 start_index, @@ -69,37 +74,60 @@ garrow_array_equal_range(GArrowArray *array, gint64 end_index, GArrowEqualOptions *options); +GARROW_AVAILABLE_IN_ALL gboolean garrow_array_is_null(GArrowArray *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_array_is_valid(GArrowArray *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL gint64 garrow_array_get_length(GArrowArray *array); + +GARROW_AVAILABLE_IN_ALL gint64 garrow_array_get_offset(GArrowArray *array); + +GARROW_AVAILABLE_IN_ALL gint64 garrow_array_get_n_nulls(GArrowArray *array); + +GARROW_AVAILABLE_IN_ALL GArrowBuffer * garrow_array_get_null_bitmap(GArrowArray *array); + +GARROW_AVAILABLE_IN_ALL GArrowDataType * garrow_array_get_value_data_type(GArrowArray *array); + +GARROW_AVAILABLE_IN_ALL GArrowType garrow_array_get_value_type(GArrowArray *array); + +GARROW_AVAILABLE_IN_ALL GArrowArray * garrow_array_slice(GArrowArray *array, gint64 offset, gint64 length); + +GARROW_AVAILABLE_IN_ALL gchar * garrow_array_to_string(GArrowArray *array, GError **error); + GARROW_AVAILABLE_IN_0_15 GArrowArray * garrow_array_view(GArrowArray *array, GArrowDataType *return_type, GError **error); + GARROW_AVAILABLE_IN_0_15 gchar * garrow_array_diff_unified(GArrowArray *array, GArrowArray *other_array); + GARROW_AVAILABLE_IN_4_0 GArrowArray * garrow_array_concatenate(GArrowArray *array, GList *other_arrays, GError **error); #define GARROW_TYPE_NULL_ARRAY (garrow_null_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowNullArray, garrow_null_array, GARROW, NULL_ARRAY, GArrowArray) struct _GArrowNullArrayClass @@ -107,10 +135,12 @@ struct _GArrowNullArrayClass GArrowArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowNullArray * garrow_null_array_new(gint64 length); #define GARROW_TYPE_PRIMITIVE_ARRAY 
(garrow_primitive_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowPrimitiveArray, garrow_primitive_array, GARROW, PRIMITIVE_ARRAY, GArrowArray) struct _GArrowPrimitiveArrayClass @@ -119,6 +149,7 @@ struct _GArrowPrimitiveArrayClass }; #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_1_0_FOR(garrow_primitive_array_get_data_buffer) GArrowBuffer * garrow_primitive_array_get_buffer(GArrowPrimitiveArray *array); @@ -128,6 +159,7 @@ GArrowBuffer * garrow_primitive_array_get_data_buffer(GArrowPrimitiveArray *array); #define GARROW_TYPE_BOOLEAN_ARRAY (garrow_boolean_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowBooleanArray, garrow_boolean_array, GARROW, BOOLEAN_ARRAY, GArrowPrimitiveArray) struct _GArrowBooleanArrayClass @@ -135,18 +167,23 @@ struct _GArrowBooleanArrayClass GArrowPrimitiveArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowBooleanArray * garrow_boolean_array_new(gint64 length, GArrowBuffer *data, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL gboolean garrow_boolean_array_get_value(GArrowBooleanArray *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL gboolean * garrow_boolean_array_get_values(GArrowBooleanArray *array, gint64 *length); #define GARROW_TYPE_NUMERIC_ARRAY (garrow_numeric_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowNumericArray, garrow_numeric_array, GARROW, NUMERIC_ARRAY, GArrowPrimitiveArray) struct _GArrowNumericArrayClass @@ -155,6 +192,7 @@ struct _GArrowNumericArrayClass }; #define GARROW_TYPE_INT8_ARRAY (garrow_int8_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowInt8Array, garrow_int8_array, GARROW, INT8_ARRAY, GArrowNumericArray) struct _GArrowInt8ArrayClass @@ -162,18 +200,23 @@ struct _GArrowInt8ArrayClass GArrowNumericArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowInt8Array * garrow_int8_array_new(gint64 length, GArrowBuffer *data, GArrowBuffer 
*null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL gint8 garrow_int8_array_get_value(GArrowInt8Array *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL const gint8 * garrow_int8_array_get_values(GArrowInt8Array *array, gint64 *length); #define GARROW_TYPE_UINT8_ARRAY (garrow_uint8_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowUInt8Array, garrow_uint8_array, GARROW, UINT8_ARRAY, GArrowNumericArray) struct _GArrowUInt8ArrayClass @@ -181,18 +224,23 @@ struct _GArrowUInt8ArrayClass GArrowNumericArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowUInt8Array * garrow_uint8_array_new(gint64 length, GArrowBuffer *data, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL guint8 garrow_uint8_array_get_value(GArrowUInt8Array *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL const guint8 * garrow_uint8_array_get_values(GArrowUInt8Array *array, gint64 *length); #define GARROW_TYPE_INT16_ARRAY (garrow_int16_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowInt16Array, garrow_int16_array, GARROW, INT16_ARRAY, GArrowNumericArray) struct _GArrowInt16ArrayClass @@ -200,18 +248,23 @@ struct _GArrowInt16ArrayClass GArrowNumericArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowInt16Array * garrow_int16_array_new(gint64 length, GArrowBuffer *data, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL gint16 garrow_int16_array_get_value(GArrowInt16Array *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL const gint16 * garrow_int16_array_get_values(GArrowInt16Array *array, gint64 *length); #define GARROW_TYPE_UINT16_ARRAY (garrow_uint16_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowUInt16Array, garrow_uint16_array, GARROW, UINT16_ARRAY, GArrowNumericArray) struct _GArrowUInt16ArrayClass @@ -219,18 +272,23 @@ struct _GArrowUInt16ArrayClass GArrowNumericArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowUInt16Array * garrow_uint16_array_new(gint64 length, 
GArrowBuffer *data, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL guint16 garrow_uint16_array_get_value(GArrowUInt16Array *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL const guint16 * garrow_uint16_array_get_values(GArrowUInt16Array *array, gint64 *length); #define GARROW_TYPE_INT32_ARRAY (garrow_int32_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowInt32Array, garrow_int32_array, GARROW, INT32_ARRAY, GArrowNumericArray) struct _GArrowInt32ArrayClass @@ -238,18 +296,23 @@ struct _GArrowInt32ArrayClass GArrowNumericArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowInt32Array * garrow_int32_array_new(gint64 length, GArrowBuffer *data, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL gint32 garrow_int32_array_get_value(GArrowInt32Array *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL const gint32 * garrow_int32_array_get_values(GArrowInt32Array *array, gint64 *length); #define GARROW_TYPE_UINT32_ARRAY (garrow_uint32_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowUInt32Array, garrow_uint32_array, GARROW, UINT32_ARRAY, GArrowNumericArray) struct _GArrowUInt32ArrayClass @@ -257,18 +320,23 @@ struct _GArrowUInt32ArrayClass GArrowNumericArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowUInt32Array * garrow_uint32_array_new(gint64 length, GArrowBuffer *data, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL guint32 garrow_uint32_array_get_value(GArrowUInt32Array *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL const guint32 * garrow_uint32_array_get_values(GArrowUInt32Array *array, gint64 *length); #define GARROW_TYPE_INT64_ARRAY (garrow_int64_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowInt64Array, garrow_int64_array, GARROW, INT64_ARRAY, GArrowNumericArray) struct _GArrowInt64ArrayClass @@ -276,18 +344,23 @@ struct _GArrowInt64ArrayClass GArrowNumericArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL 
GArrowInt64Array * garrow_int64_array_new(gint64 length, GArrowBuffer *data, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL gint64 garrow_int64_array_get_value(GArrowInt64Array *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL const gint64 * garrow_int64_array_get_values(GArrowInt64Array *array, gint64 *length); #define GARROW_TYPE_UINT64_ARRAY (garrow_uint64_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowUInt64Array, garrow_uint64_array, GARROW, UINT64_ARRAY, GArrowNumericArray) struct _GArrowUInt64ArrayClass @@ -295,18 +368,23 @@ struct _GArrowUInt64ArrayClass GArrowNumericArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowUInt64Array * garrow_uint64_array_new(gint64 length, GArrowBuffer *data, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL guint64 garrow_uint64_array_get_value(GArrowUInt64Array *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL const guint64 * garrow_uint64_array_get_values(GArrowUInt64Array *array, gint64 *length); #define GARROW_TYPE_HALF_FLOAT_ARRAY (garrow_half_float_array_get_type()) +GARROW_AVAILABLE_IN_11_0 G_DECLARE_DERIVABLE_TYPE(GArrowHalfFloatArray, garrow_half_float_array, GARROW, @@ -332,6 +410,7 @@ const guint16 * garrow_half_float_array_get_values(GArrowHalfFloatArray *array, gint64 *length); #define GARROW_TYPE_FLOAT_ARRAY (garrow_float_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowFloatArray, garrow_float_array, GARROW, FLOAT_ARRAY, GArrowNumericArray) struct _GArrowFloatArrayClass @@ -339,18 +418,23 @@ struct _GArrowFloatArrayClass GArrowNumericArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowFloatArray * garrow_float_array_new(gint64 length, GArrowBuffer *data, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL gfloat garrow_float_array_get_value(GArrowFloatArray *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL const gfloat * garrow_float_array_get_values(GArrowFloatArray *array, gint64 *length); #define 
GARROW_TYPE_DOUBLE_ARRAY (garrow_double_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowDoubleArray, garrow_double_array, GARROW, DOUBLE_ARRAY, GArrowNumericArray) struct _GArrowDoubleArrayClass @@ -358,18 +442,23 @@ struct _GArrowDoubleArrayClass GArrowNumericArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDoubleArray * garrow_double_array_new(gint64 length, GArrowBuffer *data, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL gdouble garrow_double_array_get_value(GArrowDoubleArray *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL const gdouble * garrow_double_array_get_values(GArrowDoubleArray *array, gint64 *length); #define GARROW_TYPE_BINARY_ARRAY (garrow_binary_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowBinaryArray, garrow_binary_array, GARROW, BINARY_ARRAY, GArrowArray) struct _GArrowBinaryArrayClass @@ -377,6 +466,7 @@ struct _GArrowBinaryArrayClass GArrowArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowBinaryArray * garrow_binary_array_new(gint64 length, GArrowBuffer *value_offsets, @@ -384,9 +474,12 @@ garrow_binary_array_new(gint64 length, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL GBytes * garrow_binary_array_get_value(GArrowBinaryArray *array, gint64 i); + #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_1_0_FOR(garrow_binary_array_get_data_buffer) GArrowBuffer * garrow_binary_array_get_buffer(GArrowBinaryArray *array); @@ -394,10 +487,13 @@ garrow_binary_array_get_buffer(GArrowBinaryArray *array); GARROW_AVAILABLE_IN_1_0 GArrowBuffer * garrow_binary_array_get_data_buffer(GArrowBinaryArray *array); + +GARROW_AVAILABLE_IN_ALL GArrowBuffer * garrow_binary_array_get_offsets_buffer(GArrowBinaryArray *array); #define GARROW_TYPE_LARGE_BINARY_ARRAY (garrow_large_binary_array_get_type()) +GARROW_AVAILABLE_IN_0_16 G_DECLARE_DERIVABLE_TYPE(GArrowLargeBinaryArray, garrow_large_binary_array, GARROW, @@ -428,11 
+524,13 @@ garrow_large_binary_array_get_buffer(GArrowLargeBinaryArray *array); GARROW_AVAILABLE_IN_1_0 GArrowBuffer * garrow_large_binary_array_get_data_buffer(GArrowLargeBinaryArray *array); + GARROW_AVAILABLE_IN_0_16 GArrowBuffer * garrow_large_binary_array_get_offsets_buffer(GArrowLargeBinaryArray *array); #define GARROW_TYPE_STRING_ARRAY (garrow_string_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowStringArray, garrow_string_array, GARROW, STRING_ARRAY, GArrowBinaryArray) struct _GArrowStringArrayClass @@ -440,6 +538,7 @@ struct _GArrowStringArrayClass GArrowBinaryArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowStringArray * garrow_string_array_new(gint64 length, GArrowBuffer *value_offsets, @@ -447,10 +546,12 @@ garrow_string_array_new(gint64 length, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL gchar * garrow_string_array_get_string(GArrowStringArray *array, gint64 i); #define GARROW_TYPE_LARGE_STRING_ARRAY (garrow_large_string_array_get_type()) +GARROW_AVAILABLE_IN_0_16 G_DECLARE_DERIVABLE_TYPE(GArrowLargeStringArray, garrow_large_string_array, GARROW, @@ -474,6 +575,7 @@ gchar * garrow_large_string_array_get_string(GArrowLargeStringArray *array, gint64 i); #define GARROW_TYPE_DATE32_ARRAY (garrow_date32_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowDate32Array, garrow_date32_array, GARROW, DATE32_ARRAY, GArrowNumericArray) struct _GArrowDate32ArrayClass @@ -481,18 +583,23 @@ struct _GArrowDate32ArrayClass GArrowNumericArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDate32Array * garrow_date32_array_new(gint64 length, GArrowBuffer *data, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL gint32 garrow_date32_array_get_value(GArrowDate32Array *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL const gint32 * garrow_date32_array_get_values(GArrowDate32Array *array, gint64 *length); #define GARROW_TYPE_DATE64_ARRAY (garrow_date64_array_get_type()) 
+GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowDate64Array, garrow_date64_array, GARROW, DATE64_ARRAY, GArrowNumericArray) struct _GArrowDate64ArrayClass @@ -500,18 +607,23 @@ struct _GArrowDate64ArrayClass GArrowNumericArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDate64Array * garrow_date64_array_new(gint64 length, GArrowBuffer *data, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL gint64 garrow_date64_array_get_value(GArrowDate64Array *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL const gint64 * garrow_date64_array_get_values(GArrowDate64Array *array, gint64 *length); #define GARROW_TYPE_TIMESTAMP_ARRAY (garrow_timestamp_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowTimestampArray, garrow_timestamp_array, GARROW, @@ -522,6 +634,7 @@ struct _GArrowTimestampArrayClass GArrowNumericArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowTimestampArray * garrow_timestamp_array_new(GArrowTimestampDataType *data_type, gint64 length, @@ -529,12 +642,16 @@ garrow_timestamp_array_new(GArrowTimestampDataType *data_type, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL gint64 garrow_timestamp_array_get_value(GArrowTimestampArray *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL const gint64 * garrow_timestamp_array_get_values(GArrowTimestampArray *array, gint64 *length); #define GARROW_TYPE_TIME32_ARRAY (garrow_time32_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowTime32Array, garrow_time32_array, GARROW, TIME32_ARRAY, GArrowNumericArray) struct _GArrowTime32ArrayClass @@ -542,6 +659,7 @@ struct _GArrowTime32ArrayClass GArrowNumericArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowTime32Array * garrow_time32_array_new(GArrowTime32DataType *data_type, gint64 length, @@ -549,12 +667,16 @@ garrow_time32_array_new(GArrowTime32DataType *data_type, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL gint32 
garrow_time32_array_get_value(GArrowTime32Array *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL const gint32 * garrow_time32_array_get_values(GArrowTime32Array *array, gint64 *length); #define GARROW_TYPE_TIME64_ARRAY (garrow_time64_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowTime64Array, garrow_time64_array, GARROW, TIME64_ARRAY, GArrowNumericArray) struct _GArrowTime64ArrayClass @@ -562,6 +684,7 @@ struct _GArrowTime64ArrayClass GArrowNumericArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowTime64Array * garrow_time64_array_new(GArrowTime64DataType *data_type, gint64 length, @@ -569,12 +692,16 @@ garrow_time64_array_new(GArrowTime64DataType *data_type, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL gint64 garrow_time64_array_get_value(GArrowTime64Array *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL const gint64 * garrow_time64_array_get_values(GArrowTime64Array *array, gint64 *length); #define GARROW_TYPE_MONTH_INTERVAL_ARRAY (garrow_month_interval_array_get_type()) +GARROW_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE(GArrowMonthIntervalArray, garrow_month_interval_array, GARROW, @@ -594,11 +721,13 @@ garrow_month_interval_array_new(gint64 length, GARROW_AVAILABLE_IN_8_0 gint32 garrow_month_interval_array_get_value(GArrowMonthIntervalArray *array, gint64 i); + GARROW_AVAILABLE_IN_8_0 const gint32 * garrow_month_interval_array_get_values(GArrowMonthIntervalArray *array, gint64 *length); #define GARROW_TYPE_DAY_TIME_INTERVAL_ARRAY (garrow_day_time_interval_array_get_type()) +GARROW_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE(GArrowDayTimeIntervalArray, garrow_day_time_interval_array, GARROW, @@ -618,12 +747,14 @@ garrow_day_time_interval_array_new(gint64 length, GARROW_AVAILABLE_IN_8_0 GArrowDayMillisecond * garrow_day_time_interval_array_get_value(GArrowDayTimeIntervalArray *array, gint64 i); + GARROW_AVAILABLE_IN_8_0 GList * garrow_day_time_interval_array_get_values(GArrowDayTimeIntervalArray *array); #define 
GARROW_TYPE_MONTH_DAY_NANO_INTERVAL_ARRAY \ (garrow_month_day_nano_interval_array_get_type()) +GARROW_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE(GArrowMonthDayNanoIntervalArray, garrow_month_day_nano_interval_array, GARROW, @@ -649,6 +780,7 @@ GList * garrow_month_day_nano_interval_array_get_values(GArrowMonthDayNanoIntervalArray *array); #define GARROW_TYPE_FIXED_SIZE_BINARY_ARRAY (garrow_fixed_size_binary_array_get_type()) +GARROW_AVAILABLE_IN_3_0 G_DECLARE_DERIVABLE_TYPE(GArrowFixedSizeBinaryArray, garrow_fixed_size_binary_array, GARROW, @@ -669,14 +801,17 @@ garrow_fixed_size_binary_array_new(GArrowFixedSizeBinaryDataType *data_type, GARROW_AVAILABLE_IN_3_0 gint32 garrow_fixed_size_binary_array_get_byte_width(GArrowFixedSizeBinaryArray *array); + GARROW_AVAILABLE_IN_3_0 GBytes * garrow_fixed_size_binary_array_get_value(GArrowFixedSizeBinaryArray *array, gint64 i); + GARROW_AVAILABLE_IN_3_0 GBytes * garrow_fixed_size_binary_array_get_values_bytes(GArrowFixedSizeBinaryArray *array); #define GARROW_TYPE_DECIMAL128_ARRAY (garrow_decimal128_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDecimal128Array, garrow_decimal128_array, GARROW, @@ -687,12 +822,16 @@ struct _GArrowDecimal128ArrayClass GArrowFixedSizeBinaryArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL gchar * garrow_decimal128_array_format_value(GArrowDecimal128Array *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL GArrowDecimal128 * garrow_decimal128_array_get_value(GArrowDecimal128Array *array, gint64 i); #define GARROW_TYPE_DECIMAL256_ARRAY (garrow_decimal256_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDecimal256Array, garrow_decimal256_array, GARROW, @@ -703,8 +842,11 @@ struct _GArrowDecimal256ArrayClass GArrowFixedSizeBinaryArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL gchar * garrow_decimal256_array_format_value(GArrowDecimal256Array *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL GArrowDecimal256 * 
garrow_decimal256_array_get_value(GArrowDecimal256Array *array, gint64 i); diff --git a/c_glib/arrow-glib/basic-array.hpp b/c_glib/arrow-glib/basic-array.hpp index f010cf3db4bc3..b2a7ed6ae075f 100644 --- a/c_glib/arrow-glib/basic-array.hpp +++ b/c_glib/arrow-glib/basic-array.hpp @@ -23,22 +23,32 @@ #include +GARROW_EXTERN arrow::EqualOptions * garrow_equal_options_get_raw(GArrowEqualOptions *equal_options); +GARROW_EXTERN GArrowArray * garrow_array_new_raw(std::shared_ptr *arrow_array); + +GARROW_EXTERN GArrowArray * garrow_array_new_raw(std::shared_ptr *arrow_array, const gchar *first_property_name, ...); + +GARROW_EXTERN GArrowArray * garrow_array_new_raw_valist(std::shared_ptr *arrow_array, const gchar *first_property_name, va_list args); + +GARROW_EXTERN GArrowExtensionArray * garrow_extension_array_new_raw(std::shared_ptr *arrow_array, GArrowArray *storage); + +GARROW_EXTERN std::shared_ptr garrow_array_get_raw(GArrowArray *array); diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp index 36646a9733cd3..d1c06000065dc 100644 --- a/c_glib/arrow-glib/basic-data-type.cpp +++ b/c_glib/arrow-glib/basic-data-type.cpp @@ -1801,6 +1801,8 @@ garrow_extension_data_type_wrap_chunked_array(GArrowExtensionDataType *data_type return garrow_chunked_array_new_raw(&arrow_extension_chunked_array); } +G_END_DECLS + static std::shared_ptr garrow_extension_data_type_get_storage_data_type_raw(GArrowExtensionDataType *data_type) { @@ -1808,8 +1810,6 @@ garrow_extension_data_type_get_storage_data_type_raw(GArrowExtensionDataType *da return garrow_data_type_get_raw(priv->storage_data_type); } -G_END_DECLS - namespace garrow { GExtensionType::GExtensionType(GArrowExtensionDataType *garrow_data_type) : arrow::ExtensionType( diff --git a/c_glib/arrow-glib/basic-data-type.h b/c_glib/arrow-glib/basic-data-type.h index 01c9e5ef6e40a..77180018c9be8 100644 --- a/c_glib/arrow-glib/basic-data-type.h +++ b/c_glib/arrow-glib/basic-data-type.h @@ -28,6 +28,7 
@@ G_BEGIN_DECLS #define GARROW_TYPE_DATA_TYPE (garrow_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDataType, garrow_data_type, GARROW, DATA_TYPE, GObject) struct _GArrowDataTypeClass { @@ -42,17 +43,24 @@ GARROW_AVAILABLE_IN_6_0 gpointer garrow_data_type_export(GArrowDataType *data_type, GError **error); +GARROW_AVAILABLE_IN_ALL gboolean garrow_data_type_equal(GArrowDataType *data_type, GArrowDataType *other_data_type); + +GARROW_AVAILABLE_IN_ALL gchar * garrow_data_type_to_string(GArrowDataType *data_type); + +GARROW_AVAILABLE_IN_ALL GArrowType garrow_data_type_get_id(GArrowDataType *data_type); + GARROW_AVAILABLE_IN_3_0 gchar * garrow_data_type_get_name(GArrowDataType *data_type); #define GARROW_TYPE_FIXED_WIDTH_DATA_TYPE (garrow_fixed_width_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowFixedWidthDataType, garrow_fixed_width_data_type, GARROW, @@ -63,6 +71,7 @@ struct _GArrowFixedWidthDataTypeClass GArrowDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL gint garrow_fixed_width_data_type_get_bit_width(GArrowFixedWidthDataType *data_type); /* TODO: @@ -71,6 +80,7 @@ GList *garrow_fixed_width_data_type_get_buffer_layout(GArrowFixedWidthDataType */ #define GARROW_TYPE_NULL_DATA_TYPE (garrow_null_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowNullDataType, garrow_null_data_type, GARROW, NULL_DATA_TYPE, GArrowDataType) struct _GArrowNullDataTypeClass @@ -78,10 +88,12 @@ struct _GArrowNullDataTypeClass GArrowDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowNullDataType * garrow_null_data_type_new(void); #define GARROW_TYPE_BOOLEAN_DATA_TYPE (garrow_boolean_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowBooleanDataType, garrow_boolean_data_type, GARROW, @@ -92,10 +104,12 @@ struct _GArrowBooleanDataTypeClass GArrowFixedWidthDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowBooleanDataType * 
garrow_boolean_data_type_new(void); #define GARROW_TYPE_NUMERIC_DATA_TYPE (garrow_numeric_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowNumericDataType, garrow_numeric_data_type, GARROW, @@ -107,6 +121,7 @@ struct _GArrowNumericDataTypeClass }; #define GARROW_TYPE_INTEGER_DATA_TYPE (garrow_integer_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowIntegerDataType, garrow_integer_data_type, GARROW, @@ -122,6 +137,7 @@ gboolean garrow_integer_data_type_is_signed(GArrowIntegerDataType *data_type); #define GARROW_TYPE_INT8_DATA_TYPE (garrow_int8_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowInt8DataType, garrow_int8_data_type, GARROW, @@ -132,10 +148,12 @@ struct _GArrowInt8DataTypeClass GArrowIntegerDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowInt8DataType * garrow_int8_data_type_new(void); #define GARROW_TYPE_UINT8_DATA_TYPE (garrow_uint8_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowUInt8DataType, garrow_uint8_data_type, GARROW, @@ -146,10 +164,12 @@ struct _GArrowUInt8DataTypeClass GArrowIntegerDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowUInt8DataType * garrow_uint8_data_type_new(void); #define GARROW_TYPE_INT16_DATA_TYPE (garrow_int16_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowInt16DataType, garrow_int16_data_type, GARROW, @@ -160,10 +180,12 @@ struct _GArrowInt16DataTypeClass GArrowIntegerDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowInt16DataType * garrow_int16_data_type_new(void); #define GARROW_TYPE_UINT16_DATA_TYPE (garrow_uint16_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowUInt16DataType, garrow_uint16_data_type, GARROW, @@ -174,10 +196,12 @@ struct _GArrowUInt16DataTypeClass GArrowIntegerDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowUInt16DataType * garrow_uint16_data_type_new(void); #define 
GARROW_TYPE_INT32_DATA_TYPE (garrow_int32_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowInt32DataType, garrow_int32_data_type, GARROW, @@ -188,10 +212,12 @@ struct _GArrowInt32DataTypeClass GArrowIntegerDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowInt32DataType * garrow_int32_data_type_new(void); #define GARROW_TYPE_UINT32_DATA_TYPE (garrow_uint32_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowUInt32DataType, garrow_uint32_data_type, GARROW, @@ -202,10 +228,12 @@ struct _GArrowUInt32DataTypeClass GArrowIntegerDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowUInt32DataType * garrow_uint32_data_type_new(void); #define GARROW_TYPE_INT64_DATA_TYPE (garrow_int64_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowInt64DataType, garrow_int64_data_type, GARROW, @@ -216,10 +244,12 @@ struct _GArrowInt64DataTypeClass GArrowIntegerDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowInt64DataType * garrow_int64_data_type_new(void); #define GARROW_TYPE_UINT64_DATA_TYPE (garrow_uint64_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowUInt64DataType, garrow_uint64_data_type, GARROW, @@ -230,10 +260,12 @@ struct _GArrowUInt64DataTypeClass GArrowIntegerDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowUInt64DataType * garrow_uint64_data_type_new(void); #define GARROW_TYPE_FLOATING_POINT_DATA_TYPE (garrow_floating_point_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowFloatingPointDataType, garrow_floating_point_data_type, GARROW, @@ -245,6 +277,7 @@ struct _GArrowFloatingPointDataTypeClass }; #define GARROW_TYPE_HALF_FLOAT_DATA_TYPE (garrow_half_float_data_type_get_type()) +GARROW_AVAILABLE_IN_11_0 G_DECLARE_DERIVABLE_TYPE(GArrowHalfFloatDataType, garrow_half_float_data_type, GARROW, @@ -260,6 +293,7 @@ GArrowHalfFloatDataType * garrow_half_float_data_type_new(void); #define 
GARROW_TYPE_FLOAT_DATA_TYPE (garrow_float_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowFloatDataType, garrow_float_data_type, GARROW, @@ -270,10 +304,12 @@ struct _GArrowFloatDataTypeClass GArrowFloatingPointDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowFloatDataType * garrow_float_data_type_new(void); #define GARROW_TYPE_DOUBLE_DATA_TYPE (garrow_double_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDoubleDataType, garrow_double_data_type, GARROW, @@ -284,10 +320,12 @@ struct _GArrowDoubleDataTypeClass GArrowFloatingPointDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDoubleDataType * garrow_double_data_type_new(void); #define GARROW_TYPE_BINARY_DATA_TYPE (garrow_binary_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowBinaryDataType, garrow_binary_data_type, GARROW, BINARY_DATA_TYPE, GArrowDataType) struct _GArrowBinaryDataTypeClass @@ -295,11 +333,13 @@ struct _GArrowBinaryDataTypeClass GArrowDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowBinaryDataType * garrow_binary_data_type_new(void); #define GARROW_TYPE_FIXED_SIZE_BINARY_DATA_TYPE \ (garrow_fixed_size_binary_data_type_get_type()) +GARROW_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GArrowFixedSizeBinaryDataType, garrow_fixed_size_binary_data_type, GARROW, @@ -319,6 +359,7 @@ garrow_fixed_size_binary_data_type_get_byte_width( GArrowFixedSizeBinaryDataType *data_type); #define GARROW_TYPE_LARGE_BINARY_DATA_TYPE (garrow_large_binary_data_type_get_type()) +GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE(GArrowLargeBinaryDataType, garrow_large_binary_data_type, GARROW, @@ -334,6 +375,7 @@ GArrowLargeBinaryDataType * garrow_large_binary_data_type_new(void); #define GARROW_TYPE_STRING_DATA_TYPE (garrow_string_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowStringDataType, garrow_string_data_type, GARROW, @@ -344,10 +386,12 @@ struct 
_GArrowStringDataTypeClass GArrowBinaryDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowStringDataType * garrow_string_data_type_new(void); #define GARROW_TYPE_LARGE_STRING_DATA_TYPE (garrow_large_string_data_type_get_type()) +GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE(GArrowLargeStringDataType, garrow_large_string_data_type, GARROW, @@ -363,6 +407,7 @@ GArrowLargeStringDataType * garrow_large_string_data_type_new(void); #define GARROW_TYPE_TEMPORAL_DATA_TYPE (garrow_temporal_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowTemporalDataType, garrow_temporal_data_type, GARROW, @@ -374,6 +419,7 @@ struct _GArrowTemporalDataTypeClass }; #define GARROW_TYPE_DATE32_DATA_TYPE (garrow_date32_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDate32DataType, garrow_date32_data_type, GARROW, @@ -384,10 +430,12 @@ struct _GArrowDate32DataTypeClass GArrowTemporalDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDate32DataType * garrow_date32_data_type_new(void); #define GARROW_TYPE_DATE64_DATA_TYPE (garrow_date64_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDate64DataType, garrow_date64_data_type, GARROW, @@ -398,10 +446,12 @@ struct _GArrowDate64DataTypeClass GArrowTemporalDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDate64DataType * garrow_date64_data_type_new(void); #define GARROW_TYPE_TIMESTAMP_DATA_TYPE (garrow_timestamp_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowTimestampDataType, garrow_timestamp_data_type, GARROW, @@ -412,12 +462,16 @@ struct _GArrowTimestampDataTypeClass GArrowTemporalDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowTimestampDataType * garrow_timestamp_data_type_new(GArrowTimeUnit unit, GTimeZone *time_zone); + +GARROW_AVAILABLE_IN_ALL GArrowTimeUnit garrow_timestamp_data_type_get_unit(GArrowTimestampDataType *data_type); #define GARROW_TYPE_TIME_DATA_TYPE 
(garrow_time_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowTimeDataType, garrow_time_data_type, GARROW, @@ -428,10 +482,12 @@ struct _GArrowTimeDataTypeClass GArrowTemporalDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowTimeUnit garrow_time_data_type_get_unit(GArrowTimeDataType *time_data_type); #define GARROW_TYPE_TIME32_DATA_TYPE (garrow_time32_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowTime32DataType, garrow_time32_data_type, GARROW, @@ -442,10 +498,12 @@ struct _GArrowTime32DataTypeClass GArrowTimeDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowTime32DataType * garrow_time32_data_type_new(GArrowTimeUnit unit, GError **error); #define GARROW_TYPE_TIME64_DATA_TYPE (garrow_time64_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowTime64DataType, garrow_time64_data_type, GARROW, @@ -456,10 +514,12 @@ struct _GArrowTime64DataTypeClass GArrowTimeDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowTime64DataType * garrow_time64_data_type_new(GArrowTimeUnit unit, GError **error); #define GARROW_TYPE_INTERVAL_DATA_TYPE (garrow_interval_data_type_get_type()) +GARROW_AVAILABLE_IN_7_0 G_DECLARE_DERIVABLE_TYPE(GArrowIntervalDataType, garrow_interval_data_type, GARROW, @@ -475,6 +535,7 @@ GArrowIntervalType garrow_interval_data_type_get_interval_type(GArrowIntervalDataType *type); #define GARROW_TYPE_MONTH_INTERVAL_DATA_TYPE (garrow_month_interval_data_type_get_type()) +GARROW_AVAILABLE_IN_7_0 G_DECLARE_DERIVABLE_TYPE(GArrowMonthIntervalDataType, garrow_month_interval_data_type, GARROW, @@ -491,6 +552,7 @@ garrow_month_interval_data_type_new(void); #define GARROW_TYPE_DAY_TIME_INTERVAL_DATA_TYPE \ (garrow_day_time_interval_data_type_get_type()) +GARROW_AVAILABLE_IN_7_0 G_DECLARE_DERIVABLE_TYPE(GArrowDayTimeIntervalDataType, garrow_day_time_interval_data_type, GARROW, @@ -507,6 +569,7 @@ garrow_day_time_interval_data_type_new(void); #define 
GARROW_TYPE_MONTH_DAY_NANO_INTERVAL_DATA_TYPE \ (garrow_month_day_nano_interval_data_type_get_type()) +GARROW_AVAILABLE_IN_7_0 G_DECLARE_DERIVABLE_TYPE(GArrowMonthDayNanoIntervalDataType, garrow_month_day_nano_interval_data_type, GARROW, @@ -522,6 +585,7 @@ GArrowMonthDayNanoIntervalDataType * garrow_month_day_nano_interval_data_type_new(void); #define GARROW_TYPE_DECIMAL_DATA_TYPE (garrow_decimal_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDecimalDataType, garrow_decimal_data_type, GARROW, @@ -532,14 +596,20 @@ struct _GArrowDecimalDataTypeClass GArrowFixedSizeBinaryDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDecimalDataType * garrow_decimal_data_type_new(gint32 precision, gint32 scale, GError **error); + +GARROW_AVAILABLE_IN_ALL gint32 garrow_decimal_data_type_get_precision(GArrowDecimalDataType *decimal_data_type); + +GARROW_AVAILABLE_IN_ALL gint32 garrow_decimal_data_type_get_scale(GArrowDecimalDataType *decimal_data_type); #define GARROW_TYPE_DECIMAL128_DATA_TYPE (garrow_decimal128_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDecimal128DataType, garrow_decimal128_data_type, GARROW, @@ -559,6 +629,7 @@ GArrowDecimal128DataType * garrow_decimal128_data_type_new(gint32 precision, gint32 scale, GError **error); #define GARROW_TYPE_DECIMAL256_DATA_TYPE (garrow_decimal256_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDecimal256DataType, garrow_decimal256_data_type, GARROW, @@ -578,6 +649,7 @@ GArrowDecimal256DataType * garrow_decimal256_data_type_new(gint32 precision, gint32 scale, GError **error); #define GARROW_TYPE_EXTENSION_DATA_TYPE (garrow_extension_data_type_get_type()) +GARROW_AVAILABLE_IN_3_0 G_DECLARE_DERIVABLE_TYPE(GArrowExtensionDataType, garrow_extension_data_type, GARROW, @@ -628,6 +700,7 @@ garrow_extension_data_type_wrap_chunked_array(GArrowExtensionDataType *data_type #define GARROW_TYPE_EXTENSION_DATA_TYPE_REGISTRY \ 
(garrow_extension_data_type_registry_get_type()) +GARROW_AVAILABLE_IN_3_0 G_DECLARE_DERIVABLE_TYPE(GArrowExtensionDataTypeRegistry, garrow_extension_data_type_registry, GARROW, diff --git a/c_glib/arrow-glib/buffer.h b/c_glib/arrow-glib/buffer.h index 8f93a5ef0ddb2..29308e935aba2 100644 --- a/c_glib/arrow-glib/buffer.h +++ b/c_glib/arrow-glib/buffer.h @@ -21,44 +21,70 @@ #include +#include + G_BEGIN_DECLS #define GARROW_TYPE_BUFFER (garrow_buffer_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowBuffer, garrow_buffer, GARROW, BUFFER, GObject) struct _GArrowBufferClass { GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowBuffer * garrow_buffer_new(const guint8 *data, gint64 size); + +GARROW_AVAILABLE_IN_ALL GArrowBuffer * garrow_buffer_new_bytes(GBytes *data); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_buffer_equal(GArrowBuffer *buffer, GArrowBuffer *other_buffer); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_buffer_equal_n_bytes(GArrowBuffer *buffer, GArrowBuffer *other_buffer, gint64 n_bytes); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_buffer_is_mutable(GArrowBuffer *buffer); + +GARROW_AVAILABLE_IN_ALL gint64 garrow_buffer_get_capacity(GArrowBuffer *buffer); + +GARROW_AVAILABLE_IN_ALL GBytes * garrow_buffer_get_data(GArrowBuffer *buffer); + +GARROW_AVAILABLE_IN_ALL GBytes * garrow_buffer_get_mutable_data(GArrowBuffer *buffer); + +GARROW_AVAILABLE_IN_ALL gint64 garrow_buffer_get_size(GArrowBuffer *buffer); + +GARROW_AVAILABLE_IN_ALL GArrowBuffer * garrow_buffer_get_parent(GArrowBuffer *buffer); +GARROW_AVAILABLE_IN_ALL GArrowBuffer * garrow_buffer_copy(GArrowBuffer *buffer, gint64 start, gint64 size, GError **error); + +GARROW_AVAILABLE_IN_ALL GArrowBuffer * garrow_buffer_slice(GArrowBuffer *buffer, gint64 offset, gint64 size); #define GARROW_TYPE_MUTABLE_BUFFER (garrow_mutable_buffer_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowMutableBuffer, garrow_mutable_buffer, GARROW, MUTABLE_BUFFER, GArrowBuffer) struct 
_GArrowMutableBufferClass @@ -66,12 +92,19 @@ struct _GArrowMutableBufferClass GArrowBufferClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowMutableBuffer * garrow_mutable_buffer_new(guint8 *data, gint64 size); + +GARROW_AVAILABLE_IN_ALL GArrowMutableBuffer * garrow_mutable_buffer_new_bytes(GBytes *data); + +GARROW_AVAILABLE_IN_ALL GArrowMutableBuffer * garrow_mutable_buffer_slice(GArrowMutableBuffer *buffer, gint64 offset, gint64 size); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_mutable_buffer_set_data(GArrowMutableBuffer *buffer, gint64 offset, @@ -80,6 +113,7 @@ garrow_mutable_buffer_set_data(GArrowMutableBuffer *buffer, GError **error); #define GARROW_TYPE_RESIZABLE_BUFFER (garrow_resizable_buffer_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowResizableBuffer, garrow_resizable_buffer, GARROW, @@ -90,12 +124,17 @@ struct _GArrowResizableBufferClass GArrowMutableBufferClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowResizableBuffer * garrow_resizable_buffer_new(gint64 initial_size, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_resizable_buffer_resize(GArrowResizableBuffer *buffer, gint64 new_size, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_resizable_buffer_reserve(GArrowResizableBuffer *buffer, gint64 new_capacity, diff --git a/c_glib/arrow-glib/buffer.hpp b/c_glib/arrow-glib/buffer.hpp index 5d922371c3b6b..7e4d7ecee1c1c 100644 --- a/c_glib/arrow-glib/buffer.hpp +++ b/c_glib/arrow-glib/buffer.hpp @@ -23,20 +23,32 @@ #include +GARROW_EXTERN GArrowBuffer * garrow_buffer_new_raw(std::shared_ptr *arrow_buffer); + +GARROW_EXTERN GArrowBuffer * garrow_buffer_new_raw_bytes(std::shared_ptr *arrow_buffer, GBytes *data); + +GARROW_EXTERN GArrowBuffer * garrow_buffer_new_raw_parent(std::shared_ptr *arrow_buffer, GArrowBuffer *parent); + +GARROW_EXTERN std::shared_ptr garrow_buffer_get_raw(GArrowBuffer *buffer); +GARROW_EXTERN GArrowMutableBuffer * garrow_mutable_buffer_new_raw(std::shared_ptr *arrow_buffer); + 
+GARROW_EXTERN GArrowMutableBuffer * garrow_mutable_buffer_new_raw_bytes(std::shared_ptr *arrow_buffer, GBytes *data); + +GARROW_EXTERN GArrowResizableBuffer * garrow_resizable_buffer_new_raw(std::shared_ptr *arrow_buffer); diff --git a/c_glib/arrow-glib/chunked-array-definition.h b/c_glib/arrow-glib/chunked-array-definition.h index b687735419eeb..744f1077ea754 100644 --- a/c_glib/arrow-glib/chunked-array-definition.h +++ b/c_glib/arrow-glib/chunked-array-definition.h @@ -24,6 +24,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_CHUNKED_ARRAY (garrow_chunked_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowChunkedArray, garrow_chunked_array, GARROW, CHUNKED_ARRAY, GObject) struct _GArrowChunkedArrayClass diff --git a/c_glib/arrow-glib/chunked-array.h b/c_glib/arrow-glib/chunked-array.h index 6ca497942ff2e..712d16504f624 100644 --- a/c_glib/arrow-glib/chunked-array.h +++ b/c_glib/arrow-glib/chunked-array.h @@ -24,42 +24,61 @@ G_BEGIN_DECLS +GARROW_AVAILABLE_IN_ALL GArrowChunkedArray * garrow_chunked_array_new(GList *chunks, GError **error); + GARROW_AVAILABLE_IN_11_0 GArrowChunkedArray * garrow_chunked_array_new_empty(GArrowDataType *data_type, GError **error); +GARROW_AVAILABLE_IN_ALL gboolean garrow_chunked_array_equal(GArrowChunkedArray *chunked_array, GArrowChunkedArray *other_chunked_array); +GARROW_AVAILABLE_IN_ALL GArrowDataType * garrow_chunked_array_get_value_data_type(GArrowChunkedArray *chunked_array); + +GARROW_AVAILABLE_IN_ALL GArrowType garrow_chunked_array_get_value_type(GArrowChunkedArray *chunked_array); GARROW_DEPRECATED_IN_0_15_FOR(garrow_chunked_array_get_n_rows) guint64 garrow_chunked_array_get_length(GArrowChunkedArray *chunked_array); + GARROW_AVAILABLE_IN_0_15 guint64 garrow_chunked_array_get_n_rows(GArrowChunkedArray *chunked_array); + +GARROW_AVAILABLE_IN_ALL guint64 garrow_chunked_array_get_n_nulls(GArrowChunkedArray *chunked_array); + +GARROW_AVAILABLE_IN_ALL guint garrow_chunked_array_get_n_chunks(GArrowChunkedArray 
*chunked_array); +GARROW_AVAILABLE_IN_ALL GArrowArray * garrow_chunked_array_get_chunk(GArrowChunkedArray *chunked_array, guint i); + +GARROW_AVAILABLE_IN_ALL GList * garrow_chunked_array_get_chunks(GArrowChunkedArray *chunked_array); + +GARROW_AVAILABLE_IN_ALL GArrowChunkedArray * garrow_chunked_array_slice(GArrowChunkedArray *chunked_array, guint64 offset, guint64 length); + +GARROW_AVAILABLE_IN_ALL gchar * garrow_chunked_array_to_string(GArrowChunkedArray *chunked_array, GError **error); + GARROW_AVAILABLE_IN_4_0 GArrowArray * garrow_chunked_array_combine(GArrowChunkedArray *chunked_array, GError **error); diff --git a/c_glib/arrow-glib/chunked-array.hpp b/c_glib/arrow-glib/chunked-array.hpp index 9ce6cc76adfbf..674ef9606b96e 100644 --- a/c_glib/arrow-glib/chunked-array.hpp +++ b/c_glib/arrow-glib/chunked-array.hpp @@ -23,10 +23,15 @@ #include +GARROW_EXTERN GArrowChunkedArray * garrow_chunked_array_new_raw(std::shared_ptr *arrow_chunked_array); + +GARROW_EXTERN GArrowChunkedArray * garrow_chunked_array_new_raw(std::shared_ptr *arrow_chunked_array, GArrowDataType *data_type); + +GARROW_EXTERN std::shared_ptr garrow_chunked_array_get_raw(GArrowChunkedArray *chunked_array); diff --git a/c_glib/arrow-glib/codec.h b/c_glib/arrow-glib/codec.h index 9b8611bb0a7ee..5865634a7d8e4 100644 --- a/c_glib/arrow-glib/codec.h +++ b/c_glib/arrow-glib/codec.h @@ -50,20 +50,25 @@ typedef enum { } GArrowCompressionType; #define GARROW_TYPE_CODEC (garrow_codec_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowCodec, garrow_codec, GARROW, CODEC, GObject) struct _GArrowCodecClass { GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowCodec * garrow_codec_new(GArrowCompressionType type, GError **error); +GARROW_AVAILABLE_IN_ALL const gchar * garrow_codec_get_name(GArrowCodec *codec); + GARROW_AVAILABLE_IN_2_0 GArrowCompressionType garrow_codec_get_compression_type(GArrowCodec *codec); + GARROW_AVAILABLE_IN_2_0 gint 
garrow_codec_get_compression_level(GArrowCodec *codec); diff --git a/c_glib/arrow-glib/codec.hpp b/c_glib/arrow-glib/codec.hpp index f4cfaba18a00e..baea842ddf6b5 100644 --- a/c_glib/arrow-glib/codec.hpp +++ b/c_glib/arrow-glib/codec.hpp @@ -23,12 +23,18 @@ #include +GARROW_EXTERN GArrowCompressionType garrow_compression_type_from_raw(arrow::Compression::type arrow_type); + +GARROW_EXTERN arrow::Compression::type garrow_compression_type_to_raw(GArrowCompressionType type); +GARROW_EXTERN GArrowCodec * garrow_codec_new_raw(std::shared_ptr *arrow_codec); + +GARROW_EXTERN std::shared_ptr garrow_codec_get_raw(GArrowCodec *codec); diff --git a/c_glib/arrow-glib/composite-array.h b/c_glib/arrow-glib/composite-array.h index c6e19f1c74e22..b8ba901363d0a 100644 --- a/c_glib/arrow-glib/composite-array.h +++ b/c_glib/arrow-glib/composite-array.h @@ -27,6 +27,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_LIST_ARRAY (garrow_list_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowListArray, garrow_list_array, GARROW, LIST_ARRAY, GArrowArray) struct _GArrowListArrayClass @@ -34,6 +35,7 @@ struct _GArrowListArrayClass GArrowArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowListArray * garrow_list_array_new(GArrowDataType *data_type, gint64 length, @@ -42,24 +44,32 @@ garrow_list_array_new(GArrowDataType *data_type, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL GArrowDataType * garrow_list_array_get_value_type(GArrowListArray *array); + +GARROW_AVAILABLE_IN_ALL GArrowArray * garrow_list_array_get_value(GArrowListArray *array, gint64 i); + GARROW_AVAILABLE_IN_2_0 GArrowArray * garrow_list_array_get_values(GArrowListArray *array); + GARROW_AVAILABLE_IN_2_0 gint32 garrow_list_array_get_value_offset(GArrowListArray *array, gint64 i); + GARROW_AVAILABLE_IN_2_0 gint32 garrow_list_array_get_value_length(GArrowListArray *array, gint64 i); + GARROW_AVAILABLE_IN_2_0 const gint32 * garrow_list_array_get_value_offsets(GArrowListArray *array, 
gint64 *n_offsets); #define GARROW_TYPE_LARGE_LIST_ARRAY (garrow_large_list_array_get_type()) +GARROW_AVAILABLE_IN_0_16 G_DECLARE_DERIVABLE_TYPE( GArrowLargeListArray, garrow_large_list_array, GARROW, LARGE_LIST_ARRAY, GArrowArray) struct _GArrowLargeListArrayClass @@ -79,23 +89,29 @@ garrow_large_list_array_new(GArrowDataType *data_type, GARROW_AVAILABLE_IN_0_16 GArrowDataType * garrow_large_list_array_get_value_type(GArrowLargeListArray *array); + GARROW_AVAILABLE_IN_0_16 GArrowArray * garrow_large_list_array_get_value(GArrowLargeListArray *array, gint64 i); + GARROW_AVAILABLE_IN_2_0 GArrowArray * garrow_large_list_array_get_values(GArrowLargeListArray *array); + GARROW_AVAILABLE_IN_2_0 gint64 garrow_large_list_array_get_value_offset(GArrowLargeListArray *array, gint64 i); + GARROW_AVAILABLE_IN_2_0 gint64 garrow_large_list_array_get_value_length(GArrowLargeListArray *array, gint64 i); + GARROW_AVAILABLE_IN_2_0 const gint64 * garrow_large_list_array_get_value_offsets(GArrowLargeListArray *array, gint64 *n_offsets); #define GARROW_TYPE_STRUCT_ARRAY (garrow_struct_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowStructArray, garrow_struct_array, GARROW, STRUCT_ARRAY, GArrowArray) struct _GArrowStructArrayClass @@ -103,6 +119,7 @@ struct _GArrowStructArrayClass GArrowArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowStructArray * garrow_struct_array_new(GArrowDataType *data_type, gint64 length, @@ -110,9 +127,11 @@ garrow_struct_array_new(GArrowDataType *data_type, GArrowBuffer *null_bitmap, gint64 n_nulls); +GARROW_AVAILABLE_IN_ALL GArrowArray * garrow_struct_array_get_field(GArrowStructArray *array, gint i); +GARROW_AVAILABLE_IN_ALL GList * garrow_struct_array_get_fields(GArrowStructArray *array); @@ -121,6 +140,7 @@ GList * garrow_struct_array_flatten(GArrowStructArray *array, GError **error); #define GARROW_TYPE_MAP_ARRAY (garrow_map_array_get_type()) +GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE( GArrowMapArray, 
garrow_map_array, GARROW, MAP_ARRAY, GArrowListArray) struct _GArrowMapArrayClass @@ -134,14 +154,17 @@ garrow_map_array_new(GArrowArray *offsets, GArrowArray *keys, GArrowArray *items, GError **error); + GARROW_AVAILABLE_IN_0_17 GArrowArray * garrow_map_array_get_keys(GArrowMapArray *array); + GARROW_AVAILABLE_IN_0_17 GArrowArray * garrow_map_array_get_items(GArrowMapArray *array); #define GARROW_TYPE_UNION_ARRAY (garrow_union_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowUnionArray, garrow_union_array, GARROW, UNION_ARRAY, GArrowArray) struct _GArrowUnionArrayClass @@ -152,13 +175,17 @@ struct _GArrowUnionArrayClass GARROW_AVAILABLE_IN_12_0 gint8 garrow_union_array_get_type_code(GArrowUnionArray *array, gint64 i); + GARROW_AVAILABLE_IN_12_0 gint garrow_union_array_get_child_id(GArrowUnionArray *array, gint64 i); + +GARROW_AVAILABLE_IN_ALL GArrowArray * garrow_union_array_get_field(GArrowUnionArray *array, gint i); #define GARROW_TYPE_SPARSE_UNION_ARRAY (garrow_sparse_union_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowSparseUnionArray, garrow_sparse_union_array, GARROW, @@ -169,8 +196,11 @@ struct _GArrowSparseUnionArrayClass GArrowUnionArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowSparseUnionArray * garrow_sparse_union_array_new(GArrowInt8Array *type_ids, GList *fields, GError **error); + +GARROW_AVAILABLE_IN_ALL GArrowSparseUnionArray * garrow_sparse_union_array_new_data_type(GArrowSparseUnionDataType *data_type, GArrowInt8Array *type_ids, @@ -178,6 +208,7 @@ garrow_sparse_union_array_new_data_type(GArrowSparseUnionDataType *data_type, GError **error); #define GARROW_TYPE_DENSE_UNION_ARRAY (garrow_dense_union_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDenseUnionArray, garrow_dense_union_array, GARROW, @@ -188,22 +219,27 @@ struct _GArrowDenseUnionArrayClass GArrowUnionArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDenseUnionArray * 
garrow_dense_union_array_new(GArrowInt8Array *type_ids, GArrowInt32Array *value_offsets, GList *fields, GError **error); + +GARROW_AVAILABLE_IN_ALL GArrowDenseUnionArray * garrow_dense_union_array_new_data_type(GArrowDenseUnionDataType *data_type, GArrowInt8Array *type_ids, GArrowInt32Array *value_offsets, GList *fields, GError **error); + GARROW_AVAILABLE_IN_12_0 gint32 garrow_dense_union_array_get_value_offset(GArrowDenseUnionArray *array, gint64 i); #define GARROW_TYPE_DICTIONARY_ARRAY (garrow_dictionary_array_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowDictionaryArray, garrow_dictionary_array, GARROW, DICTIONARY_ARRAY, GArrowArray) struct _GArrowDictionaryArrayClass @@ -211,22 +247,29 @@ struct _GArrowDictionaryArrayClass GArrowArrayClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDictionaryArray * garrow_dictionary_array_new(GArrowDataType *data_type, GArrowArray *indices, GArrowArray *dictionary, GError **error); +GARROW_AVAILABLE_IN_ALL GArrowArray * garrow_dictionary_array_get_indices(GArrowDictionaryArray *array); + +GARROW_AVAILABLE_IN_ALL GArrowArray * garrow_dictionary_array_get_dictionary(GArrowDictionaryArray *array); + #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_1_0_FOR(garrow_array_get_value_data_type) GArrowDictionaryDataType * garrow_dictionary_array_get_dictionary_data_type(GArrowDictionaryArray *array); #endif #define GARROW_TYPE_RUN_END_ENCODED_ARRAY (garrow_run_end_encoded_array_get_type()) +GARROW_AVAILABLE_IN_13_0 G_DECLARE_DERIVABLE_TYPE(GArrowRunEndEncodedArray, garrow_run_end_encoded_array, GARROW, @@ -248,9 +291,11 @@ garrow_run_end_encoded_array_new(GArrowDataType *data_type, GARROW_AVAILABLE_IN_13_0 GArrowArray * garrow_run_end_encoded_array_get_run_ends(GArrowRunEndEncodedArray *array); + GARROW_AVAILABLE_IN_13_0 GArrowArray * garrow_run_end_encoded_array_get_values(GArrowRunEndEncodedArray *array); + GARROW_AVAILABLE_IN_13_0 GArrowArray * 
garrow_run_end_encoded_array_get_logical_run_ends(GArrowRunEndEncodedArray *array, @@ -258,9 +303,11 @@ garrow_run_end_encoded_array_get_logical_run_ends(GArrowRunEndEncodedArray *arra GARROW_AVAILABLE_IN_13_0 GArrowArray * garrow_run_end_encoded_array_get_logical_values(GArrowRunEndEncodedArray *array); + GARROW_AVAILABLE_IN_13_0 gint64 garrow_run_end_encoded_array_find_physical_offset(GArrowRunEndEncodedArray *array); + GARROW_AVAILABLE_IN_13_0 gint64 garrow_run_end_encoded_array_find_physical_length(GArrowRunEndEncodedArray *array); diff --git a/c_glib/arrow-glib/composite-data-type.h b/c_glib/arrow-glib/composite-data-type.h index e71d277a305c6..7a0a462af00f9 100644 --- a/c_glib/arrow-glib/composite-data-type.h +++ b/c_glib/arrow-glib/composite-data-type.h @@ -27,6 +27,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_LIST_DATA_TYPE (garrow_list_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowListDataType, garrow_list_data_type, GARROW, LIST_DATA_TYPE, GArrowDataType) struct _GArrowListDataTypeClass @@ -34,18 +35,23 @@ struct _GArrowListDataTypeClass GArrowDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowListDataType * garrow_list_data_type_new(GArrowField *field); + #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_13_FOR(garrow_list_data_type_get_field) GArrowField * garrow_list_data_type_get_value_field(GArrowListDataType *list_data_type); #endif + GARROW_AVAILABLE_IN_0_13 GArrowField * garrow_list_data_type_get_field(GArrowListDataType *list_data_type); #define GARROW_TYPE_LARGE_LIST_DATA_TYPE (garrow_large_list_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowLargeListDataType, garrow_large_list_data_type, GARROW, @@ -59,11 +65,13 @@ struct _GArrowLargeListDataTypeClass GARROW_AVAILABLE_IN_0_16 GArrowLargeListDataType * garrow_large_list_data_type_new(GArrowField *field); + GARROW_AVAILABLE_IN_0_16 GArrowField * 
garrow_large_list_data_type_get_field(GArrowLargeListDataType *large_list_data_type); #define GARROW_TYPE_STRUCT_DATA_TYPE (garrow_struct_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowStructDataType, garrow_struct_data_type, GARROW, STRUCT_DATA_TYPE, GArrowDataType) struct _GArrowStructDataTypeClass @@ -71,22 +79,34 @@ struct _GArrowStructDataTypeClass GArrowDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowStructDataType * garrow_struct_data_type_new(GList *fields); + +GARROW_AVAILABLE_IN_ALL gint garrow_struct_data_type_get_n_fields(GArrowStructDataType *struct_data_type); + +GARROW_AVAILABLE_IN_ALL GList * garrow_struct_data_type_get_fields(GArrowStructDataType *struct_data_type); + +GARROW_AVAILABLE_IN_ALL GArrowField * garrow_struct_data_type_get_field(GArrowStructDataType *struct_data_type, gint i); + +GARROW_AVAILABLE_IN_ALL GArrowField * garrow_struct_data_type_get_field_by_name(GArrowStructDataType *struct_data_type, const gchar *name); + +GARROW_AVAILABLE_IN_ALL gint garrow_struct_data_type_get_field_index(GArrowStructDataType *struct_data_type, const gchar *name); #define GARROW_TYPE_MAP_DATA_TYPE (garrow_map_data_type_get_type()) +GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE( GArrowMapDataType, garrow_map_data_type, GARROW, MAP_DATA_TYPE, GArrowListDataType) struct _GArrowMapDataTypeClass @@ -105,6 +125,7 @@ GArrowDataType * garrow_map_data_type_get_item_type(GArrowMapDataType *map_data_type); #define GARROW_TYPE_UNION_DATA_TYPE (garrow_union_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowUnionDataType, garrow_union_data_type, GARROW, UNION_DATA_TYPE, GArrowDataType) struct _GArrowUnionDataTypeClass @@ -112,17 +133,25 @@ struct _GArrowUnionDataTypeClass GArrowDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL gint garrow_union_data_type_get_n_fields(GArrowUnionDataType *union_data_type); + +GARROW_AVAILABLE_IN_ALL GList * 
garrow_union_data_type_get_fields(GArrowUnionDataType *union_data_type); + +GARROW_AVAILABLE_IN_ALL GArrowField * garrow_union_data_type_get_field(GArrowUnionDataType *union_data_type, gint i); + +GARROW_AVAILABLE_IN_ALL gint8 * garrow_union_data_type_get_type_codes(GArrowUnionDataType *union_data_type, gsize *n_type_codes); #define GARROW_TYPE_SPARSE_UNION_DATA_TYPE (garrow_sparse_union_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowSparseUnionDataType, garrow_sparse_union_data_type, GARROW, @@ -133,10 +162,12 @@ struct _GArrowSparseUnionDataTypeClass GArrowUnionDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowSparseUnionDataType * garrow_sparse_union_data_type_new(GList *fields, gint8 *type_codes, gsize n_type_codes); #define GARROW_TYPE_DENSE_UNION_DATA_TYPE (garrow_dense_union_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDenseUnionDataType, garrow_dense_union_data_type, GARROW, @@ -147,10 +178,12 @@ struct _GArrowDenseUnionDataTypeClass GArrowUnionDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDenseUnionDataType * garrow_dense_union_data_type_new(GList *fields, gint8 *type_codes, gsize n_type_codes); #define GARROW_TYPE_DICTIONARY_DATA_TYPE (garrow_dictionary_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDictionaryDataType, garrow_dictionary_data_type, GARROW, @@ -161,22 +194,29 @@ struct _GArrowDictionaryDataTypeClass GArrowFixedWidthDataTypeClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDictionaryDataType * garrow_dictionary_data_type_new(GArrowDataType *index_data_type, GArrowDataType *value_data_type, gboolean ordered); + +GARROW_AVAILABLE_IN_ALL GArrowDataType * garrow_dictionary_data_type_get_index_data_type( GArrowDictionaryDataType *dictionary_data_type); + GARROW_AVAILABLE_IN_0_14 GArrowDataType * garrow_dictionary_data_type_get_value_data_type( GArrowDictionaryDataType *dictionary_data_type); + +GARROW_AVAILABLE_IN_ALL 
gboolean garrow_dictionary_data_type_is_ordered(GArrowDictionaryDataType *dictionary_data_type); #define GARROW_TYPE_RUN_END_ENCODED_DATA_TYPE \ (garrow_run_end_encoded_data_type_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowRunEndEncodedDataType, garrow_run_end_encoded_data_type, GARROW, @@ -195,6 +235,7 @@ GARROW_AVAILABLE_IN_13_0 GArrowDataType * garrow_run_end_encoded_data_type_get_run_end_data_type( GArrowRunEndEncodedDataType *data_type); + GARROW_AVAILABLE_IN_13_0 GArrowDataType * garrow_run_end_encoded_data_type_get_value_data_type( diff --git a/c_glib/arrow-glib/compute-definition.h b/c_glib/arrow-glib/compute-definition.h index b699e9e99a9fc..a060f16f62cf6 100644 --- a/c_glib/arrow-glib/compute-definition.h +++ b/c_glib/arrow-glib/compute-definition.h @@ -21,9 +21,12 @@ #include +#include + G_BEGIN_DECLS #define GARROW_TYPE_FUNCTION_OPTIONS (garrow_function_options_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowFunctionOptions, garrow_function_options, GARROW, FUNCTION_OPTIONS, GObject) struct _GArrowFunctionOptionsClass @@ -32,6 +35,7 @@ struct _GArrowFunctionOptionsClass }; #define GARROW_TYPE_CAST_OPTIONS (garrow_cast_options_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowCastOptions, garrow_cast_options, GARROW, CAST_OPTIONS, GArrowFunctionOptions) struct _GArrowCastOptionsClass @@ -40,6 +44,7 @@ struct _GArrowCastOptionsClass }; #define GARROW_TYPE_EXPRESSION (garrow_expression_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowExpression, garrow_expression, GARROW, EXPRESSION, GObject) struct _GArrowExpressionClass { diff --git a/c_glib/arrow-glib/compute.h b/c_glib/arrow-glib/compute.h index 029cab136ad8f..54b0ddb014fbb 100644 --- a/c_glib/arrow-glib/compute.h +++ b/c_glib/arrow-glib/compute.h @@ -26,6 +26,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_EXECUTE_CONTEXT (garrow_execute_context_get_type()) +GARROW_AVAILABLE_IN_1_0 G_DECLARE_DERIVABLE_TYPE( GArrowExecuteContext, 
garrow_execute_context, GARROW, EXECUTE_CONTEXT, GObject) struct _GArrowExecuteContextClass @@ -46,6 +47,7 @@ gchar * garrow_function_options_to_string(GArrowFunctionOptions *options); #define GARROW_TYPE_FUNCTION_DOC (garrow_function_doc_get_type()) +GARROW_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE( GArrowFunctionDoc, garrow_function_doc, GARROW, FUNCTION_DOC, GObject) struct _GArrowFunctionDocClass @@ -67,6 +69,7 @@ gchar * garrow_function_doc_get_options_class_name(GArrowFunctionDoc *doc); #define GARROW_TYPE_FUNCTION (garrow_function_get_type()) +GARROW_AVAILABLE_IN_1_0 G_DECLARE_DERIVABLE_TYPE(GArrowFunction, garrow_function, GARROW, FUNCTION, GObject) struct _GArrowFunctionClass { @@ -110,6 +113,7 @@ gchar * garrow_function_to_string(GArrowFunction *function); #define GARROW_TYPE_EXECUTE_NODE_OPTIONS (garrow_execute_node_options_get_type()) +GARROW_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE(GArrowExecuteNodeOptions, garrow_execute_node_options, GARROW, @@ -121,6 +125,7 @@ struct _GArrowExecuteNodeOptionsClass }; #define GARROW_TYPE_SOURCE_NODE_OPTIONS (garrow_source_node_options_get_type()) +GARROW_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE(GArrowSourceNodeOptions, garrow_source_node_options, GARROW, @@ -142,6 +147,7 @@ GArrowSourceNodeOptions * garrow_source_node_options_new_table(GArrowTable *table); #define GARROW_TYPE_FILTER_NODE_OPTIONS (garrow_filter_node_options_get_type()) +GARROW_AVAILABLE_IN_12_0 G_DECLARE_DERIVABLE_TYPE(GArrowFilterNodeOptions, garrow_filter_node_options, GARROW, @@ -157,6 +163,7 @@ GArrowFilterNodeOptions * garrow_filter_node_options_new(GArrowExpression *expression); #define GARROW_TYPE_PROJECT_NODE_OPTIONS (garrow_project_node_options_get_type()) +GARROW_AVAILABLE_IN_11_0 G_DECLARE_DERIVABLE_TYPE(GArrowProjectNodeOptions, garrow_project_node_options, GARROW, @@ -172,6 +179,7 @@ GArrowProjectNodeOptions * garrow_project_node_options_new(GList *expressions, gchar **names, gsize n_names); #define GARROW_TYPE_AGGREGATION 
(garrow_aggregation_get_type()) +GARROW_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE( GArrowAggregation, garrow_aggregation, GARROW, AGGREGATION, GObject) struct _GArrowAggregationClass @@ -187,6 +195,7 @@ garrow_aggregation_new(const gchar *function, const gchar *output); #define GARROW_TYPE_AGGREGATE_NODE_OPTIONS (garrow_aggregate_node_options_get_type()) +GARROW_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE(GArrowAggregateNodeOptions, garrow_aggregate_node_options, GARROW, @@ -205,6 +214,7 @@ garrow_aggregate_node_options_new(GList *aggregations, GError **error); #define GARROW_TYPE_SINK_NODE_OPTIONS (garrow_sink_node_options_get_type()) +GARROW_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE(GArrowSinkNodeOptions, garrow_sink_node_options, GARROW, @@ -249,6 +259,7 @@ typedef enum { } GArrowJoinType; #define GARROW_TYPE_HASH_JOIN_NODE_OPTIONS (garrow_hash_join_node_options_get_type()) +GARROW_AVAILABLE_IN_7_0 G_DECLARE_DERIVABLE_TYPE(GArrowHashJoinNodeOptions, garrow_hash_join_node_options, GARROW, @@ -281,6 +292,7 @@ garrow_hash_join_node_options_set_right_outputs(GArrowHashJoinNodeOptions *optio GError **error); #define GARROW_TYPE_EXECUTE_NODE (garrow_execute_node_get_type()) +GARROW_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE( GArrowExecuteNode, garrow_execute_node, GARROW, EXECUTE_NODE, GObject) struct _GArrowExecuteNodeClass @@ -296,6 +308,7 @@ GArrowSchema * garrow_execute_node_get_output_schema(GArrowExecuteNode *node); #define GARROW_TYPE_EXECUTE_PLAN (garrow_execute_plan_get_type()) +GARROW_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE( GArrowExecutePlan, garrow_execute_plan, GARROW, EXECUTE_PLAN, GObject) struct _GArrowExecutePlanClass @@ -365,10 +378,12 @@ GARROW_AVAILABLE_IN_6_0 gboolean garrow_execute_plan_wait(GArrowExecutePlan *plan, GError **error); +GARROW_AVAILABLE_IN_ALL GArrowCastOptions * garrow_cast_options_new(void); #define GARROW_TYPE_SCALAR_AGGREGATE_OPTIONS (garrow_scalar_aggregate_options_get_type()) +GARROW_AVAILABLE_IN_5_0 
G_DECLARE_DERIVABLE_TYPE(GArrowScalarAggregateOptions, garrow_scalar_aggregate_options, GARROW, @@ -401,6 +416,7 @@ typedef enum { } GArrowCountMode; #define GARROW_TYPE_COUNT_OPTIONS (garrow_count_options_get_type()) +GARROW_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE( GArrowCountOptions, garrow_count_options, GARROW, COUNT_OPTIONS, GArrowFunctionOptions) struct _GArrowCountOptionsClass @@ -428,6 +444,7 @@ typedef enum { } GArrowFilterNullSelectionBehavior; #define GARROW_TYPE_FILTER_OPTIONS (garrow_filter_options_get_type()) +GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE(GArrowFilterOptions, garrow_filter_options, GARROW, @@ -443,6 +460,7 @@ GArrowFilterOptions * garrow_filter_options_new(void); #define GARROW_TYPE_TAKE_OPTIONS (garrow_take_options_get_type()) +GARROW_AVAILABLE_IN_0_14 G_DECLARE_DERIVABLE_TYPE( GArrowTakeOptions, garrow_take_options, GARROW, TAKE_OPTIONS, GArrowFunctionOptions) struct _GArrowTakeOptionsClass @@ -487,6 +505,7 @@ typedef enum /**/ { } GArrowNullPlacement; #define GARROW_TYPE_ARRAY_SORT_OPTIONS (garrow_array_sort_options_get_type()) +GARROW_AVAILABLE_IN_3_0 G_DECLARE_DERIVABLE_TYPE(GArrowArraySortOptions, garrow_array_sort_options, GARROW, @@ -506,6 +525,7 @@ garrow_array_sort_options_equal(GArrowArraySortOptions *options, GArrowArraySortOptions *other_options); #define GARROW_TYPE_SORT_KEY (garrow_sort_key_get_type()) +GARROW_AVAILABLE_IN_3_0 G_DECLARE_DERIVABLE_TYPE(GArrowSortKey, garrow_sort_key, GARROW, SORT_KEY, GObject) struct _GArrowSortKeyClass { @@ -521,6 +541,7 @@ gboolean garrow_sort_key_equal(GArrowSortKey *sort_key, GArrowSortKey *other_sort_key); #define GARROW_TYPE_SORT_OPTIONS (garrow_sort_options_get_type()) +GARROW_AVAILABLE_IN_3_0 G_DECLARE_DERIVABLE_TYPE( GArrowSortOptions, garrow_sort_options, GARROW, SORT_OPTIONS, GArrowFunctionOptions) struct _GArrowSortOptionsClass @@ -545,6 +566,7 @@ void garrow_sort_options_add_sort_key(GArrowSortOptions *options, GArrowSortKey *sort_key); #define 
GARROW_TYPE_SET_LOOKUP_OPTIONS (garrow_set_lookup_options_get_type()) +GARROW_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE(GArrowSetLookupOptions, garrow_set_lookup_options, GARROW, @@ -560,6 +582,7 @@ GArrowSetLookupOptions * garrow_set_lookup_options_new(GArrowDatum *value_set); #define GARROW_TYPE_VARIANCE_OPTIONS (garrow_variance_options_get_type()) +GARROW_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE(GArrowVarianceOptions, garrow_variance_options, GARROW, @@ -620,6 +643,7 @@ typedef enum { } GArrowRoundMode; #define GARROW_TYPE_ROUND_OPTIONS (garrow_round_options_get_type()) +GARROW_AVAILABLE_IN_7_0 G_DECLARE_DERIVABLE_TYPE( GArrowRoundOptions, garrow_round_options, GARROW, ROUND_OPTIONS, GArrowFunctionOptions) struct _GArrowRoundOptionsClass @@ -633,6 +657,7 @@ garrow_round_options_new(void); #define GARROW_TYPE_ROUND_TO_MULTIPLE_OPTIONS \ (garrow_round_to_multiple_options_get_type()) +GARROW_AVAILABLE_IN_7_0 G_DECLARE_DERIVABLE_TYPE(GArrowRoundToMultipleOptions, garrow_round_to_multiple_options, GARROW, @@ -648,6 +673,7 @@ GArrowRoundToMultipleOptions * garrow_round_to_multiple_options_new(void); #define GARROW_TYPE_MATCH_SUBSTRING_OPTIONS (garrow_match_substring_options_get_type()) +GARROW_AVAILABLE_IN_12_0 G_DECLARE_DERIVABLE_TYPE(GArrowMatchSubstringOptions, garrow_match_substring_options, GARROW, @@ -683,6 +709,7 @@ typedef enum /*< underscore_name=garrow_utf8_normalize_form >*/ { } GArrowUTF8NormalizeForm; #define GARROW_TYPE_UTF8_NORMALIZE_OPTIONS (garrow_utf8_normalize_options_get_type()) +GARROW_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE(GArrowUTF8NormalizeOptions, garrow_utf8_normalize_options, GARROW, @@ -719,6 +746,7 @@ typedef enum { } GArrowQuantileInterpolation; #define GARROW_TYPE_QUANTILE_OPTIONS (garrow_quantile_options_get_type()) +GARROW_AVAILABLE_IN_9_0 G_DECLARE_DERIVABLE_TYPE(GArrowQuantileOptions, garrow_quantile_options, GARROW, @@ -745,6 +773,7 @@ garrow_quantile_options_set_qs(GArrowQuantileOptions *options, gsize n); #define 
GARROW_TYPE_INDEX_OPTIONS (garrow_index_options_get_type()) +GARROW_AVAILABLE_IN_12_0 G_DECLARE_DERIVABLE_TYPE( GArrowIndexOptions, garrow_index_options, GARROW, INDEX_OPTIONS, GArrowFunctionOptions) struct _GArrowIndexOptionsClass @@ -782,6 +811,7 @@ typedef enum { } GArrowRankTiebreaker; #define GARROW_TYPE_RANK_OPTIONS (garrow_rank_options_get_type()) +GARROW_AVAILABLE_IN_12_0 G_DECLARE_DERIVABLE_TYPE( GArrowRankOptions, garrow_rank_options, GARROW, RANK_OPTIONS, GArrowFunctionOptions) struct _GArrowRankOptionsClass @@ -805,18 +835,25 @@ GARROW_AVAILABLE_IN_12_0 void garrow_rank_options_add_sort_key(GArrowRankOptions *options, GArrowSortKey *sort_key); +GARROW_AVAILABLE_IN_ALL GArrowArray * garrow_array_cast(GArrowArray *array, GArrowDataType *target_data_type, GArrowCastOptions *options, GError **error); + +GARROW_AVAILABLE_IN_ALL GArrowArray * garrow_array_unique(GArrowArray *array, GError **error); + +GARROW_AVAILABLE_IN_ALL GArrowDictionaryArray * garrow_array_dictionary_encode(GArrowArray *array, GError **error); + GARROW_AVAILABLE_IN_0_13 gint64 garrow_array_count(GArrowArray *array, GArrowCountOptions *options, GError **error); + GARROW_AVAILABLE_IN_0_13 GArrowStructArray * garrow_array_count_values(GArrowArray *array, GError **error); @@ -987,6 +1024,7 @@ garrow_record_batch_filter(GArrowRecordBatch *record_batch, GError **error); #define GARROW_TYPE_RUN_END_ENCODE_OPTIONS (garrow_run_end_encode_options_get_type()) +GARROW_AVAILABLE_IN_13_0 G_DECLARE_DERIVABLE_TYPE(GArrowRunEndEncodeOptions, garrow_run_end_encode_options, GARROW, @@ -1011,6 +1049,7 @@ GArrowArray * garrow_run_end_encoded_array_decode(GArrowRunEndEncodedArray *array, GError **error); #define GARROW_TYPE_STRPTIME_OPTIONS (garrow_strptime_options_get_type()) +GARROW_AVAILABLE_IN_16_0 G_DECLARE_DERIVABLE_TYPE(GArrowStrptimeOptions, garrow_strptime_options, GARROW, @@ -1026,6 +1065,7 @@ GArrowStrptimeOptions * garrow_strptime_options_new(void); #define GARROW_TYPE_STRFTIME_OPTIONS 
(garrow_strftime_options_get_type()) +GARROW_AVAILABLE_IN_16_0 G_DECLARE_DERIVABLE_TYPE(GArrowStrftimeOptions, garrow_strftime_options, GARROW, @@ -1041,6 +1081,7 @@ GArrowStrftimeOptions * garrow_strftime_options_new(void); #define GARROW_TYPE_SPLIT_PATTERN_OPTIONS (garrow_split_pattern_options_get_type()) +GARROW_AVAILABLE_IN_16_0 G_DECLARE_DERIVABLE_TYPE(GArrowSplitPatternOptions, garrow_split_pattern_options, GARROW, @@ -1056,6 +1097,7 @@ GArrowSplitPatternOptions * garrow_split_pattern_options_new(void); #define GARROW_TYPE_STRUCT_FIELD_OPTIONS (garrow_struct_field_options_get_type()) +GARROW_AVAILABLE_IN_16_0 G_DECLARE_DERIVABLE_TYPE(GArrowStructFieldOptions, garrow_struct_field_options, GARROW, diff --git a/c_glib/arrow-glib/datum.h b/c_glib/arrow-glib/datum.h index df5e9a1c2cf4f..fc9a2fe7ab907 100644 --- a/c_glib/arrow-glib/datum.h +++ b/c_glib/arrow-glib/datum.h @@ -28,6 +28,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_DATUM (garrow_datum_get_type()) +GARROW_AVAILABLE_IN_1_0 G_DECLARE_DERIVABLE_TYPE(GArrowDatum, garrow_datum, GARROW, DATUM, GObject) struct _GArrowDatumClass { @@ -60,6 +61,7 @@ garrow_datum_to_string(GArrowDatum *datum); /* GARROW_TYPE_NONE_DATUM */ #define GARROW_TYPE_SCALAR_DATUM (garrow_scalar_datum_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowScalarDatum, garrow_scalar_datum, GARROW, SCALAR_DATUM, GArrowDatum) struct _GArrowScalarDatumClass @@ -72,6 +74,7 @@ GArrowScalarDatum * garrow_scalar_datum_new(GArrowScalar *value); #define GARROW_TYPE_ARRAY_DATUM (garrow_array_datum_get_type()) +GARROW_AVAILABLE_IN_1_0 G_DECLARE_DERIVABLE_TYPE( GArrowArrayDatum, garrow_array_datum, GARROW, ARRAY_DATUM, GArrowDatum) struct _GArrowArrayDatumClass @@ -84,6 +87,7 @@ GArrowArrayDatum * garrow_array_datum_new(GArrowArray *value); #define GARROW_TYPE_CHUNKED_ARRAY_DATUM (garrow_chunked_array_datum_get_type()) +GARROW_AVAILABLE_IN_1_0 G_DECLARE_DERIVABLE_TYPE(GArrowChunkedArrayDatum, garrow_chunked_array_datum, GARROW, @@ -99,6 +103,7 
@@ GArrowChunkedArrayDatum * garrow_chunked_array_datum_new(GArrowChunkedArray *value); #define GARROW_TYPE_RECORD_BATCH_DATUM (garrow_record_batch_datum_get_type()) +GARROW_AVAILABLE_IN_1_0 G_DECLARE_DERIVABLE_TYPE(GArrowRecordBatchDatum, garrow_record_batch_datum, GARROW, @@ -114,6 +119,7 @@ GArrowRecordBatchDatum * garrow_record_batch_datum_new(GArrowRecordBatch *value); #define GARROW_TYPE_TABLE_DATUM (garrow_table_datum_get_type()) +GARROW_AVAILABLE_IN_1_0 G_DECLARE_DERIVABLE_TYPE( GArrowTableDatum, garrow_table_datum, GARROW, TABLE_DATUM, GArrowDatum) struct _GArrowTableDatumClass diff --git a/c_glib/arrow-glib/decimal.h b/c_glib/arrow-glib/decimal.h index b967fa36d5611..f64afa800a19b 100644 --- a/c_glib/arrow-glib/decimal.h +++ b/c_glib/arrow-glib/decimal.h @@ -27,6 +27,7 @@ G_BEGIN_DECLS /* Disabled because it conflicts with GARROW_TYPE_DECIMAL128 in GArrowType. */ /* #define GARROW_TYPE_DECIMAL128 (garrow_decimal128_get_type()) */ +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowDecimal128, garrow_decimal128, GARROW, DECIMAL128, GObject) struct _GArrowDecimal128Class @@ -34,8 +35,10 @@ struct _GArrowDecimal128Class GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowDecimal128 * garrow_decimal128_new_string(const gchar *data, GError **error); +GARROW_AVAILABLE_IN_ALL GArrowDecimal128 * garrow_decimal128_new_integer(const gint64 data); GARROW_AVAILABLE_IN_3_0 @@ -62,25 +65,34 @@ GARROW_AVAILABLE_IN_0_12 gboolean garrow_decimal128_greater_than_or_equal(GArrowDecimal128 *decimal, GArrowDecimal128 *other_decimal); +GARROW_AVAILABLE_IN_ALL gchar * garrow_decimal128_to_string_scale(GArrowDecimal128 *decimal, gint32 scale); +GARROW_AVAILABLE_IN_ALL gchar * garrow_decimal128_to_string(GArrowDecimal128 *decimal); GARROW_AVAILABLE_IN_3_0 GBytes * garrow_decimal128_to_bytes(GArrowDecimal128 *decimal); +GARROW_AVAILABLE_IN_ALL void garrow_decimal128_abs(GArrowDecimal128 *decimal); +GARROW_AVAILABLE_IN_ALL void garrow_decimal128_negate(GArrowDecimal128 
*decimal); +GARROW_AVAILABLE_IN_ALL gint64 garrow_decimal128_to_integer(GArrowDecimal128 *decimal); +GARROW_AVAILABLE_IN_ALL GArrowDecimal128 * garrow_decimal128_plus(GArrowDecimal128 *left, GArrowDecimal128 *right); +GARROW_AVAILABLE_IN_ALL GArrowDecimal128 * garrow_decimal128_minus(GArrowDecimal128 *left, GArrowDecimal128 *right); +GARROW_AVAILABLE_IN_ALL GArrowDecimal128 * garrow_decimal128_multiply(GArrowDecimal128 *left, GArrowDecimal128 *right); +GARROW_AVAILABLE_IN_ALL GArrowDecimal128 * garrow_decimal128_divide(GArrowDecimal128 *left, GArrowDecimal128 *right, @@ -95,6 +107,7 @@ garrow_decimal128_rescale(GArrowDecimal128 *decimal, /* Disabled because it conflicts with GARROW_TYPE_DECIMAL256 in GArrowType. */ /* #define GARROW_TYPE_DECIMAL256 (garrow_decimal256_get_type()) */ +GARROW_AVAILABLE_IN_3_0 G_DECLARE_DERIVABLE_TYPE(GArrowDecimal256, garrow_decimal256, GARROW, DECIMAL256, GObject) struct _GArrowDecimal256Class diff --git a/c_glib/arrow-glib/enums.h.template b/c_glib/arrow-glib/enums.h.template index b7d3c99c0bef8..e49b717fb30db 100644 --- a/c_glib/arrow-glib/enums.h.template +++ b/c_glib/arrow-glib/enums.h.template @@ -22,6 +22,8 @@ #include +#include + G_BEGIN_DECLS /*** END file-header ***/ @@ -31,6 +33,7 @@ G_BEGIN_DECLS /*** END file-production ***/ /*** BEGIN value-header ***/ +GARROW_AVAILABLE_IN_ALL GType @enum_name@_get_type(void) G_GNUC_CONST; #define @ENUMPREFIX@_TYPE_@ENUMSHORT@ (@enum_name@_get_type()) /*** END value-header ***/ diff --git a/c_glib/arrow-glib/error.h b/c_glib/arrow-glib/error.h index 4414417a1a25b..e0c6a591a021b 100644 --- a/c_glib/arrow-glib/error.h +++ b/c_glib/arrow-glib/error.h @@ -21,6 +21,8 @@ #include +#include + G_BEGIN_DECLS /** @@ -66,6 +68,7 @@ typedef enum { #define GARROW_ERROR garrow_error_quark() +GARROW_AVAILABLE_IN_ALL GQuark garrow_error_quark(void); diff --git a/c_glib/arrow-glib/error.hpp b/c_glib/arrow-glib/error.hpp index 90a0f3161878e..c2c9b3c63028a 100644 --- a/c_glib/arrow-glib/error.hpp +++ 
b/c_glib/arrow-glib/error.hpp @@ -23,18 +23,26 @@ #include +GARROW_EXTERN gboolean garrow_error_check(GError **error, const arrow::Status &status, const char *context); + +GARROW_EXTERN GArrowError garrow_error_from_status(const arrow::Status &status); + +GARROW_EXTERN arrow::StatusCode garrow_error_to_status_code(GError *error, arrow::StatusCode default_code); + +GARROW_EXTERN arrow::Status garrow_error_to_status(GError *error, arrow::StatusCode default_code, const char *context); namespace garrow { + GARROW_EXTERN gboolean check(GError **error, const arrow::Status &status, const char *context); diff --git a/c_glib/arrow-glib/expression.h b/c_glib/arrow-glib/expression.h index 3141ed4df18b7..5a6bfb456fc64 100644 --- a/c_glib/arrow-glib/expression.h +++ b/c_glib/arrow-glib/expression.h @@ -31,6 +31,7 @@ gboolean garrow_expression_equal(GArrowExpression *expression, GArrowExpression *other_expression); #define GARROW_TYPE_LITERAL_EXPRESSION (garrow_literal_expression_get_type()) +GARROW_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE(GArrowLiteralExpression, garrow_literal_expression, GARROW, @@ -46,6 +47,7 @@ GArrowLiteralExpression * garrow_literal_expression_new(GArrowDatum *datum); #define GARROW_TYPE_FIELD_EXPRESSION (garrow_field_expression_get_type()) +GARROW_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE(GArrowFieldExpression, garrow_field_expression, GARROW, @@ -61,6 +63,7 @@ GArrowFieldExpression * garrow_field_expression_new(const gchar *reference, GError **error); #define GARROW_TYPE_CALL_EXPRESSION (garrow_call_expression_get_type()) +GARROW_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE( GArrowCallExpression, garrow_call_expression, GARROW, CALL_EXPRESSION, GArrowExpression) struct _GArrowCallExpressionClass diff --git a/c_glib/arrow-glib/expression.hpp b/c_glib/arrow-glib/expression.hpp index 60d5c9fe2f1bd..cc96badbe67aa 100644 --- a/c_glib/arrow-glib/expression.hpp +++ b/c_glib/arrow-glib/expression.hpp @@ -23,7 +23,10 @@ #include +GARROW_EXTERN GArrowExpression * 
garrow_expression_new_raw(const arrow::compute::Expression &arrow_expression); + +GARROW_EXTERN arrow::compute::Expression * garrow_expression_get_raw(GArrowExpression *expression); diff --git a/c_glib/arrow-glib/field.h b/c_glib/arrow-glib/field.h index 8de63757878c9..4be13f6135975 100644 --- a/c_glib/arrow-glib/field.h +++ b/c_glib/arrow-glib/field.h @@ -24,6 +24,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_FIELD (garrow_field_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowField, garrow_field, GARROW, FIELD, GObject) struct _GArrowFieldClass { @@ -34,8 +35,10 @@ GARROW_AVAILABLE_IN_6_0 GArrowField * garrow_field_import(gpointer c_abi_schema, GError **error); +GARROW_AVAILABLE_IN_ALL GArrowField * garrow_field_new(const gchar *name, GArrowDataType *data_type); +GARROW_AVAILABLE_IN_ALL GArrowField * garrow_field_new_full(const gchar *name, GArrowDataType *data_type, gboolean nullable); @@ -43,18 +46,26 @@ GARROW_AVAILABLE_IN_6_0 gpointer garrow_field_export(GArrowField *field, GError **error); +GARROW_AVAILABLE_IN_ALL const gchar * garrow_field_get_name(GArrowField *field); + +GARROW_AVAILABLE_IN_ALL GArrowDataType * garrow_field_get_data_type(GArrowField *field); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_field_is_nullable(GArrowField *field); +GARROW_AVAILABLE_IN_ALL gboolean garrow_field_equal(GArrowField *field, GArrowField *other_field); +GARROW_AVAILABLE_IN_ALL gchar * garrow_field_to_string(GArrowField *field); + GARROW_AVAILABLE_IN_3_0 gchar * garrow_field_to_string_metadata(GArrowField *field, gboolean show_metadata); diff --git a/c_glib/arrow-glib/file-system.h b/c_glib/arrow-glib/file-system.h index d3d5fde73fe23..2e500672e145c 100644 --- a/c_glib/arrow-glib/file-system.h +++ b/c_glib/arrow-glib/file-system.h @@ -53,6 +53,7 @@ typedef enum { /* arrow::fs::FileInfo */ #define GARROW_TYPE_FILE_INFO (garrow_file_info_get_type()) +GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE(GArrowFileInfo, garrow_file_info, GARROW, FILE_INFO, GObject) 
struct _GArrowFileInfoClass { @@ -80,6 +81,7 @@ garrow_file_info_to_string(GArrowFileInfo *file_info); /* arrow::fs::FileSelector */ #define GARROW_TYPE_FILE_SELECTOR (garrow_file_selector_get_type()) +GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE( GArrowFileSelector, garrow_file_selector, GARROW, FILE_SELECTOR, GObject) struct _GArrowFileSelectorClass @@ -90,6 +92,7 @@ struct _GArrowFileSelectorClass /* arrow::fs::FileSystem */ #define GARROW_TYPE_FILE_SYSTEM (garrow_file_system_get_type()) +GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE( GArrowFileSystem, garrow_file_system, GARROW, FILE_SYSTEM, GObject) struct _GArrowFileSystemClass @@ -197,6 +200,7 @@ garrow_file_system_open_append_stream(GArrowFileSystem *file_system, /* arrow::fs::SubTreeFileSystem */ #define GARROW_TYPE_SUB_TREE_FILE_SYSTEM (garrow_sub_tree_file_system_get_type()) +GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE(GArrowSubTreeFileSystem, garrow_sub_tree_file_system, GARROW, @@ -215,6 +219,7 @@ garrow_sub_tree_file_system_new(const gchar *base_path, /* arrow::fs::SlowFileSystem */ #define GARROW_TYPE_SLOW_FILE_SYSTEM (garrow_slow_file_system_get_type()) +GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE(GArrowSlowFileSystem, garrow_slow_file_system, GARROW, @@ -244,6 +249,7 @@ garrow_slow_file_system_new_average_latency_and_seed(GArrowFileSystem *base_file gint32 seed); #define GARROW_TYPE_MOCK_FILE_SYSTEM (garrow_mock_file_system_get_type()) +GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE(GArrowMockFileSystem, garrow_mock_file_system, GARROW, @@ -255,6 +261,7 @@ struct _GArrowMockFileSystemClass }; #define GARROW_TYPE_HDFS_FILE_SYSTEM (garrow_hdfs_file_system_get_type()) +GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE(GArrowHDFSFileSystem, garrow_hdfs_file_system, GARROW, @@ -290,6 +297,7 @@ typedef enum { } GArrowS3LogLevel; #define GARROW_TYPE_S3_GLOBAL_OPTIONS (garrow_s3_global_options_get_type()) +GARROW_AVAILABLE_IN_7_0 G_DECLARE_DERIVABLE_TYPE( GArrowS3GlobalOptions, 
garrow_s3_global_options, GARROW, S3_GLOBAL_OPTIONS, GObject) struct _GArrowS3GlobalOptionsClass @@ -312,6 +320,7 @@ gboolean garrow_s3_finalize(GError **error); #define GARROW_TYPE_S3_FILE_SYSTEM (garrow_s3_file_system_get_type()) +GARROW_AVAILABLE_IN_7_0 G_DECLARE_DERIVABLE_TYPE( GArrowS3FileSystem, garrow_s3_file_system, GARROW, S3_FILE_SYSTEM, GArrowFileSystem) struct _GArrowS3FileSystemClass @@ -320,6 +329,7 @@ struct _GArrowS3FileSystemClass }; #define GARROW_TYPE_GCS_FILE_SYSTEM (garrow_gcs_file_system_get_type()) +GARROW_AVAILABLE_IN_7_0 G_DECLARE_DERIVABLE_TYPE( GArrowGCSFileSystem, garrow_gcs_file_system, GARROW, GCS_FILE_SYSTEM, GArrowFileSystem) struct _GArrowGCSFileSystemClass diff --git a/c_glib/arrow-glib/file-system.hpp b/c_glib/arrow-glib/file-system.hpp index f41fc6e9c75b0..c535958301c5c 100644 --- a/c_glib/arrow-glib/file-system.hpp +++ b/c_glib/arrow-glib/file-system.hpp @@ -23,28 +23,35 @@ #include +GARROW_EXTERN GArrowFileInfo * garrow_file_info_new_raw(const arrow::fs::FileInfo &arrow_file_info); +GARROW_EXTERN arrow::fs::FileInfo * garrow_file_info_get_raw(GArrowFileInfo *file_info); +GARROW_EXTERN GArrowFileSystem * garrow_file_system_new_raw(std::shared_ptr *arrow_file_system); +GARROW_EXTERN std::shared_ptr garrow_file_system_get_raw(GArrowFileSystem *file_system); +GARROW_EXTERN GArrowSubTreeFileSystem * garrow_sub_tree_file_system_new_raw( std::shared_ptr *arrow_file_system, GArrowFileSystem *base_file_system); +GARROW_EXTERN GArrowSlowFileSystem * garrow_slow_file_system_new_raw(std::shared_ptr *arrow_file_system, GArrowFileSystem *base_file_system); #ifdef ARROW_S3 +GARROW_EXTERN arrow::fs::S3GlobalOptions * garrow_s3_global_options_get_raw(GArrowS3GlobalOptions *options); #endif diff --git a/c_glib/arrow-glib/file.h b/c_glib/arrow-glib/file.h index 42afed139463c..799dd83b9c243 100644 --- a/c_glib/arrow-glib/file.h +++ b/c_glib/arrow-glib/file.h @@ -27,15 +27,22 @@ G_BEGIN_DECLS #define GARROW_TYPE_FILE (garrow_file_get_type()) 
+GARROW_AVAILABLE_IN_ALL G_DECLARE_INTERFACE(GArrowFile, garrow_file, GARROW, FILE, GObject) +GARROW_AVAILABLE_IN_ALL gboolean garrow_file_close(GArrowFile *file, GError **error); + GARROW_AVAILABLE_IN_0_13 gboolean garrow_file_is_closed(GArrowFile *file); + +GARROW_AVAILABLE_IN_ALL gint64 garrow_file_tell(GArrowFile *file, GError **error); + +GARROW_AVAILABLE_IN_ALL GArrowFileMode garrow_file_get_mode(GArrowFile *file); diff --git a/c_glib/arrow-glib/input-stream.cpp b/c_glib/arrow-glib/input-stream.cpp index 03a3f03fff7ce..52c79993e4ca8 100644 --- a/c_glib/arrow-glib/input-stream.cpp +++ b/c_glib/arrow-glib/input-stream.cpp @@ -35,6 +35,22 @@ #include #include +static std::shared_ptr +garrow_input_stream_get_raw_file_interface(GArrowFile *file) +{ + auto input_stream = GARROW_INPUT_STREAM(file); + auto arrow_input_stream = garrow_input_stream_get_raw(input_stream); + return arrow_input_stream; +} + +static std::shared_ptr +garrow_input_stream_get_raw_readable_interface(GArrowReadable *readable) +{ + auto input_stream = GARROW_INPUT_STREAM(readable); + auto arrow_input_stream = garrow_input_stream_get_raw(input_stream); + return arrow_input_stream; +} + G_BEGIN_DECLS /** @@ -71,28 +87,12 @@ enum { PROP_INPUT_STREAM = 1 }; -static std::shared_ptr -garrow_input_stream_get_raw_file_interface(GArrowFile *file) -{ - auto input_stream = GARROW_INPUT_STREAM(file); - auto arrow_input_stream = garrow_input_stream_get_raw(input_stream); - return arrow_input_stream; -} - static void garrow_input_stream_file_interface_init(GArrowFileInterface *iface) { iface->get_raw = garrow_input_stream_get_raw_file_interface; } -static std::shared_ptr -garrow_input_stream_get_raw_readable_interface(GArrowReadable *readable) -{ - auto input_stream = GARROW_INPUT_STREAM(readable); - auto arrow_input_stream = garrow_input_stream_get_raw(input_stream); - return arrow_input_stream; -} - static void garrow_input_stream_readable_interface_init(GArrowReadableInterface *iface) { diff --git 
a/c_glib/arrow-glib/input-stream.h b/c_glib/arrow-glib/input-stream.h index 3e2a2ecdbd4fa..676f2f44b0041 100644 --- a/c_glib/arrow-glib/input-stream.h +++ b/c_glib/arrow-glib/input-stream.h @@ -30,6 +30,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_INPUT_STREAM (garrow_input_stream_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowInputStream, garrow_input_stream, GARROW, INPUT_STREAM, GInputStream) struct _GArrowInputStreamClass @@ -37,16 +38,22 @@ struct _GArrowInputStreamClass GInputStreamClass parent_class; }; +GARROW_AVAILABLE_IN_ALL gboolean garrow_input_stream_advance(GArrowInputStream *input_stream, gint64 n_bytes, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_input_stream_align(GArrowInputStream *input_stream, gint32 alignment, GError **error); + +GARROW_AVAILABLE_IN_ALL GArrowTensor * garrow_input_stream_read_tensor(GArrowInputStream *input_stream, GError **error); + GARROW_AVAILABLE_IN_1_0 GArrowRecordBatch * garrow_input_stream_read_record_batch(GArrowInputStream *input_stream, @@ -55,6 +62,7 @@ garrow_input_stream_read_record_batch(GArrowInputStream *input_stream, GError **error); #define GARROW_TYPE_SEEKABLE_INPUT_STREAM (garrow_seekable_input_stream_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowSeekableInputStream, garrow_seekable_input_stream, GARROW, @@ -65,12 +73,17 @@ struct _GArrowSeekableInputStreamClass GArrowInputStreamClass parent_class; }; +GARROW_AVAILABLE_IN_ALL guint64 garrow_seekable_input_stream_get_size(GArrowSeekableInputStream *input_stream, GError **error); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_seekable_input_stream_get_support_zero_copy( GArrowSeekableInputStream *input_stream); + +GARROW_AVAILABLE_IN_ALL GArrowBuffer * garrow_seekable_input_stream_read_at(GArrowSeekableInputStream *input_stream, gint64 position, @@ -89,6 +102,7 @@ garrow_seekable_input_stream_peek(GArrowSeekableInputStream *input_stream, GError **error); #define GARROW_TYPE_BUFFER_INPUT_STREAM 
(garrow_buffer_input_stream_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowBufferInputStream, garrow_buffer_input_stream, GARROW, @@ -99,13 +113,16 @@ struct _GArrowBufferInputStreamClass GArrowSeekableInputStreamClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowBufferInputStream * garrow_buffer_input_stream_new(GArrowBuffer *buffer); +GARROW_AVAILABLE_IN_ALL GArrowBuffer * garrow_buffer_input_stream_get_buffer(GArrowBufferInputStream *input_stream); #define GARROW_TYPE_FILE_INPUT_STREAM (garrow_file_input_stream_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowFileInputStream, garrow_file_input_stream, GARROW, @@ -116,15 +133,21 @@ struct _GArrowFileInputStreamClass GArrowSeekableInputStreamClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowFileInputStream * garrow_file_input_stream_new(const gchar *path, GError **error); + +GARROW_AVAILABLE_IN_ALL GArrowFileInputStream * garrow_file_input_stream_new_file_descriptor(gint file_descriptor, GError **error); + +GARROW_AVAILABLE_IN_ALL gint garrow_file_input_stream_get_file_descriptor(GArrowFileInputStream *stream); #define GARROW_TYPE_MEMORY_MAPPED_INPUT_STREAM \ (garrow_memory_mapped_input_stream_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowMemoryMappedInputStream, garrow_memory_mapped_input_stream, GARROW, @@ -135,10 +158,12 @@ struct _GArrowMemoryMappedInputStreamClass GArrowSeekableInputStreamClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowMemoryMappedInputStream * garrow_memory_mapped_input_stream_new(const gchar *path, GError **error); #define GARROW_TYPE_GIO_INPUT_STREAM (garrow_gio_input_stream_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowGIOInputStream, garrow_gio_input_stream, GARROW, @@ -149,15 +174,19 @@ struct _GArrowGIOInputStreamClass GArrowSeekableInputStreamClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowGIOInputStream * garrow_gio_input_stream_new(GInputStream *gio_input_stream); + #ifndef 
GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL G_GNUC_DEPRECATED GInputStream * garrow_gio_input_stream_get_raw(GArrowGIOInputStream *input_stream); #endif #define GARROW_TYPE_COMPRESSED_INPUT_STREAM (garrow_compressed_input_stream_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowCompressedInputStream, garrow_compressed_input_stream, GARROW, @@ -168,6 +197,7 @@ struct _GArrowCompressedInputStreamClass GArrowInputStreamClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowCompressedInputStream * garrow_compressed_input_stream_new(GArrowCodec *codec, GArrowInputStream *raw, diff --git a/c_glib/arrow-glib/input-stream.hpp b/c_glib/arrow-glib/input-stream.hpp index 7ae759370ddbd..0400398c4777f 100644 --- a/c_glib/arrow-glib/input-stream.hpp +++ b/c_glib/arrow-glib/input-stream.hpp @@ -26,34 +26,48 @@ #include +GARROW_EXTERN GArrowInputStream * garrow_input_stream_new_raw(std::shared_ptr *arrow_input_stream); + +GARROW_EXTERN std::shared_ptr garrow_input_stream_get_raw(GArrowInputStream *input_stream); +GARROW_EXTERN GArrowSeekableInputStream * garrow_seekable_input_stream_new_raw( std::shared_ptr *arrow_random_access_file); + +GARROW_EXTERN std::shared_ptr garrow_seekable_input_stream_get_raw(GArrowSeekableInputStream *input_stream); +GARROW_EXTERN GArrowBufferInputStream * garrow_buffer_input_stream_new_raw( std::shared_ptr *arrow_buffer_reader, GArrowBuffer *buffer); + +GARROW_EXTERN std::shared_ptr garrow_buffer_input_stream_get_raw(GArrowBufferInputStream *input_stream); +GARROW_EXTERN GArrowFileInputStream * garrow_file_input_stream_new_raw(std::shared_ptr *arrow_stream); +GARROW_EXTERN GArrowMemoryMappedInputStream * garrow_memory_mapped_input_stream_new_raw( std::shared_ptr *arrow_stream); +GARROW_EXTERN GArrowCompressedInputStream * garrow_compressed_input_stream_new_raw( std::shared_ptr *arrow_raw, GArrowCodec *codec, GArrowInputStream *raw); + +GARROW_EXTERN std::shared_ptr 
garrow_compressed_input_stream_get_raw(GArrowCompressedInputStream *stream); diff --git a/c_glib/arrow-glib/interval.h b/c_glib/arrow-glib/interval.h index a6c9e1ff1e1ef..8c23b9a509bb4 100644 --- a/c_glib/arrow-glib/interval.h +++ b/c_glib/arrow-glib/interval.h @@ -26,6 +26,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_DAY_MILLISECOND (garrow_day_millisecond_get_type()) +GARROW_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE( GArrowDayMillisecond, garrow_day_millisecond, GARROW, DAY_MILLISECOND, GObject) @@ -47,6 +48,7 @@ garrow_day_millisecond_less_than(GArrowDayMillisecond *day_millisecond, GArrowDayMillisecond *other_day_millisecond); #define GARROW_TYPE_MONTH_DAY_NANO (garrow_month_day_nano_get_type()) +GARROW_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE( GArrowMonthDayNano, garrow_month_day_nano, GARROW, MONTH_DAY_NANO, GObject) diff --git a/c_glib/arrow-glib/ipc-options.h b/c_glib/arrow-glib/ipc-options.h index 418b08f080152..1ddff059d2faf 100644 --- a/c_glib/arrow-glib/ipc-options.h +++ b/c_glib/arrow-glib/ipc-options.h @@ -26,6 +26,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_READ_OPTIONS (garrow_read_options_get_type()) +GARROW_AVAILABLE_IN_1_0 G_DECLARE_DERIVABLE_TYPE( GArrowReadOptions, garrow_read_options, GARROW, READ_OPTIONS, GObject) struct _GArrowReadOptionsClass @@ -46,6 +47,7 @@ garrow_read_options_set_included_fields(GArrowReadOptions *options, gsize n_fields); #define GARROW_TYPE_WRITE_OPTIONS (garrow_write_options_get_type()) +GARROW_AVAILABLE_IN_1_0 G_DECLARE_DERIVABLE_TYPE( GArrowWriteOptions, garrow_write_options, GARROW, WRITE_OPTIONS, GObject) struct _GArrowWriteOptionsClass diff --git a/c_glib/arrow-glib/ipc-options.hpp b/c_glib/arrow-glib/ipc-options.hpp index f57fbd3c11e5a..838d05d41dbac 100644 --- a/c_glib/arrow-glib/ipc-options.hpp +++ b/c_glib/arrow-glib/ipc-options.hpp @@ -23,10 +23,14 @@ #include +GARROW_EXTERN arrow::ipc::IpcReadOptions * garrow_read_options_get_raw(GArrowReadOptions *options); + +GARROW_EXTERN arrow::ipc::DictionaryMemo * 
garrow_read_options_get_dictionary_memo_raw(GArrowReadOptions *options); +GARROW_EXTERN arrow::ipc::IpcWriteOptions * garrow_write_options_get_raw(GArrowWriteOptions *options); diff --git a/c_glib/arrow-glib/local-file-system.h b/c_glib/arrow-glib/local-file-system.h index 9af4f8e8b168d..6ad2ee9f231ab 100644 --- a/c_glib/arrow-glib/local-file-system.h +++ b/c_glib/arrow-glib/local-file-system.h @@ -27,6 +27,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_LOCAL_FILE_SYSTEM_OPTIONS \ (garrow_local_file_system_options_get_type()) +GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE(GArrowLocalFileSystemOptions, garrow_local_file_system_options, GARROW, @@ -44,6 +45,7 @@ garrow_local_file_system_options_new(void); /* arrow::fs::LocalFileSystem */ #define GARROW_TYPE_LOCAL_FILE_SYSTEM (garrow_local_file_system_get_type()) +GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE(GArrowLocalFileSystem, garrow_local_file_system, GARROW, diff --git a/c_glib/arrow-glib/memory-pool.h b/c_glib/arrow-glib/memory-pool.h index de2a5d717a183..7da15a9eb1b47 100644 --- a/c_glib/arrow-glib/memory-pool.h +++ b/c_glib/arrow-glib/memory-pool.h @@ -21,9 +21,12 @@ #include +#include + G_BEGIN_DECLS #define GARROW_TYPE_MEMORY_POOL (garrow_memory_pool_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowMemoryPool, garrow_memory_pool, GARROW, MEMORY_POOL, GObject) struct _GArrowMemoryPoolClass @@ -31,12 +34,19 @@ struct _GArrowMemoryPoolClass GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowMemoryPool * garrow_memory_pool_default(); + +GARROW_AVAILABLE_IN_ALL gint64 garrow_memory_pool_get_bytes_allocated(GArrowMemoryPool *memory_pool); + +GARROW_AVAILABLE_IN_ALL gint64 garrow_memory_pool_get_max_memory(GArrowMemoryPool *memory_pool); + +GARROW_AVAILABLE_IN_ALL gchar * garrow_memory_pool_get_backend_name(GArrowMemoryPool *memory_pool); diff --git a/c_glib/arrow-glib/meson.build b/c_glib/arrow-glib/meson.build index 67909ff22c428..36a8274513ed2 100644 --- 
a/c_glib/arrow-glib/meson.build +++ b/c_glib/arrow-glib/meson.build @@ -205,14 +205,12 @@ cpp_internal_headers = files( 'internal-index.hpp', ) -version_h_conf = configuration_data() -version_h_conf.set('GARROW_VERSION_MAJOR', version_major) -version_h_conf.set('GARROW_VERSION_MINOR', version_minor) -version_h_conf.set('GARROW_VERSION_MICRO', version_micro) -version_h_conf.set('GARROW_VERSION_TAG', version_tag) -version_h = configure_file(input: 'version.h.in', - output: 'version.h', - configuration: version_h_conf) +version_h = configure_file( + input: 'version.h.in', + output: 'version.h', + command: [python3, generate_version_header_py, '--library', 'GARROW', '--version', version, '--input', '@INPUT@', '--output', '@OUTPUT@'], +) + c_headers += version_h enums = gnome.mkenums('enums', @@ -226,11 +224,9 @@ enums = gnome.mkenums('enums', enums_source = enums[0] enums_header = enums[1] - headers = c_headers + cpp_headers install_headers(headers, subdir: meson.project_name()) - gobject = dependency('gobject-2.0') gobject_libdir = gobject.get_variable(pkgconfig: 'libdir') # This is for Homebrew. 
"pkg-config --cflags gio-2.0" includes the @@ -253,6 +249,8 @@ libarrow_glib = library('arrow-glib', dependencies: dependencies, implicit_include_directories: false, include_directories: base_include_directories, + cpp_args: ['-DGARROW_COMPILATION'], + c_args: ['-DGARROW_COMPILATION'], soversion: so_version, version: library_version) arrow_glib = declare_dependency(link_with: libarrow_glib, diff --git a/c_glib/arrow-glib/orc-file-reader.h b/c_glib/arrow-glib/orc-file-reader.h index 20089eb2866c6..4eb3df5242e48 100644 --- a/c_glib/arrow-glib/orc-file-reader.h +++ b/c_glib/arrow-glib/orc-file-reader.h @@ -24,6 +24,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_ORC_FILE_READER (garrow_orc_file_reader_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowORCFileReader, garrow_orc_file_reader, GARROW, ORC_FILE_READER, GObject) struct _GArrowORCFileReaderClass @@ -31,10 +32,12 @@ struct _GArrowORCFileReaderClass GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowORCFileReader * garrow_orc_file_reader_new(GArrowSeekableInputStream *file, GError **error); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_orc_file_reader_set_field_indices) void garrow_orc_file_reader_set_field_indexes(GArrowORCFileReader *reader, @@ -47,6 +50,7 @@ garrow_orc_file_reader_set_field_indices(GArrowORCFileReader *reader, const gint *field_indices, guint n_field_indices); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_12_FOR(garrow_orc_file_reader_get_field_indices) const gint * garrow_orc_file_reader_get_field_indexes(GArrowORCFileReader *reader, @@ -56,14 +60,24 @@ GARROW_AVAILABLE_IN_0_12 const gint * garrow_orc_file_reader_get_field_indices(GArrowORCFileReader *reader, guint *n_field_indices); + +GARROW_AVAILABLE_IN_ALL GArrowSchema * garrow_orc_file_reader_read_type(GArrowORCFileReader *reader, GError **error); + +GARROW_AVAILABLE_IN_ALL GArrowTable * 
garrow_orc_file_reader_read_stripes(GArrowORCFileReader *reader, GError **error); + +GARROW_AVAILABLE_IN_ALL GArrowRecordBatch * garrow_orc_file_reader_read_stripe(GArrowORCFileReader *reader, gint64 i, GError **error); + +GARROW_AVAILABLE_IN_ALL gint64 garrow_orc_file_reader_get_n_stripes(GArrowORCFileReader *reader); + +GARROW_AVAILABLE_IN_ALL gint64 garrow_orc_file_reader_get_n_rows(GArrowORCFileReader *reader); diff --git a/c_glib/arrow-glib/output-stream.cpp b/c_glib/arrow-glib/output-stream.cpp index 83de2eb38a72a..d9bdf7ad8b786 100644 --- a/c_glib/arrow-glib/output-stream.cpp +++ b/c_glib/arrow-glib/output-stream.cpp @@ -33,6 +33,22 @@ #include #include +static std::shared_ptr +garrow_output_stream_get_raw_file_interface(GArrowFile *file) +{ + auto output_stream = GARROW_OUTPUT_STREAM(file); + auto arrow_output_stream = garrow_output_stream_get_raw(output_stream); + return arrow_output_stream; +} + +static std::shared_ptr +garrow_output_stream_get_raw_writable_interface(GArrowWritable *writable) +{ + auto output_stream = GARROW_OUTPUT_STREAM(writable); + auto arrow_output_stream = garrow_output_stream_get_raw(output_stream); + return arrow_output_stream; +} + G_BEGIN_DECLS /** @@ -65,28 +81,12 @@ enum { PROP_OUTPUT_STREAM }; -static std::shared_ptr -garrow_output_stream_get_raw_file_interface(GArrowFile *file) -{ - auto output_stream = GARROW_OUTPUT_STREAM(file); - auto arrow_output_stream = garrow_output_stream_get_raw(output_stream); - return arrow_output_stream; -} - static void garrow_output_stream_file_interface_init(GArrowFileInterface *iface) { iface->get_raw = garrow_output_stream_get_raw_file_interface; } -static std::shared_ptr -garrow_output_stream_get_raw_writable_interface(GArrowWritable *writable) -{ - auto output_stream = GARROW_OUTPUT_STREAM(writable); - auto arrow_output_stream = garrow_output_stream_get_raw(output_stream); - return arrow_output_stream; -} - static void garrow_output_stream_writable_interface_init(GArrowWritableInterface 
*iface) { diff --git a/c_glib/arrow-glib/output-stream.h b/c_glib/arrow-glib/output-stream.h index 1b18c08c14a5f..5c8b4b9374fc6 100644 --- a/c_glib/arrow-glib/output-stream.h +++ b/c_glib/arrow-glib/output-stream.h @@ -30,6 +30,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_OUTPUT_STREAM (garrow_output_stream_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowOutputStream, garrow_output_stream, GARROW, OUTPUT_STREAM, GObject) struct _GArrowOutputStreamClass @@ -37,8 +38,11 @@ struct _GArrowOutputStreamClass GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_ALL gboolean garrow_output_stream_align(GArrowOutputStream *stream, gint32 alignment, GError **error); + +GARROW_AVAILABLE_IN_ALL gint64 garrow_output_stream_write_tensor(GArrowOutputStream *stream, GArrowTensor *tensor, @@ -51,6 +55,7 @@ garrow_output_stream_write_record_batch(GArrowOutputStream *stream, GError **error); #define GARROW_TYPE_FILE_OUTPUT_STREAM (garrow_file_output_stream_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowFileOutputStream, garrow_file_output_stream, GARROW, @@ -61,10 +66,12 @@ struct _GArrowFileOutputStreamClass GArrowOutputStreamClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowFileOutputStream * garrow_file_output_stream_new(const gchar *path, gboolean append, GError **error); #define GARROW_TYPE_BUFFER_OUTPUT_STREAM (garrow_buffer_output_stream_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowBufferOutputStream, garrow_buffer_output_stream, GARROW, @@ -75,10 +82,12 @@ struct _GArrowBufferOutputStreamClass GArrowOutputStreamClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowBufferOutputStream * garrow_buffer_output_stream_new(GArrowResizableBuffer *buffer); #define GARROW_TYPE_GIO_OUTPUT_STREAM (garrow_gio_output_stream_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowGIOOutputStream, garrow_gio_output_stream, GARROW, @@ -89,15 +98,19 @@ struct _GArrowGIOOutputStreamClass GArrowOutputStreamClass 
parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowGIOOutputStream * garrow_gio_output_stream_new(GOutputStream *gio_output_stream); + #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL G_GNUC_DEPRECATED GOutputStream * garrow_gio_output_stream_get_raw(GArrowGIOOutputStream *output_stream); #endif #define GARROW_TYPE_COMPRESSED_OUTPUT_STREAM (garrow_compressed_output_stream_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowCompressedOutputStream, garrow_compressed_output_stream, GARROW, @@ -108,6 +121,7 @@ struct _GArrowCompressedOutputStreamClass GArrowOutputStreamClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowCompressedOutputStream * garrow_compressed_output_stream_new(GArrowCodec *codec, GArrowOutputStream *raw, diff --git a/c_glib/arrow-glib/output-stream.hpp b/c_glib/arrow-glib/output-stream.hpp index e41c65da88d82..515d969efc245 100644 --- a/c_glib/arrow-glib/output-stream.hpp +++ b/c_glib/arrow-glib/output-stream.hpp @@ -25,23 +25,32 @@ #include +GARROW_EXTERN GArrowOutputStream * garrow_output_stream_new_raw( std::shared_ptr *arrow_output_stream); + +GARROW_EXTERN std::shared_ptr garrow_output_stream_get_raw(GArrowOutputStream *output_stream); +GARROW_EXTERN GArrowFileOutputStream * garrow_file_output_stream_new_raw( std::shared_ptr *arrow_file_output_stream); + +GARROW_EXTERN GArrowBufferOutputStream * garrow_buffer_output_stream_new_raw( std::shared_ptr *arrow_buffer_output_stream); +GARROW_EXTERN GArrowCompressedOutputStream * garrow_compressed_output_stream_new_raw( std::shared_ptr *arrow_raw, GArrowCodec *codec, GArrowOutputStream *raw); + +GARROW_EXTERN std::shared_ptr garrow_compressed_output_stream_get_raw(GArrowCompressedOutputStream *stream); diff --git a/c_glib/arrow-glib/readable.h b/c_glib/arrow-glib/readable.h index d0b1f5b6a99ee..266b45849057e 100644 --- a/c_glib/arrow-glib/readable.h +++ b/c_glib/arrow-glib/readable.h @@ -25,10 +25,13 @@ G_BEGIN_DECLS #define GARROW_TYPE_READABLE (garrow_readable_get_type()) 
+GARROW_AVAILABLE_IN_ALL G_DECLARE_INTERFACE(GArrowReadable, garrow_readable, GARROW, READABLE, GObject) +GARROW_AVAILABLE_IN_ALL GArrowBuffer * garrow_readable_read(GArrowReadable *readable, gint64 n_bytes, GError **error); + GARROW_AVAILABLE_IN_0_17 GBytes * garrow_readable_read_bytes(GArrowReadable *readable, gint64 n_bytes, GError **error); diff --git a/c_glib/arrow-glib/reader.h b/c_glib/arrow-glib/reader.h index 96e4c5bbb5890..5401aa3bb1fc5 100644 --- a/c_glib/arrow-glib/reader.h +++ b/c_glib/arrow-glib/reader.h @@ -29,6 +29,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_RECORD_BATCH_READER (garrow_record_batch_reader_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowRecordBatchReader, garrow_record_batch_reader, GARROW, @@ -53,22 +54,29 @@ GARROW_AVAILABLE_IN_6_0 gpointer garrow_record_batch_reader_export(GArrowRecordBatchReader *reader, GError **error); +GARROW_AVAILABLE_IN_ALL GArrowSchema * garrow_record_batch_reader_get_schema(GArrowRecordBatchReader *reader); + #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL G_GNUC_DEPRECATED_FOR(garrow_record_batch_reader_read_next) GArrowRecordBatch * garrow_record_batch_reader_get_next_record_batch(GArrowRecordBatchReader *reader, GError **error); #endif #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL G_GNUC_DEPRECATED_FOR(garrow_record_batch_reader_read_next) GArrowRecordBatch * garrow_record_batch_reader_read_next_record_batch(GArrowRecordBatchReader *reader, GError **error); #endif + +GARROW_AVAILABLE_IN_ALL GArrowRecordBatch * garrow_record_batch_reader_read_next(GArrowRecordBatchReader *reader, GError **error); + GARROW_AVAILABLE_IN_6_0 GArrowTable * garrow_record_batch_reader_read_all(GArrowRecordBatchReader *reader, GError **error); @@ -78,6 +86,7 @@ GList * garrow_record_batch_reader_get_sources(GArrowRecordBatchReader *reader); #define GARROW_TYPE_TABLE_BATCH_READER (garrow_table_batch_reader_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowTableBatchReader, 
garrow_table_batch_reader, GARROW, @@ -88,6 +97,7 @@ struct _GArrowTableBatchReaderClass GArrowRecordBatchReaderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowTableBatchReader * garrow_table_batch_reader_new(GArrowTable *table); @@ -98,6 +108,7 @@ garrow_table_batch_reader_set_max_chunk_size(GArrowTableBatchReader *reader, #define GARROW_TYPE_RECORD_BATCH_STREAM_READER \ (garrow_record_batch_stream_reader_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowRecordBatchStreamReader, garrow_record_batch_stream_reader, GARROW, @@ -108,10 +119,12 @@ struct _GArrowRecordBatchStreamReaderClass GArrowRecordBatchReaderClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchStreamReader * garrow_record_batch_stream_reader_new(GArrowInputStream *stream, GError **error); #define GARROW_TYPE_RECORD_BATCH_FILE_READER (garrow_record_batch_file_reader_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowRecordBatchFileReader, garrow_record_batch_file_reader, GARROW, @@ -122,28 +135,39 @@ struct _GArrowRecordBatchFileReaderClass GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchFileReader * garrow_record_batch_file_reader_new(GArrowSeekableInputStream *file, GError **error); +GARROW_AVAILABLE_IN_ALL GArrowSchema * garrow_record_batch_file_reader_get_schema(GArrowRecordBatchFileReader *reader); + +GARROW_AVAILABLE_IN_ALL guint garrow_record_batch_file_reader_get_n_record_batches(GArrowRecordBatchFileReader *reader); + +GARROW_AVAILABLE_IN_ALL GArrowMetadataVersion garrow_record_batch_file_reader_get_version(GArrowRecordBatchFileReader *reader); + #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL G_GNUC_DEPRECATED_FOR(garrow_record_batch_file_reader_read_record_batch) GArrowRecordBatch * garrow_record_batch_file_reader_get_record_batch(GArrowRecordBatchFileReader *reader, guint i, GError **error); #endif + +GARROW_AVAILABLE_IN_ALL GArrowRecordBatch * 
garrow_record_batch_file_reader_read_record_batch(GArrowRecordBatchFileReader *reader, guint i, GError **error); #define GARROW_TYPE_FEATHER_FILE_READER (garrow_feather_file_reader_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowFeatherFileReader, garrow_feather_file_reader, GARROW, @@ -154,18 +178,26 @@ struct _GArrowFeatherFileReaderClass GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowFeatherFileReader * garrow_feather_file_reader_new(GArrowSeekableInputStream *file, GError **error); +GARROW_AVAILABLE_IN_ALL gint garrow_feather_file_reader_get_version(GArrowFeatherFileReader *reader); + +GARROW_AVAILABLE_IN_ALL GArrowTable * garrow_feather_file_reader_read(GArrowFeatherFileReader *reader, GError **error); + +GARROW_AVAILABLE_IN_ALL GArrowTable * garrow_feather_file_reader_read_indices(GArrowFeatherFileReader *reader, const gint *indices, guint n_indices, GError **error); + +GARROW_AVAILABLE_IN_ALL GArrowTable * garrow_feather_file_reader_read_names(GArrowFeatherFileReader *reader, const gchar **names, @@ -173,6 +205,7 @@ garrow_feather_file_reader_read_names(GArrowFeatherFileReader *reader, GError **error); #define GARROW_TYPE_CSV_READ_OPTIONS (garrow_csv_read_options_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowCSVReadOptions, garrow_csv_read_options, GARROW, CSV_READ_OPTIONS, GObject) struct _GArrowCSVReadOptionsClass @@ -180,16 +213,23 @@ struct _GArrowCSVReadOptionsClass GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowCSVReadOptions * garrow_csv_read_options_new(void); + +GARROW_AVAILABLE_IN_ALL void garrow_csv_read_options_add_column_type(GArrowCSVReadOptions *options, const gchar *name, GArrowDataType *data_type); +GARROW_AVAILABLE_IN_ALL void garrow_csv_read_options_add_schema(GArrowCSVReadOptions *options, GArrowSchema *schema); + +GARROW_AVAILABLE_IN_ALL GHashTable * garrow_csv_read_options_get_column_types(GArrowCSVReadOptions *options); + GARROW_AVAILABLE_IN_0_14 void 
garrow_csv_read_options_set_null_values(GArrowCSVReadOptions *options, @@ -251,16 +291,20 @@ garrow_csv_read_options_add_timestamp_parser(GArrowCSVReadOptions *options, GArrowTimestampParser *parser); #define GARROW_TYPE_CSV_READER (garrow_csv_reader_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowCSVReader, garrow_csv_reader, GARROW, CSV_READER, GObject) struct _GArrowCSVReaderClass { GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowCSVReader * garrow_csv_reader_new(GArrowInputStream *input, GArrowCSVReadOptions *options, GError **error); + +GARROW_AVAILABLE_IN_ALL GArrowTable * garrow_csv_reader_read(GArrowCSVReader *reader, GError **error); @@ -279,6 +323,7 @@ typedef enum { } GArrowJSONReadUnexpectedFieldBehavior; #define GARROW_TYPE_JSON_READ_OPTIONS (garrow_json_read_options_get_type()) +GARROW_AVAILABLE_IN_0_14 G_DECLARE_DERIVABLE_TYPE( GArrowJSONReadOptions, garrow_json_read_options, GARROW, JSON_READ_OPTIONS, GObject) struct _GArrowJSONReadOptionsClass @@ -291,6 +336,7 @@ GArrowJSONReadOptions * garrow_json_read_options_new(void); #define GARROW_TYPE_JSON_READER (garrow_json_reader_get_type()) +GARROW_AVAILABLE_IN_0_14 G_DECLARE_DERIVABLE_TYPE( GArrowJSONReader, garrow_json_reader, GARROW, JSON_READER, GObject) struct _GArrowJSONReaderClass diff --git a/c_glib/arrow-glib/reader.hpp b/c_glib/arrow-glib/reader.hpp index 192497ef52e31..beec6766af2e6 100644 --- a/c_glib/arrow-glib/reader.hpp +++ b/c_glib/arrow-glib/reader.hpp @@ -27,42 +27,61 @@ #include +GARROW_EXTERN GArrowRecordBatchReader * garrow_record_batch_reader_new_raw( std::shared_ptr *arrow_reader, GList *sources); + +GARROW_EXTERN std::shared_ptr garrow_record_batch_reader_get_raw(GArrowRecordBatchReader *reader); +GARROW_EXTERN GArrowTableBatchReader * garrow_table_batch_reader_new_raw(std::shared_ptr *arrow_reader, GArrowTable *table); + +GARROW_EXTERN std::shared_ptr garrow_table_batch_reader_get_raw(GArrowTableBatchReader *reader); +GARROW_EXTERN 
GArrowRecordBatchStreamReader * garrow_record_batch_stream_reader_new_raw( std::shared_ptr *arrow_reader); +GARROW_EXTERN GArrowRecordBatchFileReader * garrow_record_batch_file_reader_new_raw( std::shared_ptr *arrow_reader); + +GARROW_EXTERN std::shared_ptr garrow_record_batch_file_reader_get_raw(GArrowRecordBatchFileReader *reader); +GARROW_EXTERN GArrowFeatherFileReader * garrow_feather_file_reader_new_raw( std::shared_ptr *arrow_reader); + +GARROW_EXTERN std::shared_ptr garrow_feather_file_reader_get_raw(GArrowFeatherFileReader *reader); +GARROW_EXTERN GArrowCSVReader * garrow_csv_reader_new_raw(std::shared_ptr *arrow_reader, GArrowInputStream *input); + +GARROW_EXTERN std::shared_ptr garrow_csv_reader_get_raw(GArrowCSVReader *reader); +GARROW_EXTERN GArrowJSONReader * garrow_json_reader_new_raw(std::shared_ptr *arrow_reader, GArrowInputStream *input); + +GARROW_EXTERN std::shared_ptr garrow_json_reader_get_raw(GArrowJSONReader *reader); diff --git a/c_glib/arrow-glib/record-batch.h b/c_glib/arrow-glib/record-batch.h index 3c995658224cb..e7ffd83795ed4 100644 --- a/c_glib/arrow-glib/record-batch.h +++ b/c_glib/arrow-glib/record-batch.h @@ -26,6 +26,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_RECORD_BATCH (garrow_record_batch_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( GArrowRecordBatch, garrow_record_batch, GARROW, RECORD_BATCH, GObject) struct _GArrowRecordBatchClass @@ -37,6 +38,7 @@ GARROW_AVAILABLE_IN_6_0 GArrowRecordBatch * garrow_record_batch_import(gpointer c_abi_array, GArrowSchema *schema, GError **error); +GARROW_AVAILABLE_IN_ALL GArrowRecordBatch * garrow_record_batch_new(GArrowSchema *schema, guint32 n_rows, @@ -50,6 +52,7 @@ garrow_record_batch_export(GArrowRecordBatch *record_batch, gpointer *c_abi_schema, GError **error); +GARROW_AVAILABLE_IN_ALL gboolean garrow_record_batch_equal(GArrowRecordBatch *record_batch, GArrowRecordBatch *other_record_batch); @@ -59,28 +62,43 @@ garrow_record_batch_equal_metadata(GArrowRecordBatch 
*record_batch, GArrowRecordBatch *other_record_batch, gboolean check_metadata); +GARROW_AVAILABLE_IN_ALL GArrowSchema * garrow_record_batch_get_schema(GArrowRecordBatch *record_batch); + GARROW_AVAILABLE_IN_0_15 GArrowArray * garrow_record_batch_get_column_data(GArrowRecordBatch *record_batch, gint i); + +GARROW_AVAILABLE_IN_ALL const gchar * garrow_record_batch_get_column_name(GArrowRecordBatch *record_batch, gint i); + +GARROW_AVAILABLE_IN_ALL guint garrow_record_batch_get_n_columns(GArrowRecordBatch *record_batch); + +GARROW_AVAILABLE_IN_ALL gint64 garrow_record_batch_get_n_rows(GArrowRecordBatch *record_batch); + +GARROW_AVAILABLE_IN_ALL GArrowRecordBatch * garrow_record_batch_slice(GArrowRecordBatch *record_batch, gint64 offset, gint64 length); +GARROW_AVAILABLE_IN_ALL gchar * garrow_record_batch_to_string(GArrowRecordBatch *record_batch, GError **error); + +GARROW_AVAILABLE_IN_ALL GArrowRecordBatch * garrow_record_batch_add_column(GArrowRecordBatch *record_batch, guint i, GArrowField *field, GArrowArray *column, GError **error); + +GARROW_AVAILABLE_IN_ALL GArrowRecordBatch * garrow_record_batch_remove_column(GArrowRecordBatch *record_batch, guint i, @@ -92,6 +110,7 @@ garrow_record_batch_serialize(GArrowRecordBatch *record_batch, GError **error); #define GARROW_TYPE_RECORD_BATCH_ITERATOR (garrow_record_batch_iterator_get_type()) +GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE(GArrowRecordBatchIterator, garrow_record_batch_iterator, GARROW, diff --git a/c_glib/arrow-glib/record-batch.hpp b/c_glib/arrow-glib/record-batch.hpp index 4c3e5e8a78231..75c0432b390ba 100644 --- a/c_glib/arrow-glib/record-batch.hpp +++ b/c_glib/arrow-glib/record-batch.hpp @@ -23,13 +23,18 @@ #include +GARROW_EXTERN GArrowRecordBatch * garrow_record_batch_new_raw(std::shared_ptr *arrow_record_batch); + +GARROW_EXTERN std::shared_ptr garrow_record_batch_get_raw(GArrowRecordBatch *record_batch); +GARROW_EXTERN GArrowRecordBatchIterator * 
garrow_record_batch_iterator_new_raw(arrow::RecordBatchIterator *arrow_iterator); +GARROW_EXTERN arrow::RecordBatchIterator * garrow_record_batch_iterator_get_raw(GArrowRecordBatchIterator *iterator); diff --git a/c_glib/arrow-glib/scalar.cpp b/c_glib/arrow-glib/scalar.cpp index def6b15148355..f965b4970304b 100644 --- a/c_glib/arrow-glib/scalar.cpp +++ b/c_glib/arrow-glib/scalar.cpp @@ -1063,7 +1063,8 @@ garrow_base_binary_scalar_get_value(GArrowBaseBinaryScalar *scalar) if (!priv->value) { const auto arrow_scalar = std::static_pointer_cast( garrow_scalar_get_raw(GARROW_SCALAR(scalar))); - priv->value = garrow_buffer_new_raw(&(arrow_scalar->value)); + priv->value = garrow_buffer_new_raw( + const_cast *>(&(arrow_scalar->value))); } return priv->value; } @@ -1983,7 +1984,8 @@ garrow_base_list_scalar_get_value(GArrowBaseListScalar *scalar) if (!priv->value) { const auto arrow_scalar = std::static_pointer_cast( garrow_scalar_get_raw(GARROW_SCALAR(scalar))); - priv->value = garrow_array_new_raw(&(arrow_scalar->value)); + priv->value = garrow_array_new_raw( + const_cast *>(&(arrow_scalar->value))); } return priv->value; } diff --git a/c_glib/arrow-glib/scalar.h b/c_glib/arrow-glib/scalar.h index b4a6229c62fd1..5f9015d29c61c 100644 --- a/c_glib/arrow-glib/scalar.h +++ b/c_glib/arrow-glib/scalar.h @@ -25,6 +25,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_SCALAR (garrow_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GArrowScalar, garrow_scalar, GARROW, SCALAR, GObject) struct _GArrowScalarClass { @@ -64,6 +65,7 @@ garrow_scalar_cast(GArrowScalar *scalar, GError **error); #define GARROW_TYPE_NULL_SCALAR (garrow_null_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowNullScalar, garrow_null_scalar, GARROW, NULL_SCALAR, GArrowScalar) struct _GArrowNullScalarClass @@ -76,6 +78,7 @@ GArrowNullScalar * garrow_null_scalar_new(void); #define GARROW_TYPE_BOOLEAN_SCALAR (garrow_boolean_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 
G_DECLARE_DERIVABLE_TYPE( GArrowBooleanScalar, garrow_boolean_scalar, GARROW, BOOLEAN_SCALAR, GArrowScalar) struct _GArrowBooleanScalarClass @@ -91,6 +94,7 @@ gboolean garrow_boolean_scalar_get_value(GArrowBooleanScalar *scalar); #define GARROW_TYPE_INT8_SCALAR (garrow_int8_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowInt8Scalar, garrow_int8_scalar, GARROW, INT8_SCALAR, GArrowScalar) struct _GArrowInt8ScalarClass @@ -106,6 +110,7 @@ gint8 garrow_int8_scalar_get_value(GArrowInt8Scalar *scalar); #define GARROW_TYPE_INT16_SCALAR (garrow_int16_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowInt16Scalar, garrow_int16_scalar, GARROW, INT16_SCALAR, GArrowScalar) struct _GArrowInt16ScalarClass @@ -121,6 +126,7 @@ gint16 garrow_int16_scalar_get_value(GArrowInt16Scalar *scalar); #define GARROW_TYPE_INT32_SCALAR (garrow_int32_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowInt32Scalar, garrow_int32_scalar, GARROW, INT32_SCALAR, GArrowScalar) struct _GArrowInt32ScalarClass @@ -136,6 +142,7 @@ gint32 garrow_int32_scalar_get_value(GArrowInt32Scalar *scalar); #define GARROW_TYPE_INT64_SCALAR (garrow_int64_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowInt64Scalar, garrow_int64_scalar, GARROW, INT64_SCALAR, GArrowScalar) struct _GArrowInt64ScalarClass @@ -151,6 +158,7 @@ gint64 garrow_int64_scalar_get_value(GArrowInt64Scalar *scalar); #define GARROW_TYPE_UINT8_SCALAR (garrow_uint8_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowUInt8Scalar, garrow_uint8_scalar, GARROW, UINT8_SCALAR, GArrowScalar) struct _GArrowUInt8ScalarClass @@ -166,6 +174,7 @@ guint8 garrow_uint8_scalar_get_value(GArrowUInt8Scalar *scalar); #define GARROW_TYPE_UINT16_SCALAR (garrow_uint16_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowUInt16Scalar, garrow_uint16_scalar, GARROW, UINT16_SCALAR, GArrowScalar) struct _GArrowUInt16ScalarClass @@ 
-181,6 +190,7 @@ guint16 garrow_uint16_scalar_get_value(GArrowUInt16Scalar *scalar); #define GARROW_TYPE_UINT32_SCALAR (garrow_uint32_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowUInt32Scalar, garrow_uint32_scalar, GARROW, UINT32_SCALAR, GArrowScalar) struct _GArrowUInt32ScalarClass @@ -196,6 +206,7 @@ guint32 garrow_uint32_scalar_get_value(GArrowUInt32Scalar *scalar); #define GARROW_TYPE_UINT64_SCALAR (garrow_uint64_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowUInt64Scalar, garrow_uint64_scalar, GARROW, UINT64_SCALAR, GArrowScalar) struct _GArrowUInt64ScalarClass @@ -211,6 +222,7 @@ guint64 garrow_uint64_scalar_get_value(GArrowUInt64Scalar *scalar); #define GARROW_TYPE_HALF_FLOAT_SCALAR (garrow_half_float_scalar_get_type()) +GARROW_AVAILABLE_IN_11_0 G_DECLARE_DERIVABLE_TYPE(GArrowHalfFloatScalar, garrow_half_float_scalar, GARROW, @@ -229,6 +241,7 @@ guint16 garrow_half_float_scalar_get_value(GArrowHalfFloatScalar *scalar); #define GARROW_TYPE_FLOAT_SCALAR (garrow_float_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowFloatScalar, garrow_float_scalar, GARROW, FLOAT_SCALAR, GArrowScalar) struct _GArrowFloatScalarClass @@ -244,6 +257,7 @@ gfloat garrow_float_scalar_get_value(GArrowFloatScalar *scalar); #define GARROW_TYPE_DOUBLE_SCALAR (garrow_double_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowDoubleScalar, garrow_double_scalar, GARROW, DOUBLE_SCALAR, GArrowScalar) struct _GArrowDoubleScalarClass @@ -259,6 +273,7 @@ gdouble garrow_double_scalar_get_value(GArrowDoubleScalar *scalar); #define GARROW_TYPE_BASE_BINARY_SCALAR (garrow_base_binary_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GArrowBaseBinaryScalar, garrow_base_binary_scalar, GARROW, @@ -274,6 +289,7 @@ GArrowBuffer * garrow_base_binary_scalar_get_value(GArrowBaseBinaryScalar *scalar); #define GARROW_TYPE_BINARY_SCALAR (garrow_binary_scalar_get_type()) 
+GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowBinaryScalar, garrow_binary_scalar, GARROW, BINARY_SCALAR, GArrowBaseBinaryScalar) struct _GArrowBinaryScalarClass @@ -286,6 +302,7 @@ GArrowBinaryScalar * garrow_binary_scalar_new(GArrowBuffer *value); #define GARROW_TYPE_STRING_SCALAR (garrow_string_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowStringScalar, garrow_string_scalar, GARROW, STRING_SCALAR, GArrowBaseBinaryScalar) struct _GArrowStringScalarClass @@ -298,6 +315,7 @@ GArrowStringScalar * garrow_string_scalar_new(GArrowBuffer *value); #define GARROW_TYPE_LARGE_BINARY_SCALAR (garrow_large_binary_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GArrowLargeBinaryScalar, garrow_large_binary_scalar, GARROW, @@ -313,6 +331,7 @@ GArrowLargeBinaryScalar * garrow_large_binary_scalar_new(GArrowBuffer *value); #define GARROW_TYPE_LARGE_STRING_SCALAR (garrow_large_string_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GArrowLargeStringScalar, garrow_large_string_scalar, GARROW, @@ -328,6 +347,7 @@ GArrowLargeStringScalar * garrow_large_string_scalar_new(GArrowBuffer *value); #define GARROW_TYPE_FIXED_SIZE_BINARY_SCALAR (garrow_fixed_size_binary_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GArrowFixedSizeBinaryScalar, garrow_fixed_size_binary_scalar, GARROW, @@ -344,6 +364,7 @@ garrow_fixed_size_binary_scalar_new(GArrowFixedSizeBinaryDataType *data_type, GArrowBuffer *value); #define GARROW_TYPE_DATE32_SCALAR (garrow_date32_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowDate32Scalar, garrow_date32_scalar, GARROW, DATE32_SCALAR, GArrowScalar) struct _GArrowDate32ScalarClass @@ -359,6 +380,7 @@ gint32 garrow_date32_scalar_get_value(GArrowDate32Scalar *scalar); #define GARROW_TYPE_DATE64_SCALAR (garrow_date64_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowDate64Scalar, garrow_date64_scalar, GARROW, DATE64_SCALAR, 
GArrowScalar) struct _GArrowDate64ScalarClass @@ -374,6 +396,7 @@ gint64 garrow_date64_scalar_get_value(GArrowDate64Scalar *scalar); #define GARROW_TYPE_TIME32_SCALAR (garrow_time32_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowTime32Scalar, garrow_time32_scalar, GARROW, TIME32_SCALAR, GArrowScalar) struct _GArrowTime32ScalarClass @@ -389,6 +412,7 @@ gint32 garrow_time32_scalar_get_value(GArrowTime32Scalar *scalar); #define GARROW_TYPE_TIME64_SCALAR (garrow_time64_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowTime64Scalar, garrow_time64_scalar, GARROW, TIME64_SCALAR, GArrowScalar) struct _GArrowTime64ScalarClass @@ -404,6 +428,7 @@ gint64 garrow_time64_scalar_get_value(GArrowTime64Scalar *scalar); #define GARROW_TYPE_TIMESTAMP_SCALAR (garrow_timestamp_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowTimestampScalar, garrow_timestamp_scalar, GARROW, TIMESTAMP_SCALAR, GArrowScalar) struct _GArrowTimestampScalarClass @@ -419,6 +444,7 @@ gint64 garrow_timestamp_scalar_get_value(GArrowTimestampScalar *scalar); #define GARROW_TYPE_MONTH_INTERVAL_SCALAR (garrow_month_interval_scalar_get_type()) +GARROW_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE(GArrowMonthIntervalScalar, garrow_month_interval_scalar, GARROW, @@ -437,6 +463,7 @@ gint32 garrow_month_interval_scalar_get_value(GArrowMonthIntervalScalar *scalar); #define GARROW_TYPE_DAY_TIME_INTERVAL_SCALAR (garrow_day_time_interval_scalar_get_type()) +GARROW_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE(GArrowDayTimeIntervalScalar, garrow_day_time_interval_scalar, GARROW, @@ -456,6 +483,7 @@ garrow_day_time_interval_scalar_get_value(GArrowDayTimeIntervalScalar *scalar); #define GARROW_TYPE_MONTH_DAY_NANO_INTERVAL_SCALAR \ (garrow_month_day_nano_interval_scalar_get_type()) +GARROW_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE(GArrowMonthDayNanoIntervalScalar, garrow_month_day_nano_interval_scalar, GARROW, @@ -474,6 +502,7 @@ GArrowMonthDayNano * 
garrow_month_day_nano_interval_scalar_get_value(GArrowMonthDayNanoIntervalScalar *scalar); #define GARROW_TYPE_DECIMAL128_SCALAR (garrow_decimal128_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GArrowDecimal128Scalar, garrow_decimal128_scalar, GARROW, @@ -493,6 +522,7 @@ GArrowDecimal128 * garrow_decimal128_scalar_get_value(GArrowDecimal128Scalar *scalar); #define GARROW_TYPE_DECIMAL256_SCALAR (garrow_decimal256_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GArrowDecimal256Scalar, garrow_decimal256_scalar, GARROW, @@ -512,6 +542,7 @@ GArrowDecimal256 * garrow_decimal256_scalar_get_value(GArrowDecimal256Scalar *scalar); #define GARROW_TYPE_BASE_LIST_SCALAR (garrow_base_list_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowBaseListScalar, garrow_base_list_scalar, GARROW, BASE_LIST_SCALAR, GArrowScalar) struct _GArrowBaseListScalarClass @@ -524,6 +555,7 @@ GArrowArray * garrow_base_list_scalar_get_value(GArrowBaseListScalar *scalar); #define GARROW_TYPE_LIST_SCALAR (garrow_list_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowListScalar, garrow_list_scalar, GARROW, LIST_SCALAR, GArrowBaseListScalar) struct _GArrowListScalarClass @@ -536,6 +568,7 @@ GArrowListScalar * garrow_list_scalar_new(GArrowListArray *value); #define GARROW_TYPE_LARGE_LIST_SCALAR (garrow_large_list_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GArrowLargeListScalar, garrow_large_list_scalar, GARROW, @@ -551,6 +584,7 @@ GArrowLargeListScalar * garrow_large_list_scalar_new(GArrowLargeListArray *value); #define GARROW_TYPE_MAP_SCALAR (garrow_map_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowMapScalar, garrow_map_scalar, GARROW, MAP_SCALAR, GArrowBaseListScalar) struct _GArrowMapScalarClass @@ -563,6 +597,7 @@ GArrowMapScalar * garrow_map_scalar_new(GArrowStructArray *value); #define GARROW_TYPE_STRUCT_SCALAR (garrow_struct_scalar_get_type()) 
+GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowStructScalar, garrow_struct_scalar, GARROW, STRUCT_SCALAR, GArrowScalar) struct _GArrowStructScalarClass @@ -578,6 +613,7 @@ GList * garrow_struct_scalar_get_value(GArrowStructScalar *scalar); #define GARROW_TYPE_UNION_SCALAR (garrow_union_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowUnionScalar, garrow_union_scalar, GARROW, UNION_SCALAR, GArrowScalar) struct _GArrowUnionScalarClass @@ -593,6 +629,7 @@ GArrowScalar * garrow_union_scalar_get_value(GArrowUnionScalar *scalar); #define GARROW_TYPE_SPARSE_UNION_SCALAR (garrow_sparse_union_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GArrowSparseUnionScalar, garrow_sparse_union_scalar, GARROW, @@ -610,6 +647,7 @@ garrow_sparse_union_scalar_new(GArrowSparseUnionDataType *data_type, GArrowScalar *value); #define GARROW_TYPE_DENSE_UNION_SCALAR (garrow_dense_union_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GArrowDenseUnionScalar, garrow_dense_union_scalar, GARROW, @@ -627,6 +665,7 @@ garrow_dense_union_scalar_new(GArrowDenseUnionDataType *data_type, GArrowScalar *value); #define GARROW_TYPE_EXTENSION_SCALAR (garrow_extension_scalar_get_type()) +GARROW_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( GArrowExtensionScalar, garrow_extension_scalar, GARROW, EXTENSION_SCALAR, GArrowScalar) struct _GArrowExtensionScalarClass diff --git a/c_glib/arrow-glib/schema.h b/c_glib/arrow-glib/schema.h index 93cd5bd542cf8..aab740397b7d6 100644 --- a/c_glib/arrow-glib/schema.h +++ b/c_glib/arrow-glib/schema.h @@ -24,6 +24,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_SCHEMA (garrow_schema_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowSchema, garrow_schema, GARROW, SCHEMA, GObject) struct _GArrowSchemaClass { @@ -34,6 +35,7 @@ GARROW_AVAILABLE_IN_6_0 GArrowSchema * garrow_schema_import(gpointer c_abi_schema, GError **error); +GARROW_AVAILABLE_IN_ALL GArrowSchema * garrow_schema_new(GList *fields); 
@@ -41,34 +43,48 @@ GARROW_AVAILABLE_IN_6_0 gpointer garrow_schema_export(GArrowSchema *schema, GError **error); +GARROW_AVAILABLE_IN_ALL gboolean garrow_schema_equal(GArrowSchema *schema, GArrowSchema *other_schema); + +GARROW_AVAILABLE_IN_ALL GArrowField * garrow_schema_get_field(GArrowSchema *schema, guint i); + +GARROW_AVAILABLE_IN_ALL GArrowField * garrow_schema_get_field_by_name(GArrowSchema *schema, const gchar *name); GARROW_AVAILABLE_IN_0_15 gint garrow_schema_get_field_index(GArrowSchema *schema, const gchar *name); +GARROW_AVAILABLE_IN_ALL guint garrow_schema_n_fields(GArrowSchema *schema); + +GARROW_AVAILABLE_IN_ALL GList * garrow_schema_get_fields(GArrowSchema *schema); +GARROW_AVAILABLE_IN_ALL gchar * garrow_schema_to_string(GArrowSchema *schema); + GARROW_AVAILABLE_IN_0_17 gchar * garrow_schema_to_string_metadata(GArrowSchema *schema, gboolean show_metadata); +GARROW_AVAILABLE_IN_ALL GArrowSchema * garrow_schema_add_field(GArrowSchema *schema, guint i, GArrowField *field, GError **error); +GARROW_AVAILABLE_IN_ALL GArrowSchema * garrow_schema_remove_field(GArrowSchema *schema, guint i, GError **error); + +GARROW_AVAILABLE_IN_ALL GArrowSchema * garrow_schema_replace_field(GArrowSchema *schema, guint i, diff --git a/c_glib/arrow-glib/schema.hpp b/c_glib/arrow-glib/schema.hpp index 333f73391c900..ba6c459495461 100644 --- a/c_glib/arrow-glib/schema.hpp +++ b/c_glib/arrow-glib/schema.hpp @@ -23,7 +23,10 @@ #include +GARROW_EXTERN GArrowSchema * garrow_schema_new_raw(std::shared_ptr *arrow_schema); + +GARROW_EXTERN std::shared_ptr garrow_schema_get_raw(GArrowSchema *schema); diff --git a/c_glib/arrow-glib/table-builder.h b/c_glib/arrow-glib/table-builder.h index 0e13352bbdde3..6fad1ae79a40f 100644 --- a/c_glib/arrow-glib/table-builder.h +++ b/c_glib/arrow-glib/table-builder.h @@ -26,6 +26,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_RECORD_BATCH_BUILDER (garrow_record_batch_builder_get_type()) +GARROW_AVAILABLE_IN_ALL 
G_DECLARE_DERIVABLE_TYPE(GArrowRecordBatchBuilder, garrow_record_batch_builder, GARROW, @@ -36,34 +37,45 @@ struct _GArrowRecordBatchBuilderClass GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchBuilder * garrow_record_batch_builder_new(GArrowSchema *schema, GError **error); +GARROW_AVAILABLE_IN_ALL gint64 garrow_record_batch_builder_get_initial_capacity(GArrowRecordBatchBuilder *builder); + +GARROW_AVAILABLE_IN_ALL void garrow_record_batch_builder_set_initial_capacity(GArrowRecordBatchBuilder *builder, gint64 capacity); +GARROW_AVAILABLE_IN_ALL GArrowSchema * garrow_record_batch_builder_get_schema(GArrowRecordBatchBuilder *builder); #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_13_FOR(garrow_record_batch_builder_get_n_columns) gint garrow_record_batch_builder_get_n_fields(GArrowRecordBatchBuilder *builder); #endif + GARROW_AVAILABLE_IN_0_13 gint garrow_record_batch_builder_get_n_columns(GArrowRecordBatchBuilder *builder); + #ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_ALL GARROW_DEPRECATED_IN_0_13_FOR(garrow_record_batch_builder_get_column_builder) GArrowArrayBuilder * garrow_record_batch_builder_get_field(GArrowRecordBatchBuilder *builder, gint i); #endif + GARROW_AVAILABLE_IN_0_13 GArrowArrayBuilder * garrow_record_batch_builder_get_column_builder(GArrowRecordBatchBuilder *builder, gint i); +GARROW_AVAILABLE_IN_ALL GArrowRecordBatch * garrow_record_batch_builder_flush(GArrowRecordBatchBuilder *builder, GError **error); diff --git a/c_glib/arrow-glib/table.h b/c_glib/arrow-glib/table.h index 1bf64d25a4f3f..d790e413df5fc 100644 --- a/c_glib/arrow-glib/table.h +++ b/c_glib/arrow-glib/table.h @@ -29,6 +29,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_TABLE_CONCATENATE_OPTIONS \ (garrow_table_concatenate_options_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowTableConcatenateOptions, garrow_table_concatenate_options, GARROW, @@ -44,6 +45,7 @@ GArrowTableConcatenateOptions * 
garrow_table_concatenate_options_new(void); #define GARROW_TYPE_TABLE (garrow_table_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowTable, garrow_table, GARROW, TABLE, GObject) struct _GArrowTableClass { @@ -53,18 +55,21 @@ struct _GArrowTableClass GARROW_AVAILABLE_IN_0_12 GArrowTable * garrow_table_new_values(GArrowSchema *schema, GList *values, GError **error); + GARROW_AVAILABLE_IN_0_15 GArrowTable * garrow_table_new_chunked_arrays(GArrowSchema *schema, GArrowChunkedArray **chunked_arrays, gsize n_chunked_arrays, GError **error); + GARROW_AVAILABLE_IN_0_12 GArrowTable * garrow_table_new_arrays(GArrowSchema *schema, GArrowArray **arrays, gsize n_arrays, GError **error); + GARROW_AVAILABLE_IN_0_12 GArrowTable * garrow_table_new_record_batches(GArrowSchema *schema, @@ -72,22 +77,29 @@ garrow_table_new_record_batches(GArrowSchema *schema, gsize n_record_batches, GError **error); +GARROW_AVAILABLE_IN_ALL gboolean garrow_table_equal(GArrowTable *table, GArrowTable *other_table); + GARROW_AVAILABLE_IN_0_17 gboolean garrow_table_equal_metadata(GArrowTable *table, GArrowTable *other_table, gboolean check_metadata); +GARROW_AVAILABLE_IN_ALL GArrowSchema * garrow_table_get_schema(GArrowTable *table); + GARROW_AVAILABLE_IN_0_15 GArrowChunkedArray * garrow_table_get_column_data(GArrowTable *table, gint i); +GARROW_AVAILABLE_IN_ALL guint garrow_table_get_n_columns(GArrowTable *table); + +GARROW_AVAILABLE_IN_ALL guint64 garrow_table_get_n_rows(GArrowTable *table); @@ -98,8 +110,11 @@ garrow_table_add_column(GArrowTable *table, GArrowField *field, GArrowChunkedArray *chunked_array, GError **error); + +GARROW_AVAILABLE_IN_ALL GArrowTable * garrow_table_remove_column(GArrowTable *table, guint i, GError **error); + GARROW_AVAILABLE_IN_0_15 GArrowTable * garrow_table_replace_column(GArrowTable *table, @@ -107,22 +122,28 @@ garrow_table_replace_column(GArrowTable *table, GArrowField *field, GArrowChunkedArray *chunked_array, GError **error); + 
+GARROW_AVAILABLE_IN_ALL gchar * garrow_table_to_string(GArrowTable *table, GError **error); + GARROW_AVAILABLE_IN_0_14 GArrowTable * garrow_table_concatenate(GArrowTable *table, GList *other_tables, GArrowTableConcatenateOptions *options, GError **error); + GARROW_AVAILABLE_IN_0_14 GArrowTable * garrow_table_slice(GArrowTable *table, gint64 offset, gint64 length); + GARROW_AVAILABLE_IN_0_16 GArrowTable * garrow_table_combine_chunks(GArrowTable *table, GError **error); #define GARROW_TYPE_FEATHER_WRITE_PROPERTIES (garrow_feather_write_properties_get_type()) +GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE(GArrowFeatherWriteProperties, garrow_feather_write_properties, GARROW, diff --git a/c_glib/arrow-glib/table.hpp b/c_glib/arrow-glib/table.hpp index 3077c2ece9b37..79fc97471a42c 100644 --- a/c_glib/arrow-glib/table.hpp +++ b/c_glib/arrow-glib/table.hpp @@ -24,10 +24,14 @@ #include +GARROW_EXTERN GArrowTable * garrow_table_new_raw(std::shared_ptr *arrow_table); + +GARROW_EXTERN std::shared_ptr garrow_table_get_raw(GArrowTable *table); +GARROW_EXTERN arrow::ipc::feather::WriteProperties * garrow_feather_write_properties_get_raw(GArrowFeatherWriteProperties *properties); diff --git a/c_glib/arrow-glib/tensor.h b/c_glib/arrow-glib/tensor.h index a6d11b248110e..5971c3af12600 100644 --- a/c_glib/arrow-glib/tensor.h +++ b/c_glib/arrow-glib/tensor.h @@ -25,12 +25,14 @@ G_BEGIN_DECLS #define GARROW_TYPE_TENSOR (garrow_tensor_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowTensor, garrow_tensor, GARROW, TENSOR, GObject) struct _GArrowTensorClass { GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowTensor * garrow_tensor_new(GArrowDataType *data_type, GArrowBuffer *data, @@ -40,30 +42,55 @@ garrow_tensor_new(GArrowDataType *data_type, gsize n_strides, gchar **dimension_names, gsize n_dimension_names); +GARROW_AVAILABLE_IN_ALL gboolean garrow_tensor_equal(GArrowTensor *tensor, GArrowTensor *other_tensor); + +GARROW_AVAILABLE_IN_ALL 
GArrowDataType * garrow_tensor_get_value_data_type(GArrowTensor *tensor); + +GARROW_AVAILABLE_IN_ALL GArrowType garrow_tensor_get_value_type(GArrowTensor *tensor); + +GARROW_AVAILABLE_IN_ALL GArrowBuffer * garrow_tensor_get_buffer(GArrowTensor *tensor); + +GARROW_AVAILABLE_IN_ALL gint64 * garrow_tensor_get_shape(GArrowTensor *tensor, gint *n_dimensions); + +GARROW_AVAILABLE_IN_ALL gint64 * garrow_tensor_get_strides(GArrowTensor *tensor, gint *n_strides); + +GARROW_AVAILABLE_IN_ALL gint garrow_tensor_get_n_dimensions(GArrowTensor *tensor); + +GARROW_AVAILABLE_IN_ALL const gchar * garrow_tensor_get_dimension_name(GArrowTensor *tensor, gint i); + +GARROW_AVAILABLE_IN_ALL gint64 garrow_tensor_get_size(GArrowTensor *tensor); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_tensor_is_mutable(GArrowTensor *tensor); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_tensor_is_contiguous(GArrowTensor *tensor); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_tensor_is_row_major(GArrowTensor *tensor); + +GARROW_AVAILABLE_IN_ALL gboolean garrow_tensor_is_column_major(GArrowTensor *tensor); diff --git a/c_glib/arrow-glib/timestamp-parser.h b/c_glib/arrow-glib/timestamp-parser.h index 05cad54746eeb..a7265d6ef46fb 100644 --- a/c_glib/arrow-glib/timestamp-parser.h +++ b/c_glib/arrow-glib/timestamp-parser.h @@ -26,6 +26,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_TIMESTAMP_PARSER (garrow_timestamp_parser_get_type()) +GARROW_AVAILABLE_IN_16_0 G_DECLARE_DERIVABLE_TYPE( GArrowTimestampParser, garrow_timestamp_parser, GARROW, TIMESTAMP_PARSER, GObject) struct _GArrowTimestampParserClass @@ -39,6 +40,7 @@ garrow_timestamp_parser_get_kind(GArrowTimestampParser *parser); #define GARROW_TYPE_STRPTIME_TIMESTAMP_PARSER \ (garrow_strptime_timestamp_parser_get_type()) +GARROW_AVAILABLE_IN_16_0 G_DECLARE_DERIVABLE_TYPE(GArrowStrptimeTimestampParser, garrow_strptime_timestamp_parser, GARROW, @@ -58,6 +60,7 @@ const gchar * garrow_strptime_timestamp_parser_get_format(GArrowStrptimeTimestampParser *parser); #define 
GARROW_TYPE_ISO8601_TIMESTAMP_PARSER (garrow_iso8601_timestamp_parser_get_type()) +GARROW_AVAILABLE_IN_16_0 G_DECLARE_DERIVABLE_TYPE(GArrowISO8601TimestampParser, garrow_iso8601_timestamp_parser, GARROW, diff --git a/c_glib/arrow-glib/version.h.in b/c_glib/arrow-glib/version.h.in index a83c68a2a16dc..b530a088c8e38 100644 --- a/c_glib/arrow-glib/version.h.in +++ b/c_glib/arrow-glib/version.h.in @@ -19,6 +19,8 @@ #pragma once +#include + /** * SECTION: version * @section_id: version-macros @@ -36,7 +38,7 @@ * * Since: 0.10.0 */ -#define GARROW_VERSION_MAJOR (@GARROW_VERSION_MAJOR@) +#define GARROW_VERSION_MAJOR (@VERSION_MAJOR@) /** * GARROW_VERSION_MINOR: @@ -45,7 +47,7 @@ * * Since: 0.10.0 */ -#define GARROW_VERSION_MINOR (@GARROW_VERSION_MINOR@) +#define GARROW_VERSION_MINOR (@VERSION_MINOR@) /** * GARROW_VERSION_MICRO: @@ -54,7 +56,7 @@ * * Since: 0.10.0 */ -#define GARROW_VERSION_MICRO (@GARROW_VERSION_MICRO@) +#define GARROW_VERSION_MICRO (@VERSION_MICRO@) /** * GARROW_VERSION_TAG: @@ -64,7 +66,7 @@ * * Since: 0.10.0 */ -#define GARROW_VERSION_TAG "@GARROW_VERSION_TAG@" +#define GARROW_VERSION_TAG "@VERSION_TAG@" /** * GARROW_VERSION_CHECK: @@ -108,212 +110,7 @@ # define GARROW_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) #endif -/** - * GARROW_VERSION_16_0: - * - * You can use this macro value for compile time API version check. - * - * Since: 16.0.0 - */ -#define GARROW_VERSION_16_0 G_ENCODE_VERSION(16, 0) - -/** - * GARROW_VERSION_15_0: - * - * You can use this macro value for compile time API version check. - * - * Since: 15.0.0 - */ -#define GARROW_VERSION_15_0 G_ENCODE_VERSION(15, 0) - -/** - * GARROW_VERSION_14_0: - * - * You can use this macro value for compile time API version check. - * - * Since: 14.0.0 - */ -#define GARROW_VERSION_14_0 G_ENCODE_VERSION(14, 0) - -/** - * GARROW_VERSION_13_0: - * - * You can use this macro value for compile time API version check. 
- * - * Since: 13.0.0 - */ -#define GARROW_VERSION_13_0 G_ENCODE_VERSION(13, 0) - -/** - * GARROW_VERSION_12_0: - * - * You can use this macro value for compile time API version check. - * - * Since: 12.0.0 - */ -#define GARROW_VERSION_12_0 G_ENCODE_VERSION(12, 0) - -/** - * GARROW_VERSION_11_0: - * - * You can use this macro value for compile time API version check. - * - * Since: 11.0.0 - */ -#define GARROW_VERSION_11_0 G_ENCODE_VERSION(11, 0) - -/** - * GARROW_VERSION_10_0: - * - * You can use this macro value for compile time API version check. - * - * Since: 10.0.0 - */ -#define GARROW_VERSION_10_0 G_ENCODE_VERSION(10, 0) - -/** - * GARROW_VERSION_9_0: - * - * You can use this macro value for compile time API version check. - * - * Since: 9.0.0 - */ -#define GARROW_VERSION_9_0 G_ENCODE_VERSION(9, 0) - -/** - * GARROW_VERSION_8_0: - * - * You can use this macro value for compile time API version check. - * - * Since: 8.0.0 - */ -#define GARROW_VERSION_8_0 G_ENCODE_VERSION(8, 0) - -/** - * GARROW_VERSION_7_0: - * - * You can use this macro value for compile time API version check. - * - * Since: 7.0.0 - */ -#define GARROW_VERSION_7_0 G_ENCODE_VERSION(7, 0) - -/** - * GARROW_VERSION_6_0: - * - * You can use this macro value for compile time API version check. - * - * Since: 6.0.0 - */ -#define GARROW_VERSION_6_0 G_ENCODE_VERSION(6, 0) - -/** - * GARROW_VERSION_5_0: - * - * You can use this macro value for compile time API version check. - * - * Since: 5.0.0 - */ -#define GARROW_VERSION_5_0 G_ENCODE_VERSION(5, 0) - -/** - * GARROW_VERSION_4_0: - * - * You can use this macro value for compile time API version check. - * - * Since: 4.0.0 - */ -#define GARROW_VERSION_4_0 G_ENCODE_VERSION(4, 0) - -/** - * GARROW_VERSION_3_0: - * - * You can use this macro value for compile time API version check. 
- * - * Since: 3.0.0 - */ -#define GARROW_VERSION_3_0 G_ENCODE_VERSION(3, 0) - -/** - * GARROW_VERSION_2_0: - * - * You can use this macro value for compile time API version check. - * - * Since: 2.0.0 - */ -#define GARROW_VERSION_2_0 G_ENCODE_VERSION(2, 0) - -/** - * GARROW_VERSION_1_0: - * - * You can use this macro value for compile time API version check. - * - * Since: 1.0.0 - */ -#define GARROW_VERSION_1_0 G_ENCODE_VERSION(1, 0) - -/** - * GARROW_VERSION_0_17: - * - * You can use this macro value for compile time API version check. - * - * Since: 0.17.0 - */ -#define GARROW_VERSION_0_17 G_ENCODE_VERSION(0, 17) - -/** - * GARROW_VERSION_0_16: - * - * You can use this macro value for compile time API version check. - * - * Since: 0.16.0 - */ -#define GARROW_VERSION_0_16 G_ENCODE_VERSION(0, 16) - -/** - * GARROW_VERSION_0_15: - * - * You can use this macro value for compile time API version check. - * - * Since: 0.15.0 - */ -#define GARROW_VERSION_0_15 G_ENCODE_VERSION(0, 15) - -/** - * GARROW_VERSION_0_14: - * - * You can use this macro value for compile time API version check. - * - * Since: 0.14.0 - */ -#define GARROW_VERSION_0_14 G_ENCODE_VERSION(0, 14) - -/** - * GARROW_VERSION_0_13: - * - * You can use this macro value for compile time API version check. - * - * Since: 0.13.0 - */ -#define GARROW_VERSION_0_13 G_ENCODE_VERSION(0, 13) - -/** - * GARROW_VERSION_0_12: - * - * You can use this macro value for compile time API version check. - * - * Since: 0.12.0 - */ -#define GARROW_VERSION_0_12 G_ENCODE_VERSION(0, 12) - -/** - * GARROW_VERSION_0_10: - * - * You can use this macro value for compile time API version check. 
- * - * Since: 0.10.0 - */ -#define GARROW_VERSION_0_10 G_ENCODE_VERSION(0, 10) +@ENCODED_VERSIONS@ /** * GARROW_VERSION_MIN_REQUIRED: @@ -359,327 +156,6 @@ G_ENCODE_VERSION(GARROW_VERSION_MAJOR, GARROW_VERSION_MINOR) #endif +@VISIBILITY_MACROS@ -#define GARROW_AVAILABLE_IN_ALL - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_16_0 -# define GARROW_DEPRECATED_IN_16_0 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_16_0_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_16_0 -# define GARROW_DEPRECATED_IN_16_0_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_16_0 -# define GARROW_AVAILABLE_IN_16_0 GARROW_UNAVAILABLE(16, 0) -#else -# define GARROW_AVAILABLE_IN_16_0 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_15_0 -# define GARROW_DEPRECATED_IN_15_0 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_15_0_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_15_0 -# define GARROW_DEPRECATED_IN_15_0_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_15_0 -# define GARROW_AVAILABLE_IN_15_0 GARROW_UNAVAILABLE(15, 0) -#else -# define GARROW_AVAILABLE_IN_15_0 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_14_0 -# define GARROW_DEPRECATED_IN_14_0 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_14_0_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_14_0 -# define GARROW_DEPRECATED_IN_14_0_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_14_0 -# define GARROW_AVAILABLE_IN_14_0 GARROW_UNAVAILABLE(14, 0) -#else -# define GARROW_AVAILABLE_IN_14_0 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_13_0 -# define GARROW_DEPRECATED_IN_13_0 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_13_0_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_13_0 -# define GARROW_DEPRECATED_IN_13_0_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < 
GARROW_VERSION_13_0 -# define GARROW_AVAILABLE_IN_13_0 GARROW_UNAVAILABLE(13, 0) -#else -# define GARROW_AVAILABLE_IN_13_0 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_12_0 -# define GARROW_DEPRECATED_IN_12_0 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_12_0_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_12_0 -# define GARROW_DEPRECATED_IN_12_0_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_12_0 -# define GARROW_AVAILABLE_IN_12_0 GARROW_UNAVAILABLE(12, 0) -#else -# define GARROW_AVAILABLE_IN_12_0 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_11_0 -# define GARROW_DEPRECATED_IN_11_0 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_11_0_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_11_0 -# define GARROW_DEPRECATED_IN_11_0_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_11_0 -# define GARROW_AVAILABLE_IN_11_0 GARROW_UNAVAILABLE(11, 0) -#else -# define GARROW_AVAILABLE_IN_11_0 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_10_0 -# define GARROW_DEPRECATED_IN_10_0 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_10_0_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_10_0 -# define GARROW_DEPRECATED_IN_10_0_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_10_0 -# define GARROW_AVAILABLE_IN_10_0 GARROW_UNAVAILABLE(10, 0) -#else -# define GARROW_AVAILABLE_IN_10_0 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_9_0 -# define GARROW_DEPRECATED_IN_9_0 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_9_0_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_9_0 -# define GARROW_DEPRECATED_IN_9_0_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_9_0 -# define GARROW_AVAILABLE_IN_9_0 GARROW_UNAVAILABLE(9, 0) -#else -# define GARROW_AVAILABLE_IN_9_0 -#endif - -#if 
GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_8_0 -# define GARROW_DEPRECATED_IN_8_0 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_8_0_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_8_0 -# define GARROW_DEPRECATED_IN_8_0_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_8_0 -# define GARROW_AVAILABLE_IN_8_0 GARROW_UNAVAILABLE(8, 0) -#else -# define GARROW_AVAILABLE_IN_8_0 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_7_0 -# define GARROW_DEPRECATED_IN_7_0 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_7_0_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_7_0 -# define GARROW_DEPRECATED_IN_7_0_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_7_0 -# define GARROW_AVAILABLE_IN_7_0 GARROW_UNAVAILABLE(7, 0) -#else -# define GARROW_AVAILABLE_IN_7_0 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_6_0 -# define GARROW_DEPRECATED_IN_6_0 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_6_0_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_6_0 -# define GARROW_DEPRECATED_IN_6_0_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_6_0 -# define GARROW_AVAILABLE_IN_6_0 GARROW_UNAVAILABLE(6, 0) -#else -# define GARROW_AVAILABLE_IN_6_0 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_5_0 -# define GARROW_DEPRECATED_IN_5_0 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_5_0_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_5_0 -# define GARROW_DEPRECATED_IN_5_0_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_5_0 -# define GARROW_AVAILABLE_IN_5_0 GARROW_UNAVAILABLE(5, 0) -#else -# define GARROW_AVAILABLE_IN_5_0 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_4_0 -# define GARROW_DEPRECATED_IN_4_0 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_4_0_FOR(function) GARROW_DEPRECATED_FOR(function) 
-#else -# define GARROW_DEPRECATED_IN_4_0 -# define GARROW_DEPRECATED_IN_4_0_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_4_0 -# define GARROW_AVAILABLE_IN_4_0 GARROW_UNAVAILABLE(4, 0) -#else -# define GARROW_AVAILABLE_IN_4_0 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_3_0 -# define GARROW_DEPRECATED_IN_3_0 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_3_0_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_3_0 -# define GARROW_DEPRECATED_IN_3_0_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_3_0 -# define GARROW_AVAILABLE_IN_3_0 GARROW_UNAVAILABLE(3, 0) -#else -# define GARROW_AVAILABLE_IN_3_0 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_2_0 -# define GARROW_DEPRECATED_IN_2_0 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_2_0_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_2_0 -# define GARROW_DEPRECATED_IN_2_0_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_2_0 -# define GARROW_AVAILABLE_IN_2_0 GARROW_UNAVAILABLE(2, 0) -#else -# define GARROW_AVAILABLE_IN_2_0 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_1_0 -# define GARROW_DEPRECATED_IN_1_0 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_1_0_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_1_0 -# define GARROW_DEPRECATED_IN_1_0_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_1_0 -# define GARROW_AVAILABLE_IN_1_0 GARROW_UNAVAILABLE(1, 0) -#else -# define GARROW_AVAILABLE_IN_1_0 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_17 -# define GARROW_DEPRECATED_IN_0_17 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_0_17_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_0_17 -# define GARROW_DEPRECATED_IN_0_17_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_0_17 -# define 
GARROW_AVAILABLE_IN_0_17 GARROW_UNAVAILABLE(0, 17) -#else -# define GARROW_AVAILABLE_IN_0_17 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_16 -# define GARROW_DEPRECATED_IN_0_16 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_0_16_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_0_16 -# define GARROW_DEPRECATED_IN_0_16_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_0_16 -# define GARROW_AVAILABLE_IN_0_16 GARROW_UNAVAILABLE(0, 16) -#else -# define GARROW_AVAILABLE_IN_0_16 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_15 -# define GARROW_DEPRECATED_IN_0_15 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_0_15_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_0_15 -# define GARROW_DEPRECATED_IN_0_15_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_0_15 -# define GARROW_AVAILABLE_IN_0_15 GARROW_UNAVAILABLE(0, 15) -#else -# define GARROW_AVAILABLE_IN_0_15 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_14 -# define GARROW_DEPRECATED_IN_0_14 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_0_14_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_0_14 -# define GARROW_DEPRECATED_IN_0_14_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_0_14 -# define GARROW_AVAILABLE_IN_0_14 GARROW_UNAVAILABLE(0, 14) -#else -# define GARROW_AVAILABLE_IN_0_14 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_13 -# define GARROW_DEPRECATED_IN_0_13 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_0_13_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_0_13 -# define GARROW_DEPRECATED_IN_0_13_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_0_13 -# define GARROW_AVAILABLE_IN_0_13 GARROW_UNAVAILABLE(0, 13) -#else -# define GARROW_AVAILABLE_IN_0_13 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= 
GARROW_VERSION_0_12 -# define GARROW_DEPRECATED_IN_0_12 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_0_12_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_0_12 -# define GARROW_DEPRECATED_IN_0_12_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_0_12 -# define GARROW_AVAILABLE_IN_0_12 GARROW_UNAVAILABLE(0, 12) -#else -# define GARROW_AVAILABLE_IN_0_12 -#endif - -#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_10 -# define GARROW_DEPRECATED_IN_0_10 GARROW_DEPRECATED -# define GARROW_DEPRECATED_IN_0_10_FOR(function) GARROW_DEPRECATED_FOR(function) -#else -# define GARROW_DEPRECATED_IN_0_10 -# define GARROW_DEPRECATED_IN_0_10_FOR(function) -#endif - -#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_0_10 -# define GARROW_AVAILABLE_IN_0_10 GARROW_UNAVAILABLE(0, 10) -#else -# define GARROW_AVAILABLE_IN_0_10 -#endif +@AVAILABILITY_MACROS@ diff --git a/c_glib/arrow-glib/writable-file.h b/c_glib/arrow-glib/writable-file.h index 555705767e4aa..e9aa9122e92fa 100644 --- a/c_glib/arrow-glib/writable-file.h +++ b/c_glib/arrow-glib/writable-file.h @@ -24,9 +24,11 @@ G_BEGIN_DECLS #define GARROW_TYPE_WRITABLE_FILE (garrow_writable_file_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_INTERFACE( GArrowWritableFile, garrow_writable_file, GARROW, WRITABLE_FILE, GObject) +GARROW_AVAILABLE_IN_ALL gboolean garrow_writable_file_write_at(GArrowWritableFile *writable_file, gint64 position, diff --git a/c_glib/arrow-glib/writable.h b/c_glib/arrow-glib/writable.h index a556443967b5a..dcc1e67668e78 100644 --- a/c_glib/arrow-glib/writable.h +++ b/c_glib/arrow-glib/writable.h @@ -24,13 +24,17 @@ G_BEGIN_DECLS #define GARROW_TYPE_WRITABLE (garrow_writable_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_INTERFACE(GArrowWritable, garrow_writable, GARROW, WRITABLE, GObject) +GARROW_AVAILABLE_IN_ALL gboolean garrow_writable_write(GArrowWritable *writable, const guint8 *data, gint64 n_bytes, GError **error); + +GARROW_AVAILABLE_IN_ALL 
gboolean garrow_writable_flush(GArrowWritable *writable, GError **error); diff --git a/c_glib/arrow-glib/writer.h b/c_glib/arrow-glib/writer.h index 30b0ea987da39..46bbdddec8c9d 100644 --- a/c_glib/arrow-glib/writer.h +++ b/c_glib/arrow-glib/writer.h @@ -28,6 +28,7 @@ G_BEGIN_DECLS #define GARROW_TYPE_RECORD_BATCH_WRITER (garrow_record_batch_writer_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowRecordBatchWriter, garrow_record_batch_writer, GARROW, @@ -38,19 +39,23 @@ struct _GArrowRecordBatchWriterClass GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_ALL gboolean garrow_record_batch_writer_write_record_batch(GArrowRecordBatchWriter *writer, GArrowRecordBatch *record_batch, GError **error); +GARROW_AVAILABLE_IN_ALL gboolean garrow_record_batch_writer_write_table(GArrowRecordBatchWriter *writer, GArrowTable *table, GError **error); +GARROW_AVAILABLE_IN_ALL gboolean garrow_record_batch_writer_close(GArrowRecordBatchWriter *writer, GError **error); #define GARROW_TYPE_RECORD_BATCH_STREAM_WRITER \ (garrow_record_batch_stream_writer_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowRecordBatchStreamWriter, garrow_record_batch_stream_writer, GARROW, @@ -61,12 +66,14 @@ struct _GArrowRecordBatchStreamWriterClass GArrowRecordBatchWriterClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchStreamWriter * garrow_record_batch_stream_writer_new(GArrowOutputStream *sink, GArrowSchema *schema, GError **error); #define GARROW_TYPE_RECORD_BATCH_FILE_WRITER (garrow_record_batch_file_writer_get_type()) +GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowRecordBatchFileWriter, garrow_record_batch_file_writer, GARROW, @@ -77,6 +84,7 @@ struct _GArrowRecordBatchFileWriterClass GArrowRecordBatchStreamWriterClass parent_class; }; +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchFileWriter * garrow_record_batch_file_writer_new(GArrowOutputStream *sink, GArrowSchema *schema, diff --git a/c_glib/example/vala/meson.build 
b/c_glib/example/vala/meson.build index 474f0b1e9a51a..b7eb86200ddd6 100644 --- a/c_glib/example/vala/meson.build +++ b/c_glib/example/vala/meson.build @@ -18,11 +18,15 @@ # under the License. if generate_vapi + c_flags = [ + '-Wno-unused-but-set-variable', + ] + c_flags = meson.get_compiler('c').get_supported_arguments(c_flags) vala_example_executable_kwargs = { 'c_args': [ '-I' + project_build_root, '-I' + project_source_root, - ], + ] + c_flags, 'dependencies': [ arrow_glib_vapi, dependency('gio-2.0'), diff --git a/c_glib/example/vala/read-file.vala b/c_glib/example/vala/read-file.vala index a0a06275c4b24..287eddac76352 100644 --- a/c_glib/example/vala/read-file.vala +++ b/c_glib/example/vala/read-file.vala @@ -119,8 +119,8 @@ void print_array(GArrow.Array array) { void print_record_batch(GArrow.RecordBatch record_batch) { var n_columns = record_batch.get_n_columns(); - for (var nth_column = 0; nth_column < n_columns; nth_column++) { - stdout.printf("columns[%" + int64.FORMAT + "](%s): ", + for (int nth_column = 0; nth_column < n_columns; nth_column++) { + stdout.printf("columns[%d](%s): ", nth_column, record_batch.get_column_name(nth_column)); var array = record_batch.get_column_data(nth_column); diff --git a/c_glib/example/vala/read-stream.vala b/c_glib/example/vala/read-stream.vala index c58dc848930a8..4520c8609bdaf 100644 --- a/c_glib/example/vala/read-stream.vala +++ b/c_glib/example/vala/read-stream.vala @@ -119,8 +119,8 @@ void print_array(GArrow.Array array) { void print_record_batch(GArrow.RecordBatch record_batch) { var n_columns = record_batch.get_n_columns(); - for (var nth_column = 0; nth_column < n_columns; nth_column++) { - stdout.printf("columns[%" + int64.FORMAT + "](%s): ", + for (int nth_column = 0; nth_column < n_columns; nth_column++) { + stdout.printf("columns[%d](%s): ", nth_column, record_batch.get_column_name(nth_column)); var array = record_batch.get_column_data(nth_column); diff --git a/c_glib/gandiva-glib/enums.h.template 
b/c_glib/gandiva-glib/enums.h.template index b7d3c99c0bef8..d362e14c1b2cb 100644 --- a/c_glib/gandiva-glib/enums.h.template +++ b/c_glib/gandiva-glib/enums.h.template @@ -22,6 +22,8 @@ #include +#include + G_BEGIN_DECLS /*** END file-header ***/ @@ -31,6 +33,7 @@ G_BEGIN_DECLS /*** END file-production ***/ /*** BEGIN value-header ***/ +GGANDIVA_AVAILABLE_IN_ALL GType @enum_name@_get_type(void) G_GNUC_CONST; #define @ENUMPREFIX@_TYPE_@ENUMSHORT@ (@enum_name@_get_type()) /*** END value-header ***/ diff --git a/c_glib/gandiva-glib/expression.h b/c_glib/gandiva-glib/expression.h index f8f061ceb08fa..bb7eb22ac01dc 100644 --- a/c_glib/gandiva-glib/expression.h +++ b/c_glib/gandiva-glib/expression.h @@ -26,6 +26,7 @@ G_BEGIN_DECLS #define GGANDIVA_TYPE_EXPRESSION (ggandiva_expression_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE( GGandivaExpression, ggandiva_expression, GGANDIVA, EXPRESSION, GObject) @@ -34,12 +35,16 @@ struct _GGandivaExpressionClass GObjectClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaExpression * ggandiva_expression_new(GGandivaNode *root_node, GArrowField *result_field); + +GGANDIVA_AVAILABLE_IN_0_12 gchar * ggandiva_expression_to_string(GGandivaExpression *expression); #define GGANDIVA_TYPE_CONDITION (ggandiva_condition_get_type()) +GGANDIVA_AVAILABLE_IN_4_0 G_DECLARE_DERIVABLE_TYPE( GGandivaCondition, ggandiva_condition, GGANDIVA, CONDITION, GGandivaExpression) diff --git a/c_glib/gandiva-glib/filter.h b/c_glib/gandiva-glib/filter.h index b95981198e0c4..0a2199ccfa106 100644 --- a/c_glib/gandiva-glib/filter.h +++ b/c_glib/gandiva-glib/filter.h @@ -25,6 +25,7 @@ G_BEGIN_DECLS #define GGANDIVA_TYPE_FILTER (ggandiva_filter_get_type()) +GGANDIVA_AVAILABLE_IN_4_0 G_DECLARE_DERIVABLE_TYPE(GGandivaFilter, ggandiva_filter, GGANDIVA, FILTER, GObject) struct _GGandivaFilterClass @@ -32,8 +33,11 @@ struct _GGandivaFilterClass GObjectClass parent_class; }; +GGANDIVA_AVAILABLE_IN_4_0 GGandivaFilter * 
ggandiva_filter_new(GArrowSchema *schema, GGandivaCondition *condition, GError **error); + +GGANDIVA_AVAILABLE_IN_4_0 gboolean ggandiva_filter_evaluate(GGandivaFilter *filter, GArrowRecordBatch *record_batch, diff --git a/c_glib/gandiva-glib/function-registry.h b/c_glib/gandiva-glib/function-registry.h index ed21e120a2533..e13f4b36d28dc 100644 --- a/c_glib/gandiva-glib/function-registry.h +++ b/c_glib/gandiva-glib/function-registry.h @@ -24,6 +24,7 @@ G_BEGIN_DECLS #define GGANDIVA_TYPE_FUNCTION_REGISTRY (ggandiva_function_registry_get_type()) +GGANDIVA_AVAILABLE_IN_14_0 G_DECLARE_DERIVABLE_TYPE(GGandivaFunctionRegistry, ggandiva_function_registry, GGANDIVA, @@ -35,14 +36,20 @@ struct _GGandivaFunctionRegistryClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_15_0 +GGANDIVA_AVAILABLE_IN_15_0 GGandivaFunctionRegistry * ggandiva_function_registry_default(void); + +GGANDIVA_AVAILABLE_IN_14_0 GGandivaFunctionRegistry * ggandiva_function_registry_new(void); + +GGANDIVA_AVAILABLE_IN_14_0 GGandivaNativeFunction * ggandiva_function_registry_lookup(GGandivaFunctionRegistry *function_registry, GGandivaFunctionSignature *function_signature); + +GGANDIVA_AVAILABLE_IN_14_0 GList * ggandiva_function_registry_get_native_functions( GGandivaFunctionRegistry *function_registry); diff --git a/c_glib/gandiva-glib/function-signature.h b/c_glib/gandiva-glib/function-signature.h index ef6834ea85723..4fd8cc8a7e761 100644 --- a/c_glib/gandiva-glib/function-signature.h +++ b/c_glib/gandiva-glib/function-signature.h @@ -21,9 +21,12 @@ #include +#include + G_BEGIN_DECLS #define GGANDIVA_TYPE_FUNCTION_SIGNATURE (ggandiva_function_signature_get_type()) +GGANDIVA_AVAILABLE_IN_14_0 G_DECLARE_DERIVABLE_TYPE(GGandivaFunctionSignature, ggandiva_function_signature, GGANDIVA, @@ -35,20 +38,31 @@ struct _GGandivaFunctionSignatureClass GObjectClass parent_class; }; +GGANDIVA_AVAILABLE_IN_14_0 GGandivaFunctionSignature * ggandiva_function_signature_new(const gchar *base_name, GList 
*parameter_types, GArrowDataType *return_type); + +GGANDIVA_AVAILABLE_IN_14_0 gboolean ggandiva_function_signature_equal(GGandivaFunctionSignature *function_signature, GGandivaFunctionSignature *other_function_signature); + +GGANDIVA_AVAILABLE_IN_14_0 gchar * ggandiva_function_signature_to_string(GGandivaFunctionSignature *function_signature); + +GGANDIVA_AVAILABLE_IN_14_0 GArrowDataType * ggandiva_function_signature_get_return_type( GGandivaFunctionSignature *function_signature); + +GGANDIVA_AVAILABLE_IN_14_0 gchar * ggandiva_function_signature_get_base_name(GGandivaFunctionSignature *function_signature); + +GGANDIVA_AVAILABLE_IN_14_0 GList * ggandiva_function_signature_get_param_types( GGandivaFunctionSignature *function_signature); diff --git a/c_glib/gandiva-glib/meson.build b/c_glib/gandiva-glib/meson.build index d5cab109dcf89..94b923388b7f2 100644 --- a/c_glib/gandiva-glib/meson.build +++ b/c_glib/gandiva-glib/meson.build @@ -53,14 +53,12 @@ cpp_headers = files( 'selection-vector.hpp', ) -version_h_conf = configuration_data() -version_h_conf.set('GGANDIVA_VERSION_MAJOR', version_major) -version_h_conf.set('GGANDIVA_VERSION_MINOR', version_minor) -version_h_conf.set('GGANDIVA_VERSION_MICRO', version_micro) -version_h_conf.set('GGANDIVA_VERSION_TAG', version_tag) -version_h = configure_file(input: 'version.h.in', - output: 'version.h', - configuration: version_h_conf) +version_h = configure_file( + input: 'version.h.in', + output: 'version.h', + command: [python3, generate_version_header_py, '--library', 'GGANDIVA', '--version', version, '--input', '@INPUT@', '--output', '@OUTPUT@'], +) + c_headers += version_h enums = gnome.mkenums('enums', @@ -87,6 +85,8 @@ libgandiva_glib = library('gandiva-glib', dependencies: dependencies, implicit_include_directories: false, include_directories: base_include_directories, + cpp_args: ['-DGGANDIVA_COMPILATION'], + c_args: ['-DGGANDIVA_COMPILATION'], soversion: so_version, version: library_version) gandiva_glib = 
declare_dependency(link_with: libgandiva_glib, diff --git a/c_glib/gandiva-glib/native-function.h b/c_glib/gandiva-glib/native-function.h index 5ceef396ef40c..934d29ab7e33b 100644 --- a/c_glib/gandiva-glib/native-function.h +++ b/c_glib/gandiva-glib/native-function.h @@ -40,6 +40,7 @@ typedef enum { } GGandivaResultNullableType; #define GGANDIVA_TYPE_NATIVE_FUNCTION (ggandiva_native_function_get_type()) +GGANDIVA_AVAILABLE_IN_14_0 G_DECLARE_DERIVABLE_TYPE( GGandivaNativeFunction, ggandiva_native_function, GGANDIVA, NATIVE_FUNCTION, GObject) @@ -48,20 +49,33 @@ struct _GGandivaNativeFunctionClass GObjectClass parent_class; }; +GGANDIVA_AVAILABLE_IN_14_0 GList * ggandiva_native_function_get_signatures(GGandivaNativeFunction *native_function); + +GGANDIVA_AVAILABLE_IN_14_0 gboolean ggandiva_native_function_equal(GGandivaNativeFunction *native_function, GGandivaNativeFunction *other_native_function); + +GGANDIVA_AVAILABLE_IN_14_0 gchar * ggandiva_native_function_to_string(GGandivaNativeFunction *native_function); + +GGANDIVA_AVAILABLE_IN_14_0 GGandivaResultNullableType ggandiva_native_function_get_result_nullable_type( GGandivaNativeFunction *native_function); + +GGANDIVA_AVAILABLE_IN_14_0 gboolean ggandiva_native_function_need_context(GGandivaNativeFunction *native_function); + +GGANDIVA_AVAILABLE_IN_14_0 gboolean ggandiva_native_function_need_function_holder(GGandivaNativeFunction *native_function); + +GGANDIVA_AVAILABLE_IN_14_0 gboolean ggandiva_native_function_can_return_errors(GGandivaNativeFunction *native_function); diff --git a/c_glib/gandiva-glib/node.h b/c_glib/gandiva-glib/node.h index 715a3d6ebaf18..1733cac918c51 100644 --- a/c_glib/gandiva-glib/node.h +++ b/c_glib/gandiva-glib/node.h @@ -26,6 +26,7 @@ G_BEGIN_DECLS #define GGANDIVA_TYPE_NODE (ggandiva_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GGandivaNode, ggandiva_node, GGANDIVA, NODE, GObject) struct _GGandivaNodeClass @@ -33,10 +34,12 @@ struct _GGandivaNodeClass GObjectClass 
parent_class; }; +GGANDIVA_AVAILABLE_IN_0_16 gchar * ggandiva_node_to_string(GGandivaNode *node); #define GGANDIVA_TYPE_FIELD_NODE (ggandiva_field_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE( GGandivaFieldNode, ggandiva_field_node, GGANDIVA, FIELD_NODE, GGandivaNode) struct _GGandivaFieldNodeClass @@ -44,10 +47,12 @@ struct _GGandivaFieldNodeClass GGandivaNodeClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaFieldNode * ggandiva_field_node_new(GArrowField *field); #define GGANDIVA_TYPE_FUNCTION_NODE (ggandiva_function_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE( GGandivaFunctionNode, ggandiva_function_node, GGANDIVA, FUNCTION_NODE, GGandivaNode) struct _GGandivaFunctionNodeClass @@ -55,14 +60,18 @@ struct _GGandivaFunctionNodeClass GGandivaNodeClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaFunctionNode * ggandiva_function_node_new(const gchar *name, GList *parameters, GArrowDataType *return_type); + +GGANDIVA_AVAILABLE_IN_0_12 GList * ggandiva_function_node_get_parameters(GGandivaFunctionNode *node); #define GGANDIVA_TYPE_LITERAL_NODE (ggandiva_literal_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE( GGandivaLiteralNode, ggandiva_literal_node, GGANDIVA, LITERAL_NODE, GGandivaNode) struct _GGandivaLiteralNodeClass @@ -71,6 +80,7 @@ struct _GGandivaLiteralNodeClass }; #define GGANDIVA_TYPE_NULL_LITERAL_NODE (ggandiva_null_literal_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GGandivaNullLiteralNode, ggandiva_null_literal_node, GGANDIVA, @@ -81,10 +91,12 @@ struct _GGandivaNullLiteralNodeClass GGandivaLiteralNodeClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaNullLiteralNode * ggandiva_null_literal_node_new(GArrowDataType *return_type, GError **error); #define GGANDIVA_TYPE_BOOLEAN_LITERAL_NODE (ggandiva_boolean_literal_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GGandivaBooleanLiteralNode, 
ggandiva_boolean_literal_node, GGANDIVA, @@ -95,12 +107,16 @@ struct _GGandivaBooleanLiteralNodeClass GGandivaLiteralNodeClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaBooleanLiteralNode * ggandiva_boolean_literal_node_new(gboolean value); + +GGANDIVA_AVAILABLE_IN_0_12 gboolean ggandiva_boolean_literal_node_get_value(GGandivaBooleanLiteralNode *node); #define GGANDIVA_TYPE_INT8_LITERAL_NODE (ggandiva_int8_literal_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GGandivaInt8LiteralNode, ggandiva_int8_literal_node, GGANDIVA, @@ -111,12 +127,16 @@ struct _GGandivaInt8LiteralNodeClass GGandivaLiteralNodeClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaInt8LiteralNode * ggandiva_int8_literal_node_new(gint8 value); + +GGANDIVA_AVAILABLE_IN_0_12 gint8 ggandiva_int8_literal_node_get_value(GGandivaInt8LiteralNode *node); #define GGANDIVA_TYPE_UINT8_LITERAL_NODE (ggandiva_uint8_literal_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GGandivaUInt8LiteralNode, ggandiva_uint8_literal_node, GGANDIVA, @@ -127,12 +147,16 @@ struct _GGandivaUInt8LiteralNodeClass GGandivaLiteralNodeClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaUInt8LiteralNode * ggandiva_uint8_literal_node_new(guint8 value); + +GGANDIVA_AVAILABLE_IN_0_12 guint8 ggandiva_uint8_literal_node_get_value(GGandivaUInt8LiteralNode *node); #define GGANDIVA_TYPE_INT16_LITERAL_NODE (ggandiva_int16_literal_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GGandivaInt16LiteralNode, ggandiva_int16_literal_node, GGANDIVA, @@ -143,12 +167,16 @@ struct _GGandivaInt16LiteralNodeClass GGandivaLiteralNodeClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaInt16LiteralNode * ggandiva_int16_literal_node_new(gint16 value); + +GGANDIVA_AVAILABLE_IN_0_12 gint16 ggandiva_int16_literal_node_get_value(GGandivaInt16LiteralNode *node); #define GGANDIVA_TYPE_UINT16_LITERAL_NODE (ggandiva_uint16_literal_node_get_type()) 
+GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GGandivaUInt16LiteralNode, ggandiva_uint16_literal_node, GGANDIVA, @@ -159,12 +187,16 @@ struct _GGandivaUInt16LiteralNodeClass GGandivaLiteralNodeClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaUInt16LiteralNode * ggandiva_uint16_literal_node_new(guint16 value); + +GGANDIVA_AVAILABLE_IN_0_12 guint16 ggandiva_uint16_literal_node_get_value(GGandivaUInt16LiteralNode *node); #define GGANDIVA_TYPE_INT32_LITERAL_NODE (ggandiva_int32_literal_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GGandivaInt32LiteralNode, ggandiva_int32_literal_node, GGANDIVA, @@ -175,12 +207,16 @@ struct _GGandivaInt32LiteralNodeClass GGandivaLiteralNodeClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaInt32LiteralNode * ggandiva_int32_literal_node_new(gint32 value); + +GGANDIVA_AVAILABLE_IN_0_12 gint32 ggandiva_int32_literal_node_get_value(GGandivaInt32LiteralNode *node); #define GGANDIVA_TYPE_UINT32_LITERAL_NODE (ggandiva_uint32_literal_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GGandivaUInt32LiteralNode, ggandiva_uint32_literal_node, GGANDIVA, @@ -191,12 +227,16 @@ struct _GGandivaUInt32LiteralNodeClass GGandivaLiteralNodeClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaUInt32LiteralNode * ggandiva_uint32_literal_node_new(guint32 value); + +GGANDIVA_AVAILABLE_IN_0_12 guint32 ggandiva_uint32_literal_node_get_value(GGandivaUInt32LiteralNode *node); #define GGANDIVA_TYPE_INT64_LITERAL_NODE (ggandiva_int64_literal_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GGandivaInt64LiteralNode, ggandiva_int64_literal_node, GGANDIVA, @@ -207,12 +247,16 @@ struct _GGandivaInt64LiteralNodeClass GGandivaLiteralNodeClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaInt64LiteralNode * ggandiva_int64_literal_node_new(gint64 value); + +GGANDIVA_AVAILABLE_IN_0_12 gint64 ggandiva_int64_literal_node_get_value(GGandivaInt64LiteralNode *node); #define 
GGANDIVA_TYPE_UINT64_LITERAL_NODE (ggandiva_uint64_literal_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GGandivaUInt64LiteralNode, ggandiva_uint64_literal_node, GGANDIVA, @@ -223,12 +267,16 @@ struct _GGandivaUInt64LiteralNodeClass GGandivaLiteralNodeClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaUInt64LiteralNode * ggandiva_uint64_literal_node_new(guint64 value); + +GGANDIVA_AVAILABLE_IN_0_12 guint64 ggandiva_uint64_literal_node_get_value(GGandivaUInt64LiteralNode *node); #define GGANDIVA_TYPE_FLOAT_LITERAL_NODE (ggandiva_float_literal_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GGandivaFloatLiteralNode, ggandiva_float_literal_node, GGANDIVA, @@ -239,12 +287,16 @@ struct _GGandivaFloatLiteralNodeClass GGandivaLiteralNodeClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaFloatLiteralNode * ggandiva_float_literal_node_new(gfloat value); + +GGANDIVA_AVAILABLE_IN_0_12 gfloat ggandiva_float_literal_node_get_value(GGandivaFloatLiteralNode *node); #define GGANDIVA_TYPE_DOUBLE_LITERAL_NODE (ggandiva_double_literal_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GGandivaDoubleLiteralNode, ggandiva_double_literal_node, GGANDIVA, @@ -255,12 +307,16 @@ struct _GGandivaDoubleLiteralNodeClass GGandivaLiteralNodeClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaDoubleLiteralNode * ggandiva_double_literal_node_new(gdouble value); + +GGANDIVA_AVAILABLE_IN_0_12 gdouble ggandiva_double_literal_node_get_value(GGandivaDoubleLiteralNode *node); #define GGANDIVA_TYPE_BINARY_LITERAL_NODE (ggandiva_binary_literal_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GGandivaBinaryLiteralNode, ggandiva_binary_literal_node, GGANDIVA, @@ -271,14 +327,20 @@ struct _GGandivaBinaryLiteralNodeClass GGandivaLiteralNodeClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaBinaryLiteralNode * ggandiva_binary_literal_node_new(const guint8 *value, gsize size); + 
+GGANDIVA_AVAILABLE_IN_0_12 GGandivaBinaryLiteralNode * ggandiva_binary_literal_node_new_bytes(GBytes *value); + +GGANDIVA_AVAILABLE_IN_0_12 GBytes * ggandiva_binary_literal_node_get_value(GGandivaBinaryLiteralNode *node); #define GGANDIVA_TYPE_STRING_LITERAL_NODE (ggandiva_string_literal_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE(GGandivaStringLiteralNode, ggandiva_string_literal_node, GGANDIVA, @@ -289,12 +351,16 @@ struct _GGandivaStringLiteralNodeClass GGandivaLiteralNodeClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaStringLiteralNode * ggandiva_string_literal_node_new(const gchar *value); + +GGANDIVA_AVAILABLE_IN_0_12 const gchar * ggandiva_string_literal_node_get_value(GGandivaStringLiteralNode *node); #define GGANDIVA_TYPE_IF_NODE (ggandiva_if_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE( GGandivaIfNode, ggandiva_if_node, GGANDIVA, IF_NODE, GGandivaNode) struct _GGandivaIfNodeClass @@ -302,6 +368,7 @@ struct _GGandivaIfNodeClass GGandivaNodeClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaIfNode * ggandiva_if_node_new(GGandivaNode *condition_node, GGandivaNode *then_node, @@ -310,6 +377,7 @@ ggandiva_if_node_new(GGandivaNode *condition_node, GError **error); #define GGANDIVA_TYPE_BOOLEAN_NODE (ggandiva_boolean_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE( GGandivaBooleanNode, ggandiva_boolean_node, GGANDIVA, BOOLEAN_NODE, GGandivaNode) @@ -323,6 +391,7 @@ GList * ggandiva_boolean_node_get_children(GGandivaBooleanNode *node); #define GGANDIVA_TYPE_AND_NODE (ggandiva_and_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE( GGandivaAndNode, ggandiva_and_node, GGANDIVA, AND_NODE, GGandivaBooleanNode) struct _GGandivaAndNodeClass @@ -335,6 +404,7 @@ GGandivaAndNode * ggandiva_and_node_new(GList *children); #define GGANDIVA_TYPE_OR_NODE (ggandiva_or_node_get_type()) +GGANDIVA_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE( GGandivaOrNode, 
ggandiva_or_node, GGANDIVA, OR_NODE, GGandivaBooleanNode) struct _GGandivaOrNodeClass diff --git a/c_glib/gandiva-glib/projector.h b/c_glib/gandiva-glib/projector.h index e0afec5cb1ba1..5fbf9c290beab 100644 --- a/c_glib/gandiva-glib/projector.h +++ b/c_glib/gandiva-glib/projector.h @@ -24,6 +24,7 @@ G_BEGIN_DECLS #define GGANDIVA_TYPE_PROJECTOR (ggandiva_projector_get_type()) +GGANDIVA_AVAILABLE_IN_0_12 G_DECLARE_DERIVABLE_TYPE( GGandivaProjector, ggandiva_projector, GGANDIVA, PROJECTOR, GObject) @@ -32,14 +33,18 @@ struct _GGandivaProjectorClass GObjectClass parent_class; }; +GGANDIVA_AVAILABLE_IN_0_12 GGandivaProjector * ggandiva_projector_new(GArrowSchema *schema, GList *expressions, GError **error); + +GGANDIVA_AVAILABLE_IN_0_12 GList * ggandiva_projector_evaluate(GGandivaProjector *projector, GArrowRecordBatch *record_batch, GError **error); #define GGANDIVA_TYPE_SELECTABLE_PROJECTOR (ggandiva_selectable_projector_get_type()) +GGANDIVA_AVAILABLE_IN_4_0 G_DECLARE_DERIVABLE_TYPE(GGandivaSelectableProjector, ggandiva_selectable_projector, GGANDIVA, diff --git a/c_glib/gandiva-glib/selection-vector.h b/c_glib/gandiva-glib/selection-vector.h index 6d78192e35e28..558b9b950cf84 100644 --- a/c_glib/gandiva-glib/selection-vector.h +++ b/c_glib/gandiva-glib/selection-vector.h @@ -47,6 +47,7 @@ typedef enum { } GGandivaSelectionVectorMode; #define GGANDIVA_TYPE_SELECTION_VECTOR (ggandiva_selection_vector_get_type()) +GGANDIVA_AVAILABLE_IN_4_0 G_DECLARE_DERIVABLE_TYPE( GGandivaSelectionVector, ggandiva_selection_vector, GGANDIVA, SELECTION_VECTOR, GObject) @@ -65,6 +66,7 @@ ggandiva_selection_vector_to_array(GGandivaSelectionVector *selection_vector); #define GGANDIVA_TYPE_UINT16_SELECTION_VECTOR \ (ggandiva_uint16_selection_vector_get_type()) +GGANDIVA_AVAILABLE_IN_4_0 G_DECLARE_DERIVABLE_TYPE(GGandivaUInt16SelectionVector, ggandiva_uint16_selection_vector, GGANDIVA, @@ -82,6 +84,7 @@ ggandiva_uint16_selection_vector_new(gint64 max_slots, GError **error); #define 
GGANDIVA_TYPE_UINT32_SELECTION_VECTOR \ (ggandiva_uint32_selection_vector_get_type()) +GGANDIVA_AVAILABLE_IN_4_0 G_DECLARE_DERIVABLE_TYPE(GGandivaUInt32SelectionVector, ggandiva_uint32_selection_vector, GGANDIVA, @@ -99,6 +102,7 @@ ggandiva_uint32_selection_vector_new(gint64 max_slots, GError **error); #define GGANDIVA_TYPE_UINT64_SELECTION_VECTOR \ (ggandiva_uint64_selection_vector_get_type()) +GGANDIVA_AVAILABLE_IN_4_0 G_DECLARE_DERIVABLE_TYPE(GGandivaUInt64SelectionVector, ggandiva_uint64_selection_vector, GGANDIVA, diff --git a/c_glib/gandiva-glib/version.h.in b/c_glib/gandiva-glib/version.h.in index 3c9e87c9d52e1..857c7367bd7e2 100644 --- a/c_glib/gandiva-glib/version.h.in +++ b/c_glib/gandiva-glib/version.h.in @@ -38,7 +38,7 @@ * * Since: 1.0.0 */ -#define GGANDIVA_VERSION_MAJOR (@GGANDIVA_VERSION_MAJOR@) +#define GGANDIVA_VERSION_MAJOR (@VERSION_MAJOR@) /** * GGANDIVA_VERSION_MINOR: @@ -47,7 +47,7 @@ * * Since: 1.0.0 */ -#define GGANDIVA_VERSION_MINOR (@GGANDIVA_VERSION_MINOR@) +#define GGANDIVA_VERSION_MINOR (@VERSION_MINOR@) /** * GGANDIVA_VERSION_MICRO: @@ -56,7 +56,7 @@ * * Since: 1.0.0 */ -#define GGANDIVA_VERSION_MICRO (@GGANDIVA_VERSION_MICRO@) +#define GGANDIVA_VERSION_MICRO (@VERSION_MICRO@) /** * GGANDIVA_VERSION_TAG: @@ -66,7 +66,7 @@ * * Since: 1.0.0 */ -#define GGANDIVA_VERSION_TAG "@GGANDIVA_VERSION_TAG@" +#define GGANDIVA_VERSION_TAG "@VERSION_TAG@" /** * GGANDIVA_VERSION_CHECK: @@ -110,23 +110,7 @@ # define GGANDIVA_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) #endif -/** - * GGANDIVA_VERSION_1_0: - * - * You can use this macro value for compile time API version check. - * - * Since: 1.0.0 - */ -#define GGANDIVA_VERSION_1_0 G_ENCODE_VERSION(1, 0) - -/** - * GGANDIVA_VERSION_4_0: - * - * You can use this macro value for compile time API version check. 
- * - * Since: 4.0.0 - */ -#define GGANDIVA_VERSION_4_0 G_ENCODE_VERSION(4, 0) +@ENCODED_VERSIONS@ /** * GGANDIVA_VERSION_MIN_REQUIRED: @@ -172,47 +156,6 @@ G_ENCODE_VERSION(GGANDIVA_VERSION_MAJOR, GGANDIVA_VERSION_MINOR) #endif +@VISIBILITY_MACROS@ -#define GGANDIVA_AVAILABLE_IN_ALL - -#if GGANDIVA_VERSION_MIN_REQUIRED >= GGANDIVA_VERSION_4_0 -# define GGANDIVA_DEPRECATED_IN_4_0 GGANDIVA_DEPRECATED -# define GGANDIVA_DEPRECATED_IN_4_0_FOR(function) GGANDIVA_DEPRECATED_FOR(function) -#else -# define GGANDIVA_DEPRECATED_IN_4_0 -# define GGANDIVA_DEPRECATED_IN_4_0_FOR(function) -#endif - -#if GGANDIVA_VERSION_MAX_ALLOWED < GGANDIVA_VERSION_4_0 -# define GGANDIVA_AVAILABLE_IN_4_0 GGANDIVA_UNAVAILABLE(4, 0) -#else -# define GGANDIVA_AVAILABLE_IN_4_0 -#endif - -#if GGANDIVA_VERSION_MIN_REQUIRED >= GGANDIVA_VERSION_1_0 -# define GGANDIVA_DEPRECATED_IN_1_0 GGANDIVA_DEPRECATED -# define GGANDIVA_DEPRECATED_IN_1_0_FOR(function) GGANDIVA_DEPRECATED_FOR(function) -#else -# define GGANDIVA_DEPRECATED_IN_1_0 -# define GGANDIVA_DEPRECATED_IN_1_0_FOR(function) -#endif - -#if GGANDIVA_VERSION_MAX_ALLOWED < GGANDIVA_VERSION_1_0 -# define GGANDIVA_AVAILABLE_IN_1_0 GGANDIVA_UNAVAILABLE(1, 0) -#else -# define GGANDIVA_AVAILABLE_IN_1_0 -#endif - -#if GGANDIVA_VERSION_MIN_REQUIRED >= GGANDIVA_VERSION_0_17 -# define GGANDIVA_DEPRECATED_IN_0_17 GGANDIVA_DEPRECATED -# define GGANDIVA_DEPRECATED_IN_0_17_FOR(function) GGANDIVA_DEPRECATED_FOR(function) -#else -# define GGANDIVA_DEPRECATED_IN_0_17 -# define GGANDIVA_DEPRECATED_IN_0_17_FOR(function) -#endif - -#if GGANDIVA_VERSION_MAX_ALLOWED < GGANDIVA_VERSION_0_17 -# define GGANDIVA_AVAILABLE_IN_0_17 GGANDIVA_UNAVAILABLE(0, 17) -#else -# define GGANDIVA_AVAILABLE_IN_0_17 -#endif +@AVAILABILITY_MACROS@ diff --git a/c_glib/meson.build b/c_glib/meson.build index 04d0129855b20..06aa5b941e77c 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -26,8 +26,6 @@ project('arrow-glib', 'c', 'cpp', # Debian: # 
https://packages.debian.org/search?keywords=meson # - # * bullseye: 0.56.2 - # * bullseye-backports:1.0.0 # * bookworm: 1.0.0 # # Ubuntu: @@ -37,7 +35,7 @@ project('arrow-glib', 'c', 'cpp', # * 22.04: 0.61.2 meson_version: '>=0.53.2') -version = '16.0.0-SNAPSHOT' +version = '17.0.0-SNAPSHOT' if version.endswith('-SNAPSHOT') version_numbers = version.split('-')[0].split('.') version_tag = version.split('-')[1] @@ -173,6 +171,10 @@ if cxx.get_id() != 'msvc' endif add_project_arguments(cxx.get_supported_arguments(cxx_flags), language: 'cpp') +python = import('python') +python3 = python.find_installation('python3') +generate_version_header_py = project_source_root / 'tool' / 'generate-version-header.py' + subdir('arrow-glib') if arrow_cuda.found() subdir('arrow-cuda-glib') diff --git a/c_glib/parquet-glib/arrow-file-reader.h b/c_glib/parquet-glib/arrow-file-reader.h index 63c14ac71da86..52d7293bad0fa 100644 --- a/c_glib/parquet-glib/arrow-file-reader.h +++ b/c_glib/parquet-glib/arrow-file-reader.h @@ -24,6 +24,7 @@ G_BEGIN_DECLS #define GPARQUET_TYPE_ARROW_FILE_READER (gparquet_arrow_file_reader_get_type()) +GPARQUET_AVAILABLE_IN_0_11 G_DECLARE_DERIVABLE_TYPE(GParquetArrowFileReader, gparquet_arrow_file_reader, GPARQUET, @@ -34,15 +35,19 @@ struct _GParquetArrowFileReaderClass GObjectClass parent_class; }; +GPARQUET_AVAILABLE_IN_0_11 GParquetArrowFileReader * gparquet_arrow_file_reader_new_arrow(GArrowSeekableInputStream *source, GError **error); + +GPARQUET_AVAILABLE_IN_0_11 GParquetArrowFileReader * gparquet_arrow_file_reader_new_path(const gchar *path, GError **error); +GPARQUET_AVAILABLE_IN_0_11 GArrowTable * gparquet_arrow_file_reader_read_table(GParquetArrowFileReader *reader, GError **error); -GARROW_AVAILABLE_IN_1_0 +GPARQUET_AVAILABLE_IN_1_0 GArrowTable * gparquet_arrow_file_reader_read_row_group(GParquetArrowFileReader *reader, gint row_group_index, @@ -50,26 +55,30 @@ gparquet_arrow_file_reader_read_row_group(GParquetArrowFileReader *reader, gsize 
n_column_indices, GError **error); +GPARQUET_AVAILABLE_IN_0_12 GArrowSchema * gparquet_arrow_file_reader_get_schema(GParquetArrowFileReader *reader, GError **error); +GPARQUET_AVAILABLE_IN_0_15 GArrowChunkedArray * gparquet_arrow_file_reader_read_column_data(GParquetArrowFileReader *reader, gint i, GError **error); +GPARQUET_AVAILABLE_IN_0_11 gint gparquet_arrow_file_reader_get_n_row_groups(GParquetArrowFileReader *reader); -GARROW_AVAILABLE_IN_6_0 +GPARQUET_AVAILABLE_IN_6_0 gint64 gparquet_arrow_file_reader_get_n_rows(GParquetArrowFileReader *reader); +GPARQUET_AVAILABLE_IN_0_11 void gparquet_arrow_file_reader_set_use_threads(GParquetArrowFileReader *reader, gboolean use_threads); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 GParquetFileMetadata * gparquet_arrow_file_reader_get_metadata(GParquetArrowFileReader *reader); diff --git a/c_glib/parquet-glib/arrow-file-writer.h b/c_glib/parquet-glib/arrow-file-writer.h index 592ea4ae3f1ba..71cbfa195e842 100644 --- a/c_glib/parquet-glib/arrow-file-writer.h +++ b/c_glib/parquet-glib/arrow-file-writer.h @@ -20,10 +20,12 @@ #pragma once #include +#include G_BEGIN_DECLS #define GPARQUET_TYPE_WRITER_PROPERTIES (gparquet_writer_properties_get_type()) +GPARQUET_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE(GParquetWriterProperties, gparquet_writer_properties, GPARQUET, @@ -34,61 +36,62 @@ struct _GParquetWriterPropertiesClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_0_17 +GPARQUET_AVAILABLE_IN_0_17 GParquetWriterProperties * gparquet_writer_properties_new(void); -GARROW_AVAILABLE_IN_0_17 +GPARQUET_AVAILABLE_IN_0_17 void gparquet_writer_properties_set_compression(GParquetWriterProperties *properties, GArrowCompressionType compression_type, const gchar *path); -GARROW_AVAILABLE_IN_0_17 +GPARQUET_AVAILABLE_IN_0_17 GArrowCompressionType gparquet_writer_properties_get_compression_path(GParquetWriterProperties *properties, const gchar *path); -GARROW_AVAILABLE_IN_0_17 +GPARQUET_AVAILABLE_IN_0_17 void 
gparquet_writer_properties_enable_dictionary(GParquetWriterProperties *properties, const gchar *path); -GARROW_AVAILABLE_IN_0_17 +GPARQUET_AVAILABLE_IN_0_17 void gparquet_writer_properties_disable_dictionary(GParquetWriterProperties *properties, const gchar *path); -GARROW_AVAILABLE_IN_0_17 +GPARQUET_AVAILABLE_IN_0_17 gboolean gparquet_writer_properties_is_dictionary_enabled(GParquetWriterProperties *properties, const gchar *path); -GARROW_AVAILABLE_IN_0_17 +GPARQUET_AVAILABLE_IN_0_17 void gparquet_writer_properties_set_dictionary_page_size_limit( GParquetWriterProperties *properties, gint64 limit); -GARROW_AVAILABLE_IN_0_17 +GPARQUET_AVAILABLE_IN_0_17 gint64 gparquet_writer_properties_get_dictionary_page_size_limit( GParquetWriterProperties *properties); -GARROW_AVAILABLE_IN_0_17 +GPARQUET_AVAILABLE_IN_0_17 void gparquet_writer_properties_set_batch_size(GParquetWriterProperties *properties, gint64 batch_size); -GARROW_AVAILABLE_IN_0_17 +GPARQUET_AVAILABLE_IN_0_17 gint64 gparquet_writer_properties_get_batch_size(GParquetWriterProperties *properties); -GARROW_AVAILABLE_IN_0_17 +GPARQUET_AVAILABLE_IN_0_17 void gparquet_writer_properties_set_max_row_group_length(GParquetWriterProperties *properties, gint64 length); -GARROW_AVAILABLE_IN_0_17 +GPARQUET_AVAILABLE_IN_0_17 gint64 gparquet_writer_properties_get_max_row_group_length(GParquetWriterProperties *properties); -GARROW_AVAILABLE_IN_0_17 +GPARQUET_AVAILABLE_IN_0_17 void gparquet_writer_properties_set_data_page_size(GParquetWriterProperties *properties, gint64 data_page_size); -GARROW_AVAILABLE_IN_0_17 +GPARQUET_AVAILABLE_IN_0_17 gint64 gparquet_writer_properties_get_data_page_size(GParquetWriterProperties *properties); #define GPARQUET_TYPE_ARROW_FILE_WRITER (gparquet_arrow_file_writer_get_type()) +GPARQUET_AVAILABLE_IN_0_11 G_DECLARE_DERIVABLE_TYPE(GParquetArrowFileWriter, gparquet_arrow_file_writer, GPARQUET, @@ -99,23 +102,28 @@ struct _GParquetArrowFileWriterClass GObjectClass parent_class; }; 
+GPARQUET_AVAILABLE_IN_0_11 GParquetArrowFileWriter * gparquet_arrow_file_writer_new_arrow(GArrowSchema *schema, GArrowOutputStream *sink, GParquetWriterProperties *writer_properties, GError **error); + +GPARQUET_AVAILABLE_IN_0_11 GParquetArrowFileWriter * gparquet_arrow_file_writer_new_path(GArrowSchema *schema, const gchar *path, GParquetWriterProperties *writer_properties, GError **error); +GPARQUET_AVAILABLE_IN_0_11 gboolean gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, GArrowTable *table, guint64 chunk_size, GError **error); +GPARQUET_AVAILABLE_IN_0_11 gboolean gparquet_arrow_file_writer_close(GParquetArrowFileWriter *writer, GError **error); diff --git a/c_glib/parquet-glib/meson.build b/c_glib/parquet-glib/meson.build index 67de0bf2d91fb..a3de1d0933f7f 100644 --- a/c_glib/parquet-glib/meson.build +++ b/c_glib/parquet-glib/meson.build @@ -42,10 +42,17 @@ cpp_headers = files( 'parquet-glib.hpp', ) +version_h = configure_file( + input: 'version.h.in', + output: 'version.h', + command: [python3, generate_version_header_py, '--library', 'GPARQUET', '--version', version, '--input', '@INPUT@', '--output', '@OUTPUT@'], +) + +c_headers += version_h + headers = c_headers + cpp_headers install_headers(headers, subdir: project_name) - dependencies = [ arrow, parquet, @@ -57,6 +64,7 @@ libparquet_glib = library('parquet-glib', dependencies: dependencies, implicit_include_directories: false, include_directories: base_include_directories, + cpp_args: ['-DGPARQUET_COMPILATION'], soversion: so_version, version: library_version) parquet_glib = declare_dependency(link_with: libparquet_glib, diff --git a/c_glib/parquet-glib/metadata.h b/c_glib/parquet-glib/metadata.h index 1c9fce7cc778d..d79bf009751ca 100644 --- a/c_glib/parquet-glib/metadata.h +++ b/c_glib/parquet-glib/metadata.h @@ -24,6 +24,7 @@ G_BEGIN_DECLS #define GPARQUET_TYPE_COLUMN_CHUNK_METADATA (gparquet_column_chunk_metadata_get_type()) +GPARQUET_AVAILABLE_IN_8_0 
G_DECLARE_DERIVABLE_TYPE(GParquetColumnChunkMetadata, gparquet_column_chunk_metadata, GPARQUET, @@ -34,28 +35,29 @@ struct _GParquetColumnChunkMetadataClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gboolean gparquet_column_chunk_metadata_equal(GParquetColumnChunkMetadata *metadata, GParquetColumnChunkMetadata *other_metadata); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint64 gparquet_column_chunk_metadata_get_total_size(GParquetColumnChunkMetadata *metadata); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint64 gparquet_column_chunk_metadata_get_total_compressed_size( GParquetColumnChunkMetadata *metadata); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint64 gparquet_column_chunk_metadata_get_file_offset(GParquetColumnChunkMetadata *metadata); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gboolean gparquet_column_chunk_metadata_can_decompress(GParquetColumnChunkMetadata *metadata); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 GParquetStatistics * gparquet_column_chunk_metadata_get_statistics(GParquetColumnChunkMetadata *metadata); #define GPARQUET_TYPE_ROW_GROUP_METADATA (gparquet_row_group_metadata_get_type()) +GPARQUET_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE(GParquetRowGroupMetadata, gparquet_row_group_metadata, GPARQUET, @@ -66,35 +68,36 @@ struct _GParquetRowGroupMetadataClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gboolean gparquet_row_group_metadata_equal(GParquetRowGroupMetadata *metadata, GParquetRowGroupMetadata *other_metadata); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint gparquet_row_group_metadata_get_n_columns(GParquetRowGroupMetadata *metadata); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 GParquetColumnChunkMetadata * gparquet_row_group_metadata_get_column_chunk(GParquetRowGroupMetadata *metadata, gint index, GError **error); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint64 
gparquet_row_group_metadata_get_n_rows(GParquetRowGroupMetadata *metadata); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint64 gparquet_row_group_metadata_get_total_size(GParquetRowGroupMetadata *metadata); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint64 gparquet_row_group_metadata_get_total_compressed_size(GParquetRowGroupMetadata *metadata); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint64 gparquet_row_group_metadata_get_file_offset(GParquetRowGroupMetadata *metadata); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gboolean gparquet_row_group_metadata_can_decompress(GParquetRowGroupMetadata *metadata); #define GPARQUET_TYPE_FILE_METADATA (gparquet_file_metadata_get_type()) +GPARQUET_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE( GParquetFileMetadata, gparquet_file_metadata, GPARQUET, FILE_METADATA, GObject) struct _GParquetFileMetadataClass @@ -102,34 +105,34 @@ struct _GParquetFileMetadataClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gboolean gparquet_file_metadata_equal(GParquetFileMetadata *metadata, GParquetFileMetadata *other_metadata); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint gparquet_file_metadata_get_n_columns(GParquetFileMetadata *metadata); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint gparquet_file_metadata_get_n_schema_elements(GParquetFileMetadata *metadata); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint64 gparquet_file_metadata_get_n_rows(GParquetFileMetadata *metadata); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint gparquet_file_metadata_get_n_row_groups(GParquetFileMetadata *metadata); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 GParquetRowGroupMetadata * gparquet_file_metadata_get_row_group(GParquetFileMetadata *metadata, gint index, GError **error); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 const gchar * gparquet_file_metadata_get_created_by(GParquetFileMetadata *metadata); -GARROW_AVAILABLE_IN_8_0 
+GPARQUET_AVAILABLE_IN_8_0 guint32 gparquet_file_metadata_get_size(GParquetFileMetadata *metadata); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gboolean gparquet_file_metadata_can_decompress(GParquetFileMetadata *metadata); diff --git a/c_glib/parquet-glib/parquet-glib.h b/c_glib/parquet-glib/parquet-glib.h index 23659421ce3d8..308adb87a7ed2 100644 --- a/c_glib/parquet-glib/parquet-glib.h +++ b/c_glib/parquet-glib/parquet-glib.h @@ -19,6 +19,8 @@ #pragma once +#include + #include #include #include diff --git a/c_glib/parquet-glib/statistics.h b/c_glib/parquet-glib/statistics.h index f28e2a3713638..25e02df8774b2 100644 --- a/c_glib/parquet-glib/statistics.h +++ b/c_glib/parquet-glib/statistics.h @@ -21,9 +21,12 @@ #include +#include + G_BEGIN_DECLS #define GPARQUET_TYPE_STATISTICS (gparquet_statistics_get_type()) +GPARQUET_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE( GParquetStatistics, gparquet_statistics, GPARQUET, STATISTICS, GObject) struct _GParquetStatisticsClass @@ -31,30 +34,31 @@ struct _GParquetStatisticsClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gboolean gparquet_statistics_equal(GParquetStatistics *statistics, GParquetStatistics *other_statistics); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gboolean gparquet_statistics_has_n_nulls(GParquetStatistics *statistics); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint64 gparquet_statistics_get_n_nulls(GParquetStatistics *statistics); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gboolean gparquet_statistics_has_n_distinct_values(GParquetStatistics *statistics); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint64 gparquet_statistics_get_n_distinct_values(GParquetStatistics *statistics); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint64 gparquet_statistics_get_n_values(GParquetStatistics *statistics); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gboolean gparquet_statistics_has_min_max(GParquetStatistics *statistics); 
#define GPARQUET_TYPE_BOOLEAN_STATISTICS (gparquet_boolean_statistics_get_type()) +GPARQUET_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE(GParquetBooleanStatistics, gparquet_boolean_statistics, GPARQUET, @@ -65,14 +69,15 @@ struct _GParquetBooleanStatisticsClass GParquetStatisticsClass parent_class; }; -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gboolean gparquet_boolean_statistics_get_min(GParquetBooleanStatistics *statistics); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gboolean gparquet_boolean_statistics_get_max(GParquetBooleanStatistics *statistics); #define GPARQUET_TYPE_INT32_STATISTICS (gparquet_int32_statistics_get_type()) +GPARQUET_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE(GParquetInt32Statistics, gparquet_int32_statistics, GPARQUET, @@ -83,14 +88,15 @@ struct _GParquetInt32StatisticsClass GParquetStatisticsClass parent_class; }; -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint32 gparquet_int32_statistics_get_min(GParquetInt32Statistics *statistics); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint32 gparquet_int32_statistics_get_max(GParquetInt32Statistics *statistics); #define GPARQUET_TYPE_INT64_STATISTICS (gparquet_int64_statistics_get_type()) +GPARQUET_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE(GParquetInt64Statistics, gparquet_int64_statistics, GPARQUET, @@ -101,14 +107,15 @@ struct _GParquetInt64StatisticsClass GParquetStatisticsClass parent_class; }; -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint64 gparquet_int64_statistics_get_min(GParquetInt64Statistics *statistics); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gint64 gparquet_int64_statistics_get_max(GParquetInt64Statistics *statistics); #define GPARQUET_TYPE_FLOAT_STATISTICS (gparquet_float_statistics_get_type()) +GPARQUET_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE(GParquetFloatStatistics, gparquet_float_statistics, GPARQUET, @@ -119,14 +126,15 @@ struct _GParquetFloatStatisticsClass GParquetStatisticsClass parent_class; }; -GARROW_AVAILABLE_IN_8_0 
+GPARQUET_AVAILABLE_IN_8_0 gfloat gparquet_float_statistics_get_min(GParquetFloatStatistics *statistics); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gfloat gparquet_float_statistics_get_max(GParquetFloatStatistics *statistics); #define GPARQUET_TYPE_DOUBLE_STATISTICS (gparquet_double_statistics_get_type()) +GPARQUET_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE(GParquetDoubleStatistics, gparquet_double_statistics, GPARQUET, @@ -137,14 +145,15 @@ struct _GParquetDoubleStatisticsClass GParquetStatisticsClass parent_class; }; -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gdouble gparquet_double_statistics_get_min(GParquetDoubleStatistics *statistics); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 gdouble gparquet_double_statistics_get_max(GParquetDoubleStatistics *statistics); #define GPARQUET_TYPE_BYTE_ARRAY_STATISTICS (gparquet_byte_array_statistics_get_type()) +GPARQUET_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE(GParquetByteArrayStatistics, gparquet_byte_array_statistics, GPARQUET, @@ -155,15 +164,16 @@ struct _GParquetByteArrayStatisticsClass GParquetStatisticsClass parent_class; }; -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 GBytes * gparquet_byte_array_statistics_get_min(GParquetByteArrayStatistics *statistics); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 GBytes * gparquet_byte_array_statistics_get_max(GParquetByteArrayStatistics *statistics); #define GPARQUET_TYPE_FIXED_LENGTH_BYTE_ARRAY_STATISTICS \ (gparquet_fixed_length_byte_array_statistics_get_type()) +GPARQUET_AVAILABLE_IN_8_0 G_DECLARE_DERIVABLE_TYPE(GParquetFixedLengthByteArrayStatistics, gparquet_fixed_length_byte_array_statistics, GPARQUET, @@ -174,11 +184,11 @@ struct _GParquetFixedLengthByteArrayStatisticsClass GParquetStatisticsClass parent_class; }; -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 GBytes * gparquet_fixed_length_byte_array_statistics_get_min( GParquetFixedLengthByteArrayStatistics *statistics); -GARROW_AVAILABLE_IN_8_0 +GPARQUET_AVAILABLE_IN_8_0 
GBytes * gparquet_fixed_length_byte_array_statistics_get_max( GParquetFixedLengthByteArrayStatistics *statistics); diff --git a/c_glib/parquet-glib/version.h.in b/c_glib/parquet-glib/version.h.in new file mode 100644 index 0000000000000..142b3b83e0f3d --- /dev/null +++ b/c_glib/parquet-glib/version.h.in @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +/** + * SECTION: version + * @section_id: version-macros + * @title: Version related macros + * @include: parquet-glib/parquet-glib.h + * + * Parquet GLib provides macros that can be used by C pre-processor. + * They are useful to check version related things at compile time. + */ + +/** + * GPARQUET_VERSION_MAJOR: + * + * The major version. + * + * Since: 17.0.0 + */ +#define GPARQUET_VERSION_MAJOR (@VERSION_MAJOR@) + +/** + * GPARQUET_VERSION_MINOR: + * + * The minor version. + * + * Since: 17.0.0 + */ +#define GPARQUET_VERSION_MINOR (@VERSION_MINOR@) + +/** + * GPARQUET_VERSION_MICRO: + * + * The micro version. + * + * Since: 17.0.0 + */ +#define GPARQUET_VERSION_MICRO (@VERSION_MICRO@) + +/** + * GPARQUET_VERSION_TAG: + * + * The version tag. Normally, it's an empty string. 
It's "SNAPSHOT" + * for snapshot version. + * + * Since: 17.0.0 + */ +#define GPARQUET_VERSION_TAG "@VERSION_TAG@" + +/** + * GPARQUET_VERSION_CHECK: + * @major: A major version to check for. + * @minor: A minor version to check for. + * @micro: A micro version to check for. + * + * You can use this macro in C pre-processor. + * + * Returns: %TRUE if the compile time Parquet GLib version is the + * same as or newer than the passed version, %FALSE otherwise. + * + * Since: 17.0.0 + */ +#define GPARQUET_VERSION_CHECK(major, minor, micro) \ + (GPARQUET_VERSION_MAJOR > (major) || \ + (GPARQUET_VERSION_MAJOR == (major) && \ + GPARQUET_VERSION_MINOR > (minor)) || \ + (GPARQUET_VERSION_MAJOR == (major) && \ + GPARQUET_VERSION_MINOR == (minor) && \ + GPARQUET_VERSION_MICRO >= (micro))) + +/** + * GPARQUET_DISABLE_DEPRECATION_WARNINGS: + * + * If this macro is defined, no deprecated warnings are produced. + * + * You must define this macro before including the + * parquet-glib/parquet-glib.h header. + * + * Since: 17.0.0 + */ + +#ifdef GPARQUET_DISABLE_DEPRECATION_WARNINGS +# define GPARQUET_DEPRECATED +# define GPARQUET_DEPRECATED_FOR(function) +# define GPARQUET_UNAVAILABLE(major, minor) +#else +# define GPARQUET_DEPRECATED G_DEPRECATED +# define GPARQUET_DEPRECATED_FOR(function) G_DEPRECATED_FOR(function) +# define GPARQUET_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) +#endif + +@ENCODED_VERSIONS@ + +/** + * GPARQUET_VERSION_MIN_REQUIRED: + * + * You can use this macro for compile time API version check. + * + * This macro value must be one of the predefined version macros such + * as %GPARQUET_VERSION_0_10. + * + * If you use any functions that is defined by newer version than + * %GPARQUET_VERSION_MIN_REQUIRED, deprecated warnings are produced at + * compile time. + * + * You must define this macro before including the + * parquet-glib/parquet-glib.h header.
+ * + * Since: 17.0.0 + */ +#ifndef GPARQUET_VERSION_MIN_REQUIRED +# define GPARQUET_VERSION_MIN_REQUIRED GARROW_VERSION_MIN_REQUIRED +#endif + +/** + * GPARQUET_VERSION_MAX_ALLOWED: + * + * You can use this macro for compile time API version check. + * + * This macro value must be one of the predefined version macros such + * as %GPARQUET_VERSION_0_10. + * + * If you use any functions that is defined by newer version than + * %GPARQUET_VERSION_MAX_ALLOWED, deprecated warnings are produced at + * compile time. + * + * You must define this macro before including the + * parquet-glib/parquet-glib.h header. + * + * Since: 17.0.0 + */ +#ifndef GPARQUET_VERSION_MAX_ALLOWED +# define GPARQUET_VERSION_MAX_ALLOWED GARROW_VERSION_MAX_ALLOWED +#endif + +@VISIBILITY_MACROS@ + +@AVAILABILITY_MACROS@ diff --git a/c_glib/test/dataset/test-file-system-dataset.rb b/c_glib/test/dataset/test-file-system-dataset.rb index 0e856b678f860..96deedf6b4eb0 100644 --- a/c_glib/test/dataset/test-file-system-dataset.rb +++ b/c_glib/test/dataset/test-file-system-dataset.rb @@ -56,6 +56,22 @@ def test_partitioning end def test_read_write + dataset, expected_table = create_dataset + assert_equal(expected_table, dataset.to_table) + end + + def test_to_record_batch_reader + dataset, expected_table = create_dataset + reader = dataset.to_record_batch_reader + begin + assert_equal(expected_table, reader.read_all) + ensure + # Unref to ensure the reader closes files and we can delete the temp directory + reader.unref + end + end + + def create_dataset table = build_table(label: build_string_array(["a", "a", "b", "c"]), count: build_int32_array([1, 10, 2, 3])) table_reader = Arrow::TableBatchReader.new(table) @@ -73,7 +89,8 @@ def test_read_write end @factory.partition_base_dir = @dir dataset = @factory.finish - assert_equal(build_table(count: [ + + expected_table = build_table(count: [ build_int32_array([1, 10]), build_int32_array([2]), build_int32_array([3]), @@ -82,7 +99,8 @@ def test_read_write 
build_string_array(["a", "a"]), build_string_array(["b"]), build_string_array(["c"]), - ]), - dataset.to_table) + ]) + + return dataset, expected_table end end diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner.rb index f7702d4905fb6..5dc31eefc5f4c 100644 --- a/c_glib/test/dataset/test-scanner.rb +++ b/c_glib/test/dataset/test-scanner.rb @@ -45,4 +45,14 @@ def setup def test_to_table assert_equal(@table, @scanner.to_table) end + + def test_to_record_batch_reader + reader = @scanner.to_record_batch_reader + begin + assert_equal(@table, reader.read_all) + ensure + # Unref to ensure the reader closes files and we can delete the temp directory + reader.unref + end + end end diff --git a/c_glib/test/parquet/test-arrow-file-reader.rb b/c_glib/test/parquet/test-arrow-file-reader.rb index 45eb335965434..eff5ad966aea6 100644 --- a/c_glib/test/parquet/test-arrow-file-reader.rb +++ b/c_glib/test/parquet/test-arrow-file-reader.rb @@ -20,16 +20,23 @@ class TestParquetArrowFileReader < Test::Unit::TestCase def setup omit("Parquet is required") unless defined?(::Parquet) - @file = Tempfile.open(["data", ".parquet"]) - @a_array = build_string_array(["foo", "bar"]) - @b_array = build_int32_array([123, 456]) - @table = build_table("a" => @a_array, - "b" => @b_array) - writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) - chunk_size = 1 - writer.write_table(@table, chunk_size) - writer.close - @reader = Parquet::ArrowFileReader.new(@file.path) + Tempfile.create(["data", ".parquet"]) do |file| + @file = file + @a_array = build_string_array(["foo", "bar"]) + @b_array = build_int32_array([123, 456]) + @table = build_table("a" => @a_array, + "b" => @b_array) + writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) + chunk_size = 1 + writer.write_table(@table, chunk_size) + writer.close + @reader = Parquet::ArrowFileReader.new(@file.path) + begin + yield + ensure + @reader.unref + end + end end def test_schema diff --git 
a/c_glib/test/parquet/test-arrow-file-writer.rb b/c_glib/test/parquet/test-arrow-file-writer.rb index 855527444d063..f899e7273b2a2 100644 --- a/c_glib/test/parquet/test-arrow-file-writer.rb +++ b/c_glib/test/parquet/test-arrow-file-writer.rb @@ -20,7 +20,10 @@ class TestParquetArrowFileWriter < Test::Unit::TestCase def setup omit("Parquet is required") unless defined?(::Parquet) - @file = Tempfile.open(["data", ".parquet"]) + Tempfile.create(["data", ".parquet"]) do |file| + @file = file + yield + end end def test_write @@ -33,14 +36,18 @@ def test_write writer.close reader = Parquet::ArrowFileReader.new(@file.path) - reader.use_threads = true - assert_equal([ - enabled_values.length / chunk_size, - true, - ], - [ - reader.n_row_groups, - table.equal_metadata(reader.read_table, false), - ]) + begin + reader.use_threads = true + assert_equal([ + enabled_values.length / chunk_size, + true, + ], + [ + reader.n_row_groups, + table.equal_metadata(reader.read_table, false), + ]) + ensure + reader.unref + end end end diff --git a/c_glib/test/parquet/test-boolean-statistics.rb b/c_glib/test/parquet/test-boolean-statistics.rb index 6131a22195cb8..244348641320e 100644 --- a/c_glib/test/parquet/test-boolean-statistics.rb +++ b/c_glib/test/parquet/test-boolean-statistics.rb @@ -20,14 +20,22 @@ class TestParquetBooleanStatistics < Test::Unit::TestCase def setup omit("Parquet is required") unless defined?(::Parquet) - @file = Tempfile.open(["data", ".parquet"]) - @table = build_table("boolean" => build_boolean_array([nil, false, true])) - writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) - chunk_size = 1024 - writer.write_table(@table, chunk_size) - writer.close - reader = Parquet::ArrowFileReader.new(@file.path) - @statistics = reader.metadata.get_row_group(0).get_column_chunk(0).statistics + Tempfile.create(["data", ".parquet"]) do |file| + @file = file + @table = build_table("boolean" => build_boolean_array([nil, false, true])) + writer = 
Parquet::ArrowFileWriter.new(@table.schema, @file.path) + chunk_size = 1024 + writer.write_table(@table, chunk_size) + writer.close + reader = Parquet::ArrowFileReader.new(@file.path) + begin + @statistics = + reader.metadata.get_row_group(0).get_column_chunk(0).statistics + yield + ensure + reader.unref + end + end end test("#min") do diff --git a/c_glib/test/parquet/test-byte-array-statistics.rb b/c_glib/test/parquet/test-byte-array-statistics.rb index 50ec409dbce7c..b9693a77fff13 100644 --- a/c_glib/test/parquet/test-byte-array-statistics.rb +++ b/c_glib/test/parquet/test-byte-array-statistics.rb @@ -20,14 +20,22 @@ class TestParquetByteArrayStatistics < Test::Unit::TestCase def setup omit("Parquet is required") unless defined?(::Parquet) - @file = Tempfile.open(["data", ".parquet"]) - @table = build_table("string" => build_string_array([nil, "abc", "xyz"])) - writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) - chunk_size = 1024 - writer.write_table(@table, chunk_size) - writer.close - reader = Parquet::ArrowFileReader.new(@file.path) - @statistics = reader.metadata.get_row_group(0).get_column_chunk(0).statistics + Tempfile.create(["data", ".parquet"]) do |file| + @file = file + @table = build_table("string" => build_string_array([nil, "abc", "xyz"])) + writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) + chunk_size = 1024 + writer.write_table(@table, chunk_size) + writer.close + reader = Parquet::ArrowFileReader.new(@file.path) + begin + @statistics = + reader.metadata.get_row_group(0).get_column_chunk(0).statistics + yield + ensure + reader.unref + end + end end test("#min") do diff --git a/c_glib/test/parquet/test-column-chunk-metadata.rb b/c_glib/test/parquet/test-column-chunk-metadata.rb index a93fe85bbfbf1..f0012f0124577 100644 --- a/c_glib/test/parquet/test-column-chunk-metadata.rb +++ b/c_glib/test/parquet/test-column-chunk-metadata.rb @@ -20,35 +20,46 @@ class TestParquetColumnChunkMetadata < Test::Unit::TestCase def setup 
omit("Parquet is required") unless defined?(::Parquet) - @file = Tempfile.open(["data", ".parquet"]) - @string_array = build_string_array([nil, "hello"]) - fields = [ - Arrow::Field.new("int8", Arrow::Int8DataType.new), - Arrow::Field.new("boolean", Arrow::BooleanDataType.new), - ] - structs = [ - { - "int8" => -29, - "boolean" => true, - }, - nil, - ] - @struct_array = build_struct_array(fields, structs) - @table = build_table("string" => @string_array, - "struct" => @struct_array) - writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) - chunk_size = 1 - writer.write_table(@table, chunk_size) - writer.close - reader = Parquet::ArrowFileReader.new(@file.path) - @metadata = reader.metadata.get_row_group(0).get_column_chunk(0) + Tempfile.create(["data", ".parquet"]) do |file| + @file = file + @string_array = build_string_array([nil, "hello"]) + fields = [ + Arrow::Field.new("int8", Arrow::Int8DataType.new), + Arrow::Field.new("boolean", Arrow::BooleanDataType.new), + ] + structs = [ + { + "int8" => -29, + "boolean" => true, + }, + nil, + ] + @struct_array = build_struct_array(fields, structs) + @table = build_table("string" => @string_array, + "struct" => @struct_array) + writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) + chunk_size = 1 + writer.write_table(@table, chunk_size) + writer.close + reader = Parquet::ArrowFileReader.new(@file.path) + begin + @metadata = reader.metadata.get_row_group(0).get_column_chunk(0) + yield + ensure + reader.unref + end + end end test("#==") do reader = Parquet::ArrowFileReader.new(@file.path) - other_metadata = reader.metadata.get_row_group(0).get_column_chunk(0) - assert do - @metadata == other_metadata + begin + other_metadata = reader.metadata.get_row_group(0).get_column_chunk(0) + assert do + @metadata == other_metadata + end + ensure + reader.unref end end diff --git a/c_glib/test/parquet/test-double-statistics.rb b/c_glib/test/parquet/test-double-statistics.rb index a610fb24a9bdf..6c7a95824570d 
100644 --- a/c_glib/test/parquet/test-double-statistics.rb +++ b/c_glib/test/parquet/test-double-statistics.rb @@ -20,14 +20,22 @@ class TestParquetDoubleStatistics < Test::Unit::TestCase def setup omit("Parquet is required") unless defined?(::Parquet) - @file = Tempfile.open(["data", ".parquet"]) - @table = build_table("double" => build_double_array([nil, -2.9, 2.9])) - writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) - chunk_size = 1024 - writer.write_table(@table, chunk_size) - writer.close - reader = Parquet::ArrowFileReader.new(@file.path) - @statistics = reader.metadata.get_row_group(0).get_column_chunk(0).statistics + Tempfile.create(["data", ".parquet"]) do |file| + @file = file + @table = build_table("double" => build_double_array([nil, -2.9, 2.9])) + writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) + chunk_size = 1024 + writer.write_table(@table, chunk_size) + writer.close + reader = Parquet::ArrowFileReader.new(@file.path) + begin + @statistics = + reader.metadata.get_row_group(0).get_column_chunk(0).statistics + yield + ensure + reader.unref + end + end end test("#min") do diff --git a/c_glib/test/parquet/test-file-metadata.rb b/c_glib/test/parquet/test-file-metadata.rb index 2bca7e66e0b07..aec3f4ab829b9 100644 --- a/c_glib/test/parquet/test-file-metadata.rb +++ b/c_glib/test/parquet/test-file-metadata.rb @@ -20,35 +20,46 @@ class TestParquetFileMetadata < Test::Unit::TestCase def setup omit("Parquet is required") unless defined?(::Parquet) - @file = Tempfile.open(["data", ".parquet"]) - @string_array = build_string_array([nil, "hello"]) - fields = [ - Arrow::Field.new("int8", Arrow::Int8DataType.new), - Arrow::Field.new("boolean", Arrow::BooleanDataType.new), - ] - structs = [ - { - "int8" => -29, - "boolean" => true, - }, - nil, - ] - @struct_array = build_struct_array(fields, structs) - @table = build_table("string" => @string_array, - "struct" => @struct_array) - writer = Parquet::ArrowFileWriter.new(@table.schema, 
@file.path) - chunk_size = 1 - writer.write_table(@table, chunk_size) - writer.close - reader = Parquet::ArrowFileReader.new(@file.path) - @metadata = reader.metadata + Tempfile.create(["data", ".parquet"]) do |file| + @file = file + @string_array = build_string_array([nil, "hello"]) + fields = [ + Arrow::Field.new("int8", Arrow::Int8DataType.new), + Arrow::Field.new("boolean", Arrow::BooleanDataType.new), + ] + structs = [ + { + "int8" => -29, + "boolean" => true, + }, + nil, + ] + @struct_array = build_struct_array(fields, structs) + @table = build_table("string" => @string_array, + "struct" => @struct_array) + writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) + chunk_size = 1 + writer.write_table(@table, chunk_size) + writer.close + reader = Parquet::ArrowFileReader.new(@file.path) + begin + @metadata = reader.metadata + yield + ensure + reader.unref + end + end end test("#==") do reader = Parquet::ArrowFileReader.new(@file.path) - other_metadata = reader.metadata - assert do - @metadata == other_metadata + begin + other_metadata = reader.metadata + assert do + @metadata == other_metadata + end + ensure + reader.unref end end diff --git a/c_glib/test/parquet/test-fixed-length-byte-array-statistics.rb b/c_glib/test/parquet/test-fixed-length-byte-array-statistics.rb index 87a96d009c509..c2f179627d06a 100644 --- a/c_glib/test/parquet/test-fixed-length-byte-array-statistics.rb +++ b/c_glib/test/parquet/test-fixed-length-byte-array-statistics.rb @@ -20,16 +20,24 @@ class TestParquetFixedLengthByteArrayStatistics < Test::Unit::TestCase def setup omit("Parquet is required") unless defined?(::Parquet) - @file = Tempfile.open(["data", ".parquet"]) - data_type = Arrow::FixedSizeBinaryDataType.new(3) - array = build_fixed_size_binary_array(data_type, [nil, "abc", "xyz"]) - @table = build_table("binary" => array) - writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) - chunk_size = 1024 - writer.write_table(@table, chunk_size) - writer.close - 
reader = Parquet::ArrowFileReader.new(@file.path) - @statistics = reader.metadata.get_row_group(0).get_column_chunk(0).statistics + Tempfile.create(["data", ".parquet"]) do |file| + @file = file + data_type = Arrow::FixedSizeBinaryDataType.new(3) + array = build_fixed_size_binary_array(data_type, [nil, "abc", "xyz"]) + @table = build_table("binary" => array) + writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) + chunk_size = 1024 + writer.write_table(@table, chunk_size) + writer.close + reader = Parquet::ArrowFileReader.new(@file.path) + begin + @statistics = + reader.metadata.get_row_group(0).get_column_chunk(0).statistics + yield + ensure + reader.unref + end + end end test("#min") do diff --git a/c_glib/test/parquet/test-float-statistics.rb b/c_glib/test/parquet/test-float-statistics.rb index 2622a2bb36fe6..7d1a233f53ca0 100644 --- a/c_glib/test/parquet/test-float-statistics.rb +++ b/c_glib/test/parquet/test-float-statistics.rb @@ -20,14 +20,22 @@ class TestParquetFloatStatistics < Test::Unit::TestCase def setup omit("Parquet is required") unless defined?(::Parquet) - @file = Tempfile.open(["data", ".parquet"]) - @table = build_table("float" => build_float_array([nil, -2.9, 2.9])) - writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) - chunk_size = 1024 - writer.write_table(@table, chunk_size) - writer.close - reader = Parquet::ArrowFileReader.new(@file.path) - @statistics = reader.metadata.get_row_group(0).get_column_chunk(0).statistics + Tempfile.create(["data", ".parquet"]) do |file| + @file = file + @table = build_table("float" => build_float_array([nil, -2.9, 2.9])) + writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) + chunk_size = 1024 + writer.write_table(@table, chunk_size) + writer.close + reader = Parquet::ArrowFileReader.new(@file.path) + begin + @statistics = + reader.metadata.get_row_group(0).get_column_chunk(0).statistics + yield + ensure + reader.unref + end + end end test("#min") do diff --git 
a/c_glib/test/parquet/test-int32-statistics.rb b/c_glib/test/parquet/test-int32-statistics.rb index 041f07c74292f..8d41327f88014 100644 --- a/c_glib/test/parquet/test-int32-statistics.rb +++ b/c_glib/test/parquet/test-int32-statistics.rb @@ -20,14 +20,22 @@ class TestParquetInt32Statistics < Test::Unit::TestCase def setup omit("Parquet is required") unless defined?(::Parquet) - @file = Tempfile.open(["data", ".parquet"]) - @table = build_table("int32" => build_int32_array([nil, -2, 9])) - writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) - chunk_size = 1024 - writer.write_table(@table, chunk_size) - writer.close - reader = Parquet::ArrowFileReader.new(@file.path) - @statistics = reader.metadata.get_row_group(0).get_column_chunk(0).statistics + Tempfile.create(["data", ".parquet"]) do |file| + @file = file + @table = build_table("int32" => build_int32_array([nil, -2, 9])) + writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) + chunk_size = 1024 + writer.write_table(@table, chunk_size) + writer.close + reader = Parquet::ArrowFileReader.new(@file.path) + begin + @statistics = + reader.metadata.get_row_group(0).get_column_chunk(0).statistics + yield + ensure + reader.unref + end + end end test("#min") do diff --git a/c_glib/test/parquet/test-int64-statistics.rb b/c_glib/test/parquet/test-int64-statistics.rb index 0a014573c1144..81fce8a0bbbbd 100644 --- a/c_glib/test/parquet/test-int64-statistics.rb +++ b/c_glib/test/parquet/test-int64-statistics.rb @@ -20,15 +20,23 @@ class TestParquetInt64Statistics < Test::Unit::TestCase def setup omit("Parquet is required") unless defined?(::Parquet) - @file = Tempfile.open(["data", ".parquet"]) - array = build_int64_array([nil, -(2 ** 32), 2 ** 32]) - @table = build_table("int64" => array) - writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) - chunk_size = 1024 - writer.write_table(@table, chunk_size) - writer.close - reader = Parquet::ArrowFileReader.new(@file.path) - @statistics = 
reader.metadata.get_row_group(0).get_column_chunk(0).statistics + Tempfile.create(["data", ".parquet"]) do |file| + @file = file + array = build_int64_array([nil, -(2 ** 32), 2 ** 32]) + @table = build_table("int64" => array) + writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) + chunk_size = 1024 + writer.write_table(@table, chunk_size) + writer.close + reader = Parquet::ArrowFileReader.new(@file.path) + begin + @statistics = + reader.metadata.get_row_group(0).get_column_chunk(0).statistics + yield + ensure + reader.unref + end + end end test("#min") do diff --git a/c_glib/test/parquet/test-row-group-metadata.rb b/c_glib/test/parquet/test-row-group-metadata.rb index e68cb9d11ee62..f238dd3b5774e 100644 --- a/c_glib/test/parquet/test-row-group-metadata.rb +++ b/c_glib/test/parquet/test-row-group-metadata.rb @@ -20,35 +20,46 @@ class TestParquetRowGroupMetadata < Test::Unit::TestCase def setup omit("Parquet is required") unless defined?(::Parquet) - @file = Tempfile.open(["data", ".parquet"]) - @string_array = build_string_array([nil, "hello"]) - fields = [ - Arrow::Field.new("int8", Arrow::Int8DataType.new), - Arrow::Field.new("boolean", Arrow::BooleanDataType.new), - ] - structs = [ - { - "int8" => -29, - "boolean" => true, - }, - nil, - ] - @struct_array = build_struct_array(fields, structs) - @table = build_table("string" => @string_array, - "struct" => @struct_array) - writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) - chunk_size = 1 - writer.write_table(@table, chunk_size) - writer.close - reader = Parquet::ArrowFileReader.new(@file.path) - @metadata = reader.metadata.get_row_group(0) + Tempfile.create(["data", ".parquet"]) do |file| + @file = file + @string_array = build_string_array([nil, "hello"]) + fields = [ + Arrow::Field.new("int8", Arrow::Int8DataType.new), + Arrow::Field.new("boolean", Arrow::BooleanDataType.new), + ] + structs = [ + { + "int8" => -29, + "boolean" => true, + }, + nil, + ] + @struct_array = 
build_struct_array(fields, structs) + @table = build_table("string" => @string_array, + "struct" => @struct_array) + writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) + chunk_size = 1 + writer.write_table(@table, chunk_size) + writer.close + reader = Parquet::ArrowFileReader.new(@file.path) + begin + @metadata = reader.metadata.get_row_group(0) + yield + ensure + reader.unref + end + end end test("#==") do reader = Parquet::ArrowFileReader.new(@file.path) - other_metadata = reader.metadata.get_row_group(0) - assert do - @metadata == other_metadata + begin + other_metadata = reader.metadata.get_row_group(0) + assert do + @metadata == other_metadata + end + ensure + reader.unref end end diff --git a/c_glib/test/parquet/test-statistics.rb b/c_glib/test/parquet/test-statistics.rb index 0367084c88a49..09a47ac255927 100644 --- a/c_glib/test/parquet/test-statistics.rb +++ b/c_glib/test/parquet/test-statistics.rb @@ -20,22 +20,34 @@ class TestParquetStatistics < Test::Unit::TestCase def setup omit("Parquet is required") unless defined?(::Parquet) - @file = Tempfile.open(["data", ".parquet"]) - @table = build_table("int32" => build_int32_array([nil, 2, 2, 9])) - writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) - chunk_size = 1024 - writer.write_table(@table, chunk_size) - writer.close - reader = Parquet::ArrowFileReader.new(@file.path) - @statistics = reader.metadata.get_row_group(0).get_column_chunk(0).statistics + Tempfile.create(["data", ".parquet"]) do |file| + @file = file + @table = build_table("int32" => build_int32_array([nil, 2, 2, 9])) + writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) + chunk_size = 1024 + writer.write_table(@table, chunk_size) + writer.close + reader = Parquet::ArrowFileReader.new(@file.path) + begin + @statistics = + reader.metadata.get_row_group(0).get_column_chunk(0).statistics + yield + ensure + reader.unref + end + end end test("#==") do reader = Parquet::ArrowFileReader.new(@file.path) - 
other_statistics = - reader.metadata.get_row_group(0).get_column_chunk(0).statistics - assert do - @statistics == other_statistics + begin + other_statistics = + reader.metadata.get_row_group(0).get_column_chunk(0).statistics + assert do + @statistics == other_statistics + end + ensure + reader.unref end end diff --git a/c_glib/test/test-half-float-scalar.rb b/c_glib/test/test-half-float-scalar.rb index ac41f91ece621..3073d84d796cf 100644 --- a/c_glib/test/test-half-float-scalar.rb +++ b/c_glib/test/test-half-float-scalar.rb @@ -41,7 +41,7 @@ def test_equal end def test_to_s - assert_equal("[\n #{@half_float}\n]", @scalar.to_s) + assert_equal("1.0009765625", @scalar.to_s) end def test_value diff --git a/c_glib/tool/generate-version-header.py b/c_glib/tool/generate-version-header.py new file mode 100755 index 0000000000000..7422432251ff1 --- /dev/null +++ b/c_glib/tool/generate-version-header.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + +import argparse +from io import TextIOBase +from pathlib import Path +import re + + +def main(): + parser = argparse.ArgumentParser( + description="Generate C header with version macros") + parser.add_argument( + "--library", + required=True, + help="The library name to use in macro prefixes") + parser.add_argument( + "--version", + required=True, + help="The library version number") + parser.add_argument( + "--input", + type=Path, + required=True, + help="Path to the input template file") + parser.add_argument( + "--output", + type=Path, + required=True, + help="Path to the output file to generate") + + args = parser.parse_args() + + with open(args.input, "r", encoding="utf-8") as input_file, \ + open(args.output, "w", encoding="utf-8") as output_file: + write_header( + input_file, output_file, args.library, args.version) + + +def write_header( + input_file: TextIOBase, + output_file: TextIOBase, + library_name: str, + version: str): + if "-" in version: + version, version_tag = version.split("-") + else: + version_tag = "" + version_major, version_minor, version_micro = [int(v) for v in version.split(".")] + + encoded_versions = generate_encoded_versions(library_name) + visibility_macros = generate_visibility_macros(library_name) + availability_macros = generate_availability_macros(library_name) + + replacements = { + "VERSION_MAJOR": str(version_major), + "VERSION_MINOR": str(version_minor), + "VERSION_MICRO": str(version_micro), + "VERSION_TAG": version_tag, + "ENCODED_VERSIONS": encoded_versions, + "VISIBILITY_MACROS": visibility_macros, + "AVAILABILITY_MACROS": availability_macros, + } + + output_file.write(re.sub( + r"@([A-Z_]+)@", lambda match: replacements[match[1]], input_file.read())) + + +def generate_visibility_macros(library: str) -> str: + return f"""#if (defined(_WIN32) || defined(__CYGWIN__)) && defined(_MSC_VER) && \ + !defined({library}_STATIC_COMPILATION) +# define {library}_EXPORT __declspec(dllexport) +# define {library}_IMPORT 
__declspec(dllimport) +#else +# define {library}_EXPORT +# define {library}_IMPORT +#endif + +#ifdef {library}_COMPILATION +# define {library}_API {library}_EXPORT +#else +# define {library}_API {library}_IMPORT +#endif + +#define {library}_EXTERN {library}_API extern""" + + +def generate_encoded_versions(library: str) -> str: + macros = [] + + for major_version, minor_version in ALL_VERSIONS: + macros.append(f"""/** + * {library}_VERSION_{major_version}_{minor_version}: + * + * You can use this macro value for compile time API version check. + * + * Since: {major_version}.{minor_version}.0 + */ +#define {library}_VERSION_{major_version}_{minor_version} G_ENCODE_VERSION({major_version}, {minor_version})""") # noqa: E501 + + return "\n\n".join(macros) + + +def generate_availability_macros(library: str) -> str: + macros = [f"""#define {library}_AVAILABLE_IN_ALL {library}_EXTERN"""] + + for major_version, minor_version in ALL_VERSIONS: + macros.append(f"""#if {library}_VERSION_MIN_REQUIRED >= {library}_VERSION_{major_version}_{minor_version} +# define {library}_DEPRECATED_IN_{major_version}_{minor_version} {library}_DEPRECATED +# define {library}_DEPRECATED_IN_{major_version}_{minor_version}_FOR(function) {library}_DEPRECATED_FOR(function) +#else +# define {library}_DEPRECATED_IN_{major_version}_{minor_version} +# define {library}_DEPRECATED_IN_{major_version}_{minor_version}_FOR(function) +#endif + +#if {library}_VERSION_MAX_ALLOWED < {library}_VERSION_{major_version}_{minor_version} +# define {library}_AVAILABLE_IN_{major_version}_{minor_version} {library}_EXTERN {library}_UNAVAILABLE({major_version}, {minor_version}) +#else +# define {library}_AVAILABLE_IN_{major_version}_{minor_version} {library}_EXTERN +#endif""") # noqa: E501 + + return "\n\n".join(macros) + + +ALL_VERSIONS = [ + (17, 0), + (16, 0), + (15, 0), + (14, 0), + (13, 0), + (12, 0), + (11, 0), + (10, 0), + (9, 0), + (8, 0), + (7, 0), + (6, 0), + (5, 0), + (4, 0), + (3, 0), + (2, 0), + (1, 0), + (0, 
17), + (0, 16), + (0, 15), + (0, 14), + (0, 13), + (0, 12), + (0, 11), + (0, 10), +] + + +if __name__ == '__main__': + main() diff --git a/c_glib/vcpkg.json b/c_glib/vcpkg.json new file mode 100644 index 0000000000000..e88d2b8fe30d5 --- /dev/null +++ b/c_glib/vcpkg.json @@ -0,0 +1,9 @@ +{ + "name": "arrow-glib", + "version-string": "17.0.0-SNAPSHOT", + "dependencies": [ + "glib", + "gobject-introspection", + "pkgconf" + ] +} diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 8cfa67c437264..f688fbb63a9ad 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -129,7 +129,6 @@ set PYARROW_WITH_ORC=%ARROW_ORC% set PYARROW_WITH_PARQUET=ON set PYARROW_WITH_PARQUET_ENCRYPTION=ON set PYARROW_WITH_S3=%ARROW_S3% -set PYARROW_WITH_STATIC_BOOST=ON set PYARROW_WITH_SUBSTRAIT=ON set ARROW_HOME=%CONDA_PREFIX%\Library diff --git a/ci/appveyor-cpp-setup.bat b/ci/appveyor-cpp-setup.bat index 5c4a11832d5ee..5a9dffa166fb7 100644 --- a/ci/appveyor-cpp-setup.bat +++ b/ci/appveyor-cpp-setup.bat @@ -66,6 +66,9 @@ set CONDA_PACKAGES=%CONDA_PACKAGES% --file=ci\conda_env_cpp.txt @rem Force conda to use conda-forge conda config --add channels conda-forge conda config --remove channels defaults +@rem Ensure using the latest information. If there are invalid caches, +@rem mamba may use invalid download URL. 
+mamba clean --all -y @rem Arrow conda environment mamba create -n arrow -y -c conda-forge ^ --file=ci\conda_env_python.txt ^ diff --git a/ci/conan/all/conan_cmake_project_include.cmake b/ci/conan/all/conan_cmake_project_include.cmake new file mode 100644 index 0000000000000..a6dee0c43461c --- /dev/null +++ b/ci/conan/all/conan_cmake_project_include.cmake @@ -0,0 +1,35 @@ +# MIT License +# +# Copyright (c) 2019 Conan.io +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +if(ARROW_S3) + find_package(AWSSDK REQUIRED) + # Fix issue where scripts expect a variable called "AWSSDK_LINK_LIBRARIES" + # which is not defined by the generated AWSSDKConfig.cmake + if(NOT DEFINED AWSSDK_LINK_LIBRARIES) + set(AWSSDK_LINK_LIBRARIES "${AWSSDK_LIBRARIES}") + endif() + + # Causes logic used for generated .pc file to not run + # avoiding instropection of target `aws-cpp-sdk::aws-cpp-sdk` + # This is fine because the generated .pc file is not of use + set(AWSSDK_SOURCE "conan") +endif() diff --git a/ci/conan/all/conandata.yml b/ci/conan/all/conandata.yml index 7402272a4b366..fb75f3995c62e 100644 --- a/ci/conan/all/conandata.yml +++ b/ci/conan/all/conandata.yml @@ -21,6 +21,30 @@ # SOFTWARE. sources: + "15.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-15.0.0/apache-arrow-15.0.0.tar.gz?action=download" + sha256: "01dd3f70e85d9b5b933ec92c0db8a4ef504a5105f78d2d8622e84279fb45c25d" + "14.0.2": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-14.0.2/apache-arrow-14.0.2.tar.gz?action=download" + sha256: "1304dedb41896008b89fe0738c71a95d9b81752efc77fa70f264cb1da15d9bc2" + "14.0.1": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-14.0.1/apache-arrow-14.0.1.tar.gz?action=download" + sha256: "5c70eafb1011f9d124bafb328afe54f62cc5b9280b7080e1e3d668f78c0e407e" + "14.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-14.0.0/apache-arrow-14.0.0.tar.gz?action=download" + sha256: "4eb0da50ec071baf15fc163cb48058931e006f1c862c8def0e180fd07d531021" + "13.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-13.0.0/apache-arrow-13.0.0.tar.gz?action=download" + sha256: "35dfda191262a756be934eef8afee8d09762cad25021daa626eb249e251ac9e6" + "12.0.1": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-12.0.1/apache-arrow-12.0.1.tar.gz?action=download" + sha256: "3481c411393aa15c75e88d93cf8315faf7f43e180fe0790128d3840d417de858" + "12.0.0": + url: 
"https://www.apache.org/dyn/closer.lua/arrow/arrow-12.0.0/apache-arrow-12.0.0.tar.gz?action=download" + sha256: "ddd8347882775e53af7d0965a1902b7d8fcd0a030fd14f783d4f85e821352d52" + "11.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-11.0.0/apache-arrow-11.0.0.tar.gz?action=download" + sha256: "2dd8f0ea0848a58785628ee3a57675548d509e17213a2f5d72b0d900b43f5430" "10.0.1": url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-10.0.1/apache-arrow-10.0.1.tar.gz?action=download" sha256: "c814e0670112a22c1a6ec03ab420a52ae236a9a42e9e438c3cbd37f37e658fb3" @@ -36,12 +60,6 @@ sources: "7.0.0": url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-7.0.0/apache-arrow-7.0.0.tar.gz?action=download" sha256: "e8f49b149a15ecef4e40fcfab1b87c113c6b1ee186005c169e5cdf95d31a99de" - "2.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-2.0.0/apache-arrow-2.0.0.tar.gz?action=download" - sha256: "be0342cc847bb340d86aeaef43596a0b6c1dbf1ede9c789a503d939e01c71fbe" - "1.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-1.0.0/apache-arrow-1.0.0.tar.gz?action=download" - sha256: "86ddb9feb48203a5aaf9cc4f2827525e20a2ca4d7239e492af17e74532ccf243" patches: "8.0.1": - patch_file: "patches/8.0.0-0005-install-utils.patch" @@ -64,23 +82,3 @@ patches: - patch_file: "patches/7.0.0-0007-fix-cmake.patch" patch_description: "use cci package" patch_type: "conan" - "2.0.0": - - patch_file: "patches/2.0.0-0003-fix-shared-msvc.patch" - patch_description: "make shared enabled in msvc" - patch_type: "official" - - patch_file: "patches/2.0.0-0005-gandiva-engine.patch" - patch_description: "fix grandiva compilation error" - patch_type: "official" - - patch_file: "patches/2.0.0-0008-fix-cmake.patch" - patch_description: "use cci package" - patch_type: "conan" - "1.0.0": - - patch_file: "patches/1.0.0-0003-fix-shared-msvc.patch" - patch_description: "make shared enabled in msvc" - patch_type: "official" - - patch_file: "patches/1.0.0-0005-fix-make12-namespace.patch" - 
patch_description: "fix ambiguous `make12` function between std and date" - patch_type: "official" - - patch_file: "patches/1.0.0-0006-fix-cmake.patch" - patch_description: "use cci package" - patch_type: "conan" diff --git a/ci/conan/all/conanfile.py b/ci/conan/all/conanfile.py index 7e87f82e7e018..178cd03da1555 100644 --- a/ci/conan/all/conanfile.py +++ b/ci/conan/all/conanfile.py @@ -21,12 +21,12 @@ # SOFTWARE. from conan import ConanFile -from conan.errors import ConanInvalidConfiguration -from conan.tools.microsoft import is_msvc_static_runtime, is_msvc, check_min_vs -from conan.tools.files import export_conandata_patches, apply_conandata_patches, get, copy, rmdir +from conan.errors import ConanInvalidConfiguration, ConanException from conan.tools.build import check_min_cppstd, cross_building -from conan.tools.scm import Version from conan.tools.cmake import CMake, CMakeDeps, CMakeToolchain, cmake_layout +from conan.tools.files import apply_conandata_patches, copy, export_conandata_patches, get, rmdir +from conan.tools.microsoft import is_msvc, is_msvc_static_runtime +from conan.tools.scm import Version import os import glob @@ -39,7 +39,8 @@ class ArrowConan(ConanFile): license = ("Apache-2.0",) url = "https://github.com/conan-io/conan-center-index" homepage = "https://arrow.apache.org/" - topics = ("memory", "gandiva", "parquet", "skyhook", "plasma", "hdfs", "csv", "cuda", "gcs", "json", "hive", "s3", "grpc") + topics = ("memory", "gandiva", "parquet", "skyhook", "acero", "hdfs", "csv", "cuda", "gcs", "json", "hive", "s3", "grpc") + package_type = "library" settings = "os", "arch", "compiler", "build_type" options = { "shared": [True, False], @@ -48,15 +49,15 @@ class ArrowConan(ConanFile): "parquet": ["auto", True, False], "substrait": [True, False], "skyhook": [True, False], - "plasma": [True, False], + "acero": [True, False], "cli": [True, False], "compute": ["auto", True, False], - "acero": ["auto", True, False], "dataset_modules": ["auto", True, False], 
"deprecated": [True, False], "encryption": [True, False], "filesystem_layer": [True, False], "hdfs_bridgs": [True, False], + "plasma": [True, False, "deprecated"], "simd_level": [None, "default", "sse4_2", "avx2", "avx512", "neon", ], "runtime_simd_level": [None, "sse4_2", "avx2", "avx512", "max"], "with_backtrace": [True, False], @@ -70,8 +71,9 @@ class ArrowConan(ConanFile): "with_glog": ["auto", True, False], "with_grpc": ["auto", True, False], "with_jemalloc": ["auto", True, False], - "with_mimalloc": ["auto", True, False], + "with_mimalloc": [True, False], "with_json": [True, False], + "with_thrift": ["auto", True, False], "with_llvm": ["auto", True, False], "with_openssl": ["auto", True, False], "with_opentelemetry": [True, False], @@ -91,43 +93,44 @@ class ArrowConan(ConanFile): "shared": False, "fPIC": True, "gandiva": False, - "parquet": "auto", + "parquet": False, "skyhook": False, "substrait": False, - "plasma": False, + "acero": False, "cli": False, - "compute": "auto", - "acero": "auto", - "dataset_modules": "auto", + "compute": False, + "dataset_modules": False, "deprecated": True, "encryption": False, "filesystem_layer": False, "hdfs_bridgs": False, + "plasma": "deprecated", "simd_level": "default", "runtime_simd_level": "max", "with_backtrace": False, - "with_boost": "auto", + "with_boost": False, "with_brotli": False, "with_bz2": False, "with_csv": False, "with_cuda": False, - "with_flight_rpc": "auto", + "with_flight_rpc": False, "with_flight_sql": False, "with_gcs": False, - "with_gflags": "auto", - "with_jemalloc": "auto", + "with_gflags": False, + "with_jemalloc": False, "with_mimalloc": False, - "with_glog": "auto", - "with_grpc": "auto", + "with_glog": False, + "with_grpc": False, "with_json": False, - "with_llvm": "auto", - "with_openssl": "auto", + "with_thrift": False, + "with_llvm": False, + "with_openssl": False, "with_opentelemetry": False, "with_orc": False, - "with_protobuf": "auto", - "with_re2": "auto", + "with_protobuf": False, + 
"with_re2": False, "with_s3": False, - "with_utf8proc": "auto", + "with_utf8proc": False, "with_lz4": False, "with_snappy": False, "with_zlib": False, @@ -136,283 +139,147 @@ class ArrowConan(ConanFile): short_paths = True @property - def _minimum_cpp_standard(self): + def _min_cppstd(self): # arrow >= 10.0.0 requires C++17. # https://github.com/apache/arrow/pull/13991 - return 11 if Version(self.version) < "10.0.0" else 17 + return "11" if Version(self.version) < "10.0.0" else "17" @property def _compilers_minimum_version(self): return { - "gcc": "8", - "clang": "7", - "apple-clang": "10", - } + "11": { + "clang": "3.9", + }, + "17": { + "gcc": "8", + "clang": "7", + "apple-clang": "10", + "Visual Studio": "15", + "msvc": "191", + }, + }.get(self._min_cppstd, {}) def export_sources(self): export_conandata_patches(self) + copy(self, "conan_cmake_project_include.cmake", self.recipe_folder, os.path.join(self.export_sources_folder, "src")) def config_options(self): if self.settings.os == "Windows": del self.options.fPIC - if Version(self.version) < "2.0.0": - del self.options.simd_level - del self.options.runtime_simd_level - elif Version(self.version) < "6.0.0": - self.options.simd_level = "sse4_2" - if Version(self.version) < "6.0.0": - del self.options.with_gcs - if Version(self.version) < "7.0.0": - del self.options.skyhook - del self.options.with_flight_sql - del self.options.with_opentelemetry if Version(self.version) < "8.0.0": del self.options.substrait + if is_msvc(self): + self.options.with_boost = True def configure(self): if self.options.shared: self.options.rm_safe("fPIC") - def validate(self): - if self.info.settings.compiler.cppstd: - check_min_cppstd(self, self._minimum_cpp_standard) - - if self._minimum_cpp_standard == 11: - if self.info.settings.compiler == "clang" and self.info.settings.compiler.version <= Version("3.9"): - raise ConanInvalidConfiguration("This recipe does not support this compiler version") - else: - check_min_vs(self, 191) - if 
not is_msvc(self): - minimum_version = self._compilers_minimum_version.get(str(self.info.settings.compiler), False) - if minimum_version and Version(self.info.settings.compiler.version) < minimum_version: - raise ConanInvalidConfiguration( - f"{self.ref} requires C++{self._minimum_cpp_standard}, which your compiler does not support." - ) - - if self.options.shared: - del self.options.fPIC - if self.options.compute == False and not self._compute(True): - raise ConanInvalidConfiguration("compute options is required (or choose auto)") - if self.options.acero == False and not self._acero(True): - raise ConanInvalidConfiguration("acero options is required (or choose auto)") - if self.options.parquet == False and self._parquet(True): - raise ConanInvalidConfiguration("parquet options is required (or choose auto)") - if self.options.dataset_modules == False and self._dataset_modules(True): - raise ConanInvalidConfiguration("dataset_modules options is required (or choose auto)") - if self.options.get_safe("skyhook", False): - raise ConanInvalidConfiguration("CCI has no librados recipe (yet)") - if self.options.with_jemalloc == False and self._with_jemalloc(True): - raise ConanInvalidConfiguration("with_jemalloc option is required (or choose auto)") - if self.options.with_re2 == False and self._with_re2(True): - raise ConanInvalidConfiguration("with_re2 option is required (or choose auto)") - if self.options.with_protobuf == False and self._with_protobuf(True): - raise ConanInvalidConfiguration("with_protobuf option is required (or choose auto)") - if self.options.with_gflags == False and self._with_gflags(True): - raise ConanInvalidConfiguration("with_gflags options is required (or choose auto)") - if self.options.with_flight_rpc == False and self._with_flight_rpc(True): - raise ConanInvalidConfiguration("with_flight_rpc options is required (or choose auto)") - if self.options.with_grpc == False and self._with_grpc(True): - raise ConanInvalidConfiguration("with_grpc 
options is required (or choose auto)") - if self.options.with_boost == False and self._with_boost(True): - raise ConanInvalidConfiguration("with_boost options is required (or choose auto)") - if self.options.with_openssl == False and self._with_openssl(True): - raise ConanInvalidConfiguration("with_openssl options is required (or choose auto)") - if self.options.with_llvm == False and self._with_llvm(True): - raise ConanInvalidConfiguration("with_llvm options is required (or choose auto)") - if self.options.with_cuda: - raise ConanInvalidConfiguration("CCI has no cuda recipe (yet)") - if self.options.with_orc: - raise ConanInvalidConfiguration("CCI has no orc recipe (yet)") - if self.options.with_s3 and not self.options["aws-sdk-cpp"].config: - raise ConanInvalidConfiguration("arrow:with_s3 requires aws-sdk-cpp:config is True.") - - if self.options.shared and self._with_jemalloc(): - if self.options["jemalloc"].enable_cxx: - raise ConanInvalidConfiguration("jemmalloc.enable_cxx of a static jemalloc must be disabled") - - if Version(self.version) < "6.0.0" and self.options.get_safe("simd_level") == "default": - raise ConanInvalidConfiguration(f"In {self.ref}, simd_level options is not supported `default` value.") - def layout(self): cmake_layout(self, src_folder="src") - def _compute(self, required=False): - if required or self.options.compute == "auto": - return bool(self._parquet()) or bool(self._acero()) - else: - return bool(self.options.compute) - - def _acero(self, required=False): - if required or self.options.acero == "auto": - return bool(self._dataset_modules()) - else: - return bool(self.options.acero) - - def _parquet(self, required=False): - if required or self.options.parquet == "auto": - return bool(self.options.get_safe("substrait", False)) - else: - return bool(self.options.parquet) - - def _plasma(self, required=False): - if Version(self.version) >= "12.0.0": - return False - else: - return required or self.options.plasma - - def 
_dataset_modules(self, required=False): - if required or self.options.dataset_modules == "auto": - return bool(self.options.get_safe("substrait", False)) - else: - return bool(self.options.dataset_modules) - - def _with_jemalloc(self, required=False): - if required or self.options.with_jemalloc == "auto": - return bool("BSD" in str(self.settings.os)) - else: - return bool(self.options.with_jemalloc) - - def _with_re2(self, required=False): - if required or self.options.with_re2 == "auto": - if self.options.gandiva or self.options.parquet: - return True - if Version(self) >= "7.0.0" and (self._compute() or self._dataset_modules()): - return True - return False - else: - return bool(self.options.with_re2) - - def _with_protobuf(self, required=False): - if required or self.options.with_protobuf == "auto": - return bool(self.options.gandiva or self._with_flight_rpc() or self.options.with_orc or self.options.get_safe("substrait", False)) - else: - return bool(self.options.with_protobuf) - - def _with_flight_rpc(self, required=False): - if required or self.options.with_flight_rpc == "auto": - return bool(self.options.get_safe("with_flight_sql", False)) - else: - return bool(self.options.with_flight_rpc) - - def _with_gflags(self, required=False): - if required or self.options.with_gflags == "auto": - return bool(self._plasma() or self._with_glog() or self._with_grpc()) - else: - return bool(self.options.with_gflags) - - def _with_glog(self, required=False): - if required or self.options.with_glog == "auto": - return False - else: - return bool(self.options.with_glog) - - def _with_grpc(self, required=False): - if required or self.options.with_grpc == "auto": - return self._with_flight_rpc() - else: - return bool(self.options.with_grpc) - - def _with_boost(self, required=False): - if required or self.options.with_boost == "auto": - if self.options.gandiva: - return True - version = Version(self.version) - if version.major == "1": - if self._parquet() and 
self.settings.compiler == "gcc" and self.settings.compiler.version < Version("4.9"): - return True - elif version.major >= "2": - if is_msvc(self): - return True - return False - else: - return bool(self.options.with_boost) - - def _with_thrift(self, required=False): - # No self.options.with_thrift exists - return bool(required or self._parquet()) - - def _with_utf8proc(self, required=False): - if required or self.options.with_utf8proc == "auto": - return bool(self._compute() or self.options.gandiva) - else: - return bool(self.options.with_utf8proc) - - def _with_llvm(self, required=False): - if required or self.options.with_llvm == "auto": - return bool(self.options.gandiva) - else: - return bool(self.options.with_llvm) - - def _with_openssl(self, required=False): - if required or self.options.with_openssl == "auto": - return bool(self.options.encryption or self._with_flight_rpc() or self.options.with_s3) - else: - return bool(self.options.with_openssl) - - def _with_rapidjson(self): - if self.options.with_json: - return True - if Version(self.version) >= "7.0.0" and self.options.encryption: - return True - return False + def _requires_rapidjson(self): + return self.options.with_json or self.options.encryption def requirements(self): - if self._with_thrift(): - self.requires("zlib/1.2.13") + if self.options.with_thrift: self.requires("thrift/0.17.0") - if self._with_protobuf(): - self.requires("protobuf/3.21.4") - if self._with_jemalloc(): + if self.options.with_protobuf: + self.requires("protobuf/3.21.9") + if self.options.with_jemalloc: self.requires("jemalloc/5.3.0") if self.options.with_mimalloc: self.requires("mimalloc/1.7.6") - if self._with_boost(): - self.requires("boost/1.80.0") - if self._with_gflags(): + if self.options.with_boost: + self.requires("boost/1.84.0") + if self.options.with_gflags: self.requires("gflags/2.2.2") - if self._with_glog(): + if self.options.with_glog: self.requires("glog/0.6.0") if self.options.get_safe("with_gcs"): 
self.requires("google-cloud-cpp/1.40.1") - if self._with_grpc(): + if self.options.with_grpc: self.requires("grpc/1.50.0") - if self._with_rapidjson(): + if self._requires_rapidjson(): self.requires("rapidjson/1.1.0") - if self._with_llvm(): + if self.options.with_llvm: self.requires("llvm-core/13.0.0") - if self._with_openssl(): + if self.options.with_openssl: # aws-sdk-cpp requires openssl/1.1.1. it uses deprecated functions in openssl/3.0.0 if self.options.with_s3: - self.requires("openssl/1.1.1s") + self.requires("openssl/1.1.1w") else: - self.requires("openssl/1.1.1s") + self.requires("openssl/[>=1.1 <4]") if self.options.get_safe("with_opentelemetry"): self.requires("opentelemetry-cpp/1.7.0") if self.options.with_s3: self.requires("aws-sdk-cpp/1.9.234") if self.options.with_brotli: - self.requires("brotli/1.0.9") + self.requires("brotli/1.1.0") if self.options.with_bz2: self.requires("bzip2/1.0.8") if self.options.with_lz4: self.requires("lz4/1.9.4") if self.options.with_snappy: self.requires("snappy/1.1.9") - if Version(self.version) >= "6.0.0" and \ - self.options.get_safe("simd_level") != None or \ + if self.options.get_safe("simd_level") != None or \ self.options.get_safe("runtime_simd_level") != None: self.requires("xsimd/9.0.1") if self.options.with_zlib: - self.requires("zlib/1.2.13") + self.requires("zlib/[>=1.2.11 <2]") if self.options.with_zstd: - self.requires("zstd/1.5.2") - if self._with_re2(): - self.requires("re2/20220601") - if self._with_utf8proc(): + self.requires("zstd/1.5.5") + if self.options.with_re2: + self.requires("re2/20230301") + if self.options.with_utf8proc: self.requires("utf8proc/2.8.0") if self.options.with_backtrace: self.requires("libbacktrace/cci.20210118") + def validate(self): + # Do not allow options with 'auto' value + # TODO: Remove "auto" from the possible values for these options + auto_options = [option for option, value in self.options.items() if value == "auto"] + if auto_options: + raise ConanException("Options 
with value 'auto' are deprecated. Please set them true/false or use its default value." + f" Please change the following options: {auto_options}") + + # From https://github.com/conan-io/conan-center-index/pull/23163#issuecomment-2039808851 + if self.options.gandiva: + if not self.options.with_re2: + raise ConanException("'with_re2' option should be True when'gandiva=True'") + if not self.options.with_boost: + raise ConanException("'with_boost' option should be True when'gandiva=True'") + if not self.options.with_utf8proc: + raise ConanException("'with_utf8proc' option should be True when'gandiva=True'") + + if self.settings.compiler.get_safe("cppstd"): + check_min_cppstd(self, self._min_cppstd) + + minimum_version = self._compilers_minimum_version.get(str(self.settings.compiler), False) + if minimum_version and Version(self.settings.compiler.version) < minimum_version: + raise ConanInvalidConfiguration( + f"{self.ref} requires C++{self._min_cppstd}, which your compiler does not support." 
+ ) + + if self.options.get_safe("skyhook", False): + raise ConanInvalidConfiguration("CCI has no librados recipe (yet)") + if self.options.with_cuda: + raise ConanInvalidConfiguration("CCI has no cuda recipe (yet)") + if self.options.with_orc: + raise ConanInvalidConfiguration("CCI has no orc recipe (yet)") + if self.options.with_s3 and not self.dependencies["aws-sdk-cpp"].options.config: + raise ConanInvalidConfiguration("arrow:with_s3 requires aws-sdk-cpp:config is True.") + + if self.options.shared and self.options.with_jemalloc: + if self.dependencies["jemalloc"].options.enable_cxx: + raise ConanInvalidConfiguration("jemmalloc.enable_cxx of a static jemalloc must be disabled") + + + def build_requirements(self): + if Version(self.version) >= "13.0.0": + self.tool_requires("cmake/[>=3.16 <4]") + def source(self): # START # This block should be removed when we update upstream: @@ -435,17 +302,15 @@ def source(self): return # END get(self, **self.conan_data["sources"][self.version], - filename=f"apache-arrow-{self.version}.tar.gz", destination=self.source_folder, strip_root=True) + filename=f"apache-arrow-{self.version}.tar.gz", strip_root=True) def generate(self): - # BUILD_SHARED_LIBS and POSITION_INDEPENDENT_CODE are automatically parsed when self.options.shared or self.options.fPIC exist tc = CMakeToolchain(self) if cross_building(self): cmake_system_processor = { "armv8": "aarch64", "armv8.3": "aarch64", }.get(str(self.settings.arch), str(self.settings.arch)) - tc.variables["CMAKE_SYSTEM_PROCESSOR"] = cmake_system_processor if cmake_system_processor == "aarch64": tc.variables["ARROW_CPU_FLAG"] = "armv8" if is_msvc(self): @@ -453,12 +318,10 @@ def generate(self): tc.variables["ARROW_DEPENDENCY_SOURCE"] = "SYSTEM" tc.variables["ARROW_PACKAGE_KIND"] = "conan" # See https://github.com/conan-io/conan-center-index/pull/14903/files#r1057938314 for details tc.variables["ARROW_GANDIVA"] = bool(self.options.gandiva) - tc.variables["ARROW_PARQUET"] = self._parquet() + 
tc.variables["ARROW_PARQUET"] = self.options.parquet tc.variables["ARROW_SUBSTRAIT"] = bool(self.options.get_safe("substrait", False)) - if Version(self.version) < "12.0.0": - tc.variables["ARROW_PLASMA"] = bool(self._plasma()) - tc.variables["ARROW_ACERO"] = self._acero() - tc.variables["ARROW_DATASET"] = self._dataset_modules() + tc.variables["ARROW_ACERO"] = bool(self.options.acero) + tc.variables["ARROW_DATASET"] = self.options.dataset_modules tc.variables["ARROW_FILESYSTEM"] = bool(self.options.filesystem_layer) tc.variables["PARQUET_REQUIRE_ENCRYPTION"] = bool(self.options.encryption) tc.variables["ARROW_HDFS"] = bool(self.options.hdfs_bridgs) @@ -466,12 +329,12 @@ def generate(self): tc.variables["ARROW_BUILD_SHARED"] = bool(self.options.shared) tc.variables["ARROW_BUILD_STATIC"] = not bool(self.options.shared) tc.variables["ARROW_NO_DEPRECATED_API"] = not bool(self.options.deprecated) - tc.variables["ARROW_FLIGHT"] = self._with_flight_rpc() + tc.variables["ARROW_FLIGHT"] = self.options.with_flight_rpc tc.variables["ARROW_FLIGHT_SQL"] = bool(self.options.get_safe("with_flight_sql", False)) - tc.variables["ARROW_COMPUTE"] = self._compute() + tc.variables["ARROW_COMPUTE"] = bool(self.options.compute) tc.variables["ARROW_CSV"] = bool(self.options.with_csv) tc.variables["ARROW_CUDA"] = bool(self.options.with_cuda) - tc.variables["ARROW_JEMALLOC"] = self._with_jemalloc() + tc.variables["ARROW_JEMALLOC"] = self.options.with_jemalloc tc.variables["jemalloc_SOURCE"] = "SYSTEM" tc.variables["ARROW_MIMALLOC"] = bool(self.options.with_mimalloc) tc.variables["ARROW_JSON"] = bool(self.options.with_json) @@ -479,61 +342,58 @@ def generate(self): tc.variables["ARROW_GCS"] = bool(self.options.get_safe("with_gcs", False)) tc.variables["BOOST_SOURCE"] = "SYSTEM" tc.variables["Protobuf_SOURCE"] = "SYSTEM" - if self._with_protobuf(): - tc.variables["ARROW_PROTOBUF_USE_SHARED"] = bool(self.options["protobuf"].shared) + if self.options.with_protobuf: + 
tc.variables["ARROW_PROTOBUF_USE_SHARED"] = bool(self.dependencies["protobuf"].options.shared) tc.variables["gRPC_SOURCE"] = "SYSTEM" - if self._with_grpc(): - tc.variables["ARROW_GRPC_USE_SHARED"] = bool(self.options["grpc"].shared) + if self.options.with_grpc: + tc.variables["ARROW_GRPC_USE_SHARED"] = bool(self.dependencies["grpc"].options.shared) - tc.variables["ARROW_USE_GLOG"] = self._with_glog() + tc.variables["ARROW_USE_GLOG"] = self.options.with_glog tc.variables["GLOG_SOURCE"] = "SYSTEM" tc.variables["ARROW_WITH_BACKTRACE"] = bool(self.options.with_backtrace) tc.variables["ARROW_WITH_BROTLI"] = bool(self.options.with_brotli) tc.variables["brotli_SOURCE"] = "SYSTEM" if self.options.with_brotli: - tc.variables["ARROW_BROTLI_USE_SHARED"] = bool(self.options["brotli"].shared) + tc.variables["ARROW_BROTLI_USE_SHARED"] = bool(self.dependencies["brotli"].options.shared) tc.variables["gflags_SOURCE"] = "SYSTEM" - if self._with_gflags(): - tc.variables["ARROW_GFLAGS_USE_SHARED"] = bool(self.options["gflags"].shared) + if self.options.with_gflags: + tc.variables["ARROW_GFLAGS_USE_SHARED"] = bool(self.dependencies["gflags"].options.shared) tc.variables["ARROW_WITH_BZ2"] = bool(self.options.with_bz2) tc.variables["BZip2_SOURCE"] = "SYSTEM" if self.options.with_bz2: - tc.variables["ARROW_BZ2_USE_SHARED"] = bool(self.options["bzip2"].shared) + tc.variables["ARROW_BZ2_USE_SHARED"] = bool(self.dependencies["bzip2"].options.shared) tc.variables["ARROW_WITH_LZ4"] = bool(self.options.with_lz4) tc.variables["lz4_SOURCE"] = "SYSTEM" if self.options.with_lz4: - tc.variables["ARROW_LZ4_USE_SHARED"] = bool(self.options["lz4"].shared) + tc.variables["ARROW_LZ4_USE_SHARED"] = bool(self.dependencies["lz4"].options.shared) tc.variables["ARROW_WITH_SNAPPY"] = bool(self.options.with_snappy) tc.variables["RapidJSON_SOURCE"] = "SYSTEM" tc.variables["Snappy_SOURCE"] = "SYSTEM" if self.options.with_snappy: - tc.variables["ARROW_SNAPPY_USE_SHARED"] = bool(self.options["snappy"].shared) + 
tc.variables["ARROW_SNAPPY_USE_SHARED"] = bool(self.dependencies["snappy"].options.shared) tc.variables["ARROW_WITH_ZLIB"] = bool(self.options.with_zlib) tc.variables["re2_SOURCE"] = "SYSTEM" tc.variables["ZLIB_SOURCE"] = "SYSTEM" tc.variables["xsimd_SOURCE"] = "SYSTEM" tc.variables["ARROW_WITH_ZSTD"] = bool(self.options.with_zstd) - if Version(self.version) >= "2.0": - tc.variables["zstd_SOURCE"] = "SYSTEM" - tc.variables["ARROW_SIMD_LEVEL"] = str(self.options.simd_level).upper() - tc.variables["ARROW_RUNTIME_SIMD_LEVEL"] = str(self.options.runtime_simd_level).upper() - else: - tc.variables["ZSTD_SOURCE"] = "SYSTEM" + tc.variables["zstd_SOURCE"] = "SYSTEM" + tc.variables["ARROW_SIMD_LEVEL"] = str(self.options.simd_level).upper() + tc.variables["ARROW_RUNTIME_SIMD_LEVEL"] = str(self.options.runtime_simd_level).upper() if self.options.with_zstd: - tc.variables["ARROW_ZSTD_USE_SHARED"] = bool(self.options["zstd"].shared) + tc.variables["ARROW_ZSTD_USE_SHARED"] = bool(self.dependencies["zstd"].options.shared) tc.variables["ORC_SOURCE"] = "SYSTEM" - tc.variables["ARROW_WITH_THRIFT"] = self._with_thrift() + tc.variables["ARROW_WITH_THRIFT"] = bool(self.options.with_thrift) tc.variables["Thrift_SOURCE"] = "SYSTEM" - if self._with_thrift(): - tc.variables["THRIFT_VERSION"] = bool(self.deps_cpp_info["thrift"].version) # a recent thrift does not require boost - tc.variables["ARROW_THRIFT_USE_SHARED"] = bool(self.options["thrift"].shared) - tc.variables["ARROW_USE_OPENSSL"] = self._with_openssl() - if self._with_openssl(): - tc.variables["OPENSSL_ROOT_DIR"] = self.deps_cpp_info["openssl"].rootpath.replace("\\", "/") - tc.variables["ARROW_OPENSSL_USE_SHARED"] = bool(self.options["openssl"].shared) - if self._with_boost(): + if self.options.with_thrift: + tc.variables["THRIFT_VERSION"] = bool(self.dependencies["thrift"].ref.version) # a recent thrift does not require boost + tc.variables["ARROW_THRIFT_USE_SHARED"] = bool(self.dependencies["thrift"].options.shared) + 
tc.variables["ARROW_USE_OPENSSL"] = self.options.with_openssl + if self.options.with_openssl: + tc.variables["OPENSSL_ROOT_DIR"] = self.dependencies["openssl"].package_folder.replace("\\", "/") + tc.variables["ARROW_OPENSSL_USE_SHARED"] = bool(self.dependencies["openssl"].options.shared) + if self.options.with_boost: tc.variables["ARROW_USE_BOOST"] = True - tc.variables["ARROW_BOOST_USE_SHARED"] = bool(self.options["boost"].shared) + tc.variables["ARROW_BOOST_USE_SHARED"] = bool(self.dependencies["boost"].options.shared) tc.variables["ARROW_S3"] = bool(self.options.with_s3) tc.variables["AWSSDK_SOURCE"] = "SYSTEM" tc.variables["ARROW_BUILD_UTILITIES"] = bool(self.options.cli) @@ -544,16 +404,18 @@ def generate(self): tc.variables["ARROW_ENABLE_TIMING_TESTS"] = False tc.variables["ARROW_BUILD_BENCHMARKS"] = False tc.variables["LLVM_SOURCE"] = "SYSTEM" - tc.variables["ARROW_WITH_UTF8PROC"] = self._with_utf8proc() - tc.variables["ARROW_BOOST_REQUIRED"] = self._with_boost() + tc.variables["ARROW_WITH_UTF8PROC"] = self.options.with_utf8proc + tc.variables["ARROW_BOOST_REQUIRED"] = self.options.with_boost tc.variables["utf8proc_SOURCE"] = "SYSTEM" - if self._with_utf8proc(): - tc.variables["ARROW_UTF8PROC_USE_SHARED"] = bool(self.options["utf8proc"].shared) + if self.options.with_utf8proc: + tc.variables["ARROW_UTF8PROC_USE_SHARED"] = bool(self.dependencies["utf8proc"].options.shared) tc.variables["BUILD_WARNING_LEVEL"] = "PRODUCTION" if is_msvc(self): - tc.variables["ARROW_USE_STATIC_CRT"] = "MT" in str(self.settings.compiler.runtime) - if self._with_llvm(): - tc.variables["LLVM_DIR"] = self.deps_cpp_info["llvm-core"].rootpath.replace("\\", "/") + tc.variables["ARROW_USE_STATIC_CRT"] = is_msvc_static_runtime(self) + if self.options.with_llvm: + tc.variables["LLVM_DIR"] = self.dependencies["llvm-core"].package_folder.replace("\\", "/") + + tc.cache_variables["CMAKE_PROJECT_arrow_INCLUDE"] = os.path.join(self.source_folder, "conan_cmake_project_include.cmake") 
tc.generate() deps = CMakeDeps(self) @@ -561,10 +423,11 @@ def generate(self): def _patch_sources(self): apply_conandata_patches(self) - if "7.0.0" <= Version(self.version) < "10.0.0": + if Version(self.version) < "10.0.0": for filename in glob.glob(os.path.join(self.source_folder, "cpp", "cmake_modules", "Find*.cmake")): if os.path.basename(filename) not in [ "FindArrow.cmake", + "FindArrowAcero.cmake", "FindArrowCUDA.cmake", "FindArrowDataset.cmake", "FindArrowFlight.cmake", @@ -576,7 +439,6 @@ def _patch_sources(self): "FindArrowTesting.cmake", "FindGandiva.cmake", "FindParquet.cmake", - "FindPlasma.cmake", ]: os.remove(filename) @@ -596,129 +458,106 @@ def package(self): rmdir(self, os.path.join(self.package_folder, "lib", "pkgconfig")) rmdir(self, os.path.join(self.package_folder, "share")) - def _lib_name(self, name): - if is_msvc(self) and not self.options.shared: - return "{}_static".format(name) - else: - return "{}".format(name) - - def package_id(self): - self.info.options.with_gflags = self._with_gflags() - self.info.options.with_protobuf = self._with_protobuf() - self.info.options.with_re2 = self._with_re2() - self.info.options.with_jemalloc = self._with_jemalloc() - self.info.options.with_openssl = self._with_openssl() - self.info.options.with_boost = self._with_boost() - self.info.options.with_glog = self._with_glog() - self.info.options.with_grpc = self._with_grpc() - def package_info(self): - self.cpp_info.filenames["cmake_find_package"] = "Arrow" - self.cpp_info.filenames["cmake_find_package_multi"] = "Arrow" - self.cpp_info.components["libarrow"].libs = [self._lib_name("arrow")] - self.cpp_info.components["libarrow"].names["cmake_find_package"] = "arrow" - self.cpp_info.components["libarrow"].names["cmake_find_package_multi"] = "arrow" - self.cpp_info.components["libarrow"].names["pkg_config"] = "arrow" + # FIXME: fix CMake targets of components + + self.cpp_info.set_property("cmake_file_name", "Arrow") + + suffix = "_static" if is_msvc(self) and 
not self.options.shared else "" + + self.cpp_info.components["libarrow"].set_property("pkg_config_name", "arrow") + self.cpp_info.components["libarrow"].libs = [f"arrow{suffix}"] if not self.options.shared: self.cpp_info.components["libarrow"].defines = ["ARROW_STATIC"] if self.settings.os in ["Linux", "FreeBSD"]: self.cpp_info.components["libarrow"].system_libs = ["pthread", "m", "dl", "rt"] - if self._parquet(): - self.cpp_info.components["libparquet"].libs = [self._lib_name("parquet")] - self.cpp_info.components["libparquet"].names["cmake_find_package"] = "parquet" - self.cpp_info.components["libparquet"].names["cmake_find_package_multi"] = "parquet" - self.cpp_info.components["libparquet"].names["pkg_config"] = "parquet" + if self.options.parquet: + self.cpp_info.components["libparquet"].set_property("pkg_config_name", "parquet") + self.cpp_info.components["libparquet"].libs = [f"parquet{suffix}"] self.cpp_info.components["libparquet"].requires = ["libarrow"] if not self.options.shared: self.cpp_info.components["libparquet"].defines = ["PARQUET_STATIC"] - if self.options.get_safe("substrait", False): - self.cpp_info.components["libarrow_substrait"].libs = [self._lib_name("arrow_substrait")] - self.cpp_info.components["libarrow_substrait"].names["cmake_find_package"] = "arrow_substrait" - self.cpp_info.components["libarrow_substrait"].names["cmake_find_package_multi"] = "arrow_substrait" - self.cpp_info.components["libarrow_substrait"].names["pkg_config"] = "arrow_substrait" - self.cpp_info.components["libarrow_substrait"].requires = ["libparquet", "dataset", "acero"] + if self.options.get_safe("substrait"): + self.cpp_info.components["libarrow_substrait"].set_property("pkg_config_name", "arrow_substrait") + self.cpp_info.components["libarrow_substrait"].libs = [f"arrow_substrait{suffix}"] + self.cpp_info.components["libarrow_substrait"].requires = ["libparquet", "dataset"] + + # Plasma was deprecated in Arrow 12.0.0 + del self.options.plasma - if 
self._plasma(): - self.cpp_info.components["libplasma"].libs = [self._lib_name("plasma")] - self.cpp_info.components["libplasma"].names["cmake_find_package"] = "plasma" - self.cpp_info.components["libplasma"].names["cmake_find_package_multi"] = "plasma" - self.cpp_info.components["libplasma"].names["pkg_config"] = "plasma" - self.cpp_info.components["libplasma"].requires = ["libarrow"] + if self.options.acero: + self.cpp_info.components["libacero"].libs = [f"arrow_acero{suffix}"] + self.cpp_info.components["libacero"].names["cmake_find_package"] = "acero" + self.cpp_info.components["libacero"].names["cmake_find_package_multi"] = "acero" + self.cpp_info.components["libacero"].names["pkg_config"] = "acero" + self.cpp_info.components["libacero"].requires = ["libarrow"] if self.options.gandiva: - self.cpp_info.components["libgandiva"].libs = [self._lib_name("gandiva")] - self.cpp_info.components["libgandiva"].names["cmake_find_package"] = "gandiva" - self.cpp_info.components["libgandiva"].names["cmake_find_package_multi"] = "gandiva" - self.cpp_info.components["libgandiva"].names["pkg_config"] = "gandiva" + self.cpp_info.components["libgandiva"].set_property("pkg_config_name", "gandiva") + self.cpp_info.components["libgandiva"].libs = [f"gandiva{suffix}"] self.cpp_info.components["libgandiva"].requires = ["libarrow"] if not self.options.shared: self.cpp_info.components["libgandiva"].defines = ["GANDIVA_STATIC"] - if self._with_flight_rpc(): - self.cpp_info.components["libarrow_flight"].libs = [self._lib_name("arrow_flight")] - self.cpp_info.components["libarrow_flight"].names["cmake_find_package"] = "flight_rpc" - self.cpp_info.components["libarrow_flight"].names["cmake_find_package_multi"] = "flight_rpc" - self.cpp_info.components["libarrow_flight"].names["pkg_config"] = "flight_rpc" + if self.options.with_flight_rpc: + self.cpp_info.components["libarrow_flight"].set_property("pkg_config_name", "flight_rpc") + self.cpp_info.components["libarrow_flight"].libs = 
[f"arrow_flight{suffix}"] self.cpp_info.components["libarrow_flight"].requires = ["libarrow"] if self.options.get_safe("with_flight_sql"): - self.cpp_info.components["libarrow_flight_sql"].libs = [self._lib_name("arrow_flight_sql")] - self.cpp_info.components["libarrow_flight_sql"].names["cmake_find_package"] = "flight_sql" - self.cpp_info.components["libarrow_flight_sql"].names["cmake_find_package_multi"] = "flight_sql" - self.cpp_info.components["libarrow_flight_sql"].names["pkg_config"] = "flight_sql" + self.cpp_info.components["libarrow_flight_sql"].set_property("pkg_config_name", "flight_sql") + self.cpp_info.components["libarrow_flight_sql"].libs = [f"arrow_flight_sql{suffix}"] self.cpp_info.components["libarrow_flight_sql"].requires = ["libarrow", "libarrow_flight"] - if self._acero(): - self.cpp_info.components["acero"].libs = ["arrow_acero"] - - if self._dataset_modules(): + if self.options.dataset_modules: self.cpp_info.components["dataset"].libs = ["arrow_dataset"] + if self.options.parquet: + self.cpp_info.components["dataset"].requires = ["libparquet"] - if (self.options.cli and (self.options.with_cuda or self._with_flight_rpc() or self._parquet())) or self._plasma(): + if self.options.cli and (self.options.with_cuda or self.options.with_flight_rpc or self.options.parquet): binpath = os.path.join(self.package_folder, "bin") self.output.info(f"Appending PATH env var: {binpath}") self.env_info.PATH.append(binpath) - if self._with_boost(): + if self.options.with_boost: if self.options.gandiva: # FIXME: only filesystem component is used self.cpp_info.components["libgandiva"].requires.append("boost::boost") - if self._parquet() and self.settings.compiler == "gcc" and self.settings.compiler.version < Version("4.9"): + if self.options.parquet and self.settings.compiler == "gcc" and self.settings.compiler.version < Version("4.9"): self.cpp_info.components["libparquet"].requires.append("boost::boost") - if Version(self.version) >= "2.0": - # FIXME: only headers 
components is used - self.cpp_info.components["libarrow"].requires.append("boost::boost") - if self._with_openssl(): + # FIXME: only headers components is used + self.cpp_info.components["libarrow"].requires.append("boost::boost") + if self.options.with_openssl: self.cpp_info.components["libarrow"].requires.append("openssl::openssl") - if self._with_gflags(): + if self.options.with_gflags: self.cpp_info.components["libarrow"].requires.append("gflags::gflags") - if self._with_glog(): + if self.options.with_glog: self.cpp_info.components["libarrow"].requires.append("glog::glog") - if self._with_jemalloc(): + if self.options.with_jemalloc: self.cpp_info.components["libarrow"].requires.append("jemalloc::jemalloc") if self.options.with_mimalloc: self.cpp_info.components["libarrow"].requires.append("mimalloc::mimalloc") - if self._with_re2(): + if self.options.with_re2: if self.options.gandiva: self.cpp_info.components["libgandiva"].requires.append("re2::re2") - if self._parquet(): + if self.options.parquet: self.cpp_info.components["libparquet"].requires.append("re2::re2") self.cpp_info.components["libarrow"].requires.append("re2::re2") - if self._with_llvm(): + if self.options.with_llvm: self.cpp_info.components["libgandiva"].requires.append("llvm-core::llvm-core") - if self._with_protobuf(): + if self.options.with_protobuf: self.cpp_info.components["libarrow"].requires.append("protobuf::protobuf") - if self._with_utf8proc(): + if self.options.with_utf8proc: self.cpp_info.components["libarrow"].requires.append("utf8proc::utf8proc") - if self._with_thrift(): + if self.options.with_thrift: self.cpp_info.components["libarrow"].requires.append("thrift::thrift") if self.options.with_backtrace: self.cpp_info.components["libarrow"].requires.append("libbacktrace::libbacktrace") if self.options.with_cuda: self.cpp_info.components["libarrow"].requires.append("cuda::cuda") - if self._with_rapidjson(): + if self._requires_rapidjson(): 
self.cpp_info.components["libarrow"].requires.append("rapidjson::rapidjson") if self.options.with_s3: self.cpp_info.components["libarrow"].requires.append("aws-sdk-cpp::s3") @@ -742,9 +581,32 @@ def package_info(self): self.cpp_info.components["libarrow"].requires.append("zlib::zlib") if self.options.with_zstd: self.cpp_info.components["libarrow"].requires.append("zstd::zstd") - if self._with_boost(): + if self.options.with_boost: self.cpp_info.components["libarrow"].requires.append("boost::boost") - if self._with_grpc(): + if self.options.with_grpc: self.cpp_info.components["libarrow"].requires.append("grpc::grpc") - if self._with_flight_rpc(): + if self.options.with_flight_rpc: self.cpp_info.components["libarrow_flight"].requires.append("protobuf::protobuf") + + # TODO: to remove in conan v2 + self.cpp_info.filenames["cmake_find_package"] = "Arrow" + self.cpp_info.filenames["cmake_find_package_multi"] = "Arrow" + self.cpp_info.components["libarrow"].names["cmake_find_package"] = "arrow" + self.cpp_info.components["libarrow"].names["cmake_find_package_multi"] = "arrow" + if self.options.parquet: + self.cpp_info.components["libparquet"].names["cmake_find_package"] = "parquet" + self.cpp_info.components["libparquet"].names["cmake_find_package_multi"] = "parquet" + if self.options.get_safe("substrait"): + self.cpp_info.components["libarrow_substrait"].names["cmake_find_package"] = "arrow_substrait" + self.cpp_info.components["libarrow_substrait"].names["cmake_find_package_multi"] = "arrow_substrait" + if self.options.gandiva: + self.cpp_info.components["libgandiva"].names["cmake_find_package"] = "gandiva" + self.cpp_info.components["libgandiva"].names["cmake_find_package_multi"] = "gandiva" + if self.options.with_flight_rpc: + self.cpp_info.components["libarrow_flight"].names["cmake_find_package"] = "flight_rpc" + self.cpp_info.components["libarrow_flight"].names["cmake_find_package_multi"] = "flight_rpc" + if self.options.get_safe("with_flight_sql"): + 
self.cpp_info.components["libarrow_flight_sql"].names["cmake_find_package"] = "flight_sql" + self.cpp_info.components["libarrow_flight_sql"].names["cmake_find_package_multi"] = "flight_sql" + if self.options.cli and (self.options.with_cuda or self.options.with_flight_rpc or self.options.parquet): + self.env_info.PATH.append(os.path.join(self.package_folder, "bin")) diff --git a/ci/conan/config.yml b/ci/conan/config.yml index be333447f348c..3fa90be6f669a 100644 --- a/ci/conan/config.yml +++ b/ci/conan/config.yml @@ -21,6 +21,22 @@ # SOFTWARE. versions: + "15.0.0": + folder: all + "14.0.2": + folder: all + "14.0.1": + folder: all + "14.0.0": + folder: all + "13.0.0": + folder: all + "12.0.1": + folder: all + "12.0.0": + folder: all + "11.0.0": + folder: all "10.0.1": folder: all "10.0.0": @@ -31,7 +47,3 @@ versions: folder: all "7.0.0": folder: all - "2.0.0": - folder: all - "1.0.0": - folder: all diff --git a/ci/conda_env_cpp.txt b/ci/conda_env_cpp.txt index 52e456eaab0cc..f28a24cac8d2d 100644 --- a/ci/conda_env_cpp.txt +++ b/ci/conda_env_cpp.txt @@ -21,7 +21,7 @@ azure-identity-cpp>=1.6.0 azure-storage-blobs-cpp>=12.10.0 azure-storage-common-cpp>=12.5.0 azure-storage-files-datalake-cpp>=12.9.0 -benchmark>=1.6.0 +benchmark>=1.6.0,!=1.8.4 boost-cpp>=1.68.0 brotli bzip2 diff --git a/ci/conda_env_python.txt b/ci/conda_env_python.txt index 4366e30010389..bf915493de302 100644 --- a/ci/conda_env_python.txt +++ b/ci/conda_env_python.txt @@ -26,5 +26,5 @@ numpy>=1.16.6 pytest pytest-faulthandler s3fs>=2023.10.0 -setuptools -setuptools_scm +setuptools>=64 +setuptools_scm>=8 diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index 6899f9c36a7f6..4665a32e24bbe 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -26,8 +26,13 @@ pydata-sphinx-theme=0.14 sphinx-autobuild sphinx-design sphinx-copybutton +sphinx-lint sphinxcontrib-jquery +sphinxcontrib-mermaid sphinx==6.2 # Requirement for doctest-cython -pytest-cython +# Needs upper pin of 0.3.0, see: 
+# https://github.com/lgpage/pytest-cython/issues/67 +# With 0.3.* bug fix release, the pin can be removed +pytest-cython==0.2.2 pandas diff --git a/ci/docker/conda-integration.dockerfile b/ci/docker/conda-integration.dockerfile index a747ccbc7262f..30b9cd5199fab 100644 --- a/ci/docker/conda-integration.dockerfile +++ b/ci/docker/conda-integration.dockerfile @@ -56,7 +56,7 @@ RUN wget -nv -O - https://dl.google.com/go/go${go}.linux-${arch}.tar.gz | tar -x ENV DOTNET_ROOT=/opt/dotnet \ PATH=/opt/dotnet:$PATH -RUN curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin -Channel 7.0 -InstallDir /opt/dotnet +RUN curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin -Channel 8.0 -InstallDir /opt/dotnet ENV ARROW_ACERO=OFF \ ARROW_AZURE=OFF \ diff --git a/ci/docker/conda-python-substrait.dockerfile b/ci/docker/conda-python-substrait.dockerfile index 191795f253000..36dd64e51e7ad 100644 --- a/ci/docker/conda-python-substrait.dockerfile +++ b/ci/docker/conda-python-substrait.dockerfile @@ -24,11 +24,19 @@ FROM ${repo}:${arch}-conda-python-${python} COPY ci/conda_env_python.txt \ ci/conda_env_sphinx.txt \ /arrow/ci/ + +# Note: openjdk is pinned to 17 because the +# substrait repo currently pins to jdk 17. +# Newer jdk versions are currently failing +# due to the recent upgrade to Gradle 8 via +# install_substrait_consumer.sh. 
+# https://github.com/substrait-io/substrait-java/issues/274 RUN mamba install -q -y \ --file arrow/ci/conda_env_python.txt \ --file arrow/ci/conda_env_sphinx.txt \ $([ "$python" == "3.9" ] && echo "pickle5") \ - python=${python} openjdk \ + python=${python} \ + openjdk=17 \ nomkl && \ mamba clean --all diff --git a/ci/docker/debian-12-cpp.dockerfile b/ci/docker/debian-12-cpp.dockerfile index 7036ddf27d52a..d7a6f9df2c2ee 100644 --- a/ci/docker/debian-12-cpp.dockerfile +++ b/ci/docker/debian-12-cpp.dockerfile @@ -119,7 +119,6 @@ ENV ARROW_ACERO=ON \ ARROW_GANDIVA=ON \ ARROW_GCS=ON \ ARROW_HOME=/usr/local \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index 76b5ae6f14363..1c916840e071b 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -18,21 +18,37 @@ ARG base FROM ${base} -ARG r=4.2 +ARG r=4.4 ARG jdk=8 -# See R install instructions at https://cloud.r-project.org/bin/linux/ubuntu/ +ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium + +# See R install instructions at https://cloud.r-project.org/bin/linux/ RUN apt-get update -y && \ apt-get install -y \ - dirmngr \ apt-transport-https \ - software-properties-common && \ - wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | \ - tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc && \ - add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu '$(lsb_release -cs)'-cran40/' && \ + dirmngr \ + gpg \ + lsb-release && \ + gpg --keyserver keyserver.ubuntu.com \ + --recv-key 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 && \ + gpg --export 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 | \ + gpg --no-default-keyring \ + --keyring /usr/share/keyrings/cran.gpg \ + --import - && \ + echo "deb [signed-by=/usr/share/keyrings/cran.gpg] https://cloud.r-project.org/bin/linux/$(lsb_release -is | tr 'A-Z' 'a-z') $(lsb_release -cs)-cran40/" | \ + tee 
/etc/apt/sources.list.d/cran.list && \ + if [ -f /etc/apt/sources.list.d/debian.sources ]; then \ + sed -i \ + -e 's/main$/main contrib non-free non-free-firmware/g' \ + /etc/apt/sources.list.d/debian.sources; \ + fi && \ + apt-get update -y && \ apt-get install -y --no-install-recommends \ autoconf-archive \ automake \ + chromium \ + chromium-sandbox \ curl \ doxygen \ gi-docgen \ @@ -48,6 +64,8 @@ RUN apt-get update -y && \ libxml2-dev \ meson \ ninja-build \ + nodejs \ + npm \ nvidia-cuda-toolkit \ openjdk-${jdk}-jdk-headless \ pandoc \ @@ -55,9 +73,12 @@ RUN apt-get update -y && \ r-base=${r}* \ rsync \ ruby-dev \ + sudo \ wget && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* + rm -rf /var/lib/apt/lists/* && \ + PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \ + npm install -g yarn @mermaid-js/mermaid-cli ENV JAVA_HOME=/usr/lib/jvm/java-${jdk}-openjdk-amd64 @@ -68,20 +89,6 @@ RUN /arrow/ci/scripts/util_download_apache.sh \ ENV PATH=/opt/apache-maven-${maven}/bin:$PATH RUN mvn -version -ARG node=16 -RUN apt-get purge -y npm && \ - apt-get autoremove -y --purge && \ - wget -q -O - https://deb.nodesource.com/setup_${node}.x | bash - && \ - apt-get install -y nodejs && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - npm install -g yarn - -COPY docs/requirements.txt /arrow/docs/ -RUN python3 -m venv ${ARROW_PYTHON_VENV} && \ - . 
${ARROW_PYTHON_VENV}/bin/activate && \ - pip install -r arrow/docs/requirements.txt - COPY c_glib/Gemfile /arrow/c_glib/ RUN gem install --no-document bundler && \ bundle install --gemfile /arrow/c_glib/Gemfile @@ -98,6 +105,17 @@ COPY r/DESCRIPTION /arrow/r/ RUN /arrow/ci/scripts/r_deps.sh /arrow && \ R -e "install.packages('pkgdown')" +RUN useradd --user-group --create-home --groups audio,video arrow +RUN echo "arrow ALL=(ALL:ALL) NOPASSWD:ALL" | \ + EDITOR=tee visudo -f /etc/sudoers.d/arrow +USER arrow + +COPY docs/requirements.txt /arrow/docs/ +RUN sudo chown -R arrow: ${ARROW_PYTHON_VENV} && \ + python3 -m venv ${ARROW_PYTHON_VENV} && \ + . ${ARROW_PYTHON_VENV}/bin/activate && \ + pip install -r arrow/docs/requirements.txt + ENV ARROW_ACERO=ON \ ARROW_AZURE=OFF \ ARROW_BUILD_STATIC=OFF \ diff --git a/ci/docker/linux-apt-lint.dockerfile b/ci/docker/linux-apt-lint.dockerfile index 2b94a48871847..9ec80440a3c21 100644 --- a/ci/docker/linux-apt-lint.dockerfile +++ b/ci/docker/linux-apt-lint.dockerfile @@ -40,7 +40,7 @@ RUN apt-get update && \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -ARG r=4.2 +ARG r=4.4 RUN wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | \ tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc && \ # NOTE: Only R >= 4.0 is available in this repo diff --git a/ci/docker/linux-apt-r.dockerfile b/ci/docker/linux-apt-r.dockerfile index c59766c4a665c..630b96e1007b9 100644 --- a/ci/docker/linux-apt-r.dockerfile +++ b/ci/docker/linux-apt-r.dockerfile @@ -27,10 +27,15 @@ ENV R_PRUNE_DEPS=${r_prune_deps} ARG r_duckdb_dev=FALSE ENV R_DUCKDB_DEV=${r_duckdb_dev} +# This is needed to avoid errors with utf8 characters in some +# R package's DESCRIPTION files +# https://github.com/statnmap/HatchedPolygons/issues/4 +ENV LANG=C.UTF-8 + # Build R # [1] https://www.digitalocean.com/community/tutorials/how-to-install-r-on-ubuntu-18-04 # [2] https://linuxize.com/post/how-to-install-r-on-ubuntu-18-04/#installing-r-packages-from-cran -ARG 
r=3.6 +ARG r=4.4 RUN apt-get update -y && \ apt-get install -y \ dirmngr \ @@ -108,7 +113,6 @@ ENV \ ARROW_GANDIVA=OFF \ ARROW_HDFS=OFF \ ARROW_JSON=ON \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=OFF \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/linux-r.dockerfile b/ci/docker/linux-r.dockerfile index d368a6629c587..7b7e989adc0d1 100644 --- a/ci/docker/linux-r.dockerfile +++ b/ci/docker/linux-r.dockerfile @@ -27,9 +27,6 @@ ENV R_BIN=${r_bin} ARG r_dev=FALSE ENV ARROW_R_DEV=${r_dev} -ARG devtoolset_version= -ENV DEVTOOLSET_VERSION=${devtoolset_version} - ARG r_prune_deps=FALSE ENV R_PRUNE_DEPS=${r_prune_deps} diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index b1d9ed5ab88d9..68d4b27d2ca96 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -39,8 +39,7 @@ ENV CPYTHON_VERSION=cp38 ENV PATH=/opt/python/${CPYTHON_VERSION}-${CPYTHON_VERSION}/bin:${PATH} # Install CMake -# AWS SDK doesn't work with CMake=3.22 due to https://gitlab.kitware.com/cmake/cmake/-/issues/22524 -ARG cmake=3.21.4 +ARG cmake=3.29.2 COPY ci/scripts/install_cmake.sh arrow/ci/scripts/ RUN /arrow/ci/scripts/install_cmake.sh ${arch} linux ${cmake} /usr/local @@ -99,5 +98,4 @@ SHELL ["/bin/bash", "-i", "-c"] ENTRYPOINT ["/bin/bash", "-i", "-c"] COPY python/requirements-wheel-build.txt /arrow/python/ -# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release -RUN pip install -r /arrow/python/requirements-wheel-build.txt --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" +RUN pip install -r /arrow/python/requirements-wheel-build.txt diff --git a/ci/docker/python-wheel-windows-test-vs2019.dockerfile b/ci/docker/python-wheel-windows-test-vs2019.dockerfile index 819324a74e12a..32bbb55e82689 100644 --- a/ci/docker/python-wheel-windows-test-vs2019.dockerfile +++ 
b/ci/docker/python-wheel-windows-test-vs2019.dockerfile @@ -22,6 +22,8 @@ # contains choco and vs2019 preinstalled FROM abrarov/msvc-2019:2.11.0 +# hadolint shell=cmd.exe + # Add unix tools to path RUN setx path "%path%;C:\Program Files\Git\usr\bin" @@ -40,8 +42,8 @@ RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10" && setx PATH "%PATH%;C:\P (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PATH "%PATH%;C:\Python310;C:\Python310\Scripts") & \ (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.5" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") & \ (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.0" && setx PATH "%PATH%;C:\Python312;C:\Python312\Scripts") -RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION% -RUN python -m pip install -U pip setuptools # Install archiver to extract xz archives -RUN choco install --no-progress -r -y archiver +RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION% & \ + python -m pip install --no-cache-dir -U pip setuptools & \ + choco install --no-progress -r -y archiver diff --git a/ci/docker/python-wheel-windows-vs2019.dockerfile b/ci/docker/python-wheel-windows-vs2019.dockerfile index 0ab5071abb86c..ff42de939d91f 100644 --- a/ci/docker/python-wheel-windows-vs2019.dockerfile +++ b/ci/docker/python-wheel-windows-vs2019.dockerfile @@ -89,8 +89,7 @@ RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION% RUN python -m pip install -U pip setuptools COPY python/requirements-wheel-build.txt arrow/python/ -# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release -RUN python -m pip install -r arrow/python/requirements-wheel-build.txt --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" +RUN python -m pip install -r arrow/python/requirements-wheel-build.txt # ENV CLCACHE_DIR="C:\clcache" # ENV CLCACHE_COMPRESS=1 diff --git a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile 
b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile index ae2ba9421cd55..e17c0306f115d 100644 --- a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile @@ -85,7 +85,6 @@ ENV ARROW_ACERO=ON \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index 3e3b7ac3a6d99..d78c7a99cf4d6 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -101,6 +101,7 @@ RUN apt-get update -y -q && \ libutf8proc-dev \ libxml2-dev \ libzstd-dev \ + lld \ make \ ninja-build \ nlohmann-json3-dev \ @@ -157,13 +158,13 @@ ENV absl_SOURCE=BUNDLED \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ ARROW_SUBSTRAIT=ON \ ARROW_USE_ASAN=OFF \ ARROW_USE_CCACHE=ON \ + ARROW_USE_LLD=ON \ ARROW_USE_UBSAN=OFF \ ARROW_WITH_BROTLI=ON \ ARROW_WITH_BZ2=ON \ diff --git a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile index dd887a6d00ceb..341d8a87e8661 100644 --- a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile @@ -85,7 +85,6 @@ ENV ARROW_ACERO=ON \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index e8416c1378a9a..f12e7456add8e 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -65,6 +65,7 @@ RUN latest_system_llvm=14 && \ RUN apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ autoconf \ + bzip2 \ ca-certificates \ ccache \ cmake \ @@ -115,10 +116,20 @@ RUN apt-get update -y -q && \ 
rapidjson-dev \ rsync \ tzdata \ - wget && \ + wget \ + xz-utils && \ apt-get clean && \ rm -rf /var/lib/apt/lists* +# install emscripten using EMSDK +ARG emscripten_version="3.1.45" +RUN cd ~ && git clone https://github.com/emscripten-core/emsdk.git && \ + cd emsdk && \ + ./emsdk install ${emscripten_version} && \ + ./emsdk activate ${emscripten_version} && \ + echo "Installed emsdk to:" ~/emsdk + + ARG gcc_version="" RUN if [ "${gcc_version}" = "" ]; then \ apt-get update -y -q && \ @@ -151,6 +162,9 @@ RUN if [ "${gcc_version}" = "" ]; then \ update-alternatives --set c++ /usr/bin/g++; \ fi +# make sure zlib is cached in the EMSDK folder +RUN source ~/emsdk/emsdk_env.sh && embuilder --pic build zlib + COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local @@ -182,7 +196,6 @@ ENV absl_SOURCE=BUNDLED \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-22.04-csharp.dockerfile b/ci/docker/ubuntu-22.04-csharp.dockerfile index aebbd8fab74e9..4d77ba060b877 100644 --- a/ci/docker/ubuntu-22.04-csharp.dockerfile +++ b/ci/docker/ubuntu-22.04-csharp.dockerfile @@ -16,7 +16,7 @@ # under the License. ARG arch=amd64 -ARG dotnet=7.0 +ARG dotnet=8.0 ARG platform=jammy FROM mcr.microsoft.com/dotnet/sdk:${dotnet}-${platform}-${arch} diff --git a/ci/docker/ubuntu-24.04-cpp.dockerfile b/ci/docker/ubuntu-24.04-cpp.dockerfile index 629d532a3dc76..ecfb5e2f5096d 100644 --- a/ci/docker/ubuntu-24.04-cpp.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp.dockerfile @@ -29,7 +29,7 @@ RUN echo "debconf debconf/frontend select Noninteractive" | \ # while debugging package list with docker build. 
ARG clang_tools ARG llvm -RUN latest_system_llvm=14 && \ +RUN latest_system_llvm=18 && \ if [ ${llvm} -gt ${latest_system_llvm} -o \ ${clang_tools} -gt ${latest_system_llvm} ]; then \ apt-get update -y -q && \ @@ -127,7 +127,7 @@ RUN if [ "${gcc_version}" = "" ]; then \ g++ \ gcc; \ else \ - if [ "${gcc_version}" -gt "12" ]; then \ + if [ "${gcc_version}" -gt "14" ]; then \ apt-get update -y -q && \ apt-get install -y -q --no-install-recommends software-properties-common && \ add-apt-repository ppa:ubuntu-toolchain-r/volatile; \ @@ -178,7 +178,6 @@ ENV ARROW_ACERO=ON \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-swift.dockerfile b/ci/docker/ubuntu-swift.dockerfile index 4789c9188c226..26950b806d1bc 100644 --- a/ci/docker/ubuntu-swift.dockerfile +++ b/ci/docker/ubuntu-swift.dockerfile @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -FROM swift:5.7.3 +FROM swift:5.9.0 # Go is needed for generating test data RUN apt-get update -y -q && \ diff --git a/ci/etc/valgrind-cran.supp b/ci/etc/valgrind-cran.supp index 4d29220260823..e93c2a3465f79 100644 --- a/ci/etc/valgrind-cran.supp +++ b/ci/etc/valgrind-cran.supp @@ -16,7 +16,7 @@ # under the License. { - # `testthat::skip()`s cause a valgrind error that does not show up on CRAN. + # `testthat::skip()`s cause a valgrind error that does not show up on CRAN. Memcheck:Cond fun:gregexpr_Regexc @@ -32,3 +32,21 @@ fun:getvar fun:bcEval } +{ + # This also doesn't seem to cause issues on CRAN, so suppress it. 
+ + Memcheck:Leak + match-leak-kinds: possible + fun:malloc + fun:libdeflate_alloc_compressor + fun:do_memCompress + fun:bcEval_loop + fun:bcEval + fun:Rf_eval + fun:R_execClosure + fun:applyClosure_core + fun:Rf_applyClosure + fun:Rf_eval + fun:do_set + fun:Rf_eval +} diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index dfdf90501f49c..f6bbc78be710e 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=15.0.2.9000 +pkgver=16.1.0.9000 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") diff --git a/ci/scripts/c_glib_build.sh b/ci/scripts/c_glib_build.sh index c4d2c4fdb5617..059e45e2a1386 100755 --- a/ci/scripts/c_glib_build.sh +++ b/ci/scripts/c_glib_build.sh @@ -28,17 +28,42 @@ build_root=${2} : ${BUILD_DOCS_C_GLIB:=OFF} with_doc=$([ "${BUILD_DOCS_C_GLIB}" == "ON" ] && echo "true" || echo "false") -export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig -export CFLAGS="-DARROW_NO_DEPRECATED_API" -export CXXFLAGS="-DARROW_NO_DEPRECATED_API" +if [ -n "${MSYSTEM:-}" ]; then + # Fix ARROW_HOME when running under MSYS2 + export ARROW_HOME="$(cygpath --unix "${ARROW_HOME}")" +fi + +export PATH="${ARROW_HOME}/bin:${PATH}" + +meson_pkg_config_path="${ARROW_HOME}/lib/pkgconfig" mkdir -p ${build_dir} +if [ -n "${VCPKG_ROOT:-}" -a -n "${VCPKG_TRIPLET:-}" ]; then + vcpkg_install_root="${build_root}/vcpkg_installed" + $VCPKG_ROOT/vcpkg install --x-manifest-root=${source_dir} --x-install-root=${vcpkg_install_root} + export PKG_CONFIG="${vcpkg_install_root}/${VCPKG_TRIPLET}/tools/pkgconf/pkgconf.exe" + meson_pkg_config_path="${vcpkg_install_root}/${VCPKG_TRIPLET}/lib/pkgconfig:${meson_pkg_config_path}" + # Configure PATH for libraries required by the gobject-introspection generated binary + cpp_vcpkg_install_root="${build_root}/cpp/vcpkg_installed" + 
PATH="${cpp_vcpkg_install_root}/${VCPKG_TRIPLET}/debug/bin:${PATH}" + PATH="${cpp_vcpkg_install_root}/${VCPKG_TRIPLET}/bin:${PATH}" + export PATH="${vcpkg_install_root}/${VCPKG_TRIPLET}/bin:${PATH}" +fi + +if [ -n "${VCToolsInstallDir:-}" -a -n "${MSYSTEM:-}" ]; then + # Meson finds the gnu link.exe instead of MSVC link.exe when running in MSYS2/git bash, + # so we need to make sure the MSVC link.exe is first in $PATH + export PATH="$(cygpath --unix "${VCToolsInstallDir}")/bin/HostX64/x64:${PATH}" +fi + # Build with Meson meson setup \ + --backend=ninja \ --prefix=$ARROW_HOME \ --libdir=lib \ + --pkg-config-path="${meson_pkg_config_path}" \ -Ddoc=${with_doc} \ -Dvapi=${ARROW_GLIB_VAPI} \ -Dwerror=${ARROW_GLIB_WERROR} \ diff --git a/ci/scripts/c_glib_test.sh b/ci/scripts/c_glib_test.sh index f8083c7759d8a..02753872dcb2d 100755 --- a/ci/scripts/c_glib_test.sh +++ b/ci/scripts/c_glib_test.sh @@ -24,6 +24,7 @@ build_dir=${2}/c_glib : ${ARROW_GLIB_VAPI:=true} +export DYLD_LIBRARY_PATH=${ARROW_HOME}/lib:${DYLD_LIBRARY_PATH} export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig export GI_TYPELIB_PATH=${ARROW_HOME}/lib/girepository-1.0 diff --git a/ci/scripts/conan_build.sh b/ci/scripts/conan_build.sh index b1ee0a8fc2afd..0ea3fc29192dd 100755 --- a/ci/scripts/conan_build.sh +++ b/ci/scripts/conan_build.sh @@ -30,34 +30,39 @@ export CONAN_HOOK_ERROR_LEVEL=40 conan_args=() conan_args+=(--build=missing) if [ -n "${ARROW_CONAN_PARQUET:-}" ]; then - conan_args+=(--options arrow:parquet=${ARROW_CONAN_PARQUET}) + conan_args+=(--options arrow/*:parquet=${ARROW_CONAN_PARQUET}) + conan_args+=(--options arrow/*:with_thrift=${ARROW_CONAN_PARQUET}) + conan_args+=(--options arrow/*:with_boost=${ARROW_CONAN_PARQUET}) fi if [ -n "${ARROW_CONAN_WITH_BROTLI:-}" ]; then - conan_args+=(--options arrow:with_brotli=${ARROW_CONAN_WITH_BROTLI}) + conan_args+=(--options arrow/*:with_brotli=${ARROW_CONAN_WITH_BROTLI}) fi if [ -n
"${ARROW_CONAN_WITH_BZ2:-}" ]; then - conan_args+=(--options arrow:with_bz2=${ARROW_CONAN_WITH_BZ2}) + conan_args+=(--options arrow/*:with_bz2=${ARROW_CONAN_WITH_BZ2}) fi if [ -n "${ARROW_CONAN_WITH_FLIGHT_RPC:-}" ]; then - conan_args+=(--options arrow:with_flight_rpc=${ARROW_CONAN_WITH_FLIGHT_RPC}) + conan_args+=(--options arrow/*:with_flight_rpc=${ARROW_CONAN_WITH_FLIGHT_RPC}) + conan_args+=(--options arrow/*:with_grpc=${ARROW_CONAN_WITH_FLIGHT_RPC}) + conan_args+=(--options arrow/*:with_protobuf=${ARROW_CONAN_WITH_FLIGHT_RPC}) + conan_args+=(--options arrow/*:with_re2=${ARROW_CONAN_WITH_FLIGHT_RPC}) fi if [ -n "${ARROW_CONAN_WITH_GLOG:-}" ]; then - conan_args+=(--options arrow:with_glog=${ARROW_CONAN_WITH_GLOG}) + conan_args+=(--options arrow/*:with_glog=${ARROW_CONAN_WITH_GLOG}) fi if [ -n "${ARROW_CONAN_WITH_JEMALLOC:-}" ]; then - conan_args+=(--options arrow:with_jemalloc=${ARROW_CONAN_WITH_JEMALLOC}) + conan_args+=(--options arrow/*:with_jemalloc=${ARROW_CONAN_WITH_JEMALLOC}) fi if [ -n "${ARROW_CONAN_WITH_JSON:-}" ]; then - conan_args+=(--options arrow:with_json=${ARROW_CONAN_WITH_JSON}) + conan_args+=(--options arrow/*:with_json=${ARROW_CONAN_WITH_JSON}) fi if [ -n "${ARROW_CONAN_WITH_LZ4:-}" ]; then - conan_args+=(--options arrow:with_lz4=${ARROW_CONAN_WITH_LZ4}) + conan_args+=(--options arrow/*:with_lz4=${ARROW_CONAN_WITH_LZ4}) fi if [ -n "${ARROW_CONAN_WITH_SNAPPY:-}" ]; then - conan_args+=(--options arrow:with_snappy=${ARROW_CONAN_WITH_SNAPPY}) + conan_args+=(--options arrow/*:with_snappy=${ARROW_CONAN_WITH_SNAPPY}) fi if [ -n "${ARROW_CONAN_WITH_ZSTD:-}" ]; then - conan_args+=(--options arrow:with_zstd=${ARROW_CONAN_WITH_ZSTD}) + conan_args+=(--options arrow/*:with_zstd=${ARROW_CONAN_WITH_ZSTD}) fi version=$(grep '^set(ARROW_VERSION ' ${ARROW_HOME}/cpp/CMakeLists.txt | \ diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 1e09924a5e576..3ee7fbd9d19cd 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -92,112 +92,134 
@@ esac mkdir -p ${build_dir} pushd ${build_dir} -cmake \ - -Dabsl_SOURCE=${absl_SOURCE:-} \ - -DARROW_ACERO=${ARROW_ACERO:-OFF} \ - -DARROW_AZURE=${ARROW_AZURE:-OFF} \ - -DARROW_BOOST_USE_SHARED=${ARROW_BOOST_USE_SHARED:-ON} \ - -DARROW_BUILD_BENCHMARKS_REFERENCE=${ARROW_BUILD_BENCHMARKS:-OFF} \ - -DARROW_BUILD_BENCHMARKS=${ARROW_BUILD_BENCHMARKS:-OFF} \ - -DARROW_BUILD_EXAMPLES=${ARROW_BUILD_EXAMPLES:-OFF} \ - -DARROW_BUILD_INTEGRATION=${ARROW_BUILD_INTEGRATION:-OFF} \ - -DARROW_BUILD_SHARED=${ARROW_BUILD_SHARED:-ON} \ - -DARROW_BUILD_STATIC=${ARROW_BUILD_STATIC:-ON} \ - -DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS:-OFF} \ - -DARROW_BUILD_UTILITIES=${ARROW_BUILD_UTILITIES:-ON} \ - -DARROW_COMPUTE=${ARROW_COMPUTE:-ON} \ - -DARROW_CSV=${ARROW_CSV:-ON} \ - -DARROW_CUDA=${ARROW_CUDA:-OFF} \ - -DARROW_CXXFLAGS=${ARROW_CXXFLAGS:-} \ - -DARROW_CXX_FLAGS_DEBUG="${ARROW_CXX_FLAGS_DEBUG:-}" \ - -DARROW_CXX_FLAGS_RELEASE="${ARROW_CXX_FLAGS_RELEASE:-}" \ - -DARROW_CXX_FLAGS_RELWITHDEBINFO="${ARROW_CXX_FLAGS_RELWITHDEBINFO:-}" \ - -DARROW_C_FLAGS_DEBUG="${ARROW_C_FLAGS_DEBUG:-}" \ - -DARROW_C_FLAGS_RELEASE="${ARROW_C_FLAGS_RELEASE:-}" \ - -DARROW_C_FLAGS_RELWITHDEBINFO="${ARROW_C_FLAGS_RELWITHDEBINFO:-}" \ - -DARROW_DATASET=${ARROW_DATASET:-OFF} \ - -DARROW_DEPENDENCY_SOURCE=${ARROW_DEPENDENCY_SOURCE:-AUTO} \ - -DARROW_ENABLE_THREADING=${ARROW_ENABLE_THREADING:-ON} \ - -DARROW_ENABLE_TIMING_TESTS=${ARROW_ENABLE_TIMING_TESTS:-ON} \ - -DARROW_EXTRA_ERROR_CONTEXT=${ARROW_EXTRA_ERROR_CONTEXT:-OFF} \ - -DARROW_FILESYSTEM=${ARROW_FILESYSTEM:-ON} \ - -DARROW_FLIGHT=${ARROW_FLIGHT:-OFF} \ - -DARROW_FLIGHT_SQL=${ARROW_FLIGHT_SQL:-OFF} \ - -DARROW_FUZZING=${ARROW_FUZZING:-OFF} \ - -DARROW_GANDIVA_PC_CXX_FLAGS=${ARROW_GANDIVA_PC_CXX_FLAGS:-} \ - -DARROW_GANDIVA=${ARROW_GANDIVA:-OFF} \ - -DARROW_GCS=${ARROW_GCS:-OFF} \ - -DARROW_HDFS=${ARROW_HDFS:-ON} \ - -DARROW_INSTALL_NAME_RPATH=${ARROW_INSTALL_NAME_RPATH:-ON} \ - -DARROW_JEMALLOC=${ARROW_JEMALLOC:-ON} \ - -DARROW_JSON=${ARROW_JSON:-ON} \ 
- -DARROW_LARGE_MEMORY_TESTS=${ARROW_LARGE_MEMORY_TESTS:-OFF} \ - -DARROW_MIMALLOC=${ARROW_MIMALLOC:-OFF} \ - -DARROW_NO_DEPRECATED_API=${ARROW_NO_DEPRECATED_API:-OFF} \ - -DARROW_ORC=${ARROW_ORC:-OFF} \ - -DARROW_PARQUET=${ARROW_PARQUET:-OFF} \ - -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ - -DARROW_S3=${ARROW_S3:-OFF} \ - -DARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL:-DEFAULT} \ - -DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \ - -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT:-OFF} \ - -DARROW_TEST_LINKAGE=${ARROW_TEST_LINKAGE:-shared} \ - -DARROW_TEST_MEMCHECK=${ARROW_TEST_MEMCHECK:-OFF} \ - -DARROW_USE_ASAN=${ARROW_USE_ASAN:-OFF} \ - -DARROW_USE_CCACHE=${ARROW_USE_CCACHE:-ON} \ - -DARROW_USE_GLOG=${ARROW_USE_GLOG:-OFF} \ - -DARROW_USE_LD_GOLD=${ARROW_USE_LD_GOLD:-OFF} \ - -DARROW_USE_MOLD=${ARROW_USE_MOLD:-OFF} \ - -DARROW_USE_PRECOMPILED_HEADERS=${ARROW_USE_PRECOMPILED_HEADERS:-OFF} \ - -DARROW_USE_STATIC_CRT=${ARROW_USE_STATIC_CRT:-OFF} \ - -DARROW_USE_TSAN=${ARROW_USE_TSAN:-OFF} \ - -DARROW_USE_UBSAN=${ARROW_USE_UBSAN:-OFF} \ - -DARROW_VERBOSE_THIRDPARTY_BUILD=${ARROW_VERBOSE_THIRDPARTY_BUILD:-OFF} \ - -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI:-OFF} \ - -DARROW_WITH_BZ2=${ARROW_WITH_BZ2:-OFF} \ - -DARROW_WITH_LZ4=${ARROW_WITH_LZ4:-OFF} \ - -DARROW_WITH_OPENTELEMETRY=${ARROW_WITH_OPENTELEMETRY:-OFF} \ - -DARROW_WITH_MUSL=${ARROW_WITH_MUSL:-OFF} \ - -DARROW_WITH_SNAPPY=${ARROW_WITH_SNAPPY:-OFF} \ - -DARROW_WITH_UCX=${ARROW_WITH_UCX:-OFF} \ - -DARROW_WITH_UTF8PROC=${ARROW_WITH_UTF8PROC:-ON} \ - -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-OFF} \ - -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD:-OFF} \ - -DAWSSDK_SOURCE=${AWSSDK_SOURCE:-} \ - -DAzure_SOURCE=${Azure_SOURCE:-} \ - -Dbenchmark_SOURCE=${benchmark_SOURCE:-} \ - -DBOOST_SOURCE=${BOOST_SOURCE:-} \ - -DBrotli_SOURCE=${Brotli_SOURCE:-} \ - -DBUILD_WARNING_LEVEL=${BUILD_WARNING_LEVEL:-CHECKIN} \ - -Dc-ares_SOURCE=${cares_SOURCE:-} \ - -DCMAKE_BUILD_TYPE=${ARROW_BUILD_TYPE:-debug} \ - 
-DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE:-OFF} \ - -DCMAKE_C_FLAGS="${CFLAGS:-}" \ - -DCMAKE_CXX_FLAGS="${CXXFLAGS:-}" \ - -DCMAKE_CXX_STANDARD="${CMAKE_CXX_STANDARD:-17}" \ - -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR:-lib} \ - -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX:-${ARROW_HOME}} \ - -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD:-OFF} \ - -Dgflags_SOURCE=${gflags_SOURCE:-} \ - -Dgoogle_cloud_cpp_storage_SOURCE=${google_cloud_cpp_storage_SOURCE:-} \ - -DgRPC_SOURCE=${gRPC_SOURCE:-} \ - -DGTest_SOURCE=${GTest_SOURCE:-} \ - -Dlz4_SOURCE=${lz4_SOURCE:-} \ - -DORC_SOURCE=${ORC_SOURCE:-} \ - -DPARQUET_BUILD_EXAMPLES=${PARQUET_BUILD_EXAMPLES:-OFF} \ - -DPARQUET_BUILD_EXECUTABLES=${PARQUET_BUILD_EXECUTABLES:-OFF} \ - -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION:-ON} \ - -DProtobuf_SOURCE=${Protobuf_SOURCE:-} \ - -DRapidJSON_SOURCE=${RapidJSON_SOURCE:-} \ - -Dre2_SOURCE=${re2_SOURCE:-} \ - -DSnappy_SOURCE=${Snappy_SOURCE:-} \ - -DThrift_SOURCE=${Thrift_SOURCE:-} \ - -Dutf8proc_SOURCE=${utf8proc_SOURCE:-} \ - -Dzstd_SOURCE=${zstd_SOURCE:-} \ - -Dxsimd_SOURCE=${xsimd_SOURCE:-} \ - -G "${CMAKE_GENERATOR:-Ninja}" \ - ${ARROW_CMAKE_ARGS} \ - ${source_dir} +if [ "${ARROW_EMSCRIPTEN:-OFF}" = "ON" ]; then + if [ "${UBUNTU}" = "20.04" ]; then + echo "arrow emscripten build is not supported on Ubuntu 20.04, run with UBUNTU=22.04" + exit -1 + fi + n_jobs=2 # Emscripten build fails on docker unless this is set really low + source ~/emsdk/emsdk_env.sh + emcmake cmake \ + --preset=ninja-${ARROW_BUILD_TYPE:-debug}-emscripten \ + -DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE:-OFF} \ + -DCMAKE_C_FLAGS="${CFLAGS:-}" \ + -DCMAKE_CXX_FLAGS="${CXXFLAGS:-}" \ + -DCMAKE_CXX_STANDARD="${CMAKE_CXX_STANDARD:-17}" \ + -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR:-lib} \ + -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX:-${ARROW_HOME}} \ + -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD:-OFF} \ + ${ARROW_CMAKE_ARGS} \ + ${source_dir} +else + cmake \ + 
-Dabsl_SOURCE=${absl_SOURCE:-} \ + -DARROW_ACERO=${ARROW_ACERO:-OFF} \ + -DARROW_AZURE=${ARROW_AZURE:-OFF} \ + -DARROW_BOOST_USE_SHARED=${ARROW_BOOST_USE_SHARED:-ON} \ + -DARROW_BUILD_BENCHMARKS_REFERENCE=${ARROW_BUILD_BENCHMARKS:-OFF} \ + -DARROW_BUILD_BENCHMARKS=${ARROW_BUILD_BENCHMARKS:-OFF} \ + -DARROW_BUILD_EXAMPLES=${ARROW_BUILD_EXAMPLES:-OFF} \ + -DARROW_BUILD_INTEGRATION=${ARROW_BUILD_INTEGRATION:-OFF} \ + -DARROW_BUILD_OPENMP_BENCHMARKS=${ARROW_BUILD_OPENMP_BENCHMARKS:-OFF} \ + -DARROW_BUILD_SHARED=${ARROW_BUILD_SHARED:-ON} \ + -DARROW_BUILD_STATIC=${ARROW_BUILD_STATIC:-ON} \ + -DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS:-OFF} \ + -DARROW_BUILD_UTILITIES=${ARROW_BUILD_UTILITIES:-ON} \ + -DARROW_COMPUTE=${ARROW_COMPUTE:-ON} \ + -DARROW_CSV=${ARROW_CSV:-ON} \ + -DARROW_CUDA=${ARROW_CUDA:-OFF} \ + -DARROW_CXXFLAGS=${ARROW_CXXFLAGS:-} \ + -DARROW_CXX_FLAGS_DEBUG="${ARROW_CXX_FLAGS_DEBUG:-}" \ + -DARROW_CXX_FLAGS_RELEASE="${ARROW_CXX_FLAGS_RELEASE:-}" \ + -DARROW_CXX_FLAGS_RELWITHDEBINFO="${ARROW_CXX_FLAGS_RELWITHDEBINFO:-}" \ + -DARROW_C_FLAGS_DEBUG="${ARROW_C_FLAGS_DEBUG:-}" \ + -DARROW_C_FLAGS_RELEASE="${ARROW_C_FLAGS_RELEASE:-}" \ + -DARROW_C_FLAGS_RELWITHDEBINFO="${ARROW_C_FLAGS_RELWITHDEBINFO:-}" \ + -DARROW_DATASET=${ARROW_DATASET:-OFF} \ + -DARROW_DEPENDENCY_SOURCE=${ARROW_DEPENDENCY_SOURCE:-AUTO} \ + -DARROW_DEPENDENCY_USE_SHARED=${ARROW_DEPENDENCY_USE_SHARED:-ON} \ + -DARROW_ENABLE_THREADING=${ARROW_ENABLE_THREADING:-ON} \ + -DARROW_ENABLE_TIMING_TESTS=${ARROW_ENABLE_TIMING_TESTS:-ON} \ + -DARROW_EXTRA_ERROR_CONTEXT=${ARROW_EXTRA_ERROR_CONTEXT:-OFF} \ + -DARROW_FILESYSTEM=${ARROW_FILESYSTEM:-ON} \ + -DARROW_FLIGHT=${ARROW_FLIGHT:-OFF} \ + -DARROW_FLIGHT_SQL=${ARROW_FLIGHT_SQL:-OFF} \ + -DARROW_FUZZING=${ARROW_FUZZING:-OFF} \ + -DARROW_GANDIVA_PC_CXX_FLAGS=${ARROW_GANDIVA_PC_CXX_FLAGS:-} \ + -DARROW_GANDIVA=${ARROW_GANDIVA:-OFF} \ + -DARROW_GCS=${ARROW_GCS:-OFF} \ + -DARROW_HDFS=${ARROW_HDFS:-ON} \ + 
-DARROW_INSTALL_NAME_RPATH=${ARROW_INSTALL_NAME_RPATH:-ON} \ + -DARROW_JEMALLOC=${ARROW_JEMALLOC:-ON} \ + -DARROW_JSON=${ARROW_JSON:-ON} \ + -DARROW_LARGE_MEMORY_TESTS=${ARROW_LARGE_MEMORY_TESTS:-OFF} \ + -DARROW_MIMALLOC=${ARROW_MIMALLOC:-OFF} \ + -DARROW_ORC=${ARROW_ORC:-OFF} \ + -DARROW_PARQUET=${ARROW_PARQUET:-OFF} \ + -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ + -DARROW_S3=${ARROW_S3:-OFF} \ + -DARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL:-DEFAULT} \ + -DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \ + -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT:-OFF} \ + -DARROW_TEST_LINKAGE=${ARROW_TEST_LINKAGE:-shared} \ + -DARROW_TEST_MEMCHECK=${ARROW_TEST_MEMCHECK:-OFF} \ + -DARROW_USE_ASAN=${ARROW_USE_ASAN:-OFF} \ + -DARROW_USE_CCACHE=${ARROW_USE_CCACHE:-ON} \ + -DARROW_USE_GLOG=${ARROW_USE_GLOG:-OFF} \ + -DARROW_USE_LD_GOLD=${ARROW_USE_LD_GOLD:-OFF} \ + -DARROW_USE_LLD=${ARROW_USE_LLD:-OFF} \ + -DARROW_USE_MOLD=${ARROW_USE_MOLD:-OFF} \ + -DARROW_USE_PRECOMPILED_HEADERS=${ARROW_USE_PRECOMPILED_HEADERS:-OFF} \ + -DARROW_USE_STATIC_CRT=${ARROW_USE_STATIC_CRT:-OFF} \ + -DARROW_USE_TSAN=${ARROW_USE_TSAN:-OFF} \ + -DARROW_USE_UBSAN=${ARROW_USE_UBSAN:-OFF} \ + -DARROW_VERBOSE_THIRDPARTY_BUILD=${ARROW_VERBOSE_THIRDPARTY_BUILD:-OFF} \ + -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI:-OFF} \ + -DARROW_WITH_BZ2=${ARROW_WITH_BZ2:-OFF} \ + -DARROW_WITH_LZ4=${ARROW_WITH_LZ4:-OFF} \ + -DARROW_WITH_OPENTELEMETRY=${ARROW_WITH_OPENTELEMETRY:-OFF} \ + -DARROW_WITH_MUSL=${ARROW_WITH_MUSL:-OFF} \ + -DARROW_WITH_SNAPPY=${ARROW_WITH_SNAPPY:-OFF} \ + -DARROW_WITH_UCX=${ARROW_WITH_UCX:-OFF} \ + -DARROW_WITH_UTF8PROC=${ARROW_WITH_UTF8PROC:-ON} \ + -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-OFF} \ + -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD:-OFF} \ + -DAWSSDK_SOURCE=${AWSSDK_SOURCE:-} \ + -DAzure_SOURCE=${Azure_SOURCE:-} \ + -Dbenchmark_SOURCE=${benchmark_SOURCE:-} \ + -DBOOST_SOURCE=${BOOST_SOURCE:-} \ + -DBrotli_SOURCE=${Brotli_SOURCE:-} \ + -DBUILD_WARNING_LEVEL=${BUILD_WARNING_LEVEL:-CHECKIN} \ + 
-Dc-ares_SOURCE=${cares_SOURCE:-} \ + -DCMAKE_BUILD_TYPE=${ARROW_BUILD_TYPE:-debug} \ + -DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE:-OFF} \ + -DCMAKE_C_FLAGS="${CFLAGS:-}" \ + -DCMAKE_CXX_FLAGS="${CXXFLAGS:-}" \ + -DCMAKE_CXX_STANDARD="${CMAKE_CXX_STANDARD:-17}" \ + -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR:-lib} \ + -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX:-${ARROW_HOME}} \ + -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD:-OFF} \ + -Dgflags_SOURCE=${gflags_SOURCE:-} \ + -Dgoogle_cloud_cpp_storage_SOURCE=${google_cloud_cpp_storage_SOURCE:-} \ + -DgRPC_SOURCE=${gRPC_SOURCE:-} \ + -DGTest_SOURCE=${GTest_SOURCE:-} \ + -Dlz4_SOURCE=${lz4_SOURCE:-} \ + -DORC_SOURCE=${ORC_SOURCE:-} \ + -DPARQUET_BUILD_EXAMPLES=${PARQUET_BUILD_EXAMPLES:-OFF} \ + -DPARQUET_BUILD_EXECUTABLES=${PARQUET_BUILD_EXECUTABLES:-OFF} \ + -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION:-ON} \ + -DProtobuf_SOURCE=${Protobuf_SOURCE:-} \ + -DRapidJSON_SOURCE=${RapidJSON_SOURCE:-} \ + -Dre2_SOURCE=${re2_SOURCE:-} \ + -DSnappy_SOURCE=${Snappy_SOURCE:-} \ + -DThrift_SOURCE=${Thrift_SOURCE:-} \ + -Dutf8proc_SOURCE=${utf8proc_SOURCE:-} \ + -Dzstd_SOURCE=${zstd_SOURCE:-} \ + -Dxsimd_SOURCE=${xsimd_SOURCE:-} \ + -G "${CMAKE_GENERATOR:-Ninja}" \ + ${ARROW_CMAKE_ARGS} \ + ${source_dir} +fi export CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL:-$[${n_jobs} + 1]} time cmake --build . --target install @@ -208,12 +230,17 @@ find . 
-name "*.o" -delete popd if [ -x "$(command -v ldconfig)" ]; then - ldconfig ${ARROW_HOME}/${CMAKE_INSTALL_LIBDIR:-lib} + if [ -x "$(command -v sudo)" ]; then + SUDO=sudo + else + SUDO= + fi + ${SUDO} ldconfig ${ARROW_HOME}/${CMAKE_INSTALL_LIBDIR:-lib} fi if [ "${ARROW_USE_CCACHE}" == "ON" ]; then - echo -e "===\n=== ccache statistics after build\n===" - ccache -sv 2>/dev/null || ccache -s + echo -e "===\n=== ccache statistics after build\n===" + ccache -sv 2>/dev/null || ccache -s fi if command -v sccache &> /dev/null; then @@ -223,6 +250,6 @@ fi if [ "${BUILD_DOCS_CPP}" == "ON" ]; then pushd ${source_dir}/apidoc - doxygen + OUTPUT_DIRECTORY=${build_dir}/apidoc doxygen popd fi diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index f388825fd0a98..2c640f2c1fb6a 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -80,6 +80,10 @@ case "$(uname)" in ;; esac +if [ "${ARROW_EMSCRIPTEN:-OFF}" = "ON" ]; then + n_jobs=1 # avoid spurious fails on emscripten due to loading too many big executables +fi + pushd ${build_dir} if [ -z "${PYTHON}" ] && ! 
which python > /dev/null 2>&1; then diff --git a/ci/scripts/go_bench.sh b/ci/scripts/go_bench.sh old mode 100644 new mode 100755 diff --git a/ci/scripts/install_azurite.sh b/ci/scripts/install_azurite.sh index 2e7008360fdc3..dda5e99405b7f 100755 --- a/ci/scripts/install_azurite.sh +++ b/ci/scripts/install_azurite.sh @@ -19,17 +19,18 @@ set -e +# Pin azurite to 3.29.0 due to https://github.com/apache/arrow/issues/41505 case "$(uname)" in Darwin) - npm install -g azurite + npm install -g azurite@v3.29.0 which azurite ;; MINGW*) choco install nodejs.install - npm install -g azurite + npm install -g azurite@v3.29.0 ;; Linux) - npm install -g azurite + npm install -g azurite@v3.29.0 which azurite ;; esac diff --git a/ci/scripts/install_cmake.sh b/ci/scripts/install_cmake.sh index 2f5e5d52051ed..7fdb06d90f02c 100755 --- a/ci/scripts/install_cmake.sh +++ b/ci/scripts/install_cmake.sh @@ -21,7 +21,10 @@ set -e declare -A archs archs=([amd64]=x86_64 - [arm64v8]=aarch64) + [aarch64]=aarch64 + [arm64]=aarch64 + [arm64v8]=aarch64 + [x86_64]=x86_64) declare -A platforms platforms=([linux]=linux @@ -38,5 +41,25 @@ platform=${platforms[$2]} version=$3 prefix=$4 -url="https://github.com/Kitware/CMake/releases/download/v${version}/cmake-${version}-${platform}-${arch}.tar.gz" -wget -q ${url} -O - | tar -xzf - --directory ${prefix} --strip-components=1 +mkdir -p ${prefix} +url="https://github.com/Kitware/CMake/releases/download/v${version}/cmake-${version}-${platform}-" +case ${platform} in + macos) + url+="universal.tar.gz" + curl -L ${url} | tar -xzf - --directory ${prefix} --strip-components=1 + ln -s CMake.app/Contents/bin ${prefix}/bin + ;; + windows) + url+="${arch}.zip" + archive_name=$(basename ${url}) + curl -L -o ${archive_name} ${url} + unzip ${archive_name} + base_name=$(basename ${archive_name} .zip) + mv ${base_name}/* ${prefix} + rm -rf ${base_name} ${archive_name} + ;; + *) + url+="${arch}.tar.gz" + curl -L ${url} | tar -xzf - --directory ${prefix} --strip-components=1
+ ;; +esac diff --git a/ci/scripts/install_sccache.sh b/ci/scripts/install_sccache.sh index 0346c0cc9ce7d..136f39b3ae2ab 100755 --- a/ci/scripts/install_sccache.sh +++ b/ci/scripts/install_sccache.sh @@ -59,7 +59,7 @@ fi # Extract only the sccache binary into $PREFIX and ignore README and LICENSE. # --wildcards doesn't work on busybox. tar -xzvf $SCCACHE_ARCHIVE --strip-component=1 --directory $PREFIX --exclude="sccache*/*E*E*" -chmod u+x $PREFIX/sccache +chmod a+x $PREFIX/sccache if [ -n "${GITHUB_PATH}" ]; then echo "$PREFIX" >> $GITHUB_PATH diff --git a/ci/scripts/install_vcpkg.sh b/ci/scripts/install_vcpkg.sh index cc80582326ec5..08989d6444827 100755 --- a/ci/scripts/install_vcpkg.sh +++ b/ci/scripts/install_vcpkg.sh @@ -25,13 +25,16 @@ if [ "$#" -lt 1 ]; then fi arrow_dir=$(cd -- "$(dirname -- "$0")/../.." && pwd -P) -default_vcpkg_version=$(cat "${arrow_dir}/.env" | grep "VCPKG" | cut -d "=" -f2 | tr -d '"') default_vcpkg_ports_patch="${arrow_dir}/ci/vcpkg/ports.patch" vcpkg_destination=$1 -vcpkg_version=${2:-$default_vcpkg_version} +vcpkg_version=${2:-} vcpkg_ports_patch=${3:-$default_vcpkg_ports_patch} +if [ -z "${vcpkg_version}" ]; then + vcpkg_version=$(source "${arrow_dir}/.env" && echo "$VCPKG") +fi + # reduce the fetched data using a shallow clone git clone --shallow-since=2021-04-01 https://github.com/microsoft/vcpkg ${vcpkg_destination} diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index a5a012ad2c5c4..2eb58e8dc75ec 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -40,6 +40,8 @@ if [ "${ARROW_INTEGRATION_JAVA}" == "ON" ]; then pip install jpype1 fi +export ARROW_BUILD_ROOT=${build_dir} + # Get more detailed context on crashes export PYTHONFAULTHANDLER=1 diff --git a/ci/scripts/integration_arrow_build.sh b/ci/scripts/integration_arrow_build.sh index e5c31527aedff..9b54049a2b803 100755 --- a/ci/scripts/integration_arrow_build.sh +++ b/ci/scripts/integration_arrow_build.sh @@ -30,6 
+30,8 @@ build_dir=${2} ${arrow_dir}/ci/scripts/rust_build.sh ${arrow_dir} ${build_dir} +${arrow_dir}/ci/scripts/nanoarrow_build.sh ${arrow_dir} ${build_dir} + if [ "${ARROW_INTEGRATION_CPP}" == "ON" ]; then ${arrow_dir}/ci/scripts/cpp_build.sh ${arrow_dir} ${build_dir} fi diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh index 2103f0329baec..0fa1edab429c0 100755 --- a/ci/scripts/java_build.sh +++ b/ci/scripts/java_build.sh @@ -75,7 +75,16 @@ fi # Use `2 * ncores` threads mvn="${mvn} -T 2C" -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. +mkdir -p ${build_dir} +rm -rf ${build_dir}/format +cp -aL ${arrow_dir}/format ${build_dir}/ +rm -rf ${build_dir}/java +cp -aL ${source_dir} ${build_dir}/ +pushd ${build_dir}/java if [ "${ARROW_JAVA_SHADE_FLATBUFFERS}" == "ON" ]; then mvn="${mvn} -Pshade-flatbuffers" @@ -95,7 +104,7 @@ if [ "${BUILD_DOCS_JAVA}" == "ON" ]; then # HTTP pooling is turned of to avoid download issues https://issues.apache.org/jira/browse/ARROW-11633 mkdir -p ${build_dir}/docs/java/reference ${mvn} -Dcheckstyle.skip=true -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false clean install site - rsync -a ${arrow_dir}/java/target/site/apidocs/ ${build_dir}/docs/java/reference + rsync -a target/site/apidocs/ ${build_dir}/docs/java/reference fi popd diff --git a/ci/scripts/java_cdata_integration.sh b/ci/scripts/java_cdata_integration.sh index 86ea7cf155350..0ee5d3026aa09 100755 --- a/ci/scripts/java_cdata_integration.sh +++ b/ci/scripts/java_cdata_integration.sh @@ -20,9 +20,9 @@ set -ex arrow_dir=${1} -export ARROW_SOURCE_DIR=${arrow_dir} +build_dir=${2} -pushd ${arrow_dir}/java/c/src/test/python +pushd ${build_dir}/java/c/src/test/python python integration_tests.py diff --git a/ci/scripts/java_full_build.sh b/ci/scripts/java_full_build.sh index 
2734f3e9dbec2..4beade50b4556 100755 --- a/ci/scripts/java_full_build.sh +++ b/ci/scripts/java_full_build.sh @@ -49,21 +49,14 @@ fi # build the entire project mvn clean \ install \ - assembly:single \ - source:jar \ - javadoc:jar \ -Papache-release \ -Parrow-c-data \ -Parrow-jni \ -Darrow.cpp.build.dir=$dist_dir \ -Darrow.c.jni.dist.dir=$dist_dir \ - -DdescriptorId=source-release + --no-transfer-progress # copy all jar, zip and pom files to the distribution folder -find . \ - "(" -name "*-javadoc.jar" -o -name "*-sources.jar" ")" \ - -exec echo {} ";" \ - -exec cp {} $dist_dir ";" find ~/.m2/repository/org/apache/arrow \ "(" \ -name "*.jar" -o \ diff --git a/ci/scripts/java_jni_manylinux_build.sh b/ci/scripts/java_jni_manylinux_build.sh index da4987d307ce4..6f3769751af42 100755 --- a/ci/scripts/java_jni_manylinux_build.sh +++ b/ci/scripts/java_jni_manylinux_build.sh @@ -58,7 +58,7 @@ export ARROW_ORC : ${VCPKG_ROOT:=/opt/vcpkg} : ${VCPKG_FEATURE_FLAGS:=-manifests} : ${VCPKG_TARGET_TRIPLET:=${VCPKG_DEFAULT_TRIPLET:-x64-linux-static-${CMAKE_BUILD_TYPE}}} -: ${GANDIVA_CXX_FLAGS:=-isystem;${devtoolset_include_cpp};-isystem;${devtoolset_include_cpp}/x86_64-redhat-linux;-isystem;-lpthread} +: ${GANDIVA_CXX_FLAGS:=-isystem;${devtoolset_include_cpp};-isystem;${devtoolset_include_cpp}/x86_64-redhat-linux;-lpthread} if [ "${ARROW_USE_CCACHE}" == "ON" ]; then echo "=== ccache statistics before build ===" diff --git a/ci/scripts/js_build.sh b/ci/scripts/js_build.sh index d61f74f0b7ca1..196539ee0f101 100755 --- a/ci/scripts/js_build.sh +++ b/ci/scripts/js_build.sh @@ -25,7 +25,16 @@ build_dir=${2} : ${BUILD_DOCS_JS:=OFF} -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. 
+rm -rf ${build_dir}/js +mkdir -p ${build_dir} +cp -aL ${arrow_dir}/LICENSE.txt ${build_dir}/ +cp -aL ${arrow_dir}/NOTICE.txt ${build_dir}/ +cp -aL ${source_dir} ${build_dir}/js +pushd ${build_dir}/js yarn --immutable yarn lint:ci @@ -34,18 +43,18 @@ yarn build if [ "${BUILD_DOCS_JS}" == "ON" ]; then # If apache or upstream are defined use those as remote. # Otherwise use origin which could be a fork on PRs. - if [ "$(git config --get remote.apache.url)" == "git@github.com:apache/arrow.git" ]; then + if [ "$(git -C ${arrow_dir} config --get remote.apache.url)" == "git@github.com:apache/arrow.git" ]; then yarn doc --gitRemote apache - elif [[ "$(git config --get remote.upstream.url)" =~ "https://github.com/apache/arrow" ]]; then + elif [[ "$(git -C ${arrow_dir} config --get remote.upstream.url)" =~ "https://github.com/apache/arrow" ]]; then yarn doc --gitRemote upstream - elif [[ "$(basename -s .git $(git config --get remote.origin.url))" == "arrow" ]]; then + elif [[ "$(basename -s .git $(git -C ${arrow_dir} config --get remote.origin.url))" == "arrow" ]]; then yarn doc else echo "Failed to build docs because the remote is not set correctly. Please set the origin or upstream remote to https://github.com/apache/arrow.git or the apache remote to git@github.com:apache/arrow.git."
exit 0 fi mkdir -p ${build_dir}/docs/js - rsync -a ${arrow_dir}/js/doc/ ${build_dir}/docs/js + rsync -a doc/ ${build_dir}/docs/js fi popd diff --git a/ci/scripts/js_test.sh b/ci/scripts/js_test.sh index 40de974ede161..863b1c3d34613 100755 --- a/ci/scripts/js_test.sh +++ b/ci/scripts/js_test.sh @@ -20,8 +20,9 @@ set -ex source_dir=${1}/js +build_dir=${2}/js -pushd ${source_dir} +pushd ${build_dir} yarn lint yarn test diff --git a/ci/scripts/nanoarrow_build.sh b/ci/scripts/nanoarrow_build.sh new file mode 100755 index 0000000000000..1612b9a2d0102 --- /dev/null +++ b/ci/scripts/nanoarrow_build.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +arrow_dir=${1} +source_dir=${1}/nanoarrow +build_dir=${2}/nanoarrow + +# This file is used to build the nanoarrow binaries needed for the archery +# integration tests. Testing of the nanoarrow implementation in normal CI is handled +# by github workflows in the arrow-nanoarrow repository. 
+ +if [ "${ARCHERY_INTEGRATION_WITH_NANOARROW}" -eq "0" ]; then + echo "=====================================================================" + echo "Not building nanoarrow" + echo "=====================================================================" + exit 0; +elif [ ! -d "${source_dir}" ]; then + echo "=====================================================================" + echo "The nanoarrow source is missing. Please clone the arrow-nanoarrow repository" + echo "to arrow/nanoarrow before running the integration tests:" + echo " git clone https://github.com/apache/arrow-nanoarrow.git path/to/arrow/nanoarrow" + echo "=====================================================================" + exit 1; +fi + +set -x + +mkdir -p ${build_dir} +pushd ${build_dir} + +cmake ${source_dir} -DNANOARROW_BUILD_INTEGRATION_TESTS=ON +cmake --build . + +popd diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index 7a24c1172f7b1..9455baf353633 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -74,19 +74,46 @@ export PYARROW_WITH_SUBSTRAIT=${ARROW_SUBSTRAIT:-OFF} export PYARROW_PARALLEL=${n_jobs} +: ${CMAKE_PREFIX_PATH:=${ARROW_HOME}} +export CMAKE_PREFIX_PATH export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. +rm -rf ${python_build_dir} +cp -aL ${source_dir} ${python_build_dir} +pushd ${python_build_dir} # - Cannot call setup.py as it may install in the wrong directory # on Debian/Ubuntu (ARROW-15243). # - Cannot use build isolation as we want to use specific dependency versions # (e.g. Numpy, Pandas) on some CI jobs. ${PYTHON:-python} -m pip install --no-deps --no-build-isolation -vv . 
-# Remove build artifacts from source directory -find build/ -user root -delete popd if [ "${BUILD_DOCS_PYTHON}" == "ON" ]; then + # https://github.com/apache/arrow/issues/41429 + # TODO: We want to out-of-source build. This is a workaround. + # + # Copy docs/source because the "autosummary_generate = True" + # configuration generates files to docs/source/python/generated/. + rm -rf ${python_build_dir}/docs/source + mkdir -p ${python_build_dir}/docs + cp -a ${arrow_dir}/docs/source ${python_build_dir}/docs/ + rm -rf ${python_build_dir}/format + cp -a ${arrow_dir}/format ${python_build_dir}/ + rm -rf ${python_build_dir}/cpp/examples + mkdir -p ${python_build_dir}/cpp + cp -a ${arrow_dir}/cpp/examples ${python_build_dir}/cpp/ + rm -rf ${python_build_dir}/ci + cp -a ${arrow_dir}/ci/ ${python_build_dir}/ ncpus=$(python -c "import os; print(os.cpu_count())") - sphinx-build -b html -j ${ncpus} ${arrow_dir}/docs/source ${build_dir}/docs + export ARROW_CPP_DOXYGEN_XML=${build_dir}/cpp/apidoc/xml + pushd ${build_dir} + sphinx-build \ + -b html \ + ${python_build_dir}/docs/source \ + ${build_dir}/docs + popd fi diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index a94dac40e931f..3ed9d5d8dd12f 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -50,15 +50,12 @@ echo "=== (${PYTHON_VERSION}) Install Python build dependencies ===" export PIP_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])') export PIP_TARGET_PLATFORM="macosx_${MACOSX_DEPLOYMENT_TARGET//./_}_${arch}" -# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release pip install \ --upgrade \ --only-binary=:all: \ --target $PIP_SITE_PACKAGES \ --platform $PIP_TARGET_PLATFORM \ - -r ${source_dir}/python/requirements-wheel-build.txt \ - --pre \ - --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" + -r 
${source_dir}/python/requirements-wheel-build.txt pip install "delocate>=0.10.3" echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index 6e29ef58d2318..aa86494a9d47d 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -160,6 +160,26 @@ export CMAKE_PREFIX_PATH=/tmp/arrow-dist pushd /arrow/python python setup.py bdist_wheel +echo "=== Strip symbols from wheel ===" +mkdir -p dist/temp-fix-wheel +mv dist/pyarrow-*.whl dist/temp-fix-wheel + +pushd dist/temp-fix-wheel +wheel_name=$(ls pyarrow-*.whl) +# Unzip and remove old wheel +unzip $wheel_name +rm $wheel_name +for filename in $(ls pyarrow/*.so pyarrow/*.so.*); do + echo "Stripping debug symbols from: $filename"; + strip --strip-debug $filename +done +# Zip wheel again after stripping symbols +zip -r $wheel_name . +mv $wheel_name .. +popd + +rm -rf dist/temp-fix-wheel + echo "=== (${PYTHON_VERSION}) Tag the wheel with manylinux${MANYLINUX_VERSION} ===" auditwheel repair -L . dist/pyarrow-*.whl -w repaired_wheels popd diff --git a/ci/scripts/r_build.sh b/ci/scripts/r_build.sh index 38b54e4434036..f4dc5a5781c6e 100755 --- a/ci/scripts/r_build.sh +++ b/ci/scripts/r_build.sh @@ -24,15 +24,29 @@ build_dir=${2} : ${BUILD_DOCS_R:=OFF} -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. +rm -rf ${build_dir}/r +cp -aL ${source_dir} ${build_dir}/r +pushd ${build_dir}/r # build first so that any stray compiled files in r/src are ignored ${R_BIN} CMD build . 
-${R_BIN} CMD INSTALL ${INSTALL_ARGS} arrow*.tar.gz +if [ -x "$(command -v sudo)" ]; then + SUDO=sudo +else + SUDO= +fi +${SUDO} \ + env \ + PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig:${PKG_CONFIG_PATH} \ + ${R_BIN} CMD INSTALL ${INSTALL_ARGS} arrow*.tar.gz if [ "${BUILD_DOCS_R}" == "ON" ]; then ${R_BIN} -e "pkgdown::build_site(install = FALSE)" - rsync -a ${source_dir}/docs/ ${build_dir}/docs/r + rsync -a docs/ ${build_dir}/docs/r fi popd diff --git a/ci/scripts/r_docker_configure.sh b/ci/scripts/r_docker_configure.sh index 52db2e6df6611..8a962fe576cbb 100755 --- a/ci/scripts/r_docker_configure.sh +++ b/ci/scripts/r_docker_configure.sh @@ -67,26 +67,6 @@ sloppiness = include_file_ctime hash_dir = false" >> ~/.ccache/ccache.conf fi -# Special hacking to try to reproduce quirks on centos using non-default build -# tooling. -if [[ -n "$DEVTOOLSET_VERSION" ]]; then - $PACKAGE_MANAGER install -y centos-release-scl - $PACKAGE_MANAGER install -y "devtoolset-$DEVTOOLSET_VERSION" - - # Enable devtoolset here so that `which gcc` finds the right compiler below - source /opt/rh/devtoolset-${DEVTOOLSET_VERSION}/enable - - # Build images which require the devtoolset don't have CXX17 variables - # set as the system compiler doesn't support C++17 - if [ ! 
"`{R_BIN} CMD config CXX17`" ]; then - mkdir -p ~/.R - echo "CC = $(which gcc) -fPIC" >> ~/.R/Makevars - echo "CXX17 = $(which g++) -fPIC" >> ~/.R/Makevars - echo "CXX17STD = -std=c++17" >> ~/.R/Makevars - echo "CXX17FLAGS = ${CXX11FLAGS}" >> ~/.R/Makevars - fi -fi - if [ -f "${ARROW_SOURCE_HOME}/ci/scripts/r_install_system_dependencies.sh" ]; then "${ARROW_SOURCE_HOME}/ci/scripts/r_install_system_dependencies.sh" fi diff --git a/ci/scripts/r_install_system_dependencies.sh b/ci/scripts/r_install_system_dependencies.sh index be0d75ef235e6..7ddc2604f661a 100755 --- a/ci/scripts/r_install_system_dependencies.sh +++ b/ci/scripts/r_install_system_dependencies.sh @@ -21,29 +21,30 @@ set -ex : ${ARROW_SOURCE_HOME:=/arrow} -if [ "$ARROW_S3" == "ON" ] || [ "$ARROW_GCS" == "ON" ] || [ "$ARROW_R_DEV" == "TRUE" ]; then - # Figure out what package manager we have - if [ "`which dnf`" ]; then - PACKAGE_MANAGER=dnf - elif [ "`which yum`" ]; then - PACKAGE_MANAGER=yum - elif [ "`which zypper`" ]; then - PACKAGE_MANAGER=zypper - else - PACKAGE_MANAGER=apt-get - apt-get update - fi +# Figure out what package manager we have +if [ "`which dnf`" ]; then + PACKAGE_MANAGER=dnf +elif [ "`which yum`" ]; then + PACKAGE_MANAGER=yum +elif [ "`which zypper`" ]; then + PACKAGE_MANAGER=zypper +else + PACKAGE_MANAGER=apt-get + apt-get update +fi - # Install curl and OpenSSL for S3/GCS support - case "$PACKAGE_MANAGER" in - apt-get) - apt-get install -y libcurl4-openssl-dev libssl-dev - ;; - *) - $PACKAGE_MANAGER install -y libcurl-devel openssl-devel - ;; - esac +# Install curl and OpenSSL (technically, only needed for S3/GCS support, but +# installing the R curl package fails without it) +case "$PACKAGE_MANAGER" in + apt-get) + apt-get install -y libcurl4-openssl-dev libssl-dev + ;; + *) + $PACKAGE_MANAGER install -y libcurl-devel openssl-devel + ;; +esac +if [ "$ARROW_S3" == "ON" ] || [ "$ARROW_GCS" == "ON" ] || [ "$ARROW_R_DEV" == "TRUE" ]; then # The Dockerfile should have put this file here 
if [ "$ARROW_S3" == "ON" ] && [ -f "${ARROW_SOURCE_HOME}/ci/scripts/install_minio.sh" ] && [ "`which wget`" ]; then "${ARROW_SOURCE_HOME}/ci/scripts/install_minio.sh" latest /usr/local diff --git a/ci/scripts/r_sanitize.sh b/ci/scripts/r_sanitize.sh index 600ee0fa2cbe5..fb3e9a5836387 100755 --- a/ci/scripts/r_sanitize.sh +++ b/ci/scripts/r_sanitize.sh @@ -46,10 +46,12 @@ unset ARROW_R_DEV export ARROW_R_VERBOSE_TEST=TRUE export UBSAN_OPTIONS="print_stacktrace=1,suppressions=/arrow/r/tools/ubsan.supp" +# From the old rhub image https://github.com/r-hub/rhub-linux-builders/blob/master/fedora-clang-devel-san/Dockerfile +export ASAN_OPTIONS="alloc_dealloc_mismatch=0:detect_leaks=0:detect_odr_violation=0" # run tests pushd tests -${R_BIN} < testthat.R > testthat.out 2>&1 || { cat testthat.out; exit 1; } +${R_BIN} --no-save < testthat.R > testthat.out 2>&1 || { cat testthat.out; exit 1; } cat testthat.out if grep -q "runtime error" testthat.out; then @@ -58,7 +60,7 @@ fi # run examples popd -${R_BIN} -e 'library(arrow); testthat::test_examples(".")' >> examples.out 2>&1 || { cat examples.out; exit 1; } +${R_BIN} --no-save -e 'library(arrow); testthat::test_examples(".")' >> examples.out 2>&1 || { cat examples.out; exit 1; } cat examples.out if grep -q "runtime error" examples.out; then diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh index 72078ab3c06c2..fe9d18edb8cbb 100755 --- a/ci/scripts/r_test.sh +++ b/ci/scripts/r_test.sh @@ -46,7 +46,9 @@ if [ "$ARROW_USE_PKG_CONFIG" != "false" ]; then export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} export R_LD_LIBRARY_PATH=${LD_LIBRARY_PATH} fi -export _R_CHECK_COMPILATION_FLAGS_KNOWN_=${ARROW_R_CXXFLAGS} + +export _R_CHECK_COMPILATION_FLAGS_KNOWN_="${_R_CHECK_COMPILATION_FLAGS_KNOWN_} ${ARROW_R_CXXFLAGS}" + if [ "$ARROW_R_DEV" = "TRUE" ]; then # These are sometimes used in the Arrow C++ build and are not a problem export _R_CHECK_COMPILATION_FLAGS_KNOWN_="${_R_CHECK_COMPILATION_FLAGS_KNOWN_} -Wno-attributes 
-msse4.2 -Wno-noexcept-type -Wno-subobject-linkage" @@ -108,16 +110,15 @@ SCRIPT="as_cran <- !identical(tolower(Sys.getenv('NOT_CRAN')), 'true') on.exit(tools::pskill(pid_flight), add = TRUE) } - run_donttest <- identical(tolower(Sys.getenv('_R_CHECK_DONTTEST_EXAMPLES_', 'true')), 'true') - if (run_donttest) { - args <- c(args, '--run-donttest') - } - install_args <- Sys.getenv('INSTALL_ARGS') if (nzchar(install_args)) { args <- c(args, paste0('--install-args=\"', install_args, '\"')) } + message('Running rcmdcheck with:\n') + print(build_args) + print(args) + rcmdcheck::rcmdcheck(build_args = build_args, args = args, error_on = 'warning', check_dir = 'check', timeout = 3600)" echo "$SCRIPT" | ${R_BIN} --no-save diff --git a/ci/scripts/r_valgrind.sh b/ci/scripts/r_valgrind.sh index a14cb803ca898..0e40d792111c4 100755 --- a/ci/scripts/r_valgrind.sh +++ b/ci/scripts/r_valgrind.sh @@ -33,7 +33,7 @@ ${R_BIN} CMD INSTALL ${INSTALL_ARGS} arrow*.tar.gz pushd tests # to generate suppression files run: -# ${R_BIN} --vanilla -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --gen-suppressions=all --log-file=memcheck.log" -f testthat.supp +# ${R_BIN} --vanilla -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --gen-suppressions=all --log-file=memcheck.log" -f testthat.R ${R_BIN} --vanilla -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --suppressions=/${1}/ci/etc/valgrind-cran.supp" -f testthat.R |& tee testthat.out # valgrind --error-exitcode=1 should return an erroring exit code that we can catch, diff --git a/ci/scripts/ruby_test.sh b/ci/scripts/ruby_test.sh index 56c33a4d6378a..507fa7858e870 100755 --- a/ci/scripts/ruby_test.sh +++ b/ci/scripts/ruby_test.sh @@ -22,6 +22,7 @@ set -ex source_dir=${1}/ruby build_dir=${2}/ruby +export DYLD_LIBRARY_PATH=${ARROW_HOME}/lib:${DYLD_LIBRARY_PATH} export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig export 
GI_TYPELIB_PATH=${ARROW_HOME}/lib/girepository-1.0 diff --git a/ci/scripts/swift_test.sh b/ci/scripts/swift_test.sh index b523e3891d93c..aba90f31e50d5 100755 --- a/ci/scripts/swift_test.sh +++ b/ci/scripts/swift_test.sh @@ -34,10 +34,14 @@ popd source_dir=${1}/swift/Arrow pushd ${source_dir} +sed 's/\/\/ build://g' Package.swift > Package.swift.build +mv Package.swift.build Package.swift swift test popd source_dir=${1}/swift/ArrowFlight pushd ${source_dir} +sed 's/\/\/ build://g' Package.swift > Package.swift.build +mv Package.swift.build Package.swift swift test popd diff --git a/ci/vcpkg/ports.patch b/ci/vcpkg/ports.patch index 0d4fb540a2003..136b719ea72dd 100644 --- a/ci/vcpkg/ports.patch +++ b/ci/vcpkg/ports.patch @@ -1,11 +1,11 @@ diff --git a/ports/curl/portfile.cmake b/ports/curl/portfile.cmake -index bdc544e9e..53f6bbc3b 100644 +index 7cab6f726..697ab1bb4 100644 --- a/ports/curl/portfile.cmake +++ b/ports/curl/portfile.cmake -@@ -74,9 +74,12 @@ vcpkg_cmake_configure( - -DENABLE_MANUAL=OFF +@@ -84,9 +84,12 @@ vcpkg_cmake_configure( + -DBUILD_TESTING=OFF + -DENABLE_CURL_MANUAL=OFF -DCURL_CA_FALLBACK=ON - -DCURL_USE_LIBPSL=OFF + -DCURL_CA_PATH=none + -DCURL_CA_BUNDLE=none -DCMAKE_DISABLE_FIND_PACKAGE_Perl=ON @@ -15,6 +15,19 @@ index bdc544e9e..53f6bbc3b 100644 ) vcpkg_cmake_install() vcpkg_copy_pdbs() +diff --git a/ports/llvm/portfile.cmake b/ports/llvm/portfile.cmake +index a79c72a59..6b7fa6a66 100644 +--- a/ports/llvm/portfile.cmake ++++ b/ports/llvm/portfile.cmake +@@ -292,6 +292,8 @@ vcpkg_cmake_configure( + ${FEATURE_OPTIONS} + MAYBE_UNUSED_VARIABLES + COMPILER_RT_ENABLE_IOS ++ BOLT_TOOLS_INSTALL_DIR ++ LIBOMP_INSTALL_ALIASES + ) + + vcpkg_cmake_install(ADD_BIN_TO_PATH) diff --git a/ports/snappy/portfile.cmake b/ports/snappy/portfile.cmake index 0c7098082..c603c3653 100644 --- a/ports/snappy/portfile.cmake @@ -52,16 +65,3 @@ index 000000000..e839c93a4 + } + + static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) { -diff --git 
a/ports/llvm/portfile.cmake b/ports/llvm/portfile.cmake -index bf9397b66..c3112b673 100644 ---- a/ports/llvm/portfile.cmake -+++ b/ports/llvm/portfile.cmake -@@ -293,6 +293,8 @@ vcpkg_cmake_configure( - ${FEATURE_OPTIONS} - MAYBE_UNUSED_VARIABLES - COMPILER_RT_ENABLE_IOS -+ BOLT_TOOLS_INSTALL_DIR -+ LIBOMP_INSTALL_ALIASES - ) - - vcpkg_cmake_install(ADD_BIN_TO_PATH) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1fbf0bfcfb528..679842c31e0b1 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -71,7 +71,7 @@ if(POLICY CMP0135) cmake_policy(SET CMP0135 NEW) endif() -set(ARROW_VERSION "16.0.0-SNAPSHOT") +set(ARROW_VERSION "17.0.0-SNAPSHOT") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") @@ -256,9 +256,6 @@ if(ARROW_USE_CCACHE endif() if(ARROW_OPTIONAL_INSTALL) - # Don't make the "install" target depend on the "all" target - set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY true) - set(INSTALL_IS_OPTIONAL OPTIONAL) endif() @@ -434,10 +431,6 @@ endif() # Compiler flags # -if(ARROW_NO_DEPRECATED_API) - add_definitions(-DARROW_NO_DEPRECATED_API) -endif() - if(ARROW_EXTRA_ERROR_CONTEXT) add_definitions(-DARROW_EXTRA_ERROR_CONTEXT) endif() @@ -711,7 +704,7 @@ list(APPEND ARROW_TEST_LINK_LIBS ${ARROW_GTEST_GMOCK} ${ARROW_GTEST_GTEST_MAIN}) if(ARROW_BUILD_BENCHMARKS) set(ARROW_BENCHMARK_LINK_LIBS benchmark::benchmark_main ${ARROW_TEST_LINK_LIBS}) if(WIN32) - list(APPEND ARROW_BENCHMARK_LINK_LIBS Shlwapi.dll) + list(APPEND ARROW_BENCHMARK_LINK_LIBS shlwapi) endif() endif() diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index 9d99b3b2a79e0..cb4cdfc03ac82 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -46,6 +46,32 @@ "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, + { + "name": "features-emscripten", + "hidden": true, + "cacheVariables": { + "ARROW_ACERO": "ON", + "ARROW_BUILD_SHARED": "OFF", + "ARROW_BUILD_STATIC": "ON", + "ARROW_CUDA": "OFF", + "ARROW_DEPENDENCY_SOURCE": "BUNDLED", + 
"ARROW_DEPENDENCY_USE_SHARED": "OFF", + "ARROW_ENABLE_THREADING": "OFF", + "ARROW_FLIGHT": "OFF", + "ARROW_IPC": "ON", + "ARROW_JEMALLOC": "OFF", + "ARROW_MIMALLOC": "OFF", + "ARROW_ORC": "ON", + "ARROW_RUNTIME_SIMD_LEVEL": "NONE", + "ARROW_S3": "OFF", + "ARROW_SIMD_LEVEL": "NONE", + "ARROW_SUBSTRAIT": "ON", + "ARROW_WITH_BROTLI": "ON", + "ARROW_WITH_OPENTELEMETRY": "OFF", + "ARROW_WITH_SNAPPY": "ON", + "CMAKE_C_BYTE_ORDER": "LITTLE_ENDIAN" + } + }, { "name": "features-minimal", "hidden": true, @@ -194,6 +220,14 @@ "PARQUET_REQUIRE_ENCRYPTION": "ON" } }, + { + "name": "features-valgrind", + "hidden": true, + "cacheVariables": { + "ARROW_RUNTIME_SIMD_LEVEL": "AVX2", + "ARROW_TEST_MEMCHECK": "ON" + } + }, { "name": "ninja-debug-minimal", "inherits": [ @@ -305,6 +339,46 @@ "displayName": "Debug build with everything enabled (except benchmarks)", "cacheVariables": {} }, + { + "name": "ninja-debug-valgrind-basic", + "inherits": [ + "base-debug", + "features-basic", + "features-valgrind" + ], + "displayName": "Debug build for Valgrind with reduced dependencies", + "cacheVariables": {} + }, + { + "name": "ninja-debug-valgrind", + "inherits": [ + "base-debug", + "features-main", + "features-valgrind" + ], + "displayName": "Debug build for Valgrind with more optional components", + "cacheVariables": {} + }, + { + "name": "ninja-debug-valgrind-minimal", + "inherits": [ + "base-debug", + "features-minimal", + "features-valgrind" + ], + "displayName": "Debug build for Valgrind without anything enabled", + "cacheVariables": {} + }, + { + "name": "ninja-debug-valgrind-maximal", + "inherits": [ + "base-debug", + "features-maximal", + "features-valgrind" + ], + "displayName": "Debug build for Valgrind with everything enabled", + "cacheVariables": {} + }, { "name": "ninja-release-minimal", "inherits": [ @@ -341,6 +415,24 @@ "displayName": "Release build with CUDA integration", "cacheVariables": {} }, + { + "name": "ninja-debug-emscripten", + "inherits": [ + "features-emscripten", + 
"base-debug" + ], + "displayName": "Debug build which builds an Emscripten library", + "cacheVariables": {} + }, + { + "name": "ninja-release-emscripten", + "inherits": [ + "features-emscripten", + "base-release" + ], + "displayName": "Release build which builds an Emscripten library", + "cacheVariables": {} + }, { "name": "ninja-release-flight", "inherits": [ diff --git a/cpp/apidoc/Doxyfile b/cpp/apidoc/Doxyfile index e19c933cd454f..5be93032c00d9 100644 --- a/cpp/apidoc/Doxyfile +++ b/cpp/apidoc/Doxyfile @@ -2168,16 +2168,17 @@ INCLUDE_FILE_PATTERNS = PREDEFINED = __attribute__(x)= \ __declspec(x)= \ - PARQUET_EXPORT= \ - GANDIVA_EXPORT= \ - ARROW_EXPORT= \ ARROW_ACERO_EXPORT= \ + ARROW_ARG_UNUSED(x)=x \ + ARROW_DEPRECATED(x)= \ ARROW_DS_EXPORT= \ ARROW_ENGINE_EXPORT= \ + ARROW_EXPORT= \ + ARROW_EXTERN_TEMPLATE= \ ARROW_FLIGHT_EXPORT= \ ARROW_FLIGHT_SQL_EXPORT= \ - ARROW_EXTERN_TEMPLATE= \ - ARROW_DEPRECATED(x)= + GANDIVA_EXPORT= \ + PARQUET_EXPORT= # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/cpp/build-support/emscripten-test-init.js b/cpp/build-support/emscripten-test-init.js new file mode 100644 index 0000000000000..bbb542a29f021 --- /dev/null +++ b/cpp/build-support/emscripten-test-init.js @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +var Module = { +}; + +// make sure tests can access the current parquet test data files +Module.preRun = () => {ENV.PARQUET_TEST_DATA = process.env.PARQUET_TEST_DATA; + ENV.ARROW_TEST_DATA = process.env.ARROW_TEST_DATA; +}; \ No newline at end of file diff --git a/cpp/build-support/update-thrift.sh b/cpp/build-support/update-thrift.sh index 9b8f2539cffe3..9e050a5e49d64 100755 --- a/cpp/build-support/update-thrift.sh +++ b/cpp/build-support/update-thrift.sh @@ -20,4 +20,4 @@ # Run this from cpp/ directory. thrift is expected to be in your path -thrift --gen cpp:moveable_types -out src/generated src/parquet/parquet.thrift +thrift --gen cpp:moveable_types,templates -out src/generated src/parquet/parquet.thrift diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 7a45e9cca59de..e7523add27223 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -760,8 +760,8 @@ function(ADD_TEST_CASE REL_TEST_NAME) valgrind --suppressions=valgrind.supp --tool=memcheck --gen-suppressions=all \ --num-callers=500 --leak-check=full --leak-check-heuristics=stdstring \ --error-exitcode=1 ${TEST_PATH} ${ARG_TEST_ARGUMENTS}") - elseif(WIN32) - add_test(${TEST_NAME} ${TEST_PATH} ${ARG_TEST_ARGUMENTS}) + elseif(WIN32 OR CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME} ${ARG_TEST_ARGUMENTS}) else() add_test(${TEST_NAME} ${BUILD_SUPPORT_DIR}/run-test.sh diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 
5b8bcb3ac6965..41466a1c22404 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -158,8 +158,6 @@ if(ARROW_DEFINE_OPTIONS) define_option_string(ARROW_GIT_DESCRIPTION "The Arrow git commit description (if any)" "") - define_option(ARROW_NO_DEPRECATED_API "Exclude deprecated APIs from build" OFF) - define_option(ARROW_POSITION_INDEPENDENT_CODE "Whether to create position-independent target" ON) @@ -170,6 +168,8 @@ takes precedence over ccache if a storage backend is configured" ON) define_option(ARROW_USE_LD_GOLD "Use ld.gold for linking on Linux (if available)" OFF) + define_option(ARROW_USE_LLD "Use the LLVM lld for linking (if available)" OFF) + define_option(ARROW_USE_MOLD "Use mold for linking on Linux (if available)" OFF) define_option(ARROW_USE_PRECOMPILED_HEADERS "Use precompiled headers when compiling" diff --git a/cpp/cmake_modules/FindProtobufAlt.cmake b/cpp/cmake_modules/FindProtobufAlt.cmake index 15fe1b4f27ef7..703e05c4731b6 100644 --- a/cpp/cmake_modules/FindProtobufAlt.cmake +++ b/cpp/cmake_modules/FindProtobufAlt.cmake @@ -28,12 +28,27 @@ endif() if(ProtobufAlt_FIND_QUIETLY) list(APPEND find_package_args QUIET) endif() -find_package(Protobuf ${find_package_args}) -set(ProtobufAlt_FOUND ${Protobuf_FOUND}) +find_package(protobuf CONFIG ${find_package_args}) +set(ProtobufAlt_FOUND ${protobuf_FOUND}) if(ProtobufAlt_FOUND) - set(ProtobufAlt_VERSION ${Protobuf_VERSION}) - set(ProtobufAlt_VERSION_MAJOR ${Protobuf_VERSION_MAJOR}) - set(ProtobufAlt_VERSION_MINOR ${Protobuf_VERSION_MINOR}) - set(ProtobufAlt_VERSION_PATCH ${Protobuf_VERSION_PATCH}) - set(ProtobufAlt_VERSION_TWEEK ${Protobuf_VERSION_TWEEK}) + if(Protobuf_PROTOC_EXECUTABLE) + # work around https://github.com/protocolbuffers/protobuf/issues/14576 + set_target_properties(protobuf::protoc PROPERTIES IMPORTED_LOCATION_RELEASE + "${Protobuf_PROTOC_EXECUTABLE}") + endif() + set(ProtobufAlt_VERSION ${protobuf_VERSION}) + set(ProtobufAlt_VERSION_MAJOR 
${protobuf_VERSION_MAJOR}) + set(ProtobufAlt_VERSION_MINOR ${protobuf_VERSION_MINOR}) + set(ProtobufAlt_VERSION_PATCH ${protobuf_VERSION_PATCH}) + set(ProtobufAlt_VERSION_TWEEK ${protobuf_VERSION_TWEEK}) +else() + find_package(Protobuf ${find_package_args}) + set(ProtobufAlt_FOUND ${Protobuf_FOUND}) + if(ProtobufAlt_FOUND) + set(ProtobufAlt_VERSION ${Protobuf_VERSION}) + set(ProtobufAlt_VERSION_MAJOR ${Protobuf_VERSION_MAJOR}) + set(ProtobufAlt_VERSION_MINOR ${Protobuf_VERSION_MINOR}) + set(ProtobufAlt_VERSION_PATCH ${Protobuf_VERSION_PATCH}) + set(ProtobufAlt_VERSION_TWEEK ${Protobuf_VERSION_TWEEK}) + endif() endif() diff --git a/cpp/cmake_modules/FindorcAlt.cmake b/cpp/cmake_modules/FindorcAlt.cmake index dc3b978cf4037..ce8cd11b4c3f0 100644 --- a/cpp/cmake_modules/FindorcAlt.cmake +++ b/cpp/cmake_modules/FindorcAlt.cmake @@ -29,6 +29,7 @@ endif() find_package(orc ${find_package_args}) if(orc_FOUND) set(orcAlt_FOUND TRUE) + set(orcAlt_VERSION ${orc_VERSION}) return() endif() @@ -51,8 +52,17 @@ else() NAMES orc/orc-config.hh PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) endif() +if(ORC_INCLUDE_DIR) + file(READ "${ORC_INCLUDE_DIR}/orc/orc-config.hh" ORC_CONFIG_HH_CONTENT) + string(REGEX MATCH "#define ORC_VERSION \"[0-9.]+\"" ORC_VERSION_DEFINITION + "${ORC_CONFIG_HH_CONTENT}") + string(REGEX MATCH "[0-9.]+" ORC_VERSION "${ORC_VERSION_DEFINITION}") +endif() -find_package_handle_standard_args(orcAlt REQUIRED_VARS ORC_STATIC_LIB ORC_INCLUDE_DIR) +find_package_handle_standard_args( + orcAlt + REQUIRED_VARS ORC_STATIC_LIB ORC_INCLUDE_DIR + VERSION_VAR ORC_VERSION) if(orcAlt_FOUND) if(NOT TARGET orc::orc) @@ -61,4 +71,5 @@ if(orcAlt_FOUND) PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}" INTERFACE_INCLUDE_DIRECTORIES "${ORC_INCLUDE_DIR}") endif() + set(orcAlt_VERSION ${ORC_VERSION}) endif() diff --git a/cpp/cmake_modules/Findutf8proc.cmake b/cpp/cmake_modules/Findutf8proc.cmake index e347414090549..9721f76f0631b 100644 --- a/cpp/cmake_modules/Findutf8proc.cmake +++ 
b/cpp/cmake_modules/Findutf8proc.cmake @@ -19,7 +19,7 @@ if(utf8proc_FOUND) return() endif() -if(ARROW_PACKAGE_KIND STREQUAL "vcpkg") +if(ARROW_PACKAGE_KIND STREQUAL "vcpkg" OR VCPKG_TOOLCHAIN) set(find_package_args "") if(utf8proc_FIND_VERSION) list(APPEND find_package_args ${utf8proc_FIND_VERSION}) diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 90decb4224ec6..e2e1c4412abd0 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -24,7 +24,9 @@ include(CheckCXXSourceCompiles) message(STATUS "System processor: ${CMAKE_SYSTEM_PROCESSOR}") if(NOT DEFINED ARROW_CPU_FLAG) - if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64|amd64|X86|x86|i[3456]86|x64") + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + set(ARROW_CPU_FLAG "emscripten") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64|amd64|X86|x86|i[3456]86|x64") set(ARROW_CPU_FLAG "x86") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64|arm64") set(ARROW_CPU_FLAG "aarch64") @@ -312,7 +314,13 @@ if("${BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wall") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wextra") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wdocumentation") - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wshorten-64-to-32") + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DARROW_WARN_DOCUMENTATION") + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # size_t is 32 bit in Emscripten wasm32 - ignore conversion errors + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-shorten-64-to-32") + else() + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wshorten-64-to-32") + endif() set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-missing-braces") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-unused-parameter") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-constant-logical-operand") @@ -322,8 +330,9 @@ if("${BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wall") set(CXX_COMMON_FLAGS 
"${CXX_COMMON_FLAGS} -Wno-conversion") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-sign-conversion") - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wunused-result") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wdate-time") + string(APPEND CXX_ONLY_FLAGS " -Wredundant-move") + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wunused-result") elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM") if(WIN32) @@ -633,19 +642,23 @@ if(NOT WIN32 AND NOT APPLE) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "12.1.0") set(MOLD_LINKER_FLAGS "-fuse-ld=mold") - message(STATUS "Using optional mold linker") else() message(STATUS "Need GCC 12.1.0 or later to use mold linker: ${CMAKE_CXX_COMPILER_VERSION}" ) endif() elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set(MOLD_LINKER_FLAGS "--ld-path=${LD_MOLD}") - message(STATUS "Using optional mold linker") + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "12.0.0") + set(MOLD_LINKER_FLAGS "--ld-path=${LD_MOLD}") + else() + message(STATUS "Need clang 12.0.0 or later to use mold linker: ${CMAKE_CXX_COMPILER_VERSION}" + ) + endif() else() message(STATUS "Using the default linker because compiler doesn't support mold: ${CMAKE_CXX_COMPILER_ID}" ) endif() if(MOLD_LINKER_FLAGS) + message(STATUS "Using optional mold linker") string(APPEND CMAKE_EXE_LINKER_FLAGS " ${MOLD_LINKER_FLAGS}") string(APPEND CMAKE_MODULE_LINKER_FLAGS " ${MOLD_LINKER_FLAGS}") string(APPEND CMAKE_SHARED_LINKER_FLAGS " ${MOLD_LINKER_FLAGS}") @@ -656,6 +669,39 @@ if(NOT WIN32 AND NOT APPLE) endif() endif() +if(ARROW_USE_LLD) + find_program(LD_LLD ld.lld) + if(LD_LLD) + unset(LLD_LINKER_FLAGS) + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.1.0") + set(LLD_LINKER_FLAGS "-fuse-ld=lld") + else() + message(STATUS "Need GCC 9.1.0 or later to use LLD linker: ${CMAKE_CXX_COMPILER_VERSION}" + ) + endif() + elseif(CMAKE_CXX_COMPILER_ID STREQUAL 
"Clang") + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "12.0.0") + set(LLD_LINKER_FLAGS "--ld-path=${LD_LLD}") + else() + message(STATUS "Need clang 12.0.0 or later to use LLD linker: ${CMAKE_CXX_COMPILER_VERSION}" + ) + endif() + else() + message(STATUS "Using the default linker because compiler doesn't support LLD: ${CMAKE_CXX_COMPILER_ID}" + ) + endif() + if(LLD_LINKER_FLAGS) + message(STATUS "Using optional LLVM LLD linker") + string(APPEND CMAKE_EXE_LINKER_FLAGS " ${LLD_LINKER_FLAGS}") + string(APPEND CMAKE_MODULE_LINKER_FLAGS " ${LLD_LINKER_FLAGS}") + string(APPEND CMAKE_SHARED_LINKER_FLAGS " ${LLD_LINKER_FLAGS}") + else() + message(STATUS "Using the default linker because the LLD isn't supported") + endif() + endif() +endif() + # compiler flags for different build types (run 'cmake -DCMAKE_BUILD_TYPE= .') # For all builds: # For CMAKE_BUILD_TYPE=Debug @@ -692,17 +738,36 @@ if(NOT MSVC) set(C_DEBUG_FLAGS "") set(CXX_DEBUG_FLAGS "") if(NOT MSVC) - if(NOT CMAKE_C_FLAGS_DEBUG MATCHES "-O") - string(APPEND C_DEBUG_FLAGS " -O0") - endif() - if(NOT CMAKE_CXX_FLAGS_DEBUG MATCHES "-O") - string(APPEND CXX_DEBUG_FLAGS " -O0") - endif() - if(ARROW_GGDB_DEBUG) - string(APPEND C_DEBUG_FLAGS " -ggdb") - string(APPEND CXX_DEBUG_FLAGS " -ggdb") - string(APPEND C_RELWITHDEBINFO_FLAGS " -ggdb") - string(APPEND CXX_RELWITHDEBINFO_FLAGS " -ggdb") + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # with -g it uses DWARF debug info, which is really slow to build + # on emscripten (and uses tons of memory) + string(REPLACE "-g" " " CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) + string(REPLACE "-g" " " CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + string(APPEND C_DEBUG_FLAGS " -g2") + string(APPEND CXX_DEBUG_FLAGS " -g2") + string(APPEND C_RELWITHDEBINFO_FLAGS " -g2") + string(APPEND CXX_RELWITHDEBINFO_FLAGS " -g2") + # without -O1, emscripten executables are *MASSIVE*. 
Don't use -O0 + if(NOT CMAKE_C_FLAGS_DEBUG MATCHES "-O") + string(APPEND C_DEBUG_FLAGS " -O1") + endif() + if(NOT CMAKE_CXX_FLAGS_DEBUG MATCHES "-O") + string(APPEND CXX_DEBUG_FLAGS " -O1") + endif() + else() + if(NOT CMAKE_C_FLAGS_DEBUG MATCHES "-O") + string(APPEND C_DEBUG_FLAGS " -O0") + endif() + if(NOT CMAKE_CXX_FLAGS_DEBUG MATCHES "-O") + string(APPEND CXX_DEBUG_FLAGS " -O0") + endif() + + if(ARROW_GGDB_DEBUG) + string(APPEND C_DEBUG_FLAGS " -ggdb") + string(APPEND CXX_DEBUG_FLAGS " -ggdb") + string(APPEND C_RELWITHDEBINFO_FLAGS " -ggdb") + string(APPEND CXX_RELWITHDEBINFO_FLAGS " -ggdb") + endif() endif() endif() @@ -733,3 +798,40 @@ if(MSVC) set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${MSVC_LINKER_FLAGS}") endif() endif() + +if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # flags are: + # 1) We force *everything* to build as position independent + # 2) And with support for C++ exceptions + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -fexceptions") + # deprecated-literal-operator error is thrown in datetime (vendored lib in arrow) + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -fPIC -fexceptions -Wno-error=deprecated-literal-operator") + + # flags for creating shared libraries (only used in pyarrow, because + # Emscripten builds libarrow as static) + # flags are: + # 1) Tell it to use JavaScript / WebAssembly 64 bit number support. 
+ # 2) Tell it to build with support for C++ exceptions + # 3) Skip linker flags error which happens with -soname parameter + set(ARROW_EMSCRIPTEN_LINKER_FLAGS "-sWASM_BIGINT=1 -fexceptions -Wno-error=linkflags") + set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS + "-sSIDE_MODULE=1 ${ARROW_EMSCRIPTEN_LINKER_FLAGS}") + set(CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS + "-sSIDE_MODULE=1 ${ARROW_EMSCRIPTEN_LINKER_FLAGS}") + set(CMAKE_SHARED_LINKER_FLAGS "-sSIDE_MODULE=1 ${ARROW_EMSCRIPTEN_LINKER_FLAGS}") + if(ARROW_TESTING) + # flags for building test executables for use in node + if("${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") + set(CMAKE_EXE_LINKER_FLAGS + "${ARROW_EMSCRIPTEN_LINKER_FLAGS} -sALLOW_MEMORY_GROWTH -lnodefs.js -lnoderawfs.js --pre-js ${BUILD_SUPPORT_DIR}/emscripten-test-init.js" + ) + else() + set(CMAKE_EXE_LINKER_FLAGS + "${ARROW_EMSCRIPTEN_LINKER_FLAGS} -sERROR_ON_WASM_CHANGES_AFTER_LINK=1 -sALLOW_MEMORY_GROWTH -lnodefs.js -lnoderawfs.js --pre-js ${BUILD_SUPPORT_DIR}/emscripten-test-init.js" + ) + endif() + else() + set(CMAKE_EXE_LINKER_FLAGS "${ARROW_EMSCRIPTEN_LINKER_FLAGS} -sALLOW_MEMORY_GROWTH") + endif() +endif() diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index ad7344b09dd4e..fe859a0121ca6 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -634,8 +634,10 @@ endif() if(DEFINED ENV{ARROW_CARES_URL}) set(CARES_SOURCE_URL "$ENV{ARROW_CARES_URL}") else() + string(REPLACE "." 
"_" ARROW_CARES_BUILD_VERSION_UNDERSCORES + ${ARROW_CARES_BUILD_VERSION}) set_urls(CARES_SOURCE_URL - "https://c-ares.haxx.se/download/c-ares-${ARROW_CARES_BUILD_VERSION}.tar.gz" + "https://github.com/c-ares/c-ares/releases/download/cares-${ARROW_CARES_BUILD_VERSION_UNDERSCORES}/c-ares-${ARROW_CARES_BUILD_VERSION}.tar.gz" "${THIRDPARTY_MIRROR_URL}/cares-${ARROW_CARES_BUILD_VERSION}.tar.gz") endif() @@ -976,6 +978,23 @@ set(EP_COMMON_CMAKE_ARGS -DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT} -DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE}) +# if building with a toolchain file, pass that through +if(CMAKE_TOOLCHAIN_FILE) + list(APPEND EP_COMMON_CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}) +endif() + +# and crosscompiling emulator (for try_run() ) +if(CMAKE_CROSSCOMPILING_EMULATOR) + string(REPLACE ";" ${EP_LIST_SEPARATOR} EP_CMAKE_CROSSCOMPILING_EMULATOR + "${CMAKE_CROSSCOMPILING_EMULATOR}") + list(APPEND EP_COMMON_CMAKE_ARGS + -DCMAKE_CROSSCOMPILING_EMULATOR=${EP_CMAKE_CROSSCOMPILING_EMULATOR}) +endif() + +if(CMAKE_PROJECT_INCLUDE) + list(APPEND EP_COMMON_CMAKE_ARGS -DCMAKE_PROJECT_INCLUDE=${CMAKE_PROJECT_INCLUDE}) +endif() + # Enable s/ccache if set by parent. if(CMAKE_C_COMPILER_LAUNCHER AND CMAKE_CXX_COMPILER_LAUNCHER) list(APPEND EP_COMMON_CMAKE_ARGS @@ -1173,6 +1192,12 @@ if(MSVC AND ARROW_USE_STATIC_CRT) set(Boost_USE_STATIC_RUNTIME ON) endif() set(Boost_ADDITIONAL_VERSIONS + "1.84.0" + "1.84" + "1.83.0" + "1.83" + "1.82.0" + "1.82" "1.81.0" "1.81" "1.80.0" @@ -1240,7 +1265,7 @@ endif() # - S3FS and Flight benchmarks need Boost at runtime. 
if(ARROW_BUILD_INTEGRATION OR ARROW_BUILD_TESTS - OR (ARROW_FLIGHT AND ARROW_BUILD_BENCHMARKS) + OR (ARROW_FLIGHT AND (ARROW_TESTING OR ARROW_BUILD_BENCHMARKS)) OR (ARROW_S3 AND ARROW_BUILD_BENCHMARKS)) set(ARROW_USE_BOOST TRUE) set(ARROW_BOOST_REQUIRE_LIBRARY TRUE) @@ -1349,6 +1374,14 @@ macro(build_snappy) set(SNAPPY_PATCH_COMMAND) endif() + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # ignore linker flag errors, as Snappy sets + # -Werror -Wall, and Emscripten doesn't support -soname + list(APPEND SNAPPY_CMAKE_ARGS + "-DCMAKE_SHARED_LINKER_FLAGS=${CMAKE_SHARED_LINKER_FLAGS}" + "-Wno-error=linkflags") + endif() + externalproject_add(snappy_ep ${EP_COMMON_OPTIONS} BUILD_IN_SOURCE 1 @@ -1394,6 +1427,7 @@ macro(build_brotli) message(STATUS "Building brotli from source") set(BROTLI_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/brotli_ep/src/brotli_ep-install") set(BROTLI_INCLUDE_DIR "${BROTLI_PREFIX}/include") + set(BROTLI_LIB_DIR "${BROTLI_PREFIX}/lib") set(BROTLI_STATIC_LIBRARY_ENC "${BROTLI_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}brotlienc-static${CMAKE_STATIC_LIBRARY_SUFFIX}" ) @@ -1405,6 +1439,26 @@ macro(build_brotli) ) set(BROTLI_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${BROTLI_PREFIX}") + set(BROTLI_EP_OPTIONS) + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # "cmake install" is disabled for Brotli on Emscripten, so the + # default INSTALL_COMMAND fails. We need to disable the default + # INSTALL_COMMAND. 
+ list(APPEND + BROTLI_EP_OPTIONS + INSTALL_COMMAND + ${CMAKE_COMMAND} + -E + true) + + set(BROTLI_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/brotli_ep-prefix/src/brotli_ep-build) + set(BROTLI_BUILD_LIBS + "${BROTLI_BUILD_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlienc-static${CMAKE_STATIC_LIBRARY_SUFFIX}" + "${BROTLI_BUILD_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlidec-static${CMAKE_STATIC_LIBRARY_SUFFIX}" + "${BROTLI_BUILD_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlicommon-static${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + endif() + externalproject_add(brotli_ep ${EP_COMMON_OPTIONS} URL ${BROTLI_SOURCE_URL} @@ -1414,7 +1468,20 @@ macro(build_brotli) "${BROTLI_STATIC_LIBRARY_COMMON}" ${BROTLI_BUILD_BYPRODUCTS} CMAKE_ARGS ${BROTLI_CMAKE_ARGS} - STEP_TARGETS headers_copy) + STEP_TARGETS headers_copy ${BROTLI_EP_OPTIONS}) + + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # Copy the libraries to our install directory manually. + set(BROTLI_BUILD_INCLUDE_DIR + ${CMAKE_CURRENT_BINARY_DIR}/brotli_ep-prefix/src/brotli_ep/c/include/brotli) + add_custom_command(TARGET brotli_ep + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${BROTLI_BUILD_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}*${CMAKE_STATIC_LIBRARY_SUFFIX} + ${BROTLI_LIB_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory + ${BROTLI_BUILD_INCLUDE_DIR} ${BROTLI_INCLUDE_DIR}/brotli) + endif() file(MAKE_DIRECTORY "${BROTLI_INCLUDE_DIR}") @@ -1657,6 +1724,9 @@ macro(build_thrift) if(DEFINED BOOST_ROOT) list(APPEND THRIFT_CMAKE_ARGS "-DBOOST_ROOT=${BOOST_ROOT}") endif() + if(DEFINED Boost_INCLUDE_DIR) + list(APPEND THRIFT_CMAKE_ARGS "-DBoost_INCLUDE_DIR=${Boost_INCLUDE_DIR}") + endif() if(DEFINED Boost_NAMESPACE) list(APPEND THRIFT_CMAKE_ARGS "-DBoost_NAMESPACE=${Boost_NAMESPACE}") endif() @@ -1798,6 +1868,36 @@ macro(build_protobuf) add_dependencies(arrow::protobuf::protoc protobuf_ep) list(APPEND ARROW_BUNDLED_STATIC_LIBS arrow::protobuf::libprotobuf) + + if(CMAKE_CROSSCOMPILING) + # If we are cross compiling, we need to build protoc 
for the host + # system also, as it is used when building Arrow + # We do this by calling CMake as a child process + # with CXXFLAGS / CFLAGS and CMake flags cleared. + set(PROTOBUF_HOST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/protobuf_ep_host-install") + set(PROTOBUF_HOST_COMPILER "${PROTOBUF_HOST_PREFIX}/bin/protoc") + + set(PROTOBUF_HOST_CMAKE_ARGS + "-DCMAKE_CXX_FLAGS=" + "-DCMAKE_C_FLAGS=" + "-DCMAKE_INSTALL_PREFIX=${PROTOBUF_HOST_PREFIX}" + -Dprotobuf_BUILD_TESTS=OFF + -Dprotobuf_DEBUG_POSTFIX=) + + externalproject_add(protobuf_ep_host + ${EP_COMMON_OPTIONS} + CMAKE_ARGS ${PROTOBUF_HOST_CMAKE_ARGS} + BUILD_BYPRODUCTS "${PROTOBUF_HOST_COMPILER}" + BUILD_IN_SOURCE 1 + URL ${PROTOBUF_SOURCE_URL} + URL_HASH "SHA256=${ARROW_PROTOBUF_BUILD_SHA256_CHECKSUM}") + + add_executable(arrow::protobuf::host_protoc IMPORTED) + set_target_properties(arrow::protobuf::host_protoc + PROPERTIES IMPORTED_LOCATION "${PROTOBUF_HOST_COMPILER}") + + add_dependencies(arrow::protobuf::host_protoc protobuf_ep_host) + endif() endmacro() if(ARROW_WITH_PROTOBUF) @@ -1862,7 +1962,11 @@ if(ARROW_WITH_PROTOBUF) else() set(ARROW_PROTOBUF_LIBPROTOC protobuf::libprotoc) endif() - if(TARGET arrow::protobuf::protoc) + if(TARGET arrow::protobuf::host_protoc) + # make sure host protoc is used for compiling protobuf files + # during build of e.g. orc + set(ARROW_PROTOBUF_PROTOC arrow::protobuf::host_protoc) + elseif(TARGET arrow::protobuf::protoc) set(ARROW_PROTOBUF_PROTOC arrow::protobuf::protoc) else() if(NOT TARGET protobuf::protoc) @@ -2164,8 +2268,15 @@ function(build_gtest) if(APPLE) string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-value" " -Wno-ignored-attributes") endif() - set(BUILD_SHARED_LIBS ON) - set(BUILD_STATIC_LIBS OFF) + # If we're building static libs for Emscripten, we need to build *everything* as + # static libs. 
+ if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + set(BUILD_SHARED_LIBS OFF) + set(BUILD_STATIC_LIBS ON) + else() + set(BUILD_SHARED_LIBS ON) + set(BUILD_STATIC_LIBS OFF) + endif() # We need to use "cache" variable to override the default # INSTALL_GTEST option by this value. See also: # https://cmake.org/cmake/help/latest/policy/CMP0077.html @@ -2403,37 +2514,58 @@ endif() macro(build_zlib) message(STATUS "Building ZLIB from source") - set(ZLIB_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/zlib_ep/src/zlib_ep-install") - if(MSVC) - if(${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG") - set(ZLIB_STATIC_LIB_NAME zlibstaticd.lib) - else() - set(ZLIB_STATIC_LIB_NAME zlibstatic.lib) + + # ensure zlib is built with -fpic + # and make sure that the build finds the version in Emscripten ports + # - n.b. the actual linking happens because -sUSE_ZLIB=1 is + # set in the compiler variables, but cmake expects + # it to exist at configuration time if we aren't building it as + # bundled. We need to do this for all packages + # not just zlib as some depend on zlib, but we don't rebuild + # if it exists already + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # build zlib using Emscripten ports + if(NOT EXISTS ${EMSCRIPTEN_SYSROOT}/lib/wasm32-emscripten/pic/libz.a) + execute_process(COMMAND embuilder --pic --force build zlib) endif() + add_library(ZLIB::ZLIB STATIC IMPORTED) + set_property(TARGET ZLIB::ZLIB + PROPERTY IMPORTED_LOCATION + "${EMSCRIPTEN_SYSROOT}/lib/wasm32-emscripten/pic/libz.a") + list(APPEND ARROW_BUNDLED_STATIC_LIBS ZLIB::ZLIB) else() - set(ZLIB_STATIC_LIB_NAME libz.a) - endif() - set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${ZLIB_STATIC_LIB_NAME}") - set(ZLIB_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX}") + set(ZLIB_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/zlib_ep/src/zlib_ep-install") + if(MSVC) + if(${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG") + set(ZLIB_STATIC_LIB_NAME zlibstaticd.lib) + else() + set(ZLIB_STATIC_LIB_NAME zlibstatic.lib) + endif() + else() + 
set(ZLIB_STATIC_LIB_NAME libz.a) + endif() + set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${ZLIB_STATIC_LIB_NAME}") + set(ZLIB_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX}") - externalproject_add(zlib_ep - ${EP_COMMON_OPTIONS} - URL ${ZLIB_SOURCE_URL} - URL_HASH "SHA256=${ARROW_ZLIB_BUILD_SHA256_CHECKSUM}" - BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}" - CMAKE_ARGS ${ZLIB_CMAKE_ARGS}) + externalproject_add(zlib_ep + ${EP_COMMON_OPTIONS} + URL ${ZLIB_SOURCE_URL} + URL_HASH "SHA256=${ARROW_ZLIB_BUILD_SHA256_CHECKSUM}" + BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}" + CMAKE_ARGS ${ZLIB_CMAKE_ARGS}) - file(MAKE_DIRECTORY "${ZLIB_PREFIX}/include") + file(MAKE_DIRECTORY "${ZLIB_PREFIX}/include") - add_library(ZLIB::ZLIB STATIC IMPORTED) - set(ZLIB_LIBRARIES ${ZLIB_STATIC_LIB}) - set(ZLIB_INCLUDE_DIRS "${ZLIB_PREFIX}/include") - set_target_properties(ZLIB::ZLIB PROPERTIES IMPORTED_LOCATION ${ZLIB_LIBRARIES}) - target_include_directories(ZLIB::ZLIB BEFORE INTERFACE "${ZLIB_INCLUDE_DIRS}") + add_library(ZLIB::ZLIB STATIC IMPORTED) + set(ZLIB_LIBRARIES ${ZLIB_STATIC_LIB}) + set(ZLIB_INCLUDE_DIRS "${ZLIB_PREFIX}/include") + set_target_properties(ZLIB::ZLIB PROPERTIES IMPORTED_LOCATION ${ZLIB_LIBRARIES}) + target_include_directories(ZLIB::ZLIB BEFORE INTERFACE "${ZLIB_INCLUDE_DIRS}") - add_dependencies(ZLIB::ZLIB zlib_ep) + add_dependencies(ZLIB::ZLIB zlib_ep) + list(APPEND ARROW_BUNDLED_STATIC_LIBS ZLIB::ZLIB) + endif() - list(APPEND ARROW_BUNDLED_STATIC_LIBS ZLIB::ZLIB) set(ZLIB_VENDORED TRUE) endmacro() @@ -2689,11 +2821,13 @@ macro(build_utf8proc) endmacro() if(ARROW_WITH_UTF8PROC) - resolve_dependency(utf8proc - PC_PACKAGE_NAMES - libutf8proc - REQUIRED_VERSION - "2.2.0") + set(utf8proc_resolve_dependency_args utf8proc PC_PACKAGE_NAMES libutf8proc) + if(NOT VCPKG_TOOLCHAIN) + # utf8proc in vcpkg doesn't provide version information: + # https://github.com/microsoft/vcpkg/issues/39176 + list(APPEND utf8proc_resolve_dependency_args REQUIRED_VERSION "2.2.0") + 
endif() + resolve_dependency(${utf8proc_resolve_dependency_args}) endif() macro(build_cares) @@ -4390,6 +4524,10 @@ macro(build_orc) "-DPROTOBUF_LIBRARY=$" "-DPROTOC_LIBRARY=$" "-DSNAPPY_HOME=${ORC_SNAPPY_ROOT}" + "-DSNAPPY_LIBRARY=$" + "-DLZ4_LIBRARY=$" + "-DLZ4_STATIC_LIB=$" + "-DLZ4_INCLUDE_DIR=${ORC_LZ4_ROOT}/include" "-DSNAPPY_INCLUDE_DIR=${ORC_SNAPPY_INCLUDE_DIR}" "-DZSTD_HOME=${ORC_ZSTD_ROOT}" "-DZSTD_INCLUDE_DIR=$" @@ -4431,6 +4569,15 @@ macro(build_orc) endif() target_link_libraries(orc::orc INTERFACE ${CMAKE_DL_LIBS}) endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9") + target_link_libraries(orc::orc INTERFACE stdc++fs) + endif() + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "8") + target_link_libraries(orc::orc INTERFACE c++fs) + endif() + endif() add_dependencies(orc::orc orc_ep) @@ -4440,6 +4587,11 @@ endmacro() if(ARROW_ORC) resolve_dependency(orc HAVE_ALT TRUE) target_link_libraries(orc::orc INTERFACE ${ARROW_PROTOBUF_LIBPROTOBUF}) + if(ORC_VENDORED) + set(ARROW_ORC_VERSION ${ARROW_ORC_BUILD_VERSION}) + else() + set(ARROW_ORC_VERSION ${orcAlt_VERSION}) + endif() message(STATUS "Found ORC static library: ${ORC_STATIC_LIB}") message(STATUS "Found ORC headers: ${ORC_INCLUDE_DIR}") endif() @@ -4463,8 +4615,11 @@ macro(build_opentelemetry) set(_OPENTELEMETRY_LIBS common http_client_curl + logs + ostream_log_record_exporter ostream_span_exporter otlp_http_client + otlp_http_log_record_exporter otlp_http_exporter otlp_recordable proto @@ -4497,6 +4652,14 @@ macro(build_opentelemetry) set(_OPENTELEMETRY_STATIC_LIBRARY "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_otlp_http${CMAKE_STATIC_LIBRARY_SUFFIX}" ) + elseif(_OPENTELEMETRY_LIB STREQUAL "otlp_http_log_record_exporter") + set(_OPENTELEMETRY_STATIC_LIBRARY + 
"${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_otlp_http_log${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + elseif(_OPENTELEMETRY_LIB STREQUAL "ostream_log_record_exporter") + set(_OPENTELEMETRY_STATIC_LIBRARY + "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_ostream_logs${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) else() set(_OPENTELEMETRY_STATIC_LIBRARY "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_${_OPENTELEMETRY_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" @@ -4531,9 +4694,16 @@ macro(build_opentelemetry) IMPORTED_LOCATION) list(APPEND OPENTELEMETRY_CMAKE_ARGS - -DWITH_OTLP=ON -DWITH_OTLP_HTTP=ON -DWITH_OTLP_GRPC=OFF + # Disabled because it seemed to cause linking errors. May be worth a closer look. + -DWITH_FUNC_TESTS=OFF + # These options are slated for removal in v1.14 and their features are deemed stable + # as of v1.13. However, setting their corresponding ENABLE_* macros in headers seems + # finicky - resulting in build failures or ABI-related runtime errors during HTTP + # client initialization. There may still be a solution, but we disable them for now. 
+ -DWITH_OTLP_HTTP_SSL_PREVIEW=OFF + -DWITH_OTLP_HTTP_SSL_TLS_PREVIEW=OFF "-DProtobuf_INCLUDE_DIR=${OPENTELEMETRY_PROTOBUF_INCLUDE_DIR}" "-DProtobuf_LIBRARY=${OPENTELEMETRY_PROTOBUF_INCLUDE_DIR}" "-DProtobuf_PROTOC_EXECUTABLE=${OPENTELEMETRY_PROTOC_EXECUTABLE}") @@ -4607,19 +4777,25 @@ macro(build_opentelemetry) target_link_libraries(opentelemetry-cpp::resources INTERFACE opentelemetry-cpp::common) target_link_libraries(opentelemetry-cpp::trace INTERFACE opentelemetry-cpp::common opentelemetry-cpp::resources) + target_link_libraries(opentelemetry-cpp::logs INTERFACE opentelemetry-cpp::common + opentelemetry-cpp::resources) target_link_libraries(opentelemetry-cpp::http_client_curl - INTERFACE opentelemetry-cpp::ext CURL::libcurl) + INTERFACE opentelemetry-cpp::common opentelemetry-cpp::ext + CURL::libcurl) target_link_libraries(opentelemetry-cpp::proto INTERFACE ${ARROW_PROTOBUF_LIBPROTOBUF}) target_link_libraries(opentelemetry-cpp::otlp_recordable - INTERFACE opentelemetry-cpp::trace opentelemetry-cpp::resources - opentelemetry-cpp::proto) + INTERFACE opentelemetry-cpp::logs opentelemetry-cpp::trace + opentelemetry-cpp::resources opentelemetry-cpp::proto) target_link_libraries(opentelemetry-cpp::otlp_http_client - INTERFACE opentelemetry-cpp::sdk opentelemetry-cpp::proto + INTERFACE opentelemetry-cpp::common opentelemetry-cpp::proto opentelemetry-cpp::http_client_curl nlohmann_json::nlohmann_json) target_link_libraries(opentelemetry-cpp::otlp_http_exporter INTERFACE opentelemetry-cpp::otlp_recordable opentelemetry-cpp::otlp_http_client) + target_link_libraries(opentelemetry-cpp::otlp_http_log_record_exporter + INTERFACE opentelemetry-cpp::otlp_recordable + opentelemetry-cpp::otlp_http_client) foreach(_OPENTELEMETRY_LIB ${_OPENTELEMETRY_LIBS}) add_dependencies(opentelemetry-cpp::${_OPENTELEMETRY_LIB} opentelemetry_ep) @@ -4641,7 +4817,11 @@ if(ARROW_WITH_OPENTELEMETRY) set(opentelemetry-cpp_SOURCE "AUTO") resolve_dependency(opentelemetry-cpp) 
set(ARROW_OPENTELEMETRY_LIBS - opentelemetry-cpp::trace opentelemetry-cpp::ostream_span_exporter + opentelemetry-cpp::trace + opentelemetry-cpp::logs + opentelemetry-cpp::otlp_http_log_record_exporter + opentelemetry-cpp::ostream_log_record_exporter + opentelemetry-cpp::ostream_span_exporter opentelemetry-cpp::otlp_http_exporter) get_target_property(OPENTELEMETRY_INCLUDE_DIR opentelemetry-cpp::api INTERFACE_INCLUDE_DIRECTORIES) @@ -5200,9 +5380,3 @@ if(ARROW_WITH_UCX) endif() message(STATUS "All bundled static libraries: ${ARROW_BUNDLED_STATIC_LIBS}") - -# Write out the package configurations. - -configure_file("src/arrow/util/config.h.cmake" "src/arrow/util/config.h" ESCAPE_QUOTES) -install(FILES "${ARROW_BINARY_DIR}/src/arrow/util/config.h" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/util") diff --git a/cpp/examples/arrow/filesystem_definition_example.cc b/cpp/examples/arrow/filesystem_definition_example.cc index efe1bd10470c0..65301bb843ba1 100644 --- a/cpp/examples/arrow/filesystem_definition_example.cc +++ b/cpp/examples/arrow/filesystem_definition_example.cc @@ -138,7 +138,7 @@ class ExampleFileSystem : public fs::FileSystem { } }; -fs::FileSystemRegistrar kExampleFileSystemModule{ +auto kExampleFileSystemModule = ARROW_REGISTER_FILESYSTEM( "example", [](const arrow::util::Uri& uri, const io::IOContext& io_context, std::string* out_path) -> Result> { @@ -148,4 +148,4 @@ fs::FileSystemRegistrar kExampleFileSystemModule{ } return fs; }, -}; + {}); diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 4bf1008af4cd0..6dc8358f502f5 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -163,7 +163,7 @@ if(CMAKE_THREAD_LIBS_INIT) endif() if(WIN32) - list(APPEND ARROW_SYSTEM_LINK_LIBS "ws2_32.dll") + list(APPEND ARROW_SYSTEM_LINK_LIBS "ws2_32") endif() if(NOT WIN32 AND NOT APPLE) @@ -200,22 +200,29 @@ function(arrow_add_object_library PREFIX) set(SOURCES ${ARGN}) string(TOLOWER "${PREFIX}" prefix) if(WIN32) - 
add_library(${prefix}_shared OBJECT ${SOURCES}) - add_library(${prefix}_static OBJECT ${SOURCES}) - set_target_properties(${prefix}_shared PROPERTIES POSITION_INDEPENDENT_CODE ON) - set_target_properties(${prefix}_static PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_compile_definitions(${prefix}_shared PRIVATE ARROW_EXPORTING) - target_compile_definitions(${prefix}_static PRIVATE ARROW_STATIC) - target_compile_features(${prefix}_shared PRIVATE cxx_std_17) - target_compile_features(${prefix}_static PRIVATE cxx_std_17) - set(${PREFIX}_TARGET_SHARED - ${prefix}_shared - PARENT_SCOPE) - set(${PREFIX}_TARGET_STATIC - ${prefix}_static - PARENT_SCOPE) + set(targets) + if(ARROW_BUILD_SHARED) + add_library(${prefix}_shared OBJECT ${SOURCES}) + set_target_properties(${prefix}_shared PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(${prefix}_shared PRIVATE ARROW_EXPORTING) + target_compile_features(${prefix}_shared PRIVATE cxx_std_17) + set(${PREFIX}_TARGET_SHARED + ${prefix}_shared + PARENT_SCOPE) + list(APPEND targets ${prefix}_shared) + endif() + if(ARROW_BUILD_STATIC) + add_library(${prefix}_static OBJECT ${SOURCES}) + set_target_properties(${prefix}_static PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(${prefix}_static PRIVATE ARROW_STATIC) + target_compile_features(${prefix}_static PRIVATE cxx_std_17) + set(${PREFIX}_TARGET_STATIC + ${prefix}_static + PARENT_SCOPE) + list(APPEND targets ${prefix}_static) + endif() set(${PREFIX}_TARGETS - ${prefix}_shared ${prefix}_static + ${targets} PARENT_SCOPE) else() add_library(${prefix} OBJECT ${SOURCES}) @@ -351,6 +358,12 @@ macro(append_runtime_avx512_src SRCS SRC) endif() endmacro() +# Write out compile-time configuration constants +configure_file("util/config.h.cmake" "util/config.h" ESCAPE_QUOTES) +configure_file("util/config_internal.h.cmake" "util/config_internal.h" ESCAPE_QUOTES) +install(FILES "${CMAKE_CURRENT_BINARY_DIR}/util/config.h" + DESTINATION 
"${CMAKE_INSTALL_INCLUDEDIR}/arrow/util") + set(ARROW_SRCS builder.cc buffer.cc @@ -501,6 +514,7 @@ set(ARROW_UTIL_SRCS util/decimal.cc util/delimiting.cc util/dict_util.cc + util/fixed_width_internal.cc util/float16.cc util/formatting.cc util/future.cc @@ -508,6 +522,7 @@ set(ARROW_UTIL_SRCS util/int_util.cc util/io_util.cc util/list_util.cc + util/logger.cc util/logging.cc util/key_value_metadata.cc util/memory.cc @@ -613,6 +628,17 @@ if(ARROW_WITH_ZSTD) endforeach() endif() +if(ARROW_WITH_OPENTELEMETRY) + arrow_add_object_library(ARROW_TELEMETRY telemetry/logging.cc) + + foreach(ARROW_TELEMETRY_TARGET ${ARROW_TELEMETRY_TARGETS}) + target_link_libraries(${ARROW_TELEMETRY_TARGET} PRIVATE ${ARROW_OPENTELEMETRY_LIBS}) + endforeach() +else() + set(ARROW_TELEMETRY_TARGET_SHARED) + set(ARROW_TELEMETRY_TARGET_STATIC) +endif() + set(ARROW_TESTING_SHARED_LINK_LIBS arrow_shared ${ARROW_GTEST_GTEST}) set(ARROW_TESTING_SHARED_PRIVATE_LINK_LIBS arrow::flatbuffers RapidJSON) set(ARROW_TESTING_STATIC_LINK_LIBS arrow::flatbuffers RapidJSON arrow_static @@ -628,14 +654,15 @@ else() list(APPEND ARROW_TESTING_STATIC_INSTALL_INTERFACE_LIBS ArrowTesting::gtest) endif() if(WIN32) - list(APPEND ARROW_TESTING_SHARED_LINK_LIBS "ws2_32.dll") - list(APPEND ARROW_TESTING_STATIC_LINK_LIBS "ws2_32.dll") - list(APPEND ARROW_TESTING_STATIC_INSTALL_INTERFACE_LIBS "ws2_32.dll") + list(APPEND ARROW_TESTING_SHARED_LINK_LIBS "ws2_32") + list(APPEND ARROW_TESTING_STATIC_LINK_LIBS "ws2_32") + list(APPEND ARROW_TESTING_STATIC_INSTALL_INTERFACE_LIBS "ws2_32") endif() set(ARROW_TESTING_SRCS io/test_common.cc ipc/test_common.cc + testing/fixed_width_test_util.cc testing/gtest_util.cc testing/random.cc testing/generator.cc @@ -689,9 +716,9 @@ set(ARROW_COMPUTE_SRCS compute/function.cc compute/function_internal.cc compute/kernel.cc - compute/key_hash.cc - compute/key_map.cc - compute/light_array.cc + compute/key_hash_internal.cc + compute/key_map_internal.cc + compute/light_array_internal.cc 
compute/ordering.cc compute/registry.cc compute/kernels/codegen_internal.cc @@ -715,10 +742,11 @@ set(ARROW_COMPUTE_SRCS compute/row/compare_internal.cc compute/row/grouper.cc compute/row/row_internal.cc - compute/util.cc) + compute/util.cc + compute/util_internal.cc) -append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/key_hash_avx2.cc) -append_runtime_avx2_bmi2_src(ARROW_COMPUTE_SRCS compute/key_map_avx2.cc) +append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/key_hash_internal_avx2.cc) +append_runtime_avx2_bmi2_src(ARROW_COMPUTE_SRCS compute/key_map_internal_avx2.cc) append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/row/compare_internal_avx2.cc) append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/row/encode_internal_avx2.cc) append_runtime_avx2_bmi2_src(ARROW_COMPUTE_SRCS compute/util_avx2.cc) @@ -899,6 +927,10 @@ if(ARROW_ORC) adapters/orc/util.cc) foreach(ARROW_ORC_TARGET ${ARROW_ORC_TARGETS}) target_link_libraries(${ARROW_ORC_TARGET} PRIVATE orc::orc) + if(ARROW_ORC_VERSION VERSION_LESS "2.0.0") + target_compile_definitions(${ARROW_ORC_TARGET} + PRIVATE ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK) + endif() endforeach() else() set(ARROW_ORC_TARGET_SHARED) @@ -996,6 +1028,7 @@ add_arrow_lib(arrow ${ARROW_JSON_TARGET_SHARED} ${ARROW_MEMORY_POOL_TARGET_SHARED} ${ARROW_ORC_TARGET_SHARED} + ${ARROW_TELEMETRY_TARGET_SHARED} ${ARROW_UTIL_TARGET_SHARED} ${ARROW_VENDORED_TARGET_SHARED} ${ARROW_SHARED_PRIVATE_LINK_LIBS} @@ -1011,6 +1044,7 @@ add_arrow_lib(arrow ${ARROW_JSON_TARGET_STATIC} ${ARROW_MEMORY_POOL_TARGET_STATIC} ${ARROW_ORC_TARGET_STATIC} + ${ARROW_TELEMETRY_TARGET_STATIC} ${ARROW_UTIL_TARGET_STATIC} ${ARROW_VENDORED_TARGET_STATIC} ${ARROW_SYSTEM_LINK_LIBS} @@ -1240,6 +1274,10 @@ if(ARROW_SUBSTRAIT) add_subdirectory(engine) endif() +if(ARROW_WITH_OPENTELEMETRY) + add_subdirectory(telemetry) +endif() + if(ARROW_TENSORFLOW) add_subdirectory(adapters/tensorflow) endif() diff --git a/cpp/src/arrow/acero/CMakeLists.txt b/cpp/src/arrow/acero/CMakeLists.txt index 
31ed4a6a69b6a..73079059f1dfd 100644 --- a/cpp/src/arrow/acero/CMakeLists.txt +++ b/cpp/src/arrow/acero/CMakeLists.txt @@ -173,13 +173,8 @@ add_arrow_acero_test(hash_join_node_test SOURCES hash_join_node_test.cc bloom_filter_test.cc) add_arrow_acero_test(pivot_longer_node_test SOURCES pivot_longer_node_test.cc) -# asof_join_node and sorted_merge_node use std::thread internally -# and doesn't use ThreadPool so it will -# be broken if threading is turned off -if(ARROW_ENABLE_THREADING) - add_arrow_acero_test(asof_join_node_test SOURCES asof_join_node_test.cc) - add_arrow_acero_test(sorted_merge_node_test SOURCES sorted_merge_node_test.cc) -endif() +add_arrow_acero_test(asof_join_node_test SOURCES asof_join_node_test.cc) +add_arrow_acero_test(sorted_merge_node_test SOURCES sorted_merge_node_test.cc) add_arrow_acero_test(tpch_node_test SOURCES tpch_node_test.cc) add_arrow_acero_test(union_node_test SOURCES union_node_test.cc) @@ -228,9 +223,7 @@ if(ARROW_BUILD_BENCHMARKS) add_arrow_acero_benchmark(project_benchmark SOURCES benchmark_util.cc project_benchmark.cc) - if(ARROW_ENABLE_THREADING) - add_arrow_acero_benchmark(asof_join_benchmark SOURCES asof_join_benchmark.cc) - endif() + add_arrow_acero_benchmark(asof_join_benchmark SOURCES asof_join_benchmark.cc) add_arrow_acero_benchmark(tpch_benchmark SOURCES tpch_benchmark.cc) @@ -253,9 +246,7 @@ if(ARROW_BUILD_BENCHMARKS) target_link_libraries(arrow-acero-expression-benchmark PUBLIC arrow_acero_static) target_link_libraries(arrow-acero-filter-benchmark PUBLIC arrow_acero_static) target_link_libraries(arrow-acero-project-benchmark PUBLIC arrow_acero_static) - if(ARROW_ENABLE_THREADING) - target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_static) - endif() + target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_static) target_link_libraries(arrow-acero-tpch-benchmark PUBLIC arrow_acero_static) if(ARROW_BUILD_OPENMP_BENCHMARKS) target_link_libraries(arrow-acero-hash-join-benchmark 
PUBLIC arrow_acero_static) @@ -264,9 +255,7 @@ if(ARROW_BUILD_BENCHMARKS) target_link_libraries(arrow-acero-expression-benchmark PUBLIC arrow_acero_shared) target_link_libraries(arrow-acero-filter-benchmark PUBLIC arrow_acero_shared) target_link_libraries(arrow-acero-project-benchmark PUBLIC arrow_acero_shared) - if(ARROW_ENABLE_THREADING) - target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_shared) - endif() + target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_shared) target_link_libraries(arrow-acero-tpch-benchmark PUBLIC arrow_acero_shared) if(ARROW_BUILD_OPENMP_BENCHMARKS) target_link_libraries(arrow-acero-hash-join-benchmark PUBLIC arrow_acero_shared) diff --git a/cpp/src/arrow/acero/aggregate_benchmark.cc b/cpp/src/arrow/acero/aggregate_benchmark.cc index 4db7e443227d9..854862e3e48ca 100644 --- a/cpp/src/arrow/acero/aggregate_benchmark.cc +++ b/cpp/src/arrow/acero/aggregate_benchmark.cc @@ -29,6 +29,7 @@ #include "arrow/util/benchmark_util.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_reader.h" +#include "arrow/util/byte_size.h" #include "arrow/util/string.h" namespace arrow { @@ -50,6 +51,7 @@ namespace acero { #include using arrow::internal::ToChars; +using arrow::util::TotalBufferSize; #ifdef ARROW_WITH_BENCHMARKS_REFERENCE @@ -371,9 +373,11 @@ static void BenchmarkGroupBy(benchmark::State& state, std::vector agg for (std::size_t arg_idx = 0; arg_idx < arguments.size(); arg_idx++) { aggregates[arg_idx].target = {FieldRef(static_cast(arg_idx))}; } + int64_t total_bytes = TotalBufferSize(*batch); for (auto _ : state) { ABORT_NOT_OK(BatchGroupBy(batch, aggregates, key_refs)); } + state.SetBytesProcessed(total_bytes * state.iterations()); } #define GROUP_BY_BENCHMARK(Name, Impl) \ @@ -578,6 +582,8 @@ static void SumKernel(benchmark::State& state) { for (auto _ : state) { ABORT_NOT_OK(Sum(array).status()); } + + state.SetItemsProcessed(state.iterations() * array_size); } static void 
SumKernelArgs(benchmark::internal::Benchmark* bench) { @@ -611,6 +617,8 @@ void ModeKernel(benchmark::State& state, int min, int max) { for (auto _ : state) { ABORT_NOT_OK(Mode(array).status()); } + + state.SetItemsProcessed(state.iterations() * array_size); } template @@ -625,13 +633,18 @@ void ModeKernelNarrow(benchmark::State& state) { template <> void ModeKernelNarrow(benchmark::State& state) { + using CType = typename TypeTraits::CType; + RegressionArgs args(state); + const int64_t array_size = args.size / sizeof(CType); auto rand = random::RandomArrayGenerator(1924); auto array = rand.Boolean(args.size * 8, 0.5, args.null_proportion); for (auto _ : state) { ABORT_NOT_OK(Mode(array).status()); } + + state.SetItemsProcessed(state.iterations() * array_size); } template @@ -668,6 +681,8 @@ static void MinMaxKernelBench(benchmark::State& state) { for (auto _ : state) { ABORT_NOT_OK(MinMax(array).status()); } + + state.SetItemsProcessed(state.iterations() * array_size); } static void MinMaxKernelBenchArgs(benchmark::internal::Benchmark* bench) { @@ -698,6 +713,8 @@ static void CountKernelBenchInt64(benchmark::State& state) { for (auto _ : state) { ABORT_NOT_OK(Count(array->Slice(1, array_size)).status()); } + + state.SetItemsProcessed(state.iterations() * array_size); } BENCHMARK(CountKernelBenchInt64)->Args({1 * 1024 * 1024, 2}); // 1M with 50% null. 
@@ -718,6 +735,8 @@ void VarianceKernelBench(benchmark::State& state) { for (auto _ : state) { ABORT_NOT_OK(Variance(array, options).status()); } + + state.SetItemsProcessed(state.iterations() * array_size); } static void VarianceKernelBenchArgs(benchmark::internal::Benchmark* bench) { diff --git a/cpp/src/arrow/acero/aggregate_internal.cc b/cpp/src/arrow/acero/aggregate_internal.cc index 9c4b7fe5ae98c..0c1bc3db365a6 100644 --- a/cpp/src/arrow/acero/aggregate_internal.cc +++ b/cpp/src/arrow/acero/aggregate_internal.cc @@ -102,7 +102,7 @@ Result> InitKernel(const HashAggregateKernel* kerne ARROW_ASSIGN_OR_RAISE( auto state, kernel->init(&kernel_ctx, KernelInitArgs{kernel, aggr_in_types, options})); - return std::move(state); + return state; } Result> GetKernels( @@ -129,7 +129,7 @@ Result>> InitKernels( ARROW_ASSIGN_OR_RAISE(states[i], InitKernel(kernels[i], ctx, aggregates[i], in_types[i])); } - return std::move(states); + return states; } Result ResolveKernels( @@ -242,7 +242,7 @@ Result> ExtractValues(const ExecBatch& input_batch, DCHECK(false); } } - return std::move(values); + return values; } } // namespace aggregate diff --git a/cpp/src/arrow/acero/asof_join_benchmark.cc b/cpp/src/arrow/acero/asof_join_benchmark.cc index 02116b09fc1fd..ed2ac2258eb6c 100644 --- a/cpp/src/arrow/acero/asof_join_benchmark.cc +++ b/cpp/src/arrow/acero/asof_join_benchmark.cc @@ -91,7 +91,7 @@ static void TableJoinOverhead(benchmark::State& state, ASSERT_OK(DeclarationToStatus(std::move(join_node), /*use_threads=*/false)); } - state.counters["input_rows_per_second"] = benchmark::Counter( + state.counters["rows_per_second"] = benchmark::Counter( static_cast(state.iterations() * (left_table_stats.rows + right_hand_rows)), benchmark::Counter::kIsRate); diff --git a/cpp/src/arrow/acero/asof_join_node.cc b/cpp/src/arrow/acero/asof_join_node.cc index cf0d475c1d770..848cbdf7506ad 100644 --- a/cpp/src/arrow/acero/asof_join_node.cc +++ b/cpp/src/arrow/acero/asof_join_node.cc @@ -45,8 +45,8 
@@ #include "arrow/compute/function_internal.h" #endif #include "arrow/acero/time_series_util.h" -#include "arrow/compute/key_hash.h" -#include "arrow/compute/light_array.h" +#include "arrow/compute/key_hash_internal.h" +#include "arrow/compute/light_array_internal.h" #include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/status.h" @@ -548,8 +548,10 @@ class InputState { // true when the queue is empty and, when memo may have future entries (the case of a // positive tolerance), when the memo is empty. // used when checking whether RHS is up to date with LHS. - bool CurrentEmpty() const { - return memo_.no_future_ ? Empty() : memo_.times_.empty() && Empty(); + // NOTE: The emptiness must be decided by a single call to Empty() in caller, due to the + // potential race with Push(), see GH-41614. + bool CurrentEmpty(bool empty) const { + return memo_.no_future_ ? empty : (memo_.times_.empty() && empty); } // in case memo may not have future entries (the case of a non-positive tolerance), @@ -650,13 +652,15 @@ class InputState { // timestamp, update latest_time and latest_ref_row to the value that immediately pass // the horizon. Update the memo-store with any entries or future entries so observed. // Returns true if updates were made, false if not. - Result AdvanceAndMemoize(OnType ts) { + // NOTE: The emptiness must be decided by a single call to Empty() in caller, due to the + // potential race with Push(), see GH-41614. + Result AdvanceAndMemoize(OnType ts, bool empty) { // Advance the right side row index until we reach the latest right row (for each key) // for the given left timestamp. DEBUG_SYNC(node_, "Advancing input ", index_, DEBUG_MANIP(std::endl)); // Check if already updated for TS (or if there is no latest) - if (Empty()) { // can't advance if empty and no future entries + if (empty) { // can't advance if empty and no future entries return memo_.no_future_ ? 
false : memo_.RemoveEntriesWithLesserTime(ts); } @@ -918,34 +922,46 @@ class CompositeTableBuilder { // guaranteeing this probability is below 1 in a billion. The fix is 128-bit hashing. // See ARROW-17653 class AsofJoinNode : public ExecNode { - // Advances the RHS as far as possible to be up to date for the current LHS timestamp - Result UpdateRhs() { + // A simple wrapper for the result of a single call to UpdateRhs(), identifying: + // 1) If any RHS has advanced. + // 2) If all RHS are up to date with LHS. + struct RhsUpdateState { + bool any_advanced; + bool all_up_to_date_with_lhs; + }; + // Advances the RHS as far as possible to be up to date for the current LHS timestamp, + // and checks if all RHS are up to date with LHS. The reason they have to be performed + // together is that they both depend on the emptiness of the RHS, which can be changed + // by Push() executing in another thread. + Result UpdateRhs() { auto& lhs = *state_.at(0); auto lhs_latest_time = lhs.GetLatestTime(); - bool any_updated = false; - for (size_t i = 1; i < state_.size(); ++i) { - ARROW_ASSIGN_OR_RAISE(bool advanced, state_[i]->AdvanceAndMemoize(lhs_latest_time)); - any_updated |= advanced; - } - return any_updated; - } - - // Returns false if RHS not up to date for LHS - bool IsUpToDateWithLhsRow() const { - auto& lhs = *state_[0]; - if (lhs.Empty()) return false; // can't proceed if nothing on the LHS - OnType lhs_ts = lhs.GetLatestTime(); + RhsUpdateState update_state{/*any_advanced=*/false, /*all_up_to_date_with_lhs=*/true}; for (size_t i = 1; i < state_.size(); ++i) { auto& rhs = *state_[i]; - if (!rhs.Finished()) { + + // Obtain RHS emptiness once for subsequent AdvanceAndMemoize() and CurrentEmpty(). + bool rhs_empty = rhs.Empty(); + // Obtain RHS current time here because AdvanceAndMemoize() can change the + // emptiness. + OnType rhs_current_time = rhs_empty ? 
OnType{} : rhs.GetLatestTime(); + + ARROW_ASSIGN_OR_RAISE(bool advanced, + rhs.AdvanceAndMemoize(lhs_latest_time, rhs_empty)); + update_state.any_advanced |= advanced; + + if (update_state.all_up_to_date_with_lhs && !rhs.Finished()) { // If RHS is finished, then we know it's up to date - if (rhs.CurrentEmpty()) - return false; // RHS isn't finished, but is empty --> not up to date - if (lhs_ts > rhs.GetCurrentTime()) - return false; // RHS isn't up to date (and not finished) + if (rhs.CurrentEmpty(rhs_empty)) { + // RHS isn't finished, but is empty --> not up to date + update_state.all_up_to_date_with_lhs = false; + } else if (lhs_latest_time > rhs_current_time) { + // RHS isn't up to date (and not finished) + update_state.all_up_to_date_with_lhs = false; + } } } - return true; + return update_state; } Result> ProcessInner() { @@ -963,20 +979,19 @@ class AsofJoinNode : public ExecNode { // If LHS is finished or empty then there's nothing we can do here if (lhs.Finished() || lhs.Empty()) break; - // Advance each of the RHS as far as possible to be up to date for the LHS timestamp - ARROW_ASSIGN_OR_RAISE(bool any_rhs_advanced, UpdateRhs()); + ARROW_ASSIGN_OR_RAISE(auto rhs_update_state, UpdateRhs()); // If we have received enough inputs to produce the next output batch // (decided by IsUpToDateWithLhsRow), we will perform the join and // materialize the output batch. The join is done by advancing through // the LHS and adding joined row to rows_ (done by Emplace). Finally, // input batches that are no longer needed are removed to free up memory. 
- if (IsUpToDateWithLhsRow()) { + if (rhs_update_state.all_up_to_date_with_lhs) { dst.Emplace(state_, tolerance_); ARROW_ASSIGN_OR_RAISE(bool advanced, lhs.Advance()); if (!advanced) break; // if we can't advance LHS, we're done for this batch } else { - if (!any_rhs_advanced) break; // need to wait for new data + if (!rhs_update_state.any_advanced) break; // need to wait for new data } } @@ -999,6 +1014,8 @@ class AsofJoinNode : public ExecNode { } } +#ifdef ARROW_ENABLE_THREADING + template struct Defer { Callable callable; @@ -1085,6 +1102,7 @@ class AsofJoinNode : public ExecNode { } static void ProcessThreadWrapper(AsofJoinNode* node) { node->ProcessThread(); } +#endif public: AsofJoinNode(ExecPlan* plan, NodeVector inputs, std::vector input_labels, @@ -1116,8 +1134,10 @@ class AsofJoinNode : public ExecNode { } virtual ~AsofJoinNode() { - process_.Push(false); // poison pill +#ifdef ARROW_ENABLE_THREADING + PushProcess(false); process_thread_.join(); +#endif } const std::vector& indices_of_on_key() { return indices_of_on_key_; } @@ -1395,7 +1415,8 @@ class AsofJoinNode : public ExecNode { rb->ToString(), DEBUG_MANIP(std::endl)); ARROW_RETURN_NOT_OK(state_.at(k)->Push(rb)); - process_.Push(true); + PushProcess(true); + return Status::OK(); } @@ -1410,22 +1431,77 @@ class AsofJoinNode : public ExecNode { // The reason for this is that there are cases at the end of a table where we don't // know whether the RHS of the join is up-to-date until we know that the table is // finished. 
- process_.Push(true); + PushProcess(true); + return Status::OK(); } + void PushProcess(bool value) { +#ifdef ARROW_ENABLE_THREADING + process_.Push(value); +#else + if (value) { + ProcessNonThreaded(); + } else if (!process_task_.is_finished()) { + EndFromSingleThread(); + } +#endif + } - Status StartProducing() override { #ifndef ARROW_ENABLE_THREADING - return Status::NotImplemented("ASOF join requires threading enabled"); + bool ProcessNonThreaded() { + while (!process_task_.is_finished()) { + Result> result = ProcessInner(); + + if (result.ok()) { + auto out_rb = *result; + if (!out_rb) break; + ExecBatch out_b(*out_rb); + out_b.index = batches_produced_++; + DEBUG_SYNC(this, "produce batch ", out_b.index, ":", DEBUG_MANIP(std::endl), + out_rb->ToString(), DEBUG_MANIP(std::endl)); + Status st = output_->InputReceived(this, std::move(out_b)); + if (!st.ok()) { + // this isn't really from a thread, + // but we call through to this for consistency + EndFromSingleThread(std::move(st)); + return false; + } + } else { + // this isn't really from a thread, + // but we call through to this for consistency + EndFromSingleThread(result.status()); + return false; + } + } + auto& lhs = *state_.at(0); + if (lhs.Finished() && !process_task_.is_finished()) { + EndFromSingleThread(Status::OK()); + } + return true; + } + + void EndFromSingleThread(Status st = Status::OK()) { + process_task_.MarkFinished(st); + if (st.ok()) { + st = output_->InputFinished(this, batches_produced_); + } + for (const auto& s : state_) { + st &= s->ForceShutdown(); + } + } + #endif + Status StartProducing() override { ARROW_ASSIGN_OR_RAISE(process_task_, plan_->query_context()->BeginExternalTask( "AsofJoinNode::ProcessThread")); if (!process_task_.is_valid()) { // Plan has already aborted. 
Do not start process thread return Status::OK(); } +#ifdef ARROW_ENABLE_THREADING process_thread_ = std::thread(&AsofJoinNode::ProcessThreadWrapper, this); +#endif return Status::OK(); } @@ -1433,8 +1509,10 @@ class AsofJoinNode : public ExecNode { void ResumeProducing(ExecNode* output, int32_t counter) override {} Status StopProducingImpl() override { +#ifdef ARROW_ENABLE_THREADING process_.Clear(); - process_.Push(false); +#endif + PushProcess(false); return Status::OK(); } @@ -1464,11 +1542,13 @@ class AsofJoinNode : public ExecNode { // Backpressure counter common to all inputs std::atomic backpressure_counter_; +#ifdef ARROW_ENABLE_THREADING // Queue for triggering processing of a given input // (a false value is a poison pill) ConcurrentQueue process_; // Worker thread std::thread process_thread_; +#endif Future<> process_task_; // In-progress batches produced @@ -1496,9 +1576,13 @@ AsofJoinNode::AsofJoinNode(ExecPlan* plan, NodeVector inputs, debug_os_(join_options.debug_opts ? join_options.debug_opts->os : nullptr), debug_mutex_(join_options.debug_opts ? join_options.debug_opts->mutex : nullptr), #endif - backpressure_counter_(1), + backpressure_counter_(1) +#ifdef ARROW_ENABLE_THREADING + , process_(), - process_thread_() { + process_thread_() +#endif +{ for (auto& key_hasher : key_hashers_) { key_hasher->node_ = this; } diff --git a/cpp/src/arrow/acero/asof_join_node_test.cc b/cpp/src/arrow/acero/asof_join_node_test.cc index d95d2aaad3643..051e280a4c53c 100644 --- a/cpp/src/arrow/acero/asof_join_node_test.cc +++ b/cpp/src/arrow/acero/asof_join_node_test.cc @@ -1678,5 +1678,59 @@ TEST(AsofJoinTest, BackpressureWithBatchesGen) { /*slow_r0=*/false); } +// Reproduction of GH-40675: A logical race between Process() and Push() that can be more +// easily observed with single small batch. 
+TEST(AsofJoinTest, RhsEmptinessRace) { + auto left_batch = ExecBatchFromJSON( + {int64(), utf8()}, R"([[1, "a"], [1, "b"], [5, "a"], [6, "b"], [7, "f"]])"); + auto right_batch = ExecBatchFromJSON( + {int64(), utf8(), float64()}, R"([[2, "a", 1.0], [9, "b", 3.0], [15, "g", 5.0]])"); + + Declaration left{ + "exec_batch_source", + ExecBatchSourceNodeOptions(schema({field("colA", int64()), field("col2", utf8())}), + {std::move(left_batch)})}; + Declaration right{ + "exec_batch_source", + ExecBatchSourceNodeOptions(schema({field("colB", int64()), field("col3", utf8()), + field("colC", float64())}), + {std::move(right_batch)})}; + AsofJoinNodeOptions asof_join_opts({{{"colA"}, {{"col2"}}}, {{"colB"}, {{"col3"}}}}, 1); + Declaration asof_join{ + "asofjoin", {std::move(left), std::move(right)}, std::move(asof_join_opts)}; + + ASSERT_OK_AND_ASSIGN(auto result, DeclarationToExecBatches(std::move(asof_join))); + + auto exp_batch = ExecBatchFromJSON( + {int64(), utf8(), float64()}, + R"([[1, "a", 1.0], [1, "b", null], [5, "a", null], [6, "b", null], [7, "f", null]])"); + AssertExecBatchesEqualIgnoringOrder(result.schema, {exp_batch}, result.batches); +} + +// Reproduction of GH-41149: Another case of the same root cause as GH-40675, but with +// empty "by" columns. 
+TEST(AsofJoinTest, RhsEmptinessRaceEmptyBy) { + auto left_batch = ExecBatchFromJSON({int64()}, R"([[1], [2], [3]])"); + auto right_batch = + ExecBatchFromJSON({utf8(), int64()}, R"([["Z", 2], ["B", 3], ["A", 4]])"); + + Declaration left{"exec_batch_source", + ExecBatchSourceNodeOptions(schema({field("on", int64())}), + {std::move(left_batch)})}; + Declaration right{ + "exec_batch_source", + ExecBatchSourceNodeOptions(schema({field("colVals", utf8()), field("on", int64())}), + {std::move(right_batch)})}; + AsofJoinNodeOptions asof_join_opts({{{"on"}, {}}, {{"on"}, {}}}, 1); + Declaration asof_join{ + "asofjoin", {std::move(left), std::move(right)}, std::move(asof_join_opts)}; + + ASSERT_OK_AND_ASSIGN(auto result, DeclarationToExecBatches(std::move(asof_join))); + + auto exp_batch = + ExecBatchFromJSON({int64(), utf8()}, R"([[1, "Z"], [2, "Z"], [3, "B"]])"); + AssertExecBatchesEqualIgnoringOrder(result.schema, {exp_batch}, result.batches); +} + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/backpressure_handler.h b/cpp/src/arrow/acero/backpressure_handler.h index 178272315d7fb..db6c3799354af 100644 --- a/cpp/src/arrow/acero/backpressure_handler.h +++ b/cpp/src/arrow/acero/backpressure_handler.h @@ -45,7 +45,7 @@ class BackpressureHandler { } BackpressureHandler backpressure_handler(input, low_threshold, high_threshold, std::move(backpressure_control)); - return std::move(backpressure_handler); + return backpressure_handler; } void Handle(size_t start_level, size_t end_level) { diff --git a/cpp/src/arrow/acero/bloom_filter_test.cc b/cpp/src/arrow/acero/bloom_filter_test.cc index bad331cfd99d1..a2d6e9575a1aa 100644 --- a/cpp/src/arrow/acero/bloom_filter_test.cc +++ b/cpp/src/arrow/acero/bloom_filter_test.cc @@ -27,7 +27,7 @@ #include "arrow/acero/task_util.h" #include "arrow/acero/test_util_internal.h" #include "arrow/acero/util.h" -#include "arrow/compute/key_hash.h" +#include "arrow/compute/key_hash_internal.h" #include 
"arrow/util/bitmap_ops.h" #include "arrow/util/config.h" #include "arrow/util/cpu_info.h" diff --git a/cpp/src/arrow/acero/exec_plan.cc b/cpp/src/arrow/acero/exec_plan.cc index 97119726d4b17..d9fb1942fccd8 100644 --- a/cpp/src/arrow/acero/exec_plan.cc +++ b/cpp/src/arrow/acero/exec_plan.cc @@ -128,7 +128,7 @@ struct ExecPlanImpl : public ExecPlan { Future<> scheduler_finished = arrow::util::AsyncTaskScheduler::Make( [this](arrow::util::AsyncTaskScheduler* async_scheduler) { QueryContext* ctx = query_context(); - RETURN_NOT_OK(ctx->Init(ctx->max_concurrency(), async_scheduler)); + RETURN_NOT_OK(ctx->Init(async_scheduler)); #ifdef ARROW_WITH_OPENTELEMETRY if (HasMetadata()) { diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc b/cpp/src/arrow/acero/hash_aggregate_test.cc index 2626fd50379dd..743cb20d1960d 100644 --- a/cpp/src/arrow/acero/hash_aggregate_test.cc +++ b/cpp/src/arrow/acero/hash_aggregate_test.cc @@ -318,7 +318,7 @@ Result RunGroupBy(const BatchesWithSchema& input, { {"source", SourceNodeOptions{input.schema, input.gen(use_threads, /*slow=*/false)}}, - {"aggregate", AggregateNodeOptions{std::move(aggregates), std::move(keys), + {"aggregate", AggregateNodeOptions{aggregates, std::move(keys), std::move(segment_keys)}}, {"sink", SinkNodeOptions{&sink_gen}}, }) @@ -592,6 +592,12 @@ void TestSegments(std::unique_ptr& segmenter, const ExecSpan& batc ASSERT_EQ(expected_segment, segment); offset = segment.offset + segment.length; } + // Assert next is the last (empty) segment. 
+ ASSERT_OK_AND_ASSIGN(auto segment, segmenter->GetNextSegment(batch, offset)); + ASSERT_GE(segment.offset, batch.length); + ASSERT_EQ(segment.length, 0); + ASSERT_TRUE(segment.is_open); + ASSERT_TRUE(segment.extends); } Result> MakeGrouper(const std::vector& key_types) { @@ -682,48 +688,142 @@ TEST(RowSegmenter, Basics) { } TEST(RowSegmenter, NonOrdered) { - std::vector types = {int32()}; - auto batch = ExecBatchFromJSON(types, "[[1], [1], [2], [1], [2]]"); - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); - TestSegments(segmenter, ExecSpan(batch), - {{0, 2, false, true}, - {2, 1, false, false}, - {3, 1, false, false}, - {4, 1, true, false}, - {5, 0, true, true}}); + { + std::vector types = {int32()}; + auto batch = ExecBatchFromJSON(types, "[[1], [1], [2], [1], [2]]"); + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batch), + {{0, 2, false, true}, + {2, 1, false, false}, + {3, 1, false, false}, + {4, 1, true, false}, + {5, 0, true, true}}); + } + { + std::vector types = {int32(), int32()}; + auto batch = ExecBatchFromJSON(types, "[[1, 1], [1, 1], [2, 2], [1, 2], [2, 2]]"); + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batch), + {{0, 2, false, true}, + {2, 1, false, false}, + {3, 1, false, false}, + {4, 1, true, false}, + {5, 0, true, true}}); + } } TEST(RowSegmenter, EmptyBatches) { - std::vector types = {int32()}; - std::vector batches = { - ExecBatchFromJSON(types, "[]"), ExecBatchFromJSON(types, "[]"), - ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[]"), - ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[]"), - ExecBatchFromJSON(types, "[[2], [2]]"), ExecBatchFromJSON(types, "[]"), - }; - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); - TestSegments(segmenter, ExecSpan(batches[0]), {}); - TestSegments(segmenter, ExecSpan(batches[1]), {}); - TestSegments(segmenter, ExecSpan(batches[2]), {{0, 1, 
true, true}}); - TestSegments(segmenter, ExecSpan(batches[3]), {}); - TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, true}}); - TestSegments(segmenter, ExecSpan(batches[5]), {}); - TestSegments(segmenter, ExecSpan(batches[6]), {{0, 2, true, false}}); - TestSegments(segmenter, ExecSpan(batches[7]), {}); + { + std::vector types = {int32()}; + std::vector batches = { + ExecBatchFromJSON(types, "[]"), ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[2], [2]]"), ExecBatchFromJSON(types, "[]"), + }; + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batches[0]), {}); + TestSegments(segmenter, ExecSpan(batches[1]), {}); + TestSegments(segmenter, ExecSpan(batches[2]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[3]), {}); + TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[5]), {}); + TestSegments(segmenter, ExecSpan(batches[6]), {{0, 2, true, false}}); + TestSegments(segmenter, ExecSpan(batches[7]), {}); + } + { + std::vector types = {int32(), int32()}; + std::vector batches = { + ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[1, 1]]"), + ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[1, 1]]"), + ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[2, 2], [2, 2]]"), + ExecBatchFromJSON(types, "[]"), + }; + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batches[0]), {}); + TestSegments(segmenter, ExecSpan(batches[1]), {}); + TestSegments(segmenter, ExecSpan(batches[2]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[3]), {}); + TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[5]), 
{}); + TestSegments(segmenter, ExecSpan(batches[6]), {{0, 2, true, false}}); + TestSegments(segmenter, ExecSpan(batches[7]), {}); + } } TEST(RowSegmenter, MultipleSegments) { - std::vector types = {int32()}; - auto batch = ExecBatchFromJSON(types, "[[1], [1], [2], [5], [3], [3], [5], [5], [4]]"); - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); - TestSegments(segmenter, ExecSpan(batch), - {{0, 2, false, true}, - {2, 1, false, false}, - {3, 1, false, false}, - {4, 2, false, false}, - {6, 2, false, false}, - {8, 1, true, false}, - {9, 0, true, true}}); + { + std::vector types = {int32()}; + auto batch = + ExecBatchFromJSON(types, "[[1], [1], [2], [5], [3], [3], [5], [5], [4]]"); + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batch), + {{0, 2, false, true}, + {2, 1, false, false}, + {3, 1, false, false}, + {4, 2, false, false}, + {6, 2, false, false}, + {8, 1, true, false}, + {9, 0, true, true}}); + } + { + std::vector types = {int32(), int32()}; + auto batch = ExecBatchFromJSON( + types, + "[[1, 1], [1, 1], [2, 2], [5, 5], [3, 3], [3, 3], [5, 5], [5, 5], [4, 4]]"); + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batch), + {{0, 2, false, true}, + {2, 1, false, false}, + {3, 1, false, false}, + {4, 2, false, false}, + {6, 2, false, false}, + {8, 1, true, false}, + {9, 0, true, true}}); + } +} + +TEST(RowSegmenter, MultipleSegmentsMultipleBatches) { + { + std::vector types = {int32()}; + std::vector batches = { + ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[[1], [2]]"), + ExecBatchFromJSON(types, "[[5], [3]]"), + ExecBatchFromJSON(types, "[[3], [5], [5]]"), ExecBatchFromJSON(types, "[[4]]")}; + + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batches[0]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[1]), + {{0, 1, false, true}, {1, 1, true, false}}); + 
TestSegments(segmenter, ExecSpan(batches[2]), + {{0, 1, false, false}, {1, 1, true, false}}); + TestSegments(segmenter, ExecSpan(batches[3]), + {{0, 1, false, true}, {1, 2, true, false}}); + TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, false}}); + } + { + std::vector types = {int32(), int32()}; + std::vector batches = { + ExecBatchFromJSON(types, "[[1, 1]]"), + ExecBatchFromJSON(types, "[[1, 1], [2, 2]]"), + ExecBatchFromJSON(types, "[[5, 5], [3, 3]]"), + ExecBatchFromJSON(types, "[[3, 3], [5, 5], [5, 5]]"), + ExecBatchFromJSON(types, "[[4, 4]]")}; + + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batches[0]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[1]), + {{0, 1, false, true}, {1, 1, true, false}}); + TestSegments(segmenter, ExecSpan(batches[2]), + {{0, 1, false, false}, {1, 1, true, false}}); + TestSegments(segmenter, ExecSpan(batches[3]), + {{0, 1, false, true}, {1, 2, true, false}}); + TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, false}}); + } } namespace { diff --git a/cpp/src/arrow/acero/hash_join.cc b/cpp/src/arrow/acero/hash_join.cc index 296b2c56e00f4..5aa70a23f7c9e 100644 --- a/cpp/src/arrow/acero/hash_join.cc +++ b/cpp/src/arrow/acero/hash_join.cc @@ -791,7 +791,7 @@ class HashJoinBasicImpl : public HashJoinImpl { Result> HashJoinImpl::MakeBasic() { std::unique_ptr impl{new HashJoinBasicImpl()}; - return std::move(impl); + return impl; } } // namespace acero diff --git a/cpp/src/arrow/acero/hash_join_benchmark.cc b/cpp/src/arrow/acero/hash_join_benchmark.cc index 993c0b9a705b4..1f8e02e9f0fcf 100644 --- a/cpp/src/arrow/acero/hash_join_benchmark.cc +++ b/cpp/src/arrow/acero/hash_join_benchmark.cc @@ -83,8 +83,8 @@ class JoinBenchmark { build_metadata["null_probability"] = std::to_string(settings.null_percentage); build_metadata["min"] = std::to_string(min_build_value); build_metadata["max"] = std::to_string(max_build_value); - 
build_metadata["min_length"] = settings.var_length_min; - build_metadata["max_length"] = settings.var_length_max; + build_metadata["min_length"] = std::to_string(settings.var_length_min); + build_metadata["max_length"] = std::to_string(settings.var_length_max); std::unordered_map probe_metadata; probe_metadata["null_probability"] = std::to_string(settings.null_percentage); @@ -148,7 +148,7 @@ class JoinBenchmark { }; scheduler_ = TaskScheduler::Make(); - DCHECK_OK(ctx_.Init(settings.num_threads, nullptr)); + DCHECK_OK(ctx_.Init(nullptr)); auto register_task_group_callback = [&](std::function task, std::function cont) { diff --git a/cpp/src/arrow/acero/hash_join_node.cc b/cpp/src/arrow/acero/hash_join_node.cc index c0179fd160e4e..67f902e64be93 100644 --- a/cpp/src/arrow/acero/hash_join_node.cc +++ b/cpp/src/arrow/acero/hash_join_node.cc @@ -27,7 +27,7 @@ #include "arrow/acero/options.h" #include "arrow/acero/schema_util.h" #include "arrow/acero/util.h" -#include "arrow/compute/key_hash.h" +#include "arrow/compute/key_hash_internal.h" #include "arrow/util/checked_cast.h" #include "arrow/util/future.h" #include "arrow/util/thread_pool.h" @@ -351,7 +351,7 @@ Result HashJoinSchema::BindFilter(Expression filter, const Schema& right_schema, ExecContext* exec_context) { if (filter.IsBound() || filter == literal(true)) { - return std::move(filter); + return filter; } // Step 1: Construct filter schema FieldVector fields; @@ -386,7 +386,7 @@ Result HashJoinSchema::BindFilter(Expression filter, filter.ToString(), " evaluates to ", filter.type()->ToString()); } - return std::move(filter); + return filter; } Expression HashJoinSchema::RewriteFilterToUseFilterSchema( @@ -497,11 +497,11 @@ struct BloomFilterPushdownContext { using BuildFinishedCallback = std::function; using FiltersReceivedCallback = std::function; using FilterFinishedCallback = std::function; - void Init(HashJoinNode* owner, size_t num_threads, - RegisterTaskGroupCallback register_task_group_callback, - 
StartTaskGroupCallback start_task_group_callback, - FiltersReceivedCallback on_bloom_filters_received, bool disable_bloom_filter, - bool use_sync_execution); + Status Init(HashJoinNode* owner, size_t num_threads, + RegisterTaskGroupCallback register_task_group_callback, + StartTaskGroupCallback start_task_group_callback, + FiltersReceivedCallback on_bloom_filters_received, + bool disable_bloom_filter, bool use_sync_execution); Status StartProducing(size_t thread_index); @@ -559,8 +559,7 @@ struct BloomFilterPushdownContext { std::vector hashes(batch.length); std::vector bv(bit_vector_bytes); - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * stack, - ctx_->GetTempStack(thread_index)); + arrow::util::TempVectorStack* stack = &tld_[thread_index].stack; // Start with full selection for the current batch memset(selected.data(), 0xff, bit_vector_bytes); @@ -654,7 +653,17 @@ struct BloomFilterPushdownContext { FiltersReceivedCallback all_received_callback_; FilterFinishedCallback on_finished_; } eval_; + + static constexpr auto kTempStackUsage = + Hashing32::kHashBatchTempStackUsage + + (sizeof(uint32_t) + /*extra=*/1) * arrow::util::MiniBatch::kMiniBatchLength; + + struct ThreadLocalData { + arrow::util::TempVectorStack stack; + }; + std::vector tld_; }; + bool HashJoinSchema::HasDictionaries() const { for (int side = 0; side <= 1; ++side) { for (int icol = 0; icol < proj_maps[side].num_cols(HashJoinProjection::INPUT); @@ -930,7 +939,7 @@ class HashJoinNode : public ExecNode, public TracedNode { // we will change it back to just the CPU's thread pool capacity. 
size_t num_threads = (GetCpuThreadPoolCapacity() + io::GetIOThreadPoolCapacity() + 1); - pushdown_context_.Init( + RETURN_NOT_OK(pushdown_context_.Init( this, num_threads, [ctx](std::function fn, std::function on_finished) { @@ -940,7 +949,7 @@ class HashJoinNode : public ExecNode, public TracedNode { return ctx->StartTaskGroup(task_group_id, num_tasks); }, [this](size_t thread_index) { return OnFiltersReceived(thread_index); }, - disable_bloom_filter_, use_sync_execution); + disable_bloom_filter_, use_sync_execution)); RETURN_NOT_OK(impl_->Init( ctx, join_type_, num_threads, &(schema_mgr_->proj_maps[0]), @@ -1037,7 +1046,7 @@ class HashJoinNode : public ExecNode, public TracedNode { BloomFilterPushdownContext pushdown_context_; }; -void BloomFilterPushdownContext::Init( +Status BloomFilterPushdownContext::Init( HashJoinNode* owner, size_t num_threads, RegisterTaskGroupCallback register_task_group_callback, StartTaskGroupCallback start_task_group_callback, @@ -1074,6 +1083,12 @@ void BloomFilterPushdownContext::Init( return eval_.on_finished_(thread_index, std::move(eval_.batches_)); }); start_task_group_callback_ = std::move(start_task_group_callback); + tld_.resize(num_threads); + for (auto& local_data : tld_) { + RETURN_NOT_OK(local_data.stack.Init(ctx_->memory_pool(), kTempStackUsage)); + } + + return Status::OK(); } Status BloomFilterPushdownContext::StartProducing(size_t thread_index) { @@ -1124,8 +1139,7 @@ Status BloomFilterPushdownContext::BuildBloomFilter_exec_task(size_t thread_inde } ARROW_ASSIGN_OR_RAISE(ExecBatch key_batch, ExecBatch::Make(std::move(key_columns))); - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * stack, - ctx_->GetTempStack(thread_index)); + arrow::util::TempVectorStack* stack = &tld_[thread_index].stack; arrow::util::TempVectorHolder hash_holder( stack, arrow::util::MiniBatch::kMiniBatchLength); uint32_t* hashes = hash_holder.mutable_data(); diff --git a/cpp/src/arrow/acero/hash_join_node.h 
b/cpp/src/arrow/acero/hash_join_node.h index cca64d59830b2..ad60019ceabc4 100644 --- a/cpp/src/arrow/acero/hash_join_node.h +++ b/cpp/src/arrow/acero/hash_join_node.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include "arrow/acero/options.h" @@ -88,7 +89,7 @@ class ARROW_ACERO_EXPORT HashJoinSchema { const Expression& filter); bool PayloadIsEmpty(int side) { - ARROW_DCHECK(side == 0 || side == 1); + assert(side == 0 || side == 1); return proj_maps[side].num_cols(HashJoinProjection::PAYLOAD) == 0; } diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 63969d9a3ed4b..f7b442cc3c624 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -28,6 +28,7 @@ #include "arrow/api.h" #include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/light_array_internal.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" @@ -41,6 +42,7 @@ namespace arrow { using compute::call; using compute::default_exec_context; +using compute::ExecBatchBuilder; using compute::ExecSpan; using compute::field_ref; using compute::SortIndices; @@ -2036,6 +2038,29 @@ TEST(HashJoin, ResidualFilter) { [3, 4, "alpha", 4, 16, "alpha"]])")}); } +TEST(HashJoin, FilterEmptyRows) { + // Regression test for GH-41121. 
+ BatchesWithSchema input_left; + input_left.batches = { + ExecBatchFromJSON({int32(), utf8(), int32()}, R"([[2, "Jarry", 28]])")}; + input_left.schema = + schema({field("id", int32()), field("name", utf8()), field("age", int32())}); + + BatchesWithSchema input_right; + input_right.batches = {ExecBatchFromJSON( + {int32(), int32(), utf8()}, + R"([[2, 10, "Jack"], [3, 12, "Mark"], [4, 15, "Tom"], [1, 10, "Jack"]])")}; + input_right.schema = + schema({field("id", int32()), field("stu_id", int32()), field("subject", utf8())}); + + const ResidualFilterCaseRunner runner{std::move(input_left), std::move(input_right)}; + + Expression filter = greater(field_ref("age"), literal(25)); + + runner.Run(JoinType::LEFT_ANTI, {"id"}, {"stu_id"}, std::move(filter), + {ExecBatchFromJSON({int32(), utf8(), int32()}, R"([[2, "Jarry", 28]])")}); +} + TEST(HashJoin, TrivialResidualFilter) { Expression always_true = equal(call("add", {field_ref("l1"), field_ref("r1")}), literal(2)); // 1 + 1 == 2 @@ -3178,5 +3203,55 @@ TEST(HashJoin, ChainedIntegerHashJoins) { } } +// Test that a large number of joins don't overflow the temp vector stack, like GH-39582 +// and GH-39951. +TEST(HashJoin, ManyJoins) { + // The idea of this case is to create many nested join nodes that may possibly cause + // recursive usage of temp vector stack. To make sure that the recursion happens: + // 1. A left-deep join tree is created so that the left-most (the final probe side) + // table will go through all the hash tables from the right side. + // 2. Left-outer join is used so that every join will increase the cardinality. + // 3. The left-most table contains rows of unique integers from 0 to N. + // 4. Each right table at level i contains two rows of integer i, so that the probing of + // each level will increase the result by one row. + // 5. 
The left-most table is a single batch of enough rows, so that at each level, the + // probing will accumulate enough result rows to have to output to the subsequent level + // before finishing the current batch (releasing the buffer allocated on the temp vector + // stack), which is essentially the recursive usage of the temp vector stack. + + // A fair number of joins to guarantee temp vector stack overflow before GH-41335. + const int num_joins = 16; + + // `ExecBatchBuilder::num_rows_max()` is the number of rows for swiss join to accumulate + // before outputting. + const int num_left_rows = ExecBatchBuilder::num_rows_max(); + ASSERT_OK_AND_ASSIGN( + auto left_batches, + MakeIntegerBatches({[](int row_id) -> int64_t { return row_id; }}, + schema({field("l_key", int32())}), + /*num_batches=*/1, /*batch_size=*/num_left_rows)); + Declaration root{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(left_batches.schema), + std::move(left_batches.batches))}; + + HashJoinNodeOptions join_opts(JoinType::LEFT_OUTER, /*left_keys=*/{"l_key"}, + /*right_keys=*/{"r_key"}); + + for (int i = 0; i < num_joins; ++i) { + ASSERT_OK_AND_ASSIGN(auto right_batches, + MakeIntegerBatches({[i](int) -> int64_t { return i; }}, + schema({field("r_key", int32())}), + /*num_batches=*/1, /*batch_size=*/2)); + Declaration table{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(right_batches.schema), + std::move(right_batches.batches))}; + + Declaration new_root{"hashjoin", {std::move(root), std::move(table)}, join_opts}; + root = std::move(new_root); + } + + ASSERT_OK_AND_ASSIGN(std::ignore, DeclarationToTable(std::move(root))); +} + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/order_by_impl.cc b/cpp/src/arrow/acero/order_by_impl.cc index 2c624f6ab895f..1165799fc6610 100644 --- a/cpp/src/arrow/acero/order_by_impl.cc +++ b/cpp/src/arrow/acero/order_by_impl.cc @@ -93,14 +93,14 @@ Result> OrderByImpl::MakeSort( ExecContext* ctx, const 
std::shared_ptr& output_schema, const SortOptions& options) { std::unique_ptr impl{new SortBasicImpl(ctx, output_schema, options)}; - return std::move(impl); + return impl; } Result> OrderByImpl::MakeSelectK( ExecContext* ctx, const std::shared_ptr& output_schema, const SelectKOptions& options) { std::unique_ptr impl{new SelectKBasicImpl(ctx, output_schema, options)}; - return std::move(impl); + return impl; } } // namespace acero diff --git a/cpp/src/arrow/acero/query_context.cc b/cpp/src/arrow/acero/query_context.cc index a27397d12079d..18beb19ab7f8b 100644 --- a/cpp/src/arrow/acero/query_context.cc +++ b/cpp/src/arrow/acero/query_context.cc @@ -40,8 +40,7 @@ QueryContext::QueryContext(QueryOptions opts, ExecContext exec_context) const CpuInfo* QueryContext::cpu_info() const { return CpuInfo::GetInstance(); } int64_t QueryContext::hardware_flags() const { return cpu_info()->hardware_flags(); } -Status QueryContext::Init(size_t max_num_threads, util::AsyncTaskScheduler* scheduler) { - tld_.resize(max_num_threads); +Status QueryContext::Init(util::AsyncTaskScheduler* scheduler) { async_scheduler_ = scheduler; return Status::OK(); } @@ -50,15 +49,6 @@ size_t QueryContext::GetThreadIndex() { return thread_indexer_(); } size_t QueryContext::max_concurrency() const { return thread_indexer_.Capacity(); } -Result QueryContext::GetTempStack(size_t thread_index) { - if (!tld_[thread_index].is_init) { - RETURN_NOT_OK(tld_[thread_index].stack.Init( - memory_pool(), 32 * util::MiniBatch::kMiniBatchLength * sizeof(uint64_t))); - tld_[thread_index].is_init = true; - } - return &tld_[thread_index].stack; -} - Result> QueryContext::BeginExternalTask(std::string_view name) { Future<> completion_future = Future<>::Make(); if (async_scheduler_->AddSimpleTask([completion_future] { return completion_future; }, diff --git a/cpp/src/arrow/acero/query_context.h b/cpp/src/arrow/acero/query_context.h index 9ea11679cba05..3eff299439828 100644 --- a/cpp/src/arrow/acero/query_context.h +++ 
b/cpp/src/arrow/acero/query_context.h @@ -38,7 +38,7 @@ class ARROW_ACERO_EXPORT QueryContext { QueryContext(QueryOptions opts = {}, ExecContext exec_context = *default_exec_context()); - Status Init(size_t max_num_threads, arrow::util::AsyncTaskScheduler* scheduler); + Status Init(arrow::util::AsyncTaskScheduler* scheduler); const ::arrow::internal::CpuInfo* cpu_info() const; int64_t hardware_flags() const; @@ -52,7 +52,6 @@ class ARROW_ACERO_EXPORT QueryContext { size_t GetThreadIndex(); size_t max_concurrency() const; - Result GetTempStack(size_t thread_index); /// \brief Start an external task /// @@ -145,11 +144,6 @@ class ARROW_ACERO_EXPORT QueryContext { std::unique_ptr task_scheduler_ = TaskScheduler::Make(); ThreadIndexer thread_indexer_; - struct ThreadLocalData { - bool is_init = false; - arrow::util::TempVectorStack stack; - }; - std::vector tld_; std::atomic in_flight_bytes_to_disk_{0}; }; diff --git a/cpp/src/arrow/acero/schema_util.h b/cpp/src/arrow/acero/schema_util.h index 6760022feb4be..db3076a58841a 100644 --- a/cpp/src/arrow/acero/schema_util.h +++ b/cpp/src/arrow/acero/schema_util.h @@ -17,13 +17,13 @@ #pragma once +#include #include #include #include #include -#include "arrow/compute/light_array.h" // for KeyColumnMetadata -#include "arrow/type.h" // for DataType, FieldRef, Field and Schema +#include "arrow/type.h" // for DataType, FieldRef, Field and Schema namespace arrow { @@ -47,8 +47,8 @@ struct SchemaProjectionMap { const int* source_to_base; const int* base_to_target; inline int get(int i) const { - ARROW_DCHECK(i >= 0 && i < num_cols); - ARROW_DCHECK(source_to_base[i] != kMissingField); + assert(i >= 0 && i < num_cols); + assert(source_to_base[i] != kMissingField); return base_to_target[source_to_base[i]]; } }; @@ -66,7 +66,7 @@ class SchemaProjectionMaps { Status Init(ProjectionIdEnum full_schema_handle, const Schema& schema, const std::vector& projection_handles, const std::vector*>& projections) { - 
ARROW_DCHECK(projection_handles.size() == projections.size()); + assert(projection_handles.size() == projections.size()); ARROW_RETURN_NOT_OK(RegisterSchema(full_schema_handle, schema)); for (size_t i = 0; i < projections.size(); ++i) { ARROW_RETURN_NOT_OK( @@ -174,7 +174,7 @@ class SchemaProjectionMaps { } } // We should never get here - ARROW_DCHECK(false); + assert(false); return -1; } @@ -207,7 +207,7 @@ class SchemaProjectionMaps { break; } } - ARROW_DCHECK(field_id != SchemaProjectionMap::kMissingField); + assert(field_id != SchemaProjectionMap::kMissingField); mapping[i] = field_id; inverse_mapping[field_id] = i; } diff --git a/cpp/src/arrow/acero/sink_node.cc b/cpp/src/arrow/acero/sink_node.cc index 4ab6b4537de02..66f447aa87f11 100644 --- a/cpp/src/arrow/acero/sink_node.cc +++ b/cpp/src/arrow/acero/sink_node.cc @@ -423,6 +423,7 @@ class ConsumingSinkNode : public ExecNode, std::atomic backpressure_counter_ = 0; std::unique_ptr sequencer_; }; + static Result MakeTableConsumingSinkNode(ExecPlan* plan, std::vector inputs, const ExecNodeOptions& options) { diff --git a/cpp/src/arrow/acero/sorted_merge_node.cc b/cpp/src/arrow/acero/sorted_merge_node.cc index 4d4565a6bb5e7..a71ac79efcc46 100644 --- a/cpp/src/arrow/acero/sorted_merge_node.cc +++ b/cpp/src/arrow/acero/sorted_merge_node.cc @@ -262,19 +262,22 @@ class SortedMergeNode : public ExecNode { : ExecNode(plan, inputs, GetInputLabels(inputs), std::move(output_schema)), ordering_(std::move(new_ordering)), input_counter(inputs_.size()), - output_counter(inputs_.size()), - process_thread() { + output_counter(inputs_.size()) +#ifdef ARROW_ENABLE_THREADING + , + process_thread() +#endif + { SetLabel("sorted_merge"); } ~SortedMergeNode() override { - process_queue.Push( - kPoisonPill); // poison pill - // We might create a temporary (such as to inspect the output - // schema), in which case there isn't anything to join + PushTask(kPoisonPill); +#ifdef ARROW_ENABLE_THREADING if (process_thread.joinable()) { 
process_thread.join(); } +#endif } static arrow::Result Make( @@ -355,10 +358,25 @@ class SortedMergeNode : public ExecNode { // InputState's ConcurrentQueue manages locking input_counter[index] += rb->num_rows(); ARROW_RETURN_NOT_OK(state[index]->Push(rb)); - process_queue.Push(kNewTask); + PushTask(kNewTask); return Status::OK(); } + void PushTask(bool ok) { +#ifdef ARROW_ENABLE_THREADING + process_queue.Push(ok); +#else + if (process_task.is_finished()) { + return; + } + if (ok == kNewTask) { + PollOnce(); + } else { + EndFromProcessThread(); + } +#endif + } + arrow::Status InputFinished(arrow::acero::ExecNode* input, int total_batches) override { ARROW_DCHECK(std_has(inputs_, input)); { @@ -368,7 +386,8 @@ class SortedMergeNode : public ExecNode { state.at(k)->set_total_batches(total_batches); } // Trigger a final process call for stragglers - process_queue.Push(kNewTask); + PushTask(kNewTask); + return Status::OK(); } @@ -379,13 +398,17 @@ class SortedMergeNode : public ExecNode { // Plan has already aborted. 
Do not start process thread return Status::OK(); } +#ifdef ARROW_ENABLE_THREADING process_thread = std::thread(&SortedMergeNode::StartPoller, this); +#endif return Status::OK(); } arrow::Status StopProducingImpl() override { +#ifdef ARROW_ENABLE_THREADING process_queue.Clear(); - process_queue.Push(kPoisonPill); +#endif + PushTask(kPoisonPill); return Status::OK(); } @@ -408,6 +431,7 @@ class SortedMergeNode : public ExecNode { << input_counter[i] << " != " << output_counter[i]; } +#ifdef ARROW_ENABLE_THREADING ARROW_UNUSED( plan_->query_context()->executor()->Spawn([this, st = std::move(st)]() mutable { Defer cleanup([this, &st]() { process_task.MarkFinished(st); }); @@ -415,6 +439,12 @@ class SortedMergeNode : public ExecNode { st = output_->InputFinished(this, batches_produced); } })); +#else + process_task.MarkFinished(st); + if (st.ok()) { + st = output_->InputFinished(this, batches_produced); + } +#endif } bool CheckEnded() { @@ -552,6 +582,7 @@ class SortedMergeNode : public ExecNode { return true; } +#ifdef ARROW_ENABLE_THREADING void EmitBatches() { while (true) { // Implementation note: If the queue is empty, we will block here @@ -567,6 +598,7 @@ class SortedMergeNode : public ExecNode { /// The entry point for processThread static void StartPoller(SortedMergeNode* node) { node->EmitBatches(); } +#endif arrow::Ordering ordering_; @@ -583,11 +615,13 @@ class SortedMergeNode : public ExecNode { std::atomic batches_produced{0}; +#ifdef ARROW_ENABLE_THREADING // Queue to trigger processing of a given input. 
False acts as a poison pill ConcurrentQueue process_queue; // Once StartProducing is called, we initialize this thread to poll the // input states and emit batches std::thread process_thread; +#endif arrow::Future<> process_task; // Map arg index --> completion counter diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 68b0e37b01aa9..732deb72861d6 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -25,7 +25,7 @@ #include "arrow/acero/util.h" #include "arrow/array/util.h" // MakeArrayFromScalar #include "arrow/compute/kernels/row_encoder_internal.h" -#include "arrow/compute/key_hash.h" +#include "arrow/compute/key_hash_internal.h" #include "arrow/compute/row/compare_internal.h" #include "arrow/compute/row/encode_internal.h" #include "arrow/util/bit_util.h" @@ -2167,6 +2167,11 @@ Status JoinResidualFilter::FilterOneBatch(const ExecBatch& keypayload_batch, ARROW_DCHECK(!output_payload_ids || payload_ids_maybe_null); *num_passing_rows = 0; + + if (num_batch_rows == 0) { + return Status::OK(); + } + ARROW_ASSIGN_OR_RAISE(Datum mask, EvalFilter(keypayload_batch, num_batch_rows, batch_row_ids, key_ids_maybe_null, payload_ids_maybe_null)); @@ -2465,6 +2470,8 @@ Status JoinProbeProcessor::OnFinished() { class SwissJoin : public HashJoinImpl { public: + static constexpr auto kTempStackUsage = 64 * arrow::util::MiniBatch::kMiniBatchLength; + Status Init(QueryContext* ctx, JoinType join_type, size_t num_threads, const HashJoinProjectionMaps* proj_map_left, const HashJoinProjectionMaps* proj_map_right, @@ -2508,6 +2515,7 @@ class SwissJoin : public HashJoinImpl { local_states_.resize(num_threads_); for (int i = 0; i < num_threads_; ++i) { + RETURN_NOT_OK(local_states_[i].stack.Init(pool_, kTempStackUsage)); local_states_[i].hash_table_ready = false; local_states_[i].num_output_batches = 0; local_states_[i].materialize.Init(pool_, proj_map_left, proj_map_right); @@ -2561,8 +2569,7 @@ class SwissJoin : 
public HashJoinImpl { ExecBatch keypayload_batch; ARROW_ASSIGN_OR_RAISE(keypayload_batch, KeyPayloadFromInput(/*side=*/0, &batch)); - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * temp_stack, - ctx_->GetTempStack(thread_index)); + arrow::util::TempVectorStack* temp_stack = &local_states_[thread_index].stack; return CancelIfNotOK( probe_processor_.OnNextBatch(thread_index, keypayload_batch, temp_stack, @@ -2674,8 +2681,7 @@ class SwissJoin : public HashJoinImpl { input_batch.values[schema->num_cols(HashJoinProjection::KEY) + icol]; } } - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * temp_stack, - ctx_->GetTempStack(thread_id)); + arrow::util::TempVectorStack* temp_stack = &local_states_[thread_id].stack; RETURN_NOT_OK(CancelIfNotOK(hash_table_build_.PushNextBatch( static_cast(thread_id), key_batch, no_payload ? nullptr : &payload_batch, temp_stack))); @@ -2710,8 +2716,7 @@ class SwissJoin : public HashJoinImpl { Status MergeFinished(size_t thread_id) { RETURN_NOT_OK(status()); - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * temp_stack, - ctx_->GetTempStack(thread_id)); + arrow::util::TempVectorStack* temp_stack = &local_states_[thread_id].stack; hash_table_build_.FinishPrtnMerge(temp_stack); return CancelIfNotOK(OnBuildHashTableFinished(static_cast(thread_id))); } @@ -2766,8 +2771,7 @@ class SwissJoin : public HashJoinImpl { std::min((task_id + 1) * kNumRowsPerScanTask, hash_table_.num_rows()); // Get thread index and related temp vector stack // - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * temp_stack, - ctx_->GetTempStack(thread_id)); + arrow::util::TempVectorStack* temp_stack = &local_states_[thread_id].stack; // Split into mini-batches // @@ -2944,6 +2948,7 @@ class SwissJoin : public HashJoinImpl { FinishedCallback finished_callback_; struct ThreadLocalState { + arrow::util::TempVectorStack stack; JoinResultMaterialize materialize; std::vector temp_column_arrays; int64_t num_output_batches; @@ -2980,7 +2985,7 @@ class SwissJoin : 
public HashJoinImpl { Result> HashJoinImpl::MakeSwiss() { std::unique_ptr impl{new SwissJoin()}; - return std::move(impl); + return impl; } } // namespace acero diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index aa36a61109274..dceb74abe4f1b 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -23,8 +23,8 @@ #include "arrow/acero/schema_util.h" #include "arrow/acero/task_util.h" #include "arrow/compute/kernels/row_encoder_internal.h" -#include "arrow/compute/key_map.h" -#include "arrow/compute/light_array.h" +#include "arrow/compute/key_map_internal.h" +#include "arrow/compute/light_array_internal.h" #include "arrow/compute/row/encode_internal.h" namespace arrow { diff --git a/cpp/src/arrow/acero/task_util.cc b/cpp/src/arrow/acero/task_util.cc index 4d8e9ecf76597..85378eaeeb27c 100644 --- a/cpp/src/arrow/acero/task_util.cc +++ b/cpp/src/arrow/acero/task_util.cc @@ -424,7 +424,7 @@ void TaskSchedulerImpl::Abort(AbortContinuationImpl impl) { std::unique_ptr TaskScheduler::Make() { std::unique_ptr impl{new TaskSchedulerImpl()}; - return std::move(impl); + return impl; } } // namespace acero diff --git a/cpp/src/arrow/acero/tpch_node.cc b/cpp/src/arrow/acero/tpch_node.cc index 9797a082b49d2..137b62ad38a95 100644 --- a/cpp/src/arrow/acero/tpch_node.cc +++ b/cpp/src/arrow/acero/tpch_node.cc @@ -336,7 +336,7 @@ Result TpchPseudotext::GenerateComments(size_t num_comments, size_t min_l } ArrayData ad(utf8(), num_comments, {nullptr, std::move(offset_buffer), std::move(comment_buffer)}); - return std::move(ad); + return ad; } bool TpchPseudotext::GenerateWord(int64_t& offset, random::pcg32_fast& rng, char* arr, @@ -611,7 +611,7 @@ Result RandomVString(random::pcg32_fast& rng, int64_t num_rows, int32_t m for (int32_t i = 0; i < offsets[num_rows]; i++) str[i] = alpha_numerics[char_dist(rng)]; ArrayData ad(utf8(), num_rows, {nullptr, std::move(offset_buff), 
std::move(str_buff)}); - return std::move(ad); + return ad; } void GeneratePhoneNumber(char* out, random::pcg32_fast& rng, int32_t country) { @@ -677,7 +677,7 @@ class PartAndPartSupplierGenerator { if (!part_output_queue_.empty()) { ExecBatch batch = std::move(part_output_queue_.front()); part_output_queue_.pop(); - return std::move(batch); + return batch; } else if (part_rows_generated_ == part_rows_to_generate_) { return std::nullopt; } else { @@ -732,7 +732,7 @@ class PartAndPartSupplierGenerator { if (!partsupp_output_queue_.empty()) { ExecBatch result = std::move(partsupp_output_queue_.front()); partsupp_output_queue_.pop(); - return std::move(result); + return result; } } { @@ -1337,7 +1337,7 @@ class OrdersAndLineItemGenerator { if (!orders_output_queue_.empty()) { ExecBatch batch = std::move(orders_output_queue_.front()); orders_output_queue_.pop(); - return std::move(batch); + return batch; } else if (orders_rows_generated_ == orders_rows_to_generate_) { return std::nullopt; } else { @@ -1401,12 +1401,12 @@ class OrdersAndLineItemGenerator { if (from_queue) { ARROW_DCHECK(queued.length <= batch_size_); tld.first_batch_offset = queued.length; - if (queued.length == batch_size_) return std::move(queued); + if (queued.length == batch_size_) return queued; } { std::lock_guard lock(orders_output_queue_mutex_); if (orders_rows_generated_ == orders_rows_to_generate_) { - if (from_queue) return std::move(queued); + if (from_queue) return queued; return std::nullopt; } diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 127ec49ba990f..25759f8471365 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -18,13 +18,16 @@ #include "arrow/adapters/orc/adapter.h" #include -#include #include #include #include #include #include +#ifdef ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK +#include +#endif + #include "arrow/adapters/orc/util.h" #include "arrow/builder.h" #include 
"arrow/io/interfaces.h" @@ -183,11 +186,9 @@ liborc::RowReaderOptions DefaultRowReaderOptions() { return options; } +#ifdef ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK // Proactively check timezone database availability for ORC versions older than 2.0.0 Status CheckTimeZoneDatabaseAvailability() { - if (GetOrcMajorVersion() >= 2) { - return Status::OK(); - } auto tz_dir = std::getenv("TZDIR"); bool is_tzdb_avaiable = tz_dir != nullptr ? std::filesystem::exists(tz_dir) @@ -200,6 +201,7 @@ Status CheckTimeZoneDatabaseAvailability() { } return Status::OK(); } +#endif } // namespace @@ -559,10 +561,12 @@ ORCFileReader::~ORCFileReader() {} Result> ORCFileReader::Open( const std::shared_ptr& file, MemoryPool* pool) { +#ifdef ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK RETURN_NOT_OK(CheckTimeZoneDatabaseAvailability()); +#endif auto result = std::unique_ptr(new ORCFileReader()); RETURN_NOT_OK(result->impl_->Open(file, pool)); - return std::move(result); + return result; } Result> ORCFileReader::ReadMetadata() { @@ -826,12 +830,14 @@ ORCFileWriter::ORCFileWriter() { impl_.reset(new ORCFileWriter::Impl()); } Result> ORCFileWriter::Open( io::OutputStream* output_stream, const WriteOptions& writer_options) { +#ifdef ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK RETURN_NOT_OK(CheckTimeZoneDatabaseAvailability()); +#endif std::unique_ptr result = std::unique_ptr(new ORCFileWriter()); Status status = result->impl_->Open(output_stream, writer_options); RETURN_NOT_OK(status); - return std::move(result); + return result; } Status ORCFileWriter::Write(const Table& table) { return impl_->Write(table); } diff --git a/cpp/src/arrow/adapters/orc/util.cc b/cpp/src/arrow/adapters/orc/util.cc index 2a74bec1aa6fd..5bfe257ac7bad 100644 --- a/cpp/src/arrow/adapters/orc/util.cc +++ b/cpp/src/arrow/adapters/orc/util.cc @@ -1026,7 +1026,7 @@ Result> GetOrcType(const DataType& type) { SetAttributes(*it, orc_subtype.get()); out_type->addStructField(field_name, std::move(orc_subtype)); } - return 
std::move(out_type); + return out_type; } case Type::type::MAP: { const auto& key_field = checked_cast(type).key_field(); @@ -1048,7 +1048,7 @@ Result> GetOrcType(const DataType& type) { SetAttributes(arrow_field, orc_subtype.get()); out_type->addUnionChild(std::move(orc_subtype)); } - return std::move(out_type); + return out_type; } default: { return Status::NotImplemented("Unknown or unsupported Arrow type: ", @@ -1195,7 +1195,7 @@ Result> GetOrcType(const Schema& schema) { SetAttributes(field, orc_subtype.get()); out_type->addStructField(field->name(), std::move(orc_subtype)); } - return std::move(out_type); + return out_type; } Result> GetFieldMetadata( diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index 6411aebf80442..716ae0722069e 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -224,6 +224,14 @@ class ARROW_EXPORT Array { /// \return Status Status ValidateFull() const; + /// \brief Return the device_type that this array's data is allocated on + /// + /// This just delegates to calling device_type on the underlying ArrayData + /// object which backs this Array. + /// + /// \return DeviceAllocationType + DeviceAllocationType device_type() const { return data_->device_type(); } + protected: Array() = default; ARROW_DEFAULT_MOVE_AND_ASSIGN(Array); diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index 4ae9e3d6dcbfc..22d6d1fc5ae92 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -1129,7 +1129,7 @@ TEST(TestDictionary, Validate) { arr = std::make_shared(dict_type, indices, MakeArray(invalid_data)); ASSERT_RAISES(Invalid, arr->ValidateFull()); -#if !defined(__APPLE__) && !defined(ARROW_VALGRIND) +#if !defined(__APPLE__) && !defined(ARROW_VALGRIND) && !defined(__EMSCRIPTEN__) // GH-35712: ASSERT_DEATH would make testing slow on macOS. 
ASSERT_DEATH( { diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index b08fa99168616..3d18d5f967b72 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -735,7 +735,7 @@ class TestListArray : public ::testing::Test { ArrayFromJSON(type, "[[1, 2], [3], [4], null, [5], [], [6]]")); auto sliced_list_array = std::dynamic_pointer_cast(list_array->Slice(3, 4)); - ASSERT_OK_AND_ASSIGN(auto flattened, list_array->Flatten()); + ASSERT_OK_AND_ASSIGN(auto flattened, sliced_list_array->Flatten()); ASSERT_OK(flattened->ValidateFull()); // Note the difference between values() and Flatten(). EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[5, 6]"))); @@ -763,6 +763,52 @@ class TestListArray : public ::testing::Test { << flattened->ToString(); } + void TestFlattenRecursively() { + auto inner_type = std::make_shared(int32()); + auto type = std::make_shared(inner_type); + + // List types with two nested level: list> + auto nested_list_array = std::dynamic_pointer_cast(ArrayFromJSON(type, R"([ + [[0, 1, 2], null, [3, null]], + [null], + [[2, 9], [4], [], [6, 5]] + ])")); + ASSERT_OK_AND_ASSIGN(auto flattened, nested_list_array->FlattenRecursively()); + ASSERT_OK(flattened->ValidateFull()); + ASSERT_EQ(10, flattened->length()); + ASSERT_TRUE( + flattened->Equals(ArrayFromJSON(int32(), "[0, 1, 2, 3, null, 2, 9, 4, 6, 5]"))); + + // Empty nested list should flatten until non-list type is reached + nested_list_array = + std::dynamic_pointer_cast(ArrayFromJSON(type, R"([null])")); + ASSERT_OK_AND_ASSIGN(flattened, nested_list_array->FlattenRecursively()); + ASSERT_TRUE(flattened->type()->Equals(int32())); + + // List types with three nested level: list>> + type = std::make_shared(std::make_shared(fixed_size_list(int32(), 2))); + nested_list_array = std::dynamic_pointer_cast(ArrayFromJSON(type, R"([ + [ + [[null, 0]], + [[3, 7], null] + ], + [ + [[4, null], [5, 8]], + [[8, null]], + null + ], + 
[ + null + ] + ])")); + ASSERT_OK_AND_ASSIGN(flattened, nested_list_array->FlattenRecursively()); + ASSERT_OK(flattened->ValidateFull()); + ASSERT_EQ(10, flattened->length()); + ASSERT_EQ(3, flattened->null_count()); + ASSERT_TRUE(flattened->Equals( + ArrayFromJSON(int32(), "[null, 0, 3, 7, 4, null, 5, 8, 8, null]"))); + } + Status ValidateOffsetsAndSizes(int64_t length, std::vector offsets, std::vector sizes, std::shared_ptr values, int64_t offset = 0) { @@ -925,10 +971,12 @@ TYPED_TEST(TestListArray, BuilderPreserveFieldName) { TYPED_TEST(TestListArray, FlattenSimple) { this->TestFlattenSimple(); } TYPED_TEST(TestListArray, FlattenNulls) { this->TestFlattenNulls(); } TYPED_TEST(TestListArray, FlattenAllEmpty) { this->TestFlattenAllEmpty(); } +TYPED_TEST(TestListArray, FlattenSliced) { this->TestFlattenSliced(); } TYPED_TEST(TestListArray, FlattenZeroLength) { this->TestFlattenZeroLength(); } TYPED_TEST(TestListArray, TestFlattenNonEmptyBackingNulls) { this->TestFlattenNonEmptyBackingNulls(); } +TYPED_TEST(TestListArray, FlattenRecursively) { this->TestFlattenRecursively(); } TYPED_TEST(TestListArray, ValidateDimensions) { this->TestValidateDimensions(); } @@ -1239,7 +1287,7 @@ TEST_F(TestMapArray, ValidateErrorNullKey) { } TEST_F(TestMapArray, FromArrays) { - std::shared_ptr offsets1, offsets2, offsets3, offsets4, keys, items; + std::shared_ptr offsets1, offsets2, offsets3, offsets4, offsets5, keys, items; std::vector offsets_is_valid3 = {true, false, true, true}; std::vector offsets_is_valid4 = {true, true, false, true}; @@ -1294,6 +1342,20 @@ TEST_F(TestMapArray, FromArrays) { // Zero-length offsets ASSERT_RAISES(Invalid, MapArray::FromArrays(offsets1->Slice(0, 0), keys, items, pool_)); + // Offseted offsets + ASSERT_OK_AND_ASSIGN(auto map5, + MapArray::FromArrays(offsets1->Slice(1), keys, items, pool_)); + ASSERT_OK(map5->Validate()); + + AssertArraysEqual(*expected1.Slice(1), *map5); + + std::vector offset5_values = {2, 2, 6}; + 
ArrayFromVector(offset5_values, &offsets5); + ASSERT_OK_AND_ASSIGN(auto map6, MapArray::FromArrays(offsets5, keys, items, pool_)); + ASSERT_OK(map6->Validate()); + + AssertArraysEqual(*map5, *map6); + // Offsets not the right type ASSERT_RAISES(TypeError, MapArray::FromArrays(keys, offsets1, items, pool_)); @@ -1306,6 +1368,35 @@ TEST_F(TestMapArray, FromArrays) { ASSERT_EQ(keys_with_null->length(), tmp_items->length()); ASSERT_RAISES(Invalid, MapArray::FromArrays(offsets1, keys_with_null, tmp_items, pool_)); + + // With null_bitmap and null_count=1 + auto null_bitmap_1 = ArrayFromJSON(boolean(), "[1, 0, 1]")->data()->buffers[1]; + ASSERT_OK_AND_ASSIGN(auto map7, + MapArray::FromArrays(offsets1, keys, items, pool_, null_bitmap_1)); + ASSERT_OK(map7->Validate()); + MapArray expected7(map_type, length, offsets1->data()->buffers[1], keys, items, + null_bitmap_1, 1); + ASSERT_EQ(map7->null_count(), 1); + AssertArraysEqual(expected7, *map7); + + // With null_bitmap and null_count=2 + auto null_bitmap_2 = ArrayFromJSON(boolean(), "[0, 1, 0]")->data()->buffers[1]; + ASSERT_OK_AND_ASSIGN(auto map8, + MapArray::FromArrays(offsets1, keys, items, pool_, null_bitmap_2)); + ASSERT_OK(map8->Validate()); + MapArray expected8(map_type, length, offsets1->data()->buffers[1], keys, items, + null_bitmap_2, 2); + ASSERT_EQ(map8->null_count(), 2); + AssertArraysEqual(expected8, *map8); + + // Null bitmap and offset with null + ASSERT_RAISES(Invalid, MapArray::FromArrays(offsets3, keys, items, pool_, + offsets3->data()->buffers[0])); + + // Null bitmap and offset with offset + ASSERT_RAISES(NotImplemented, + MapArray::FromArrays(offsets1->Slice(2), keys, items, pool_, + offsets3->data()->buffers[0])); } TEST_F(TestMapArray, FromArraysEquality) { @@ -1714,4 +1805,23 @@ TEST_F(TestFixedSizeListArray, Flatten) { } } +TEST_F(TestFixedSizeListArray, FlattenRecursively) { + // Nested fixed-size list-array: fixed_size_list(fixed_size_list(int32, 2), 2) + auto inner_type = 
fixed_size_list(value_type_, 2); + type_ = fixed_size_list(inner_type, 2); + + auto values = std::dynamic_pointer_cast(ArrayFromJSON(type_, R"([ + [[0, 1], [null, 3]], + [[7, null], [2, 5]], + [null, null] + ])")); + ASSERT_OK(values->ValidateFull()); + ASSERT_OK_AND_ASSIGN(auto flattened, values->FlattenRecursively()); + ASSERT_OK(flattened->ValidateFull()); + ASSERT_EQ(8, flattened->length()); + ASSERT_EQ(2, flattened->null_count()); + AssertArraysEqual(*flattened, + *ArrayFromJSON(value_type_, "[0, 1, null, 3, 7, null, 2, 5]")); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 958c2e25380b0..47c0fd35829a1 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -42,6 +42,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/list_util.h" #include "arrow/util/logging.h" +#include "arrow/util/unreachable.h" namespace arrow { @@ -114,7 +115,7 @@ Result::ArrayType>> ListArrayFromArray return Status::TypeError("List offsets must be ", OffsetArrowType::type_name()); } - if (null_bitmap != nullptr && offsets.null_count() > 0) { + if (null_bitmap != nullptr && offsets.data()->MayHaveNulls()) { return Status::Invalid( "Ambiguous to specify both validity map and offsets with nulls"); } @@ -469,6 +470,49 @@ inline void SetListData(VarLengthListLikeArray* self, self->values_ = MakeArray(self->data_->child_data[0]); } +Result> FlattenLogicalListRecursively(const Array& in_array, + MemoryPool* memory_pool) { + std::shared_ptr array = in_array.Slice(0, in_array.length()); + for (auto kind = array->type_id(); is_list(kind) || is_list_view(kind); + kind = array->type_id()) { + switch (kind) { + case Type::LIST: { + ARROW_ASSIGN_OR_RAISE( + array, (checked_cast(array.get())->Flatten(memory_pool))); + break; + } + case Type::LARGE_LIST: { + ARROW_ASSIGN_OR_RAISE( + array, + (checked_cast(array.get())->Flatten(memory_pool))); + break; + } + case Type::LIST_VIEW: { + 
ARROW_ASSIGN_OR_RAISE( + array, + (checked_cast(array.get())->Flatten(memory_pool))); + break; + } + case Type::LARGE_LIST_VIEW: { + ARROW_ASSIGN_OR_RAISE( + array, + (checked_cast(array.get())->Flatten(memory_pool))); + break; + } + case Type::FIXED_SIZE_LIST: { + ARROW_ASSIGN_OR_RAISE( + array, + (checked_cast(array.get())->Flatten(memory_pool))); + break; + } + default: + Unreachable("unexpected non-list type"); + break; + } + } + return array; +} + } // namespace internal // ---------------------------------------------------------------------- @@ -746,7 +790,7 @@ MapArray::MapArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& items, int64_t null_count, int64_t offset) { auto pair_data = ArrayData::Make(type->fields()[0]->type(), keys->data()->length, - {nullptr}, {keys->data(), items->data()}, 0, offset); + {nullptr}, {keys->data(), items->data()}, 0); auto map_data = ArrayData::Make(type, length, std::move(buffers), {pair_data}, null_count, offset); SetData(map_data); @@ -763,7 +807,7 @@ MapArray::MapArray(const std::shared_ptr& type, int64_t length, Result> MapArray::FromArraysInternal( std::shared_ptr type, const std::shared_ptr& offsets, const std::shared_ptr& keys, const std::shared_ptr& items, - MemoryPool* pool) { + MemoryPool* pool, std::shared_ptr null_bitmap) { using offset_type = typename MapType::offset_type; using OffsetArrowType = typename CTypeTraits::ArrowType; @@ -783,7 +827,16 @@ Result> MapArray::FromArraysInternal( return Status::Invalid("Map key and item arrays must be equal length"); } - if (offsets->null_count() > 0) { + if (null_bitmap != nullptr && offsets->data()->MayHaveNulls()) { + return Status::Invalid( + "Ambiguous to specify both validity map and offsets with nulls"); + } + + if (null_bitmap != nullptr && offsets->offset() != 0) { + return Status::NotImplemented("Null bitmap with offsets slice not supported."); + } + + if (offsets->data()->MayHaveNulls()) { ARROW_ASSIGN_OR_RAISE(auto buffers, 
CleanListOffsets(NULLPTR, *offsets, pool)); return std::make_shared(type, offsets->length() - 1, std::move(buffers), @@ -792,24 +845,34 @@ Result> MapArray::FromArraysInternal( using OffsetArrayType = typename TypeTraits::ArrayType; const auto& typed_offsets = checked_cast(*offsets); - auto buffers = BufferVector({nullptr, typed_offsets.values()}); + + BufferVector buffers; + buffers.resize(2); + int64_t null_count = 0; + if (null_bitmap) { + buffers[0] = std::move(null_bitmap); + null_count = kUnknownNullCount; + } + buffers[1] = typed_offsets.values(); return std::make_shared(type, offsets->length() - 1, std::move(buffers), keys, - items, /*null_count=*/0, offsets->offset()); + items, /*null_count=*/null_count, offsets->offset()); } Result> MapArray::FromArrays(const std::shared_ptr& offsets, const std::shared_ptr& keys, const std::shared_ptr& items, - MemoryPool* pool) { + MemoryPool* pool, + std::shared_ptr null_bitmap) { return FromArraysInternal(std::make_shared(keys->type(), items->type()), - offsets, keys, items, pool); + offsets, keys, items, pool, std::move(null_bitmap)); } Result> MapArray::FromArrays(std::shared_ptr type, const std::shared_ptr& offsets, const std::shared_ptr& keys, const std::shared_ptr& items, - MemoryPool* pool) { + MemoryPool* pool, + std::shared_ptr null_bitmap) { if (type->id() != Type::MAP) { return Status::TypeError("Expected map type, got ", type->ToString()); } @@ -820,7 +883,8 @@ Result> MapArray::FromArrays(std::shared_ptr ty if (!map_type.item_type()->Equals(items->type())) { return Status::TypeError("Mismatching map items type"); } - return FromArraysInternal(std::move(type), offsets, keys, items, pool); + return FromArraysInternal(std::move(type), offsets, keys, items, pool, + std::move(null_bitmap)); } Status MapArray::ValidateChildData( @@ -832,13 +896,13 @@ Status MapArray::ValidateChildData( if (pair_data->type->id() != Type::STRUCT) { return Status::Invalid("Map array child array should have struct type"); } - if 
(pair_data->null_count != 0) { + if (pair_data->MayHaveNulls()) { return Status::Invalid("Map array child array should have no nulls"); } if (pair_data->child_data.size() != 2) { return Status::Invalid("Map array child array should have two fields"); } - if (pair_data->child_data[0]->null_count != 0) { + if (pair_data->child_data[0]->MayHaveNulls()) { return Status::Invalid("Map array keys array should have no nulls"); } return Status::OK(); @@ -1133,7 +1197,7 @@ void SparseUnionArray::SetData(std::shared_ptr data) { } void DenseUnionArray::SetData(const std::shared_ptr& data) { - this->UnionArray::SetData(std::move(data)); + this->UnionArray::SetData(data); ARROW_CHECK_EQ(data_->type->id(), Type::DENSE_UNION); ARROW_CHECK_EQ(data_->buffers.size(), 3); diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 768a630e0af54..a6d4977839ef1 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -58,6 +58,20 @@ void SetListData(VarLengthListLikeArray* self, const std::shared_ptr& data, Type::type expected_type_id = TYPE::type_id); +/// \brief A version of Flatten that keeps recursively flattening until an array of +/// non-list values is reached. +/// +/// Array types considered to be lists by this function: +/// - list +/// - large_list +/// - list_view +/// - large_list_view +/// - fixed_size_list +/// +/// \see ListArray::Flatten +ARROW_EXPORT Result> FlattenLogicalListRecursively( + const Array& in_array, MemoryPool* memory_pool); + } // namespace internal /// Base class for variable-sized list and list-view arrays, regardless of offset size. @@ -103,6 +117,15 @@ class VarLengthListLikeArray : public Array { return values_->Slice(value_offset(i), value_length(i)); } + /// \brief Flatten all level recursively until reach a non-list type, and return + /// a non-list type Array. 
+ /// + /// \see internal::FlattenLogicalListRecursively + Result> FlattenRecursively( + MemoryPool* memory_pool = default_memory_pool()) const { + return internal::FlattenLogicalListRecursively(*this, memory_pool); + } + protected: friend void internal::SetListData(VarLengthListLikeArray* self, const std::shared_ptr& data, @@ -509,15 +532,18 @@ class ARROW_EXPORT MapArray : public ListArray { /// \param[in] keys Array containing key values /// \param[in] items Array containing item values /// \param[in] pool MemoryPool in case new offsets array needs to be + /// \param[in] null_bitmap Optional validity bitmap /// allocated because of null values static Result> FromArrays( const std::shared_ptr& offsets, const std::shared_ptr& keys, - const std::shared_ptr& items, MemoryPool* pool = default_memory_pool()); + const std::shared_ptr& items, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR); static Result> FromArrays( std::shared_ptr type, const std::shared_ptr& offsets, const std::shared_ptr& keys, const std::shared_ptr& items, - MemoryPool* pool = default_memory_pool()); + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR); const MapType* map_type() const { return map_type_; } @@ -537,7 +563,7 @@ class ARROW_EXPORT MapArray : public ListArray { static Result> FromArraysInternal( std::shared_ptr type, const std::shared_ptr& offsets, const std::shared_ptr& keys, const std::shared_ptr& items, - MemoryPool* pool); + MemoryPool* pool, std::shared_ptr null_bitmap = NULLPTR); private: const MapType* map_type_; @@ -595,6 +621,15 @@ class ARROW_EXPORT FixedSizeListArray : public Array { Result> Flatten( MemoryPool* memory_pool = default_memory_pool()) const; + /// \brief Flatten all level recursively until reach a non-list type, and return + /// a non-list type Array. 
+ /// + /// \see internal::FlattenLogicalListRecursively + Result> FlattenRecursively( + MemoryPool* memory_pool = default_memory_pool()) const { + return internal::FlattenLogicalListRecursively(*this, memory_pool); + } + /// \brief Construct FixedSizeListArray from child value array and value_length /// /// \param[in] values Array containing list values diff --git a/cpp/src/arrow/array/array_primitive.cc b/cpp/src/arrow/array/array_primitive.cc index 7c4a14d93400f..da3810aa392c9 100644 --- a/cpp/src/arrow/array/array_primitive.cc +++ b/cpp/src/arrow/array/array_primitive.cc @@ -56,7 +56,7 @@ int64_t BooleanArray::false_count() const { } int64_t BooleanArray::true_count() const { - if (data_->null_count.load() != 0) { + if (data_->MayHaveNulls()) { DCHECK(data_->buffers[0]); return internal::CountAndSetBits(data_->buffers[0]->data(), data_->offset, data_->buffers[1]->data(), data_->offset, diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 21ac1a09f56e7..32806d9d2edb3 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -477,6 +478,7 @@ TEST_F(TestArray, TestMakeArrayOfNull) { ASSERT_EQ(array->type(), type); ASSERT_OK(array->ValidateFull()); ASSERT_EQ(array->length(), length); + ASSERT_EQ(array->device_type(), DeviceAllocationType::kCPU); if (is_union(type->id())) { ASSERT_EQ(array->null_count(), 0); ASSERT_EQ(array->ComputeLogicalNullCount(), length); @@ -603,11 +605,11 @@ void AssertAppendScalar(MemoryPool* pool, const std::shared_ptr& scalar) ASSERT_EQ(out->length(), 9); auto out_type_id = out->type()->id(); - const bool has_validity = internal::HasValidityBitmap(out_type_id); + const bool can_check_nulls = internal::may_have_validity_bitmap(out_type_id); // For a dictionary builder, the output dictionary won't necessarily be the same const bool can_check_values = !is_dictionary(out_type_id); - if 
(has_validity) { + if (can_check_nulls) { ASSERT_EQ(out->null_count(), 4); } else { ASSERT_EQ(out->null_count(), 0); @@ -718,6 +720,7 @@ TEST_F(TestArray, TestMakeArrayFromScalar) { ASSERT_OK(array->ValidateFull()); ASSERT_EQ(array->length(), length); ASSERT_EQ(array->null_count(), 0); + ASSERT_EQ(array->device_type(), DeviceAllocationType::kCPU); // test case for ARROW-13321 for (int64_t i : {int64_t{0}, length / 2, length - 1}) { @@ -743,6 +746,7 @@ TEST_F(TestArray, TestMakeArrayFromScalarSliced) { auto sliced = array->Slice(1, 4); ASSERT_EQ(sliced->length(), 4); ASSERT_EQ(sliced->null_count(), 0); + ASSERT_EQ(array->device_type(), DeviceAllocationType::kCPU); ARROW_EXPECT_OK(sliced->ValidateFull()); } } @@ -757,6 +761,7 @@ TEST_F(TestArray, TestMakeArrayFromDictionaryScalar) { ASSERT_OK(array->ValidateFull()); ASSERT_EQ(array->length(), 4); ASSERT_EQ(array->null_count(), 0); + ASSERT_EQ(array->device_type(), DeviceAllocationType::kCPU); for (int i = 0; i < 4; i++) { ASSERT_OK_AND_ASSIGN(auto item, array->GetScalar(i)); @@ -796,6 +801,7 @@ TEST_F(TestArray, TestMakeEmptyArray) { ASSERT_OK_AND_ASSIGN(auto array, MakeEmptyArray(type)); ASSERT_OK(array->ValidateFull()); ASSERT_EQ(array->length(), 0); + CheckSpanRoundTrip(*array); } } @@ -823,6 +829,44 @@ TEST_F(TestArray, TestFillFromScalar) { } } +// GH-40069: Data-race when concurrent calling ArraySpan::FillFromScalar of the same +// scalar instance. 
+TEST_F(TestArray, TestConcurrentFillFromScalar) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + for (auto type : TestArrayUtilitiesAgainstTheseTypes()) { + ARROW_SCOPED_TRACE("type = ", type->ToString()); + for (auto seed : {0u, 0xdeadbeef, 42u}) { + ARROW_SCOPED_TRACE("seed = ", seed); + + Field field("", type, /*nullable=*/true, + key_value_metadata({{"extension_allow_random_storage", "true"}})); + auto array = random::GenerateArray(field, 1, seed); + + ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(0)); + + // Lambda to create fill an ArraySpan with the scalar and use the ArraySpan a bit. + auto array_span_from_scalar = [&]() { + ArraySpan span(*scalar); + auto roundtripped_array = span.ToArray(); + ASSERT_OK(roundtripped_array->ValidateFull()); + + AssertArraysEqual(*array, *roundtripped_array); + ASSERT_OK_AND_ASSIGN(auto roundtripped_scalar, roundtripped_array->GetScalar(0)); + AssertScalarsEqual(*scalar, *roundtripped_scalar); + }; + + // Two concurrent calls to the lambda are just enough for TSAN to detect a race + // condition. 
+ auto fut1 = std::async(std::launch::async, array_span_from_scalar); + auto fut2 = std::async(std::launch::async, array_span_from_scalar); + fut1.get(); + fut2.get(); + } + } +} + TEST_F(TestArray, ExtensionSpanRoundTrip) { // Other types are checked in MakeEmptyArray but MakeEmptyArray doesn't // work for extension types so we check that here @@ -855,7 +899,8 @@ TEST_F(TestArray, TestAppendArraySlice) { span.SetMembers(*nulls->data()); ASSERT_OK(builder->AppendArraySlice(span, 0, 4)); ASSERT_EQ(12, builder->length()); - const bool has_validity_bitmap = internal::HasValidityBitmap(scalar->type->id()); + const bool has_validity_bitmap = + internal::may_have_validity_bitmap(scalar->type->id()); if (has_validity_bitmap) { ASSERT_EQ(4, builder->null_count()); } @@ -1307,6 +1352,13 @@ TEST(TestBooleanArray, TrueCountFalseCount) { CheckArray(checked_cast(*arr)); CheckArray(checked_cast(*arr->Slice(5))); CheckArray(checked_cast(*arr->Slice(0, 0))); + + // GH-41016 true_count() with array without validity buffer with null_count of -1 + auto arr_unknown_null_count = ArrayFromJSON(boolean(), "[true, false, true]"); + arr_unknown_null_count->data()->null_count = kUnknownNullCount; + ASSERT_EQ(arr_unknown_null_count->data()->null_count.load(), -1); + ASSERT_EQ(arr_unknown_null_count->null_bitmap(), nullptr); + ASSERT_EQ(checked_pointer_cast(arr_unknown_null_count)->true_count(), 2); } TEST(TestPrimitiveAdHoc, TestType) { diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 11036797e014f..ecd2136f5d20b 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -175,8 +175,9 @@ class ARROW_EXPORT ArrayBuilder { /// \brief Append a range of values from an array. /// /// The given array must be the same type as the builder. 
- virtual Status AppendArraySlice(const ArraySpan& array, int64_t offset, - int64_t length) { + virtual Status AppendArraySlice(const ArraySpan& ARROW_ARG_UNUSED(array), + int64_t ARROW_ARG_UNUSED(offset), + int64_t ARROW_ARG_UNUSED(length)) { return Status::NotImplemented("AppendArraySlice for builder for ", *type()); } @@ -331,7 +332,7 @@ inline Result> MakeBuilder( const std::shared_ptr& type, MemoryPool* pool = default_memory_pool()) { std::unique_ptr out; ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &out)); - return std::move(out); + return out; } /// \brief Construct an empty ArrayBuilder corresponding to the data @@ -345,7 +346,7 @@ inline Result> MakeBuilderExactIndex( const std::shared_ptr& type, MemoryPool* pool = default_memory_pool()) { std::unique_ptr out; ARROW_RETURN_NOT_OK(MakeBuilderExactIndex(pool, type, &out)); - return std::move(out); + return out; } /// \brief Construct an empty DictionaryBuilder initialized optionally @@ -364,7 +365,7 @@ inline Result> MakeDictionaryBuilder( MemoryPool* pool = default_memory_pool()) { std::unique_ptr out; ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, type, dictionary, &out)); - return std::move(out); + return out; } } // namespace arrow diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 429aa5c0488cd..1851ef9122274 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -181,13 +181,11 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { if constexpr (is_list_view(TYPE::type_id)) { sizes = array.GetValues(2); } - const bool all_valid = !array.MayHaveLogicalNulls(); - const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR; + static_assert(internal::may_have_validity_bitmap(TYPE::type_id)); + const uint8_t* validity = array.MayHaveNulls() ? 
array.buffers[0].data : NULLPTR; ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t row = offset; row < offset + length; row++) { - const bool is_valid = - all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) || - array.IsValid(row); + const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row); int64_t size = 0; if (is_valid) { if constexpr (is_list_view(TYPE::type_id)) { @@ -250,7 +248,7 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { /// \brief Append dimensions for a single list slot. /// /// ListViewBuilder overrides this to also append the size. - virtual void UnsafeAppendDimensions(int64_t offset, int64_t size) { + virtual void UnsafeAppendDimensions(int64_t offset, int64_t ARROW_ARG_UNUSED(size)) { offsets_builder_.UnsafeAppend(static_cast(offset)); } @@ -569,13 +567,11 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { - const int32_t* offsets = array.GetValues(1); - const bool all_valid = !array.MayHaveLogicalNulls(); - const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR; + const auto* offsets = array.GetValues(1); + static_assert(internal::may_have_validity_bitmap(MapType::type_id)); + const uint8_t* validity = array.MayHaveNulls() ? 
array.buffers[0].data : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { - const bool is_valid = - all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) || - array.IsValid(row); + const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row); if (is_valid) { ARROW_RETURN_NOT_OK(Append()); const int64_t slot_length = offsets[row + 1] - offsets[row]; @@ -646,6 +642,8 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { /// \brief Builder class for fixed-length list array value types class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { public: + using TypeClass = FixedSizeListType; + /// Use this constructor to define the built array's type explicitly. If value_builder /// has indeterminate type, this builder will also. FixedSizeListBuilder(MemoryPool* pool, diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index 29e01d55edeb1..de7af1b46bdee 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -32,9 +32,10 @@ namespace arrow { class ARROW_EXPORT NullBuilder : public ArrayBuilder { public: explicit NullBuilder(MemoryPool* pool = default_memory_pool(), - int64_t alignment = kDefaultBufferAlignment) + int64_t ARROW_ARG_UNUSED(alignment) = kDefaultBufferAlignment) : ArrayBuilder(pool) {} - explicit NullBuilder(const std::shared_ptr& type, + + explicit NullBuilder(const std::shared_ptr& ARROW_ARG_UNUSED(type), MemoryPool* pool = default_memory_pool(), int64_t alignment = kDefaultBufferAlignment) : NullBuilder(pool, alignment) {} diff --git a/cpp/src/arrow/array/builder_run_end.cc b/cpp/src/arrow/array/builder_run_end.cc index cff8d72952385..ed384123d8b87 100644 --- a/cpp/src/arrow/array/builder_run_end.cc +++ b/cpp/src/arrow/array/builder_run_end.cc @@ -162,8 +162,7 @@ Status RunCompressorBuilder::FinishInternal(std::shared_ptr* out) { RunEndEncodedBuilder::ValueRunBuilder::ValueRunBuilder( MemoryPool* pool, 
const std::shared_ptr& value_builder, const std::shared_ptr& value_type, RunEndEncodedBuilder& ree_builder) - : RunCompressorBuilder(pool, std::move(value_builder), std::move(value_type)), - ree_builder_(ree_builder) {} + : RunCompressorBuilder(pool, value_builder, value_type), ree_builder_(ree_builder) {} RunEndEncodedBuilder::RunEndEncodedBuilder( MemoryPool* pool, const std::shared_ptr& run_end_builder, diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index ff9ed66d1149f..87e55246c78fe 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -317,7 +317,7 @@ class ConcatenateImpl { } Status Concatenate(std::shared_ptr* out) && { - if (out_->null_count != 0 && internal::HasValidityBitmap(out_->type->id())) { + if (out_->null_count != 0 && internal::may_have_validity_bitmap(out_->type->id())) { RETURN_NOT_OK(ConcatenateBitmaps(Bitmaps(0), pool_, &out_->buffers[0])); } RETURN_NOT_OK(VisitTypeInline(*out_->type, this)); @@ -522,7 +522,8 @@ class ConcatenateImpl { } out_data += data->length * index_width; } - return std::move(out); + // R build with openSUSE155 requires an explicit shared_ptr construction + return std::shared_ptr(std::move(out)); } Status Visit(const DictionaryType& d) { diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 80c411dfa6a6d..83eeb56c496cf 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -53,7 +53,7 @@ static inline void AdjustNonNullable(Type::type type_id, int64_t length, if (type_id == Type::NA) { *null_count = length; (*buffers)[0] = nullptr; - } else if (internal::HasValidityBitmap(type_id)) { + } else if (internal::may_have_validity_bitmap(type_id)) { if (*null_count == 0) { // In case there are no nulls, don't keep an allocated null bitmap around (*buffers)[0] = nullptr; @@ -224,6 +224,54 @@ int64_t ArrayData::ComputeLogicalNullCount() const { return ArraySpan(*this).ComputeLogicalNullCount(); } 
+DeviceAllocationType ArrayData::device_type() const { + // we're using 0 as a sentinel value for NOT YET ASSIGNED + // there is explicitly no constant DeviceAllocationType to represent + // the "UNASSIGNED" case as it is invalid for data to not have an + // assigned device type. If it's still 0 at the end, then we return + // CPU as the allocation device type + int type = 0; + for (const auto& buf : buffers) { + if (!buf) continue; +#ifdef NDEBUG + return buf->device_type(); +#else + if (type == 0) { + type = static_cast(buf->device_type()); + } else { + DCHECK_EQ(type, static_cast(buf->device_type())); + } +#endif + } + + for (const auto& child : child_data) { + if (!child) continue; +#ifdef NDEBUG + return child->device_type(); +#else + if (type == 0) { + type = static_cast(child->device_type()); + } else { + DCHECK_EQ(type, static_cast(child->device_type())); + } +#endif + } + + if (dictionary) { +#ifdef NDEBUG + return dictionary->device_type(); +#else + if (type == 0) { + type = static_cast(dictionary->device_type()); + } else { + DCHECK_EQ(type, static_cast(dictionary->device_type())); + } +#endif + } + + return type == 0 ? 
DeviceAllocationType::kCPU : static_cast(type); +} + // ---------------------------------------------------------------------- // Methods for ArraySpan @@ -283,25 +331,15 @@ void ArraySpan::SetMembers(const ArrayData& data) { namespace { -template -BufferSpan OffsetsForScalar(uint8_t* scratch_space, offset_type value_size) { - auto* offsets = reinterpret_cast(scratch_space); - offsets[0] = 0; - offsets[1] = static_cast(value_size); - static_assert(2 * sizeof(offset_type) <= 16); - return {scratch_space, sizeof(offset_type) * 2}; +BufferSpan OffsetsForScalar(uint8_t* scratch_space, int64_t offset_width) { + return {scratch_space, offset_width * 2}; } -template std::pair OffsetsAndSizesForScalar(uint8_t* scratch_space, - offset_type value_size) { + int64_t offset_width) { auto* offsets = scratch_space; - auto* sizes = scratch_space + sizeof(offset_type); - reinterpret_cast(offsets)[0] = 0; - reinterpret_cast(sizes)[0] = value_size; - static_assert(2 * sizeof(offset_type) <= 16); - return {BufferSpan{offsets, sizeof(offset_type)}, - BufferSpan{sizes, sizeof(offset_type)}}; + auto* sizes = scratch_space + offset_width; + return {BufferSpan{offsets, offset_width}, BufferSpan{sizes, offset_width}}; } int GetNumBuffers(const DataType& type) { @@ -345,7 +383,7 @@ void FillZeroLengthArray(const DataType* type, ArraySpan* span) { span->buffers[i].size = 0; } - if (!HasValidityBitmap(type->id())) { + if (!may_have_validity_bitmap(type->id())) { span->buffers[0] = {}; } @@ -380,7 +418,7 @@ void ArraySpan::FillFromScalar(const Scalar& value) { if (type_id == Type::NA) { this->null_count = 1; - } else if (!internal::HasValidityBitmap(type_id)) { + } else if (!internal::may_have_validity_bitmap(type_id)) { this->null_count = 0; } else { // Populate null count and validity bitmap @@ -415,26 +453,23 @@ void ArraySpan::FillFromScalar(const Scalar& value) { data_size = scalar.value->size(); } if (is_binary_like(type_id)) { - this->buffers[1] = - 
OffsetsForScalar(scalar.scratch_space_, static_cast(data_size)); + const auto& binary_scalar = checked_cast(value); + this->buffers[1] = OffsetsForScalar(binary_scalar.scratch_space_, sizeof(int32_t)); } else { // is_large_binary_like - this->buffers[1] = OffsetsForScalar(scalar.scratch_space_, data_size); + const auto& large_binary_scalar = checked_cast(value); + this->buffers[1] = + OffsetsForScalar(large_binary_scalar.scratch_space_, sizeof(int64_t)); } this->buffers[2].data = const_cast(data_buffer); this->buffers[2].size = data_size; } else if (type_id == Type::BINARY_VIEW || type_id == Type::STRING_VIEW) { - const auto& scalar = checked_cast(value); + const auto& scalar = checked_cast(value); this->buffers[1].size = BinaryViewType::kSize; this->buffers[1].data = scalar.scratch_space_; - static_assert(sizeof(BinaryViewType::c_type) <= sizeof(scalar.scratch_space_)); - auto* view = new (&scalar.scratch_space_) BinaryViewType::c_type; if (scalar.is_valid) { - *view = util::ToBinaryView(std::string_view{*scalar.value}, 0, 0); this->buffers[2] = internal::PackVariadicBuffers({&scalar.value, 1}); - } else { - *view = {}; } } else if (type_id == Type::FIXED_SIZE_BINARY) { const auto& scalar = checked_cast(value); @@ -443,12 +478,10 @@ void ArraySpan::FillFromScalar(const Scalar& value) { } else if (is_var_length_list_like(type_id) || type_id == Type::FIXED_SIZE_LIST) { const auto& scalar = checked_cast(value); - int64_t value_length = 0; this->child_data.resize(1); if (scalar.value != nullptr) { // When the scalar is null, scalar.value can also be null this->child_data[0].SetMembers(*scalar.value->data()); - value_length = scalar.value->length(); } else { // Even when the value is null, we still must populate the // child_data to yield a valid array. 
Tedious @@ -456,17 +489,25 @@ void ArraySpan::FillFromScalar(const Scalar& value) { &this->child_data[0]); } - if (type_id == Type::LIST || type_id == Type::MAP) { - this->buffers[1] = - OffsetsForScalar(scalar.scratch_space_, static_cast(value_length)); + if (type_id == Type::LIST) { + const auto& list_scalar = checked_cast(value); + this->buffers[1] = OffsetsForScalar(list_scalar.scratch_space_, sizeof(int32_t)); + } else if (type_id == Type::MAP) { + const auto& map_scalar = checked_cast(value); + this->buffers[1] = OffsetsForScalar(map_scalar.scratch_space_, sizeof(int32_t)); } else if (type_id == Type::LARGE_LIST) { - this->buffers[1] = OffsetsForScalar(scalar.scratch_space_, value_length); + const auto& large_list_scalar = checked_cast(value); + this->buffers[1] = + OffsetsForScalar(large_list_scalar.scratch_space_, sizeof(int64_t)); } else if (type_id == Type::LIST_VIEW) { - std::tie(this->buffers[1], this->buffers[2]) = OffsetsAndSizesForScalar( - scalar.scratch_space_, static_cast(value_length)); - } else if (type_id == Type::LARGE_LIST_VIEW) { + const auto& list_view_scalar = checked_cast(value); std::tie(this->buffers[1], this->buffers[2]) = - OffsetsAndSizesForScalar(scalar.scratch_space_, value_length); + OffsetsAndSizesForScalar(list_view_scalar.scratch_space_, sizeof(int32_t)); + } else if (type_id == Type::LARGE_LIST_VIEW) { + const auto& large_list_view_scalar = + checked_cast(value); + std::tie(this->buffers[1], this->buffers[2]) = OffsetsAndSizesForScalar( + large_list_view_scalar.scratch_space_, sizeof(int64_t)); } else { DCHECK_EQ(type_id, Type::FIXED_SIZE_LIST); // FIXED_SIZE_LIST: does not have a second buffer @@ -480,27 +521,19 @@ void ArraySpan::FillFromScalar(const Scalar& value) { this->child_data[i].FillFromScalar(*scalar.value[i]); } } else if (is_union(type_id)) { - // Dense union needs scratch space to store both offsets and a type code - struct UnionScratchSpace { - alignas(int64_t) int8_t type_code; - alignas(int64_t) uint8_t 
offsets[sizeof(int32_t) * 2]; - }; - static_assert(sizeof(UnionScratchSpace) <= sizeof(UnionScalar::scratch_space_)); - auto* union_scratch_space = reinterpret_cast( - &checked_cast(value).scratch_space_); - // First buffer is kept null since unions have no validity vector this->buffers[0] = {}; - union_scratch_space->type_code = checked_cast(value).type_code; - this->buffers[1].data = reinterpret_cast(&union_scratch_space->type_code); - this->buffers[1].size = 1; - this->child_data.resize(this->type->num_fields()); if (type_id == Type::DENSE_UNION) { const auto& scalar = checked_cast(value); - this->buffers[2] = - OffsetsForScalar(union_scratch_space->offsets, static_cast(1)); + auto* union_scratch_space = + reinterpret_cast(&scalar.scratch_space_); + + this->buffers[1].data = reinterpret_cast(&union_scratch_space->type_code); + this->buffers[1].size = 1; + + this->buffers[2] = OffsetsForScalar(union_scratch_space->offsets, sizeof(int32_t)); // We can't "see" the other arrays in the union, but we put the "active" // union array in the right place and fill zero-length arrays for the // others @@ -517,6 +550,12 @@ void ArraySpan::FillFromScalar(const Scalar& value) { } } else { const auto& scalar = checked_cast(value); + auto* union_scratch_space = + reinterpret_cast(&scalar.scratch_space_); + + this->buffers[1].data = reinterpret_cast(&union_scratch_space->type_code); + this->buffers[1].size = 1; + // Sparse union scalars have a full complement of child values even // though only one of them is relevant, so we just fill them in here for (int i = 0; i < static_cast(this->child_data.size()); ++i) { @@ -541,7 +580,6 @@ void ArraySpan::FillFromScalar(const Scalar& value) { e.null_count = 0; e.buffers[1].data = scalar.scratch_space_; e.buffers[1].size = sizeof(run_end); - reinterpret_cast(scalar.scratch_space_)[0] = run_end; }; switch (scalar.run_end_type()->id()) { diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index d8a6663cec580..e0508fe6980a7 
100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -46,6 +46,7 @@ ARROW_EXPORT bool IsNullRunEndEncoded(const ArrayData& data, int64_t i); ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data); ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data); ARROW_EXPORT bool DictionaryMayHaveLogicalNulls(const ArrayData& data); + } // namespace internal // When slicing, we do not know the null count of the sliced range without @@ -100,6 +101,11 @@ struct ARROW_EXPORT ArrayData { int64_t null_count = kUnknownNullCount, int64_t offset = 0) : ArrayData(std::move(type), length, null_count, offset) { this->buffers = std::move(buffers); +#ifndef NDEBUG + // in debug mode, call the `device_type` function to trigger + // the DCHECKs that validate all the buffers are on the same device + ARROW_UNUSED(this->device_type()); +#endif } ArrayData(std::shared_ptr type, int64_t length, @@ -109,6 +115,12 @@ struct ARROW_EXPORT ArrayData { : ArrayData(std::move(type), length, null_count, offset) { this->buffers = std::move(buffers); this->child_data = std::move(child_data); +#ifndef NDEBUG + // in debug mode, call the `device_type` function to trigger + // the DCHECKs that validate all the buffers (including children) + // are on the same device + ARROW_UNUSED(this->device_type()); +#endif } static std::shared_ptr Make(std::shared_ptr type, int64_t length, @@ -357,6 +369,16 @@ struct ARROW_EXPORT ArrayData { /// \see GetNullCount int64_t ComputeLogicalNullCount() const; + /// \brief Return the device_type of the underlying buffers and children + /// + /// If there are no buffers in this ArrayData object, it just returns + /// DeviceAllocationType::kCPU as a default. We also assume that all buffers + /// should be allocated on the same device type and perform DCHECKs to confirm + /// this in debug mode. 
+ /// + /// \return DeviceAllocationType + DeviceAllocationType device_type() const; + std::shared_ptr type; int64_t length = 0; mutable std::atomic null_count{0}; diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 86e2ffcae4de7..b56ea25f9e421 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -95,7 +95,7 @@ class ArrayDataEndianSwapper { Status SwapType(const DataType& type) { RETURN_NOT_OK(VisitTypeInline(type, this)); RETURN_NOT_OK(SwapChildren(type.fields())); - if (internal::HasValidityBitmap(type.id())) { + if (internal::may_have_validity_bitmap(type.id())) { // Copy null bitmap out_->buffers[0] = data_->buffers[0]; } @@ -125,7 +125,8 @@ class ArrayDataEndianSwapper { for (int64_t i = 0; i < length; i++) { out_data[i] = bit_util::ByteSwap(in_data[i]); } - return std::move(out_buffer); + // R build with openSUSE155 requires an explicit shared_ptr construction + return std::shared_ptr(std::move(out_buffer)); } template @@ -548,7 +549,7 @@ class NullArrayFactory { } Status Visit(const StructType& type) { - for (int i = 0; i < type_->num_fields(); ++i) { + for (int i = 0; i < type.num_fields(); ++i) { ARROW_ASSIGN_OR_RAISE(out_->child_data[i], CreateChild(type, i, length_)); } return Status::OK(); diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 8dd3eb3f90c15..0d940d3bc869e 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -550,7 +550,7 @@ struct ValidateArrayImpl { if (full_validation) { if (data.null_count != kUnknownNullCount) { int64_t actual_null_count; - if (HasValidityBitmap(data.type->id()) && data.buffers[0]) { + if (may_have_validity_bitmap(data.type->id()) && data.buffers[0]) { // Do not call GetNullCount() as it would also set the `null_count` member actual_null_count = data.length - CountSetBits(data.buffers[0]->data(), data.offset, data.length); diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index 
1bd789b7cafe6..0eb22a9d1553d 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -41,7 +41,8 @@ Result> Buffer::CopySlice(const int64_t start, ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateResizableBuffer(nbytes, pool)); std::memcpy(new_buffer->mutable_data(), data() + start, static_cast(nbytes)); - return std::move(new_buffer); + // R build with openSUSE155 requires an explicit shared_ptr construction + return std::shared_ptr(std::move(new_buffer)); } Buffer::Buffer() : Buffer(memory_pool::internal::kZeroSizeArea, 0) {} @@ -185,7 +186,8 @@ Result> AllocateBitmap(int64_t length, MemoryPool* pool) if (buf->size() > 0) { buf->mutable_data()[buf->size() - 1] = 0; } - return std::move(buf); + // R build with openSUSE155 requires an explicit shared_ptr construction + return std::shared_ptr(std::move(buf)); } Result> AllocateEmptyBitmap(int64_t length, MemoryPool* pool) { @@ -197,7 +199,8 @@ Result> AllocateEmptyBitmap(int64_t length, int64_t alig ARROW_ASSIGN_OR_RAISE(auto buf, AllocateBuffer(bit_util::BytesForBits(length), alignment, pool)); memset(buf->mutable_data(), 0, static_cast(buf->size())); - return std::move(buf); + // R build with openSUSE155 requires an explicit shared_ptr construction + return std::shared_ptr(std::move(buf)); } Status AllocateEmptyBitmap(int64_t length, std::shared_ptr* out) { @@ -219,7 +222,8 @@ Result> ConcatenateBuffers( out_data += buffer->size(); } } - return std::move(out); + // R build with openSUSE155 requires an explicit shared_ptr construction + return std::shared_ptr(std::move(out)); } } // namespace arrow diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index d004de7a2ea9f..eba575f4cf39c 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -528,7 +528,7 @@ namespace { struct ExportedArrayPrivateData : PoolAllocationMixin { // The buffers are owned by the ArrayData member SmallVector buffers_; - struct ArrowArray dictionary_; + struct ArrowArray dictionary_ {}; SmallVector 
children_; SmallVector child_pointers_; @@ -576,7 +576,7 @@ struct ArrayExporter { // Store buffer pointers size_t n_buffers = data->buffers.size(); auto buffers_begin = data->buffers.begin(); - if (n_buffers > 0 && !internal::HasValidityBitmap(data->type->id())) { + if (n_buffers > 0 && !internal::may_have_validity_bitmap(data->type->id())) { --n_buffers; ++buffers_begin; } @@ -1059,8 +1059,14 @@ struct SchemaImporter { ARROW_ASSIGN_OR_RAISE( type_, registered_ext_type->Deserialize(std::move(type_), metadata_.extension_serialized)); - RETURN_NOT_OK(metadata_.metadata->DeleteMany( - {metadata_.extension_name_index, metadata_.extension_serialized_index})); + // If metadata is present, delete both metadata keys (otherwise, just remove + // the extension name key) + if (metadata_.extension_serialized_index >= 0) { + RETURN_NOT_OK(metadata_.metadata->DeleteMany( + {metadata_.extension_name_index, metadata_.extension_serialized_index})); + } else { + RETURN_NOT_OK(metadata_.metadata->Delete(metadata_.extension_name_index)); + } } } @@ -1448,6 +1454,7 @@ namespace { // The ArrowArray is released on destruction. 
struct ImportedArrayData { struct ArrowArray array_; + DeviceAllocationType device_type_; std::shared_ptr device_sync_; ImportedArrayData() { @@ -1514,6 +1521,7 @@ struct ArrayImporter { recursion_level_ = 0; import_ = std::make_shared(); c_struct_ = &import_->array_; + import_->device_type_ = device_type_; ArrowArrayMove(src, c_struct_); return DoImport(); } @@ -1541,7 +1549,8 @@ struct ArrayImporter { "cannot be imported as RecordBatch"); } return RecordBatch::Make(std::move(schema), data_->length, - std::move(data_->child_data)); + std::move(data_->child_data), import_->device_type_, + import_->device_sync_); } Status ImportChild(const ArrayImporter* parent, struct ArrowArray* src) { @@ -1868,24 +1877,17 @@ struct ArrayImporter { template Status ImportStringValuesBuffer(int32_t offsets_buffer_id, int32_t buffer_id, int64_t byte_width = 1) { - if (device_type_ == DeviceAllocationType::kCPU) { - auto offsets = data_->GetValues(offsets_buffer_id); + int64_t buffer_size = 0; + if (c_struct_->length > 0) { + int64_t last_offset_value_offset = + (c_struct_->length + c_struct_->offset) * sizeof(OffsetType); + OffsetType last_offset_value; + RETURN_NOT_OK(MemoryManager::CopyBufferSliceToCPU( + data_->buffers[offsets_buffer_id], last_offset_value_offset, sizeof(OffsetType), + reinterpret_cast(&last_offset_value))); // Compute visible size of buffer - int64_t buffer_size = - (c_struct_->length > 0) ? byte_width * offsets[c_struct_->length] : 0; - return ImportBuffer(buffer_id, buffer_size); - } - - // we only need the value of the last offset so let's just copy that - // one value from device to host. - auto single_value_buf = - SliceBuffer(data_->buffers[offsets_buffer_id], - c_struct_->length * sizeof(OffsetType), sizeof(OffsetType)); - ARROW_ASSIGN_OR_RAISE( - auto cpubuf, Buffer::ViewOrCopy(single_value_buf, default_cpu_memory_manager())); - auto offsets = cpubuf->data_as(); - // Compute visible size of buffer - int64_t buffer_size = (c_struct_->length > 0) ? 
byte_width * offsets[0] : 0; + buffer_size = byte_width * last_offset_value; + } return ImportBuffer(buffer_id, buffer_size); } @@ -2041,6 +2043,23 @@ Status ExportStreamNext(const std::shared_ptr& src, int64_t i } } +// the int64_t i input here is unused, but exists simply to allow utilizing the +// overload of this with the version for ChunkedArrays. If we removed the int64_t +// from the signature despite it being unused, we wouldn't be able to leverage the +// overloading in the templated exporters. +Status ExportStreamNext(const std::shared_ptr& src, int64_t i, + struct ArrowDeviceArray* out_array) { + std::shared_ptr batch; + RETURN_NOT_OK(src->ReadNext(&batch)); + if (batch == nullptr) { + // End of stream + ArrowArrayMarkReleased(&out_array->array); + return Status::OK(); + } else { + return ExportDeviceRecordBatch(*batch, batch->GetSyncEvent(), out_array); + } +} + Status ExportStreamNext(const std::shared_ptr& src, int64_t i, struct ArrowArray* out_array) { if (i >= src->num_chunks()) { @@ -2052,8 +2071,27 @@ Status ExportStreamNext(const std::shared_ptr& src, int64_t i, } } -template +Status ExportStreamNext(const std::shared_ptr& src, int64_t i, + struct ArrowDeviceArray* out_array) { + if (i >= src->num_chunks()) { + // End of stream + ArrowArrayMarkReleased(&out_array->array); + return Status::OK(); + } else { + return ExportDeviceArray(*src->chunk(static_cast(i)), nullptr, out_array); + } +} + +template class ExportedArrayStream { + using StreamTraits = + std::conditional_t; + using StreamType = typename StreamTraits::CType; + using ArrayTraits = std::conditional_t; + using ArrayType = typename ArrayTraits::CType; + public: struct PrivateData { explicit PrivateData(std::shared_ptr reader) @@ -2067,13 +2105,13 @@ class ExportedArrayStream { ARROW_DISALLOW_COPY_AND_ASSIGN(PrivateData); }; - explicit ExportedArrayStream(struct ArrowArrayStream* stream) : stream_(stream) {} + explicit ExportedArrayStream(StreamType* stream) : stream_(stream) {} Status 
GetSchema(struct ArrowSchema* out_schema) { return ExportStreamSchema(reader(), out_schema); } - Status GetNext(struct ArrowArray* out_array) { + Status GetNext(ArrayType* out_array) { return ExportStreamNext(reader(), next_batch_num(), out_array); } @@ -2083,38 +2121,35 @@ class ExportedArrayStream { } void Release() { - if (ArrowArrayStreamIsReleased(stream_)) { + if (StreamTraits::IsReleasedFunc(stream_)) { return; } + DCHECK_NE(private_data(), nullptr); delete private_data(); - ArrowArrayStreamMarkReleased(stream_); + StreamTraits::MarkReleased(stream_); } // C-compatible callbacks - static int StaticGetSchema(struct ArrowArrayStream* stream, - struct ArrowSchema* out_schema) { + static int StaticGetSchema(StreamType* stream, struct ArrowSchema* out_schema) { ExportedArrayStream self{stream}; return self.ToCError(self.GetSchema(out_schema)); } - static int StaticGetNext(struct ArrowArrayStream* stream, - struct ArrowArray* out_array) { + static int StaticGetNext(StreamType* stream, ArrayType* out_array) { ExportedArrayStream self{stream}; return self.ToCError(self.GetNext(out_array)); } - static void StaticRelease(struct ArrowArrayStream* stream) { - ExportedArrayStream{stream}.Release(); - } + static void StaticRelease(StreamType* stream) { ExportedArrayStream{stream}.Release(); } - static const char* StaticGetLastError(struct ArrowArrayStream* stream) { + static const char* StaticGetLastError(StreamType* stream) { return ExportedArrayStream{stream}.GetLastError(); } - static Status Make(std::shared_ptr reader, struct ArrowArrayStream* out) { + static Status Make(std::shared_ptr reader, StreamType* out) { out->get_schema = ExportedArrayStream::StaticGetSchema; out->get_next = ExportedArrayStream::StaticGetNext; out->get_last_error = ExportedArrayStream::StaticGetLastError; @@ -2150,19 +2185,36 @@ class ExportedArrayStream { int64_t next_batch_num() { return private_data()->batch_num_++; } - struct ArrowArrayStream* stream_; + StreamType* stream_; }; } // 
namespace Status ExportRecordBatchReader(std::shared_ptr reader, struct ArrowArrayStream* out) { - return ExportedArrayStream::Make(std::move(reader), out); + memset(out, 0, sizeof(struct ArrowArrayStream)); + return ExportedArrayStream::Make(std::move(reader), out); } Status ExportChunkedArray(std::shared_ptr chunked_array, struct ArrowArrayStream* out) { - return ExportedArrayStream::Make(std::move(chunked_array), out); + memset(out, 0, sizeof(struct ArrowArrayStream)); + return ExportedArrayStream::Make(std::move(chunked_array), out); +} + +Status ExportDeviceRecordBatchReader(std::shared_ptr reader, + struct ArrowDeviceArrayStream* out) { + memset(out, 0, sizeof(struct ArrowDeviceArrayStream)); + out->device_type = static_cast(reader->device_type()); + return ExportedArrayStream::Make(std::move(reader), out); +} + +Status ExportDeviceChunkedArray(std::shared_ptr chunked_array, + DeviceAllocationType device_type, + struct ArrowDeviceArrayStream* out) { + memset(out, 0, sizeof(struct ArrowDeviceArrayStream)); + out->device_type = static_cast(device_type); + return ExportedArrayStream::Make(std::move(chunked_array), out); } ////////////////////////////////////////////////////////////////////////// @@ -2170,33 +2222,65 @@ Status ExportChunkedArray(std::shared_ptr chunked_array, namespace { +template class ArrayStreamReader { + protected: + using StreamTraits = + std::conditional_t; + using StreamType = typename StreamTraits::CType; + using ArrayTraits = std::conditional_t; + using ArrayType = typename ArrayTraits::CType; + public: - explicit ArrayStreamReader(struct ArrowArrayStream* stream) { - ArrowArrayStreamMove(stream, &stream_); - DCHECK(!ArrowArrayStreamIsReleased(&stream_)); + explicit ArrayStreamReader(StreamType* stream, + const DeviceMemoryMapper mapper = DefaultDeviceMemoryMapper) + : mapper_{std::move(mapper)} { + StreamTraits::MoveFunc(stream, &stream_); + DCHECK(!StreamTraits::IsReleasedFunc(&stream_)); } ~ArrayStreamReader() { ReleaseStream(); } 
void ReleaseStream() { - if (!ArrowArrayStreamIsReleased(&stream_)) { - ArrowArrayStreamRelease(&stream_); - } - DCHECK(ArrowArrayStreamIsReleased(&stream_)); + // all our trait release funcs check IsReleased so we don't + // need to repeat it here + StreamTraits::ReleaseFunc(&stream_); + DCHECK(StreamTraits::IsReleasedFunc(&stream_)); } protected: - Status ReadNextArrayInternal(struct ArrowArray* array) { - ArrowArrayMarkReleased(array); + Status ReadNextArrayInternal(ArrayType* array) { + ArrayTraits::MarkReleased(array); Status status = StatusFromCError(stream_.get_next(&stream_, array)); - if (!status.ok() && !ArrowArrayIsReleased(array)) { - ArrowArrayRelease(array); + if (!status.ok()) { + ArrayTraits::ReleaseFunc(array); } return status; } + Result> ImportRecordBatchInternal( + struct ArrowArray* array, std::shared_ptr schema) { + return ImportRecordBatch(array, schema); + } + + Result> ImportRecordBatchInternal( + struct ArrowDeviceArray* array, std::shared_ptr schema) { + return ImportDeviceRecordBatch(array, schema, mapper_); + } + + Result> ImportArrayInternal( + struct ArrowArray* array, std::shared_ptr type) { + return ImportArray(array, type); + } + + Result> ImportArrayInternal( + struct ArrowDeviceArray* array, std::shared_ptr type) { + return ImportDeviceArray(array, type, mapper_); + } + Result> ReadSchema() { struct ArrowSchema c_schema = {}; ARROW_RETURN_NOT_OK( @@ -2214,19 +2298,19 @@ class ArrayStreamReader { } Status CheckNotReleased() { - if (ArrowArrayStreamIsReleased(&stream_)) { + if (StreamTraits::IsReleasedFunc(&stream_)) { return Status::Invalid( "Attempt to read from a stream that has already been closed"); - } else { - return Status::OK(); } + + return Status::OK(); } Status StatusFromCError(int errno_like) const { return StatusFromCError(&stream_, errno_like); } - static Status StatusFromCError(struct ArrowArrayStream* stream, int errno_like) { + static Status StatusFromCError(StreamType* stream, int errno_like) { if 
(ARROW_PREDICT_TRUE(errno_like == 0)) { return Status::OK(); } @@ -2250,70 +2334,102 @@ class ArrayStreamReader { return {code, last_error ? std::string(last_error) : ""}; } + DeviceAllocationType get_device_type() const { + if constexpr (IsDevice) { + return static_cast(stream_.device_type); + } else { + return DeviceAllocationType::kCPU; + } + } + private: - mutable struct ArrowArrayStream stream_; + mutable StreamType stream_; + const DeviceMemoryMapper mapper_; }; -class ArrayStreamBatchReader : public RecordBatchReader, public ArrayStreamReader { +template +class ArrayStreamBatchReader : public RecordBatchReader, + public ArrayStreamReader { + using StreamTraits = + std::conditional_t; + using StreamType = typename StreamTraits::CType; + using ArrayTraits = std::conditional_t; + using ArrayType = typename ArrayTraits::CType; + public: - explicit ArrayStreamBatchReader(struct ArrowArrayStream* stream) - : ArrayStreamReader(stream) {} + explicit ArrayStreamBatchReader( + StreamType* stream, const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper) + : ArrayStreamReader(stream, mapper) {} Status Init() { - ARROW_ASSIGN_OR_RAISE(schema_, ReadSchema()); + ARROW_ASSIGN_OR_RAISE(schema_, this->ReadSchema()); return Status::OK(); } std::shared_ptr schema() const override { return schema_; } Status ReadNext(std::shared_ptr* batch) override { - ARROW_RETURN_NOT_OK(CheckNotReleased()); + ARROW_RETURN_NOT_OK(this->CheckNotReleased()); - struct ArrowArray c_array; - ARROW_RETURN_NOT_OK(ReadNextArrayInternal(&c_array)); + ArrayType c_array; + ARROW_RETURN_NOT_OK(this->ReadNextArrayInternal(&c_array)); - if (ArrowArrayIsReleased(&c_array)) { + if (ArrayTraits::IsReleasedFunc(&c_array)) { // End of stream batch->reset(); return Status::OK(); } else { - return ImportRecordBatch(&c_array, schema_).Value(batch); + return this->ImportRecordBatchInternal(&c_array, schema_).Value(batch); } } Status Close() override { - ReleaseStream(); + this->ReleaseStream(); return 
Status::OK(); } + DeviceAllocationType device_type() const override { return this->get_device_type(); } + private: std::shared_ptr schema_; }; -class ArrayStreamArrayReader : public ArrayStreamReader { +template +class ArrayStreamArrayReader : public ArrayStreamReader { + using StreamTraits = + std::conditional_t; + using StreamType = typename StreamTraits::CType; + using ArrayTraits = std::conditional_t; + using ArrayType = typename ArrayTraits::CType; + public: - explicit ArrayStreamArrayReader(struct ArrowArrayStream* stream) - : ArrayStreamReader(stream) {} + explicit ArrayStreamArrayReader( + StreamType* stream, const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper) + : ArrayStreamReader(stream, mapper) {} Status Init() { - ARROW_ASSIGN_OR_RAISE(field_, ReadField()); + ARROW_ASSIGN_OR_RAISE(field_, this->ReadField()); return Status::OK(); } std::shared_ptr data_type() const { return field_->type(); } Status ReadNext(std::shared_ptr* array) { - ARROW_RETURN_NOT_OK(CheckNotReleased()); + ARROW_RETURN_NOT_OK(this->CheckNotReleased()); - struct ArrowArray c_array; - ARROW_RETURN_NOT_OK(ReadNextArrayInternal(&c_array)); + ArrayType c_array; + ARROW_RETURN_NOT_OK(this->ReadNextArrayInternal(&c_array)); - if (ArrowArrayIsReleased(&c_array)) { + if (ArrayTraits::IsReleasedFunc(&c_array)) { // End of stream array->reset(); return Status::OK(); } else { - return ImportArray(&c_array, field_->type()).Value(array); + return this->ImportArrayInternal(&c_array, field_->type()).Value(array); } } @@ -2321,30 +2437,35 @@ class ArrayStreamArrayReader : public ArrayStreamReader { std::shared_ptr field_; }; -} // namespace - -Result> ImportRecordBatchReader( - struct ArrowArrayStream* stream) { - if (ArrowArrayStreamIsReleased(stream)) { - return Status::Invalid("Cannot import released ArrowArrayStream"); +template > +Result> ImportReader( + typename StreamTraits::CType* stream, + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper) { + if 
(StreamTraits::IsReleasedFunc(stream)) { + return Status::Invalid("Cannot import released Arrow Stream"); } - auto reader = std::make_shared(stream); + auto reader = std::make_shared>(stream, mapper); ARROW_RETURN_NOT_OK(reader->Init()); return reader; } -Result> ImportChunkedArray( - struct ArrowArrayStream* stream) { - if (ArrowArrayStreamIsReleased(stream)) { - return Status::Invalid("Cannot import released ArrowArrayStream"); +template > +Result> ImportChunked( + typename StreamTraits::CType* stream, + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper) { + if (StreamTraits::IsReleasedFunc(stream)) { + return Status::Invalid("Cannot import released Arrow Stream"); } - auto reader = std::make_shared(stream); + auto reader = std::make_shared>(stream, mapper); ARROW_RETURN_NOT_OK(reader->Init()); - std::shared_ptr data_type = reader->data_type(); - + auto data_type = reader->data_type(); ArrayVector chunks; std::shared_ptr chunk; while (true) { @@ -2360,4 +2481,26 @@ Result> ImportChunkedArray( return ChunkedArray::Make(std::move(chunks), std::move(data_type)); } +} // namespace + +Result> ImportRecordBatchReader( + struct ArrowArrayStream* stream) { + return ImportReader(stream); +} + +Result> ImportDeviceRecordBatchReader( + struct ArrowDeviceArrayStream* stream, const DeviceMemoryMapper& mapper) { + return ImportReader(stream, mapper); +} + +Result> ImportChunkedArray( + struct ArrowArrayStream* stream) { + return ImportChunked(stream); +} + +Result> ImportDeviceChunkedArray( + struct ArrowDeviceArrayStream* stream, const DeviceMemoryMapper& mapper) { + return ImportChunked(stream, mapper); +} + } // namespace arrow diff --git a/cpp/src/arrow/c/bridge.h b/cpp/src/arrow/c/bridge.h index 74a302be4c27d..45367e4f93062 100644 --- a/cpp/src/arrow/c/bridge.h +++ b/cpp/src/arrow/c/bridge.h @@ -321,6 +321,31 @@ ARROW_EXPORT Status ExportChunkedArray(std::shared_ptr chunked_array, struct ArrowArrayStream* out); +/// \brief Export C++ RecordBatchReader using the 
C device stream interface +/// +/// The resulting ArrowDeviceArrayStream struct keeps the record batch reader +/// alive until its release callback is called by the consumer. The device +/// type is determined by calling device_type() on the RecordBatchReader. +/// +/// \param[in] reader RecordBatchReader object to export +/// \param[out] out C struct to export the stream to +ARROW_EXPORT +Status ExportDeviceRecordBatchReader(std::shared_ptr reader, + struct ArrowDeviceArrayStream* out); + +/// \brief Export C++ ChunkedArray using the C device data interface format. +/// +/// The resulting ArrowDeviceArrayStream keeps the chunked array data and buffers +/// alive until its release callback is called by the consumer. +/// +/// \param[in] chunked_array ChunkedArray object to export +/// \param[in] device_type the device type the data is located on +/// \param[out] out C struct to export the stream to +ARROW_EXPORT +Status ExportDeviceChunkedArray(std::shared_ptr chunked_array, + DeviceAllocationType device_type, + struct ArrowDeviceArrayStream* out); + /// \brief Import C++ RecordBatchReader from the C stream interface. /// /// The ArrowArrayStream struct has its contents moved to a private object @@ -343,6 +368,42 @@ Result> ImportRecordBatchReader( ARROW_EXPORT Result> ImportChunkedArray(struct ArrowArrayStream* stream); +/// \brief Import C++ RecordBatchReader from the C device stream interface +/// +/// The ArrowDeviceArrayStream struct has its contents moved to a private object +/// held alive by the resulting record batch reader. +/// +/// \note If there was a required sync event, sync events are accessible by individual +/// buffers of columns. We are not yet bubbling the sync events from the buffers up to +/// the `GetSyncEvent` method of an imported RecordBatch. This will be added in a future +/// update. 
+/// +/// \param[in,out] stream C device stream interface struct +/// \param[in] mapper mapping from device type and ID to memory manager +/// \return Imported RecordBatchReader object +ARROW_EXPORT +Result> ImportDeviceRecordBatchReader( + struct ArrowDeviceArrayStream* stream, + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); + +/// \brief Import C++ ChunkedArray from the C device stream interface +/// +/// The ArrowDeviceArrayStream struct has its contents moved to a private object, +/// is consumed in its entirety, and released before returning all chunks as a +/// ChunkedArray. +/// +/// \note Any chunks that require synchronization for their device memory will have +/// the SyncEvent objects available by checking the individual buffers of each chunk. +/// These SyncEvents should be checked before accessing the data in those buffers. +/// +/// \param[in,out] stream C device stream interface struct +/// \param[in] mapper mapping from device type and ID to memory manager +/// \return Imported ChunkedArray object +ARROW_EXPORT +Result> ImportDeviceChunkedArray( + struct ArrowDeviceArrayStream* stream, + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); + /// @} } // namespace arrow diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index dba6e4736b673..09bb524adbdf0 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -53,11 +53,15 @@ namespace arrow { +using internal::ArrayDeviceExportTraits; +using internal::ArrayDeviceStreamExportTraits; using internal::ArrayExportGuard; using internal::ArrayExportTraits; using internal::ArrayStreamExportGuard; using internal::ArrayStreamExportTraits; using internal::checked_cast; +using internal::DeviceArrayExportGuard; +using internal::DeviceArrayStreamExportGuard; using internal::SchemaExportGuard; using internal::SchemaExportTraits; using internal::Zip; @@ -565,7 +569,7 @@ struct ArrayExportChecker { auto expected_n_buffers = 
static_cast(expected_data.buffers.size()); auto expected_buffers = expected_data.buffers.data(); - if (!internal::HasValidityBitmap(expected_data.type->id())) { + if (!internal::may_have_validity_bitmap(expected_data.type->id())) { --expected_n_buffers; ++expected_buffers; } @@ -1358,7 +1362,7 @@ class MyMemoryManager : public CPUMemoryManager { if (buf.size() > 0) { memcpy(dest->mutable_data(), buf.data(), static_cast(buf.size())); } - return std::move(dest); + return dest; } }; @@ -4098,6 +4102,23 @@ TEST_F(TestArrayRoundtrip, RegisteredExtension) { TestWithArrayFactory(NestedFactory(ExampleDictExtension)); } +TEST_F(TestArrayRoundtrip, RegisteredExtensionNoMetadata) { + auto ext_type = std::make_shared(); + ExtensionTypeGuard guard(ext_type); + + auto ext_metadata = + KeyValueMetadata::Make({"ARROW:extension:name"}, {ext_type->extension_name()}); + auto ext_field = field("", ext_type->storage_type(), true, std::move(ext_metadata)); + + struct ArrowSchema c_schema {}; + SchemaExportGuard schema_guard(&c_schema); + ASSERT_OK(ExportField(*ext_field, &c_schema)); + + ASSERT_OK_AND_ASSIGN(auto ext_type_roundtrip, ImportType(&c_schema)); + ASSERT_EQ(ext_type_roundtrip->id(), Type::EXTENSION); + AssertTypeEqual(ext_type_roundtrip, ext_type); +} + TEST_F(TestArrayRoundtrip, UnregisteredExtension) { auto StorageExtractor = [](ArrayFactory factory) { return [factory]() -> Result> { @@ -4746,4 +4767,516 @@ TEST_F(TestArrayStreamRoundtrip, ChunkedArrayRoundtripEmpty) { }); } +//////////////////////////////////////////////////////////////////////////// +// Array device stream export tests + +class TestArrayDeviceStreamExport : public BaseArrayStreamTest { + public: + void AssertStreamSchema(struct ArrowDeviceArrayStream* c_stream, + const Schema& expected) { + struct ArrowSchema c_schema; + ASSERT_EQ(0, c_stream->get_schema(c_stream, &c_schema)); + + SchemaExportGuard schema_guard(&c_schema); + ASSERT_FALSE(ArrowSchemaIsReleased(&c_schema)); + ASSERT_OK_AND_ASSIGN(auto 
schema, ImportSchema(&c_schema)); + AssertSchemaEqual(expected, *schema, /*check_metadata=*/true); + } + + void AssertStreamEnd(struct ArrowDeviceArrayStream* c_stream) { + struct ArrowDeviceArray c_array; + ASSERT_EQ(0, c_stream->get_next(c_stream, &c_array)); + + DeviceArrayExportGuard guard(&c_array); + ASSERT_TRUE(ArrowDeviceArrayIsReleased(&c_array)); + } + + void AssertStreamNext(struct ArrowDeviceArrayStream* c_stream, + const RecordBatch& expected) { + struct ArrowDeviceArray c_array; + ASSERT_EQ(0, c_stream->get_next(c_stream, &c_array)); + + DeviceArrayExportGuard guard(&c_array); + ASSERT_FALSE(ArrowDeviceArrayIsReleased(&c_array)); + + ASSERT_OK_AND_ASSIGN(auto batch, + ImportDeviceRecordBatch(&c_array, expected.schema(), + TestDeviceArrayRoundtrip::DeviceMapper)); + AssertBatchesEqual(expected, *batch); + } + + void AssertStreamNext(struct ArrowDeviceArrayStream* c_stream, const Array& expected) { + struct ArrowDeviceArray c_array; + ASSERT_EQ(0, c_stream->get_next(c_stream, &c_array)); + + DeviceArrayExportGuard guard(&c_array); + ASSERT_FALSE(ArrowDeviceArrayIsReleased(&c_array)); + + ASSERT_OK_AND_ASSIGN(auto array, + ImportDeviceArray(&c_array, expected.type(), + TestDeviceArrayRoundtrip::DeviceMapper)); + AssertArraysEqual(expected, *array); + } + + static Result> ToDeviceData( + const std::shared_ptr& mm, const ArrayData& data) { + arrow::BufferVector buffers; + for (const auto& buf : data.buffers) { + if (buf) { + ARROW_ASSIGN_OR_RAISE(auto dest, mm->CopyBuffer(buf, mm)); + buffers.push_back(dest); + } else { + buffers.push_back(nullptr); + } + } + + arrow::ArrayDataVector children; + for (const auto& child : data.child_data) { + ARROW_ASSIGN_OR_RAISE(auto dest, ToDeviceData(mm, *child)); + children.push_back(dest); + } + + return ArrayData::Make(data.type, data.length, buffers, children, data.null_count, + data.offset); + } + + static Result> ToDevice(const std::shared_ptr& mm, + const ArrayData& data) { + ARROW_ASSIGN_OR_RAISE(auto result, 
ToDeviceData(mm, data)); + return MakeArray(result); + } +}; + +TEST_F(TestArrayDeviceStreamExport, Empty) { + auto schema = arrow::schema({field("ints", int32())}); + auto batches = MakeBatches(schema, {}); + ASSERT_OK_AND_ASSIGN( + auto reader, + RecordBatchReader::Make(batches, schema, + static_cast(kMyDeviceType))); + + struct ArrowDeviceArrayStream c_stream; + ASSERT_OK(ExportDeviceRecordBatchReader(reader, &c_stream)); + DeviceArrayStreamExportGuard guard(&c_stream); + + ASSERT_FALSE(ArrowDeviceArrayStreamIsReleased(&c_stream)); + ASSERT_EQ(kMyDeviceType, c_stream.device_type); + AssertStreamSchema(&c_stream, *schema); + AssertStreamEnd(&c_stream); + AssertStreamEnd(&c_stream); +} + +TEST_F(TestArrayDeviceStreamExport, Simple) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + ASSERT_OK_AND_ASSIGN(auto arr1, + ToDevice(mm, *ArrayFromJSON(int32(), "[1, 2]")->data())); + ASSERT_EQ(device->device_type(), arr1->device_type()); + ASSERT_OK_AND_ASSIGN(auto arr2, + ToDevice(mm, *ArrayFromJSON(int32(), "[4, 5, null]")->data())); + ASSERT_EQ(device->device_type(), arr2->device_type()); + auto schema = arrow::schema({field("ints", int32())}); + auto batches = MakeBatches(schema, {arr1, arr2}); + ASSERT_OK_AND_ASSIGN(auto reader, + RecordBatchReader::Make(batches, schema, device->device_type())); + + struct ArrowDeviceArrayStream c_stream; + + ASSERT_OK(ExportDeviceRecordBatchReader(reader, &c_stream)); + DeviceArrayStreamExportGuard guard(&c_stream); + + ASSERT_FALSE(ArrowDeviceArrayStreamIsReleased(&c_stream)); + AssertStreamSchema(&c_stream, *schema); + ASSERT_EQ(kMyDeviceType, c_stream.device_type); + AssertStreamNext(&c_stream, *batches[0]); + AssertStreamNext(&c_stream, *batches[1]); + AssertStreamEnd(&c_stream); + AssertStreamEnd(&c_stream); +} + +TEST_F(TestArrayDeviceStreamExport, ArrayLifetime) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + 
ASSERT_OK_AND_ASSIGN(auto arr1, + ToDevice(mm, *ArrayFromJSON(int32(), "[1, 2]")->data())); + ASSERT_EQ(device->device_type(), arr1->device_type()); + ASSERT_OK_AND_ASSIGN(auto arr2, + ToDevice(mm, *ArrayFromJSON(int32(), "[4, 5, null]")->data())); + ASSERT_EQ(device->device_type(), arr2->device_type()); + auto schema = arrow::schema({field("ints", int32())}); + auto batches = MakeBatches(schema, {arr1, arr2}); + ASSERT_OK_AND_ASSIGN(auto reader, + RecordBatchReader::Make(batches, schema, device->device_type())); + + struct ArrowDeviceArrayStream c_stream; + struct ArrowSchema c_schema; + struct ArrowDeviceArray c_array0, c_array1; + + ASSERT_OK(ExportDeviceRecordBatchReader(reader, &c_stream)); + { + DeviceArrayStreamExportGuard guard(&c_stream); + ASSERT_FALSE(ArrowDeviceArrayStreamIsReleased(&c_stream)); + ASSERT_EQ(kMyDeviceType, c_stream.device_type); + + ASSERT_EQ(0, c_stream.get_schema(&c_stream, &c_schema)); + ASSERT_EQ(0, c_stream.get_next(&c_stream, &c_array0)); + ASSERT_EQ(0, c_stream.get_next(&c_stream, &c_array1)); + AssertStreamEnd(&c_stream); + } + + DeviceArrayExportGuard guard0(&c_array0), guard1(&c_array1); + + { + SchemaExportGuard schema_guard(&c_schema); + ASSERT_OK_AND_ASSIGN(auto got_schema, ImportSchema(&c_schema)); + AssertSchemaEqual(*schema, *got_schema, /*check_metadata=*/true); + } + + ASSERT_EQ(kMyDeviceType, c_array0.device_type); + ASSERT_EQ(kMyDeviceType, c_array1.device_type); + + ASSERT_GT(pool_->bytes_allocated(), orig_allocated_); + ASSERT_OK_AND_ASSIGN( + auto batch, + ImportDeviceRecordBatch(&c_array1, schema, TestDeviceArrayRoundtrip::DeviceMapper)); + AssertBatchesEqual(*batches[1], *batch); + ASSERT_EQ(device->device_type(), batch->device_type()); + ASSERT_OK_AND_ASSIGN( + batch, + ImportDeviceRecordBatch(&c_array0, schema, TestDeviceArrayRoundtrip::DeviceMapper)); + AssertBatchesEqual(*batches[0], *batch); + ASSERT_EQ(device->device_type(), batch->device_type()); +} + +TEST_F(TestArrayDeviceStreamExport, Errors) { + auto 
reader = + std::make_shared(Status::Invalid("some example error")); + + struct ArrowDeviceArrayStream c_stream; + + ASSERT_OK(ExportDeviceRecordBatchReader(reader, &c_stream)); + DeviceArrayStreamExportGuard guard(&c_stream); + + struct ArrowSchema c_schema; + ASSERT_EQ(0, c_stream.get_schema(&c_stream, &c_schema)); + ASSERT_FALSE(ArrowSchemaIsReleased(&c_schema)); + { + SchemaExportGuard schema_guard(&c_schema); + ASSERT_OK_AND_ASSIGN(auto schema, ImportSchema(&c_schema)); + AssertSchemaEqual(schema, arrow::schema({}), /*check_metadata=*/true); + } + + struct ArrowDeviceArray c_array; + ASSERT_EQ(EINVAL, c_stream.get_next(&c_stream, &c_array)); +} + +TEST_F(TestArrayDeviceStreamExport, ChunkedArrayExportEmpty) { + ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArray::Make({}, int32())); + + struct ArrowDeviceArrayStream c_stream; + struct ArrowSchema c_schema; + + ASSERT_OK(ExportDeviceChunkedArray( + chunked_array, static_cast(kMyDeviceType), &c_stream)); + DeviceArrayStreamExportGuard guard(&c_stream); + + { + DeviceArrayStreamExportGuard guard(&c_stream); + ASSERT_FALSE(ArrowDeviceArrayStreamIsReleased(&c_stream)); + + ASSERT_EQ(kMyDeviceType, c_stream.device_type); + ASSERT_EQ(0, c_stream.get_schema(&c_stream, &c_schema)); + AssertStreamEnd(&c_stream); + } + + { + SchemaExportGuard schema_guard(&c_schema); + ASSERT_OK_AND_ASSIGN(auto got_type, ImportType(&c_schema)); + AssertTypeEqual(*chunked_array->type(), *got_type); + } +} + +TEST_F(TestArrayDeviceStreamExport, ChunkedArrayExport) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + ASSERT_OK_AND_ASSIGN(auto arr1, + ToDevice(mm, *ArrayFromJSON(int32(), "[1, 2]")->data())); + ASSERT_EQ(device->device_type(), arr1->device_type()); + ASSERT_OK_AND_ASSIGN(auto arr2, + ToDevice(mm, *ArrayFromJSON(int32(), "[4, 5, null]")->data())); + ASSERT_EQ(device->device_type(), arr2->device_type()); + + ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArray::Make({arr1, 
arr2})); + + struct ArrowDeviceArrayStream c_stream; + struct ArrowSchema c_schema; + struct ArrowDeviceArray c_array0, c_array1; + + ASSERT_OK(ExportDeviceChunkedArray(chunked_array, device->device_type(), &c_stream)); + DeviceArrayStreamExportGuard guard(&c_stream); + + { + DeviceArrayStreamExportGuard guard(&c_stream); + ASSERT_FALSE(ArrowDeviceArrayStreamIsReleased(&c_stream)); + ASSERT_EQ(kMyDeviceType, c_stream.device_type); + + ASSERT_EQ(0, c_stream.get_schema(&c_stream, &c_schema)); + ASSERT_EQ(0, c_stream.get_next(&c_stream, &c_array0)); + ASSERT_EQ(0, c_stream.get_next(&c_stream, &c_array1)); + AssertStreamEnd(&c_stream); + } + + DeviceArrayExportGuard guard0(&c_array0), guard1(&c_array1); + + { + SchemaExportGuard schema_guard(&c_schema); + ASSERT_OK_AND_ASSIGN(auto got_type, ImportType(&c_schema)); + AssertTypeEqual(*chunked_array->type(), *got_type); + } + + ASSERT_EQ(kMyDeviceType, c_array0.device_type); + ASSERT_EQ(kMyDeviceType, c_array1.device_type); + + ASSERT_GT(pool_->bytes_allocated(), orig_allocated_); + ASSERT_OK_AND_ASSIGN(auto array, + ImportDeviceArray(&c_array0, chunked_array->type(), + TestDeviceArrayRoundtrip::DeviceMapper)); + ASSERT_EQ(device->device_type(), array->device_type()); + AssertArraysEqual(*chunked_array->chunk(0), *array); + ASSERT_OK_AND_ASSIGN(array, ImportDeviceArray(&c_array1, chunked_array->type(), + TestDeviceArrayRoundtrip::DeviceMapper)); + ASSERT_EQ(device->device_type(), array->device_type()); + AssertArraysEqual(*chunked_array->chunk(1), *array); +} + +//////////////////////////////////////////////////////////////////////////// +// Array device stream roundtrip tests + +class TestArrayDeviceStreamRoundtrip : public BaseArrayStreamTest { + public: + static Result> ToDeviceData( + const std::shared_ptr& mm, const ArrayData& data) { + arrow::BufferVector buffers; + for (const auto& buf : data.buffers) { + if (buf) { + ARROW_ASSIGN_OR_RAISE(auto dest, mm->CopyBuffer(buf, mm)); + buffers.push_back(dest); + } else { + 
buffers.push_back(nullptr); + } + } + + arrow::ArrayDataVector children; + for (const auto& child : data.child_data) { + ARROW_ASSIGN_OR_RAISE(auto dest, ToDeviceData(mm, *child)); + children.push_back(dest); + } + + return ArrayData::Make(data.type, data.length, buffers, children, data.null_count, + data.offset); + } + + static Result> ToDevice(const std::shared_ptr& mm, + const ArrayData& data) { + ARROW_ASSIGN_OR_RAISE(auto result, ToDeviceData(mm, data)); + return MakeArray(result); + } + + void Roundtrip(std::shared_ptr* reader, + struct ArrowDeviceArrayStream* c_stream) { + ASSERT_OK(ExportDeviceRecordBatchReader(*reader, c_stream)); + ASSERT_FALSE(ArrowDeviceArrayStreamIsReleased(c_stream)); + + ASSERT_OK_AND_ASSIGN( + auto got_reader, + ImportDeviceRecordBatchReader(c_stream, TestDeviceArrayRoundtrip::DeviceMapper)); + *reader = std::move(got_reader); + } + + void Roundtrip( + std::shared_ptr reader, + std::function&)> check_func) { + ArrowDeviceArrayStream c_stream; + + // NOTE: ReleaseCallback<> is not immediately usable with ArrowDeviceArayStream + // because get_next and get_schema need the original private_data. 
+ std::weak_ptr weak_reader(reader); + ASSERT_EQ(weak_reader.use_count(), 1); // Expiration check will fail otherwise + + ASSERT_OK(ExportDeviceRecordBatchReader(std::move(reader), &c_stream)); + ASSERT_FALSE(ArrowDeviceArrayStreamIsReleased(&c_stream)); + + { + ASSERT_OK_AND_ASSIGN(auto new_reader, + ImportDeviceRecordBatchReader( + &c_stream, TestDeviceArrayRoundtrip::DeviceMapper)); + // stream was moved + ASSERT_TRUE(ArrowDeviceArrayStreamIsReleased(&c_stream)); + ASSERT_FALSE(weak_reader.expired()); + + check_func(new_reader); + } + // Stream was released when `new_reader` was destroyed + ASSERT_TRUE(weak_reader.expired()); + } + + void Roundtrip(std::shared_ptr src, + std::function&)> check_func) { + ArrowDeviceArrayStream c_stream; + + // One original copy to compare the result, one copy held by the stream + std::weak_ptr weak_src(src); + int64_t initial_use_count = weak_src.use_count(); + + ASSERT_OK(ExportDeviceChunkedArray( + std::move(src), static_cast(kMyDeviceType), &c_stream)); + ASSERT_FALSE(ArrowDeviceArrayStreamIsReleased(&c_stream)); + ASSERT_EQ(kMyDeviceType, c_stream.device_type); + + { + ASSERT_OK_AND_ASSIGN( + auto dst, + ImportDeviceChunkedArray(&c_stream, TestDeviceArrayRoundtrip::DeviceMapper)); + // Stream was moved, consumed, and released + ASSERT_TRUE(ArrowDeviceArrayStreamIsReleased(&c_stream)); + + // Stream was released by ImportDeviceChunkedArray but original copy remains + ASSERT_EQ(weak_src.use_count(), initial_use_count - 1); + + check_func(dst); + } + } + + void AssertReaderNext(const std::shared_ptr& reader, + const RecordBatch& expected) { + ASSERT_OK_AND_ASSIGN(auto batch, reader->Next()); + ASSERT_NE(batch, nullptr); + ASSERT_EQ(static_cast(kMyDeviceType), batch->device_type()); + AssertBatchesEqual(expected, *batch); + } + + void AssertReaderEnd(const std::shared_ptr& reader) { + ASSERT_OK_AND_ASSIGN(auto batch, reader->Next()); + ASSERT_EQ(batch, nullptr); + } + + void AssertReaderClosed(const std::shared_ptr& reader) { + 
ASSERT_THAT(reader->Next(), + Raises(StatusCode::Invalid, ::testing::HasSubstr("already been closed"))); + } + + void AssertReaderClose(const std::shared_ptr& reader) { + ASSERT_OK(reader->Close()); + AssertReaderClosed(reader); + } +}; + +TEST_F(TestArrayDeviceStreamRoundtrip, Simple) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + ASSERT_OK_AND_ASSIGN(auto arr1, + ToDevice(mm, *ArrayFromJSON(int32(), "[1, 2]")->data())); + ASSERT_EQ(device->device_type(), arr1->device_type()); + ASSERT_OK_AND_ASSIGN(auto arr2, + ToDevice(mm, *ArrayFromJSON(int32(), "[4, 5, null]")->data())); + ASSERT_EQ(device->device_type(), arr2->device_type()); + auto orig_schema = arrow::schema({field("ints", int32())}); + auto batches = MakeBatches(orig_schema, {arr1, arr2}); + ASSERT_OK_AND_ASSIGN( + auto reader, RecordBatchReader::Make(batches, orig_schema, device->device_type())); + + Roundtrip(std::move(reader), [&](const std::shared_ptr& reader) { + AssertSchemaEqual(*orig_schema, *reader->schema(), /*check_metadata=*/true); + AssertReaderNext(reader, *batches[0]); + AssertReaderNext(reader, *batches[1]); + AssertReaderEnd(reader); + AssertReaderEnd(reader); + AssertReaderClose(reader); + }); +} + +TEST_F(TestArrayDeviceStreamRoundtrip, CloseEarly) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + ASSERT_OK_AND_ASSIGN(auto arr1, + ToDevice(mm, *ArrayFromJSON(int32(), "[1, 2]")->data())); + ASSERT_EQ(device->device_type(), arr1->device_type()); + ASSERT_OK_AND_ASSIGN(auto arr2, + ToDevice(mm, *ArrayFromJSON(int32(), "[4, 5, null]")->data())); + ASSERT_EQ(device->device_type(), arr2->device_type()); + auto orig_schema = arrow::schema({field("ints", int32())}); + auto batches = MakeBatches(orig_schema, {arr1, arr2}); + ASSERT_OK_AND_ASSIGN( + auto reader, RecordBatchReader::Make(batches, orig_schema, device->device_type())); + + Roundtrip(std::move(reader), [&](const std::shared_ptr& 
reader) { + AssertReaderNext(reader, *batches[0]); + AssertReaderClose(reader); + }); +} + +TEST_F(TestArrayDeviceStreamRoundtrip, Errors) { + auto reader = std::make_shared( + Status::Invalid("roundtrip error example")); + + Roundtrip(std::move(reader), [&](const std::shared_ptr& reader) { + EXPECT_THAT(reader->Next(), Raises(StatusCode::Invalid, + ::testing::HasSubstr("roundtrip error example"))); + }); +} + +TEST_F(TestArrayDeviceStreamRoundtrip, SchemaError) { + struct ArrowDeviceArrayStream stream = {}; + stream.get_last_error = [](struct ArrowDeviceArrayStream* stream) { + return "Expected error"; + }; + stream.get_schema = [](struct ArrowDeviceArrayStream* stream, + struct ArrowSchema* schema) { return EIO; }; + stream.get_next = [](struct ArrowDeviceArrayStream* stream, + struct ArrowDeviceArray* array) { return EINVAL; }; + stream.release = [](struct ArrowDeviceArrayStream* stream) { + *static_cast(stream->private_data) = true; + std::memset(stream, 0, sizeof(*stream)); + }; + bool released = false; + stream.private_data = &released; + + EXPECT_RAISES_WITH_MESSAGE_THAT(IOError, ::testing::HasSubstr("Expected error"), + ImportDeviceRecordBatchReader(&stream)); + ASSERT_TRUE(released); +} + +TEST_F(TestArrayDeviceStreamRoundtrip, ChunkedArrayRoundtrip) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + ASSERT_OK_AND_ASSIGN(auto arr1, + ToDevice(mm, *ArrayFromJSON(int32(), "[1, 2]")->data())); + ASSERT_EQ(device->device_type(), arr1->device_type()); + ASSERT_OK_AND_ASSIGN(auto arr2, + ToDevice(mm, *ArrayFromJSON(int32(), "[4, 5, null]")->data())); + ASSERT_EQ(device->device_type(), arr2->device_type()); + + ASSERT_OK_AND_ASSIGN(auto src, ChunkedArray::Make({arr1, arr2})); + + Roundtrip(src, [&](const std::shared_ptr& dst) { + AssertTypeEqual(*dst->type(), *src->type()); + AssertChunkedEqual(*dst, *src); + }); +} + +TEST_F(TestArrayDeviceStreamRoundtrip, ChunkedArrayRoundtripEmpty) { + ASSERT_OK_AND_ASSIGN(auto 
src, ChunkedArray::Make({}, int32())); + + Roundtrip(src, [&](const std::shared_ptr& dst) { + AssertTypeEqual(*dst->type(), *src->type()); + AssertChunkedEqual(*dst, *src); + }); +} + } // namespace arrow diff --git a/cpp/src/arrow/c/helpers.h b/cpp/src/arrow/c/helpers.h index a24f272feac81..6e4df17f43ebf 100644 --- a/cpp/src/arrow/c/helpers.h +++ b/cpp/src/arrow/c/helpers.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include #include @@ -70,9 +71,17 @@ inline int ArrowArrayIsReleased(const struct ArrowArray* array) { return array->release == NULL; } +inline int ArrowDeviceArrayIsReleased(const struct ArrowDeviceArray* array) { + return ArrowArrayIsReleased(&array->array); +} + /// Mark the C array released (for use in release callbacks) inline void ArrowArrayMarkReleased(struct ArrowArray* array) { array->release = NULL; } +inline void ArrowDeviceArrayMarkReleased(struct ArrowDeviceArray* array) { + ArrowArrayMarkReleased(&array->array); +} + /// Move the C array from `src` to `dest` /// /// Note `dest` must *not* point to a valid array already, otherwise there @@ -84,6 +93,14 @@ inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dest) { ArrowArrayMarkReleased(src); } +inline void ArrowDeviceArrayMove(struct ArrowDeviceArray* src, + struct ArrowDeviceArray* dest) { + assert(dest != src); + assert(!ArrowDeviceArrayIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowDeviceArray)); + ArrowDeviceArrayMarkReleased(src); +} + /// Release the C array, if necessary, by calling its release callback inline void ArrowArrayRelease(struct ArrowArray* array) { if (!ArrowArrayIsReleased(array)) { @@ -93,16 +110,32 @@ inline void ArrowArrayRelease(struct ArrowArray* array) { } } +inline void ArrowDeviceArrayRelease(struct ArrowDeviceArray* array) { + if (!ArrowDeviceArrayIsReleased(array)) { + array->array.release(&array->array); + ARROW_C_ASSERT(ArrowDeviceArrayIsReleased(array), + "ArrowDeviceArrayRelease did not cleanup release callback"); + } +} 
+ /// Query whether the C array stream is released inline int ArrowArrayStreamIsReleased(const struct ArrowArrayStream* stream) { return stream->release == NULL; } +inline int ArrowDeviceArrayStreamIsReleased(const struct ArrowDeviceArrayStream* stream) { + return stream->release == NULL; +} + /// Mark the C array stream released (for use in release callbacks) inline void ArrowArrayStreamMarkReleased(struct ArrowArrayStream* stream) { stream->release = NULL; } +inline void ArrowDeviceArrayStreamMarkReleased(struct ArrowDeviceArrayStream* stream) { + stream->release = NULL; +} + /// Move the C array stream from `src` to `dest` /// /// Note `dest` must *not* point to a valid stream already, otherwise there @@ -115,6 +148,14 @@ inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, ArrowArrayStreamMarkReleased(src); } +inline void ArrowDeviceArrayStreamMove(struct ArrowDeviceArrayStream* src, + struct ArrowDeviceArrayStream* dest) { + assert(dest != src); + assert(!ArrowDeviceArrayStreamIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowDeviceArrayStream)); + ArrowDeviceArrayStreamMarkReleased(src); +} + /// Release the C array stream, if necessary, by calling its release callback inline void ArrowArrayStreamRelease(struct ArrowArrayStream* stream) { if (!ArrowArrayStreamIsReleased(stream)) { @@ -124,6 +165,14 @@ inline void ArrowArrayStreamRelease(struct ArrowArrayStream* stream) { } } +inline void ArrowDeviceArrayStreamRelease(struct ArrowDeviceArrayStream* stream) { + if (!ArrowDeviceArrayStreamIsReleased(stream)) { + stream->release(stream); + ARROW_C_ASSERT(ArrowDeviceArrayStreamIsReleased(stream), + "ArrowDeviceArrayStreamRelease did not cleanup release callback"); + } +} + #ifdef __cplusplus } #endif diff --git a/cpp/src/arrow/c/util_internal.h b/cpp/src/arrow/c/util_internal.h index 6a33be9b0da8e..dc0e25710e987 100644 --- a/cpp/src/arrow/c/util_internal.h +++ b/cpp/src/arrow/c/util_internal.h @@ -32,12 +32,32 @@ struct ArrayExportTraits { typedef 
struct ArrowArray CType; static constexpr auto IsReleasedFunc = &ArrowArrayIsReleased; static constexpr auto ReleaseFunc = &ArrowArrayRelease; + static constexpr auto MoveFunc = &ArrowArrayMove; + static constexpr auto MarkReleased = &ArrowArrayMarkReleased; +}; + +struct ArrayDeviceExportTraits { + typedef struct ArrowDeviceArray CType; + static constexpr auto IsReleasedFunc = &ArrowDeviceArrayIsReleased; + static constexpr auto ReleaseFunc = &ArrowDeviceArrayRelease; + static constexpr auto MoveFunc = &ArrowDeviceArrayMove; + static constexpr auto MarkReleased = &ArrowDeviceArrayMarkReleased; }; struct ArrayStreamExportTraits { typedef struct ArrowArrayStream CType; static constexpr auto IsReleasedFunc = &ArrowArrayStreamIsReleased; static constexpr auto ReleaseFunc = &ArrowArrayStreamRelease; + static constexpr auto MoveFunc = &ArrowArrayStreamMove; + static constexpr auto MarkReleased = &ArrowArrayStreamMarkReleased; +}; + +struct ArrayDeviceStreamExportTraits { + typedef struct ArrowDeviceArrayStream CType; + static constexpr auto IsReleasedFunc = &ArrowDeviceArrayStreamIsReleased; + static constexpr auto ReleaseFunc = &ArrowDeviceArrayStreamRelease; + static constexpr auto MoveFunc = &ArrowDeviceArrayStreamMove; + static constexpr auto MarkReleased = &ArrowDeviceArrayStreamMarkReleased; }; // A RAII-style object to release a C Array / Schema struct at block scope exit. 
@@ -79,7 +99,9 @@ class ExportGuard { using SchemaExportGuard = ExportGuard; using ArrayExportGuard = ExportGuard; +using DeviceArrayExportGuard = ExportGuard; using ArrayStreamExportGuard = ExportGuard; +using DeviceArrayStreamExportGuard = ExportGuard; } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index 29bccb52658f8..55eec53ced1c7 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -19,14 +19,14 @@ #include #include +#include #include #include #include "arrow/array.h" #include "arrow/record_batch.h" -namespace arrow { -namespace internal { +namespace arrow::internal { namespace { template @@ -54,6 +54,51 @@ inline std::vector MakeChunksOffsets(const std::vector& chunks) { offsets[chunks.size()] = offset; return offsets; } + +/// \pre all the pre-conditions of ChunkResolver::ResolveMany() +/// \pre num_offsets - 1 <= std::numeric_limits::max() +template +void ResolveManyInline(size_t num_offsets, const int64_t* signed_offsets, + int64_t n_indices, const IndexType* logical_index_vec, + IndexType* out_chunk_index_vec, IndexType chunk_hint, + IndexType* out_index_in_chunk_vec) { + auto* offsets = reinterpret_cast(signed_offsets); + const auto num_chunks = static_cast(num_offsets - 1); + // chunk_hint in [0, num_offsets) per the precondition. + for (int64_t i = 0; i < n_indices; i++) { + const auto index = static_cast(logical_index_vec[i]); + if (index >= offsets[chunk_hint] && + (chunk_hint == num_chunks || index < offsets[chunk_hint + 1])) { + out_chunk_index_vec[i] = chunk_hint; // hint is correct! 
+ continue; + } + // lo < hi is guaranteed by `num_offsets = chunks.size() + 1` + auto chunk_index = + ChunkResolver::Bisect(index, offsets, /*lo=*/0, /*hi=*/num_offsets); + chunk_hint = static_cast(chunk_index); + out_chunk_index_vec[i] = chunk_hint; + } + if (out_index_in_chunk_vec != NULLPTR) { + for (int64_t i = 0; i < n_indices; i++) { + auto logical_index = logical_index_vec[i]; + auto chunk_index = out_chunk_index_vec[i]; + // chunk_index is in [0, chunks.size()] no matter what the + // value of logical_index is, so it's always safe to dereference + // offset_ as it contains chunks.size()+1 values. + out_index_in_chunk_vec[i] = + logical_index - static_cast(offsets[chunk_index]); +#if defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER) + // Make it more likely that Valgrind/ASAN can catch an invalid memory + // access by poisoning out_index_in_chunk_vec[i] when the logical + // index is out-of-bounds. + if (chunk_index == num_chunks) { + out_index_in_chunk_vec[i] = std::numeric_limits::max(); + } +#endif + } + } +} + } // namespace ChunkResolver::ChunkResolver(const ArrayVector& chunks) noexcept @@ -84,5 +129,32 @@ ChunkResolver& ChunkResolver::operator=(const ChunkResolver& other) noexcept { return *this; } -} // namespace internal -} // namespace arrow +void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint8_t* logical_index_vec, + uint8_t* out_chunk_index_vec, uint8_t chunk_hint, + uint8_t* out_index_in_chunk_vec) const { + ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, + out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); +} + +void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint32_t* logical_index_vec, + uint32_t* out_chunk_index_vec, uint32_t chunk_hint, + uint32_t* out_index_in_chunk_vec) const { + ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, + out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); +} + +void 
ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint16_t* logical_index_vec, + uint16_t* out_chunk_index_vec, uint16_t chunk_hint, + uint16_t* out_index_in_chunk_vec) const { + ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, + out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); +} + +void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint64_t* logical_index_vec, + uint64_t* out_chunk_index_vec, uint64_t chunk_hint, + uint64_t* out_index_in_chunk_vec) const { + ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, + out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); +} + +} // namespace arrow::internal diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index c5dad1a17b18e..a2a3d5a864243 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include "arrow/type_fwd.h" @@ -27,6 +29,8 @@ namespace arrow::internal { +struct ChunkResolver; + struct ChunkLocation { /// \brief Index of the chunk in the array of chunks /// @@ -36,8 +40,17 @@ struct ChunkLocation { /// \brief Index of the value in the chunk /// - /// The value is undefined if chunk_index >= chunks.size() + /// The value is UNDEFINED if chunk_index >= chunks.size() int64_t index_in_chunk = 0; + + ChunkLocation() = default; + + ChunkLocation(int64_t chunk_index, int64_t index_in_chunk) + : chunk_index(chunk_index), index_in_chunk(index_in_chunk) {} + + bool operator==(ChunkLocation other) const { + return chunk_index == other.chunk_index && index_in_chunk == other.index_in_chunk; + } }; /// \brief An utility that incrementally resolves logical indices into @@ -60,12 +73,35 @@ struct ARROW_EXPORT ChunkResolver { explicit ChunkResolver(const std::vector& chunks) noexcept; explicit ChunkResolver(const RecordBatchVector& batches) noexcept; + /// \brief Construct a ChunkResolver from a vector of 
chunks.size() + 1 offsets. + /// + /// The first offset must be 0 and the last offset must be the logical length of the + /// chunked array. Each offset before the last represents the starting logical index of + /// the corresponding chunk. + explicit ChunkResolver(std::vector offsets) noexcept + : offsets_(std::move(offsets)), cached_chunk_(0) { +#ifndef NDEBUG + assert(offsets_.size() >= 1); + assert(offsets_[0] == 0); + for (size_t i = 1; i < offsets_.size(); i++) { + assert(offsets_[i] >= offsets_[i - 1]); + } +#endif + } + ChunkResolver(ChunkResolver&& other) noexcept; ChunkResolver& operator=(ChunkResolver&& other) noexcept; ChunkResolver(const ChunkResolver& other) noexcept; ChunkResolver& operator=(const ChunkResolver& other) noexcept; + int64_t logical_array_length() const { return offsets_.back(); } + int64_t num_chunks() const { return static_cast(offsets_.size()) - 1; } + + int64_t chunk_length(int64_t chunk_index) const { + return offsets_[chunk_index + 1] - offsets_[chunk_index]; + } + /// \brief Resolve a logical index to a ChunkLocation. /// /// The returned ChunkLocation contains the chunk index and the within-chunk index @@ -81,7 +117,7 @@ struct ARROW_EXPORT ChunkResolver { const auto cached_chunk = cached_chunk_.load(std::memory_order_relaxed); const auto chunk_index = ResolveChunkIndex(index, cached_chunk); - return {chunk_index, index - offsets_[chunk_index]}; + return ChunkLocation{chunk_index, index - offsets_[chunk_index]}; } /// \brief Resolve a logical index to a ChunkLocation. @@ -97,12 +133,70 @@ struct ARROW_EXPORT ChunkResolver { /// \return ChunkLocation with a valid chunk_index if index is within /// bounds, or with chunk_index == chunks.size() if logical index is /// `>= chunked_array.length()`. 
- inline ChunkLocation ResolveWithChunkIndexHint(int64_t index, - ChunkLocation hint) const { + inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) const { assert(hint.chunk_index < static_cast(offsets_.size())); const auto chunk_index = ResolveChunkIndex(index, hint.chunk_index); - return {chunk_index, index - offsets_[chunk_index]}; + return ChunkLocation{chunk_index, index - offsets_[chunk_index]}; + } + + /// \brief Resolve `n_indices` logical indices to chunk indices. + /// + /// \pre 0 <= logical_index_vec[i] < logical_array_length() + /// (for well-defined and valid chunk index results) + /// \pre out_chunk_index_vec has space for `n_indices` + /// \pre chunk_hint in [0, chunks.size()] + /// \post out_chunk_index_vec[i] in [0, chunks.size()] for i in [0, n) + /// \post if logical_index_vec[i] >= chunked_array.length(), then + /// out_chunk_index_vec[i] == chunks.size() + /// and out_index_in_chunk_vec[i] is UNDEFINED (can be out-of-bounds) + /// \post if logical_index_vec[i] < 0, then both out_chunk_index_vec[i] and + /// out_index_in_chunk_vec[i] are UNDEFINED + /// + /// \param n_indices The number of logical indices to resolve + /// \param logical_index_vec The logical indices to resolve + /// \param out_chunk_index_vec The output array where the chunk indices will be written + /// \param chunk_hint 0 or the last chunk_index produced by ResolveMany + /// \param out_index_in_chunk_vec If not NULLPTR, the output array where the + /// within-chunk indices will be written + /// \return false iff chunks.size() > std::numeric_limits::max() + template + [[nodiscard]] bool ResolveMany(int64_t n_indices, const IndexType* logical_index_vec, + IndexType* out_chunk_index_vec, IndexType chunk_hint = 0, + IndexType* out_index_in_chunk_vec = NULLPTR) const { + if constexpr (sizeof(IndexType) < sizeof(uint64_t)) { + // The max value returned by Bisect is `offsets.size() - 1` (= chunks.size()). 
+ constexpr uint64_t kMaxIndexTypeValue = std::numeric_limits::max(); + // A ChunkedArray with enough empty chunks can make the index of a chunk + // exceed the logical index and thus the maximum value of IndexType. + const bool chunk_index_fits_on_type = + static_cast(offsets_.size() - 1) <= kMaxIndexTypeValue; + if (ARROW_PREDICT_FALSE(!chunk_index_fits_on_type)) { + return false; + } + // Since an index-in-chunk cannot possibly exceed the logical index being + // queried, we don't have to worry about these values not fitting on IndexType. + } + if constexpr (std::is_signed_v) { + // We interpret signed integers as unsigned and avoid having to generate double + // the amount of binary code to handle each integer width. + // + // Negative logical indices can become large values when cast to unsigned, and + // they are gracefully handled by ResolveManyImpl, but both the chunk index + // and the index in chunk values will be undefined in these cases. This + // happens because int8_t(-1) == uint8_t(255) and 255 could be a valid + // logical index in the chunked array. 
+ using U = std::make_unsigned_t; + ResolveManyImpl(n_indices, reinterpret_cast(logical_index_vec), + reinterpret_cast(out_chunk_index_vec), + static_cast(chunk_hint), + reinterpret_cast(out_index_in_chunk_vec)); + } else { + static_assert(std::is_unsigned_v); + ResolveManyImpl(n_indices, logical_index_vec, out_chunk_index_vec, chunk_hint, + out_index_in_chunk_vec); + } + return true; } private: @@ -130,17 +224,33 @@ struct ARROW_EXPORT ChunkResolver { return chunk_index; } + /// \pre all the pre-conditions of ChunkResolver::ResolveMany() + /// \pre num_offsets - 1 <= std::numeric_limits::max() + void ResolveManyImpl(int64_t, const uint8_t*, uint8_t*, uint8_t, uint8_t*) const; + void ResolveManyImpl(int64_t, const uint16_t*, uint16_t*, uint16_t, uint16_t*) const; + void ResolveManyImpl(int64_t, const uint32_t*, uint32_t*, uint32_t, uint32_t*) const; + void ResolveManyImpl(int64_t, const uint64_t*, uint64_t*, uint64_t, uint64_t*) const; + + public: /// \brief Find the index of the chunk that contains the logical index. /// /// Any non-negative index is accepted. When `hi=num_offsets`, the largest /// possible return value is `num_offsets-1` which is equal to - /// `chunks.size()`. The is returned when the logical index is out-of-bounds. + /// `chunks.size()`. That value is returned when the logical index is greater + /// than or equal to the logical length of the chunked array. /// - /// \pre index >= 0 + /// \pre index >= 0 (otherwise, when index is negative, hi-1 is returned) /// \pre lo < hi /// \pre lo >= 0 && hi <= offsets_.size() static inline int64_t Bisect(int64_t index, const int64_t* offsets, int64_t lo, int64_t hi) { + return Bisect(static_cast(index), + reinterpret_cast(offsets), static_cast(lo), + static_cast(hi)); + } + + static inline int64_t Bisect(uint64_t index, const uint64_t* offsets, uint64_t lo, + uint64_t hi) { // Similar to std::upper_bound(), but slightly different as our offsets // array always starts with 0. 
auto n = hi - lo; @@ -148,8 +258,8 @@ struct ARROW_EXPORT ChunkResolver { // (lo < hi is guaranteed by the precondition). assert(n > 1 && "lo < hi is a precondition of Bisect"); do { - const int64_t m = n >> 1; - const int64_t mid = lo + m; + const uint64_t m = n >> 1; + const uint64_t mid = lo + m; if (index >= offsets[mid]) { lo = mid; n -= m; diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index 6ca52ab46ca68..e9cc283b53cd5 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -23,6 +23,7 @@ #include #include +#include "arrow/chunk_resolver.h" #include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/testing/builder.h" @@ -34,6 +35,9 @@ namespace arrow { +using internal::ChunkLocation; +using internal::ChunkResolver; + class TestChunkedArray : public ::testing::Test { protected: virtual void Construct() { @@ -310,4 +314,200 @@ TEST_F(TestChunkedArray, GetScalar) { ASSERT_RAISES(IndexError, carr.GetScalar(7)); } +// ChunkResolver tests + +using IndexTypes = ::testing::Types; + +TEST(TestChunkResolver, Resolve) { + ChunkResolver empty(std::vector({0})); // [] + // ChunkLocation::index_in_chunk is undefined when chunk_index==chunks.size(), + // so only chunk_index is compared in these cases. 
+ ASSERT_EQ(empty.Resolve(0).chunk_index, 0); + ASSERT_EQ(empty.Resolve(0).chunk_index, 0); + + ChunkResolver one(std::vector({0, 1})); // [[0]] + ASSERT_EQ(one.Resolve(1).chunk_index, 1); + ASSERT_EQ(one.Resolve(0), (ChunkLocation(0, 0))); + ASSERT_EQ(one.Resolve(1).chunk_index, 1); + + ChunkResolver one_and_empty(std::vector({0, 1, 1, 1})); // [[0], [], []] + ASSERT_EQ(one_and_empty.Resolve(3).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(2).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(1).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(0), (ChunkLocation(0, 0))); + ASSERT_EQ(one_and_empty.Resolve(1).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(2).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(3).chunk_index, 3); + + ChunkResolver one_one_one(std::vector({0, 1, 2, 3})); // [[0], [1], [2]] + ASSERT_EQ(one_one_one.Resolve(3).chunk_index, 3); + ASSERT_EQ(one_one_one.Resolve(2), (ChunkLocation(2, 0))); + ASSERT_EQ(one_one_one.Resolve(1), (ChunkLocation(1, 0))); + ASSERT_EQ(one_one_one.Resolve(0), (ChunkLocation(0, 0))); + ASSERT_EQ(one_one_one.Resolve(1), (ChunkLocation(1, 0))); + ASSERT_EQ(one_one_one.Resolve(2), (ChunkLocation(2, 0))); + ASSERT_EQ(one_one_one.Resolve(3).chunk_index, 3); + + ChunkResolver resolver(std::vector({0, 2, 3, 10})); // [[0, 1], [2], [3..9]] + ASSERT_EQ(resolver.Resolve(10).chunk_index, 3); + ASSERT_EQ(resolver.Resolve(9), (ChunkLocation(2, 6))); + ASSERT_EQ(resolver.Resolve(8), (ChunkLocation(2, 5))); + ASSERT_EQ(resolver.Resolve(4), (ChunkLocation(2, 1))); + ASSERT_EQ(resolver.Resolve(3), (ChunkLocation(2, 0))); + ASSERT_EQ(resolver.Resolve(2), (ChunkLocation(1, 0))); + ASSERT_EQ(resolver.Resolve(1), (ChunkLocation(0, 1))); + ASSERT_EQ(resolver.Resolve(0), (ChunkLocation(0, 0))); + ASSERT_EQ(resolver.Resolve(1), (ChunkLocation(0, 1))); + ASSERT_EQ(resolver.Resolve(2), (ChunkLocation(1, 0))); + ASSERT_EQ(resolver.Resolve(3), (ChunkLocation(2, 0))); + ASSERT_EQ(resolver.Resolve(4), (ChunkLocation(2, 1))); + 
ASSERT_EQ(resolver.Resolve(8), (ChunkLocation(2, 5))); + ASSERT_EQ(resolver.Resolve(9), (ChunkLocation(2, 6))); + ASSERT_EQ(resolver.Resolve(10).chunk_index, 3); +} + +template +class TestChunkResolverMany : public ::testing::Test { + public: + using IndexType = T; + + Result> ResolveMany( + const ChunkResolver& resolver, const std::vector& logical_index_vec) { + const size_t n = logical_index_vec.size(); + std::vector chunk_index_vec; + chunk_index_vec.resize(n); + std::vector index_in_chunk_vec; + index_in_chunk_vec.resize(n); + bool valid = resolver.ResolveMany( + static_cast(n), logical_index_vec.data(), chunk_index_vec.data(), 0, + index_in_chunk_vec.data()); + if (ARROW_PREDICT_FALSE(!valid)) { + return Status::Invalid("index type doesn't fit possible chunk indexes"); + } + std::vector locations; + locations.reserve(n); + for (size_t i = 0; i < n; i++) { + auto chunk_index = static_cast(chunk_index_vec[i]); + auto index_in_chunk = static_cast(index_in_chunk_vec[i]); + locations.emplace_back(chunk_index, index_in_chunk); + } + return locations; + } + + void CheckResolveMany(const ChunkResolver& resolver, + const std::vector& logical_index_vec) { + ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec)); + EXPECT_EQ(logical_index_vec.size(), locations.size()); + for (size_t i = 0; i < logical_index_vec.size(); i++) { + IndexType logical_index = logical_index_vec[i]; + const auto expected = resolver.Resolve(logical_index); + ASSERT_LE(expected.chunk_index, resolver.num_chunks()); + if (expected.chunk_index == resolver.num_chunks()) { + // index_in_chunk is undefined in this case + ASSERT_EQ(locations[i].chunk_index, expected.chunk_index); + } else { + ASSERT_EQ(locations[i], expected); + } + } + } + + void TestBasics() { + std::vector logical_index_vec; + + ChunkResolver empty(std::vector({0})); // [] + logical_index_vec = {0, 0}; + CheckResolveMany(empty, logical_index_vec); + + ChunkResolver one(std::vector({0, 1})); // [[0]] + 
logical_index_vec = {1, 0, 1}; + CheckResolveMany(one, logical_index_vec); + + ChunkResolver one_and_empty(std::vector({0, 1, 1, 1})); // [[0], [], []] + logical_index_vec = {3, 2, 1, 0, 1, 2, 3}; + CheckResolveMany(one_and_empty, logical_index_vec); + + ChunkResolver one_one_one(std::vector({0, 1, 2, 3})); // [[0], [1], [2]] + logical_index_vec = {3, 2, 1, 0, 1, 2, 3}; + CheckResolveMany(one_one_one, logical_index_vec); + + ChunkResolver resolver(std::vector({0, 2, 3, 10})); // [[0, 1], [2], [3..9]] + logical_index_vec = {10, 9, 8, 4, 3, 2, 1, 0, 1, 2, 3, 4, 8, 9, 10}; + CheckResolveMany(resolver, logical_index_vec); + } + + void TestOutOfBounds() { + ChunkResolver resolver(std::vector({0, 2, 3, 10})); // [[0, 1], [2], [3..9]] + + std::vector logical_index_vec = {10, 11, 12, 13, 14, 13, 11, 10}; + ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec)); + EXPECT_EQ(logical_index_vec.size(), locations.size()); + for (size_t i = 0; i < logical_index_vec.size(); i++) { + ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks()); + } + + if constexpr (std::is_signed_v) { + std::vector logical_index_vec = {-1, -2, -3, -4, INT8_MIN}; + + ChunkResolver resolver(std::vector({0, 2, 128})); // [[0, 1], [2..127]] + ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec)); + EXPECT_EQ(logical_index_vec.size(), locations.size()); + for (size_t i = 0; i < logical_index_vec.size(); i++) { + // All the negative indices are greater than resolver.logical_array_length()-1 + // when cast to uint8_t. 
+ ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks()); + } + + if constexpr (sizeof(IndexType) == 1) { + ChunkResolver resolver(std::vector( + {0, 2, 128, 129, 256})); // [[0, 1], [2..127], [128], [129..255]] + ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec)); + EXPECT_EQ(logical_index_vec.size(), locations.size()); + for (size_t i = 0; i < logical_index_vec.size(); i++) { + if constexpr (sizeof(IndexType) == 1) { + // All the negative 8-bit indices are SMALLER than + // resolver.logical_array_length()=256 when cast to 8-bit unsigned integers. + // So the resolved locations might look valid, but they should not be trusted. + ASSERT_LT(locations[i].chunk_index, resolver.num_chunks()); + } else { + // All the negative indices are greater than resolver.logical_array_length() + // when cast to 16/32/64-bit unsigned integers. + ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks()); + } + } + } + } + } + + void TestOverflow() { + const int64_t kMaxIndex = std::is_signed_v ? 127 : 255; + std::vector logical_index_vec = {0, 1, 2, + static_cast(kMaxIndex)}; + + // Overflows are rare because to make them possible, we need more chunks + // than logical elements in the ChunkedArray. That requires at least one + // empty chunk. 
+ std::vector offsets; + for (int64_t i = 0; i <= kMaxIndex; i++) { + offsets.push_back(i); + } + ChunkResolver resolver{offsets}; + ASSERT_OK(ResolveMany(resolver, logical_index_vec)); + + offsets.push_back(kMaxIndex); // adding an empty chunk + ChunkResolver resolver_with_empty{offsets}; + if (sizeof(IndexType) == 1) { + ASSERT_NOT_OK(ResolveMany(resolver_with_empty, logical_index_vec)); + } else { + ASSERT_OK(ResolveMany(resolver_with_empty, logical_index_vec)); + } + } +}; + +TYPED_TEST_SUITE(TestChunkResolverMany, IndexTypes); + +TYPED_TEST(TestChunkResolverMany, Basics) { this->TestBasics(); } +TYPED_TEST(TestChunkResolverMany, OutOfBounds) { this->TestOutOfBounds(); } +TYPED_TEST(TestChunkResolverMany, Overflow) { this->TestOverflow(); } + } // namespace arrow diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index bb632e2eb912d..e983b47e39dc4 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -44,6 +44,7 @@ #include "arrow/util/bitmap_ops.h" #include "arrow/util/bitmap_reader.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/float16.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" @@ -59,6 +60,7 @@ using internal::BitmapReader; using internal::BitmapUInt64Reader; using internal::checked_cast; using internal::OptionalBitmapEquals; +using util::Float16; // ---------------------------------------------------------------------- // Public method implementations @@ -95,6 +97,30 @@ struct FloatingEquality { const T epsilon; }; +// For half-float equality. 
+template +struct FloatingEquality { + explicit FloatingEquality(const EqualOptions& options) + : epsilon(static_cast(options.atol())) {} + + bool operator()(uint16_t x, uint16_t y) const { + Float16 f_x = Float16::FromBits(x); + Float16 f_y = Float16::FromBits(y); + if (x == y) { + return Flags::signed_zeros_equal || (f_x.signbit() == f_y.signbit()); + } + if (Flags::nans_equal && f_x.is_nan() && f_y.is_nan()) { + return true; + } + if (Flags::approximate && (fabs(f_x.ToFloat() - f_y.ToFloat()) <= epsilon)) { + return true; + } + return false; + } + + const float epsilon; +}; + template struct FloatingEqualityDispatcher { const EqualOptions& options; @@ -259,6 +285,8 @@ class RangeDataEqualsImpl { Status Visit(const DoubleType& type) { return CompareFloating(type); } + Status Visit(const HalfFloatType& type) { return CompareFloating(type); } + // Also matches StringType Status Visit(const BinaryType& type) { return CompareBinary(type); } @@ -863,6 +891,8 @@ class ScalarEqualsVisitor { Status Visit(const DoubleScalar& left) { return CompareFloating(left); } + Status Visit(const HalfFloatScalar& left) { return CompareFloating(left); } + template enable_if_t::value, Status> Visit(const T& left) { const auto& right = checked_cast(right_); diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index badcf4f2f26ac..0a8018cd580cf 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -90,7 +90,9 @@ add_arrow_test(internals_test light_array_test.cc registry_test.cc key_hash_test.cc - row/compare_test.cc) + row/compare_test.cc + row/grouper_test.cc + util_internal_test.cc) add_arrow_compute_test(expression_test SOURCES expression_test.cc) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index eaec940556361..7c3bc46650e9f 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -341,7 +341,8 @@ static auto 
kMatchSubstringOptionsType = GetFunctionOptionsType( DataMember("nan_is_null", &NullOptions::nan_is_null)); static auto kPadOptionsType = GetFunctionOptionsType( - DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding)); + DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding), + DataMember("lean_left_on_odd_padding", &PadOptions::lean_left_on_odd_padding)); static auto kReplaceSliceOptionsType = GetFunctionOptionsType( DataMember("start", &ReplaceSliceOptions::start), DataMember("stop", &ReplaceSliceOptions::stop), @@ -480,10 +481,11 @@ NullOptions::NullOptions(bool nan_is_null) : FunctionOptions(internal::kNullOptionsType), nan_is_null(nan_is_null) {} constexpr char NullOptions::kTypeName[]; -PadOptions::PadOptions(int64_t width, std::string padding) +PadOptions::PadOptions(int64_t width, std::string padding, bool lean_left_on_odd_padding) : FunctionOptions(internal::kPadOptionsType), width(width), - padding(std::move(padding)) {} + padding(std::move(padding)), + lean_left_on_odd_padding(lean_left_on_odd_padding) {} PadOptions::PadOptions() : PadOptions(0, " ") {} constexpr char PadOptions::kTypeName[]; diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index bad34f4a37881..947474e5962d0 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -358,7 +358,8 @@ class ARROW_EXPORT StrftimeOptions : public FunctionOptions { class ARROW_EXPORT PadOptions : public FunctionOptions { public: - explicit PadOptions(int64_t width, std::string padding = " "); + explicit PadOptions(int64_t width, std::string padding = " ", + bool lean_left_on_odd_padding = true); PadOptions(); static constexpr char const kTypeName[] = "PadOptions"; @@ -366,6 +367,10 @@ class ARROW_EXPORT PadOptions : public FunctionOptions { int64_t width; /// What to pad the string with. Should be one codepoint (Unicode)/byte (ASCII). 
std::string padding; + /// What to do if there is an odd number of padding characters (in case of centered + /// padding). Defaults to aligning on the left (i.e. adding the extra padding character + /// on the right) + bool lean_left_on_odd_padding = true; }; class ARROW_EXPORT TrimOptions : public FunctionOptions { diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index d47ee42ebf239..f0d5c0fcc3d72 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -153,6 +153,8 @@ static auto kRankOptionsType = GetFunctionOptionsType( DataMember("tiebreaker", &RankOptions::tiebreaker)); static auto kPairwiseOptionsType = GetFunctionOptionsType( DataMember("periods", &PairwiseOptions::periods)); +static auto kListFlattenOptionsType = GetFunctionOptionsType( + DataMember("recursive", &ListFlattenOptions::recursive)); } // namespace } // namespace internal @@ -224,6 +226,10 @@ PairwiseOptions::PairwiseOptions(int64_t periods) : FunctionOptions(internal::kPairwiseOptionsType), periods(periods) {} constexpr char PairwiseOptions::kTypeName[]; +ListFlattenOptions::ListFlattenOptions(bool recursive) + : FunctionOptions(internal::kListFlattenOptionsType), recursive(recursive) {} +constexpr char ListFlattenOptions::kTypeName[]; + namespace internal { void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kFilterOptionsType)); @@ -237,6 +243,7 @@ void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kCumulativeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kRankOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPairwiseOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kListFlattenOptionsType)); } } // namespace internal diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 919572f16ee69..e5bcc37329661 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ 
b/cpp/src/arrow/compute/api_vector.h @@ -245,6 +245,18 @@ class ARROW_EXPORT PairwiseOptions : public FunctionOptions { int64_t periods = 1; }; +/// \brief Options for list_flatten function +class ARROW_EXPORT ListFlattenOptions : public FunctionOptions { + public: + explicit ListFlattenOptions(bool recursive = false); + static constexpr char const kTypeName[] = "ListFlattenOptions"; + static ListFlattenOptions Defaults() { return ListFlattenOptions(); } + + /// \brief If true, the list is flattened recursively until a non-list + /// array is formed. + bool recursive = false; +}; + /// @} /// \brief Filter with a boolean selection filter diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 28dcf493fa294..05c4936482b0b 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -480,7 +480,7 @@ struct NullGeneralization { if (dtype_id == Type::NA) { return ALL_NULL; } - if (!arrow::internal::HasValidityBitmap(dtype_id)) { + if (!arrow::internal::may_have_validity_bitmap(dtype_id)) { return ALL_VALID; } if (value.is_scalar()) { @@ -923,7 +923,7 @@ class ScalarExecutor : public KernelExecutorImpl { DCHECK(output.is_array_data()); // Emit a result for each chunk - RETURN_NOT_OK(EmitResult(std::move(output.array_data()), listener)); + RETURN_NOT_OK(EmitResult(output.array_data(), listener)); } return Status::OK(); } @@ -1107,7 +1107,7 @@ class VectorExecutor : public KernelExecutorImpl { RETURN_NOT_OK(PropagateNulls(kernel_ctx_, span, out.array_data().get())); } RETURN_NOT_OK(kernel_->exec(kernel_ctx_, span, &out)); - return EmitResult(std::move(out.array_data()), listener); + return EmitResult(out.array_data(), listener); } Status ExecChunked(const ExecBatch& batch, ExecListener* listener) { @@ -1116,10 +1116,10 @@ class VectorExecutor : public KernelExecutorImpl { ARROW_ASSIGN_OR_RAISE(out.value, PrepareOutput(batch.length)); RETURN_NOT_OK(kernel_->exec_chunked(kernel_ctx_, batch, &out)); if (out.is_array()) { - return 
EmitResult(std::move(out.array()), listener); + return EmitResult(out.array(), listener); } else { DCHECK(out.is_chunked_array()); - return EmitResult(std::move(out.chunked_array()), listener); + return EmitResult(out.chunked_array(), listener); } } diff --git a/cpp/src/arrow/compute/expression.cc b/cpp/src/arrow/compute/expression.cc index 532869b3453a7..33e5928c2865d 100644 --- a/cpp/src/arrow/compute/expression.cc +++ b/cpp/src/arrow/compute/expression.cc @@ -763,9 +763,7 @@ Result ExecuteScalarExpression(const Expression& expr, const ExecBatch& i for (size_t i = 0; i < arguments.size(); ++i) { ARROW_ASSIGN_OR_RAISE( arguments[i], ExecuteScalarExpression(call->arguments[i], input, exec_context)); - if (arguments[i].is_array()) { - all_scalar = false; - } + all_scalar &= arguments[i].is_scalar(); } int64_t input_length; @@ -1645,7 +1643,7 @@ Expression and_(const std::vector& operands) { Expression folded = operands.front(); for (auto it = operands.begin() + 1; it != operands.end(); ++it) { - folded = and_(std::move(folded), std::move(*it)); + folded = and_(std::move(folded), *it); } return folded; } @@ -1659,7 +1657,7 @@ Expression or_(const std::vector& operands) { Expression folded = operands.front(); for (auto it = operands.begin() + 1; it != operands.end(); ++it) { - folded = or_(std::move(folded), std::move(*it)); + folded = or_(std::move(folded), *it); } return folded; } diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc index 30bd882b2c039..d94a17b6ffadf 100644 --- a/cpp/src/arrow/compute/expression_test.cc +++ b/cpp/src/arrow/compute/expression_test.cc @@ -909,6 +909,41 @@ TEST(Expression, ExecuteCallWithNoArguments) { EXPECT_EQ(actual.length(), kCount); } +TEST(Expression, ExecuteChunkedArray) { + // GH-41923: compute should generate the right result if input + // ExecBatch is `chunked_array`. 
+ auto input_schema = struct_({field("a", struct_({ + field("a", float64()), + field("b", float64()), + }))}); + + auto chunked_array_input = ChunkedArrayFromJSON(input_schema, {R"([ + {"a": {"a": 6.125, "b": 3.375}}, + {"a": {"a": 0.0, "b": 1}} + ])", + R"([ + {"a": {"a": -1, "b": 4.75}} + ])"}); + + ASSERT_OK_AND_ASSIGN(auto table_input, + Table::FromChunkedStructArray(chunked_array_input)); + + auto expr = add(field_ref(FieldRef("a", "a")), field_ref(FieldRef("a", "b"))); + + ASSERT_OK_AND_ASSIGN(expr, expr.Bind(input_schema)); + std::vector inputs{table_input->column(0)}; + ExecBatch batch{inputs, 3}; + + ASSERT_OK_AND_ASSIGN(Datum res, ExecuteScalarExpression(expr, batch)); + + AssertDatumsEqual(res, ArrayFromJSON(float64(), + R"([ + 9.5, + 1, + 3.75 + ])")); +} + TEST(Expression, ExecuteDictionaryTransparent) { ExpectExecute( equal(field_ref("a"), field_ref("b")), diff --git a/cpp/src/arrow/compute/function_internal.h b/cpp/src/arrow/compute/function_internal.h index 653273ef0fac2..9d8928466baa5 100644 --- a/cpp/src/arrow/compute/function_internal.h +++ b/cpp/src/arrow/compute/function_internal.h @@ -684,12 +684,13 @@ const FunctionOptionsType* GetFunctionOptionsType(const Properties&... 
propertie auto options = std::make_unique(); RETURN_NOT_OK( FromStructScalarImpl(options.get(), scalar, properties_).status_); - return std::move(options); + // R build with openSUSE155 requires an explicit unique_ptr construction + return std::unique_ptr(std::move(options)); } std::unique_ptr Copy(const FunctionOptions& options) const override { auto out = std::make_unique(); CopyImpl(out.get(), checked_cast(options), properties_); - return std::move(out); + return out; } private: diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index 66d38ecd64d49..c269de0763217 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -102,6 +102,7 @@ TEST(FunctionOptions, Equality) { #endif options.emplace_back(new PadOptions(5, " ")); options.emplace_back(new PadOptions(10, "A")); + options.emplace_back(new PadOptions(10, "A", false)); options.emplace_back(new TrimOptions(" ")); options.emplace_back(new TrimOptions("abc")); options.emplace_back(new SliceOptions(/*start=*/1)); diff --git a/cpp/src/arrow/compute/kernel.cc b/cpp/src/arrow/compute/kernel.cc index fd554ba3d83c5..5c87ef2cd0561 100644 --- a/cpp/src/arrow/compute/kernel.cc +++ b/cpp/src/arrow/compute/kernel.cc @@ -75,7 +75,7 @@ Result> ScalarAggregateKernel::MergeAll( for (auto& state : states) { RETURN_NOT_OK(kernel->merge(ctx, std::move(*state), out.get())); } - return std::move(out); + return out; } // ---------------------------------------------------------------------- @@ -361,7 +361,8 @@ size_t InputType::Hash() const { case InputType::EXACT_TYPE: hash_combine(result, type_->Hash()); break; - default: + case InputType::ANY_TYPE: + case InputType::USE_TYPE_MATCHER: break; } return result; @@ -378,10 +379,8 @@ std::string InputType::ToString() const { break; case InputType::USE_TYPE_MATCHER: { ss << type_matcher_->ToString(); - } break; - default: - DCHECK(false); break; + } } return ss.str(); } @@ -400,9 +399,8 @@ bool 
InputType::Equals(const InputType& other) const { return type_->Equals(*other.type_); case InputType::USE_TYPE_MATCHER: return type_matcher_->Equals(*other.type_matcher_); - default: - return false; } + return false; } bool InputType::Matches(const DataType& type) const { @@ -411,21 +409,23 @@ bool InputType::Matches(const DataType& type) const { return type_->Equals(type); case InputType::USE_TYPE_MATCHER: return type_matcher_->Matches(type); - default: - // ANY_TYPE + case InputType::ANY_TYPE: return true; } + return false; } bool InputType::Matches(const Datum& value) const { switch (value.kind()) { + case Datum::NONE: + case Datum::RECORD_BATCH: + case Datum::TABLE: + DCHECK(false) << "Matches expects ARRAY, CHUNKED_ARRAY or SCALAR"; + return false; case Datum::ARRAY: case Datum::CHUNKED_ARRAY: case Datum::SCALAR: break; - default: - DCHECK(false); - return false; } return Matches(*value.type()); } @@ -445,11 +445,13 @@ const TypeMatcher& InputType::type_matcher() const { Result OutputType::Resolve(KernelContext* ctx, const std::vector& types) const { - if (kind_ == OutputType::FIXED) { - return type_.get(); - } else { - return resolver_(ctx, types); + switch (kind_) { + case OutputType::FIXED: + return type_; + case OutputType::COMPUTED: + break; } + return resolver_(ctx, types); } const std::shared_ptr& OutputType::type() const { @@ -463,11 +465,13 @@ const OutputType::Resolver& OutputType::resolver() const { } std::string OutputType::ToString() const { - if (kind_ == OutputType::FIXED) { - return type_->ToString(); - } else { - return "computed"; + switch (kind_) { + case OutputType::FIXED: + return type_->ToString(); + case OutputType::COMPUTED: + break; } + return "computed"; } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index afb30996eac15..7c7b9c8b68d45 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ 
b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -81,6 +81,7 @@ add_arrow_benchmark(scalar_boolean_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_cast_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_compare_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_if_else_benchmark PREFIX "arrow-compute") +add_arrow_benchmark(scalar_list_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_random_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_round_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_set_lookup_benchmark PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index 00a833742f957..0fd9cae7a8d71 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -23,6 +23,7 @@ #include #include +#include "arrow/compute/api_vector.h" #include "arrow/type_fwd.h" namespace arrow { @@ -56,9 +57,23 @@ Result LastType(KernelContext*, const std::vector& types return types.back(); } -Result ListValuesType(KernelContext*, const std::vector& args) { - const auto& list_type = checked_cast(*args[0].type); - return list_type.value_type().get(); +Result ListValuesType(KernelContext* ctx, + const std::vector& args) { + auto list_type = checked_cast(args[0].type); + auto value_type = list_type->value_type().get(); + + auto recursive = + ctx->state() ? 
OptionsWrapper::Get(ctx).recursive : false; + if (!recursive) { + return value_type; + } + + for (auto value_kind = value_type->id(); + is_list(value_kind) || is_list_view(value_kind); value_kind = value_type->id()) { + list_type = checked_cast(list_type->value_type().get()); + value_type = list_type->value_type().get(); + } + return value_type; } void EnsureDictionaryDecoded(std::vector* types) { diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 72b29057b82e0..9e46a21887f8c 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -369,43 +369,6 @@ struct UnboxScalar { } }; -template -struct BoxScalar; - -template -struct BoxScalar> { - using T = typename GetOutputType::T; - static void Box(T val, Scalar* out) { - // Enables BoxScalar to work on a (for example) Time64Scalar - T* mutable_data = reinterpret_cast( - checked_cast<::arrow::internal::PrimitiveScalarBase*>(out)->mutable_data()); - *mutable_data = val; - } -}; - -template -struct BoxScalar> { - using T = typename GetOutputType::T; - using ScalarType = typename TypeTraits::ScalarType; - static void Box(T val, Scalar* out) { - checked_cast(out)->value = std::make_shared(val); - } -}; - -template <> -struct BoxScalar { - using T = Decimal128; - using ScalarType = Decimal128Scalar; - static void Box(T val, Scalar* out) { checked_cast(out)->value = val; } -}; - -template <> -struct BoxScalar { - using T = Decimal256; - using ScalarType = Decimal256Scalar; - static void Box(T val, Scalar* out) { checked_cast(out)->value = val; } -}; - // A VisitArraySpanInline variant that calls its visitor function with logical // values, such as Decimal128 rather than std::string_view. 
@@ -460,7 +423,8 @@ static void VisitTwoArrayValuesInline(const ArraySpan& arr0, const ArraySpan& ar Result FirstType(KernelContext*, const std::vector& types); Result LastType(KernelContext*, const std::vector& types); -Result ListValuesType(KernelContext*, const std::vector& types); +Result ListValuesType(KernelContext* ctx, + const std::vector& types); // ---------------------------------------------------------------------- // Helpers for iterating over common DataType instances for adding kernels to diff --git a/cpp/src/arrow/compute/kernels/gather_internal.h b/cpp/src/arrow/compute/kernels/gather_internal.h new file mode 100644 index 0000000000000..4c161533a7277 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/gather_internal.h @@ -0,0 +1,306 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/array/data.h" +#include "arrow/util/bit_block_counter.h" +#include "arrow/util/bit_run_reader.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" +#include "arrow/util/macros.h" + +// Implementation helpers for kernels that need to load/gather fixed-width +// data from multiple, arbitrary indices. 
+// +// https://en.wikipedia.org/wiki/Gather/scatter_(vector_addressing) + +namespace arrow::internal { + +// CRTP [1] base class for Gather that provides a gathering loop in terms of +// Write*() methods that must be implemented by the derived class. +// +// [1] https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern +template +class GatherBaseCRTP { + public: + // Output offset is not supported by Gather and idx is supposed to have offset + // pre-applied. idx_validity parameters on functions can use the offset they + // carry to read the validity bitmap as bitmaps can't have pre-applied offsets + // (they might not align to byte boundaries). + + GatherBaseCRTP() = default; + ARROW_DISALLOW_COPY_AND_ASSIGN(GatherBaseCRTP); + ARROW_DEFAULT_MOVE_AND_ASSIGN(GatherBaseCRTP); + + protected: + ARROW_FORCE_INLINE int64_t ExecuteNoNulls(int64_t idx_length) { + auto* self = static_cast(this); + for (int64_t position = 0; position < idx_length; position++) { + self->WriteValue(position); + } + return idx_length; + } + + // See derived Gather classes below for the meaning of the parameters, pre and + // post-conditions. + // + // src_validity is not necessarily the source of the values that are being + // gathered (e.g. the source could be a nested fixed-size list array and the + // values being gathered are from the innermost buffer), so the ArraySpan is + // used solely to check for nulls in the source values and nothing else. + // + // idx_length is the number of elements in idx and consequently the number of + // bits that might be written to out_is_valid. Member `Write*()` functions will be + // called with positions from 0 to idx_length - 1. + // + // If `kOutputIsZeroInitialized` is true, then `WriteZero()` or `WriteZeroSegment()` + // doesn't have to be called for resulting null positions. A position is + // considered null if either the index or the source value is null at that + // position. 
+ template + ARROW_FORCE_INLINE int64_t ExecuteWithNulls(const ArraySpan& src_validity, + int64_t idx_length, const IndexCType* idx, + const ArraySpan& idx_validity, + uint8_t* out_is_valid) { + auto* self = static_cast(this); + OptionalBitBlockCounter indices_bit_counter(idx_validity.buffers[0].data, + idx_validity.offset, idx_length); + int64_t position = 0; + int64_t valid_count = 0; + while (position < idx_length) { + BitBlockCount block = indices_bit_counter.NextBlock(); + if (!src_validity.MayHaveNulls()) { + // Source values are never null, so things are easier + valid_count += block.popcount; + if (block.popcount == block.length) { + // Fastest path: neither source values nor index nulls + bit_util::SetBitsTo(out_is_valid, position, block.length, true); + for (int64_t i = 0; i < block.length; ++i) { + self->WriteValue(position); + ++position; + } + } else if (block.popcount > 0) { + // Slow path: some indices but not all are null + for (int64_t i = 0; i < block.length; ++i) { + ARROW_COMPILER_ASSUME(idx_validity.buffers[0].data != nullptr); + if (idx_validity.IsValid(position)) { + // index is not null + bit_util::SetBit(out_is_valid, position); + self->WriteValue(position); + } else if constexpr (!kOutputIsZeroInitialized) { + self->WriteZero(position); + } + ++position; + } + } else { + self->WriteZeroSegment(position, block.length); + position += block.length; + } + } else { + // Source values may be null, so we must do random access into src_validity + if (block.popcount == block.length) { + // Faster path: indices are not null but source values may be + for (int64_t i = 0; i < block.length; ++i) { + ARROW_COMPILER_ASSUME(src_validity.buffers[0].data != nullptr); + if (src_validity.IsValid(idx[position])) { + // value is not null + self->WriteValue(position); + bit_util::SetBit(out_is_valid, position); + ++valid_count; + } else if constexpr (!kOutputIsZeroInitialized) { + self->WriteZero(position); + } + ++position; + } + } else if (block.popcount > 0) 
{ + // Slow path: some but not all indices are null. Since we are doing + // random access in general we have to check the value nullness one by + // one. + for (int64_t i = 0; i < block.length; ++i) { + ARROW_COMPILER_ASSUME(src_validity.buffers[0].data != nullptr); + ARROW_COMPILER_ASSUME(idx_validity.buffers[0].data != nullptr); + if (idx_validity.IsValid(position) && src_validity.IsValid(idx[position])) { + // index is not null && value is not null + self->WriteValue(position); + bit_util::SetBit(out_is_valid, position); + ++valid_count; + } else if constexpr (!kOutputIsZeroInitialized) { + self->WriteZero(position); + } + ++position; + } + } else { + if constexpr (!kOutputIsZeroInitialized) { + self->WriteZeroSegment(position, block.length); + } + position += block.length; + } + } + } + return valid_count; + } +}; + +// A gather primitive for primitive fixed-width types with a integral byte width. If +// `kWithFactor` is true, the actual width is a runtime multiple of `kValueWidthInbits` +// (this can be useful for fixed-size list inputs and other input types with unusual byte +// widths that don't deserve value specialization). 
+template +class Gather : public GatherBaseCRTP> { + public: + static_assert(kValueWidthInBits >= 0 && kValueWidthInBits % 8 == 0); + static constexpr int kValueWidth = kValueWidthInBits / 8; + + private: + const int64_t src_length_; // number of elements of kValueWidth bytes in src_ + const uint8_t* src_; + const int64_t idx_length_; // number IndexCType elements in idx_ + const IndexCType* idx_; + uint8_t* out_; + int64_t factor_; + + public: + void WriteValue(int64_t position) { + if constexpr (kWithFactor) { + const int64_t scaled_factor = kValueWidth * factor_; + memcpy(out_ + position * scaled_factor, src_ + idx_[position] * scaled_factor, + scaled_factor); + } else { + memcpy(out_ + position * kValueWidth, src_ + idx_[position] * kValueWidth, + kValueWidth); + } + } + + void WriteZero(int64_t position) { + if constexpr (kWithFactor) { + const int64_t scaled_factor = kValueWidth * factor_; + memset(out_ + position * scaled_factor, 0, scaled_factor); + } else { + memset(out_ + position * kValueWidth, 0, kValueWidth); + } + } + + void WriteZeroSegment(int64_t position, int64_t length) { + if constexpr (kWithFactor) { + const int64_t scaled_factor = kValueWidth * factor_; + memset(out_ + position * scaled_factor, 0, length * scaled_factor); + } else { + memset(out_ + position * kValueWidth, 0, length * kValueWidth); + } + } + + public: + Gather(int64_t src_length, const uint8_t* src, int64_t zero_src_offset, + int64_t idx_length, const IndexCType* idx, uint8_t* out, int64_t factor) + : src_length_(src_length), + src_(src), + idx_length_(idx_length), + idx_(idx), + out_(out), + factor_(factor) { + assert(zero_src_offset == 0); + assert(src && idx && out); + assert((kWithFactor || factor == 1) && + "When kWithFactor is false, the factor is assumed to be 1 at compile time"); + } + + ARROW_FORCE_INLINE int64_t Execute() { return this->ExecuteNoNulls(idx_length_); } + + /// \pre If kOutputIsZeroInitialized, then this->out_ has to be zero initialized. 
+ /// \pre Bits in out_is_valid have to always be zero initialized. + /// \post The bits for the valid elements (and only those) are set in out_is_valid. + /// \post If !kOutputIsZeroInitialized, then positions in this->_out containing null + /// elements have 0s written to them. This might be less efficient than + /// zero-initializing first and calling this->Execute() afterwards. + /// \return The number of valid elements in out. + template + ARROW_FORCE_INLINE int64_t Execute(const ArraySpan& src_validity, + const ArraySpan& idx_validity, + uint8_t* out_is_valid) { + assert(src_length_ == src_validity.length); + assert(idx_length_ == idx_validity.length); + assert(out_is_valid); + return this->template ExecuteWithNulls( + src_validity, idx_length_, idx_, idx_validity, out_is_valid); + } +}; + +// A gather primitive for boolean inputs. Unlike its counterpart above, +// this does not support passing a non-trivial factor parameter. +template +class Gather + : public GatherBaseCRTP> { + private: + const int64_t src_length_; // number of elements of bits bytes in src_ after offset + const uint8_t* src_; // the boolean array data buffer in bits + const int64_t src_offset_; // offset in bits + const int64_t idx_length_; // number IndexCType elements in idx_ + const IndexCType* idx_; + uint8_t* out_; // output boolean array data buffer in bits + + public: + Gather(int64_t src_length, const uint8_t* src, int64_t src_offset, int64_t idx_length, + const IndexCType* idx, uint8_t* out, int64_t factor) + : src_length_(src_length), + src_(src), + src_offset_(src_offset), + idx_length_(idx_length), + idx_(idx), + out_(out) { + assert(src && idx && out); + assert(factor == 1 && + "factor != 1 is not supported when Gather is used to gather bits/booleans"); + } + + void WriteValue(int64_t position) { + bit_util::SetBitTo(out_, position, + bit_util::GetBit(src_, src_offset_ + idx_[position])); + } + + void WriteZero(int64_t position) { bit_util::ClearBit(out_, position); } + + void 
WriteZeroSegment(int64_t position, int64_t block_length) { + bit_util::SetBitsTo(out_, position, block_length, false); + } + + ARROW_FORCE_INLINE int64_t Execute() { return this->ExecuteNoNulls(idx_length_); } + + /// \pre If kOutputIsZeroInitialized, then this->out_ has to be zero initialized. + /// \pre Bits in out_is_valid have to always be zero initialized. + /// \post The bits for the valid elements (and only those) are set in out_is_valid. + /// \post If !kOutputIsZeroInitialized, then positions in this->_out containing null + /// elements have 0s written to them. This might be less efficient than + /// zero-initializing first and calling this->Execute() afterwards. + /// \return The number of valid elements in out. + template + ARROW_FORCE_INLINE int64_t Execute(const ArraySpan& src_validity, + const ArraySpan& idx_validity, + uint8_t* out_is_valid) { + assert(src_length_ == src_validity.length); + assert(idx_length_ == idx_validity.length); + assert(out_is_valid); + return this->template ExecuteWithNulls( + src_validity, idx_length_, idx_, idx_validity, out_is_valid); + } +}; + +} // namespace arrow::internal diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 5052d8dd66694..54cd695421a93 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -83,7 +83,8 @@ Result> HashAggregateInit(KernelContext* ctx, const KernelInitArgs& args) { auto impl = std::make_unique(); RETURN_NOT_OK(impl->Init(ctx->exec_context(), args)); - return std::move(impl); + // R build with openSUSE155 requires an explicit unique_ptr construction + return std::unique_ptr(std::move(impl)); } Status HashAggregateResize(KernelContext* ctx, int64_t num_groups) { @@ -813,7 +814,7 @@ struct GroupedMeanImpl (*null_count)++; bit_util::SetBitTo((*null_bitmap)->mutable_data(), i, false); } - return std::move(values); + return values; } std::shared_ptr out_type() const 
override { @@ -1114,7 +1115,8 @@ Result> VarStdInit(KernelContext* ctx, auto impl = std::make_unique>(); impl->result_type_ = result_type; RETURN_NOT_OK(impl->Init(ctx->exec_context(), args)); - return std::move(impl); + // R build with openSUSE155 requires an explicit unique_ptr construction + return std::unique_ptr(std::move(impl)); } template @@ -1685,7 +1687,7 @@ Result> MinMaxInit(KernelContext* ctx, const KernelInitArgs& args) { ARROW_ASSIGN_OR_RAISE(auto impl, HashAggregateInit>(ctx, args)); static_cast*>(impl.get())->type_ = args.inputs[0].GetSharedPtr(); - return std::move(impl); + return impl; } template @@ -2188,7 +2190,7 @@ Result> FirstLastInit(KernelContext* ctx, ARROW_ASSIGN_OR_RAISE(auto impl, HashAggregateInit>(ctx, args)); static_cast*>(impl.get())->type_ = args.inputs[0].GetSharedPtr(); - return std::move(impl); + return impl; } template @@ -2597,7 +2599,7 @@ Result> GroupedDistinctInit(KernelContext* ctx, instance->out_type_ = args.inputs[0].GetSharedPtr(); ARROW_ASSIGN_OR_RAISE(instance->grouper_, Grouper::Make(args.inputs, ctx->exec_context())); - return std::move(impl); + return impl; } // ---------------------------------------------------------------------- @@ -2839,7 +2841,7 @@ Result> GroupedOneInit(KernelContext* ctx, ARROW_ASSIGN_OR_RAISE(auto impl, HashAggregateInit>(ctx, args)); auto instance = static_cast*>(impl.get()); instance->out_type_ = args.inputs[0].GetSharedPtr(); - return std::move(impl); + return impl; } struct GroupedOneFactory { @@ -3237,7 +3239,7 @@ Result> GroupedListInit(KernelContext* ctx, ARROW_ASSIGN_OR_RAISE(auto impl, HashAggregateInit>(ctx, args)); auto instance = static_cast*>(impl.get()); instance->out_type_ = args.inputs[0].GetSharedPtr(); - return std::move(impl); + return impl; } struct GroupedListFactory { diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc index efd25a8a20c80..eb243de4a765e 100644 --- 
a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc @@ -503,7 +503,7 @@ Result ResolveDecimalBinaryOperationOutput( ToResult(getter(left_type.precision(), left_type.scale(), right_type.precision(), right_type.scale()))); ARROW_ASSIGN_OR_RAISE(auto type, DecimalType::Make(left_type.id(), precision, scale)); - return std::move(type); + return type; } Result ResolveDecimalAdditionOrSubtractionOutput( @@ -566,7 +566,7 @@ Result ResolveTemporalOutput(KernelContext*, } auto type = duration(right_type.unit()); - return std::move(type); + return type; } template diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_extension.cc b/cpp/src/arrow/compute/kernels/scalar_cast_extension.cc index c32a6ef6de93e..2a54d28c6fb64 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_extension.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_extension.cc @@ -56,8 +56,9 @@ Status CastToExtension(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou std::shared_ptr GetCastToExtension(std::string name) { auto func = std::make_shared(std::move(name), Type::EXTENSION); for (Type::type in_ty : AllTypeIds()) { - DCHECK_OK( - func->AddKernel(in_ty, {InputType(in_ty)}, kOutputTargetType, CastToExtension)); + DCHECK_OK(func->AddKernel(in_ty, {InputType(in_ty)}, kOutputTargetType, + CastToExtension, NullHandling::COMPUTED_NO_PREALLOCATE, + MemAllocation::NO_PREALLOCATE)); } return func; } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc index 8cf5a04addb00..d8c4088759643 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc @@ -19,10 +19,13 @@ #include "arrow/compute/cast_internal.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/extension_type.h" +#include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/float16.h" namespace arrow { 
+using arrow::util::Float16; using internal::checked_cast; using internal::PrimitiveScalarBase; @@ -47,6 +50,42 @@ struct CastPrimitive { } }; +// Converting floating types to half float. +template +struct CastPrimitive> { + static void Exec(const ArraySpan& arr, ArraySpan* out) { + using InT = typename InType::c_type; + const InT* in_values = arr.GetValues(1); + uint16_t* out_values = out->GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = Float16(*in_values++).bits(); + } + } +}; + +// Converting from half float to other floating types. +template <> +struct CastPrimitive> { + static void Exec(const ArraySpan& arr, ArraySpan* out) { + const uint16_t* in_values = arr.GetValues(1); + float* out_values = out->GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = Float16::FromBits(*in_values++).ToFloat(); + } + } +}; + +template <> +struct CastPrimitive> { + static void Exec(const ArraySpan& arr, ArraySpan* out) { + const uint16_t* in_values = arr.GetValues(1); + double* out_values = out->GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = Float16::FromBits(*in_values++).ToDouble(); + } + } +}; + template struct CastPrimitive::value>> { // memcpy output @@ -56,6 +95,33 @@ struct CastPrimitive: } }; +// Cast int to half float +template +struct CastPrimitive> { + static void Exec(const ArraySpan& arr, ArraySpan* out) { + using InT = typename InType::c_type; + const InT* in_values = arr.GetValues(1); + uint16_t* out_values = out->GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + float temp = static_cast(*in_values++); + *out_values++ = Float16(temp).bits(); + } + } +}; + +// Cast half float to int +template +struct CastPrimitive> { + static void Exec(const ArraySpan& arr, ArraySpan* out) { + using OutT = typename OutType::c_type; + const uint16_t* in_values = arr.GetValues(1); + OutT* out_values = out->GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = 
static_cast(Float16::FromBits(*in_values++).ToFloat()); + } + } +}; + template void CastNumberImpl(Type::type out_type, const ArraySpan& input, ArraySpan* out) { switch (out_type) { @@ -79,6 +145,8 @@ void CastNumberImpl(Type::type out_type, const ArraySpan& input, ArraySpan* out) return CastPrimitive::Exec(input, out); case Type::DOUBLE: return CastPrimitive::Exec(input, out); + case Type::HALF_FLOAT: + return CastPrimitive::Exec(input, out); default: break; } @@ -109,6 +177,8 @@ void CastNumberToNumberUnsafe(Type::type in_type, Type::type out_type, return CastNumberImpl(out_type, input, out); case Type::DOUBLE: return CastNumberImpl(out_type, input, out); + case Type::HALF_FLOAT: + return CastNumberImpl(out_type, input, out); default: DCHECK(false); break; diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc index b054e57f04d12..3df86e7d6936c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc @@ -23,6 +23,7 @@ #include "arrow/compute/kernels/util_internal.h" #include "arrow/scalar.h" #include "arrow/util/bit_block_counter.h" +#include "arrow/util/float16.h" #include "arrow/util/int_util.h" #include "arrow/util/value_parsing.h" @@ -34,6 +35,7 @@ using internal::IntegersCanFit; using internal::OptionalBitBlockCounter; using internal::ParseValue; using internal::PrimitiveScalarBase; +using util::Float16; namespace compute { namespace internal { @@ -56,18 +58,37 @@ Status CastFloatingToFloating(KernelContext*, const ExecSpan& batch, ExecResult* // ---------------------------------------------------------------------- // Implement fast safe floating point to integer cast +// +template +struct WasTruncated { + static bool Check(OutT out_val, InT in_val) { + return static_cast(out_val) != in_val; + } + + static bool CheckMaybeNull(OutT out_val, InT in_val, bool is_valid) { + return is_valid && static_cast(out_val) != in_val; + } 
+}; + +// Half float to int +template +struct WasTruncated { + using OutT = typename OutType::c_type; + static bool Check(OutT out_val, uint16_t in_val) { + return static_cast(out_val) != Float16::FromBits(in_val).ToFloat(); + } + + static bool CheckMaybeNull(OutT out_val, uint16_t in_val, bool is_valid) { + return is_valid && static_cast(out_val) != Float16::FromBits(in_val).ToFloat(); + } +}; // InType is a floating point type we are planning to cast to integer template ARROW_DISABLE_UBSAN("float-cast-overflow") Status CheckFloatTruncation(const ArraySpan& input, const ArraySpan& output) { - auto WasTruncated = [&](OutT out_val, InT in_val) -> bool { - return static_cast(out_val) != in_val; - }; - auto WasTruncatedMaybeNull = [&](OutT out_val, InT in_val, bool is_valid) -> bool { - return is_valid && static_cast(out_val) != in_val; - }; auto GetErrorMessage = [&](InT val) { return Status::Invalid("Float value ", val, " was truncated converting to ", *output.type); @@ -86,26 +107,28 @@ Status CheckFloatTruncation(const ArraySpan& input, const ArraySpan& output) { if (block.popcount == block.length) { // Fast path: branchless for (int64_t i = 0; i < block.length; ++i) { - block_out_of_bounds |= WasTruncated(out_data[i], in_data[i]); + block_out_of_bounds |= + WasTruncated::Check(out_data[i], in_data[i]); } } else if (block.popcount > 0) { // Indices have nulls, must only boundscheck non-null values for (int64_t i = 0; i < block.length; ++i) { - block_out_of_bounds |= WasTruncatedMaybeNull( + block_out_of_bounds |= WasTruncated::CheckMaybeNull( out_data[i], in_data[i], bit_util::GetBit(bitmap, offset_position + i)); } } if (ARROW_PREDICT_FALSE(block_out_of_bounds)) { if (input.GetNullCount() > 0) { for (int64_t i = 0; i < block.length; ++i) { - if (WasTruncatedMaybeNull(out_data[i], in_data[i], - bit_util::GetBit(bitmap, offset_position + i))) { + if (WasTruncated::CheckMaybeNull( + out_data[i], in_data[i], + bit_util::GetBit(bitmap, offset_position + i))) { return 
GetErrorMessage(in_data[i]); } } } else { for (int64_t i = 0; i < block.length; ++i) { - if (WasTruncated(out_data[i], in_data[i])) { + if (WasTruncated::Check(out_data[i], in_data[i])) { return GetErrorMessage(in_data[i]); } } @@ -151,6 +174,9 @@ Status CheckFloatToIntTruncation(const ExecValue& input, const ExecResult& outpu return CheckFloatToIntTruncationImpl(input.array, *output.array_span()); case Type::DOUBLE: return CheckFloatToIntTruncationImpl(input.array, *output.array_span()); + case Type::HALF_FLOAT: + return CheckFloatToIntTruncationImpl(input.array, + *output.array_span()); default: break; } @@ -293,6 +319,15 @@ struct CastFunctor< } }; +template <> +struct CastFunctor> { + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + return applicator::ScalarUnaryNotNull>::Exec(ctx, batch, + out); + } +}; + // ---------------------------------------------------------------------- // Decimal to integer @@ -689,6 +724,10 @@ std::shared_ptr GetCastToInteger(std::string name) { DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, out_ty, CastFloatingToInteger)); } + // Cast from half-float + DCHECK_OK(func->AddKernel(Type::HALF_FLOAT, {InputType(Type::HALF_FLOAT)}, out_ty, + CastFloatingToInteger)); + // From other numbers to integer AddCommonNumberCasts(out_ty, func.get()); @@ -715,6 +754,10 @@ std::shared_ptr GetCastToFloating(std::string name) { DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, out_ty, CastFloatingToFloating)); } + // From half-float to float/double + DCHECK_OK(func->AddKernel(Type::HALF_FLOAT, {InputType(Type::HALF_FLOAT)}, out_ty, + CastFloatingToFloating)); + // From other numbers to floating point AddCommonNumberCasts(out_ty, func.get()); @@ -723,6 +766,7 @@ std::shared_ptr GetCastToFloating(std::string name) { CastFunctor::Exec)); DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty, CastFunctor::Exec)); + return func; } @@ -795,6 +839,32 @@ std::shared_ptr GetCastToDecimal256() { 
return func; } +std::shared_ptr GetCastToHalfFloat() { + // HalfFloat is a bit brain-damaged for now + auto func = std::make_shared("func", Type::HALF_FLOAT); + AddCommonCasts(Type::HALF_FLOAT, float16(), func.get()); + + // Casts from integer to floating point + for (const std::shared_ptr& in_ty : IntTypes()) { + DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, + TypeTraits::type_singleton(), + CastIntegerToFloating)); + } + + // Cast from other strings to half float. + for (const std::shared_ptr& in_ty : BaseBinaryTypes()) { + auto exec = GenerateVarBinaryBase(*in_ty); + DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, + TypeTraits::type_singleton(), exec)); + } + + DCHECK_OK(func.get()->AddKernel(Type::FLOAT, {InputType(Type::FLOAT)}, float16(), + CastFloatingToFloating)); + DCHECK_OK(func.get()->AddKernel(Type::DOUBLE, {InputType(Type::DOUBLE)}, float16(), + CastFloatingToFloating)); + return func; +} + } // namespace std::vector> GetNumericCasts() { @@ -830,13 +900,14 @@ std::vector> GetNumericCasts() { functions.push_back(GetCastToInteger("cast_uint64")); // HalfFloat is a bit brain-damaged for now - auto cast_half_float = - std::make_shared("cast_half_float", Type::HALF_FLOAT); - AddCommonCasts(Type::HALF_FLOAT, float16(), cast_half_float.get()); + auto cast_half_float = GetCastToHalfFloat(); functions.push_back(cast_half_float); - functions.push_back(GetCastToFloating("cast_float")); - functions.push_back(GetCastToFloating("cast_double")); + auto cast_float = GetCastToFloating("cast_float"); + functions.push_back(cast_float); + + auto cast_double = GetCastToFloating("cast_double"); + functions.push_back(cast_double); functions.push_back(GetCastToDecimal128()); functions.push_back(GetCastToDecimal256()); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index a6576e4e4c26f..dc3fe29a3dfae 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ 
b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -340,10 +340,15 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou if (input.offset == output->offset) { output->buffers[0] = input.GetBuffer(0); } else { - ARROW_ASSIGN_OR_RAISE( - output->buffers[0], - arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, - input.offset, input.length)); + // When the offsets are different (e.g., due to slice operation), we need to check if + // the null bitmap buffer is not null before copying it. The null bitmap buffer can be + // null if the input array value does not contain any null value. + if (input.buffers[0].data != NULLPTR) { + ARROW_ASSIGN_OR_RAISE( + output->buffers[0], + arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, + input.offset, input.length)); + } } // This buffer is preallocated @@ -437,6 +442,10 @@ void AddNumberToStringCasts(CastFunction* func) { GenerateNumeric(*in_ty), NullHandling::COMPUTED_NO_PREALLOCATE)); } + + DCHECK_OK(func->AddKernel(Type::HALF_FLOAT, {float16()}, out_ty, + NumericToStringCastFunctor::Exec, + NullHandling::COMPUTED_NO_PREALLOCATE)); } template diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index a8acf68f66c8b..f60d8f2e19e98 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -389,7 +389,7 @@ TEST(Cast, ToIntDowncastUnsafe) { } TEST(Cast, FloatingToInt) { - for (auto from : {float32(), float64()}) { + for (auto from : {float16(), float32(), float64()}) { for (auto to : {int32(), int64()}) { // float to int no truncation CheckCast(ArrayFromJSON(from, "[1.0, null, 0.0, -1.0, 5.0]"), @@ -407,6 +407,15 @@ TEST(Cast, FloatingToInt) { } } +TEST(Cast, FloatingToFloating) { + for (auto from : {float16(), float32(), float64()}) { + for (auto to : {float16(), float32(), float64()}) { + CheckCast(ArrayFromJSON(from, "[1.0, 0.0, 
-1.0, 5.0]"), + ArrayFromJSON(to, "[1.0, 0.0, -1.0, 5.0]")); + } + } +} + TEST(Cast, IntToFloating) { for (auto from : {uint32(), int32()}) { std::string two_24 = "[16777216, 16777217]"; @@ -2187,6 +2196,33 @@ TEST(Cast, BinaryOrStringToFixedSizeBinary) { } } +TEST(Cast, FixedSizeBinaryToBinaryOrString) { + for (auto out_type : {utf8(), large_utf8(), binary(), large_binary()}) { + auto valid_input = ArrayFromJSON(fixed_size_binary(3), R"(["foo", null, "bar", + "baz", "quu"])"); + + CheckCast(valid_input, ArrayFromJSON(out_type, R"(["foo", null, "bar", "baz", + "quu"])")); + + auto empty_input = ArrayFromJSON(fixed_size_binary(3), "[]"); + CheckCast(empty_input, ArrayFromJSON(out_type, "[]")); + } +} + +TEST(Cast, FixedSizeBinaryToBinaryOrStringWithSlice) { + for (auto out_type : {utf8(), large_utf8(), binary(), large_binary()}) { + auto valid_input = ArrayFromJSON(fixed_size_binary(3), R"(["foo", null, "bar", + "baz", "quu"])"); + auto sliced = valid_input->Slice(1, 3); + CheckCast(sliced, ArrayFromJSON(out_type, R"([null, "bar", "baz"])")); + + auto valid_input_without_null = ArrayFromJSON(fixed_size_binary(3), R"(["foo", "bar", + "baz", "quu"])"); + auto sliced_without_null = valid_input_without_null->Slice(1, 3); + CheckCast(sliced_without_null, ArrayFromJSON(out_type, R"(["bar", "baz", "quu"])")); + } +} + TEST(Cast, IntToString) { for (auto string_type : {utf8(), large_utf8()}) { CheckCast(ArrayFromJSON(int8(), "[0, 1, 127, -128, null]"), @@ -2220,14 +2256,12 @@ TEST(Cast, IntToString) { } TEST(Cast, FloatingToString) { - for (auto string_type : {utf8(), large_utf8()}) { - CheckCast( - ArrayFromJSON(float32(), "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]"), - ArrayFromJSON(string_type, R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])")); - - CheckCast( - ArrayFromJSON(float64(), "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]"), - ArrayFromJSON(string_type, R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])")); + for (auto float_type : {float16(), float32(), float64()}) 
{ + for (auto string_type : {utf8(), large_utf8()}) { + CheckCast(ArrayFromJSON(float_type, "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]"), + ArrayFromJSON(string_type, + R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])")); + } } } @@ -2247,7 +2281,7 @@ TEST(Cast, ListToPrimitive) { Cast(*ArrayFromJSON(list(binary()), R"([["1", "2"], ["3", "4"]])"), utf8())); } -using make_list_t = std::shared_ptr(const std::shared_ptr&); +using make_list_t = std::shared_ptr(std::shared_ptr); static const auto list_factories = std::vector{&list, &large_list}; diff --git a/cpp/src/arrow/compute/kernels/scalar_compare.cc b/cpp/src/arrow/compute/kernels/scalar_compare.cc index daf8ed76d628d..9b2fd987d81d8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_compare.cc +++ b/cpp/src/arrow/compute/kernels/scalar_compare.cc @@ -491,8 +491,9 @@ template struct ScalarMinMax { using OutValue = typename GetOutputType::T; - static void ExecScalar(const ExecSpan& batch, - const ElementWiseAggregateOptions& options, Scalar* out) { + static Result> ExecScalar( + const ExecSpan& batch, const ElementWiseAggregateOptions& options, + std::shared_ptr type) { // All arguments are scalar OutValue value{}; bool valid = false; @@ -502,8 +503,8 @@ struct ScalarMinMax { const Scalar& scalar = *arg.scalar; if (!scalar.is_valid) { if (options.skip_nulls) continue; - out->is_valid = false; - return; + valid = false; + break; } if (!valid) { value = UnboxScalar::Unbox(scalar); @@ -513,9 +514,10 @@ struct ScalarMinMax { value, UnboxScalar::Unbox(scalar)); } } - out->is_valid = valid; if (valid) { - BoxScalar::Box(value, out); + return MakeScalar(std::move(type), std::move(value)); + } else { + return MakeNullScalar(std::move(type)); } } @@ -537,8 +539,7 @@ struct ScalarMinMax { bool initialize_output = true; if (scalar_count > 0) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr temp_scalar, - MakeScalar(out->type()->GetSharedPtr(), 0)); - ExecScalar(batch, options, temp_scalar.get()); + ExecScalar(batch, options, 
out->type()->GetSharedPtr())); if (temp_scalar->is_valid) { const auto value = UnboxScalar::Unbox(*temp_scalar); initialize_output = false; diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index ee181c053c053..6368ef525ff9c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1309,9 +1309,10 @@ void AddFixedWidthIfElseKernel(const std::shared_ptr& scalar_fun } void AddNestedIfElseKernels(const std::shared_ptr& scalar_function) { - for (const auto type_id : {Type::LIST, Type::LARGE_LIST, Type::LIST_VIEW, - Type::LARGE_LIST_VIEW, Type::FIXED_SIZE_LIST, Type::STRUCT, - Type::DENSE_UNION, Type::SPARSE_UNION, Type::DICTIONARY}) { + for (const auto type_id : + {Type::LIST, Type::LARGE_LIST, Type::LIST_VIEW, Type::LARGE_LIST_VIEW, + Type::FIXED_SIZE_LIST, Type::MAP, Type::STRUCT, Type::DENSE_UNION, + Type::SPARSE_UNION, Type::DICTIONARY}) { ScalarKernel kernel({boolean(), InputType(type_id), InputType(type_id)}, LastType, NestedIfElseExec::Exec); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; @@ -1482,39 +1483,27 @@ Status ExecScalarCaseWhen(KernelContext* ctx, const ExecSpan& batch, ExecResult* result = temp.get(); } - // TODO(wesm): clean this up to have less duplication - if (out->is_array_data()) { - ArrayData* output = out->array_data().get(); - if (is_dictionary_type::value) { - const ExecValue& dict_from = has_result ? result : batch[1]; - if (dict_from.is_scalar()) { - output->dictionary = checked_cast(*dict_from.scalar) - .value.dictionary->data(); - } else { - output->dictionary = dict_from.array.ToArrayData()->dictionary; - } - } - CopyValues(result, /*in_offset=*/0, batch.length, - output->GetMutableValues(0, 0), - output->GetMutableValues(1, 0), output->offset); - } else { - // ArraySpan - ArraySpan* output = out->array_span_mutable(); - if (is_dictionary_type::value) { - const ExecValue& dict_from = has_result ? 
result : batch[1]; - output->child_data.resize(1); - if (dict_from.is_scalar()) { - output->child_data[0].SetMembers( - *checked_cast(*dict_from.scalar) - .value.dictionary->data()); - } else { - output->child_data[0] = dict_from.array; - } + // Only input types of non-fixed length (which cannot be pre-allocated) + // will save the output data in ArrayData. And make sure the FixedLength + // types must be output in ArraySpan. + static_assert(is_fixed_width(Type::type_id)); + DCHECK(out->is_array_span()); + + ArraySpan* output = out->array_span_mutable(); + if (is_dictionary_type::value) { + const ExecValue& dict_from = has_result ? result : batch[1]; + output->child_data.resize(1); + if (dict_from.is_scalar()) { + output->child_data[0].SetMembers( + *checked_cast(*dict_from.scalar) + .value.dictionary->data()); + } else { + output->child_data[0] = dict_from.array; } - CopyValues(result, /*in_offset=*/0, batch.length, - output->GetValues(0, 0), output->GetValues(1, 0), - output->offset); } + CopyValues(result, /*in_offset=*/0, batch.length, + output->GetValues(0, 0), output->GetValues(1, 0), + output->offset); return Status::OK(); } @@ -1847,6 +1836,48 @@ struct CaseWhenFunctor> { } }; +// TODO(GH-41453): a more efficient implementation for list-views is possible +template +struct CaseWhenFunctor> { + using offset_type = typename Type::offset_type; + using BuilderType = typename TypeTraits::BuilderType; + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + /// TODO(wesm): should this be a DCHECK? 
Or checked elsewhere + if (batch[0].null_count() > 0) { + return Status::Invalid("cond struct must not have outer nulls"); + } + if (batch[0].is_scalar()) { + return ExecVarWidthScalarCaseWhen(ctx, batch, out); + } + return ExecArray(ctx, batch, out); + } + + static Status ExecArray(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + return ExecVarWidthArrayCaseWhen( + ctx, batch, out, + // ReserveData + [&](ArrayBuilder* raw_builder) { + auto builder = checked_cast(raw_builder); + auto child_builder = builder->value_builder(); + + int64_t reservation = 0; + for (int arg = 1; arg < batch.num_values(); arg++) { + const ExecValue& source = batch[arg]; + if (!source.is_array()) { + const auto& scalar = checked_cast(*source.scalar); + if (!scalar.value) continue; + reservation = + std::max(reservation, batch.length * scalar.value->length()); + } else { + const ArraySpan& array = source.array; + reservation = std::max(reservation, array.child_data[0].length); + } + } + return child_builder->Reserve(reservation); + }); + } +}; + // No-op reserve function, pulled out to avoid apparent miscompilation on MinGW Status ReserveNoData(ArrayBuilder*) { return Status::OK(); } @@ -2712,6 +2743,25 @@ void AddBinaryCaseWhenKernels(const std::shared_ptr& scalar_fu } } +template +void AddNestedCaseWhenKernel(const std::shared_ptr& scalar_function) { + AddCaseWhenKernel(scalar_function, ArrowNestedType::type_id, + CaseWhenFunctor::Exec); +} + +void AddNestedCaseWhenKernels(const std::shared_ptr& scalar_function) { + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); +} + void 
AddCoalesceKernel(const std::shared_ptr& scalar_function, detail::GetTypeId get_id, ArrayKernelExec exec) { ScalarKernel kernel(KernelSignature::Make({InputType(get_id.id)}, FirstType, @@ -2731,6 +2781,25 @@ void AddPrimitiveCoalesceKernels(const std::shared_ptr& scalar_f } } +template +void AddNestedCoalesceKernel(const std::shared_ptr& scalar_function) { + AddCoalesceKernel(scalar_function, ArrowNestedType::type_id, + CoalesceFunctor::Exec); +} + +void AddNestedCoalesceKernels(const std::shared_ptr& scalar_function) { + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); +} + void AddChooseKernel(const std::shared_ptr& scalar_function, detail::GetTypeId get_id, ArrayKernelExec exec) { ScalarKernel kernel(KernelSignature::Make({Type::INT64, InputType(get_id.id)}, LastType, @@ -2822,15 +2891,7 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { AddCaseWhenKernel(func, Type::DECIMAL128, CaseWhenFunctor::Exec); AddCaseWhenKernel(func, Type::DECIMAL256, CaseWhenFunctor::Exec); AddBinaryCaseWhenKernels(func, BaseBinaryTypes()); - AddCaseWhenKernel(func, Type::FIXED_SIZE_LIST, - CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::LIST, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::LARGE_LIST, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::MAP, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::STRUCT, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::DENSE_UNION, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::SPARSE_UNION, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::DICTIONARY, CaseWhenFunctor::Exec); + 
AddNestedCaseWhenKernels(func); DCHECK_OK(registry->AddFunction(std::move(func))); } { @@ -2848,15 +2909,7 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { for (const auto& ty : BaseBinaryTypes()) { AddCoalesceKernel(func, ty, GenerateTypeAgnosticVarBinaryBase(ty)); } - AddCoalesceKernel(func, Type::FIXED_SIZE_LIST, - CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::LIST, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::LARGE_LIST, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::MAP, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::STRUCT, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::DENSE_UNION, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::SPARSE_UNION, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::DICTIONARY, CoalesceFunctor::Exec); + AddNestedCoalesceKernels(func); DCHECK_OK(registry->AddFunction(std::move(func))); } { diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc index 58bc560f52842..5988908853d50 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc @@ -284,8 +284,11 @@ static void CaseWhenBench(benchmark::State& state) { state.SetItemsProcessed(state.iterations() * (len - offset)); } -static void CaseWhenBenchList(benchmark::State& state) { - auto type = list(int64()); +template +static void CaseWhenBenchList(benchmark::State& state, + const std::shared_ptr& type) { + using ArrayType = typename TypeTraits::ArrayType; + auto fld = field("", type); int64_t len = state.range(0); @@ -295,17 +298,17 @@ static void CaseWhenBenchList(benchmark::State& state) { auto cond_field = field("cond", boolean(), key_value_metadata({{"null_probability", "0.01"}})); - auto cond = rand.ArrayOf(*field("", struct_({cond_field, cond_field, cond_field}), - key_value_metadata({{"null_probability", "0.0"}})), - len); - auto val1 = rand.ArrayOf(*fld, 
len); - auto val2 = rand.ArrayOf(*fld, len); - auto val3 = rand.ArrayOf(*fld, len); - auto val4 = rand.ArrayOf(*fld, len); + auto cond = std::static_pointer_cast( + rand.ArrayOf(*field("", struct_({cond_field, cond_field, cond_field}), + key_value_metadata({{"null_probability", "0.0"}})), + len)) + ->Slice(offset); + auto val1 = std::static_pointer_cast(rand.ArrayOf(*fld, len))->Slice(offset); + auto val2 = std::static_pointer_cast(rand.ArrayOf(*fld, len))->Slice(offset); + auto val3 = std::static_pointer_cast(rand.ArrayOf(*fld, len))->Slice(offset); + auto val4 = std::static_pointer_cast(rand.ArrayOf(*fld, len))->Slice(offset); for (auto _ : state) { - ABORT_NOT_OK( - CaseWhen(cond->Slice(offset), {val1->Slice(offset), val2->Slice(offset), - val3->Slice(offset), val4->Slice(offset)})); + ABORT_NOT_OK(CaseWhen(cond, {val1, val2, val3, val4})); } // Set bytes processed to ~length of output @@ -372,6 +375,21 @@ static void CaseWhenBenchStringContiguous(benchmark::State& state) { return CaseWhenBenchContiguous(state); } +template +static void CaseWhenBenchVarLengthListLike(benchmark::State& state) { + auto value_type = TypeTraits::type_singleton(); + auto list_type = std::make_shared(value_type); + return CaseWhenBenchList(state, list_type); +} + +static void CaseWhenBenchListInt64(benchmark::State& state) { + return CaseWhenBenchVarLengthListLike(state); +} + +static void CaseWhenBenchListViewInt64(benchmark::State& state) { + CaseWhenBenchVarLengthListLike(state); +} + struct CoalesceParams { int64_t length; int64_t num_arguments; @@ -533,9 +551,11 @@ BENCHMARK(CaseWhenBench64)->Args({kNumItems, 99}); BENCHMARK(CaseWhenBench64Contiguous)->Args({kNumItems, 0}); BENCHMARK(CaseWhenBench64Contiguous)->Args({kNumItems, 99}); -// CaseWhen: Lists -BENCHMARK(CaseWhenBenchList)->Args({kFewItems, 0}); -BENCHMARK(CaseWhenBenchList)->Args({kFewItems, 99}); +// CaseWhen: List-like types +BENCHMARK(CaseWhenBenchListInt64)->Args({kFewItems, 0}); 
+BENCHMARK(CaseWhenBenchListInt64)->Args({kFewItems, 99}); +BENCHMARK(CaseWhenBenchListViewInt64)->Args({kFewItems, 0}); +BENCHMARK(CaseWhenBenchListViewInt64)->Args({kFewItems, 99}); // CaseWhen: Strings BENCHMARK(CaseWhenBenchString)->Args({kFewItems, 0}); diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index c4c46b5efe84d..9a0ca325277dc 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -896,6 +896,21 @@ TEST_F(TestIfElseKernel, ParameterizedTypes) { {cond, ArrayFromJSON(type0, "[0]"), ArrayFromJSON(type1, "[1]")})); } +TEST_F(TestIfElseKernel, MapNested) { + auto type = map(int64(), utf8()); + CheckWithDifferentShapes( + ArrayFromJSON(boolean(), "[true, true, false, false]"), + ArrayFromJSON(type, R"([null, [[2, "foo"], [4, null]], [[3, "test"]], []])"), + ArrayFromJSON(type, R"([[[1, "b"]], [[2, "c"]], [[7, "abc"]], null])"), + ArrayFromJSON(type, R"([null, [[2, "foo"], [4, null]], [[7, "abc"]], null])")); + + CheckWithDifferentShapes( + ArrayFromJSON(boolean(), "[null, null, null, null]"), + ArrayFromJSON(type, R"([null, [[1, "c"]], [[4, null]], [[6, "ok"]]])"), + ArrayFromJSON(type, R"([[[-1, null]], [[3, "c"]], null, [[6, "ok"]]])"), + ArrayFromJSON(type, R"([null, null, null, null])")); +} + template class TestIfElseUnion : public ::testing::Test {}; @@ -1920,7 +1935,7 @@ TYPED_TEST(TestCaseWhenBinary, Random) { template class TestCaseWhenList : public ::testing::Test {}; -TYPED_TEST_SUITE(TestCaseWhenList, ListArrowTypes); +TYPED_TEST_SUITE(TestCaseWhenList, ListAndListViewArrowTypes); TYPED_TEST(TestCaseWhenList, ListOfString) { auto type = std::make_shared(utf8()); @@ -2555,7 +2570,7 @@ class TestCoalesceList : public ::testing::Test {}; TYPED_TEST_SUITE(TestCoalesceNumeric, IfElseNumericBasedTypes); TYPED_TEST_SUITE(TestCoalesceBinary, BaseBinaryArrowTypes); -TYPED_TEST_SUITE(TestCoalesceList, 
ListArrowTypes); +TYPED_TEST_SUITE(TestCoalesceList, ListAndListViewArrowTypes); TYPED_TEST(TestCoalesceNumeric, Basics) { auto type = default_type_instance(); diff --git a/cpp/src/arrow/compute/kernels/scalar_list_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_list_benchmark.cc new file mode 100644 index 0000000000000..8c5b43d55f756 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/scalar_list_benchmark.cc @@ -0,0 +1,153 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "arrow/compute/api_scalar.h" +#include "arrow/compute/exec.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/util/benchmark_util.h" +#include "benchmark/benchmark.h" + +namespace arrow::compute { + +constexpr auto kSeed = 0x94378165; + +const auto kSliceStart = 2; +const auto kSliceStop = 10; + +static void BenchmarkListSlice(benchmark::State& state, const ListSliceOptions& opts, + std::shared_ptr list_ty) { + RegressionArgs args(state, /*size_is_bytes=*/false); + auto rand = random::RandomArrayGenerator(kSeed); + auto array = rand.ArrayOf(std::move(list_ty), args.size, args.null_proportion); + auto ctx = default_exec_context(); + std::vector input_args = {std::move(array)}; + for (auto _ : state) { + ABORT_NOT_OK(CallFunction("list_slice", input_args, &opts, ctx).status()); + } +} + +template +static void ListSliceInt64List(benchmark::State& state) { + ListSliceOptions opts; + opts.start = kSliceStart; + BenchmarkListSlice(state, opts, std::make_shared(int64())); +} + +template +static void ListSliceStringList(benchmark::State& state) { + ListSliceOptions opts; + opts.start = kSliceStart; + BenchmarkListSlice(state, opts, std::make_shared(utf8())); +} + +template +static void ListSliceInt64ListWithStop(benchmark::State& state) { + ListSliceOptions opts; + opts.start = kSliceStart; + opts.stop = kSliceStop; + BenchmarkListSlice(state, opts, std::make_shared(int64())); +} + +template +static void ListSliceStringListWithStop(benchmark::State& state) { + ListSliceOptions opts; + opts.start = kSliceStart; + opts.stop = kSliceStop; + BenchmarkListSlice(state, opts, std::make_shared(utf8())); +} + +template +static void ListSliceInt64ListWithStepAndStop(benchmark::State& state) { + ListSliceOptions opts; + opts.start = kSliceStart; + opts.step = 2; + opts.stop = kSliceStop; + BenchmarkListSlice(state, opts, std::make_shared(int64())); +} + +template +static void 
ListSliceStringListWithStepAndStop(benchmark::State& state) { + ListSliceOptions opts; + opts.start = kSliceStart; + opts.step = 2; + opts.stop = kSliceStop; + BenchmarkListSlice(state, opts, std::make_shared(utf8())); +} + +static void ListSliceInt64ListView(benchmark::State& state) { + ListSliceInt64List(state); +} + +static void ListSliceStringListView(benchmark::State& state) { + ListSliceStringList(state); +} + +static void ListSliceInt64ListViewWithStop(benchmark::State& state) { + ListSliceInt64ListWithStop(state); +} + +static void ListSliceStringListViewWithStop(benchmark::State& state) { + ListSliceStringListWithStop(state); +} + +static void ListSliceInt64ListViewWithStepAndStop(benchmark::State& state) { + ListSliceInt64ListWithStepAndStop(state); +} + +static void ListSliceStringListViewWithStepAndStop(benchmark::State& state) { + ListSliceStringListWithStepAndStop(state); +} + +static void ListSliceInt64ListToFSL(benchmark::State& state) { + ListSliceOptions opts; + opts.start = kSliceStart; + opts.stop = kSliceStop; + opts.return_fixed_size_list = true; + BenchmarkListSlice(state, opts, std::make_shared(int64())); +} + +static void ListSliceStringListToFSL(benchmark::State& state) { + ListSliceOptions opts; + opts.start = kSliceStart; + opts.stop = kSliceStop; + opts.return_fixed_size_list = true; + BenchmarkListSlice(state, opts, std::make_shared(utf8())); +} + +BENCHMARK(ListSliceInt64List)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceStringList)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceInt64ListWithStop)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceStringListWithStop)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceInt64ListWithStepAndStop)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceStringListWithStepAndStop)->Apply(RegressionSetArgs); + +BENCHMARK(ListSliceInt64ListView)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceStringListView)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceInt64ListViewWithStop)->Apply(RegressionSetArgs); 
+BENCHMARK(ListSliceStringListViewWithStop)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceInt64ListViewWithStepAndStop)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceStringListViewWithStepAndStop)->Apply(RegressionSetArgs); + +BENCHMARK(ListSliceInt64ListToFSL)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceStringListToFSL)->Apply(RegressionSetArgs); + +} // namespace arrow::compute diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 733ab9c0dc287..71e367153d9c7 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -23,10 +23,12 @@ #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" +#include "arrow/type_fwd.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_generate.h" #include "arrow/util/string.h" +#include "arrow/util/unreachable.h" namespace arrow { @@ -41,10 +43,17 @@ Status ListValueLength(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou const ArraySpan& arr = batch[0].array; ArraySpan* out_arr = out->array_span_mutable(); auto out_values = out_arr->GetValues(1); - const offset_type* offsets = arr.GetValues(1); - // Offsets are always well-defined and monotonic, even for null values - for (int64_t i = 0; i < arr.length; ++i) { - *out_values++ = offsets[i + 1] - offsets[i]; + if (is_list_view(*arr.type)) { + const auto* sizes = arr.GetValues(2); + if (arr.length > 0) { + memcpy(out_values, sizes, arr.length * sizeof(offset_type)); + } + } else { + const offset_type* offsets = arr.GetValues(1); + // Offsets are always well-defined and monotonic, even for null values + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = offsets[i + 1] - offsets[i]; + } } return Status::OK(); } @@ -59,6 +68,30 @@ Status FixedSizeListValueLength(KernelContext* ctx, const ExecSpan& batch, return Status::OK(); } +template 
+void AddListValueLengthKernel(ScalarFunction* func, + const std::shared_ptr& out_type) { + auto in_type = {InputType(InListType::type_id)}; + ScalarKernel kernel(in_type, out_type, ListValueLength); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +template <> +void AddListValueLengthKernel( + ScalarFunction* func, const std::shared_ptr& out_type) { + auto in_type = {InputType(Type::FIXED_SIZE_LIST)}; + ScalarKernel kernel(in_type, out_type, FixedSizeListValueLength); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +void AddListValueLengthKernels(ScalarFunction* func) { + AddListValueLengthKernel(func, int32()); + AddListValueLengthKernel(func, int64()); + AddListValueLengthKernel(func, int32()); + AddListValueLengthKernel(func, int64()); + AddListValueLengthKernel(func, int32()); +} + const FunctionDoc list_value_length_doc{ "Compute list lengths", ("`lists` must have a list-like type.\n" @@ -98,14 +131,56 @@ std::string ToString(const std::optional& o) { return o.has_value() ? 
ToChars(*o) : "(nullopt)"; } -template +/// \param stop User-provided stop or the length of the input list +int64_t ListSliceLength(int64_t start, int64_t step, int64_t stop) { + DCHECK_GE(step, 1); + const auto size = std::max(stop - start, 0); + return bit_util::CeilDiv(size, step); +} + +std::optional EffectiveSliceStop(const ListSliceOptions& opts, + const BaseListType& input_type) { + if (!opts.stop.has_value() && input_type.id() == Type::FIXED_SIZE_LIST) { + return checked_cast(input_type).list_size(); + } + return opts.stop; +} + +Result ListSliceOutputType(const ListSliceOptions& opts, + const BaseListType& input_list_type) { + const auto& value_type = input_list_type.field(0); + const bool is_fixed_size_list = input_list_type.id() == Type::FIXED_SIZE_LIST; + const auto return_fixed_size_list = + opts.return_fixed_size_list.value_or(is_fixed_size_list); + if (return_fixed_size_list) { + auto stop = EffectiveSliceStop(opts, input_list_type); + if (!stop.has_value()) { + return Status::Invalid( + "Unable to produce FixedSizeListArray from non-FixedSizeListArray without " + "`stop` being set."); + } + if (opts.step < 1) { + return Status::Invalid("`step` must be >= 1, got: ", opts.step); + } + const auto length = ListSliceLength(opts.start, opts.step, *stop); + return fixed_size_list(value_type, static_cast(length)); + } + if (is_fixed_size_list) { + return list(value_type); + } + return TypeHolder{&input_list_type}; +} + +template struct ListSlice { - using offset_type = typename Type::offset_type; + using offset_type = typename InListType::offset_type; static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - const auto opts = OptionsWrapper::Get(ctx); + const auto& opts = OptionsWrapper::Get(ctx); + const ArraySpan& list_array = batch[0].array; + const auto* list_type = checked_cast(list_array.type); - // Invariants + // Pre-conditions if (opts.start < 0 || (opts.stop.has_value() && opts.start >= opts.stop.value())) { // 
TODO(ARROW-18281): support start == stop which should give empty lists return Status::Invalid("`start`(", opts.start, @@ -116,128 +191,201 @@ struct ListSlice { return Status::Invalid("`step` must be >= 1, got: ", opts.step); } - const ArraySpan& list_array = batch[0].array; - const Type* list_type = checked_cast(list_array.type); - const auto value_type = list_type->field(0); - const auto return_fixed_size_list = opts.return_fixed_size_list.value_or( - list_type->id() == arrow::Type::FIXED_SIZE_LIST); - std::unique_ptr builder; - - // should have been checked in resolver - // if stop not set, then cannot return fixed size list without input being fixed size - // list b/c we cannot determine the max list element in type resolving. - DCHECK(opts.stop.has_value() || - (!opts.stop.has_value() && (!return_fixed_size_list || - list_type->id() == arrow::Type::FIXED_SIZE_LIST))); - - // construct array values - if (return_fixed_size_list) { - int32_t stop; - if (opts.stop.has_value()) { - stop = static_cast(opts.stop.value()); - } else { - DCHECK_EQ(list_type->id(), arrow::Type::FIXED_SIZE_LIST); - stop = reinterpret_cast(list_type)->list_size(); - } - const auto size = std::max(stop - static_cast(opts.start), 0); - const auto length = bit_util::CeilDiv(size, opts.step); - RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), - fixed_size_list(value_type, static_cast(length)), - &builder)); - RETURN_NOT_OK(BuildArray(batch, opts, *builder)); - } else { - if constexpr (std::is_same_v) { - RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), large_list(value_type), &builder)); - RETURN_NOT_OK(BuildArray(batch, opts, *builder)); - } else { - RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), list(value_type), &builder)); - RETURN_NOT_OK(BuildArray(batch, opts, *builder)); - } + auto* pool = ctx->memory_pool(); + ARROW_ASSIGN_OR_RAISE(auto output_type_holder, ListSliceOutputType(opts, *list_type)); + constexpr auto kInputTypeId = InListType::type_id; + auto output_type = 
output_type_holder.GetSharedPtr(); + switch (output_type->id()) { + // The various `if constexpr` guards below avoid generating + // ListSlice::BuildArray specializations + // that will never be invoked at runtime. + case Type::LIST: + DCHECK(kInputTypeId == Type::LIST || kInputTypeId == Type::FIXED_SIZE_LIST); + if constexpr (kInputTypeId == Type::LIST || + kInputTypeId == Type::FIXED_SIZE_LIST) { + return BuildArray(pool, opts, batch, output_type, out); + } + break; + case Type::LARGE_LIST: + DCHECK_EQ(kInputTypeId, Type::LARGE_LIST); + if constexpr (kInputTypeId == Type::LARGE_LIST) { + return BuildArray(pool, opts, batch, output_type, out); + } + break; + case Type::FIXED_SIZE_LIST: + // A fixed-size list can be produced from any list-like input + // if ListSliceOptions::return_fixed_size_list is set to true + return BuildArray(pool, opts, batch, output_type, out); + case Type::LIST_VIEW: + DCHECK_EQ(kInputTypeId, Type::LIST_VIEW); + if constexpr (kInputTypeId == Type::LIST_VIEW) { + return BuildArray(pool, opts, batch, output_type, out); + } + break; + case Type::LARGE_LIST_VIEW: + DCHECK_EQ(kInputTypeId, Type::LARGE_LIST_VIEW); + if constexpr (kInputTypeId == Type::LARGE_LIST_VIEW) { + return BuildArray(pool, opts, batch, output_type, out); + } + break; + default: + break; } - - // build output arrays and set result - ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); - out->value = std::move(result->data()); + Unreachable(); return Status::OK(); } + /// \brief Builds the array of list slices from the input list array template - static Status BuildArray(const ExecSpan& batch, const ListSliceOptions& opts, - ArrayBuilder& builder) { - if constexpr (std::is_same_v) { - RETURN_NOT_OK(BuildArrayFromFixedSizeListType(batch, opts, builder)); + static Status BuildArray(MemoryPool* pool, const ListSliceOptions& opts, + const ExecSpan& batch, + const std::shared_ptr& output_type, + ExecResult* out) { + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(pool, 
output_type, &builder)); + auto* list_builder = checked_cast(builder.get()); + RETURN_NOT_OK(list_builder->Resize(batch[0].array.length)); + if constexpr (std::is_same_v) { + RETURN_NOT_OK(BuildArrayFromFixedSizeListType(opts.start, opts.step, opts.stop, + batch, list_builder)); } else { - RETURN_NOT_OK(BuildArrayFromListType(batch, opts, builder)); + RETURN_NOT_OK(BuildArrayFromVarLenListLikeType(opts.start, opts.step, opts.stop, + batch, list_builder)); } + std::shared_ptr result; + RETURN_NOT_OK(list_builder->FinishInternal(&result)); + out->value = std::move(result); return Status::OK(); } template - static Status BuildArrayFromFixedSizeListType(const ExecSpan& batch, - const ListSliceOptions& opts, - ArrayBuilder& builder) { - const auto list_size = - checked_cast(*batch[0].type()).list_size(); + static Status BuildArrayFromFixedSizeListType(int64_t start, int64_t step, + std::optional stop, + const ExecSpan& batch, + BuilderType* out_list_builder) { + static_assert(std::is_same_v); + constexpr bool kIsFixedSizeOutput = std::is_same_v; + const auto& fsl_type = checked_cast(*batch[0].type()); const ArraySpan& list_array = batch[0].array; - const ArraySpan& list_values = list_array.child_data[0]; - - auto list_builder = checked_cast(&builder); - for (auto i = 0; i < list_array.length; ++i) { - auto offset = (i + list_array.offset) * list_size; - auto next_offset = offset + list_size; - if (list_array.IsNull(i)) { - RETURN_NOT_OK(list_builder->AppendNull()); + const ArraySpan& values_array = list_array.child_data[0]; + ArrayBuilder* value_builder = out_list_builder->value_builder(); + + auto* is_valid = list_array.GetValues(0, 0); + const auto list_size = static_cast(fsl_type.list_size()); + const int64_t effective_stop = stop.value_or(list_size); + int64_t slice_length, value_count; + int64_t null_padding = 0; + if constexpr (kIsFixedSizeOutput) { + if (list_size < effective_stop) { + slice_length = ListSliceLength(start, step, effective_stop); + value_count = 
ListSliceLength(start, step, list_size); + DCHECK_LE(value_count, slice_length); + null_padding = slice_length - value_count; + } else { + slice_length = ListSliceLength(start, step, effective_stop); + value_count = slice_length; + } + } else { + slice_length = ListSliceLength(start, step, std::min(list_size, effective_stop)); + value_count = slice_length; + } + int64_t offset = list_array.offset * list_size; + for (int64_t i = 0; i < list_array.length; ++i) { + if (is_valid && !bit_util::GetBit(is_valid, list_array.offset + i)) { + RETURN_NOT_OK(out_list_builder->AppendNull()); } else { - RETURN_NOT_OK(SetValues(list_builder, offset, next_offset, &opts, - &list_values)); + int64_t start_offset = offset + start; + RETURN_NOT_OK(AppendListSliceDimensions(slice_length, + out_list_builder)); + RETURN_NOT_OK(AppendListSliceValues(start_offset, step, value_count, null_padding, + values_array, value_builder)); } + offset += list_size; } return Status::OK(); } template - static Status BuildArrayFromListType(const ExecSpan& batch, - const ListSliceOptions& opts, - ArrayBuilder& builder) { + static Status BuildArrayFromVarLenListLikeType(int64_t start, int64_t step, + std::optional stop, + const ExecSpan& batch, + BuilderType* out_list_builder) { + constexpr bool kIsListViewInput = is_list_view(InListType::type_id); + constexpr bool kIsFixedSizeOutput = std::is_same_v; const ArraySpan& list_array = batch[0].array; - const offset_type* offsets = list_array.GetValues(1); - - const ArraySpan& list_values = list_array.child_data[0]; - - auto list_builder = checked_cast(&builder); - for (auto i = 0; i < list_array.length; ++i) { + const ArraySpan& values_array = list_array.child_data[0]; + ArrayBuilder* value_builder = out_list_builder->value_builder(); + + const auto* is_valid = list_array.GetValues(0, 0); + const auto* offsets = list_array.GetValues(1); + const offset_type* sizes = nullptr; + if constexpr (kIsListViewInput) { + sizes = list_array.GetValues(2); + } + for 
(int64_t i = 0; i < list_array.length; ++i) { const offset_type offset = offsets[i]; - const offset_type next_offset = offsets[i + 1]; - if (list_array.IsNull(i)) { - RETURN_NOT_OK(list_builder->AppendNull()); + const int64_t list_size = kIsListViewInput ? sizes[i] : offsets[i + 1] - offset; + if (is_valid && !bit_util::GetBit(is_valid, list_array.offset + i)) { + RETURN_NOT_OK(out_list_builder->AppendNull()); } else { - RETURN_NOT_OK(SetValues(list_builder, offset, next_offset, &opts, - &list_values)); + int64_t effective_stop = stop.value_or(list_size); + int64_t slice_length, value_count; + int64_t null_padding = 0; + if constexpr (kIsFixedSizeOutput) { + if (list_size < effective_stop) { + slice_length = ListSliceLength(start, step, effective_stop); + value_count = ListSliceLength(start, step, list_size); + DCHECK_LE(value_count, slice_length); + null_padding = slice_length - value_count; + } else { + slice_length = ListSliceLength(start, step, effective_stop); + value_count = slice_length; + } + } else { + slice_length = + ListSliceLength(start, step, std::min(list_size, effective_stop)); + value_count = slice_length; + } + RETURN_NOT_OK(AppendListSliceDimensions(slice_length, + out_list_builder)); + RETURN_NOT_OK(AppendListSliceValues(offset + start, step, value_count, + null_padding, values_array, value_builder)); } } return Status::OK(); } - template - static Status SetValues(BuilderType* list_builder, const offset_type offset, - const offset_type next_offset, const ListSliceOptions* opts, - const ArraySpan* list_values) { - auto value_builder = list_builder->value_builder(); - auto cursor = offset; - - RETURN_NOT_OK(list_builder->Append()); - const auto size = opts->stop.has_value() ? 
(opts->stop.value() - opts->start) - : ((next_offset - opts->start) - offset); - while (cursor < offset + size) { - if (cursor + opts->start >= next_offset) { - if constexpr (!std::is_same_v) { - break; // don't pad nulls for variable sized list output - } - RETURN_NOT_OK(value_builder->AppendNull()); - } else { + + template + static Status AppendListSliceDimensions(int64_t slice_length, + BuilderType* out_list_builder) { + if constexpr (kIsFixedSizeOutput) { + DCHECK_EQ(out_list_builder->type()->id(), Type::FIXED_SIZE_LIST); + return out_list_builder->Append(); + } else { + return out_list_builder->Append(/*is_valid=*/true, slice_length); + } + } + + /// \param value_count The pre-validated number of values to append starting + /// from `start_offset` with a step of `step` + /// \param null_padding The number of nulls to append after the values + static Status AppendListSliceValues(int64_t start_offset, int64_t step, + int64_t value_count, int64_t null_padding, + const ArraySpan& values_array, + ArrayBuilder* out_value_builder) { + if (step == 1) { + RETURN_NOT_OK( + out_value_builder->AppendArraySlice(values_array, start_offset, value_count)); + } else { + auto cursor_offset = start_offset; + for (int64_t i = 0; i < value_count; i++) { RETURN_NOT_OK( - value_builder->AppendArraySlice(*list_values, cursor + opts->start, 1)); + out_value_builder->AppendArraySlice(values_array, cursor_offset, 1)); + cursor_offset += step; } - cursor += static_cast(opts->step); + } + if (null_padding > 0) { + RETURN_NOT_OK(out_value_builder->AppendNulls(null_padding)); } return Status::OK(); } @@ -246,36 +394,8 @@ struct ListSlice { Result MakeListSliceResolve(KernelContext* ctx, const std::vector& types) { const auto& opts = OptionsWrapper::Get(ctx); - const auto list_type = checked_cast(types[0].type); - const auto value_type = list_type->field(0); - const auto return_fixed_size_list = - opts.return_fixed_size_list.value_or(list_type->id() == Type::FIXED_SIZE_LIST); - if 
(return_fixed_size_list) { - int32_t stop; - if (!opts.stop.has_value()) { - if (list_type->id() == Type::FIXED_SIZE_LIST) { - stop = checked_cast(list_type)->list_size(); - } else { - return Status::NotImplemented( - "Unable to produce FixedSizeListArray from non-FixedSizeListArray without " - "`stop` being set."); - } - } else { - stop = static_cast(opts.stop.value()); - } - const auto size = std::max(static_cast(stop - opts.start), 0); - if (opts.step < 1) { - return Status::Invalid("`step` must be >= 1, got: ", opts.step); - } - const auto length = bit_util::CeilDiv(size, opts.step); - return fixed_size_list(value_type, static_cast(length)); - } else { - // Returning large list if that's what we got in and didn't ask for fixed size - if (list_type->id() == Type::LARGE_LIST) { - return large_list(value_type); - } - return list(value_type); - } + const auto* list_type = checked_cast(types[0].type); + return ListSliceOutputType(opts, *list_type); } template @@ -293,6 +413,8 @@ void AddListSliceKernels(ScalarFunction* func) { AddListSliceKernels(func); AddListSliceKernels(func); AddListSliceKernels(func); + AddListSliceKernels(func); + AddListSliceKernels(func); } const FunctionDoc list_slice_doc( @@ -399,6 +521,8 @@ void AddListElementKernels(ScalarFunction* func) { void AddListElementKernels(ScalarFunction* func) { AddListElementKernels(func); AddListElementKernels(func); + AddListElementKernels(func); + AddListElementKernels(func); AddListElementKernels(func); } @@ -824,12 +948,7 @@ const FunctionDoc map_lookup_doc{ void RegisterScalarNested(FunctionRegistry* registry) { auto list_value_length = std::make_shared( "list_value_length", Arity::Unary(), list_value_length_doc); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::LIST)}, int32(), - ListValueLength)); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::FIXED_SIZE_LIST)}, int32(), - FixedSizeListValueLength)); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::LARGE_LIST)}, int64(), - 
ListValueLength)); + AddListValueLengthKernels(list_value_length.get()); DCHECK_OK(registry->AddFunction(std::move(list_value_length))); auto list_element = diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index a72ec99620b82..b6a6cac1b4382 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -30,11 +30,21 @@ namespace arrow { namespace compute { static std::shared_ptr GetOffsetType(const DataType& type) { - return type.id() == Type::LIST ? int32() : int64(); + switch (type.id()) { + case Type::LIST: + case Type::LIST_VIEW: + return int32(); + case Type::LARGE_LIST: + case Type::LARGE_LIST_VIEW: + return int64(); + default: + Unreachable("Unexpected type"); + } } TEST(TestScalarNested, ListValueLength) { - for (auto ty : {list(int32()), large_list(int32())}) { + for (auto ty : {list(int32()), large_list(int32()), list_view(int32()), + large_list_view(int32())}) { CheckScalarUnary("list_value_length", ty, "[[0, null, 1], null, [2, 3], []]", GetOffsetType(*ty), "[3, null, 2, 0]"); } @@ -47,7 +57,8 @@ TEST(TestScalarNested, ListValueLength) { TEST(TestScalarNested, ListElementNonFixedListWithNulls) { auto sample = "[[7, 5, 81], [6, null, 4, 7, 8], [3, 12, 2, 0], [1, 9], null]"; for (auto ty : NumericTypes()) { - for (auto list_type : {list(ty), large_list(ty)}) { + for (auto list_type : + {list(ty), large_list(ty), list_view(ty), large_list_view(ty)}) { auto input = ArrayFromJSON(list_type, sample); auto null_input = ArrayFromJSON(list_type, "[null]"); for (auto index_type : IntTypes()) { @@ -117,33 +128,45 @@ TEST(TestScalarNested, ListElementInvalid) { Raises(StatusCode::Invalid)); } +using VarLenListLikeTypeFactory = + std::shared_ptr (*)(std::shared_ptr); +static const VarLenListLikeTypeFactory kVarLenListTypeFactories[] = { + list, + large_list, + list_view, + large_list_view, +}; + TEST(TestScalarNested, 
ListSliceVariableOutput) { const auto value_types = {float32(), int32()}; for (auto value_type : value_types) { - auto input = ArrayFromJSON(list(value_type), "[[1, 2, 3], [4, 5], [6], null]"); - ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1, - /*return_fixed_size_list=*/false); - auto expected = ArrayFromJSON(list(value_type), "[[1, 2], [4, 5], [6], null]"); - CheckScalarUnary("list_slice", input, expected, &args); + for (auto list_type_factory : kVarLenListTypeFactories) { + ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1, + /*return_fixed_size_list=*/false); + auto list_ty = list_type_factory(value_type); + auto input = ArrayFromJSON(list_ty, "[[1, 2, 3], [4, 5], [6], null]"); + auto expected = ArrayFromJSON(list_ty, "[[1, 2], [4, 5], [6], null]"); + CheckScalarUnary("list_slice", input, expected, &args); - args.start = 1; - expected = ArrayFromJSON(list(value_type), "[[2], [5], [], null]"); - CheckScalarUnary("list_slice", input, expected, &args); + args.start = 1; + expected = ArrayFromJSON(list_ty, "[[2], [5], [], null]"); + CheckScalarUnary("list_slice", input, expected, &args); - args.start = 2; - args.stop = 4; - expected = ArrayFromJSON(list(value_type), "[[3], [], [], null]"); - CheckScalarUnary("list_slice", input, expected, &args); + args.start = 2; + args.stop = 4; + expected = ArrayFromJSON(list_ty, "[[3], [], [], null]"); + CheckScalarUnary("list_slice", input, expected, &args); - args.start = 1; - args.stop = std::nullopt; - expected = ArrayFromJSON(list(value_type), "[[2, 3], [5], [], null]"); - CheckScalarUnary("list_slice", input, expected, &args); + args.start = 1; + args.stop = std::nullopt; + expected = ArrayFromJSON(list_ty, "[[2, 3], [5], [], null]"); + CheckScalarUnary("list_slice", input, expected, &args); - args.start = 0; - args.stop = 4; - args.step = 2; - expected = ArrayFromJSON(list(value_type), "[[1, 3], [4], [6], null]"); + args.start = 0; + args.stop = 4; + args.step = 2; + expected = ArrayFromJSON(list_ty, 
"[[1, 3], [4], [6], null]"); + } } // Verify passing `return_fixed_size_list=false` with fixed size input @@ -158,9 +181,13 @@ TEST(TestScalarNested, ListSliceVariableOutput) { TEST(TestScalarNested, ListSliceFixedOutput) { const auto value_types = {float32(), int32()}; for (auto value_type : value_types) { - auto inputs = {ArrayFromJSON(list(value_type), "[[1, 2, 3], [4, 5], [6], null]"), - ArrayFromJSON(fixed_size_list(value_type, 3), - "[[1, 2, 3], [4, 5, null], [6, null, null], null]")}; + const char* kVarLenListJSON = "[[1, 2, 3], [4, 5], [6], null]"; + const char* kFixedSizeListJSON = "[[1, 2, 3], [4, 5, null], [6, null, null], null]"; + std::vector> inputs; + for (auto list_type_factory : kVarLenListTypeFactories) { + inputs.push_back(ArrayFromJSON(list_type_factory(value_type), kVarLenListJSON)); + } + inputs.push_back(ArrayFromJSON(fixed_size_list(value_type, 3), kFixedSizeListJSON)); for (auto input : inputs) { ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1, /*return_fixed_size_list=*/true); @@ -187,7 +214,7 @@ TEST(TestScalarNested, ListSliceFixedOutput) { CheckScalarUnary("list_slice", input, expected, &args); } else { EXPECT_RAISES_WITH_MESSAGE_THAT( - NotImplemented, + Invalid, ::testing::HasSubstr("Unable to produce FixedSizeListArray from " "non-FixedSizeListArray without `stop` being set."), CallFunction("list_slice", {input}, &args)); @@ -253,22 +280,25 @@ TEST(TestScalarNested, ListSliceChildArrayOffset) { ASSERT_EQ(input->offset(), 0); ASSERT_EQ(input->values()->offset(), 2); - ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1, + ListSliceOptions args(/*start=*/0, /*stop=*/3, /*step=*/1, /*return_fixed_size_list=*/false); auto expected = ArrayFromJSON(list(int8()), "[[2], [3, 4]]"); CheckScalarUnary("list_slice", input, expected, &args); args.return_fixed_size_list = true; - expected = ArrayFromJSON(fixed_size_list(int8(), 2), "[[2, null], [3, 4]]"); + expected = ArrayFromJSON(fixed_size_list(int8(), 3), "[[2, null, null], [3, 
4, null]]"); CheckScalarUnary("list_slice", input, expected, &args); } TEST(TestScalarNested, ListSliceOutputEqualsInputType) { + const char* kVarLenListJSON = "[[1, 2, 3], [4, 5], [6, null], null]"; + const char* kFixedLenListJSON = "[[1, 2], [4, 5], [6, null], null]"; // Default is to return same type as the one passed in. - auto inputs = { - ArrayFromJSON(list(int8()), "[[1, 2, 3], [4, 5], [6, null], null]"), - ArrayFromJSON(large_list(int8()), "[[1, 2, 3], [4, 5], [6, null], null]"), - ArrayFromJSON(fixed_size_list(int8(), 2), "[[1, 2], [4, 5], [6, null], null]")}; + std::vector> inputs; + for (auto list_type_factory : kVarLenListTypeFactories) { + inputs.push_back(ArrayFromJSON(list_type_factory(int8()), kVarLenListJSON)); + } + inputs.push_back(ArrayFromJSON(fixed_size_list(int8(), 2), kFixedLenListJSON)); for (auto input : inputs) { ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1); auto expected = ArrayFromJSON(input->type(), "[[1, 2], [4, 5], [6, null], null]"); @@ -305,10 +335,9 @@ TEST(TestScalarNested, ListSliceBadParameters) { // stop not set and FixedSizeList requested with variable sized input args.stop = std::nullopt; EXPECT_RAISES_WITH_MESSAGE_THAT( - NotImplemented, - ::testing::HasSubstr("NotImplemented: Unable to produce FixedSizeListArray from " - "non-FixedSizeListArray without " - "`stop` being set."), + Invalid, + ::testing::HasSubstr("Invalid: Unable to produce FixedSizeListArray from " + "non-FixedSizeListArray without `stop` being set."), CallFunction("list_slice", {input}, &args)); // Catch step must be >= 1 args.start = 0; diff --git a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc index 038e623b43c53..fecd57412b436 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc @@ -1142,9 +1142,13 @@ struct AsciiPadTransform : public StringTransformBase { int64_t left = 0; int64_t right = 0; if (PadLeft && 
PadRight) { - // If odd number of spaces, put the extra space on the right - left = spaces / 2; - right = spaces - left; + if (options_.lean_left_on_odd_padding) { + left = spaces / 2; + right = spaces - left; + } else { + right = spaces / 2; + left = spaces - right; + } } else if (PadLeft) { left = spaces; } else if (PadRight) { @@ -1315,7 +1319,7 @@ struct RegexSubstringMatcher { const MatchSubstringOptions& options, bool is_utf8 = true, bool literal = false) { auto matcher = std::make_unique(options, is_utf8, literal); RETURN_NOT_OK(RegexStatus(matcher->regex_match_)); - return std::move(matcher); + return matcher; } explicit RegexSubstringMatcher(const MatchSubstringOptions& options, @@ -1685,7 +1689,7 @@ struct FindSubstringRegex { bool is_utf8 = true, bool literal = false) { auto matcher = FindSubstringRegex(options, is_utf8, literal); RETURN_NOT_OK(RegexStatus(*matcher.regex_match_)); - return std::move(matcher); + return matcher; } explicit FindSubstringRegex(const MatchSubstringOptions& options, bool is_utf8 = true, @@ -1832,7 +1836,7 @@ struct CountSubstringRegex { bool is_utf8 = true, bool literal = false) { CountSubstringRegex counter(options, is_utf8, literal); RETURN_NOT_OK(RegexStatus(*counter.regex_match_)); - return std::move(counter); + return counter; } template @@ -2055,7 +2059,7 @@ struct RegexSubstringReplacer { std::move(replacement_error)); } - return std::move(replacer); + return replacer; } // Using RE2::FindAndConsume we can only find the pattern if it is a group, therefore @@ -2203,7 +2207,7 @@ struct ExtractRegexData { } data.group_names.emplace_back(item->second); } - return std::move(data); + return data; } Result ResolveOutputType(const std::vector& types) const { diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 26289a7f787e1..0a2261290846a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ 
b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -1988,6 +1988,11 @@ TYPED_TEST(TestBaseBinaryKernels, ExtractRegexInvalid) { #endif TYPED_TEST(TestStringKernels, Strptime) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "Skipping some strptime tests due to emscripten bug " + "https://github.com/emscripten-core/emscripten/issues/20466"; +#endif + std::string input1 = R"(["5/1/2020", null, null, "12/13/1900", null])"; std::string input2 = R"(["5-1-2020", "12/13/1900"])"; std::string input3 = R"(["5/1/2020", "AA/BB/CCCC"])"; @@ -2008,6 +2013,7 @@ TYPED_TEST(TestStringKernels, Strptime) { this->CheckUnary("strptime", input4, unit, output4, &options); options.format = "%m/%d/%Y %%z"; + // emscripten bug https://github.com/emscripten-core/emscripten/issues/20466 this->CheckUnary("strptime", input5, unit, output1, &options); options.error_is_null = false; @@ -2019,6 +2025,11 @@ TYPED_TEST(TestStringKernels, Strptime) { } TYPED_TEST(TestStringKernels, StrptimeZoneOffset) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() + << "Emscripten bug https://github.com/emscripten-core/emscripten/issues/20467"; +#endif + if (!arrow::internal::kStrptimeSupportsZone) { GTEST_SKIP() << "strptime does not support %z on this platform"; } @@ -2106,6 +2117,12 @@ TYPED_TEST(TestStringKernels, PadUTF8) { R"([null, "a\u2008\u2008\u2008\u2008", "bb\u2008\u2008\u2008", "b\u00E1r\u2008\u2008", "foobar"])", &options); + PadOptions options2{/*width=*/5, "\xe2\x80\x88", /*lean_left_on_odd_padding=*/false}; + this->CheckUnary( + "utf8_center", R"([null, "a", "bb", "b\u00E1r", "foobar"])", this->type(), + R"([null, "\u2008\u2008a\u2008\u2008", "\u2008\u2008bb\u2008", "\u2008b\u00E1r\u2008", "foobar"])", + &options2); + PadOptions options_bad{/*width=*/3, /*padding=*/"spam"}; auto input = ArrayFromJSON(this->type(), R"(["foo"])"); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, @@ -2448,6 +2465,10 @@ TYPED_TEST(TestStringKernels, PadAscii) { this->CheckUnary("ascii_rpad", R"([null, "a", "bb", "bar", "foobar"])", 
this->type(), R"([null, "a ", "bb ", "bar ", "foobar"])", &options); + PadOptions options2{/*width=*/5, " ", /*lean_left_on_odd_padding=*/false}; + this->CheckUnary("ascii_center", R"([null, "a", "bb", "bar", "foobar"])", this->type(), + R"([null, " a ", " bb ", " bar ", "foobar"])", &options2); + PadOptions options_bad{/*width=*/3, /*padding=*/"spam"}; auto input = ArrayFromJSON(this->type(), R"(["foo"])"); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, diff --git a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc index d720d4eee804f..42762ca8b116f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc @@ -930,9 +930,13 @@ struct Utf8PadTransform : public StringTransformBase { int64_t left = 0; int64_t right = 0; if (PadLeft && PadRight) { - // If odd number of spaces, put the extra space on the right - left = spaces / 2; - right = spaces - left; + if (options_.lean_left_on_odd_padding) { + left = spaces / 2; + right = spaces - left; + } else { + right = spaces / 2; + left = spaces - right; + } } else if (PadLeft) { left = spaces; } else if (PadRight) { diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index 8dac6525fe2e6..8da8c760ea22b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -2143,7 +2143,10 @@ TEST_F(ScalarTemporalTest, StrftimeCLocale) { TEST_F(ScalarTemporalTest, StrftimeOtherLocale) { #ifdef _WIN32 GTEST_SKIP() << "There is a known bug in strftime for locales on Windows (ARROW-15922)"; -#else +#elif defined(__EMSCRIPTEN__) + GTEST_SKIP() << "Emscripten doesn't build with multiple locales as default"; +#endif + if (!LocaleExists("fr_FR.UTF-8")) { GTEST_SKIP() << "locale 'fr_FR.UTF-8' doesn't exist on this system"; } @@ -2155,10 +2158,12 @@ TEST_F(ScalarTemporalTest, 
StrftimeOtherLocale) { ["01 janvier 1970 00:00:59,123", "18 août 2021 15:11:50,456", null])"; CheckScalarUnary("strftime", timestamp(TimeUnit::MILLI, "UTC"), milliseconds, utf8(), expected, &options); -#endif } TEST_F(ScalarTemporalTest, StrftimeInvalidLocale) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "Emscripten doesn't build with multiple locales as default"; +#endif auto options = StrftimeOptions("%d %B %Y %H:%M:%S", "nonexistent"); const char* seconds = R"(["1970-01-01T00:00:59", null])"; auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), seconds); diff --git a/cpp/src/arrow/compute/kernels/test_util.cc b/cpp/src/arrow/compute/kernels/test_util.cc index 23d0fd18d578a..2217787663a63 100644 --- a/cpp/src/arrow/compute/kernels/test_util.cc +++ b/cpp/src/arrow/compute/kernels/test_util.cc @@ -31,6 +31,7 @@ #include "arrow/datum.h" #include "arrow/result.h" #include "arrow/table.h" +#include "arrow/testing/fixed_width_test_util.h" #include "arrow/testing/gtest_util.h" namespace arrow { diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 800deba3a5ed2..5067298858132 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -530,7 +530,8 @@ Result> HashInit(KernelContext* ctx, auto result = std::make_unique(args.inputs[0].GetSharedPtr(), args.options, ctx->memory_pool()); RETURN_NOT_OK(result->Reset()); - return std::move(result); + // R build with openSUSE155 requires an explicit unique_ptr construction + return std::unique_ptr(std::move(result)); } template @@ -697,13 +698,12 @@ void AddHashKernels(VectorFunction* func, VectorKernel base, OutputType out_ty) DCHECK_OK(func->AddKernel(base)); } - // Example parametric types that we want to match only on Type::type - auto parametric_types = {time32(TimeUnit::SECOND), time64(TimeUnit::MICRO), - timestamp(TimeUnit::SECOND), duration(TimeUnit::SECOND), - fixed_size_binary(0)}; - for (const auto& ty : 
parametric_types) { - base.init = GetHashInit(ty->id()); - base.signature = KernelSignature::Make({ty->id()}, out_ty); + // Parametric types that we want matching to be dependent only on type id + auto parametric_types = {Type::TIME32, Type::TIME64, Type::TIMESTAMP, Type::DURATION, + Type::FIXED_SIZE_BINARY}; + for (const auto& type_id : parametric_types) { + base.init = GetHashInit(type_id); + base.signature = KernelSignature::Make({type_id}, out_ty); DCHECK_OK(func->AddKernel(base)); } diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index 08930e589f7b4..955f9b8cbd14c 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -18,19 +18,32 @@ // Vector kernels involving nested types #include "arrow/array/array_base.h" +#include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" +#include "arrow/util/bit_run_reader.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/list_util.h" #include "arrow/visit_type_inline.h" namespace arrow { + +using internal::CountSetBits; +using list_util::internal::RangeOfValuesUsed; + namespace compute { namespace internal { namespace { template Status ListFlatten(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + auto recursive = OptionsWrapper::Get(ctx).recursive; typename TypeTraits::ArrayType list_array(batch[0].array.ToArrayData()); - ARROW_ASSIGN_OR_RAISE(auto result, list_array.Flatten(ctx->memory_pool())); + + auto pool = ctx->memory_pool(); + ARROW_ASSIGN_OR_RAISE(auto result, (recursive ? 
list_array.FlattenRecursively(pool) + : list_array.Flatten(pool))); + out->value = std::move(result->data()); return Status::OK(); } @@ -70,6 +83,63 @@ struct ListParentIndicesArray { Status Visit(const LargeListType& type) { return VisitList(type); } + template + Status VisitListView(const Type&) { + ArraySpan list_view{*input}; + + const offset_type* offsets = list_view.GetValues(1); + const offset_type* sizes = list_view.GetValues(2); + int64_t values_offset; + int64_t values_length; + ARROW_ASSIGN_OR_RAISE(std::tie(values_offset, values_length), + RangeOfValuesUsed(list_view)); + + ARROW_ASSIGN_OR_RAISE(auto indices_validity, + AllocateEmptyBitmap(values_length, ctx->memory_pool())); + auto* out_indices_validity = indices_validity->mutable_data(); + int64_t total_pop_count = 0; + + ARROW_ASSIGN_OR_RAISE(auto indices, ctx->Allocate(values_length * sizeof(int64_t))); + auto* out_indices = indices->template mutable_data_as(); + memset(out_indices, -1, values_length * sizeof(int64_t)); + + const auto* validity = list_view.GetValues(0, 0); + RETURN_NOT_OK(arrow::internal::VisitSetBitRuns( + validity, list_view.offset, list_view.length, + [this, offsets, sizes, out_indices, out_indices_validity, values_offset, + &total_pop_count](int64_t run_start, int64_t run_length) { + for (int64_t i = run_start; i < run_start + run_length; ++i) { + auto validity_offset = offsets[i] - values_offset; + const int64_t pop_count = + CountSetBits(out_indices_validity, validity_offset, sizes[i]); + if (ARROW_PREDICT_FALSE(pop_count > 0)) { + return Status::Invalid( + "Function 'list_parent_indices' cannot produce parent indices for " + "values used by more than one list-view array element."); + } + bit_util::SetBitmap(out_indices_validity, validity_offset, sizes[i]); + total_pop_count += sizes[i]; + for (auto j = static_cast(offsets[i]); + j < static_cast(offsets[i]) + sizes[i]; ++j) { + out_indices[j - values_offset] = i + base_output_offset; + } + } + return Status::OK(); + })); + + 
DCHECK_LE(total_pop_count, values_length); + const int64_t null_count = values_length - total_pop_count; + BufferVector buffers{null_count > 0 ? std::move(indices_validity) : nullptr, + std::move(indices)}; + out = std::make_shared(int64(), values_length, std::move(buffers), + null_count); + return Status::OK(); + } + + Status Visit(const ListViewType& type) { return VisitListView(type); } + + Status Visit(const LargeListViewType& type) { return VisitListView(type); } + Status Visit(const FixedSizeListType& type) { using offset_type = typename FixedSizeListType::offset_type; const offset_type slot_length = type.list_size(); @@ -107,14 +177,19 @@ struct ListParentIndicesArray { const FunctionDoc list_flatten_doc( "Flatten list values", - ("`lists` must have a list-like type.\n" - "Return an array with the top list level flattened.\n" - "Top-level null values in `lists` do not emit anything in the input."), - {"lists"}); + ("`lists` must have a list-like type (lists, list-views, and\n" + "fixed-size lists).\n" + "Return an array with the top list level flattened unless\n" + "`recursive` is set to true in ListFlattenOptions. 
When that\n" + "is that case, flattening happens recursively until a non-list\n" + "array is formed.\n" + "\n" + "Null list values do not emit anything to the output."), + {"lists"}, "ListFlattenOptions"); const FunctionDoc list_parent_indices_doc( "Compute parent indices of nested list values", - ("`lists` must have a list-like type.\n" + ("`lists` must have a list-like or list-view type.\n" "For each value in each list of `lists`, the top-level list index\n" "is emitted."), {"lists"}); @@ -136,6 +211,7 @@ class ListParentIndicesFunction : public MetaFunction { int64_t base_output_offset = 0; ArrayVector out_chunks; + out_chunks.reserve(input->num_chunks()); for (const auto& chunk : input->chunks()) { ARROW_ASSIGN_OR_RAISE(auto out_chunk, ListParentIndicesArray::Exec(&kernel_ctx, chunk->data(), @@ -153,17 +229,34 @@ class ListParentIndicesFunction : public MetaFunction { } }; +const ListFlattenOptions* GetDefaultListFlattenOptions() { + static const auto kDefaultListFlattenOptions = ListFlattenOptions::Defaults(); + return &kDefaultListFlattenOptions; +} + +template +void AddBaseListFlattenKernels(VectorFunction* func) { + auto in_type = {InputType(InListType::type_id)}; + auto out_type = OutputType(ListValuesType); + VectorKernel kernel(in_type, out_type, ListFlatten, + OptionsWrapper::Init); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +void AddBaseListFlattenKernels(VectorFunction* func) { + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); +} + } // namespace void RegisterVectorNested(FunctionRegistry* registry) { - auto flatten = - std::make_shared("list_flatten", Arity::Unary(), list_flatten_doc); - DCHECK_OK(flatten->AddKernel({Type::LIST}, OutputType(ListValuesType), - ListFlatten)); - DCHECK_OK(flatten->AddKernel({Type::FIXED_SIZE_LIST}, OutputType(ListValuesType), - ListFlatten)); - 
DCHECK_OK(flatten->AddKernel({Type::LARGE_LIST}, OutputType(ListValuesType), - ListFlatten)); + auto flatten = std::make_shared( + "list_flatten", Arity::Unary(), list_flatten_doc, GetDefaultListFlattenOptions()); + AddBaseListFlattenKernels(flatten.get()); DCHECK_OK(registry->AddFunction(std::move(flatten))); DCHECK_OK(registry->AddFunction(std::make_shared())); diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc index eef1b6835ffb5..da751fa5de403 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc @@ -19,6 +19,7 @@ #include "arrow/chunked_array.h" #include "arrow/compute/api.h" +#include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/test_util.h" #include "arrow/result.h" #include "arrow/testing/gtest_util.h" @@ -29,38 +30,113 @@ namespace compute { using arrow::internal::checked_cast; -TEST(TestVectorNested, ListFlatten) { - for (auto ty : {list(int16()), large_list(int16())}) { - auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], []]"); - auto expected = ArrayFromJSON(int16(), "[0, null, 1, 2, 3]"); +using ListAndListViewTypes = + ::testing::Types; + +// ---------------------------------------------------------------------- +// [Large]List and [Large]ListView tests +template +class TestVectorNestedSpecialized : public ::testing::Test { + public: + using TypeClass = T; + + void SetUp() override { + value_type_ = int16(); + type_ = std::make_shared(value_type_); + } + + public: + void TestListFlatten() { + auto input = ArrayFromJSON(type_, "[[0, null, 1], null, [2, 3], []]"); + auto expected = ArrayFromJSON(value_type_, "[0, null, 1, 2, 3]"); CheckVectorUnary("list_flatten", input, expected); // Construct a list with a non-empty null slot auto tweaked = TweakValidityBit(input, 0, false); - expected = ArrayFromJSON(int16(), "[2, 3]"); + expected = ArrayFromJSON(value_type_, "[2, 3]"); 
CheckVectorUnary("list_flatten", tweaked, expected); } -} -TEST(TestVectorNested, ListFlattenNulls) { - const auto ty = list(int32()); - auto input = ArrayFromJSON(ty, "[null, null]"); - auto expected = ArrayFromJSON(int32(), "[]"); - CheckVectorUnary("list_flatten", input, expected); -} + void TestListFlattenNulls() { + value_type_ = int32(); + type_ = std::make_shared(value_type_); + auto input = ArrayFromJSON(type_, "[null, null]"); + auto expected = ArrayFromJSON(value_type_, "[]"); + CheckVectorUnary("list_flatten", input, expected); + } -TEST(TestVectorNested, ListFlattenChunkedArray) { - for (auto ty : {list(int16()), large_list(int16())}) { - ARROW_SCOPED_TRACE(ty->ToString()); - auto input = ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], []]"}); - auto expected = ChunkedArrayFromJSON(int16(), {"[0, null, 1]", "[2, 3]"}); + void TestListFlattenChunkedArray() { + ARROW_SCOPED_TRACE(type_->ToString()); + auto input = ChunkedArrayFromJSON(type_, {"[[0, null, 1], null]", "[[2, 3], []]"}); + auto expected = ChunkedArrayFromJSON(value_type_, {"[0, null, 1]", "[2, 3]"}); CheckVectorUnary("list_flatten", input, expected); ARROW_SCOPED_TRACE("empty"); - input = ChunkedArrayFromJSON(ty, {}); - expected = ChunkedArrayFromJSON(int16(), {}); + input = ChunkedArrayFromJSON(type_, {}); + expected = ChunkedArrayFromJSON(value_type_, {}); CheckVectorUnary("list_flatten", input, expected); } + + void TestListFlattenRecursively() { + auto inner_type = std::make_shared(value_type_); + type_ = std::make_shared(inner_type); + + ListFlattenOptions opts; + opts.recursive = true; + + // List types with two nesting levels: list> + auto input = ArrayFromJSON(type_, R"([ + [[0, 1, 2], null, [3, null]], + [null], + [[2, 9], [4], [], [6, 5]] + ])"); + auto expected = ArrayFromJSON(value_type_, "[0, 1, 2, 3, null, 2, 9, 4, 6, 5]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + + // Empty nested list should flatten until non-list type is reached + input = 
ArrayFromJSON(type_, R"([null])"); + expected = ArrayFromJSON(value_type_, "[]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + + // List types with three nesting levels: list>> + type_ = std::make_shared(std::make_shared(fixed_size_list(value_type_, 2))); + input = ArrayFromJSON(type_, R"([ + [ + [[null, 0]], + [[3, 7], null] + ], + [ + [[4, null], [5, 8]], + [[8, null]], + null + ], + [ + null + ] + ])"); + expected = ArrayFromJSON(value_type_, "[null, 0, 3, 7, 4, null, 5, 8, 8, null]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + } + + protected: + std::shared_ptr type_; + std::shared_ptr value_type_; +}; + +TYPED_TEST_SUITE(TestVectorNestedSpecialized, ListAndListViewTypes); + +TYPED_TEST(TestVectorNestedSpecialized, ListFlatten) { this->TestListFlatten(); } + +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenNulls) { + this->TestListFlattenNulls(); +} + +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenChunkedArray) { + this->TestListFlattenChunkedArray(); +} + +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenRecursively) { + this->TestListFlattenRecursively(); } TEST(TestVectorNested, ListFlattenFixedSizeList) { @@ -92,8 +168,65 @@ TEST(TestVectorNested, ListFlattenFixedSizeListNulls) { CheckVectorUnary("list_flatten", input, expected); } +TEST(TestVectorNested, ListFlattenFixedSizeListRecursively) { + ListFlattenOptions opts; + opts.recursive = true; + + auto inner_type = fixed_size_list(int32(), 2); + auto type = fixed_size_list(inner_type, 2); + auto input = ArrayFromJSON(type, R"([ + [[0, 1], [null, 3]], + [[7, null], [2, 5]], + [null, null] + ])"); + auto expected = ArrayFromJSON(int32(), "[0, 1, null, 3, 7, null, 2, 5]"); + CheckVectorUnary("list_flatten", input, expected, &opts); +} + +template +void SwapListView(ArrayData* array, int64_t i, int64_t j) { + ASSERT_TRUE(is_list_view(array->type->id())); + ASSERT_EQ(array->type->id(), T::type_id); + ASSERT_LT(i, array->length); + ASSERT_LT(j, array->length); + 
auto* validity = array->GetMutableValues(0); + if (validity) { + const bool is_valid_i = bit_util::GetBit(validity, array->offset + i); + const bool is_valid_j = bit_util::GetBit(validity, array->offset + j); + if (is_valid_i ^ is_valid_j) { + bit_util::SetBitTo(validity, array->offset + i, is_valid_j); + bit_util::SetBitTo(validity, array->offset + j, is_valid_i); + } + } + auto* offsets = array->GetMutableValues(1); + auto* sizes = array->GetMutableValues(2); + std::swap(offsets[i], offsets[j]); + std::swap(sizes[i], sizes[j]); +} + +template +void SetListView(ArrayData* array, int64_t i, offset_type offset, offset_type size) { + ASSERT_TRUE(is_list_view(array->type->id())); + ASSERT_EQ(array->type->id(), T::type_id); + ASSERT_LT(i, array->length); + auto* validity = array->GetMutableValues(0); + if (validity) { + bit_util::SetBit(validity, array->offset + i); + } + auto* offsets = array->GetMutableValues(1); + auto* sizes = array->GetMutableValues(2); + offsets[i] = offset; + sizes[i] = size; +} + TEST(TestVectorNested, ListParentIndices) { - for (auto ty : {list(int16()), large_list(int16())}) { + const auto types = { + list(int16()), + large_list(int16()), + list_view(int16()), + large_list_view(int16()), + }; + for (auto ty : types) { auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]"); auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 2, 2, 4, 4]"); @@ -105,10 +238,47 @@ TEST(TestVectorNested, ListParentIndices) { auto tweaked = TweakValidityBit(input, 1, false); auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 1, 1, 2, 2, 4, 4]"); CheckVectorUnary("list_parent_indices", tweaked, expected); + + { + // Construct a list-view with a non-empty null slot + auto input = + ArrayFromJSON(list_view(int16()), "[[0, null, 1], [0, 0], [2, 3], [], [4, 5]]"); + auto tweaked = TweakValidityBit(input, 1, false); + auto expected = ArrayFromJSON(int64(), "[0, 0, 0, null, null, 2, 2, 4, 4]"); + CheckVectorUnary("list_parent_indices", tweaked, expected); 
+ + // Swap some list-view entries + auto swapped = tweaked->data()->Copy(); + SwapListView(swapped.get(), 0, 2); + SwapListView(swapped.get(), 1, 4); + AssertDatumsEqual( + swapped, + ArrayFromJSON(list_view(int16()), "[[2, 3], [4, 5], [0, null, 1], [], null]"), + /*verbose=*/true); + expected = ArrayFromJSON(int64(), "[2, 2, 2, null, null, 0, 0, 1, 1]"); + CheckVectorUnary("list_parent_indices", swapped, expected); + + // Make one view use values that are used by other list-views + SetListView(swapped.get(), 3, 1, 4); + AssertDatumsEqual( + swapped, + ArrayFromJSON(list_view(int16()), + "[[2, 3], [4, 5], [0, null, 1], [null, 1, 0, 0], null]"), + /*verbose=*/true); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("values used by more than one list-view"), + CallFunction("list_parent_indices", {input})); + } } TEST(TestVectorNested, ListParentIndicesChunkedArray) { - for (auto ty : {list(int16()), large_list(int16())}) { + const auto types = { + list(int16()), + large_list(int16()), + list_view(int16()), + large_list_view(int16()), + }; + for (auto ty : types) { auto input = ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], [], [4, 5]]"}); diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc index 64c3db204c9ee..b265673e23c86 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection.cc @@ -68,12 +68,10 @@ using TakeState = OptionsWrapper; // ---------------------------------------------------------------------- // DropNull Implementation -Result> GetDropNullFilter(const Array& values, - MemoryPool* memory_pool) { - auto bitmap_buffer = values.null_bitmap(); - std::shared_ptr out_array = std::make_shared( - values.length(), bitmap_buffer, nullptr, 0, values.offset()); - return out_array; +std::shared_ptr MakeDropNullFilter(const Array& values) { + auto& bitmap_buffer = values.null_bitmap(); + return 
std::make_shared(values.length(), bitmap_buffer, nullptr, 0, + values.offset()); } Result DropNullArray(const std::shared_ptr& values, ExecContext* ctx) { @@ -86,8 +84,7 @@ Result DropNullArray(const std::shared_ptr& values, ExecContext* c if (values->type()->id() == Type::type::NA) { return std::make_shared(0); } - ARROW_ASSIGN_OR_RAISE(auto drop_null_filter, - GetDropNullFilter(*values, ctx->memory_pool())); + auto drop_null_filter = Datum{MakeDropNullFilter(*values)}; return Filter(values, drop_null_filter, FilterOptions::Defaults(), ctx); } @@ -185,19 +182,16 @@ class DropNullMetaFunction : public MetaFunction { Result ExecuteImpl(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const override { - switch (args[0].kind()) { - case Datum::ARRAY: { - return DropNullArray(args[0].make_array(), ctx); - } break; - case Datum::CHUNKED_ARRAY: { - return DropNullChunkedArray(args[0].chunked_array(), ctx); - } break; - case Datum::RECORD_BATCH: { - return DropNullRecordBatch(args[0].record_batch(), ctx); - } break; - case Datum::TABLE: { - return DropNullTable(args[0].table(), ctx); - } break; + auto& values = args[0]; + switch (values.kind()) { + case Datum::ARRAY: + return DropNullArray(values.make_array(), ctx); + case Datum::CHUNKED_ARRAY: + return DropNullChunkedArray(values.chunked_array(), ctx); + case Datum::RECORD_BATCH: + return DropNullRecordBatch(values.record_batch(), ctx); + case Datum::TABLE: + return DropNullTable(values.table(), ctx); default: break; } diff --git a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc index 8825d697fdf77..bf67a474f31e2 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc @@ -22,6 +22,7 @@ #include #include +#include "arrow/array/concatenate.h" #include "arrow/array/data.h" #include "arrow/buffer_builder.h" #include 
"arrow/chunked_array.h" @@ -40,6 +41,7 @@ #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/fixed_width_internal.h" namespace arrow { @@ -158,9 +160,11 @@ class PrimitiveFilterImpl { PrimitiveFilterImpl(const ArraySpan& values, const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection, ArrayData* out_arr) - : byte_width_(values.type->byte_width()), + : byte_width_(util::FixedWidthInBytes(*values.type)), values_is_valid_(values.buffers[0].data), - values_data_(values.buffers[1].data), + // No offset applied for boolean because it's a bitmap + values_data_(kIsBoolean ? values.buffers[1].data + : util::OffsetPointerOfFixedByteWidthValues(values)), values_null_count_(values.null_count), values_offset_(values.offset), values_length_(values.length), @@ -169,17 +173,13 @@ class PrimitiveFilterImpl { if constexpr (kByteWidth >= 0 && !kIsBoolean) { DCHECK_EQ(kByteWidth, byte_width_); } - if constexpr (!kIsBoolean) { - // No offset applied for boolean because it's a bitmap - values_data_ += values.offset * byte_width(); - } + DCHECK_EQ(out_arr->offset, 0); if (out_arr->buffers[0] != nullptr) { // May be unallocated if neither filter nor values contain nulls out_is_valid_ = out_arr->buffers[0]->mutable_data(); } - out_data_ = out_arr->buffers[1]->mutable_data(); - DCHECK_EQ(out_arr->offset, 0); + out_data_ = util::MutableFixedWidthValuesPointer(out_arr); out_length_ = out_arr->length; out_position_ = 0; } @@ -416,7 +416,7 @@ class PrimitiveFilterImpl { out_position_ += length; } - constexpr int32_t byte_width() const { + constexpr int64_t byte_width() const { if constexpr (kByteWidth >= 0) { return kByteWidth; } else { @@ -425,7 +425,7 @@ class PrimitiveFilterImpl { } private: - int32_t byte_width_; + int64_t byte_width_; const uint8_t* values_is_valid_; const uint8_t* values_data_; int64_t values_null_count_; @@ -439,6 +439,8 @@ class PrimitiveFilterImpl { int64_t out_position_; 
}; +} // namespace + Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const ArraySpan& values = batch[0].array; const ArraySpan& filter = batch[1].array; @@ -468,9 +470,10 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult // validity bitmap. const bool allocate_validity = values.null_count != 0 || !filter_null_count_is_zero; - const int bit_width = values.type->bit_width(); - RETURN_NOT_OK(PreallocatePrimitiveArrayData(ctx, output_length, bit_width, - allocate_validity, out_arr)); + DCHECK(util::IsFixedWidthLike(values)); + const int64_t bit_width = util::FixedWidthInBits(*values.type); + RETURN_NOT_OK(util::internal::PreallocateFixedWidthArrayData( + ctx, output_length, /*source=*/values, allocate_validity, out_arr)); switch (bit_width) { case 1: @@ -505,6 +508,8 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult return Status::OK(); } +namespace { + // ---------------------------------------------------------------------- // Optimized filter for base binary types (32-bit and 64-bit) @@ -924,12 +929,26 @@ Result> FilterRecordBatch(const RecordBatch& batch, return Status::Invalid("Filter inputs must all be the same length"); } - // Convert filter to selection vector/indices and use Take + // Fetch filter const auto& filter_opts = *static_cast(options); - ARROW_ASSIGN_OR_RAISE( - std::shared_ptr indices, - GetTakeIndices(*filter.array(), filter_opts.null_selection_behavior, - ctx->memory_pool())); + ArrayData filter_array; + switch (filter.kind()) { + case Datum::ARRAY: + filter_array = *filter.array(); + break; + case Datum::CHUNKED_ARRAY: { + ARROW_ASSIGN_OR_RAISE(auto combined, Concatenate(filter.chunked_array()->chunks())); + filter_array = *combined->data(); + break; + } + default: + return Status::TypeError("Filter should be array-like"); + } + + // Convert filter to selection vector/indices and use Take + ARROW_ASSIGN_OR_RAISE(std::shared_ptr indices, + 
GetTakeIndices(filter_array, filter_opts.null_selection_behavior, + ctx->memory_pool())); std::vector> columns(batch.num_columns()); for (int i = 0; i < batch.num_columns(); ++i) { ARROW_ASSIGN_OR_RAISE(Datum out, Take(batch.column(i)->data(), Datum(indices), @@ -1038,7 +1057,6 @@ class FilterMetaFunction : public MetaFunction { } if (args[0].kind() == Datum::RECORD_BATCH) { - auto values_batch = args[0].record_batch(); ARROW_ASSIGN_OR_RAISE( std::shared_ptr out_batch, FilterRecordBatch(*args[0].record_batch(), args[1], options, ctx)); @@ -1083,6 +1101,8 @@ void PopulateFilterKernels(std::vector* out) { {InputType(Type::EXTENSION), plain_filter, ExtensionFilterExec}, {InputType(Type::LIST), plain_filter, ListFilterExec}, {InputType(Type::LARGE_LIST), plain_filter, LargeListFilterExec}, + {InputType(Type::LIST_VIEW), plain_filter, ListViewFilterExec}, + {InputType(Type::LARGE_LIST_VIEW), plain_filter, LargeListViewFilterExec}, {InputType(Type::FIXED_SIZE_LIST), plain_filter, FSLFilterExec}, {InputType(Type::DENSE_UNION), plain_filter, DenseUnionFilterExec}, {InputType(Type::SPARSE_UNION), plain_filter, SparseUnionFilterExec}, @@ -1101,6 +1121,8 @@ void PopulateFilterKernels(std::vector* out) { {InputType(Type::EXTENSION), ree_filter, ExtensionFilterExec}, {InputType(Type::LIST), ree_filter, ListFilterExec}, {InputType(Type::LARGE_LIST), ree_filter, LargeListFilterExec}, + {InputType(Type::LIST_VIEW), ree_filter, ListViewFilterExec}, + {InputType(Type::LARGE_LIST_VIEW), ree_filter, LargeListViewFilterExec}, {InputType(Type::FIXED_SIZE_LIST), ree_filter, FSLFilterExec}, {InputType(Type::DENSE_UNION), ree_filter, DenseUnionFilterExec}, {InputType(Type::SPARSE_UNION), ree_filter, SparseUnionFilterExec}, diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc index a0fe2808e3e4e..7189d42850e79 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc +++ 
b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc @@ -37,6 +37,7 @@ #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" +#include "arrow/util/fixed_width_internal.h" #include "arrow/util/int_util.h" #include "arrow/util/logging.h" #include "arrow/util/ree_util.h" @@ -65,24 +66,6 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc, DCHECK_OK(registry->AddFunction(std::move(func))); } -Status PreallocatePrimitiveArrayData(KernelContext* ctx, int64_t length, int bit_width, - bool allocate_validity, ArrayData* out) { - // Preallocate memory - out->length = length; - out->buffers.resize(2); - - if (allocate_validity) { - ARROW_ASSIGN_OR_RAISE(out->buffers[0], ctx->AllocateBitmap(length)); - } - if (bit_width == 1) { - ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->AllocateBitmap(length)); - } else { - ARROW_ASSIGN_OR_RAISE(out->buffers[1], - ctx->Allocate(bit_util::BytesForBits(length * bit_width))); - } - return Status::OK(); -} - namespace { /// \brief Iterate over a REE filter, emitting ranges of a plain values array that @@ -564,39 +547,6 @@ struct VarBinarySelectionImpl : public Selection, T } }; -struct FSBSelectionImpl : public Selection { - using Base = Selection; - LIFT_BASE_MEMBERS(); - - TypedBufferBuilder data_builder; - - FSBSelectionImpl(KernelContext* ctx, const ExecSpan& batch, int64_t output_length, - ExecResult* out) - : Base(ctx, batch, output_length, out), data_builder(ctx->memory_pool()) {} - - template - Status GenerateOutput() { - FixedSizeBinaryArray typed_values(this->values.ToArrayData()); - int32_t value_size = typed_values.byte_width(); - - RETURN_NOT_OK(data_builder.Reserve(value_size * output_length)); - Adapter adapter(this); - return adapter.Generate( - [&](int64_t index) { - auto val = typed_values.GetView(index); - data_builder.UnsafeAppend(reinterpret_cast(val.data()), - value_size); - return Status::OK(); - }, - [&]() { - 
data_builder.UnsafeAppend(value_size, static_cast(0x00)); - return Status::OK(); - }); - } - - Status Finish() override { return data_builder.Finish(&out->buffers[1]); } -}; - template struct ListSelectionImpl : public Selection, Type> { using offset_type = typename Type::offset_type; @@ -662,6 +612,63 @@ struct ListSelectionImpl : public Selection, Type> { } }; +template +struct ListViewSelectionImpl : public Selection, Type> { + using offset_type = typename Type::offset_type; + + using Base = Selection, Type>; + LIFT_BASE_MEMBERS(); + + TypedBufferBuilder offsets_builder; + TypedBufferBuilder sizes_builder; + + ListViewSelectionImpl(KernelContext* ctx, const ExecSpan& batch, int64_t output_length, + ExecResult* out) + : Base(ctx, batch, output_length, out), + offsets_builder(ctx->memory_pool()), + sizes_builder(ctx->memory_pool()) {} + + template + Status GenerateOutput() { + auto* offsets = this->values.template GetValues(1); + auto* sizes = this->values.template GetValues(2); + + offset_type null_list_view_offset = 0; + Adapter adapter(this); + RETURN_NOT_OK(adapter.Generate( + [&](int64_t index) { + offset_type value_offset = offsets[index]; + offset_type value_length = sizes[index]; + offsets_builder.UnsafeAppend(value_offset); + sizes_builder.UnsafeAppend(value_length); + null_list_view_offset = value_offset + value_length; + return Status::OK(); + }, + [&]() { + // 0 could be appended here, but by adding the last offset, we keep + // the buffer compatible with how offsets behave in ListType as well. + // The invariant that `offsets[i] + sizes[i] <= values.length` is + // trivially maintained by having `sizes[i]` set to 0 here. 
+ offsets_builder.UnsafeAppend(null_list_view_offset); + sizes_builder.UnsafeAppend(0); + return Status::OK(); + })); + return Status::OK(); + } + + Status Init() override { + RETURN_NOT_OK(offsets_builder.Reserve(output_length)); + return sizes_builder.Reserve(output_length); + } + + Status Finish() override { + RETURN_NOT_OK(offsets_builder.Finish(&out->buffers[1])); + RETURN_NOT_OK(sizes_builder.Finish(&out->buffers[2])); + out->child_data = {this->values.child_data[0].ToArrayData()}; + return Status::OK(); + } +}; + struct DenseUnionSelectionImpl : public Selection { using Base = Selection; @@ -908,7 +915,30 @@ Status LargeListFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult return FilterExec>(ctx, batch, out); } +Status ListViewFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + return FilterExec>(ctx, batch, out); +} + +Status LargeListViewFilterExec(KernelContext* ctx, const ExecSpan& batch, + ExecResult* out) { + return FilterExec>(ctx, batch, out); +} + Status FSLFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const ArraySpan& values = batch[0].array; + + // If a FixedSizeList wraps a fixed-width type we can, in some cases, use + // PrimitiveFilterExec for a fixed-size list array. + if (util::IsFixedWidthLike(values, + /*force_null_count=*/true, + /*exclude_bool_and_dictionary=*/true)) { + const auto byte_width = util::FixedWidthInBytes(*values.type); + // 0 is a valid byte width for FixedSizeList, but PrimitiveFilterExec + // might not handle it correctly. 
+ if (byte_width > 0) { + return PrimitiveFilterExec(ctx, batch, out); + } + } return FilterExec(ctx, batch, out); } @@ -942,23 +972,6 @@ Status LargeVarBinaryTakeExec(KernelContext* ctx, const ExecSpan& batch, return TakeExec>(ctx, batch, out); } -Status FSBTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - const ArraySpan& values = batch[0].array; - const auto byte_width = values.type->byte_width(); - // Use primitive Take implementation (presumably faster) for some byte widths - switch (byte_width) { - case 1: - case 2: - case 4: - case 8: - case 16: - case 32: - return PrimitiveTakeExec(ctx, batch, out); - default: - return TakeExec(ctx, batch, out); - } -} - Status ListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { return TakeExec>(ctx, batch, out); } @@ -967,7 +980,24 @@ Status LargeListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* return TakeExec>(ctx, batch, out); } +Status ListViewTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + return TakeExec>(ctx, batch, out); +} + +Status LargeListViewTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + return TakeExec>(ctx, batch, out); +} + Status FSLTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const ArraySpan& values = batch[0].array; + + // If a FixedSizeList wraps a fixed-width type we can, in some cases, use + // FixedWidthTakeExec for a fixed-size list array. 
+ if (util::IsFixedWidthLike(values, + /*force_null_count=*/true, + /*exclude_bool_and_dictionary=*/true)) { + return FixedWidthTakeExec(ctx, batch, out); + } return TakeExec(ctx, batch, out); } diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.h b/cpp/src/arrow/compute/kernels/vector_selection_internal.h index 95f3e51cd67e3..887bf08354120 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.h @@ -45,12 +45,6 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc, const FunctionOptions* default_options, FunctionRegistry* registry); -/// \brief Allocate an ArrayData for a primitive array with a given length and bit width -/// -/// \param[in] bit_width 1 or a multiple of 8 -Status PreallocatePrimitiveArrayData(KernelContext* ctx, int64_t length, int bit_width, - bool allocate_validity, ArrayData* out); - /// \brief Callback type for VisitPlainxREEFilterOutputSegments. /// /// position is the logical position in the values array relative to its offset. 
@@ -70,18 +64,22 @@ void VisitPlainxREEFilterOutputSegments( FilterOptions::NullSelectionBehavior null_selection, const EmitREEFilterSegment& emit_segment); +Status PrimitiveFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status ListFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status LargeListFilterExec(KernelContext*, const ExecSpan&, ExecResult*); +Status ListViewFilterExec(KernelContext*, const ExecSpan&, ExecResult*); +Status LargeListViewFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status FSLFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status DenseUnionFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status MapFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status VarBinaryTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status LargeVarBinaryTakeExec(KernelContext*, const ExecSpan&, ExecResult*); -Status PrimitiveTakeExec(KernelContext*, const ExecSpan&, ExecResult*); -Status FSBTakeExec(KernelContext*, const ExecSpan&, ExecResult*); +Status FixedWidthTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status ListTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status LargeListTakeExec(KernelContext*, const ExecSpan&, ExecResult*); +Status ListViewTakeExec(KernelContext*, const ExecSpan&, ExecResult*); +Status LargeListViewTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status FSLTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status DenseUnionTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status SparseUnionTakeExec(KernelContext*, const ExecSpan&, ExecResult*); diff --git a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc index 5cd3710828485..c45cc552a2cc5 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include 
#include "arrow/array/builder_primitive.h" @@ -27,8 +28,10 @@ #include "arrow/chunked_array.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/codegen_internal.h" +#include "arrow/compute/kernels/gather_internal.h" #include "arrow/compute/kernels/vector_selection_internal.h" #include "arrow/compute/kernels/vector_selection_take_internal.h" +#include "arrow/compute/registry.h" #include "arrow/memory_pool.h" #include "arrow/record_batch.h" #include "arrow/table.h" @@ -37,6 +40,7 @@ #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" +#include "arrow/util/fixed_width_internal.h" #include "arrow/util/int_util.h" #include "arrow/util/ree_util.h" @@ -323,235 +327,79 @@ namespace { using TakeState = OptionsWrapper; // ---------------------------------------------------------------------- -// Implement optimized take for primitive types from boolean to 1/2/4/8-byte -// C-type based types. Use common implementation for every byte width and only -// generate code for unsigned integer indices, since after boundschecking to -// check for negative numbers in the indices we can safely reinterpret_cast -// signed integers as unsigned. - -/// \brief The Take implementation for primitive (fixed-width) types does not -/// use the logical Arrow type but rather the physical C type. This way we -/// only generate one take function for each byte width. -/// -/// This function assumes that the indices have been boundschecked. 
-template -struct PrimitiveTakeImpl { - static constexpr int kValueWidth = ValueWidthConstant::value; - - static void Exec(const ArraySpan& values, const ArraySpan& indices, - ArrayData* out_arr) { - DCHECK_EQ(values.type->byte_width(), kValueWidth); - const auto* values_data = - values.GetValues(1, 0) + kValueWidth * values.offset; - const uint8_t* values_is_valid = values.buffers[0].data; - auto values_offset = values.offset; - - const auto* indices_data = indices.GetValues(1); - const uint8_t* indices_is_valid = indices.buffers[0].data; - auto indices_offset = indices.offset; - - auto out = out_arr->GetMutableValues(1, 0) + kValueWidth * out_arr->offset; - auto out_is_valid = out_arr->buffers[0]->mutable_data(); - auto out_offset = out_arr->offset; - DCHECK_EQ(out_offset, 0); - - // If either the values or indices have nulls, we preemptively zero out the - // out validity bitmap so that we don't have to use ClearBit in each - // iteration for nulls. - if (values.null_count != 0 || indices.null_count != 0) { - bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false); - } - - auto WriteValue = [&](int64_t position) { - memcpy(out + position * kValueWidth, - values_data + indices_data[position] * kValueWidth, kValueWidth); - }; - - auto WriteZero = [&](int64_t position) { - memset(out + position * kValueWidth, 0, kValueWidth); - }; - - auto WriteZeroSegment = [&](int64_t position, int64_t length) { - memset(out + position * kValueWidth, 0, kValueWidth * length); - }; - - OptionalBitBlockCounter indices_bit_counter(indices_is_valid, indices_offset, - indices.length); - int64_t position = 0; - int64_t valid_count = 0; - while (position < indices.length) { - BitBlockCount block = indices_bit_counter.NextBlock(); - if (values.null_count == 0) { - // Values are never null, so things are easier - valid_count += block.popcount; - if (block.popcount == block.length) { - // Fastest path: neither values nor index nulls - bit_util::SetBitsTo(out_is_valid, 
out_offset + position, block.length, true); - for (int64_t i = 0; i < block.length; ++i) { - WriteValue(position); - ++position; - } - } else if (block.popcount > 0) { - // Slow path: some indices but not all are null - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(indices_is_valid, indices_offset + position)) { - // index is not null - bit_util::SetBit(out_is_valid, out_offset + position); - WriteValue(position); - } else { - WriteZero(position); - } - ++position; - } - } else { - WriteZeroSegment(position, block.length); - position += block.length; - } - } else { - // Values have nulls, so we must do random access into the values bitmap - if (block.popcount == block.length) { - // Faster path: indices are not null but values may be - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(values_is_valid, - values_offset + indices_data[position])) { - // value is not null - WriteValue(position); - bit_util::SetBit(out_is_valid, out_offset + position); - ++valid_count; - } else { - WriteZero(position); - } - ++position; - } - } else if (block.popcount > 0) { - // Slow path: some but not all indices are null. Since we are doing - // random access in general we have to check the value nullness one by - // one. 
- for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(indices_is_valid, indices_offset + position) && - bit_util::GetBit(values_is_valid, - values_offset + indices_data[position])) { - // index is not null && value is not null - WriteValue(position); - bit_util::SetBit(out_is_valid, out_offset + position); - ++valid_count; - } else { - WriteZero(position); - } - ++position; - } - } else { - WriteZeroSegment(position, block.length); - position += block.length; - } - } - } - out_arr->null_count = out_arr->length - valid_count; - } -}; - -template -struct BooleanTakeImpl { - static void Exec(const ArraySpan& values, const ArraySpan& indices, - ArrayData* out_arr) { - const uint8_t* values_data = values.buffers[1].data; - const uint8_t* values_is_valid = values.buffers[0].data; - auto values_offset = values.offset; - - const auto* indices_data = indices.GetValues(1); - const uint8_t* indices_is_valid = indices.buffers[0].data; - auto indices_offset = indices.offset; - - auto out = out_arr->buffers[1]->mutable_data(); - auto out_is_valid = out_arr->buffers[0]->mutable_data(); - auto out_offset = out_arr->offset; - - // If either the values or indices have nulls, we preemptively zero out the - // out validity bitmap so that we don't have to use ClearBit in each - // iteration for nulls. - if (values.null_count != 0 || indices.null_count != 0) { - bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false); - } - // Avoid uninitialized data in values array - bit_util::SetBitsTo(out, out_offset, indices.length, false); - - auto PlaceDataBit = [&](int64_t loc, IndexCType index) { - bit_util::SetBitTo(out, out_offset + loc, - bit_util::GetBit(values_data, values_offset + index)); - }; +// Implement optimized take for primitive types from boolean to +// 1/2/4/8/16/32-byte C-type based types and fixed-size binary (0 or more +// bytes). 
+// +// Use one specialization for each of these primitive byte-widths so the +// compiler can specialize the memcpy to dedicated CPU instructions and for +// fixed-width binary use the 1-byte specialization but pass WithFactor=true +// that makes the kernel consider the factor parameter provided at runtime. +// +// Only unsigned index types need to be instantiated since after +// boundschecking to check for negative numbers in the indices we can safely +// reinterpret_cast signed integers as unsigned. - OptionalBitBlockCounter indices_bit_counter(indices_is_valid, indices_offset, - indices.length); - int64_t position = 0; +/// \brief The Take implementation for primitive types and fixed-width binary. +/// +/// Also note that this function can also handle fixed-size-list arrays if +/// they fit the criteria described in fixed_width_internal.h, so use the +/// function defined in that file to access values and destination pointers +/// and DO NOT ASSUME `values.type()` is a primitive type. +/// +/// NOTE: Template parameters are types instead of values to let +/// `TakeIndexDispatch<>` forward `typename... Args` after the index type. 
+/// +/// \pre the indices have been boundschecked +template +struct FixedWidthTakeImpl { + static constexpr int kValueWidthInBits = ValueBitWidthConstant::value; + + static Status Exec(KernelContext* ctx, const ArraySpan& values, + const ArraySpan& indices, ArrayData* out_arr, int64_t factor) { +#ifndef NDEBUG + int64_t bit_width = util::FixedWidthInBits(*values.type); + DCHECK(WithFactor::value || (kValueWidthInBits == bit_width && factor == 1)); + DCHECK(!WithFactor::value || + (factor > 0 && kValueWidthInBits == 8 && // factors are used with bytes + static_cast(factor * kValueWidthInBits) == bit_width)); +#endif + const bool out_has_validity = values.MayHaveNulls() || indices.MayHaveNulls(); + + const uint8_t* src; + int64_t src_offset; + std::tie(src_offset, src) = util::OffsetPointerOfFixedBitWidthValues(values); + uint8_t* out = util::MutableFixedWidthValuesPointer(out_arr); int64_t valid_count = 0; - while (position < indices.length) { - BitBlockCount block = indices_bit_counter.NextBlock(); - if (values.null_count == 0) { - // Values are never null, so things are easier - valid_count += block.popcount; - if (block.popcount == block.length) { - // Fastest path: neither values nor index nulls - bit_util::SetBitsTo(out_is_valid, out_offset + position, block.length, true); - for (int64_t i = 0; i < block.length; ++i) { - PlaceDataBit(position, indices_data[position]); - ++position; - } - } else if (block.popcount > 0) { - // Slow path: some but not all indices are null - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(indices_is_valid, indices_offset + position)) { - // index is not null - bit_util::SetBit(out_is_valid, out_offset + position); - PlaceDataBit(position, indices_data[position]); - } - ++position; - } - } else { - position += block.length; - } - } else { - // Values have nulls, so we must do random access into the values bitmap - if (block.popcount == block.length) { - // Faster path: indices are not null but values may be - 
for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(values_is_valid, - values_offset + indices_data[position])) { - // value is not null - bit_util::SetBit(out_is_valid, out_offset + position); - PlaceDataBit(position, indices_data[position]); - ++valid_count; - } - ++position; - } - } else if (block.popcount > 0) { - // Slow path: some but not all indices are null. Since we are doing - // random access in general we have to check the value nullness one by - // one. - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(indices_is_valid, indices_offset + position)) { - // index is not null - if (bit_util::GetBit(values_is_valid, - values_offset + indices_data[position])) { - // value is not null - PlaceDataBit(position, indices_data[position]); - bit_util::SetBit(out_is_valid, out_offset + position); - ++valid_count; - } - } - ++position; - } - } else { - position += block.length; - } - } + arrow::internal::Gather gather{ + /*src_length=*/values.length, + src, + src_offset, + /*idx_length=*/indices.length, + /*idx=*/indices.GetValues(1), + out, + factor}; + if (out_has_validity) { + DCHECK_EQ(out_arr->offset, 0); + // out_is_valid must be zero-initialized, because Gather::Execute + // saves time by not having to ClearBit on every null element. + auto out_is_valid = out_arr->GetMutableValues(0); + memset(out_is_valid, 0, bit_util::BytesForBits(out_arr->length)); + valid_count = gather.template Execute( + /*src_validity=*/values, /*idx_validity=*/indices, out_is_valid); + } else { + valid_count = gather.Execute(); } out_arr->null_count = out_arr->length - valid_count; + return Status::OK(); } }; template