Skip to content

Commit

Permalink
Merge branch 'main' into parquet/adding-str-length-for-debugging
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed Feb 1, 2024
2 parents ea48b1e + c534749 commit 5b531a6
Show file tree
Hide file tree
Showing 35 changed files with 1,035 additions and 76 deletions.
7 changes: 1 addition & 6 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,10 @@
# /cpp/
/cpp/src/arrow/acero @westonpace
/cpp/src/arrow/adapters/orc @wgtmac
/cpp/src/arrow/dataset @westonpace
/cpp/src/arrow/engine @westonpace
/cpp/src/arrow/flight/ @lidavidm
/cpp/src/arrow/util/async* @westonpace
/cpp/src/arrow/util/future* @westonpace
/cpp/src/arrow/util/thread* @westonpace
/cpp/src/parquet @wgtmac
/cpp/src/skyhook @westonpace
/csharp/ @westonpace
/csharp/ @curthagenlocher
/go/ @zeroshade
/java/ @lidavidm
/js/ @domoritz @trxcllnt
Expand Down
4 changes: 0 additions & 4 deletions .github/workflows/cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -284,10 +284,6 @@ jobs:
/t REG_DWORD `
/d 1 `
/f
- name: Installed Packages
run: choco list
- name: Install Dependencies
run: choco install -y --no-progress openssl
- name: Checkout Arrow
uses: actions/checkout@v4
with:
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/matlab.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ jobs:
- name: Install ninja-build
run: sudo apt-get install ninja-build
- name: Install MATLAB
uses: matlab-actions/setup-matlab@v1
uses: matlab-actions/setup-matlab@v2
with:
release: R2023a
- name: Install ccache
Expand Down Expand Up @@ -85,7 +85,7 @@ jobs:
# Add the installation directory to the MATLAB Search Path by
# setting the MATLABPATH environment variable.
MATLABPATH: matlab/install/arrow_matlab
uses: matlab-actions/run-tests@v1
uses: matlab-actions/run-tests@v2
with:
select-by-folder: matlab/test
macos:
Expand All @@ -100,7 +100,7 @@ jobs:
- name: Install ninja-build
run: brew install ninja
- name: Install MATLAB
uses: matlab-actions/setup-matlab@v1
uses: matlab-actions/setup-matlab@v2
with:
release: R2023a
- name: Install ccache
Expand All @@ -125,7 +125,7 @@ jobs:
# Add the installation directory to the MATLAB Search Path by
# setting the MATLABPATH environment variable.
MATLABPATH: matlab/install/arrow_matlab
uses: matlab-actions/run-tests@v1
uses: matlab-actions/run-tests@v2
with:
select-by-folder: matlab/test
windows:
Expand All @@ -138,7 +138,7 @@ jobs:
with:
fetch-depth: 0
- name: Install MATLAB
uses: matlab-actions/setup-matlab@v1
uses: matlab-actions/setup-matlab@v2
with:
release: R2023a
- name: Download Timezone Database
Expand Down Expand Up @@ -171,6 +171,6 @@ jobs:
# Add the installation directory to the MATLAB Search Path by
# setting the MATLABPATH environment variable.
MATLABPATH: matlab/install/arrow_matlab
uses: matlab-actions/run-tests@v1
uses: matlab-actions/run-tests@v2
with:
select-by-folder: matlab/test
3 changes: 1 addition & 2 deletions ci/conda_env_python.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,8 @@ cloudpickle
fsspec
hypothesis
numpy>=1.16.6
pytest<8 # pytest-lazy-fixture broken on pytest 8.0.0
pytest<8
pytest-faulthandler
pytest-lazy-fixture
s3fs>=2023.10.0
setuptools
setuptools_scm<8.0.0
5 changes: 3 additions & 2 deletions ci/docker/python-wheel-manylinux.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ ENV MANYLINUX_VERSION=${manylinux}
RUN yum install -y dnf

# Install basic dependencies
RUN dnf install -y git flex curl autoconf zip perl-IPC-Cmd wget kernel-headers
RUN dnf install -y git flex curl autoconf zip perl-IPC-Cmd wget

# A system Python is required for ninja and vcpkg in this Dockerfile.
# On manylinux2014 base images, system Python is 2.7.5, while
Expand Down Expand Up @@ -97,4 +97,5 @@ SHELL ["/bin/bash", "-i", "-c"]
ENTRYPOINT ["/bin/bash", "-i", "-c"]

COPY python/requirements-wheel-build.txt /arrow/python/
RUN pip install -r /arrow/python/requirements-wheel-build.txt
# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release
RUN pip install -r /arrow/python/requirements-wheel-build.txt --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
3 changes: 2 additions & 1 deletion ci/docker/python-wheel-windows-vs2017.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION%
RUN python -m pip install -U pip setuptools

COPY python/requirements-wheel-build.txt arrow/python/
RUN python -m pip install -r arrow/python/requirements-wheel-build.txt
# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release
RUN python -m pip install -r arrow/python/requirements-wheel-build.txt --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"

# ENV CLCACHE_DIR="C:\clcache"
# ENV CLCACHE_COMPRESS=1
Expand Down
3 changes: 2 additions & 1 deletion ci/scripts/install_dask.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,5 @@ else
fi

# additional dependencies needed for dask's s3 tests
pip install moto[server] flask requests
# Moto 5 results in timeouts in s3 tests: https://github.com/dask/dask/issues/10869
pip install "moto[server]<5" flask requests
5 changes: 4 additions & 1 deletion ci/scripts/python_wheel_macos_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,15 @@ echo "=== (${PYTHON_VERSION}) Install Python build dependencies ==="
export PIP_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])')
export PIP_TARGET_PLATFORM="macosx_${MACOSX_DEPLOYMENT_TARGET//./_}_${arch}"

# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release
pip install \
--upgrade \
--only-binary=:all: \
--target $PIP_SITE_PACKAGES \
--platform $PIP_TARGET_PLATFORM \
-r ${source_dir}/python/requirements-wheel-build.txt
-r ${source_dir}/python/requirements-wheel-build.txt \
--pre \
--extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
pip install "delocate>=0.10.3"

echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ==="
Expand Down
12 changes: 10 additions & 2 deletions cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2414,7 +2414,11 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecoder<DTyp
void SetData(int num_values, const uint8_t* data, int len) override {
// num_values is equal to page's num_values, including null values in this page
this->num_values_ = num_values;
decoder_ = std::make_shared<::arrow::bit_util::BitReader>(data, len);
if (decoder_ == nullptr) {
decoder_ = std::make_shared<::arrow::bit_util::BitReader>(data, len);
} else {
decoder_->Reset(data, len);
}
InitHeader();
}

Expand Down Expand Up @@ -2773,7 +2777,11 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,

void SetData(int num_values, const uint8_t* data, int len) override {
DecoderImpl::SetData(num_values, data, len);
decoder_ = std::make_shared<::arrow::bit_util::BitReader>(data, len);
if (decoder_ == nullptr) {
decoder_ = std::make_shared<::arrow::bit_util::BitReader>(data, len);
} else {
decoder_->Reset(data, len);
}
DecodeLengths();
}

Expand Down
4 changes: 2 additions & 2 deletions cpp/thirdparty/versions.txt
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,8 @@ ARROW_UTF8PROC_BUILD_VERSION=v2.7.0
ARROW_UTF8PROC_BUILD_SHA256_CHECKSUM=4bb121e297293c0fd55f08f83afab6d35d48f0af4ecc07523ad8ec99aa2b12a1
ARROW_XSIMD_BUILD_VERSION=9.0.1
ARROW_XSIMD_BUILD_SHA256_CHECKSUM=b1bb5f92167fd3a4f25749db0be7e61ed37e0a5d943490f3accdcd2cd2918cc0
ARROW_ZLIB_BUILD_VERSION=1.3
ARROW_ZLIB_BUILD_SHA256_CHECKSUM=ff0ba4c292013dbc27530b3a81e1f9a813cd39de01ca5e0f8bf355702efa593e
ARROW_ZLIB_BUILD_VERSION=1.3.1
ARROW_ZLIB_BUILD_SHA256_CHECKSUM=9a93b2b7dfdac77ceba5a558a580e74667dd6fede4585b91eefb60f03b72df23
ARROW_ZSTD_BUILD_VERSION=1.5.5
ARROW_ZSTD_BUILD_SHA256_CHECKSUM=9c4396cc829cfae319a6e2615202e82aad41372073482fce286fac78646d3ee4

Expand Down
1 change: 0 additions & 1 deletion dev/tasks/conda-recipes/arrow-cpp/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,6 @@ outputs:
# test_cpp_extension_in_python requires a compiler
- {{ compiler("cxx") }} # [linux]
- pytest
- pytest-lazy-fixture
- backports.zoneinfo # [py<39]
- boto3
- cffi
Expand Down
9 changes: 5 additions & 4 deletions docs/source/status.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ Data Types
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| UInt8/16/32/64 |||||||||
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Float16 | ✓ (1) | ||| ✓ (2)| ✓ || |
| Float16 | ✓ (1) | ✓ (2) ||| ✓ (3)| ✓ || |
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Float32/64 |||||||||
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
Expand Down Expand Up @@ -104,7 +104,7 @@ Data Types
| Data type | C++ | Java | Go | JavaScript | C# | Rust | Julia | Swift |
| (special) | | | | | | | | |
+===================+=======+=======+=======+============+=======+=======+=======+=======+
| Dictionary || ✓ (3) |||| ✓ (3) || |
| Dictionary || ✓ (4) |||| ✓ (4) || |
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Extension |||| | ||| |
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
Expand All @@ -114,8 +114,9 @@ Data Types
Notes:

* \(1) Casting to/from Float16 in C++ is not supported.
* \(2) Float16 support in C# is only available when targeting .NET 6+.
* \(3) Nested dictionaries not supported
* \(2) Casting to/from Float16 in Java is not supported.
* \(3) Float16 support in C# is only available when targeting .NET 6+.
* \(4) Nested dictionaries not supported.

.. seealso::
The :ref:`format_columnar` specification.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,15 @@
import org.apache.arrow.dataset.file.DatasetFileWriter;
import org.apache.arrow.dataset.file.FileFormat;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.util.Float16;
import org.apache.arrow.vector.BigIntVector;
import org.apache.arrow.vector.BitVector;
import org.apache.arrow.vector.DateMilliVector;
import org.apache.arrow.vector.Decimal256Vector;
import org.apache.arrow.vector.DecimalVector;
import org.apache.arrow.vector.DurationVector;
import org.apache.arrow.vector.FixedSizeBinaryVector;
import org.apache.arrow.vector.Float2Vector;
import org.apache.arrow.vector.Float4Vector;
import org.apache.arrow.vector.Float8Vector;
import org.apache.arrow.vector.IntVector;
Expand Down Expand Up @@ -89,7 +91,6 @@ public class TestAllTypes extends TestDataset {

private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) {
// Notes:
// - Float16 is not supported by Java.
// - IntervalMonthDayNano is not supported by Parquet.
// - Map (GH-38250) and SparseUnion are resulting in serialization errors when writing with the Dataset API.
// "Unhandled type for Arrow to Parquet schema conversion" errors: IntervalDay, IntervalYear, DenseUnion
Expand All @@ -109,6 +110,7 @@ private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) {
Field.nullablePrimitive("uint16", new ArrowType.Int(16, false)),
Field.nullablePrimitive("uint32", new ArrowType.Int(32, false)),
Field.nullablePrimitive("uint64", new ArrowType.Int(64, false)),
Field.nullablePrimitive("float16", new ArrowType.FloatingPoint(FloatingPointPrecision.HALF)),
Field.nullablePrimitive("float32", new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)),
Field.nullablePrimitive("float64", new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)),
Field.nullablePrimitive("utf8", ArrowType.Utf8.INSTANCE),
Expand Down Expand Up @@ -148,6 +150,7 @@ private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) {
root.getVector("uint16").setNull(0);
root.getVector("uint32").setNull(0);
root.getVector("uint64").setNull(0);
root.getVector("float16").setNull(0);
root.getVector("float32").setNull(0);
root.getVector("float64").setNull(0);
root.getVector("utf8").setNull(0);
Expand Down Expand Up @@ -180,6 +183,7 @@ private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) {
((UInt2Vector) root.getVector("uint16")).set(1, 1);
((UInt4Vector) root.getVector("uint32")).set(1, 1);
((UInt8Vector) root.getVector("uint64")).set(1, 1);
((Float2Vector) root.getVector("float16")).set(1, Float16.toFloat16(+32.875f));
((Float4Vector) root.getVector("float32")).set(1, 1.0f);
((Float8Vector) root.getVector("float64")).set(1, 1.0);
((VarCharVector) root.getVector("utf8")).set(1, new Text("a"));
Expand Down
Loading

0 comments on commit 5b531a6

Please sign in to comment.