Commit

Merge remote-tracking branch 'origin/branch-24.06' into HEAD

Signed-off-by: Allen Xu <allxu@nvidia.com>
wjxiz1992 committed May 7, 2024
2 parents e0b7bba + 71ecc9f commit c6baa5a

Showing 322 changed files with 3,844 additions and 985 deletions.
5 changes: 1 addition & 4 deletions .github/workflows/blossom-ci.yml
@@ -35,21 +35,17 @@ jobs:
# This job only runs for pull request comments
if: contains( '\
abellina,\
andygrove,\
anfeng,\
firestarman,\
GaryShen2008,\
jbrennan333, \
jlowe,\
krajendrannv,\
kuhushukla,\
mythrocks,\
nartal1,\
nvdbaranec,\
NvTimLiu,\
razajafri,\
revans2,\
rongou,\
rwlee,\
sameerz,\
tgravescs,\
@@ -73,6 +69,7 @@ jobs:
yinqingh,\
parthosa,\
liurenjie1024,\
binmahone,\
', format('{0},', github.actor)) && github.event.comment.body == 'build'
steps:
- name: Check if comment is issued by authorized person
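
The gate above is a delimited-substring membership test: format('{0},', github.actor) appends a trailing comma to the actor before it is looked up in the comma-terminated allow list, and the comment body must be exactly 'build'. As an aside, the same check mirrored in Scala makes the trailing-comma trick explicit (illustrative only, names elided; not part of the commit):

    // Mirror of: contains('<allow list>', format('{0},', github.actor))
    //            && github.event.comment.body == 'build'
    // Appending "," before the lookup keeps a login that is a prefix of an
    // authorized name (e.g. "jlo" vs. "jlowe") from matching accidentally.
    val authorized = Seq("abellina", "andygrove", "binmahone" /* ...elided... */)
      .map(_ + ",").mkString
    def isAuthorized(actor: String, comment: String): Boolean =
      authorized.contains(actor + ",") && comment == "build"
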
481 changes: 176 additions & 305 deletions CHANGELOG.md

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions README.md
@@ -59,8 +59,8 @@ access to any of the memory that RMM is holding.
The Qualification and Profiling tools have been moved to
[nvidia/spark-rapids-tools](https://github.com/NVIDIA/spark-rapids-tools) repo.

Please refer to [Qualification tool documentation](https://docs.nvidia.com/spark-rapids/user-guide/latest/spark-qualification-tool.html)
and [Profiling tool documentation](https://docs.nvidia.com/spark-rapids/user-guide/latest/spark-profiling-tool.html)
Please refer to [Qualification tool documentation](https://docs.nvidia.com/spark-rapids/user-guide/latest/qualification/overview.html)
and [Profiling tool documentation](https://docs.nvidia.com/spark-rapids/user-guide/latest/profiling/overview.html)
for more details on how to use the tools.

## Dependency for External Projects
17 changes: 17 additions & 0 deletions aggregator/pom.xml
@@ -728,6 +728,23 @@
</dependency>
</dependencies>
</profile>
<profile>
<id>release343</id>
<activation>
<property>
<name>buildver</name>
<value>343</value>
</property>
</activation>
<dependencies>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-delta-24x_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<classifier>${spark.version.classifier}</classifier>
</dependency>
</dependencies>
</profile>
<profile>
<id>release350</id>
<activation>
4 changes: 2 additions & 2 deletions build/build-info
@@ -1,7 +1,7 @@
#!/usr/bin/env bash

#
# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@ set -e
echo_build_properties() {
echo version=$1
echo cudf_version=$2
echo user=$USER
echo user=$(whoami)
echo revision=$(git rev-parse HEAD)
echo branch=$(git rev-parse --abbrev-ref HEAD)
echo date=$(date -u +%Y-%m-%dT%H:%M:%SZ)
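
Presumably the motivation for swapping $USER for $(whoami) is that the environment variable can be unset (minimal containers, some CI shells) while whoami resolves the effective user from the system. The same distinction exists on the JVM; a quick Scala aside, not part of the commit:

    // An environment variable such as USER may simply be absent...
    val fromEnv = sys.env.get("USER")             // Option[String]; may be None
    // ...while a query of the runtime itself always resolves to something.
    val fromJvm = System.getProperty("user.name")
    println(s"USER=$fromEnv user.name=$fromJvm")
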
@@ -609,6 +609,15 @@ abstract class DataGen(var conf: ColumnConf,
this
}

def setNullProbabilityRecursively(probability: Double): DataGen = {
this.userProvidedNullGen = Some(NullProbabilityGenerationFunction(probability))
children.foreach {
case (_, dataGen) =>
dataGen.setNullProbabilityRecursively(probability)
}
this
}

/**
* Set a specific location to seed mapping for the value generation.
*/
@@ -672,6 +681,7 @@
* Get the default value generator for this specific data gen.
*/
protected def getValGen: GeneratorFunction
def children: Seq[(String, DataGen)]

/**
* Get the final ready to use GeneratorFunction for the data generator.
@@ -823,6 +833,8 @@ class BooleanGen(conf: ColumnConf,
override def dataType: DataType = BooleanType

override protected def getValGen: GeneratorFunction = BooleanGenFunc()

override def children: Seq[(String, DataGen)] = Seq.empty
}

/**
@@ -878,6 +890,8 @@ class ByteGen(conf: ColumnConf,
extends DataGen(conf, defaultValueRange) {
override def getValGen: GeneratorFunction = ByteGenFunc()
override def dataType: DataType = ByteType

override def children: Seq[(String, DataGen)] = Seq.empty
}

/**
@@ -935,6 +949,8 @@ class ShortGen(conf: ColumnConf,
override def getValGen: GeneratorFunction = ShortGenFunc()

override def dataType: DataType = ShortType

override def children: Seq[(String, DataGen)] = Seq.empty
}

/**
@@ -991,6 +1007,8 @@ class IntGen(conf: ColumnConf,
override def getValGen: GeneratorFunction = IntGenFunc()

override def dataType: DataType = IntegerType

override def children: Seq[(String, DataGen)] = Seq.empty
}

/**
@@ -1045,6 +1063,8 @@ class LongGen(conf: ColumnConf,
override def getValGen: GeneratorFunction = LongGenFunc()

override def dataType: DataType = LongType

override def children: Seq[(String, DataGen)] = Seq.empty
}

case class Decimal32GenFunc(
@@ -1284,6 +1304,8 @@ class DecimalGen(dt: DecimalType,
val max = DecimalGen.genMaxUnscaled(dt.precision)
DecimalGenFunc(dt.precision, dt.scale, -max, max)
}

override def children: Seq[(String, DataGen)] = Seq.empty
}

/**
@@ -1341,6 +1363,8 @@ class TimestampGen(conf: ColumnConf,
override protected def getValGen: GeneratorFunction = TimestampGenFunc()

override def dataType: DataType = TimestampType

override def children: Seq[(String, DataGen)] = Seq.empty
}

object BigDataGenConsts {
@@ -1418,6 +1442,8 @@ class DateGen(conf: ColumnConf,
override protected def getValGen: GeneratorFunction = DateGenFunc()

override def dataType: DataType = DateType

override def children: Seq[(String, DataGen)] = Seq.empty
}

/**
@@ -1440,6 +1466,8 @@ class DoubleGen(conf: ColumnConf, defaultValueRange: Option[(Any, Any)])
override def dataType: DataType = DoubleType

override protected def getValGen: GeneratorFunction = DoubleGenFunc()

override def children: Seq[(String, DataGen)] = Seq.empty
}

/**
@@ -1462,6 +1490,8 @@ class FloatGen(conf: ColumnConf, defaultValueRange: Option[(Any, Any)])
override def dataType: DataType = FloatType

override protected def getValGen: GeneratorFunction = FloatGenFunc()

override def children: Seq[(String, DataGen)] = Seq.empty
}

trait JSONType {
@@ -1648,6 +1678,8 @@ class StringGen(conf: ColumnConf, defaultValueRange: Option[(Any, Any)])
override def dataType: DataType = StringType

override protected def getValGen: GeneratorFunction = ASCIIGenFunc()

override def children: Seq[(String, DataGen)] = Seq.empty
}

case class StructGenFunc(childGens: Array[GeneratorFunction]) extends GeneratorFunction {
@@ -1752,6 +1784,8 @@ class ArrayGen(child: DataGen,
None
}
}

override def children: Seq[(String, DataGen)] = Seq(("data", child))
}

case class MapGenFunc(
@@ -1816,6 +1850,8 @@ class MapGen(key: DataGen,
None
}
}

override def children: Seq[(String, DataGen)] = Seq(("key", key), ("value", value))
}


@@ -1864,6 +1900,11 @@ class ColumnGen(val dataGen: DataGen) {
this
}

def setNullProbabilityRecursively(probability: Double): ColumnGen = {
dataGen.setNullProbabilityRecursively(probability)
this
}

def setNullGen(f: NullGeneratorFunction): ColumnGen = {
dataGen.setNullGen(f)
this
@@ -1973,6 +2014,14 @@ class TableGen(val columns: Seq[(String, ColumnGen)], numRows: Long) {
this
}

def setNullProbabilityRecursively(probability: Double): TableGen = {
columns.foreach {
case (_, columnGen) =>
columnGen.setNullProbabilityRecursively(probability)
}
this
}

/**
* Convert this table into a `DataFrame` that can be
* written out or used directly. Writing it out to parquet
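
Taken together, the new abstract children member and the setNullProbabilityRecursively overloads on DataGen, ColumnGen, and TableGen let a single call fan a null probability out across a whole table, including nested generators. A minimal usage sketch, assuming a ColumnConf value named conf from the library's usual setup (constructor shapes follow the class signatures visible in this diff; the exact entry point is not shown here):

    // Illustrative only: `conf` is a hypothetical ColumnConf obtained from
    // the datagen setup path, which this diff does not include.
    val cols: Seq[(String, ColumnGen)] = Seq(
      "id"   -> new ColumnGen(new LongGen(conf, None)),
      "name" -> new ColumnGen(new StringGen(conf, None)))
    val table = new TableGen(cols, 1000L)
    // One call pushes the probability into every column and, via each
    // DataGen's children, into any nested generators (array data, map
    // keys/values) rather than requiring per-generator configuration.
    table.setNullProbabilityRecursively(0.05)
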
@@ -34,8 +34,10 @@
{"spark": "341"}
{"spark": "341db"}
{"spark": "342"}
{"spark": "343"}
{"spark": "350"}
{"spark": "351"}
{"spark": "400"}
spark-rapids-shim-json-lines ***/
package org.apache.spark.sql.tests.datagen

9 changes: 5 additions & 4 deletions docs/additional-functionality/advanced_configs.md
@@ -129,12 +129,12 @@ Name | Description | Default Value | Applicable at
<a name="sql.json.read.decimal.enabled"></a>spark.rapids.sql.json.read.decimal.enabled|When reading a quoted string as a decimal Spark supports reading non-ascii unicode digits, and the RAPIDS Accelerator does not.|true|Runtime
<a name="sql.json.read.double.enabled"></a>spark.rapids.sql.json.read.double.enabled|JSON reading is not 100% compatible when reading doubles.|true|Runtime
<a name="sql.json.read.float.enabled"></a>spark.rapids.sql.json.read.float.enabled|JSON reading is not 100% compatible when reading floats.|true|Runtime
<a name="sql.json.read.mixedTypesAsString.enabled"></a>spark.rapids.sql.json.read.mixedTypesAsString.enabled|JSON reading is not 100% compatible when reading mixed types as string.|false|Runtime
<a name="sql.mode"></a>spark.rapids.sql.mode|Set the mode for the Rapids Accelerator. The supported modes are explainOnly and executeOnGPU. This config can not be changed at runtime, you must restart the application for it to take affect. The default mode is executeOnGPU, which means the RAPIDS Accelerator plugin convert the Spark operations and execute them on the GPU when possible. The explainOnly mode allows running queries on the CPU and the RAPIDS Accelerator will evaluate the queries as if it was going to run on the GPU. The explanations of what would have run on the GPU and why are output in log messages. When using explainOnly mode, the default explain output is ALL, this can be changed by setting spark.rapids.sql.explain. See that config for more details.|executeongpu|Startup
<a name="sql.optimizer.joinReorder.enabled"></a>spark.rapids.sql.optimizer.joinReorder.enabled|When enabled, joins may be reordered for improved query performance|true|Runtime
<a name="sql.python.gpu.enabled"></a>spark.rapids.sql.python.gpu.enabled|This is an experimental feature and is likely to change in the future. Enable (true) or disable (false) support for scheduling Python Pandas UDFs with GPU resources. When enabled, pandas UDFs are assumed to share the same GPU that the RAPIDs accelerator uses and will honor the python GPU configs|false|Runtime
<a name="sql.reader.chunked"></a>spark.rapids.sql.reader.chunked|Enable a chunked reader where possible. A chunked reader allows reading highly compressed data that could not be read otherwise, but at the expense of more GPU memory, and in some cases more GPU computation.|true|Runtime
<a name="sql.reader.chunked.subPage"></a>spark.rapids.sql.reader.chunked.subPage|Enable a chunked reader where possible for reading data that is smaller than the typical row group/page limit. Currently this only works for parquet.|true|Runtime
<a name="sql.reader.chunked"></a>spark.rapids.sql.reader.chunked|Enable a chunked reader where possible. A chunked reader allows reading highly compressed data that could not be read otherwise, but at the expense of more GPU memory, and in some cases more GPU computation. Currently this only supports ORC and Parquet formats.|true|Runtime
<a name="sql.reader.chunked.limitMemoryUsage"></a>spark.rapids.sql.reader.chunked.limitMemoryUsage|Enable a soft limit on the internal memory usage of the chunked reader (if being used). Such limit is calculated as the multiplication of 'spark.rapids.sql.batchSizeBytes' and 'spark.rapids.sql.reader.chunked.memoryUsageRatio'.For example, if batchSizeBytes is set to 1GB and memoryUsageRatio is 4, the chunked reader will try to keep its memory usage under 4GB.|None|Runtime
<a name="sql.reader.chunked.subPage"></a>spark.rapids.sql.reader.chunked.subPage|Enable a chunked reader where possible for reading data that is smaller than the typical row group/page limit. Currently deprecated and replaced by 'spark.rapids.sql.reader.chunked.limitMemoryUsage'.|None|Runtime
<a name="sql.reader.multithreaded.combine.sizeBytes"></a>spark.rapids.sql.reader.multithreaded.combine.sizeBytes|The target size in bytes to combine multiple small files together when using the MULTITHREADED parquet or orc reader. With combine disabled, the MULTITHREADED reader reads the files in parallel and sends individual files down to the GPU, but that can be inefficient for small files. When combine is enabled, files that are ready within spark.rapids.sql.reader.multithreaded.combine.waitTime together, up to this threshold size, are combined before sending down to GPU. This can be disabled by setting it to 0. Note that combine also will not go over the spark.rapids.sql.reader.batchSizeRows or spark.rapids.sql.reader.batchSizeBytes limits.|67108864|Runtime
<a name="sql.reader.multithreaded.combine.waitTime"></a>spark.rapids.sql.reader.multithreaded.combine.waitTime|When using the multithreaded parquet or orc reader with combine mode, how long to wait, in milliseconds, for more files to finish if haven't met the size threshold. Note that this will wait this amount of time from when the last file was available, so total wait time could be larger then this.|200|Runtime
<a name="sql.reader.multithreaded.read.keepOrder"></a>spark.rapids.sql.reader.multithreaded.read.keepOrder|When using the MULTITHREADED reader, if this is set to true we read the files in the same order Spark does, otherwise the order may not be the same. Now it is supported only for parquet and orc.|true|Runtime
@@ -184,6 +184,7 @@ Name | SQL Function(s) | Description | Default Value | Notes
<a name="sql.expression.ArrayContains"></a>spark.rapids.sql.expression.ArrayContains|`array_contains`|Returns a boolean if the array contains the passed in key|true|None|
<a name="sql.expression.ArrayExcept"></a>spark.rapids.sql.expression.ArrayExcept|`array_except`|Returns an array of the elements in array1 but not in array2, without duplicates|true|This is not 100% compatible with the Spark version because the GPU implementation treats -0.0 and 0.0 as equal, but the CPU implementation currently does not (see SPARK-39845). Also, Apache Spark 3.1.3 fixed issue SPARK-36741 where NaNs in these set like operators were not treated as being equal. We have chosen to break with compatibility for the older versions of Spark in this instance and handle NaNs the same as 3.1.3+|
<a name="sql.expression.ArrayExists"></a>spark.rapids.sql.expression.ArrayExists|`exists`|Return true if any element satisfies the predicate LambdaFunction|true|None|
<a name="sql.expression.ArrayFilter"></a>spark.rapids.sql.expression.ArrayFilter|`filter`|Filter an input array using a given predicate|true|None|
<a name="sql.expression.ArrayIntersect"></a>spark.rapids.sql.expression.ArrayIntersect|`array_intersect`|Returns an array of the elements in the intersection of array1 and array2, without duplicates|true|This is not 100% compatible with the Spark version because the GPU implementation treats -0.0 and 0.0 as equal, but the CPU implementation currently does not (see SPARK-39845). Also, Apache Spark 3.1.3 fixed issue SPARK-36741 where NaNs in these set like operators were not treated as being equal. We have chosen to break with compatibility for the older versions of Spark in this instance and handle NaNs the same as 3.1.3+|
<a name="sql.expression.ArrayMax"></a>spark.rapids.sql.expression.ArrayMax|`array_max`|Returns the maximum value in the array|true|None|
<a name="sql.expression.ArrayMin"></a>spark.rapids.sql.expression.ArrayMin|`array_min`|Returns the minimum value in the array|true|None|
Expand Down Expand Up @@ -269,7 +270,7 @@ Name | SQL Function(s) | Description | Default Value | Notes
<a name="sql.expression.IsNotNull"></a>spark.rapids.sql.expression.IsNotNull|`isnotnull`|Checks if a value is not null|true|None|
<a name="sql.expression.IsNull"></a>spark.rapids.sql.expression.IsNull|`isnull`|Checks if a value is null|true|None|
<a name="sql.expression.JsonToStructs"></a>spark.rapids.sql.expression.JsonToStructs|`from_json`|Returns a struct value with the given `jsonStr` and `schema`|false|This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case|
<a name="sql.expression.JsonTuple"></a>spark.rapids.sql.expression.JsonTuple|`json_tuple`|Returns a tuple like the function get_json_object, but it takes multiple names. All the input parameters and output column types are string.|false|This is disabled by default because JsonTuple on the GPU does not support all of the normalization that the CPU supports.|
<a name="sql.expression.JsonTuple"></a>spark.rapids.sql.expression.JsonTuple|`json_tuple`|Returns a tuple like the function get_json_object, but it takes multiple names. All the input parameters and output column types are string.|false|This is disabled by default because Experimental feature that could be unstable or have performance issues.|
<a name="sql.expression.KnownFloatingPointNormalized"></a>spark.rapids.sql.expression.KnownFloatingPointNormalized| |Tag to prevent redundant normalization|true|None|
<a name="sql.expression.KnownNotNull"></a>spark.rapids.sql.expression.KnownNotNull| |Tag an expression as known to not be null|true|None|
<a name="sql.expression.Lag"></a>spark.rapids.sql.expression.Lag|`lag`|Window function that returns N entries behind this one|true|None|
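
Two of the entries above translate directly into session configuration. A hedged sketch, assuming an active SparkSession named spark with the RAPIDS Accelerator loaded: the chunked reader's soft cap is batchSizeBytes multiplied by memoryUsageRatio, and the newly listed ArrayFilter row corresponds to SQL's filter higher-order function:

    // Soft memory cap for the chunked reader, per the table above:
    // 1 GiB batch size * ratio 4 => the reader targets staying under ~4 GiB.
    spark.conf.set("spark.rapids.sql.batchSizeBytes", (1L << 30).toString)
    spark.conf.set("spark.rapids.sql.reader.chunked.memoryUsageRatio", "4")

    // ArrayFilter backs the SQL `filter` function on arrays:
    val df = spark.range(3).selectExpr("array(id, -id) AS a")
    df.selectExpr("filter(a, x -> x >= 0) AS non_negative").show()
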
@@ -1,5 +1,5 @@
#
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@
# - ROCKY_VER: Rocky Linux OS version

ARG CUDA_VER=11.8.0
ARG UCX_VER=1.15.0
ARG UCX_VER=1.16.0
ARG UCX_CUDA_VER=11
ARG UCX_ARCH=x86_64
ARG ROCKY_VER=8
@@ -38,6 +38,5 @@ RUN ls /usr/lib
RUN mkdir /tmp/ucx_install && cd /tmp/ucx_install && \
wget https://github.com/openucx/ucx/releases/download/v$UCX_VER/ucx-$UCX_VER-centos8-mofed5-cuda$UCX_CUDA_VER-$UCX_ARCH.tar.bz2 && \
tar -xvf *.bz2 && \
rpm -i ucx-$UCX_VER*.rpm && \
rpm -i ucx-cuda-$UCX_VER*.rpm --nodeps && \
rpm -i `ls ucx-[0-9]*.rpm ucx-cuda-[0-9]*.rpm` --nodeps && \
rm -rf /tmp/ucx_install
@@ -1,5 +1,5 @@
#
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@
# - ROCKY_VER: Rocky Linux OS version

ARG CUDA_VER=11.8.0
ARG UCX_VER=1.15.0
ARG UCX_VER=1.16.0
ARG UCX_CUDA_VER=11
ARG UCX_ARCH=x86_64
ARG ROCKY_VER=8
@@ -37,7 +37,5 @@ RUN yum update -y && yum install -y wget bzip2 rdma-core numactl-libs libgomp li
RUN mkdir /tmp/ucx_install && cd /tmp/ucx_install && \
wget https://github.com/openucx/ucx/releases/download/v$UCX_VER/ucx-$UCX_VER-centos8-mofed5-cuda$UCX_CUDA_VER-$UCX_ARCH.tar.bz2 && \
tar -xvf *.bz2 && \
rpm -i ucx-$UCX_VER*.rpm && \
rpm -i ucx-cuda-$UCX_VER*.rpm --nodeps && \
rpm -i ucx-ib-$UCX_VER-1.el8.x86_64.rpm ucx-rdmacm-$UCX_VER-1.el8.x86_64.rpm && \
rpm -i `ls ucx-[0-9]*.rpm ucx-cuda-[0-9]*.rpm ucx-ib-[0-9]*.rpm ucx-rdmacm-[0-9]*.rpm` --nodeps && \
rm -rf /tmp/ucx_install
@@ -1,5 +1,5 @@
#
# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@
#

ARG CUDA_VER=11.8.0
ARG UCX_VER=1.15.0
ARG UCX_VER=1.16.0
ARG UCX_CUDA_VER=11
ARG UCX_ARCH=x86_64
ARG UBUNTU_VER=20.04
@@ -1,5 +1,5 @@
#
# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -35,7 +35,7 @@

ARG RDMA_CORE_VERSION=32.1
ARG CUDA_VER=11.8.0
ARG UCX_VER=1.15.0
ARG UCX_VER=1.16.0
ARG UCX_CUDA_VER=11
ARG UCX_ARCH=x86_64
ARG UBUNTU_VER=20.04