diff --git a/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md b/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md index e34e0557d..c498a1d15 100644 --- a/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md +++ b/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md @@ -16,4 +16,4 @@ Following this [guide](/docs/get-started/xgboost-examples/building-sample-apps/p You need to copy the dataset to `/opt/xgboost`. Use the following links to download the data. 1. [Mortgage dataset](/docs/get-started/xgboost-examples/dataset/mortgage.md) 2. [Taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) -3. [Agaricus dataset](https://gust.dev/r/xgboost-agaricus) +3. [Agaricus dataset](https://github.com/dmlc/xgboost/tree/master/demo/data) diff --git a/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md b/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md index 5fc42d603..4e12a8342 100644 --- a/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md +++ b/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md @@ -16,4 +16,4 @@ Following this [guide](/docs/get-started/xgboost-examples/building-sample-apps/s You need to copy the dataset to `/opt/xgboost`. Use the following links to download the data. 1. [Mortgage dataset](/docs/get-started/xgboost-examples/dataset/mortgage.md) 2. [Taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) -3. [Agaricus dataset](https://gust.dev/r/xgboost-agaricus) +3. 
[Agaricus dataset](https://github.com/dmlc/xgboost/tree/master/demo/data) diff --git a/docs/img/guides/microbm.png b/docs/img/guides/microbm.png index e04553b37..581c39543 100644 Binary files a/docs/img/guides/microbm.png and b/docs/img/guides/microbm.png differ diff --git a/examples/SQL+DF-Examples/micro-benchmarks/README.md b/examples/SQL+DF-Examples/micro-benchmarks/README.md index 29a17061e..e135ff239 100644 --- a/examples/SQL+DF-Examples/micro-benchmarks/README.md +++ b/examples/SQL+DF-Examples/micro-benchmarks/README.md @@ -3,7 +3,7 @@ Standard industry benchmarks are a great way to measure performance over a period of time but another barometer to measure performance is to measure performance of common operators that are used in the data preprocessing stage or in data analytics. -The microbenchmark notebook in this repo uses four such queries in the chart shown below: +The microbenchmark notebook in this repo uses five such queries in the chart shown below: - **Count Distinct**: a function used to estimate the number of unique page views or unique customers visiting an e-commerce site. @@ -11,6 +11,7 @@ The microbenchmark notebook in this repo uses four such queries in the chart sho timestamped event data in marketing or financial industry. - **Intersect**: an operator used to remove duplicates in a dataframe. - **Cross-join**: A common use for a cross join is to obtain all combinations of items. +- **Hash-join**: Joining two tables together by matching rows based on a common column. These queries were run on a standard eight-nodes CPU cluster with 2 CPU (128 cores), 512GB memory and 1xA100 GPUs per node. The dataset used was of size 3TB with multiple different data types. 
diff --git a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-cpu.ipynb b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-cpu.ipynb index ce5c5a797..c3707ee8d 100644 --- a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-cpu.ipynb +++ b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-cpu.ipynb @@ -529,6 +529,34 @@ "runMicroBenchmark(spark,\"Crossjoin\",query ,1)" ] }, + { + "cell_type": "markdown", + "id": "56f915c2-9b9a-4982-8c4e-5b570c17bfeb", + "metadata": {}, + "source": [ + "### HashJoin\n", + "This is a microbenchmark for a HashJoin. The query on GPU will be more than 10x faster than on CPU, based on the cluster described in the readme." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "040603c9-a96f-4017-bcdb-5f93e12996a4", + "metadata": {}, + "outputs": [], + "source": [ + "spark.read.parquet(dataRoot + \"/store_sales\").createOrReplaceTempView(\"store_sales\")\n", + "spark.read.parquet(dataRoot + \"/store_returns\").createOrReplaceTempView(\"store_returns\")\n", + "\n", + "print(\"-\"*50)\n", + "query = '''\n", + "select sum(store_sales.ss_ext_wholesale_cost)\n", + "from store_sales\n", + "join store_returns on (ss_item_sk = sr_item_sk) and (ss_addr_sk=sr_addr_sk)\n", + "'''\n", + "runMicroBenchmark(spark,\"HashJoin\",query,1)" + ] + }, { "cell_type": "code", "execution_count": 24, @@ -550,7 +578,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -564,7 +592,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.2" + "version": "3.12.3" } }, "nbformat": 4, diff --git a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb index 099b047a4..0777594f3 100644 --- 
a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb +++ b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb @@ -541,18 +541,48 @@ "runMicroBenchmark(spark,\"Crossjoin\",query,2)" ] }, + { + "cell_type": "markdown", + "id": "06b351e6-b7bd-4063-a20b-fe4fd71221f9", + "metadata": {}, + "source": [ + "### HashJoin\n", + "This is a microbenchmark for a HashJoin. The query on GPU will be more than 10x faster than on CPU, based on the cluster described in the readme." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "191d0c9a-2d3a-40f4-89aa-f61dab5caa90", + "metadata": {}, + "outputs": [], + "source": [ + "spark.read.parquet(dataRoot + \"/store_sales\").createOrReplaceTempView(\"store_sales\")\n", + "spark.read.parquet(dataRoot + \"/store_returns\").createOrReplaceTempView(\"store_returns\")\n", + "\n", + "print(\"-\"*50)\n", + "query = '''\n", + "select sum(store_sales.ss_ext_wholesale_cost)\n", + "from store_sales\n", + "join store_returns on (ss_item_sk = sr_item_sk) and (ss_addr_sk=sr_addr_sk)\n", + "'''\n", + "runMicroBenchmark(spark,\"HashJoin\",query,1)" + ] + }, { "cell_type": "code", "execution_count": null, "id": "fc2092e8", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "spark.stop()" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -566,7 +596,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.2" + "version": "3.12.3" } }, "nbformat": 4,