From df4e9fe1cde3dbc31aab791d5f96bbcd869fc837 Mon Sep 17 00:00:00 2001
From: Ray Krueger
Date: Thu, 4 Apr 2024 13:41:31 -0500
Subject: [PATCH 1/2] Improve the taxi-trip-execute.sh script

The current performance of the script is very poor. Additionally, there
are 6 copies of this script spread across the examples.

This commit improves performance by doing the duplication as S3-to-S3
copies in the background, rather than uploading 100 copies over the
local network.

This also removes the additional copies of the script; a single shared
copy now lives at analytics/scripts/taxi-trip-execute.sh.
---
 .../taxi-trip-execute.sh                      | 14 +++---
 .../taxi-trip-execute.sh                      | 43 -------------------
 .../taxi-trip-execute.sh                      | 43 -------------------
 .../taxi-trip-execute.sh                      | 43 -------------------
 .../taxi-trip-execute.sh                      | 43 -------------------
 .../taxi-trip-execute.sh                      | 43 -------------------
 .../data-analytics/_taxi_trip_exec.md         |  4 +-
 .../observability-spark-on-eks.md             |  8 ++--
 8 files changed, 16 insertions(+), 225 deletions(-)
 rename analytics/{terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage => scripts}/taxi-trip-execute.sh (80%)
 delete mode 100644 analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/ebs-storage-dynamic-pvc/taxi-trip-execute.sh
 delete mode 100644 analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh
 delete mode 100644 analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/taxi-trip-execute.sh
 delete mode 100755 analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/taxi-trip-execute.sh
 delete mode 100755 analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh

diff --git a/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage/taxi-trip-execute.sh b/analytics/scripts/taxi-trip-execute.sh
similarity index 80%
rename from analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage/taxi-trip-execute.sh
rename to analytics/scripts/taxi-trip-execute.sh
index ca9156eeb..37a964ba5 100755
--- a/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage/taxi-trip-execute.sh
+++ b/analytics/scripts/taxi-trip-execute.sh
@@ -11,13 +11,11 @@
 # Script usage ./taxi-trip-execute my-s3-bucket us-west-2
 
-# validating that use passes two arguments, if not return a message to pass the arguments
 if [ $# -ne 2 ]; then
   echo "Usage: $0 <S3_BUCKET> <REGION>"
   exit 1
 fi
-
 S3_BUCKET="$1"
 REGION="$2"
 
 INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input/"
 
@@ -31,15 +29,21 @@ aws s3 cp pyspark-taxi-trip.py s3://${S3_BUCKET}/taxi-trip/scripts/ --region ${R
 
 # Copy Test Input data to S3 bucket
 wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -O "input/yellow_tripdata_2022-0.parquet"
+aws s3 cp "input/yellow_tripdata_2022-0.parquet" s3://${S3_BUCKET}/input/yellow_tripdata_2022-0.parquet
+
+pids=()
 
 # Making duplicate copies to increase the size of the data.
 max=100
 for (( i=1; i <= $max; ++i ))
 do
-    cp -rf "input/yellow_tripdata_2022-0.parquet" "input/yellow_tripdata_2022-${i}.parquet"
+    aws s3 cp s3://${S3_BUCKET}/input/yellow_tripdata_2022-0.parquet s3://${S3_BUCKET}/input/yellow_tripdata_2022-${i}.parquet &
+    pids+=($!)
 done
 
-aws s3 sync "input/" ${INPUT_DATA_S3_PATH}
+for pid in "${pids[@]}"; do
+    wait $pid
+done
 
 # Delete a local input folder
-rm -rf input
+rm -rf input
\ No newline at end of file
diff --git a/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/ebs-storage-dynamic-pvc/taxi-trip-execute.sh b/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/ebs-storage-dynamic-pvc/taxi-trip-execute.sh
deleted file mode 100644
index 31432838b..000000000
--- a/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/ebs-storage-dynamic-pvc/taxi-trip-execute.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-# This job copies Sample PySpark script and some test data to your S3 bucket which will enable you to run the following Spark Operator script
-
-# Prerequisites for running this shell script
-# 1/ Execute the shell script with the required arguments
-#    ./your_script.sh <S3_BUCKET> <REGION>
-# 2/ Ensure <S3_BUCKET> is replaced in "ebs-storage-dynamic-pvc.yaml" file
-# 3/ Execute the shell script which creates the input data in your S3 bucket
-# 4/ Run `kubectl apply -f ebs-storage-dynamic-pvc.yaml` to trigger the Spark job
-# 5/ Monitor the Spark job using "kubectl get pods -n spark-team-a -w"
-
-# Script usage ./taxi-trip-execute my-s3-bucket us-west-2
-
-if [ $# -ne 3 ]; then
-  echo "Usage: $0 <S3_BUCKET> <REGION>"
-  exit 1
-fi
-
-S3_BUCKET="$1"
-REGION="$2"
-
-INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input/"
-
-# Create a local input folder
-mkdir input
-
-# Copy PySpark Script to S3 bucket
-aws s3 cp pyspark-taxi-trip.py s3://${S3_BUCKET}/taxi-trip/scripts/ --region ${REGION}
-
-# Copy Test Input data to S3 bucket
-wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -O "input/yellow_tripdata_2022-0.parquet"
-
-# Making duplicate copies to increase the size of the data.
-max=100
-for (( i=1; i <= $max; ++i ))
-do
-    cp -rf "input/yellow_tripdata_2022-0.parquet" "input/yellow_tripdata_2022-${i}.parquet"
-done
-
-aws s3 sync "input/" ${INPUT_DATA_S3_PATH}
-
-# Delete a local input folder
-rm -rf input
diff --git a/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh b/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh
deleted file mode 100644
index ac6d8ceb6..000000000
--- a/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-# This job copies Sample PySpark script and some test data to your S3 bucket which will enable you to run the following Spark Operator script
-
-# Prerequisites for running this shell script
-# 1/ Execute the shell script with the required arguments
-#    ./your_script.sh <S3_BUCKET> <REGION>
-# 2/ Ensure <S3_BUCKET> is replaced in "nvme-storage-yunikorn-gang-scheduling.yaml" file
-# 3/ Execute the shell script which creates the input data in your S3 bucket
-# 4/ Run `kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml` to trigger the Spark job
-# 5/ Monitor the Spark job using "kubectl get pods -n spark-team-a -w"
-
-# Script usage ./taxi-trip-execute my-s3-bucket us-west-2
-
-if [ $# -ne 3 ]; then
-  echo "Usage: $0 <S3_BUCKET> <REGION>"
-  exit 1
-fi
-
-S3_BUCKET="$1"
-REGION="$2"
-
-INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input/"
-
-# Create a local input folder
-mkdir input
-
-# Copy PySpark Script to S3 bucket
-aws s3 cp pyspark-taxi-trip.py s3://${S3_BUCKET}/taxi-trip/scripts/ --region ${REGION}
-
-# Copy Test Input data to S3 bucket
-wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -O "input/yellow_tripdata_2022-0.parquet"
-
-# Making duplicate copies to increase the size of the data.
-max=100
-for (( i=1; i <= $max; ++i ))
-do
-    cp -rf "input/yellow_tripdata_2022-0.parquet" "input/yellow_tripdata_2022-${i}.parquet"
-done
-
-aws s3 sync "input/" ${INPUT_DATA_S3_PATH}
-
-# Delete a local input folder
-rm -rf input
diff --git a/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/taxi-trip-execute.sh b/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/taxi-trip-execute.sh
deleted file mode 100644
index 4ebf25143..000000000
--- a/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/taxi-trip-execute.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-# This job copies Sample PySpark script and some test data to your S3 bucket which will enable you to run the following Spark Operator script
-
-# Prerequisites for running this shell script
-# 1/ Execute the shell script with the required arguments
-#    ./your_script.sh <S3_BUCKET> <REGION>
-# 2/ Ensure <S3_BUCKET> is replaced in "ebs-storage-dynamic-pvc.yaml" file
-# 3/ Execute the shell script which creates the input data in your S3 bucket
-# 4/ Run `kubectl apply -f ebs-storage-dynamic-pvc.yaml` to trigger the Spark job
-# 5/ Monitor the Spark job using "kubectl get pods -n spark-team-a -w"
-
-# Script usage ./taxi-trip-execute my-s3-bucket us-west-2
-
-if [ $# -ne 2 ]; then
-  echo "Usage: $0 <S3_BUCKET> <REGION>"
-  exit 1
-fi
-
-S3_BUCKET="$1"
-REGION="$2"
-
-INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input/"
-
-# Create a local input folder
-mkdir input
-
-# Copy PySpark Script to S3 bucket
-aws s3 cp pyspark-taxi-trip.py s3://${S3_BUCKET}/taxi-trip/scripts/ --region ${REGION}
-
-# Copy Test Input data to S3 bucket
-wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -O "input/yellow_tripdata_2022-0.parquet"
-
-# Making duplicate copies to increase the size of the data.
-max=100
-for (( i=1; i <= $max; ++i ))
-do
-    cp -rf "input/yellow_tripdata_2022-0.parquet" "input/yellow_tripdata_2022-${i}.parquet"
-done
-
-aws s3 sync "input/" ${INPUT_DATA_S3_PATH}
-
-# Delete a local input folder
-rm -rf input
diff --git a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/taxi-trip-execute.sh b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/taxi-trip-execute.sh
deleted file mode 100755
index b3a9dcfb5..000000000
--- a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/taxi-trip-execute.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-# This job copies Sample PySpark script and some test data to your S3 bucket which will enable you to run the following Spark Operator script
-
-# Prerequisites for running this shell script
-# 1/ Execute the shell script with the required arguments
-#    ./your_script.sh <S3_BUCKET> <REGION>
-# 2/ Ensure <S3_BUCKET> is replaced in "nvme-ephemeral-storage.yaml" file
-# 3/ Execute the shell script which creates the input data in your S3 bucket
-# 4/ Run `kubectl apply -f nvme-ephemeral-storage.yaml` to trigger the Spark job
-# 5/ Monitor the Spark job using "kubectl get pods -n spark-team-a -w"
-
-# Script usage ./taxi-trip-execute my-s3-bucket us-west-2
-
-if [ $# -ne 2 ]; then
-  echo "Usage: $0 <S3_BUCKET> <REGION>"
-  exit 1
-fi
-
-S3_BUCKET="$1"
-REGION="$2"
-
-INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input/"
-
-# Create a local input folder
-mkdir input
-
-# Copy PySpark Script to S3 bucket
-aws s3 cp pyspark-taxi-trip.py s3://${S3_BUCKET}/taxi-trip/scripts/ --region ${REGION}
-
-# Copy Test Input data to S3 bucket
-wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -O "input/yellow_tripdata_2022-0.parquet"
-
-# Making duplicate copies to increase the size of the data.
-max=100
-for (( i=1; i <= $max; ++i ))
-do
-    cp -rf "input/yellow_tripdata_2022-0.parquet" "input/yellow_tripdata_2022-${i}.parquet"
-done
-
-aws s3 sync "input/" ${INPUT_DATA_S3_PATH}
-
-# Delete a local input folder
-rm -rf input
diff --git a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh
deleted file mode 100755
index 25707e596..000000000
--- a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-# This job copies Sample PySpark script and some test data to your S3 bucket which will enable you to run the following Spark Operator script
-
-# Prerequisites for running this shell script
-# 1/ Execute the shell script with the required arguments
-#    ./your_script.sh <S3_BUCKET> <REGION>
-# 2/ Ensure <S3_BUCKET> is replaced in "nvme-storage-yunikorn-gang-scheduling.yaml" file
-# 3/ Execute the shell script which creates the input data in your S3 bucket
-# 4/ Run `kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml` to trigger the Spark job
-# 5/ Monitor the Spark job using "kubectl get pods -n spark-team-a -w"
-
-# Script usage ./taxi-trip-execute my-s3-bucket us-west-2
-
-if [ $# -ne 2 ]; then
-  echo "Usage: $0 <S3_BUCKET> <REGION>"
-  exit 1
-fi
-
-S3_BUCKET="$1"
-REGION="$2"
-
-INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input/"
-
-# Create a local input folder
-mkdir input
-
-# Copy PySpark Script to S3 bucket
-aws s3 cp pyspark-taxi-trip.py s3://${S3_BUCKET}/taxi-trip/scripts/ --region ${REGION}
-
-# Copy Test Input data to S3 bucket
-wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -O "input/yellow_tripdata_2022-0.parquet"
-
-# Making duplicate copies to increase the size of the data.
-max=100
-for (( i=1; i <= $max; ++i ))
-do
-    cp -rf "input/yellow_tripdata_2022-0.parquet" "input/yellow_tripdata_2022-${i}.parquet"
-done
-
-aws s3 sync "input/" ${INPUT_DATA_S3_PATH}
-
-# Delete a local input folder
-rm -rf input
diff --git a/website/docs/blueprints/data-analytics/_taxi_trip_exec.md b/website/docs/blueprints/data-analytics/_taxi_trip_exec.md
index f6edf2c88..eef45910b 100644
--- a/website/docs/blueprints/data-analytics/_taxi_trip_exec.md
+++ b/website/docs/blueprints/data-analytics/_taxi_trip_exec.md
@@ -5,5 +5,5 @@ it in order to increase the size a bit. This will take a bit of time and will
 require a relatively fast internet connection.
 
 ```bash
-./taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE
-```
+${DOEKS_HOME}/analytics/scripts/taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE
+```
\ No newline at end of file
diff --git a/website/docs/blueprints/data-analytics/observability-spark-on-eks.md b/website/docs/blueprints/data-analytics/observability-spark-on-eks.md
index 2b5371a71..83b7226be 100644
--- a/website/docs/blueprints/data-analytics/observability-spark-on-eks.md
+++ b/website/docs/blueprints/data-analytics/observability-spark-on-eks.md
@@ -2,6 +2,9 @@
 sidebar_position: 3
 sidebar_label: Observability Spark on EKS
 ---
+
+import TaxiTripExec from './_taxi_trip_exec.md';
+
 # Observability Spark on EKS
 
 ## Introduction
@@ -15,11 +18,10 @@ We will reuse the previous Spark on Operator example. Please follow [this link](
 let's navigate to one example folder under spark-k8s-operator and run the shell script to upload data and py script to the S3 bucket created by terraform above.
 
 ```bash
 cd data-on-eks/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage
-
-# replace <S3_BUCKET> with your S3 bucket and <REGION> with your region, then run
-./taxi-trip-execute.sh
 ```
+
+<TaxiTripExec />
 
 ## Spark Web UI
 When you submit a Spark application, Spark context is created which ideally gives you [Spark Web UI](https://sparkbyexamples.com/spark/spark-web-ui-understanding/) to monitor the execution of the application. Monitoring includes the following.
 - Spark configurations used

From d922ead94f9ea85046688f1cd0c121c57576a629 Mon Sep 17 00:00:00 2001
From: Ray Krueger
Date: Mon, 8 Apr 2024 19:31:59 -0500
Subject: [PATCH 2/2] Take the default duplication down to 25

At some point the duplication was bumped to 100. That takes forever to
complete with 4x4GB executors, which isn't a useful experience for
users.

With duplication at 25, the Spark job takes about 5 minutes to
complete, which is plenty of time for the user to poke around.
---
 analytics/scripts/taxi-trip-execute.sh                 | 10 +++++-----
 .../docs/blueprints/data-analytics/_taxi_trip_exec.md  |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/analytics/scripts/taxi-trip-execute.sh b/analytics/scripts/taxi-trip-execute.sh
index 37a964ba5..fd609b12d 100755
--- a/analytics/scripts/taxi-trip-execute.sh
+++ b/analytics/scripts/taxi-trip-execute.sh
@@ -19,7 +19,7 @@ fi
 S3_BUCKET="$1"
 REGION="$2"
 
-INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input/"
+INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input"
 
 # Create a local input folder
 mkdir input
@@ -29,15 +29,15 @@ aws s3 cp pyspark-taxi-trip.py s3://${S3_BUCKET}/taxi-trip/scripts/ --region ${R
 
 # Copy Test Input data to S3 bucket
 wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -O "input/yellow_tripdata_2022-0.parquet"
-aws s3 cp "input/yellow_tripdata_2022-0.parquet" s3://${S3_BUCKET}/input/yellow_tripdata_2022-0.parquet
+aws s3 cp "input/yellow_tripdata_2022-0.parquet" ${INPUT_DATA_S3_PATH}/yellow_tripdata_2022-0.parquet
 
 pids=()
 
 # Making duplicate copies to increase the size of the data.
-max=100
+max=25
 for (( i=1; i <= $max; ++i ))
 do
-    aws s3 cp s3://${S3_BUCKET}/input/yellow_tripdata_2022-0.parquet s3://${S3_BUCKET}/input/yellow_tripdata_2022-${i}.parquet &
+    aws s3 cp ${INPUT_DATA_S3_PATH}/yellow_tripdata_2022-0.parquet ${INPUT_DATA_S3_PATH}/yellow_tripdata_2022-${i}.parquet &
     pids+=($!)
 done
 
@@ -46,4 +46,4 @@ for pid in "${pids[@]}"; do
 done
 
 # Delete a local input folder
-rm -rf input
\ No newline at end of file
+rm -rf input
diff --git a/website/docs/blueprints/data-analytics/_taxi_trip_exec.md b/website/docs/blueprints/data-analytics/_taxi_trip_exec.md
index eef45910b..5352f9a52 100644
--- a/website/docs/blueprints/data-analytics/_taxi_trip_exec.md
+++ b/website/docs/blueprints/data-analytics/_taxi_trip_exec.md
@@ -6,4 +6,4 @@ require a relatively fast internet connection.
 
 ```bash
 ${DOEKS_HOME}/analytics/scripts/taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE
-```
\ No newline at end of file
+```
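
For reference, here is a minimal standalone sketch of the background S3-to-S3 duplication pattern both patches rely on. It is not the repository script itself: the bucket name and region are placeholders, and the `set -euo pipefail` line, the `wget -q` download, and the explicit `--region` flags are additions for the sketch.

```bash
#!/bin/bash
# Sketch: duplicate one seed object N times with server-side S3 copies.
# Placeholder values; substitute your own bucket and region.
set -euo pipefail

S3_BUCKET="my-example-bucket"
REGION="us-west-2"
INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input"

# Download the seed file once, then upload it once over the local network.
wget -q https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet
aws s3 cp "yellow_tripdata_2022-01.parquet" \
  "${INPUT_DATA_S3_PATH}/yellow_tripdata_2022-0.parquet" --region "${REGION}"

# Fan the duplicates out as S3-to-S3 copies, each running in the background,
# so the data never comes back down to the local machine.
pids=()
max=25
for (( i = 1; i <= max; ++i )); do
  aws s3 cp "${INPUT_DATA_S3_PATH}/yellow_tripdata_2022-0.parquet" \
    "${INPUT_DATA_S3_PATH}/yellow_tripdata_2022-${i}.parquet" --region "${REGION}" &
  pids+=("$!")
done

# Wait on every copy; with `set -e`, a failed copy aborts the script.
for pid in "${pids[@]}"; do
  wait "$pid"
done
```

Because an `aws s3 cp` between two S3 URIs is performed by S3 itself (effectively a server-side CopyObject), backgrounding 25 of them costs little local bandwidth, and the total wall-clock time is roughly that of the slowest single copy rather than the sum of all uploads.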