Skip to content

Commit

Permalink
PySpark tutorial updates
Browse files Browse the repository at this point in the history
  • Loading branch information
adrianisk committed Mar 15, 2024
1 parent 5198110 commit 9b79d56
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 11 deletions.
1 change: 0 additions & 1 deletion pyspark/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
numpy==1.26.4
pandas==2.2.1
pyspark==3.5.1
sqlglot==22.0.0
gable>=0.10.0a0
15 changes: 5 additions & 10 deletions pyspark/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,11 @@ def run_job(spark, final_output_table):
# Write the joined data to the final output table
joined_df.write.mode("overwrite").saveAsTable(final_output_table)

joined_df.createOrReplaceTempView("joined_temp")
commission_rate_metrics_df = spark.sql("""
SELECT
booking_date,
avg(commission_rate) AS "avg_commission_rate",
max(commission_rate) AS "max_commission_rate",
min(commission_rate) AS "min_commission_rate"
FROM
joined_temp
""")
commission_rate_metrics_df = joined_df.groupBy("booking_date").agg(
F.avg("commission_rate").alias("avg_commission_rate"),
F.max("commission_rate").alias("max_commission_rate"),
F.min("commission_rate").alias("min_commission_rate")
)

# Merge into existing metrics table
commission_rate_metrics_df.createOrReplaceTempView("commission_rate_metrics_temp")
Expand Down

0 comments on commit 9b79d56

Please sign in to comment.