From 351b068c07d17a69f0c44c96ef2988d6401a5d35 Mon Sep 17 00:00:00 2001 From: thomas-serre-sonarsource Date: Wed, 29 Jan 2025 14:16:37 +0000 Subject: [PATCH 1/3] Create rule S7191 --- rules/S7191/metadata.json | 2 ++ rules/S7191/python/metadata.json | 25 ++++++++++++++++++ rules/S7191/python/rule.adoc | 44 ++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) create mode 100644 rules/S7191/metadata.json create mode 100644 rules/S7191/python/metadata.json create mode 100644 rules/S7191/python/rule.adoc diff --git a/rules/S7191/metadata.json b/rules/S7191/metadata.json new file mode 100644 index 00000000000..2c63c085104 --- /dev/null +++ b/rules/S7191/metadata.json @@ -0,0 +1,2 @@ +{ +} diff --git a/rules/S7191/python/metadata.json b/rules/S7191/python/metadata.json new file mode 100644 index 00000000000..6fe5f528b6f --- /dev/null +++ b/rules/S7191/python/metadata.json @@ -0,0 +1,25 @@ +{ + "title": "FIXME", + "type": "CODE_SMELL", + "status": "ready", + "remediation": { + "func": "Constant\/Issue", + "constantCost": "5min" + }, + "tags": [ + ], + "defaultSeverity": "Major", + "ruleSpecification": "RSPEC-7191", + "sqKey": "S7191", + "scope": "All", + "defaultQualityProfiles": ["Sonar way"], + "quickfix": "unknown", + "code": { + "impacts": { + "MAINTAINABILITY": "HIGH", + "RELIABILITY": "MEDIUM", + "SECURITY": "LOW" + }, + "attribute": "CONVENTIONAL" + } +} diff --git a/rules/S7191/python/rule.adoc b/rules/S7191/python/rule.adoc new file mode 100644 index 00000000000..caae0d69054 --- /dev/null +++ b/rules/S7191/python/rule.adoc @@ -0,0 +1,44 @@ +FIXME: add a description + +// If you want to factorize the description uncomment the following line and create the file. +//include::../description.adoc[] + +== Why is this an issue? + +FIXME: remove the unused optional headers (that are commented out) + +//=== What is the potential impact? + +== How to fix it +//== How to fix it in FRAMEWORK NAME + +=== Code examples + +==== Noncompliant code example + +[source,python,diff-id=1,diff-type=noncompliant] +---- +FIXME +---- + +==== Compliant solution + +[source,python,diff-id=1,diff-type=compliant] +---- +FIXME +---- + +//=== How does this work? 
+
+//=== Pitfalls
+
+//=== Going the extra mile
+
+
+//== Resources
+//=== Documentation
+//=== Articles & blog posts
+//=== Conference presentations
+//=== Standards
+//=== External coding guidelines
+//=== Benchmarks

From cb5c361642cb8988953d5ad9ba56b9de2df1d4fc Mon Sep 17 00:00:00 2001
From: Thomas Serre
Date: Wed, 29 Jan 2025 15:45:55 +0100
Subject: [PATCH 2/3] Create rule S7191: `withColumns` should be preferred
 over `withColumn` when multiple columns are specified

---
 rules/S7191/python/metadata.json | 13 +++----
 rules/S7191/python/rule.adoc     | 62 +++++++++++++++++++++-----------
 2 files changed, 48 insertions(+), 27 deletions(-)

diff --git a/rules/S7191/python/metadata.json b/rules/S7191/python/metadata.json
index 6fe5f528b6f..2d2eb311488 100644
--- a/rules/S7191/python/metadata.json
+++ b/rules/S7191/python/metadata.json
@@ -1,5 +1,5 @@
 {
-  "title": "FIXME",
+  "title": "`withColumns` method should be preferred over `withColumn` when multiple columns are specified",
   "type": "CODE_SMELL",
   "status": "ready",
   "remediation": {
@@ -7,19 +7,20 @@
     "constantCost": "5min"
   },
   "tags": [
+    "pyspark",
+    "data-science"
   ],
   "defaultSeverity": "Major",
   "ruleSpecification": "RSPEC-7191",
   "sqKey": "S7191",
   "scope": "All",
   "defaultQualityProfiles": ["Sonar way"],
-  "quickfix": "unknown",
+  "quickfix": "partial",
   "code": {
     "impacts": {
-      "MAINTAINABILITY": "HIGH",
-      "RELIABILITY": "MEDIUM",
-      "SECURITY": "LOW"
+      "MAINTAINABILITY": "MEDIUM",
+      "RELIABILITY": "MEDIUM"
     },
-    "attribute": "CONVENTIONAL"
+    "attribute": "EFFICIENT"
   }
 }

diff --git a/rules/S7191/python/rule.adoc b/rules/S7191/python/rule.adoc
index caae0d69054..205a1a0e499 100644
--- a/rules/S7191/python/rule.adoc
+++ b/rules/S7191/python/rule.adoc
@@ -1,16 +1,29 @@
-FIXME: add a description
-
-// If you want to factorize the description uncomment the following line and create the file.
-//include::../description.adoc[]
+This rule flags successive `withColumn` calls that add or modify columns of a PySpark DataFrame and suggests replacing them with a single `withColumns` call, which is more efficient and concise.
 
 == Why is this an issue?
 
-FIXME: remove the unused optional headers (that are commented out)
+Calling `withColumn` multiple times is inefficient, because each call introduces a new projection in the Spark logical plan. `withColumns` adds or modifies multiple columns in a single operation, producing a single projection and improving performance.
+
+=== What is the potential impact?
+
+Every `withColumn` call grows the logical plan, so long chains of calls produce large plans that are costly to analyze and optimize, which can degrade performance and even trigger a `StackOverflowError`.
+
+=== Exceptions
+
+`withColumn` can be used multiple times sequentially on a DataFrame when each new column is computed from a column created by a previous call.
+In that case, a single `withColumns` call cannot express the dependency, because its expressions cannot reference columns defined in the same call, so consecutive `withColumn` calls are the right solution.
 
-//=== What is the potential impact?
+[source,python]
+----
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import col
+spark = SparkSession.builder.getOrCreate()
+df = spark.createDataFrame([[1,2],[2,3]], ["id", "value"])
+df_with_new_cols = df.withColumn("squared_value", col("value") * col("value")).withColumn("cubic_value", col("squared_value") * col("value")) # Compliant
+----
 
 == How to fix it
-//== How to fix it in FRAMEWORK NAME
+To fix this issue, use the `withColumns` method instead of multiple consecutive calls to `withColumn`.
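+
+As a minimal sketch of how this plan growth can be observed (assuming PySpark 3.3 or later, where `withColumns` is available), the `explain()` calls below only print the query plans and are not part of the fix:
+
+[source,python]
+----
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import col
+
+spark = SparkSession.builder.getOrCreate()
+df = spark.createDataFrame([[1, 2], [2, 3]], ["id", "value"])
+
+# Each withColumn call adds one Project node to the parsed logical plan.
+chained = df.withColumn("a", col("value") + 1).withColumn("b", col("value") + 2)
+chained.explain(extended=True)
+
+# A single withColumns call produces one Project node for all new columns.
+batched = df.withColumns({"a": col("value") + 1, "b": col("value") + 2})
+batched.explain(extended=True)
+----
+
+Note that the optimizer usually collapses adjacent projections, so both optimized plans end up equivalent; the cost lies in building and analyzing the larger parsed plan.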
 === Code examples
 
@@ -18,27 +31,34 @@
 
 ==== Noncompliant code example
 
 [source,python,diff-id=1,diff-type=noncompliant]
 ----
-FIXME
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import col
+spark = SparkSession.builder.getOrCreate()
+df = spark.createDataFrame([[1,2],[2,3]], ["id", "value"])
+df_with_new_cols = df.withColumn("value_plus_1", col("value") + 1).withColumn("value_plus_2", col("value") + 2).withColumn("value_plus_3", col("value") + 3) # Noncompliant
 ----
 
 ==== Compliant solution
 
 [source,python,diff-id=1,diff-type=compliant]
 ----
-FIXME
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import col
+spark = SparkSession.builder.getOrCreate()
+df = spark.createDataFrame([[1,2],[2,3]], ["id", "value"])
+df_with_new_cols = df.withColumns({ # Compliant
+    "value_plus_1": col("value") + 1,
+    "value_plus_2": col("value") + 2,
+    "value_plus_3": col("value") + 3,
+})
 ----
 
-//=== How does this work?
-
-//=== Pitfalls
-
-//=== Going the extra mile
+== Resources
+=== Documentation
+* PySpark withColumn documentation - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumn.html[pyspark.sql.DataFrame.withColumn]
+* PySpark withColumns documentation - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumns.html[pyspark.sql.DataFrame.withColumns]
+
+=== Articles & blog posts
 
-//== Resources
-//=== Documentation
-//=== Articles & blog posts
-//=== Conference presentations
-//=== Standards
-//=== External coding guidelines
-//=== Benchmarks
+* Medium blog - https://blog.devgenius.io/why-to-avoid-multiple-chaining-of-withcolumn-function-in-spark-job-35ee8e09daaa[Why to avoid multiple chaining of withColumn() function in Spark job.]

From 978668565220ae33db51fc0b5bed4979c5edfd99 Mon Sep 17 00:00:00 2001
From: Thomas Serre
Date: Tue, 4 Feb 2025 10:35:34 +0100
Subject: [PATCH 3/3] Fix after review

---
 rules/S7191/python/metadata.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rules/S7191/python/metadata.json b/rules/S7191/python/metadata.json
index 2d2eb311488..7ccb948186c 100644
--- a/rules/S7191/python/metadata.json
+++ b/rules/S7191/python/metadata.json
@@ -1,5 +1,5 @@
 {
-  "title": "`withColumns` method should be preferred over `withColumn` when multiple columns are specified",
+  "title": "\"withColumns\" method should be preferred over \"withColumn\" when multiple columns are specified",
   "type": "CODE_SMELL",
   "status": "ready",
   "remediation": {