From 351b068c07d17a69f0c44c96ef2988d6401a5d35 Mon Sep 17 00:00:00 2001 From: thomas-serre-sonarsource Date: Wed, 29 Jan 2025 14:16:37 +0000 Subject: [PATCH 1/3] Create rule S7191 --- rules/S7191/metadata.json | 2 ++ rules/S7191/python/metadata.json | 25 ++++++++++++++++++ rules/S7191/python/rule.adoc | 44 ++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) create mode 100644 rules/S7191/metadata.json create mode 100644 rules/S7191/python/metadata.json create mode 100644 rules/S7191/python/rule.adoc diff --git a/rules/S7191/metadata.json b/rules/S7191/metadata.json new file mode 100644 index 00000000000..2c63c085104 --- /dev/null +++ b/rules/S7191/metadata.json @@ -0,0 +1,2 @@ +{ +} diff --git a/rules/S7191/python/metadata.json b/rules/S7191/python/metadata.json new file mode 100644 index 00000000000..6fe5f528b6f --- /dev/null +++ b/rules/S7191/python/metadata.json @@ -0,0 +1,25 @@ +{ + "title": "FIXME", + "type": "CODE_SMELL", + "status": "ready", + "remediation": { + "func": "Constant\/Issue", + "constantCost": "5min" + }, + "tags": [ + ], + "defaultSeverity": "Major", + "ruleSpecification": "RSPEC-7191", + "sqKey": "S7191", + "scope": "All", + "defaultQualityProfiles": ["Sonar way"], + "quickfix": "unknown", + "code": { + "impacts": { + "MAINTAINABILITY": "HIGH", + "RELIABILITY": "MEDIUM", + "SECURITY": "LOW" + }, + "attribute": "CONVENTIONAL" + } +} diff --git a/rules/S7191/python/rule.adoc b/rules/S7191/python/rule.adoc new file mode 100644 index 00000000000..caae0d69054 --- /dev/null +++ b/rules/S7191/python/rule.adoc @@ -0,0 +1,44 @@ +FIXME: add a description + +// If you want to factorize the description uncomment the following line and create the file. +//include::../description.adoc[] + +== Why is this an issue? + +FIXME: remove the unused optional headers (that are commented out) + +//=== What is the potential impact? + +== How to fix it +//== How to fix it in FRAMEWORK NAME + +=== Code examples + +==== Noncompliant code example + +[source,python,diff-id=1,diff-type=noncompliant] +---- +FIXME +---- + +==== Compliant solution + +[source,python,diff-id=1,diff-type=compliant] +---- +FIXME +---- + +//=== How does this work? 
+
+//=== Pitfalls
+
+//=== Going the extra mile
+
+
+//== Resources
+//=== Documentation
+//=== Articles & blog posts
+//=== Conference presentations
+//=== Standards
+//=== External coding guidelines
+//=== Benchmarks

From cb5c361642cb8988953d5ad9ba56b9de2df1d4fc Mon Sep 17 00:00:00 2001
From: Thomas Serre
Date: Wed, 29 Jan 2025 15:45:55 +0100
Subject: [PATCH 2/3] Create rule S7191: `withColumns` should be preferred
 over `withColumn` when multiple columns are specified

---
 rules/S7191/python/metadata.json | 13 +++----
 rules/S7191/python/rule.adoc     | 62 +++++++++++++++++++++-----------
 2 files changed, 48 insertions(+), 27 deletions(-)

diff --git a/rules/S7191/python/metadata.json b/rules/S7191/python/metadata.json
index 6fe5f528b6f..2d2eb311488 100644
--- a/rules/S7191/python/metadata.json
+++ b/rules/S7191/python/metadata.json
@@ -1,5 +1,5 @@
 {
-  "title": "FIXME",
+  "title": "`withColumns` method should be preferred over `withColumn` when multiple columns are specified",
   "type": "CODE_SMELL",
   "status": "ready",
   "remediation": {
@@ -7,19 +7,20 @@
     "constantCost": "5min"
   },
   "tags": [
+    "pyspark",
+    "data-science"
   ],
   "defaultSeverity": "Major",
   "ruleSpecification": "RSPEC-7191",
   "sqKey": "S7191",
   "scope": "All",
   "defaultQualityProfiles": ["Sonar way"],
-  "quickfix": "unknown",
+  "quickfix": "partial",
   "code": {
     "impacts": {
-      "MAINTAINABILITY": "HIGH",
-      "RELIABILITY": "MEDIUM",
-      "SECURITY": "LOW"
+      "MAINTAINABILITY": "MEDIUM",
+      "RELIABILITY": "MEDIUM"
     },
-    "attribute": "CONVENTIONAL"
+    "attribute": "EFFICIENT"
   }
 }

diff --git a/rules/S7191/python/rule.adoc b/rules/S7191/python/rule.adoc
index caae0d69054..205a1a0e499 100644
--- a/rules/S7191/python/rule.adoc
+++ b/rules/S7191/python/rule.adoc
@@ -1,16 +1,29 @@
-FIXME: add a description
-
-// If you want to factorize the description uncomment the following line and create the file.
-//include::../description.adoc[]
+This rule flags successive `withColumn` calls that add or modify columns of a PySpark DataFrame and suggests replacing them with a single `withColumns` call, which is more efficient and concise.
 
 == Why is this an issue?
 
-FIXME: remove the unused optional headers (that are commented out)
+Calling `withColumn` multiple times is inefficient, because each call introduces a new projection in the Spark logical plan. `withColumns` adds or modifies multiple columns in a single operation, producing a single projection and improving performance.
+
+=== What is the potential impact?
+
+Every `withColumn` call grows the logical plan, so long chains of calls produce large plans that are costly to analyze and optimize, which can degrade performance and even trigger a `StackOverflowError`.
+
+=== Exceptions
+
+`withColumn` can be used multiple times sequentially on a DataFrame when each new column is computed from a column created by a previous call.
+In that case, a single `withColumns` call cannot express the dependency, because its expressions cannot reference columns defined in the same call, so consecutive `withColumn` calls are the right solution.
 
-//=== What is the potential impact?
+[source,python]
+----
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import col
+spark = SparkSession.builder.getOrCreate()
+df = spark.createDataFrame([[1,2],[2,3]], ["id", "value"])
+df_with_new_cols = df.withColumn("squared_value", col("value") * col("value")).withColumn("cubic_value", col("squared_value") * col("value")) # Compliant
+----
 
 == How to fix it
-//== How to fix it in FRAMEWORK NAME
+To fix this issue, use the `withColumns` method instead of multiple consecutive calls to `withColumn`.
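+
+As a minimal sketch of how this plan growth can be observed (assuming PySpark 3.3 or later, where `withColumns` is available), the `explain()` calls below only print the query plans and are not part of the fix:
+
+[source,python]
+----
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import col
+
+spark = SparkSession.builder.getOrCreate()
+df = spark.createDataFrame([[1, 2], [2, 3]], ["id", "value"])
+
+# Each withColumn call adds one Project node to the parsed logical plan.
+chained = df.withColumn("a", col("value") + 1).withColumn("b", col("value") + 2)
+chained.explain(extended=True)
+
+# A single withColumns call produces one Project node for all new columns.
+batched = df.withColumns({"a": col("value") + 1, "b": col("value") + 2})
+batched.explain(extended=True)
+----
+
+Note that the optimizer usually collapses adjacent projections, so both optimized plans end up equivalent; the cost lies in building and analyzing the larger parsed plan.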
 === Code examples
 
@@ -18,27 +31,34 @@
 
 ==== Noncompliant code example
 
 [source,python,diff-id=1,diff-type=noncompliant]
 ----
-FIXME
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import col
+spark = SparkSession.builder.getOrCreate()
+df = spark.createDataFrame([[1,2],[2,3]], ["id", "value"])
+df_with_new_cols = df.withColumn("value_plus_1", col("value") + 1).withColumn("value_plus_2", col("value") + 2).withColumn("value_plus_3", col("value") + 3) # Noncompliant
 ----
 
 ==== Compliant solution
 
 [source,python,diff-id=1,diff-type=compliant]
 ----
-FIXME
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import col
+spark = SparkSession.builder.getOrCreate()
+df = spark.createDataFrame([[1,2],[2,3]], ["id", "value"])
+df_with_new_cols = df.withColumns({ # Compliant
+    "value_plus_1": col("value") + 1,
+    "value_plus_2": col("value") + 2,
+    "value_plus_3": col("value") + 3,
+})
 ----
 
-//=== How does this work?
-
-//=== Pitfalls
-
-//=== Going the extra mile
+== Resources
+=== Documentation
+* PySpark withColumn documentation - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumn.html[pyspark.sql.DataFrame.withColumn]
+* PySpark withColumns documentation - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumns.html[pyspark.sql.DataFrame.withColumns]
+
+=== Articles & blog posts
 
-//== Resources
-//=== Documentation
-//=== Articles & blog posts
-//=== Conference presentations
-//=== Standards
-//=== External coding guidelines
-//=== Benchmarks
+* Medium blog - https://blog.devgenius.io/why-to-avoid-multiple-chaining-of-withcolumn-function-in-spark-job-35ee8e09daaa[Why to avoid multiple chaining of withColumn() function in Spark job.]

From 978668565220ae33db51fc0b5bed4979c5edfd99 Mon Sep 17 00:00:00 2001
From: Thomas Serre
Date: Tue, 4 Feb 2025 10:35:34 +0100
Subject: [PATCH 3/3] Fix after review

---
 rules/S7191/python/metadata.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rules/S7191/python/metadata.json b/rules/S7191/python/metadata.json
index 2d2eb311488..7ccb948186c 100644
--- a/rules/S7191/python/metadata.json
+++ b/rules/S7191/python/metadata.json
@@ -1,5 +1,5 @@
 {
-  "title": "`withColumns` method should be preferred over `withColumn` when multiple columns are specified",
+  "title": "\"withColumns\" method should be preferred over \"withColumn\" when multiple columns are specified",
   "type": "CODE_SMELL",
   "status": "ready",
   "remediation": {