Skip to content

Commit

Permalink
v0.4 Random forest train pipeline done
Browse files Browse the repository at this point in the history
  • Loading branch information
Shakleen committed Oct 7, 2024
1 parent b0c305a commit 9a859ce
Show file tree
Hide file tree
Showing 21 changed files with 8 additions and 4 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"class":"org.apache.spark.ml.regression.RandomForestRegressionModel","timestamp":1728229923283,"sparkVersion":"3.5.3","uid":"RandomForestRegressor_adb1e91a940d","paramMap":{"predictionCol":"predicted_bike_demand","labelCol":"bike_demand","featuresCol":"final_features","seed":29,"subsamplingRate":0.01,"numTrees":100,"maxDepth":10},"defaultParamMap":{"labelCol":"label","predictionCol":"prediction","featuresCol":"features","minInstancesPerNode":1,"bootstrap":true,"maxBins":32,"seed":6152653672830384431,"minInfoGain":0.0,"checkpointInterval":10,"subsamplingRate":1.0,"maxMemoryInMB":256,"featureSubsetStrategy":"auto","minWeightFractionPerNode":0.0,"impurity":"variance","numTrees":20,"leafCol":"","cacheNodeIds":false,"maxDepth":5},"numFeatures":30,"numTrees":100}
{"class":"org.apache.spark.ml.regression.RandomForestRegressionModel","timestamp":1728313143351,"sparkVersion":"3.5.3","uid":"RandomForestRegressor_225f2649b4d0","paramMap":{"subsamplingRate":0.01,"predictionCol":"predicted_bike_demand","featuresCol":"final_features","seed":29,"minInstancesPerNode":100,"maxBins":32,"numTrees":100,"maxDepth":25,"labelCol":"bike_demand"},"defaultParamMap":{"checkpointInterval":10,"subsamplingRate":1.0,"predictionCol":"prediction","leafCol":"","featuresCol":"features","seed":-3044862090669368019,"featureSubsetStrategy":"auto","minInstancesPerNode":1,"minWeightFractionPerNode":0.0,"minInfoGain":0.0,"maxBins":32,"maxMemoryInMB":256,"cacheNodeIds":false,"impurity":"variance","bootstrap":true,"numTrees":20,"labelCol":"label","maxDepth":5},"numFeatures":30,"numTrees":100}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"class":"org.apache.spark.ml.regression.RandomForestRegressionModel","timestamp":1728232412003,"sparkVersion":"3.5.3","uid":"RandomForestRegressor_dc9537a6c871","paramMap":{"seed":29,"subsamplingRate":0.01,"numTrees":100,"maxDepth":10,"labelCol":"dock_demand","predictionCol":"predicted_dock_demand","featuresCol":"final_features"},"defaultParamMap":{"minWeightFractionPerNode":0.0,"minInfoGain":0.0,"seed":6152653672830384431,"checkpointInterval":10,"maxMemoryInMB":256,"impurity":"variance","leafCol":"","numTrees":20,"subsamplingRate":1.0,"maxDepth":5,"cacheNodeIds":false,"labelCol":"label","predictionCol":"prediction","featuresCol":"features","maxBins":32,"minInstancesPerNode":1,"featureSubsetStrategy":"auto","bootstrap":true},"numFeatures":30,"numTrees":100}
{"class":"org.apache.spark.ml.regression.RandomForestRegressionModel","timestamp":1728324809326,"sparkVersion":"3.5.3","uid":"RandomForestRegressor_da7f08d2085c","paramMap":{"seed":29,"numTrees":100,"maxDepth":25,"subsamplingRate":0.01,"labelCol":"dock_demand","maxBins":32,"featuresCol":"final_features","predictionCol":"predicted_dock_demand","minInstancesPerNode":100},"defaultParamMap":{"minWeightFractionPerNode":0.0,"minInfoGain":0.0,"leafCol":"","impurity":"variance","seed":-3044862090669368019,"numTrees":20,"bootstrap":true,"maxDepth":5,"featureSubsetStrategy":"auto","checkpointInterval":10,"subsamplingRate":1.0,"maxMemoryInMB":256,"labelCol":"label","maxBins":32,"cacheNodeIds":false,"featuresCol":"features","predictionCol":"prediction","minInstancesPerNode":1},"numFeatures":30,"numTrees":100}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
8 changes: 6 additions & 2 deletions src/train_pipeline/random_forest_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from pyspark.sql import SparkSession
from pyspark.ml.regression import RandomForestRegressor, RandomForestRegressionModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.dataframe import DataFrame
from typing import Tuple

Expand Down Expand Up @@ -35,7 +34,9 @@ class RandomForestPipelineConfig:
subsampling_rate: float = 0.01
train_val_test_split_ratio = [0.9, 0.05, 0.05]
max_depth: int = 25
num_trees: int = 250
num_trees: int = 100
min_instances_per_node: int = 100
max_bins: int = 32


class RandomForestPipeline:
Expand All @@ -54,6 +55,7 @@ def split_train_val_test(
return (train_data, val_data, test_data)

def get_regressor(self, label_name: str, predict_name: str):

return RandomForestRegressor(
featuresCol=self.config.feature_column_name,
labelCol=label_name,
Expand All @@ -62,6 +64,8 @@ def get_regressor(self, label_name: str, predict_name: str):
subsamplingRate=self.config.subsampling_rate,
maxDepth=self.config.max_depth,
numTrees=self.config.num_trees,
minInstancesPerNode=self.config.min_instances_per_node,
maxBins=self.config.max_bins,
)

def eval_model(
Expand Down

0 comments on commit 9a859ce

Please sign in to comment.