diff --git a/Machine_Learning/Diabetes-Prediction/Diabetes Predictor - Deployment.py b/Machine_Learning/Diabetes-Prediction/Diabetes Predictor - Deployment.py
index 66681b7d3b..7c687f373b 100644
--- a/Machine_Learning/Diabetes-Prediction/Diabetes Predictor - Deployment.py
+++ b/Machine_Learning/Diabetes-Prediction/Diabetes Predictor - Deployment.py
@@ -2,6 +2,8 @@
 import numpy as np
 import pandas as pd
 import pickle
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.ensemble import RandomForestClassifier
 
 # Loading the dataset
 df = pd.read_csv('kaggle_diabetes.csv')
@@ -20,17 +22,41 @@ df_copy['Insulin'].fillna(df_copy['Insulin'].median(), inplace=True)
 df_copy['BMI'].fillna(df_copy['BMI'].median(), inplace=True)
 
 
-# Model Building
-from sklearn.model_selection import train_test_split
+# Splitting the data into features and target variable
+# NOTE(review): X and y are read from `df`, while the fills above were applied
+# to `df_copy`. If `df_copy` is a separate copy, the imputation never reaches
+# the model -- confirm which frame is intended.
 X = df.drop(columns='Outcome')
 y = df['Outcome']
+
+# Splitting into training and testing sets
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
 
-# Creating Random Forest Model
-from sklearn.ensemble import RandomForestClassifier
-classifier = RandomForestClassifier(n_estimators=20)
-classifier.fit(X_train, y_train)
+# Hyperparameter tuning using GridSearchCV
+param_grid = {
+    'n_estimators': [10, 50, 100],
+    'max_depth': [None, 10, 20],
+    'min_samples_split': [2, 5, 10],
+}
+
+# Fixed seed so the search (and the refit model) is reproducible between runs.
+grid_search = GridSearchCV(
+    estimator=RandomForestClassifier(random_state=0),
+    param_grid=param_grid,
+    cv=5,
+)
+grid_search.fit(X_train, y_train)
+
+# Best parameters found
+best_params = grid_search.best_params_
+print(f"Best hyperparameters: {best_params}")
+
+# GridSearchCV refits the best configuration on all of X_train (refit=True is
+# the default), so reuse that estimator instead of training a second forest.
+classifier = grid_search.best_estimator_
 
 # Creating a pickle file for the classifier
 filename = 'diabetes-prediction-rfc-model.pkl'
-pickle.dump(classifier, open(filename, 'wb'))
\ No newline at end of file
+# Context manager guarantees the file is flushed and closed.
+with open(filename, 'wb') as model_file:
+    pickle.dump(classifier, model_file)