train_model.py
import json
import yaml
import joblib
import mlflow
import argparse
import numpy as np
import pandas as pd
from urllib.parse import urlparse
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    f1_score,
    recall_score,
    accuracy_score,
    precision_score,
    confusion_matrix,
    classification_report,
)


def read_params(config_path):
    """
    Read parameters from the params.yaml file.
    input: params.yaml location
    output: parameters as dictionary
    """
    with open(config_path) as yaml_file:
        config = yaml.safe_load(yaml_file)
    return config


def accuracymeasures(y_test, predictions, avg_method):
    """
    Compute accuracy, precision, recall and F1 score for the given predictions,
    print the classification report and confusion matrix, and return the scores.
    """
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average=avg_method)
    recall = recall_score(y_test, predictions, average=avg_method)
    f1score = f1_score(y_test, predictions, average=avg_method)
    target_names = ['0', '1']

    print("Classification report")
    print("---------------------", "\n")
    print(classification_report(y_test, predictions, target_names=target_names), "\n")
    print("Confusion Matrix")
    print("---------------------", "\n")
    print(confusion_matrix(y_test, predictions), "\n")
    print("Accuracy Measures")
    print("---------------------", "\n")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1score)

    return accuracy, precision, recall, f1score


def get_feat_and_target(df, target):
    """
    Get features and target variable separately from the given dataframe and target column.
    input: dataframe and target column
    output: two dataframes for x and y
    """
    x = df.drop(target, axis=1)
    y = df[[target]]
    return x, y


def train_and_evaluate(config_path):
    config = read_params(config_path)
    train_data_path = config["processed_data_config"]["train_data_csv"]
    test_data_path = config["processed_data_config"]["test_data_csv"]
    target = config["raw_data_config"]["target"]
    max_depth = config["random_forest"]["max_depth"]
    n_estimators = config["random_forest"]["n_estimators"]

    train = pd.read_csv(train_data_path, sep=",")
    test = pd.read_csv(test_data_path, sep=",")
    train_x, train_y = get_feat_and_target(train, target)
    test_x, test_y = get_feat_and_target(test, target)

    ################### MLFLOW ###############################
    mlflow_config = config["mlflow_config"]
    remote_server_uri = mlflow_config["remote_server_uri"]
    mlflow.set_tracking_uri(remote_server_uri)
    mlflow.set_experiment(mlflow_config["experiment_name"])

    with mlflow.start_run(run_name=mlflow_config["run_name"]) as mlops_run:
        model = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators)
        model.fit(train_x, train_y)
        y_pred = model.predict(test_x)
        accuracy, precision, recall, f1score = accuracymeasures(test_y, y_pred, 'weighted')

        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("n_estimators", n_estimators)

        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1score)

        tracking_url_type_store = urlparse(mlflow.get_artifact_uri()).scheme

        # Register the model only when the tracking store supports a model registry;
        # with a plain file store, just log the model artifact.
        if tracking_url_type_store != "file":
            mlflow.sklearn.log_model(
                model,
                "model",
                registered_model_name=mlflow_config["registered_model_name"])
        else:
            mlflow.sklearn.log_model(model, "model")
if __name__=="__main__":
args = argparse.ArgumentParser()
args.add_argument("--config", default="params.yaml")
parsed_args = args.parse_args()
train_and_evaluate(config_path=parsed_args.config)
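
# ---------------------------------------------------------------------------
# Usage (sketch): the script is driven entirely by the YAML config passed via
# --config. The keys below are the ones read in train_and_evaluate(); the
# paths and values shown are illustrative placeholders, not part of this repo.
#
#   python train_model.py --config params.yaml
#
#   # params.yaml (example values are assumptions)
#   # raw_data_config:
#   #   target: target
#   # processed_data_config:
#   #   train_data_csv: data/processed/train.csv
#   #   test_data_csv: data/processed/test.csv
#   # random_forest:
#   #   max_depth: 10
#   #   n_estimators: 100
#   # mlflow_config:
#   #   remote_server_uri: http://localhost:5000
#   #   experiment_name: model_iteration
#   #   run_name: random_forest
#   #   registered_model_name: random_forest_model
# ---------------------------------------------------------------------------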