# logreg_train.py
# **************************************************************************** #
# #
# ::: :::::::: #
# logreg_train.py :+: :+: :+: #
# +:+ +:+ +:+ #
# By: obelouch <obelouch@student.42.fr> +#+ +:+ +#+ #
# +#+#+#+#+#+ +#+ #
# Created: 2020/12/18 18:36:47 by obelouch #+# #+# #
# Updated: 2020/12/26 16:50:02 by aelouarg ### ########.fr #
# #
# **************************************************************************** #
########### Choice of features #################################################
#                                                                              #
#   Some features are homogeneous or coherent with other ones, so their        #
#   existence is not necessary for training the model and can give us          #
#   a complex hypothesis that will cause 'Overfitting'.                        #
#   Our choice was to remove:                                                  #
#     - Arithmancy               : Homogeneous                                 #
#     - Astronomy                : Similar to 'Defense Against the Dark Arts'  #
#     - Transfiguration          : Semi similar to 'History of Magic'          #
#     - Potions                  : Semi homogeneous                            #
#     - Care of Magical Creatures: Semi homogeneous                            #
#                                                                              #
################################################################################
from src.standarize import standarize_X
from src.precision import print_precision
from mylib.csvTools import get_df_from_csv
from mylib.consts import bcolors, errors
from mylib.libft import get_flags_and_args
from src.algorithms import get_theta
from os import path
import numpy as np
import sys
# Global Variables:
# Name of the training algorithm in use. Defaults to 'BGD'; set_algorithm()
# overwrites it with the flag given on the command line ('BGD' or 'SGD'),
# print_loading() reads it to pick the banner text and logreg_train()
# forwards it to get_theta().
algo = 'BGD'
def print_loading():
    '''
    Announce on stdout which gradient descent variant is about to run.

    The label is chosen from the module-level ``algo`` setting: 'SGD'
    selects the stochastic wording, any other value the batch wording.
    '''
    if algo == 'SGD':
        label = 'Stochastic Gradient Descent'
    else:
        label = 'Batch Gradient Descent'
    prefix = f'\nTraining using {bcolors.BOLD}'
    suffix = f'{bcolors.ENDC} Algorithm ....\n'
    # One print call emits the same bytes as the three chained prints did.
    print(prefix + label + suffix)
def exit_usage(error):
    '''
    Print the error message and the usage text, then exit with status 1.

    Args:
        error: one of the ``errors`` codes. A code that is not listed in
            the table below is reported as "Can't read the file!".

    Raises:
        SystemExit: always, with exit status 1.
    '''
    # (code, message) pairs, compared with ``==`` exactly like the former
    # if/elif ladder; the first match wins.
    known_errors = (
        (errors.ARG_NBR, 'Wrong number of arguments!'),
        (errors.NO_ARG, 'No file is provided!'),
        (errors.NOT_FILE, 'File not found!'),
        (errors.NOT_CSV, 'Wrong file extension, accept only CSV!'),
        (errors.FLAG_NBR, 'Too much options used!'),
        (errors.WRONG_FLAG, 'Wrong option used!'),
    )
    reason = next(
        (message for code, message in known_errors if error == code),
        'Can\'t read the file!',
    )
    print(f'\n{bcolors.FAIL}Error{bcolors.ENDC}: ', end='')
    print(reason)
    print(f'{bcolors.WARNING}Usage{bcolors.ENDC}: ', end='')
    print('python3 logreg_train.py [-BGD | -SGD] <_train dataset_>')
    print(' -BGD: Batch Gradient Descent Algorithm')
    print(' -SGD: Stochastic Gradient Descent Algorithm')
    # sys.exit instead of the builtin exit(): the latter is injected by the
    # ``site`` module for interactive sessions and is missing under
    # ``python -S`` or in some embedded/frozen interpreters.
    sys.exit(1)
def get_filename(args):
    '''
    Validate the positional arguments and return the dataset path.

    Exactly one argument is expected; it must name an existing path and
    end with '.csv'. Every failed check is reported through exit_usage()
    with the matching ``errors`` code.

    Args:
        args: sequence of positional command-line arguments.

    Returns:
        The single argument, unchanged, once all checks have passed.
    '''
    arg_count = len(args)
    if arg_count > 1:
        exit_usage(errors.ARG_NBR)
    elif arg_count == 0:
        exit_usage(errors.NO_ARG)

    dataset = args[0]
    # Existence is tested before the extension, so a missing '*.txt' path
    # is reported as "not found" rather than "wrong extension".
    is_present = path.exists(dataset)
    if not is_present:
        exit_usage(errors.NOT_FILE)
    is_csv = dataset.endswith('.csv')
    if not is_csv:
        exit_usage(errors.NOT_CSV)
    return dataset
def set_algorithm(flags):
    '''
    Select the training algorithm from the command-line flags.

    With no flag the module-level ``algo`` keeps its current value. With
    one flag it must be 'BGD' or 'SGD' and becomes the new ``algo``. More
    than one flag, or an unknown flag, is reported through exit_usage().

    Args:
        flags: sequence of option names taken from the command line.
    '''
    global algo
    flag_count = len(flags)
    if flag_count > 1:
        exit_usage(errors.FLAG_NBR)
    if flag_count != 1:
        # Nothing requested: leave the current algorithm untouched.
        return
    requested = flags[0]
    if requested not in ('BGD', 'SGD'):
        exit_usage(errors.WRONG_FLAG)
    algo = requested
def check_data(houses):
    '''
    Abort the program if the house column contains an entry equal to 0.

    NOTE(review): 0 is presumably the placeholder get_df_from_csv() uses
    for a missing house; confirm against that helper.

    Args:
        houses: iterable of house labels (the 'Hogwarts House' column).

    Raises:
        SystemExit: with status 1 when at least one entry equals 0.
    '''
    if any(elt == 0 for elt in houses):
        print(f'{bcolors.FAIL}Error{bcolors.ENDC}: Wrong Data')
        # sys.exit rather than the site-provided exit(), which is meant
        # for interactive use and is not guaranteed to exist.
        sys.exit(1)
def logreg_train():
    '''
    Train the logistic regression model with the dataset_train.

    Pipeline: read flags and arguments, select the algorithm, validate
    the CSV path, load the selected columns, check the house column,
    build the feature matrix, compute Theta with the selected algorithm,
    write it to 'weights.csv' and print the precision.

    Invalid command-line input or invalid house data terminates the
    program through exit_usage() / check_data().
    '''
    # Get Arguments & Flags
    flags, args = get_flags_and_args()
    # Check & Set Algorithm:
    set_algorithm(flags)
    # Check & Get the CSV filename
    filename = get_filename(args)
    # NOTE(review): the index list presumably keeps 'Hogwarts House' plus
    # the retained course columns; confirm against get_df_from_csv().
    trainSet = get_df_from_csv(
        filename,
        [1, 8, 9, 10, 11, 12, 13, 17, 18]
    )
    # Test House Column:
    check_data(trainSet.loc[:, "Hogwarts House"])
    # Print Loading:
    print_loading()
    # The X (features) Matrix [m x 9]
    # (drop the first column and add an X0 column full of 1):
    X = np.concatenate(
        (
            np.ones((trainSet.shape[0], 1)),
            standarize_X(trainSet.iloc[:, 1:]),
        ),
        # concat in columns
        axis=1
    )
    # The Y (labels) Vector [m x 1]
    Y = trainSet['Hogwarts House']
    # Get Theta depending on the selected algorithm
    Theta = get_theta(X, Y, algo)
    # Write the weights to a file:
    Theta.to_csv(
        'weights.csv',
        index=False,
        sep=',',
    )
    # Print Precision:
    # The literal must stay on one line: a mis-decoded check mark had split
    # it in two and left the f-string unterminated.
    print(f'{bcolors.OKGREEN}Training DONE{bcolors.ENDC} ✅\n')
    print_precision(Theta, X, Y)
# Launch the Logistic Regression training only when this file is executed
# as a script, so that importing the module neither parses the command line
# nor starts a training run.
if __name__ == '__main__':
    logreg_train()