diff --git a/README.md b/README.md
index 4695e2a4..0f27b559 100644
--- a/README.md
+++ b/README.md
@@ -164,9 +164,11 @@ optional arguments:
 | **DCP** | Disjunct class percentage | min_impurity_split [docs](https://blog.nelsonliu.me/2016/08/05/gsoc-week-10-scikit-learn-pr-6954-adding-pre-pruning-to-decisiontrees/) |
 | **TD** | Tree Depth with and without prunning | -- |
 | **TDWithPrunning** | Tree Depth with prunning | min_impurity_split |
+| **AutoEncoder** | AutoEncoder | [docs](docs/AutoEncoder.md) |
 | **CODB** | CODB | Below |
+
 #### CODB
 * path to CODB jar file jar_path, must be defined
 * k nearest neighbors (default = 7) -k "\"
diff --git a/docs/AutoEncoder.md b/docs/AutoEncoder.md
new file mode 100644
index 00000000..ad89643f
--- /dev/null
+++ b/docs/AutoEncoder.md
@@ -0,0 +1,39 @@
+## AutoEncoder in Keras
+****
+Trains defined Neural Network on the whole dataset and evaluates output for each instance. The output is mean squared error [MSE](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html) between the instance and its prediction.
+
+#### Configuration:
+```
+"detectors": {
+        "AutoEncoder" : {
+            "batch_size": BATCH_SIZE,
+            "epochs": EPOCHS,
+            "optimizer": OPTIMIZER,
+            "layers": NN_LAYOUT,
+            "activation": ACTIVATION,
+            "loss": LOSS
+        }
+    },
+```
+
+, where
+* 1 <= BATCH_SIZE(int) <= SIZE_OF_DATASET, default = 40
+* 1 <= EPOCHS(int), default = 200
+* OPTIMIZER(str) from [keras.optimizers](https://keras.io/optimizers/), default: "keras.optimizers.adam()"
+* LAYERS(str) -- list of number of neurons in the given hidden layers (does not include input and output), default = "[20,5,20]" for 3 layers
+* ACTIVATION(str) -- name of activation function [activations](https://keras.io/activations/), default = "tanh"
+* LOSS(str) -- name of a loss function for training neural net from [losses](https://keras.io/losses/), default = "mean_squared_error"
+
+e.g.
+```
+"detectors": {
+        "AutoEncoder" : {
+            "batch_size": 40,
+            "epochs": 200,
+            "optimizer": "keras.optimizers.adam()",
+            "layers": "[20,5,20]",
+            "activation": "tanh",
+            "loss": "mean_squared_error"
+        }
+    },
+```
\ No newline at end of file
diff --git a/pv056_2019/outlier_detection/AutoEncoder.py b/pv056_2019/outlier_detection/AutoEncoder.py
new file mode 100644
index 00000000..5ff5cb15
--- /dev/null
+++ b/pv056_2019/outlier_detection/AutoEncoder.py
@@ -0,0 +1,62 @@
+import keras
+from keras import layers
+from sklearn.metrics import mean_squared_error as mse
+from sklearn import preprocessing
+import pandas as pd
+
+
+class AutoEncoder:
+    def __init__(self, df, params):
+        self.params = params
+
+        self.params.setdefault("batch_size", 40)
+        self.params.setdefault("epochs", 200)
+        self.params.setdefault("optimizer", "keras.optimizers.adam()")
+        self.params.setdefault("loss", "mean_squared_error")
+        self.params.setdefault("layers", "[20,5,20]")
+        self.params.setdefault("activation", "tanh")
+
+        self.params["layers"] = eval(self.params["layers"])
+
+        x = df.values
+        min_max_scaler = preprocessing.MinMaxScaler()
+        x_scaled = min_max_scaler.fit_transform(x)
+        self.df = pd.DataFrame(x_scaled)
+
+        self.auto_enc = keras.models.Sequential()
+
+        self.auto_enc.add(
+            layers.Dense(
+                input_dim=df.shape[1],
+                units=self.params["layers"][0],
+                activation=self.params["activation"],
+            )
+        )
+
+        if len(self.params["layers"]) > 1:
+            for l in self.params["layers"][1:]:
+                self.auto_enc.add(
+                    layers.Dense(units=l, activation=self.params["activation"])
+                )
+        self.auto_enc.add(
+            layers.Dense(units=df.shape[1], activation=self.params["activation"])
+        )
+
+        self.auto_enc.compile(
+            loss=self.params["loss"], optimizer=eval(self.params["optimizer"])
+        )
+
+        self.auto_enc.summary()
+
+    def compute_values(self, classes):
+        self.auto_enc.fit(
+            self.df.values,
+            self.df.values,
+            batch_size=int(self.params["batch_size"]),
+            epochs=int(self.params["epochs"]),
+        )
+        return mse(
+            self.df.transpose(),
+            self.auto_enc.predict(self.df.values).transpose(),
+            multioutput="raw_values",
+        )
diff --git a/pv056_2019/outlier_detection/__init__.py b/pv056_2019/outlier_detection/__init__.py
index b17ba831..f2b763c0 100644
--- a/pv056_2019/outlier_detection/__init__.py
+++ b/pv056_2019/outlier_detection/__init__.py
@@ -17,6 +17,7 @@
 from pv056_2019.outlier_detection.DCP import DCPMetric
 from pv056_2019.outlier_detection.DS import DSMetric
 from pv056_2019.outlier_detection.KDN import KDNMetric
+from pv056_2019.outlier_detection.AutoEncoder import AutoEncoder
 from pv056_2019.outlier_detection.CODB import CODBMetric
@@ -405,6 +406,17 @@ def compute_scores(self, dataframe: pd.DataFrame, classes: np.array):
         return self
 
 
+@detector
+class AE(AbstractDetector):
+    name = "AutoEncoder"
+    data_type = "REAL"
+
+    def compute_scores(self, dataframe: pd.DataFrame, classes: np.array):
+        bin_dataframe = dataframe._binarize_categorical_values()
+        self.clf = AutoEncoder(bin_dataframe, self.settings)
+        self.values = self.clf.compute_values(classes=classes)
+        return self
+
+
 @detector
 class CODB(AbstractDetector):
     name = "CODB"
diff --git a/requirements.txt b/requirements.txt
index 0fa0d0ea..85ac4927 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,14 +4,32 @@
 #
 # pip-compile
 #
-dataclasses==0.6          # via pydantic
+absl-py==0.7.1            # via tensorboard, tensorflow, tensorflow-estimator
+astor==0.7.1              # via tensorflow
+gast==0.2.2               # via tensorflow
+grpcio==1.19.0            # via tensorboard, tensorflow
+h5py==2.9.0               # via keras, keras-applications
+keras-applications==1.0.7  # via keras, tensorflow
+keras-preprocessing==1.0.9  # via keras, tensorflow
+keras==2.2.4
 liac-arff==2.4.0
+markdown==3.1             # via tensorboard
+mock==2.0.0               # via tensorflow-estimator
 numpy==1.16.2
 pandas==0.24.1
+pbr==5.1.3                # via mock
+protobuf==3.7.1           # via tensorboard, tensorflow
 pydantic==0.20.1
 python-dateutil==2.8.0    # via pandas
 pytz==2018.9              # via pandas
+pyyaml==5.1               # via keras
 scikit-learn==0.20.3      # via sklearn
-scipy==1.2.1              # via scikit-learn
-six==1.12.0               # via python-dateutil
+scipy==1.2.1              # via keras, scikit-learn
+six==1.12.0               # via absl-py, grpcio, h5py, keras, keras-preprocessing, mock, protobuf, python-dateutil, tensorboard, tensorflow, tensorflow-estimator
 sklearn==0.0
+tensorboard==1.13.1       # via tensorflow
+tensorflow-estimator==1.13.0  # via tensorflow
+tensorflow==1.13.1
+termcolor==1.1.0          # via tensorflow
+werkzeug==0.15.2          # via tensorboard
+wheel==0.33.1             # via tensorboard, tensorflow
diff --git a/setup.py b/setup.py
index ba09f952..4d090c3e 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,15 @@
     license="MIT",
     packages=find_packages(include=["pv056_2019", "pv056_2019.*"]),
     include_package_data=True,
-    install_requires=["pandas", "numpy", "liac-arff", "sklearn", "pydantic"],
+    install_requires=[
+        "pandas",
+        "numpy",
+        "liac-arff",
+        "sklearn",
+        "pydantic",
+        "keras",
+        "tensorflow",
+    ],
     entry_points={
         "console_scripts": [
             "pv056-split-data=pv056_2019.data_splitter:main",