From 0417fade08a8334061c7dee06aa1b762ae423c16 Mon Sep 17 00:00:00 2001 From: Nicolas Papernot Date: Mon, 19 Sep 2016 08:40:31 +0200 Subject: [PATCH 1/4] completed the MNIST tutorial --- tutorials/mnist_tutorial.md | 111 +++++++++++++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/tutorials/mnist_tutorial.md b/tutorials/mnist_tutorial.md index 9d31a2940..7f80597c3 100644 --- a/tutorials/mnist_tutorial.md +++ b/tutorials/mnist_tutorial.md @@ -32,7 +32,116 @@ it is made up of multiple convolutional and ReLU layers. You can find the model definition in the [`utils_mnist` cleverhans module](https://github.com/openai/cleverhans/blob/master/cleverhans/utils_mnist.py). -TODO(insert code snippet here) +``` +# Define input TF placeholder +x = tf.placeholder(tf.float32, shape=(None, 1, 28, 28)) +y = tf.placeholder(tf.float32, shape=(None, FLAGS.nb_classes)) + +# Define TF model graph +model = model_mnist() +predictions = model(x) +print "Defined TensorFlow model graph." +``` ## Training the model with TensorFlow +The library includes a helper function that runs a +TensorFlow optimizer to train models and another +helper function to load the MNIST dataset. +To train our MNIST model, we run the following: + +``` +# Get MNIST test data +X_train, Y_train, X_test, Y_test = data_mnist() + +# Train an MNIST model +tf_model_train(sess, x, y, predictions, X_train, Y_train) +``` + +We can then evaluate the performance of this model +using `tf_model_eval` included in `cleverhans.utils_tf`: + +``` +# Evaluate the accuracy of the MNIST model on legitimate test examples +accuracy = tf_model_eval(sess, x, y, predictions, X_test, Y_test) +assert X_test.shape[0] == 10000, X_test.shape +print 'Test accuracy on legitimate test examples: ' + str(accuracy) +``` + +The accuracy returned should be above `97%`. 
+ +## Crafting adversarial examples + +This tutorial applies the Fast Gradient Sign method +introduced by [Goodfellow et al.](https://arxiv.org/abs/1412.6572). +We first need to create the necessary graph elements by +calling `cleverhans.attacks.fgsm` before using the helper +function `cleverhans.utils_tf.batch_eval` to apply it to +our test set. This gives the following: + +``` +# Craft adversarial examples using Fast Gradient Sign Method (FGSM) +adv_x = fgsm(x, predictions, eps=0.3) +X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test]) +assert X_test_adv.shape[0] == 10000, X_test_adv.shape + +# Evaluate the accuracy of the MNIST model on adversarial examples +accuracy = tf_model_eval(sess, x, y, predictions, X_test_adv, Y_test) +print'Test accuracy on adversarial examples: ' + str(accuracy) +``` + +The second part evaluates the accuracy of the model on +adversarial examples in a similar way to that described +previously for legitimate examples. It should be lower +than the previous accuracy you obtained. + + +## Improving robustness using adversarial training + +One defense strategy to mitigate adversarial examples is to use +adversarial training, i.e. train the model with both the +original data and adversarially modified data (with correct +labels). You can use the training function `utils_tf.tf_model_train` +with the optional argument `predictions_adv` set to the model's +predictions on examples crafted by `cleverhans.attacks.fgsm` +in order to perform adversarial training. + +In the following snippet, we first declare a new model (in a +way similar to the one described previously) and then we train +it with both legitimate and adversarial training points. 
+ +``` +# Redefine TF model graph +model_2 = model_mnist() +predictions_2 = model_2(x) +adv_x_2 = fgsm(x, predictions_2, eps=0.3) +predictions_2_adv = model_2(adv_x_2) + +# Perform adversarial training +tf_model_train(sess, x, y, predictions_2, X_train, Y_train, predictions_adv=predictions_2_adv) +``` + +We can then verify that (1) its accuracy on legitimate data is +still comparable to the first model, (2) its accuracy on newly +generated adversarial examples is higher. + +``` +# Evaluate the accuracy of the adversarially trained MNIST model on +# legitimate test examples +accuracy = tf_model_eval(sess, x, y, predictions_2, X_test, Y_test) +print 'Test accuracy on legitimate test examples: ' + str(accuracy) + +# Craft adversarial examples using Fast Gradient Sign Method (FGSM) on +# the new model, which was trained using adversarial training +X_test_adv_2, = batch_eval(sess, [x], [adv_x_2], [X_test]) +assert X_test_adv_2.shape[0] == 10000, X_test_adv_2.shape + +# Evaluate the accuracy of the adversarially trained MNIST model on +# adversarial examples +accuracy_adv = tf_model_eval(sess, x, y, predictions_2, X_test_adv_2, Y_test) +print'Test accuracy on adversarial examples: ' + str(accuracy_adv) +``` + +## Code + +The complete code for this tutorial is available [here](https://github.com/openai/cleverhans/blob/master/tutorials/mnist_tutorial.py). From d94c0b9cb3986371623f32f4ee7beab70970393b Mon Sep 17 00:00:00 2001 From: Nicolas Papernot Date: Mon, 19 Sep 2016 08:43:11 +0200 Subject: [PATCH 2/4] include note on active development in readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5d8c37b4c..7cc1d7995 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,8 @@ benchmark machine learning systems' vulnerability to [adversarial examples](http://karpathy.github.io/2015/03/30/breaking-convnets/) . +Note: this library is still in active development. 
+ ## Setting up `cleverhans` ### Dependencies @@ -50,7 +52,6 @@ Bug fixes can be initiated through Github pull requests. The following authors contributed to this library (by alphabetical order): * Ian Goodfellow (OpenAI) * Nicolas Papernot (Pennsylvania State University) -* Ryan Sheatsley (Pennsylvania State University) ## Copyright From 715599a0c5b0291d6e453897a7463db65bff098c Mon Sep 17 00:00:00 2001 From: Nicolas Papernot Date: Mon, 19 Sep 2016 08:45:30 +0200 Subject: [PATCH 3/4] Documented attacks.py --- cleverhans/attacks.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cleverhans/attacks.py b/cleverhans/attacks.py index 501d9e7d0..8d96e59a3 100644 --- a/cleverhans/attacks.py +++ b/cleverhans/attacks.py @@ -13,7 +13,9 @@ def fgsm(x, predictions, eps, back='tf'): """ - + A wrapper for the Fast Gradient Sign Method. + It calls the right function, depending on the + user's backend. :param sess: :param x: :param y: @@ -31,6 +33,14 @@ def fgsm(x, predictions, eps, back='tf'): raise NotImplementedError("Theano FGSM not implemented.") def fgsm_tf(x, predictions, eps): + """ + TensorFlow implementation of the Fast Gradient + Sign method. 
+ :param x: the input placeholder + :param predictions: the model's output tensor + :param eps: the epsilon (input variation parameter) + :return: a tensor for the adversarial example + """ # Define loss y = tf.to_float(tf.equal(predictions, tf.reduce_max(predictions, 1, keep_dims=True))) From 89cd4794b94560e6e0cc2f6c5f615cf9be6f725e Mon Sep 17 00:00:00 2001 From: Nicolas Papernot Date: Mon, 19 Sep 2016 08:49:16 +0200 Subject: [PATCH 4/4] Documentes utils_tf --- cleverhans/utils_tf.py | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/cleverhans/utils_tf.py b/cleverhans/utils_tf.py index 8ec86c833..c977180bd 100644 --- a/cleverhans/utils_tf.py +++ b/cleverhans/utils_tf.py @@ -33,14 +33,16 @@ def tf_model_train(sess, x, y, predictions, X_train, Y_train, save=False, predictions_adv=None): """ Train a TF graph - :param sess: - :param x: - :param y: - :param model: - :param X_train: - :param Y_train: - :param save: - :return: + :param sess: TF session to use when training the graph + :param x: input placeholder + :param y: output placeholder (for labels) + :param predictions: model output predictions + :param X_train: numpy array with training inputs + :param Y_train: numpy array with training outputs + :param save: Boolean controlling the save operation + :param predictions_adv: if set with the adversarial example tensor, + will run adversarial training + :return: True if model trained """ print "Starting model training using TensorFlow." 
@@ -93,14 +95,14 @@ def tf_model_train(sess, x, y, predictions, X_train, Y_train, save=False, def tf_model_eval(sess, x, y, model, X_test, Y_test): """ - - :param sess: - :param x: - :param y: - :param model: - :param X_test: - :param Y_test: - :return: + Compute the accuracy of a TF model on some data + :param sess: TF session to use when evaluating the graph + :param x: input placeholder + :param y: output placeholder (for labels) + :param model: model output predictions + :param X_test: numpy array with test inputs + :param Y_test: numpy array with test outputs + :return: a float with the accuracy value """ # Define sympbolic for accuracy acc_value = keras.metrics.categorical_accuracy(y, model) @@ -145,6 +147,9 @@ def tf_model_load(sess): return True def batch_eval(sess, tf_inputs, tf_outputs, numpy_inputs): + """ + A helper function that computes a tensor on numpy inputs by batches. + """ n = len(numpy_inputs) assert n > 0 assert n == len(tf_inputs)