Source code for cleverhans.attacks.virtual_adversarial_method

"""The VirtualAdversarialMethod attack

"""

import warnings

import tensorflow as tf

from cleverhans.attacks.attack import Attack
from cleverhans.model import Model, CallableModelWrapper
from cleverhans.model import wrapper_warning_logits
from cleverhans import utils_tf

# Module-level float32 TensorFlow dtype; `vatm` below refers to it when
# sampling its initial random perturbation.
tf_dtype = tf.as_dtype('float32')

class VirtualAdversarialMethod(Attack):
    """
    This attack was originally proposed by Miyato et al. (2016) and was used
    for virtual adversarial training.
    Paper link: https://arxiv.org/abs/1507.00677

    :param model: cleverhans.model.Model
    :param sess: optional tf.Session
    :param dtypestr: dtype of the data
    :param kwargs: passed through to super constructor
    """

    def __init__(self, model, sess=None, dtypestr='float32', **kwargs):
        """
        Note: the model parameter should be an instance of the
        cleverhans.model.Model abstraction provided by CleverHans.
        Any other object is wrapped in a CallableModelWrapper that treats
        its output as logits (after emitting the wrapper warning).
        """
        if not isinstance(model, Model):
            wrapper_warning_logits()
            model = CallableModelWrapper(model, 'logits')

        super(VirtualAdversarialMethod, self).__init__(model, sess, dtypestr,
                                                       **kwargs)

        self.feedable_kwargs = ('eps', 'xi', 'clip_min', 'clip_max')
        # `nb_iter` is the current name accepted by `parse_params`;
        # `num_iterations` is its deprecated alias. Both are declared so
        # that either spelling is recognised.
        # NOTE(review): assumes the base Attack class uses this list to
        # whitelist structural keyword arguments -- confirm against Attack.
        self.structural_kwargs = ['nb_iter', 'num_iterations']

    def generate(self, x, **kwargs):
        """
        Generate symbolic graph for adversarial examples and return.

        :param x: The model's symbolic inputs.
        :param kwargs: See `parse_params`
        :return: the tensor returned by `vatm` for the parsed parameters
        """
        # Parse and save attack-specific parameters. The call is made
        # outside of the `assert` statement so that it still runs (and the
        # attributes read below are still set) when Python is started with
        # -O, which strips assert statements entirely.
        params_ok = self.parse_params(**kwargs)
        assert params_ok

        return vatm(
            self.model,
            x,
            self.model.get_logits(x),
            eps=self.eps,
            num_iterations=self.num_iterations,
            xi=self.xi,
            clip_min=self.clip_min,
            clip_max=self.clip_max)

    def parse_params(self,
                     eps=2.0,
                     nb_iter=None,
                     xi=1e-6,
                     clip_min=None,
                     clip_max=None,
                     num_iterations=None,
                     **kwargs):
        """
        Take in a dictionary of parameters and applies attack-specific checks
        before saving them as attributes.

        Attack-specific parameters:

        :param eps: (optional float) the epsilon (input variation parameter)
        :param nb_iter: (optional) the number of iterations
          Defaults to 1 if not specified
        :param xi: (optional float) the finite difference parameter
        :param clip_min: (optional float) Minimum input component value
        :param clip_max: (optional float) Maximum input component value
        :param num_iterations: Deprecated alias for `nb_iter`
        :return: True once the parameters have been stored on the instance
        """
        # Save attack-specific parameters
        self.eps = eps
        if num_iterations is not None:
            warnings.warn("`num_iterations` is deprecated. Switch to `nb_iter`."
                          " The old name will be removed on or after 2019-04-26.")
            # Note: when we remove the deprecated alias, we can put the
            # default value of 1 for nb_iter back in the method signature
            assert nb_iter is None, \
                "Specify only one of `nb_iter` and `num_iterations`."
            nb_iter = num_iterations
        if nb_iter is None:
            nb_iter = 1
        # The iteration count is stored under the historical attribute name,
        # which is what `generate` reads.
        self.num_iterations = nb_iter
        self.xi = xi
        self.clip_min = clip_min
        self.clip_max = clip_max
        if kwargs:
            warnings.warn("kwargs is unused and will be removed on or after "
                          "2019-04-26.")
        return True
def vatm(model,
         x,
         logits,
         eps,
         num_iterations=1,
         xi=1e-6,
         clip_min=None,
         clip_max=None,
         scope=None):
    """
    Tensorflow implementation of the perturbation method used for virtual
    adversarial training: https://arxiv.org/abs/1507.00677

    :param model: the model which returns the network unnormalized logits
    :param x: the input placeholder
    :param logits: the model's unnormalized output tensor (the input to
                   the softmax layer)
    :param eps: the epsilon (input variation parameter)
    :param num_iterations: the number of iterations
    :param xi: the finite difference parameter
    :param clip_min: optional parameter that can be used to set a minimum
                     value for components of the example returned
    :param clip_max: optional parameter that can be used to set a maximum
                     value for components of the example returned
    :param scope: optional name passed to `tf.name_scope`; when None the
                  default name "virtual_adversarial_perturbation" is used
    :return: a tensor for the adversarial example

    Clipping is only applied when both `clip_min` and `clip_max` are given;
    if either one is None the result is returned unclipped.
    """
    with tf.name_scope(scope, "virtual_adversarial_perturbation"):
        # Sample the starting direction in the same dtype as the input so
        # that `x + d` below does not mix dtypes when `x` is not float32.
        # Anything that is not a tf.Tensor keeps the module default.
        if isinstance(x, tf.Tensor):
            noise_dtype = x.dtype.base_dtype
        else:
            noise_dtype = tf_dtype
        d = tf.random_normal(tf.shape(x), dtype=noise_dtype)

        for _ in range(num_iterations):
            # Rescale the current direction to the small step size `xi`
            # before evaluating the model at the perturbed input.
            d = xi * utils_tf.l2_batch_normalize(d)
            logits_d = model.get_logits(x + d)
            kl = utils_tf.kl_with_logits(logits, logits_d)
            # The gradient of `kl` with respect to `d` becomes the next
            # direction; stop_gradient keeps later gradient computations
            # from propagating back through this update.
            Hd = tf.gradients(kl, d)[0]
            d = tf.stop_gradient(Hd)

        # Scale the final direction to the requested magnitude `eps`.
        d = eps * utils_tf.l2_batch_normalize(d)
        adv_x = x + d
        if (clip_min is not None) and (clip_max is not None):
            adv_x = tf.clip_by_value(adv_x, clip_min, clip_max)
        return adv_x