Spaces:

NCTCMumbai
/

NCTC

Running

File size: 7,481 Bytes

0b8359d

# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Library with adversarial attacks.

This library is designed to be self-contained and to have no dependencies
other than TensorFlow. It only contains PGD / Iterative FGSM attacks; see
https://arxiv.org/abs/1706.06083 and https://arxiv.org/abs/1607.02533
for details.

For a wider set of adversarial attacks, refer to the CleverHans library:
https://github.com/tensorflow/cleverhans
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf


def generate_pgd_common(x,
                        bounds,
                        model_fn,
                        attack_params,
                        one_hot_labels,
                        perturbation_multiplier):
  """Common code for generating PGD adversarial examples.

  The attack starts from a uniformly random point inside the epsilon-ball
  around `x` (clipped to the image bounds) and then takes NITER
  signed-gradient steps on the softmax cross-entropy loss, clipping the
  images back into the allowed region after every step.

  Args:
    x: original examples.
    bounds: tuple with bounds of image values, bounds[0] < bounds[1].
    model_fn: model function with signature model_fn(images); its output is
      used as logits of the loss.
    attack_params: parameters of the attack, a string in the format
      'EPS_STEP_NITER' where all three fields are non-negative integers.
      EPS and STEP are given on the 0..255 scale and are rescaled to
      `bounds` internally.
    one_hot_labels: one hot label vector to use in the loss.
    perturbation_multiplier: multiplier of adversarial perturbation,
      either +1.0 or -1.0. With +1.0 every step increases the loss on
      `one_hot_labels`, with -1.0 every step decreases it.

  Returns:
    Tensor with adversarial examples.

  Raises:
    ValueError: if attack parameters are invalid, i.e. `attack_params` does
      not consist of exactly three non-negative integers separated by '_'.
  """
  # parse attack_params
  # Format of attack_params: 'EPS_STEP_NITER'
  # where EPS - epsilon, STEP - step size, NITER - number of iterations
  invalid_params_msg = 'Invalid parameters of PGD attack: %s' % attack_params
  params_list = attack_params.split('_')
  if len(params_list) != 3:
    raise ValueError(invalid_params_msg)
  try:
    epsilon, step_size, niter = [int(param) for param in params_list]
  except ValueError:
    # Re-raise with the same message as every other parse failure instead of
    # leaking the bare "invalid literal for int()" text. No `raise ... from`
    # here because the file is kept Python 2 compatible.
    raise ValueError(invalid_params_msg)
  # A negative epsilon would make clip_min > clip_max, a negative step size
  # silently flips the direction of the attack and a negative iteration
  # count silently disables it, so reject all of them up front.
  if epsilon < 0 or step_size < 0 or niter < 0:
    raise ValueError(invalid_params_msg)

  # rescale epsilon and step size to image bounds
  epsilon = float(epsilon) / 255.0 * (bounds[1] - bounds[0])
  step_size = float(step_size) / 255.0 * (bounds[1] - bounds[0])

  # clipping boundaries: intersection of the epsilon-ball around x with the
  # valid range of image values
  clip_min = tf.maximum(x - epsilon, bounds[0])
  clip_max = tf.minimum(x + epsilon, bounds[1])

  # compute starting point: random point of the epsilon-ball, clipped to the
  # valid range
  start_x = x + tf.random_uniform(tf.shape(x), -epsilon, epsilon)
  start_x = tf.clip_by_value(start_x, clip_min, clip_max)

  # main iteration of PGD
  loop_vars = [0, start_x]

  def loop_cond(index, _):
    return index < niter

  def loop_body(index, adv_images):
    logits = model_fn(adv_images)
    loss = tf.reduce_sum(
        tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=one_hot_labels,
            logits=logits))
    # Only the sign of the gradient is used, so every step changes each
    # image value by exactly step_size (before clipping).
    perturbation = step_size * tf.sign(tf.gradients(loss, adv_images)[0])
    new_adv_images = adv_images + perturbation_multiplier * perturbation
    new_adv_images = tf.clip_by_value(new_adv_images, clip_min, clip_max)
    return index + 1, new_adv_images

  with tf.control_dependencies([start_x]):
    # back_prop=False: the loop itself is not differentiated through;
    # gradients are taken explicitly inside loop_body.
    _, result = tf.while_loop(
        loop_cond,
        loop_body,
        loop_vars,
        back_prop=False,
        parallel_iterations=1)
    return result


def generate_pgd_ll(x, bounds, model_fn, attack_params):
  # pylint: disable=g-doc-args
  """Generates targeted PGD adversarial examples with least likely target.

  The target class of every example is the class with the smallest logit
  of the model on the original example `x`.

  See generate_pgd_common for description of arguments.

  Returns:
    Tensor with adversarial examples.
  """
  # pylint: enable=g-doc-args

  # compute one hot least likely class
  # The model is evaluated on x only once here; the same logits provide both
  # the number of classes and the least likely class.
  logits = model_fn(x)
  num_classes = tf.shape(logits)[1]
  one_hot_labels = tf.one_hot(tf.argmin(logits, axis=1), num_classes)

  # Targeted attack: multiplier -1.0 makes every step decrease the loss on
  # the target labels.
  return generate_pgd_common(x, bounds, model_fn, attack_params,
                             one_hot_labels=one_hot_labels,
                             perturbation_multiplier=-1.0)


def generate_pgd_rand(x, bounds, model_fn, attack_params):
  # pylint: disable=g-doc-args
  """Generates targeted PGD adversarial examples with random target class.

  For every example a target class is drawn uniformly at random from all
  classes of the model output.

  See generate_pgd_common for description of arguments.

  Returns:
    Tensor with adversarial examples.
  """
  # pylint: enable=g-doc-args

  # The model output on x is only needed for its shape:
  # [number of examples, number of classes].
  model_output = model_fn(x)
  n_examples = tf.shape(model_output)[0]
  n_classes = tf.shape(model_output)[1]

  # Draw one target class per example; maxval is exclusive for integers.
  target_classes = tf.random_uniform(
      shape=[n_examples], minval=0, maxval=n_classes, dtype=tf.int32)
  target_one_hot = tf.one_hot(target_classes, n_classes)

  # Targeted attack: multiplier -1.0 makes every step decrease the loss on
  # the target labels.
  return generate_pgd_common(
      x, bounds, model_fn, attack_params,
      one_hot_labels=target_one_hot,
      perturbation_multiplier=-1.0)


def generate_pgd(x, bounds, model_fn, attack_params):
  # pylint: disable=g-doc-args
  """Generates non-targeted PGD adversarial examples.

  The class predicted by the model on the original example `x` (largest
  logit) is used as the label of the loss.

  See generate_pgd_common for description of arguments.

  Returns:
    Tensor with adversarial examples.
  """
  # pylint: enable=g-doc-args

  # compute one hot predicted class
  # The model is evaluated on x only once here; the same logits provide both
  # the number of classes and the predicted class.
  logits = model_fn(x)
  num_classes = tf.shape(logits)[1]
  one_hot_labels = tf.one_hot(tf.argmax(logits, axis=1), num_classes)

  # Non-targeted attack: multiplier +1.0 makes every step increase the loss
  # on the predicted labels.
  return generate_pgd_common(x, bounds, model_fn, attack_params,
                             one_hot_labels=one_hot_labels,
                             perturbation_multiplier=1.0)


def generate_adversarial_examples(x, bounds, model_fn, attack_description):
  """Builds adversarial examples for the attack named by a description string.

  Supported values of `attack_description`:
  - "clean" - no attack, the original images are returned unchanged.
  - "pgd_EPS_STEP_NITER" - non-targeted PGD attack.
  - "pgdll_EPS_STEP_NITER" - targeted PGD attack with least likely target
    class.
  - "pgdrnd_EPS_STEP_NITER" - targeted PGD attack with random target class.

  Meaning of the attack parameters:
  - EPS - maximum size of adversarial perturbation, between 0 and 255.
  - STEP - step size of one iteration of PGD, between 0 and 255.
  - NITER - number of iterations.

  Args:
    x: original examples.
    bounds: tuple with bounds of image values, bounds[0] < bounds[1].
    model_fn: model function with signature model_fn(images).
    attack_description: string which describes an attack, one of the values
      listed above.

  Returns:
    Tensor with adversarial examples.

  Raises:
    ValueError: if attack description is invalid.
  """
  if attack_description == 'clean':
    return x

  # Everything before the first underscore names the attack; the remainder
  # is passed to the attack unchanged as its parameter string.
  attack_name, separator, attack_params = attack_description.partition('_')
  if not separator:
    raise ValueError('Invalid value of attack description %s'
                     % attack_description)

  if attack_name == 'pgd':
    return generate_pgd(x, bounds, model_fn, attack_params)
  if attack_name == 'pgdll':
    return generate_pgd_ll(x, bounds, model_fn, attack_params)
  if attack_name == 'pgdrnd':
    return generate_pgd_rand(x, bounds, model_fn, attack_params)

  # Unknown attack name.
  raise ValueError('Invalid value of attack description %s'
                   % attack_description)