Commit 3eebbc1
Parent(s): 44645f1

Adding fawkes updated files

Files changed:
- app.py                    +13 -36
- fawkes/__init__.py        +19 -0
- fawkes/align_face.py      +80 -0
- fawkes/differentiator.py  +300 -0
- fawkes/protection.py      +197 -0
- fawkes/utils.py           +731 -0
app.py CHANGED
@@ -1,38 +1,22 @@
 import gradio as gr
-import glob
-import os
-from PIL import Image
-import numpy as np
-from fawkes.protection import Fawkes # Make sure the import path is correct
 
+from fawkes.protection import Fawkes
 
-
-
-
+
+def run_protection_interface(uploaded_image, mode='low', sd=1e6, format='png', separate_target=False, no_align=False,
+                             debug=False):
     """
    Gradio compatible function for running protection.
    """
     if uploaded_image is None:
         return None, "No image uploaded."
 
-    # Save the uploaded image to a temporary directory
-    temp_dir = "temp_imgs"
-    os.makedirs(temp_dir, exist_ok=True)
-    img_path = os.path.join(temp_dir, "uploaded_image.png")
-    uploaded_image.save(img_path)
-
     # Run the protection process
-    protector = Fawkes(
-
-
-        max_step=max_step, batch_size=batch_size,
-        format=format, separate_target=separate_target,
-        debug=debug, no_align=no_align)
+    protector = Fawkes(gpu="0", batch_size=1, mode=mode)
+    processed_image = protector.run_protection(uploaded_image, sd=sd, batch_size=1, format=format,
+                                               separate_target=separate_target, debug=debug, no_align=no_align)
 
-
-    processed_img_path = img_path.replace(".png", "_cloaked.png")
-    if os.path.exists(processed_img_path):
-        processed_image = Image.open(processed_img_path)
+    if processed_image is not None:
         return processed_image, "Protection process completed."
     else:
         return None, "Protection process failed or no cloaked image generated."
@@ -46,18 +30,12 @@ with gr.Blocks() as demo:
 
     with gr.Column():
         gr.Markdown("### Configuration Options")
-
-
-        feature_extractor = gr.Textbox(label="Feature Extractor", value="arcface_extractor_0")
-        th = gr.Slider(label="Threshold", minimum=0.001, maximum=0.05, value=0.01)
-        max_step = gr.Slider(label="Max Steps", minimum=500, maximum=2000, value=1000)
-        sd = gr.Slider(label="Penalty Number (SD)", minimum=1e5, maximum=1e7, value=1e6)
-        lr = gr.Slider(label="Learning Rate", minimum=1, maximum=25, value=2)
-        batch_size = gr.Slider(label="Batch Size", minimum=1, maximum=10, value=1)
-        format = gr.Radio(label="Output Format", choices=['png', 'jpg', 'jpeg'])
+        mode = gr.Radio(label="Mode", choices=['low', 'mid', 'high'], value='low')
+        format = gr.Radio(label="Output Format", choices=['png', 'jpg', 'jpeg'], value='png')
         separate_target = gr.Checkbox(label="Separate Target")
         no_align = gr.Checkbox(label="No Align")
-
+        with gr.Accordion(label='Advanced Config', open=False):
+            sd = gr.Slider(label="Penalty Number (SD)", minimum=1e5, maximum=1e7, value=1e6)
 
     run_button = gr.Button("Run Protection")
     output_image = gr.Image(label="Processed Image")
@@ -65,8 +43,7 @@
 
     run_button.click(
         fn=run_protection_interface,
-        inputs=[uploaded_image,
-                separate_target, no_align, debug],
+        inputs=[uploaded_image, mode, sd, format, separate_target, no_align],
         outputs=[output_image, output_text]
     )
 
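A detail worth noting in the reworked click wiring: Gradio passes component values to `fn` positionally, so the order of `inputs` must line up with the parameter order of `run_protection_interface`; since `debug` is no longer in the list, it always takes its default of `False`. A minimal sketch of the same pattern, assuming only that `gradio` is installed (the echo handler body is a stand-in for illustration, not the commit's function):

    import gradio as gr

    # Stand-in handler: parameter order must match the inputs list below.
    def handler(img, mode='low', sd=1e6, fmt='png', sep=False, no_align=False, debug=False):
        return img, "mode={} debug={}".format(mode, debug)  # debug stays False

    with gr.Blocks() as demo:
        img = gr.Image(type="pil")
        mode = gr.Radio(choices=['low', 'mid', 'high'], value='low')
        sd = gr.Slider(minimum=1e5, maximum=1e7, value=1e6)
        fmt = gr.Radio(choices=['png', 'jpg', 'jpeg'], value='png')
        sep = gr.Checkbox(label="Separate Target")
        no_align = gr.Checkbox(label="No Align")
        out_img = gr.Image()
        out_txt = gr.Textbox()
        btn = gr.Button("Run Protection")
        # Values arrive at handler in exactly this order.
        btn.click(fn=handler, inputs=[img, mode, sd, fmt, sep, no_align],
                  outputs=[out_img, out_txt])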
fawkes/__init__.py ADDED
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+# @Date    : 2020-07-01
+# @Author  : Shawn Shan (shansixiong@cs.uchicago.edu)
+# @Link    : https://www.shawnshan.com/
+
+
+__version__ = '1.0.2'
+
+from .differentiator import FawkesMaskGeneration
+from .protection import main, Fawkes
+from .utils import load_extractor, init_gpu, select_target_label, dump_image, reverse_process_cloaked, Faces, get_file
+
+__all__ = (
+    '__version__',
+    'FawkesMaskGeneration', 'load_extractor',
+    'init_gpu',
+    'select_target_label', 'dump_image', 'reverse_process_cloaked',
+    'Faces', 'get_file', 'main', 'Fawkes'
+)
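With these re-exports in place, downstream code can import the public API from the package root instead of the individual submodules. A small sketch, assuming the fawkes package from this commit is installed (importing it pulls in TensorFlow, so this is not free):

    # Both styles resolve to the same objects after this commit.
    import fawkes
    from fawkes import Fawkes, load_extractor, init_gpu

    print(fawkes.__version__)  # '1.0.2'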
fawkes/align_face.py ADDED
@@ -0,0 +1,80 @@
+import numpy as np
+from mtcnn import MTCNN
+
+
+def to_rgb(img):
+    w, h = img.shape
+    ret = np.empty((w, h, 3), dtype=np.uint8)
+    ret[:, :, 0] = ret[:, :, 1] = ret[:, :, 2] = img
+    return ret
+
+
+def aligner():
+    return MTCNN(min_face_size=30)
+
+
+def align(orig_img, aligner):
+    """ run MTCNN face detector """
+
+    if orig_img.ndim < 2:
+        return None
+    if orig_img.ndim == 2:
+        orig_img = to_rgb(orig_img)
+    orig_img = orig_img[:, :, 0:3]
+
+    detect_results = aligner.detect_faces(orig_img)
+    cropped_arr = []
+    bounding_boxes_arr = []
+    for dic in detect_results:
+        if dic['confidence'] < 0.9:
+            continue
+        x, y, width, height = dic['box']
+
+        if width < 30 or height < 30:
+            continue
+        bb = [y, x, y + height, x + width]
+        cropped = orig_img[bb[0]:bb[2], bb[1]:bb[3], :]
+        cropped_arr.append(np.copy(cropped))
+        bounding_boxes_arr.append(bb)
+
+    return cropped_arr, bounding_boxes_arr
+
+    # if nrof_faces > 0:
+    #     det = bounding_boxes[0]['box']
+    #     det_arr = []
+    #     img_size = np.asarray(orig_img.shape)[0:2]
+    #     if nrof_faces > 1:
+    #         margin = margin / 1.5
+    #         if detect_multiple_faces:
+    #             for i in range(nrof_faces):
+    #                 det_arr.append(np.squeeze(bounding_boxes[i]['box']))
+    #         else:
+    #             bounding_box_size = (det[1] + det[3])
+    #             img_center = img_size / 2
+    #             offsets = np.vstack([(det[0] + det[2]) / 2 - img_center[1],
+    #                                  (det[1] + det[3]) / 2 - img_center[0]])
+    #             offset_dist_squared = np.sum(np.power(offsets, 2.0), 0)
+    #             index = np.argmax(bounding_box_size - offset_dist_squared * 2.0)  # some extra weight on the centering
+    #             det_arr.append(det[index, :])
+    #     else:
+    #         det_arr.append(np.squeeze(det))
+    #
+    #     cropped_arr = []
+    #     bounding_boxes_arr = []
+    #     for i, det in enumerate(det_arr):
+    #         det = np.squeeze(det)
+    #         bb = np.zeros(4, dtype=np.int32)
+    #         # add in margin
+    #         marg1 = int((det[2] - det[0]) * margin)
+    #         marg2 = int((det[3] - det[1]) * margin)
+    #
+    #         bb[0] = max(det[0] - marg1 / 2, 0)
+    #         bb[1] = max(det[1] - marg2 / 2, 0)
+    #         bb[2] = min(det[0] + det[2] + marg1 / 2, img_size[0])
+    #         bb[3] = min(det[1] + det[3] + marg2 / 2, img_size[1])
+    #         cropped = orig_img[bb[0]:bb[2], bb[1]: bb[3], :]
+    #         cropped_arr.append(cropped)
+    #         bounding_boxes_arr.append([bb[0], bb[1], bb[2], bb[3]])
+    #     return cropped_arr, bounding_boxes_arr
+    # else:
+    #     return None
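`align` returns two parallel lists: the cropped face pixels and their `[y1, x1, y2, x2]` bounding boxes in the original image, skipping detections below 0.9 confidence or smaller than 30 px on a side. A quick sketch of driving it directly, assuming `mtcnn` is installed and `photo.jpg` is a hypothetical path to a face photo:

    import numpy as np
    from PIL import Image
    from fawkes.align_face import aligner, align

    detector = aligner()  # MTCNN with min_face_size=30
    img = np.array(Image.open("photo.jpg").convert("RGB"))
    crops, boxes = align(img, detector)
    # boxes are [y1, x1, y2, x2] in original-image coordinates
    for crop, (y1, x1, y2, x2) in zip(crops, boxes):
        print("face at ({}, {}) size {}x{}".format(x1, y1, x2 - x1, y2 - y1))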
fawkes/differentiator.py ADDED
@@ -0,0 +1,300 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Date    : 2020-10-21
+# @Author  : Emily Wenger (ewenger@uchicago.edu)
+
+import datetime
+import time
+
+import numpy as np
+import tensorflow as tf
+from fawkes.utils import preprocess, reverse_preprocess
+from keras.utils import Progbar
+
+
+class FawkesMaskGeneration:
+    # if the attack is trying to mimic a target image or a neuron vector
+    MIMIC_IMG = True
+    # number of iterations to perform gradient descent
+    MAX_ITERATIONS = 10000
+    # larger values converge faster to less accurate results
+    LEARNING_RATE = 1e-2
+    # the initial constant c to pick as a first guess
+    INITIAL_CONST = 1
+    # pixel intensity range
+    INTENSITY_RANGE = 'imagenet'
+    # threshold for distance
+    L_THRESHOLD = 0.03
+    # whether keep the final result or the best result
+    KEEP_FINAL = False
+    # max_val of image
+    MAX_VAL = 255
+    MAXIMIZE = False
+    IMAGE_SHAPE = (112, 112, 3)
+    RATIO = 1.0
+    LIMIT_DIST = False
+    LOSS_TYPE = 'features'  # use features (original Fawkes) or gradients (Witches Brew) to run Fawkes?
+
+    def __init__(self, bottleneck_model_ls, mimic_img=MIMIC_IMG,
+                 batch_size=1, learning_rate=LEARNING_RATE,
+                 max_iterations=MAX_ITERATIONS, initial_const=INITIAL_CONST,
+                 intensity_range=INTENSITY_RANGE, l_threshold=L_THRESHOLD,
+                 max_val=MAX_VAL, keep_final=KEEP_FINAL, maximize=MAXIMIZE, image_shape=IMAGE_SHAPE, verbose=1,
+                 ratio=RATIO, limit_dist=LIMIT_DIST, loss_method=LOSS_TYPE, tanh_process=True,
+                 save_last_on_failed=True):
+
+        assert intensity_range in {'raw', 'imagenet', 'inception', 'mnist'}
+
+        # constant used for tanh transformation to avoid corner cases
+
+        self.it = 0
+        self.tanh_constant = 2 - 1e-6
+        self.save_last_on_failed = save_last_on_failed
+        self.MIMIC_IMG = mimic_img
+        self.LEARNING_RATE = learning_rate
+        self.MAX_ITERATIONS = max_iterations
+        self.initial_const = initial_const
+        self.batch_size = batch_size
+        self.intensity_range = intensity_range
+        self.l_threshold = l_threshold
+        self.max_val = max_val
+        self.keep_final = keep_final
+        self.verbose = verbose
+        self.maximize = maximize
+        self.learning_rate = learning_rate
+        self.ratio = ratio
+        self.limit_dist = limit_dist
+        self.single_shape = list(image_shape)
+        self.bottleneck_models = bottleneck_model_ls
+        self.loss_method = loss_method
+        self.tanh_process = tanh_process
+
+    @staticmethod
+    def resize_tensor(input_tensor, model_input_shape):
+        if input_tensor.shape[1:] == model_input_shape or model_input_shape[1] is None:
+            return input_tensor
+        resized_tensor = tf.image.resize(input_tensor, model_input_shape[:2])
+        return resized_tensor
+
+    def preprocess_arctanh(self, imgs):
+        """ Do tan preprocess """
+        imgs = reverse_preprocess(imgs, self.intensity_range)
+        imgs = imgs / 255.0
+        imgs = imgs - 0.5
+        imgs = imgs * self.tanh_constant
+        tanh_imgs = np.arctanh(imgs)
+        return tanh_imgs
+
+    def reverse_arctanh(self, imgs):
+        raw_img = (tf.tanh(imgs) / self.tanh_constant + 0.5) * 255
+        return raw_img
+
+    def input_space_process(self, img):
+        if self.intensity_range == 'imagenet':
+            mean = np.repeat([[[[103.939, 116.779, 123.68]]]], len(img), axis=0)
+            raw_img = (img[..., ::-1] - mean)
+        else:
+            raw_img = img
+        return raw_img
+
+    def clipping(self, imgs):
+        imgs = reverse_preprocess(imgs, self.intensity_range)
+        imgs = np.clip(imgs, 0, self.max_val)
+        imgs = preprocess(imgs, self.intensity_range)
+        return imgs
+
+    def calc_dissim(self, source_raw, source_mod_raw):
+        msssim_split = tf.image.ssim(source_raw, source_mod_raw, max_val=255.0)
+        dist_raw = (1.0 - tf.stack(msssim_split)) / 2.0
+        dist = tf.maximum(dist_raw - self.l_threshold, 0.0)
+        dist_raw_avg = tf.reduce_mean(dist_raw)
+        dist_sum = tf.reduce_sum(dist)
+
+        return dist, dist_raw, dist_sum, dist_raw_avg
+
+    def calc_bottlesim(self, tape, source_raw, target_raw, original_raw):
+        """ original Fawkes loss function. """
+        bottlesim = 0.0
+        bottlesim_sum = 0.0
+        # make sure everything is the right size.
+        model_input_shape = self.single_shape
+        cur_aimg_input = self.resize_tensor(source_raw, model_input_shape)
+        if target_raw is not None:
+            cur_timg_input = self.resize_tensor(target_raw, model_input_shape)
+        for bottleneck_model in self.bottleneck_models:
+            if tape is not None:
+                try:
+                    tape.watch(bottleneck_model.model.variables)
+                except AttributeError:
+                    tape.watch(bottleneck_model.variables)
+            # get the respective feature space reprs.
+            bottleneck_a = bottleneck_model(cur_aimg_input)
+            if self.maximize:
+                bottleneck_s = bottleneck_model(original_raw)
+                bottleneck_diff = bottleneck_a - bottleneck_s
+                scale_factor = tf.sqrt(tf.reduce_sum(tf.square(bottleneck_s), axis=1))
+            else:
+                bottleneck_t = bottleneck_model(cur_timg_input)
+                bottleneck_diff = bottleneck_t - bottleneck_a
+                scale_factor = tf.sqrt(tf.reduce_sum(tf.square(bottleneck_t), axis=1))
+            cur_bottlesim = tf.reduce_sum(tf.square(bottleneck_diff), axis=1)
+            cur_bottlesim = cur_bottlesim / scale_factor
+            bottlesim += cur_bottlesim
+            bottlesim_sum += tf.reduce_sum(cur_bottlesim)
+        return bottlesim, bottlesim_sum
+
+    def compute_feature_loss(self, tape, aimg_raw, simg_raw, aimg_input, timg_input, simg_input):
+        """ Compute input space + feature space loss.
+        """
+        input_space_loss, dist_raw, input_space_loss_sum, input_space_loss_raw_avg = self.calc_dissim(aimg_raw,
+                                                                                                      simg_raw)
+        feature_space_loss, feature_space_loss_sum = self.calc_bottlesim(tape, aimg_input, timg_input, simg_input)
+
+        if self.maximize:
+            loss = self.const * tf.square(input_space_loss) - feature_space_loss * self.const_diff
+        else:
+            if self.it < self.MAX_ITERATIONS:
+                loss = self.const * tf.square(input_space_loss) + 1000 * feature_space_loss
+
+        loss_sum = tf.reduce_sum(loss)
+        return loss_sum, feature_space_loss, input_space_loss_raw_avg, dist_raw
+
+    def compute(self, source_imgs, target_imgs=None):
+        """ Main function that runs cloak generation. """
+        start_time = time.time()
+        adv_imgs = []
+        for idx in range(0, len(source_imgs), self.batch_size):
+            print('processing image %d at %s' % (idx + 1, datetime.datetime.now()))
+            adv_img = self.compute_batch(source_imgs[idx:idx + self.batch_size],
+                                         target_imgs[idx:idx + self.batch_size] if target_imgs is not None else None)
+            adv_imgs.extend(adv_img)
+        elapsed_time = time.time() - start_time
+        print('protection cost %f s' % elapsed_time)
+        return np.array(adv_imgs)
+
+    def compute_batch(self, source_imgs, target_imgs=None, retry=True):
+        """ TF2 method to generate the cloak. """
+        # preprocess images.
+        global progressbar
+        nb_imgs = source_imgs.shape[0]
+
+        # make sure source/target images are an array
+        source_imgs = np.array(source_imgs, dtype=np.float32)
+        if target_imgs is not None:
+            target_imgs = np.array(target_imgs, dtype=np.float32)
+
+        # metrics to test
+        best_bottlesim = [0] * nb_imgs if self.maximize else [np.inf] * nb_imgs
+        best_adv = np.zeros(source_imgs.shape)
+
+        # convert to tanh-space
+        simg_tanh = self.preprocess_arctanh(source_imgs)
+        if target_imgs is not None:
+            timg_tanh = self.preprocess_arctanh(target_imgs)
+        self.modifier = tf.Variable(np.random.uniform(-1, 1, tuple([len(source_imgs)] + self.single_shape)) * 1e-4,
+                                    dtype=tf.float32)
+
+        # make the optimizer
+        optimizer = tf.keras.optimizers.legacy.Adadelta(float(self.learning_rate))
+        const_numpy = np.ones(len(source_imgs)) * self.initial_const
+        self.const = tf.Variable(const_numpy, dtype=np.float32)
+
+        const_diff_numpy = np.ones(len(source_imgs)) * 1.0
+        self.const_diff = tf.Variable(const_diff_numpy, dtype=np.float32)
+
+        # get the modifier
+        if self.verbose == 0:
+            progressbar = Progbar(
+                self.MAX_ITERATIONS, width=30, verbose=1
+            )
+        # watch relevant variables.
+        simg_tanh = tf.Variable(simg_tanh, dtype=np.float32)
+        simg_raw = tf.Variable(source_imgs, dtype=np.float32)
+        if target_imgs is not None:
+            timg_raw = tf.Variable(timg_tanh, dtype=np.float32)
+        # run the attack
+        outside_list = np.ones(len(source_imgs))
+        self.it = 0
+
+        while self.it < self.MAX_ITERATIONS:
+
+            self.it += 1
+            with tf.GradientTape(persistent=True) as tape:
+                tape.watch(self.modifier)
+                tape.watch(simg_tanh)
+
+                # Convert from tanh for DISSIM
+                aimg_raw = self.reverse_arctanh(simg_tanh + self.modifier)
+
+                actual_modifier = aimg_raw - simg_raw
+                actual_modifier = tf.clip_by_value(actual_modifier, -15.0, 15.0)
+                aimg_raw = simg_raw + actual_modifier
+
+                simg_raw = self.reverse_arctanh(simg_tanh)
+
+                # Convert further preprocess for bottleneck
+                aimg_input = self.input_space_process(aimg_raw)
+                if target_imgs is not None:
+                    timg_input = self.input_space_process(timg_raw)
+                else:
+                    timg_input = None
+                simg_input = self.input_space_process(simg_raw)
+
+                # get the feature space loss.
+                loss, internal_dist, input_dist_avg, dist_raw = self.compute_feature_loss(
+                    tape, aimg_raw, simg_raw, aimg_input, timg_input, simg_input)
+
+            # compute gradients
+            grad = tape.gradient(loss, [self.modifier])
+            if grad[0] is not None:
+                optimizer.apply_gradients(zip(grad, [self.modifier]))
+
+            if self.it == 1:
+                self.modifier = tf.Variable(self.modifier - tf.sign(grad[0]) * 0.01, dtype=tf.float32)
+
+            for e, (input_dist, feature_d, mod_img) in enumerate(zip(dist_raw, internal_dist, aimg_input)):
+                if e >= nb_imgs:
+                    break
+                input_dist = input_dist.numpy()
+                feature_d = feature_d.numpy()
+
+                if input_dist <= self.l_threshold * 0.9 and const_diff_numpy[e] <= 129:
+                    const_diff_numpy[e] *= 2
+                    if outside_list[e] == -1:
+                        const_diff_numpy[e] = 1
+                    outside_list[e] = 1
+                elif input_dist >= self.l_threshold * 1.1 and const_diff_numpy[e] >= 1 / 129:
+                    const_diff_numpy[e] /= 2
+
+                    if outside_list[e] == 1:
+                        const_diff_numpy[e] = 1
+                    outside_list[e] = -1
+                else:
+                    const_diff_numpy[e] = 1.0
+                    outside_list[e] = 0
+
+                if input_dist <= self.l_threshold * 1.1 and (
+                        (feature_d < best_bottlesim[e] and (not self.maximize)) or (
+                        feature_d > best_bottlesim[e] and self.maximize)):
+                    best_bottlesim[e] = feature_d
+                    best_adv[e] = mod_img
+
+            self.const_diff = tf.Variable(const_diff_numpy, dtype=np.float32)
+
+            if self.verbose == 1:
+                print("ITER {:0.2f} Total Loss: {:.2f} {:0.4f} raw; diff: {:.4f}".format(self.it, loss, input_dist_avg,
+                                                                                         np.mean(internal_dist)))
+
+            if self.verbose == 0:
+                progressbar.update(self.it)
+        if self.verbose == 1:
+            print("Final diff: {:.4f}".format(np.mean(best_bottlesim)))
+            print("\n")
+
+        if self.save_last_on_failed:
+            for e, diff in enumerate(best_bottlesim):
+                if diff < 0.3 and dist_raw[e] < 0.015 and internal_dist[e] > diff:
+                    best_adv[e] = aimg_input[e]
+
+        best_adv = self.clipping(best_adv[:nb_imgs])
+        return best_adv
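The perceptual budget in `calc_dissim` works by converting SSIM into a dissimilarity, DSSIM = (1 - SSIM) / 2, and penalizing only the excess over `l_threshold`: perturbations that stay under the budget contribute zero input-space loss, so the optimizer is free to spend the whole budget on moving the feature-space embedding. A standalone sketch of that computation on random images, assuming TensorFlow 2.x (the images and noise level are arbitrary):

    import tensorflow as tf

    l_threshold = 0.03
    a = tf.random.uniform((1, 112, 112, 3), 0, 255)
    b = tf.clip_by_value(a + tf.random.normal(a.shape, stddev=5.0), 0, 255)

    ssim = tf.image.ssim(a, b, max_val=255.0)
    dist_raw = (1.0 - ssim) / 2.0                   # DSSIM per image
    dist = tf.maximum(dist_raw - l_threshold, 0.0)  # loss only above the budget
    print(float(dist_raw[0]), float(dist[0]))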
fawkes/protection.py ADDED
@@ -0,0 +1,197 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Date    : 2020-05-17
+# @Author  : Shawn Shan (shansixiong@cs.uchicago.edu)
+# @Link    : https://www.shawnshan.com/
+
+import argparse
+import glob
+import logging
+import os
+import sys
+
+logging.getLogger('tensorflow').setLevel(logging.ERROR)
+os.environ["KMP_AFFINITY"] = "noverbose"
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+import tensorflow as tf
+
+tf.get_logger().setLevel('ERROR')
+tf.autograph.set_verbosity(3)
+
+import numpy as np
+from fawkes.differentiator import FawkesMaskGeneration
+from fawkes.utils import init_gpu, reverse_process_cloaked, \
+    Faces, load_extractor
+
+from fawkes.align_face import aligner
+
+
+def generate_cloak_images(protector, image_X, target_emb=None):
+    cloaked_image_X = protector.compute(image_X, target_emb)
+    return cloaked_image_X
+
+
+IMG_SIZE = 112
+PREPROCESS = 'raw'
+
+
+class Fawkes(object):
+    def __init__(self, gpu, batch_size, mode="low"):
+
+        self.gpu = gpu
+        self.batch_size = batch_size
+        self.mode = mode
+        th, max_step, lr, extractors = self.mode2param(self.mode)
+        self.th = th
+        self.lr = lr
+        self.max_step = max_step
+        if gpu is not None:
+            init_gpu(gpu)
+
+        self.aligner = aligner()
+
+        self.protector = None
+        self.protector_param = None
+        self.feature_extractors_ls = [load_extractor(name) for name in extractors]
+
+    def mode2param(self, mode):
+        if mode == 'low':
+            th = 0.004
+            max_step = 40
+            lr = 25
+            extractors = ["extractor_2"]
+
+        elif mode == 'mid':
+            th = 0.012
+            max_step = 75
+            lr = 20
+            extractors = ["extractor_0", "extractor_2"]
+
+        elif mode == 'high':
+            th = 0.017
+            max_step = 150
+            lr = 15
+            extractors = ["extractor_0", "extractor_2"]
+
+        else:
+            raise Exception("mode must be one of 'min', 'low', 'mid', 'high'")
+        return th, max_step, lr, extractors
+
+    def run_protection(self, image, sd=1e7, batch_size=1, format='png', separate_target=True, debug=False,
+                       no_align=False, maximize=True, save_last_on_failed=True):
+
+        current_param = "-".join([str(x) for x in [self.th, sd, self.lr, self.max_step, batch_size, format,
+                                                   separate_target, debug]])
+
+        faces = Faces(image, self.aligner, verbose=1, no_align=no_align)
+        original_images = faces.cropped_faces
+
+        if len(original_images) == 0:
+            print("No face detected. ")
+            return 2
+        original_images = np.array(original_images)
+
+        if current_param != self.protector_param:
+            self.protector_param = current_param
+            if self.protector is not None:
+                del self.protector
+            if batch_size == -1:
+                batch_size = len(original_images)
+            self.protector = FawkesMaskGeneration(self.feature_extractors_ls,
+                                                  batch_size=batch_size,
+                                                  mimic_img=True,
+                                                  intensity_range=PREPROCESS,
+                                                  initial_const=sd,
+                                                  learning_rate=self.lr,
+                                                  max_iterations=self.max_step,
+                                                  l_threshold=self.th,
+                                                  verbose=debug,
+                                                  maximize=maximize,
+                                                  keep_final=False,
+                                                  image_shape=(IMG_SIZE, IMG_SIZE, 3),
+                                                  loss_method='features',
+                                                  tanh_process=True,
+                                                  save_last_on_failed=save_last_on_failed,
+                                                  )
+        protected_images = generate_cloak_images(self.protector, original_images)
+        faces.cloaked_cropped_faces = protected_images
+
+        final_images, images_without_face = faces.merge_faces(
+            reverse_process_cloaked(protected_images, preprocess=PREPROCESS),
+            reverse_process_cloaked(original_images, preprocess=PREPROCESS))
+
+        if images_without_face:
+            return None
+        else:
+            return [img for img in final_images][0]
+        # for i in range(len(final_images)):
+        #     if i in images_without_face:
+        #         continue
+        #     p_img = final_images[i]
+        #     path = image_paths[i]
+        #     file_name = "{}_cloaked.{}".format(".".join(path.split(".")[:-1]), format)
+        #     dump_image(p_img, file_name, format=format)
+
+        # print("Done!")
+        # return 1
+
+
+def main(*argv):
+    if not argv:
+        argv = list(sys.argv)
+
+    try:
+        import signal
+        signal.signal(signal.SIGPIPE, signal.SIG_DFL)
+    except Exception as e:
+        pass
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--directory', '-d', type=str,
+                        help='the directory that contains images to run protection', default='imgs/')
+    parser.add_argument('--gpu', '-g', type=str,
+                        help='the GPU id when using GPU for optimization', default='0')
+    parser.add_argument('--mode', '-m', type=str,
+                        help='cloak generation mode, select from min, low, mid, high. The higher the mode is, '
+                             'the more perturbation added and stronger protection',
+                        default='low')
+    parser.add_argument('--feature-extractor', type=str,
+                        help="name of the feature extractor used for optimization",
+                        default="arcface_extractor_0")
+    parser.add_argument('--th', help='only relevant with mode=custom, DSSIM threshold for perturbation', type=float,
+                        default=0.01)
+    parser.add_argument('--max-step', help='only relevant with mode=custom, number of steps for optimization', type=int,
+                        default=1000)
+    parser.add_argument('--sd', type=int, help='only relevant with mode=custom, penalty number, read more in the paper',
+                        default=1e6)
+    parser.add_argument('--lr', type=float, help='only relevant with mode=custom, learning rate', default=2)
+    parser.add_argument('--batch-size', help="number of images to run optimization together", type=int, default=1)
+    parser.add_argument('--separate_target', help="whether select separate targets for each faces in the directory",
+                        action='store_true')
+    parser.add_argument('--no-align', help="whether to detect and crop faces",
+                        action='store_true')
+    parser.add_argument('--debug', help="turn on debug and copy/paste the stdout when reporting an issue on github",
+                        action='store_true')
+    parser.add_argument('--format', type=str,
+                        help="format of the output image",
+                        default="png")
+
+    args = parser.parse_args(argv[1:])
+
+    assert args.format in ['png', 'jpg', 'jpeg']
+    if args.format == 'jpg':
+        args.format = 'jpeg'
+
+    image_paths = glob.glob(os.path.join(args.directory, "*"))
+    image_paths = [path for path in image_paths if "_cloaked" not in path.split("/")[-1]]
+
+    protector = Fawkes(args.gpu, args.batch_size, mode=args.mode)
+
+    protector.run_protection(image_paths, th=args.th, sd=args.sd, lr=args.lr,
+                             max_step=args.max_step,
+                             batch_size=args.batch_size, format=args.format,
+                             separate_target=args.separate_target, debug=args.debug, no_align=args.no_align)
+
+
+if __name__ == '__main__':
+    main(*sys.argv)
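`mode2param` trades runtime for protection strength: higher modes raise the DSSIM threshold `th`, run more optimization steps, and ensemble two feature extractors instead of one. A minimal end-to-end sketch of the class as committed, assuming the extractor weights can be fetched from the mirror and that `photo.jpg` is a hypothetical path to a face photo:

    from PIL import Image
    from fawkes.protection import Fawkes

    # gpu=None skips init_gpu and runs on CPU; mode selects th/max_step/lr.
    protector = Fawkes(gpu=None, batch_size=1, mode="low")
    img = Image.open("photo.jpg").convert("RGB")
    cloaked = protector.run_protection(img, sd=1e6, batch_size=1, format="png",
                                       separate_target=False, debug=False,
                                       no_align=False)
    # Per the code above: returns the cloaked image array on success,
    # None if merging failed, or 2 if no face was detected.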
fawkes/utils.py ADDED
@@ -0,0 +1,731 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Date    : 2020-05-17
+# @Author  : Shawn Shan (shansixiong@cs.uchicago.edu)
+# @Link    : https://www.shawnshan.com/
+
+
+import errno
+import glob
+import gzip
+import hashlib
+import json
+import os
+import pickle
+import random
+import shutil
+import sys
+import tarfile
+import zipfile
+
+import PIL
+import pkg_resources
+import six
+from keras.utils import Progbar
+from six.moves.urllib.error import HTTPError, URLError
+
+stderr = sys.stderr
+sys.stderr = open(os.devnull, 'w')
+import keras
+
+sys.stderr = stderr
+import keras.backend as K
+import numpy as np
+import tensorflow as tf
+from PIL import Image, ExifTags
+from keras.layers import Dense, Activation
+from keras.models import Model
+from keras.preprocessing import image
+
+from fawkes.align_face import align
+from six.moves.urllib.request import urlopen
+
+if sys.version_info[0] == 2:
+    def urlretrieve(url, filename, reporthook=None, data=None):
+        def chunk_read(response, chunk_size=8192, reporthook=None):
+            content_type = response.info().get('Content-Length')
+            total_size = -1
+            if content_type is not None:
+                total_size = int(content_type.strip())
+            count = 0
+            while True:
+                chunk = response.read(chunk_size)
+                count += 1
+                if reporthook is not None:
+                    reporthook(count, chunk_size, total_size)
+                if chunk:
+                    yield chunk
+                else:
+                    break
+
+        response = urlopen(url, data)
+        with open(filename, 'wb') as fd:
+            for chunk in chunk_read(response, reporthook=reporthook):
+                fd.write(chunk)
+else:
+    from six.moves.urllib.request import urlretrieve
+
+
+def clip_img(X, preprocessing='raw'):
+    X = reverse_preprocess(X, preprocessing)
+    X = np.clip(X, 0.0, 255.0)
+    X = preprocess(X, preprocessing)
+    return X
+
+
+IMG_SIZE = 112
+PREPROCESS = 'raw'
+
+
+def load_image(path):
+    try:
+        img = Image.open(path)
+    except PIL.UnidentifiedImageError:
+        return None
+    except IsADirectoryError:
+        return None
+
+    try:
+        info = img._getexif()
+    except OSError:
+        return None
+
+    if info is not None:
+        for orientation in ExifTags.TAGS.keys():
+            if ExifTags.TAGS[orientation] == 'Orientation':
+                break
+
+        exif = dict(img._getexif().items())
+        if orientation in exif.keys():
+            if exif[orientation] == 3:
+                img = img.rotate(180, expand=True)
+            elif exif[orientation] == 6:
+                img = img.rotate(270, expand=True)
+            elif exif[orientation] == 8:
+                img = img.rotate(90, expand=True)
+            else:
+                pass
+    img = img.convert('RGB')
+    image_array = image.img_to_array(img)
+
+    return image_array
+
+
+class Faces(object):
+    def __init__(self, image, aligner, verbose=1, eval_local=False, preprocessing=True, no_align=False):
+        self.verbose = verbose
+        self.no_align = no_align
+        self.aligner = aligner
+        self.margin = 30
+        self.org_faces = [image]  # single image in a list
+        self.cropped_faces = []
+        self.cropped_faces_shape = []
+        self.cropped_index = []
+        self.start_end_ls = []
+        self.callback_idx = []
+        self.images_without_face = []
+
+        # Processing the single image
+        cur_img = np.array(image)
+
+        if not self.no_align:
+            align_img = align(cur_img, self.aligner)
+            if align_img is None:
+                if self.verbose:
+                    print("Find 0 face(s) in the image")
+                self.images_without_face.append(0)
+                return
+
+            cur_faces = align_img[0]
+        else:
+            cur_faces = [cur_img]
+
+        cur_faces = [face for face in cur_faces if face.shape[0] != 0 and face.shape[1] != 0]
+        cur_shapes = [f.shape[:-1] for f in cur_faces]
+
+        cur_faces_square = []
+        if self.verbose and not self.no_align:
+            print("Find {} face(s) in the image".format(len(cur_faces)))
+        if eval_local:
+            cur_faces = cur_faces[:1]
+
+        for img in cur_faces:
+            if eval_local:
+                base = resize(img, (IMG_SIZE, IMG_SIZE))
+            else:
+                long_size = max([img.shape[1], img.shape[0]]) + self.margin
+
+                base = np.ones((long_size, long_size, 3)) * np.mean(img, axis=(0, 1))
+
+                start1, end1 = get_ends(long_size, img.shape[0])
+                start2, end2 = get_ends(long_size, img.shape[1])
+
+                base[start1:end1, start2:end2, :] = img
+                cur_start_end = (start1, end1, start2, end2)
+                self.start_end_ls.append(cur_start_end)
+
+            cur_faces_square.append(base)
+        cur_faces_square = [resize(f, (IMG_SIZE, IMG_SIZE)) for f in cur_faces_square]
+        self.cropped_faces.extend(cur_faces_square)
+
+        if not self.no_align:
+            cur_index = align_img[1]
+            self.cropped_faces_shape.extend(cur_shapes)
+            self.cropped_index.extend(cur_index[:len(cur_faces_square)])
+            self.callback_idx.extend([0]*len(cur_faces_square))
+
+        if len(self.cropped_faces) == 0:
+            return
+
+        self.cropped_faces = np.array(self.cropped_faces)
+
+        if preprocessing:
+            self.cropped_faces = preprocess(self.cropped_faces, PREPROCESS)
+
+        self.cloaked_cropped_faces = None
+        self.cloaked_faces = np.copy(self.org_faces)
+
+    def get_faces(self):
+        return self.cropped_faces
+
+    def merge_faces(self, protected_images, original_images):
+        if self.no_align:
+            return np.clip(protected_images, 0.0, 255.0), self.images_without_face
+
+        self.cloaked_faces = np.copy(self.org_faces)
+
+        for i in range(len(self.cropped_faces)):
+            cur_protected = protected_images[i]
+            cur_original = original_images[i]
+
+            org_shape = self.cropped_faces_shape[i]
+
+            old_square_shape = max([org_shape[0], org_shape[1]]) + self.margin
+
+            cur_protected = resize(cur_protected, (old_square_shape, old_square_shape))
+            cur_original = resize(cur_original, (old_square_shape, old_square_shape))
+
+            start1, end1, start2, end2 = self.start_end_ls[i]
+
+            reshape_cloak = cur_protected - cur_original
+            reshape_cloak = reshape_cloak[start1:end1, start2:end2, :]
+
+            callback_id = self.callback_idx[i]
+            bb = self.cropped_index[i]
+            self.cloaked_faces[callback_id][bb[0]:bb[2], bb[1]:bb[3], :] += reshape_cloak.astype(np.uint8)
+
+        for i in range(0, len(self.cloaked_faces)):
+            self.cloaked_faces[i] = np.clip(self.cloaked_faces[i], 0.0, 255.0)
+        return self.cloaked_faces, self.images_without_face
+
+
+def get_ends(longsize, window):
+    start = (longsize - window) // 2
+    end = start + window
+    return start, end
+
+
+def dump_dictionary_as_json(dict, outfile):
+    j = json.dumps(dict)
+    with open(outfile, "wb") as f:
+        f.write(j.encode())
+
+
+def load_victim_model(number_classes, teacher_model=None, end2end=False):
+    for l in teacher_model.layers:
+        l.trainable = end2end
+    x = teacher_model.layers[-1].output
+
+    x = Dense(number_classes)(x)
+    x = Activation('softmax', name="act")(x)
+    model = Model(teacher_model.input, x)
+    opt = keras.optimizers.Adadelta()
+    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
+    return model
+
+
+def resize(img, sz):
+    assert np.min(img) >= 0 and np.max(img) <= 255.0
+    from keras.preprocessing import image
+    im_data = image.array_to_img(img).resize((sz[1], sz[0]))
+    im_data = image.img_to_array(im_data)
+    return im_data
+
+
+def init_gpu(gpu):
+    ''' code to initialize gpu in tf2'''
+    if isinstance(gpu, list):
+        gpu_num = ','.join([str(i) for i in gpu])
+    else:
+        gpu_num = str(gpu)
+    if "CUDA_VISIBLE_DEVICES" in os.environ:
+        print('GPU already initiated')
+        return
+    os.environ["CUDA_VISIBLE_DEVICES"] = gpu_num
+    gpus = tf.config.experimental.list_physical_devices('GPU')
+    if gpus:
+        try:
+            tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
+            tf.config.experimental.set_memory_growth(gpus[0], True)
+            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
+            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
+        except RuntimeError as e:
+            print(e)
+
+
+def fix_gpu_memory(mem_fraction=1):
+    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+    tf_config = None
+    if tf.test.is_gpu_available():
+        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_fraction)
+        tf_config = tf.ConfigProto(gpu_options=gpu_options)
+        tf_config.gpu_options.allow_growth = True
+        tf_config.log_device_placement = False
+    init_op = tf.global_variables_initializer()
+    sess = tf.Session(config=tf_config)
+    sess.run(init_op)
+    K.set_session(sess)
+    return sess
+
+
+def preprocess(X, method):
+    assert method in {'raw', 'imagenet', 'inception', 'mnist'}
+
+    if method == 'raw':
+        pass
+    elif method == 'imagenet':
+        X = imagenet_preprocessing(X)
+    else:
+        raise Exception('unknown method %s' % method)
+
+    return X
+
+
+def reverse_preprocess(X, method):
+    assert method in {'raw', 'imagenet', 'inception', 'mnist'}
+
+    if method == 'raw':
+        pass
+    elif method == 'imagenet':
+        X = imagenet_reverse_preprocessing(X)
+    else:
+        raise Exception('unknown method %s' % method)
+
+    return X
+
+
+def imagenet_preprocessing(x, data_format=None):
+    if data_format is None:
+        data_format = K.image_data_format()
+    assert data_format in ('channels_last', 'channels_first')
+
+    x = np.array(x)
+    if data_format == 'channels_first':
+        # 'RGB'->'BGR'
+        if x.ndim == 3:
+            x = x[::-1, ...]
+        else:
+            x = x[:, ::-1, ...]
+    else:
+        # 'RGB'->'BGR'
+        x = x[..., ::-1]
+
+    mean = [103.939, 116.779, 123.68]
+    std = None
+
+    # Zero-center by mean pixel
+    if data_format == 'channels_first':
+        if x.ndim == 3:
+            x[0, :, :] -= mean[0]
+            x[1, :, :] -= mean[1]
+            x[2, :, :] -= mean[2]
+            if std is not None:
+                x[0, :, :] /= std[0]
+                x[1, :, :] /= std[1]
+                x[2, :, :] /= std[2]
+        else:
+            x[:, 0, :, :] -= mean[0]
+            x[:, 1, :, :] -= mean[1]
+            x[:, 2, :, :] -= mean[2]
+            if std is not None:
+                x[:, 0, :, :] /= std[0]
+                x[:, 1, :, :] /= std[1]
+                x[:, 2, :, :] /= std[2]
+    else:
+        x[..., 0] -= mean[0]
+        x[..., 1] -= mean[1]
+        x[..., 2] -= mean[2]
+        if std is not None:
+            x[..., 0] /= std[0]
+            x[..., 1] /= std[1]
+            x[..., 2] /= std[2]
+
+    return x
+
+
+def imagenet_reverse_preprocessing(x, data_format=None):
+    import keras.backend as K
+    x = np.array(x)
+    if data_format is None:
+        data_format = K.image_data_format()
+    assert data_format in ('channels_last', 'channels_first')
+
+    if data_format == 'channels_first':
+        if x.ndim == 3:
+            # Zero-center by mean pixel
+            x[0, :, :] += 103.939
+            x[1, :, :] += 116.779
+            x[2, :, :] += 123.68
+            # 'BGR'->'RGB'
+            x = x[::-1, :, :]
+        else:
+            x[:, 0, :, :] += 103.939
+            x[:, 1, :, :] += 116.779
+            x[:, 2, :, :] += 123.68
+            x = x[:, ::-1, :, :]
+    else:
+        # Zero-center by mean pixel
+        x[..., 0] += 103.939
+        x[..., 1] += 116.779
+        x[..., 2] += 123.68
+        # 'BGR'->'RGB'
+        x = x[..., ::-1]
+    return x
+
+
+def reverse_process_cloaked(x, preprocess='imagenet'):
+    # x = clip_img(x, preprocess)
+    return reverse_preprocess(x, preprocess)
+
+
+def build_bottleneck_model(model, cut_off):
+    bottleneck_model = Model(model.input, model.get_layer(cut_off).output)
+    bottleneck_model.compile(loss='categorical_crossentropy',
+                             optimizer='adam',
+                             metrics=['accuracy'])
+    return bottleneck_model
+
+
+def load_extractor(name):
+    hash_map = {"extractor_2": "ce703d481db2b83513bbdafa27434703",
+                "extractor_0": "94854151fd9077997d69ceda107f9c6b"}
+    assert name in ["extractor_2", 'extractor_0']
+    model_file = pkg_resources.resource_filename("fawkes", "model/{}.h5".format(name))
+    cur_hash = hash_map[name]
+    model_dir = pkg_resources.resource_filename("fawkes", "model/")
+    os.makedirs(model_dir, exist_ok=True)
+    get_file("{}.h5".format(name), "http://mirror.cs.uchicago.edu/fawkes/files/{}.h5".format(name),
+             cache_dir=model_dir, cache_subdir='', md5_hash=cur_hash)
+
+    model = keras.models.load_model(model_file)
+    model = Extractor(model)
+    return model
+
+
+class Extractor(object):
+    def __init__(self, model):
+        self.model = model
+
+    def predict(self, imgs):
+        imgs = imgs / 255.0
+        embeds = l2_norm(self.model(imgs))
+        return embeds
+
+    def __call__(self, x):
+        return self.predict(x)
+
+
+def get_dataset_path(dataset):
+    model_dir = os.path.join(os.path.expanduser('~'), '.fawkes')
+    if not os.path.exists(os.path.join(model_dir, "config.json")):
+        raise Exception("Please config the datasets before running protection code. See more in README and config.py.")
+
+    config = json.load(open(os.path.join(model_dir, "config.json"), 'r'))
+    if dataset not in config:
+        raise Exception(
+            "Dataset {} does not exist, please download to data/ and add the path to this function... Abort".format(
+                dataset))
+    return config[dataset]['train_dir'], config[dataset]['test_dir'], config[dataset]['num_classes'], config[dataset][
+        'num_images']
+
+
+def dump_image(x, filename, format="png", scale=False):
+    img = image.array_to_img(x, scale=scale)
+    img.save(filename, format)
+    return
+
+
+def load_embeddings(feature_extractors_names):
+    model_dir = os.path.join(os.path.expanduser('~'), '.fawkes')
+    for extractor_name in feature_extractors_names:
+        fp = gzip.open(os.path.join(model_dir, "{}_emb.p.gz".format(extractor_name)), 'rb')
+        path2emb = pickle.load(fp)
+        fp.close()
+
+    return path2emb
+
+
+def extractor_ls_predict(feature_extractors_ls, X):
+    feature_ls = []
+    for extractor in feature_extractors_ls:
+        cur_features = extractor.predict(X)
+        feature_ls.append(cur_features)
+    concated_feature_ls = np.concatenate(feature_ls, axis=1)
+    return concated_feature_ls
+
+
+def pairwise_l2_distance(A, B):
+    BT = B.transpose()
+    vecProd = np.dot(A, BT)
+    SqA = A ** 2
+    sumSqA = np.matrix(np.sum(SqA, axis=1))
+    sumSqAEx = np.tile(sumSqA.transpose(), (1, vecProd.shape[1]))
+
+    SqB = B ** 2
+    sumSqB = np.sum(SqB, axis=1)
+    sumSqBEx = np.tile(sumSqB, (vecProd.shape[0], 1))
+    SqED = sumSqBEx + sumSqAEx - 2 * vecProd
+    SqED[SqED < 0] = 0.0
+    ED = np.sqrt(SqED)
+    return ED
+
+
+def select_target_label(imgs, feature_extractors_ls, feature_extractors_names, metric='l2'):
+    model_dir = os.path.join(os.path.expanduser('~'), '.fawkes')
+
+    original_feature_x = extractor_ls_predict(feature_extractors_ls, imgs)
+
+    path2emb = load_embeddings(feature_extractors_names)
+
+    items = list([(k, v) for k, v in path2emb.items()])
+    paths = [p[0] for p in items]
+    embs = [p[1] for p in items]
+    embs = np.array(embs)
+
+    pair_dist = pairwise_l2_distance(original_feature_x, embs)
+    pair_dist = np.array(pair_dist)
+
+    max_sum = np.min(pair_dist, axis=0)
+    max_id_ls = np.argsort(max_sum)[::-1]
+
+    max_id = random.choice(max_id_ls[:20])
+
+    target_data_id = paths[int(max_id)]
+    print("target ID: {}".format(target_data_id))
+
+    image_dir = os.path.join(model_dir, "target_data/{}".format(target_data_id))
+
+    os.makedirs(os.path.join(model_dir, "target_data"), exist_ok=True)
+    os.makedirs(image_dir, exist_ok=True)
+    for i in range(10):
+        if os.path.exists(os.path.join(model_dir, "target_data/{}/{}.jpg".format(target_data_id, i))):
+            continue
+        try:
+            get_file("{}.jpg".format(i),
+                     "http://mirror.cs.uchicago.edu/fawkes/files/target_data/{}/{}.jpg".format(target_data_id, i),
+                     cache_dir=model_dir, cache_subdir='target_data/{}/'.format(target_data_id))
+        except Exception:
+            pass
+
+    image_paths = glob.glob(image_dir + "/*.jpg")
+
+    target_images = [image.img_to_array(image.load_img(cur_path)) for cur_path in
+                     image_paths]
+
+    target_images = np.array([resize(x, (IMG_SIZE, IMG_SIZE)) for x in target_images])
+    target_images = preprocess(target_images, PREPROCESS)
+
+    target_images = list(target_images)
+    while len(target_images) < len(imgs):
+        target_images += target_images
+
+    target_images = random.sample(target_images, len(imgs))
+    return np.array(target_images)
+
+
+def l2_norm(x, axis=1):
+    """l2 norm"""
+    norm = tf.norm(x, axis=axis, keepdims=True)
+    output = x / norm
+    return output
+
+
+""" TensorFlow implementation get_file
+https://github.com/tensorflow/tensorflow/blob/v2.3.0/tensorflow/python/keras/utils/data_utils.py#L168-L297
+"""
+
+
+def get_file(fname,
+             origin,
+             untar=False,
+             md5_hash=None,
+             file_hash=None,
+             cache_subdir='datasets',
+             hash_algorithm='auto',
+             extract=False,
+             archive_format='auto',
+             cache_dir=None):
+    if cache_dir is None:
+        cache_dir = os.path.join(os.path.expanduser('~'), '.keras')
+    if md5_hash is not None and file_hash is None:
+        file_hash = md5_hash
+        hash_algorithm = 'md5'
+    datadir_base = os.path.expanduser(cache_dir)
+    if not os.access(datadir_base, os.W_OK):
+        datadir_base = os.path.join('/tmp', '.keras')
+    datadir = os.path.join(datadir_base, cache_subdir)
+    _makedirs_exist_ok(datadir)
+
+    # fname = path_to_string(fname)
+
+    if untar:
+        untar_fpath = os.path.join(datadir, fname)
+        fpath = untar_fpath + '.tar.gz'
+    else:
+        fpath = os.path.join(datadir, fname)
+
+    download = False
+    if os.path.exists(fpath):
+        # File found; verify integrity if a hash was provided.
+        if file_hash is not None:
+            if not validate_file(fpath, file_hash, algorithm=hash_algorithm):
+                print('A local file was found, but it seems to be '
+                      'incomplete or outdated because the ' + hash_algorithm +
+                      ' file hash does not match the original value of ' + file_hash +
+                      ' so we will re-download the data.')
+                download = True
+    else:
+        download = True
+
+    if download:
+        print('Downloading data from', origin)
+
+        class ProgressTracker(object):
+            # Maintain progbar for the lifetime of download.
+            # This design was chosen for Python 2.7 compatibility.
+            progbar = None
+
+        def dl_progress(count, block_size, total_size):
+            if ProgressTracker.progbar is None:
+                if total_size == -1:
+                    total_size = None
+                ProgressTracker.progbar = Progbar(total_size)
+            else:
+                ProgressTracker.progbar.update(count * block_size)
+
+        error_msg = 'URL fetch failure on {}: {} -- {}'
+        try:
+            try:
+                urlretrieve(origin, fpath, dl_progress)
+            except HTTPError as e:
+                raise Exception(error_msg.format(origin, e.code, e.msg))
+            except URLError as e:
+                raise Exception(error_msg.format(origin, e.errno, e.reason))
+        except (Exception, KeyboardInterrupt) as e:
+            if os.path.exists(fpath):
+                os.remove(fpath)
+            raise
+        ProgressTracker.progbar = None
+
+    if untar:
+        if not os.path.exists(untar_fpath):
+            _extract_archive(fpath, datadir, archive_format='tar')
+        return untar_fpath
+
+    if extract:
+        _extract_archive(fpath, datadir, archive_format)
+
+    return fpath
+
+
+def _extract_archive(file_path, path='.', archive_format='auto'):
+    if archive_format is None:
+        return False
+    if archive_format == 'auto':
+        archive_format = ['tar', 'zip']
+    if isinstance(archive_format, six.string_types):
+        archive_format = [archive_format]
+
+    for archive_type in archive_format:
+        if archive_type == 'tar':
+            open_fn = tarfile.open
+            is_match_fn = tarfile.is_tarfile
+        if archive_type == 'zip':
+            open_fn = zipfile.ZipFile
+            is_match_fn = zipfile.is_zipfile
+
+        if is_match_fn(file_path):
+            with open_fn(file_path) as archive:
+                try:
+                    archive.extractall(path)
+                except (tarfile.TarError, RuntimeError, KeyboardInterrupt):
+                    if os.path.exists(path):
+                        if os.path.isfile(path):
+                            os.remove(path)
+                        else:
+                            shutil.rmtree(path)
+                    raise
+            return True
+    return False
+
+
+def _makedirs_exist_ok(datadir):
+    if six.PY2:
+        # Python 2 doesn't have the exist_ok arg, so we try-except here.
+        try:
+            os.makedirs(datadir)
+        except OSError as e:
+            if e.errno != errno.EEXIST:
+                raise
+    else:
+        os.makedirs(datadir, exist_ok=True)  # pylint: disable=unexpected-keyword-arg
+
+
+def validate_file(fpath, file_hash, algorithm='auto', chunk_size=65535):
+    """Validates a file against a sha256 or md5 hash.
+    Arguments:
+        fpath: path to the file being validated
+        file_hash: The expected hash string of the file.
+            The sha256 and md5 hash algorithms are both supported.
+        algorithm: Hash algorithm, one of 'auto', 'sha256', or 'md5'.
+            The default 'auto' detects the hash algorithm in use.
+        chunk_size: Bytes to read at a time, important for large files.
+    Returns:
+        Whether the file is valid
+    """
+    if (algorithm == 'sha256') or (algorithm == 'auto' and len(file_hash) == 64):
+        hasher = 'sha256'
+    else:
+        hasher = 'md5'
+
+    if str(_hash_file(fpath, hasher, chunk_size)) == str(file_hash):
+        return True
+    else:
+        return False
+
+
+def _hash_file(fpath, algorithm='sha256', chunk_size=65535):
+    """Calculates a file sha256 or md5 hash.
+    Example:
+    ```python
+        _hash_file('/path/to/file.zip')
+        'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'
+    ```
+    Arguments:
+        fpath: path to the file being validated
+        algorithm: hash algorithm, one of `'auto'`, `'sha256'`, or `'md5'`.
+            The default `'auto'` detects the hash algorithm in use.
+        chunk_size: Bytes to read at a time, important for large files.
+    Returns:
+        The file hash
+    """
+    if (algorithm == 'sha256') or (algorithm == 'auto' and len(hash) == 64):
+        hasher = hashlib.sha256()
+    else:
+        hasher = hashlib.md5()
+
+    with open(fpath, 'rb') as fpath_file:
+        for chunk in iter(lambda: fpath_file.read(chunk_size), b''):
+            hasher.update(chunk)
+
+    return hasher.hexdigest()
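`pairwise_l2_distance` above uses the standard expansion ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b to get all source-to-candidate distances from one matrix product, which is how `select_target_label` ranks target identities. A quick sanity sketch of the same expansion against the brute-force definition (this re-derives the helper with plain ndarrays instead of np.matrix purely for the check; it is not the committed function):

    import numpy as np

    def pairwise_l2(A, B):
        # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, clamped at 0 before sqrt
        vec_prod = np.dot(A, B.T)
        sum_sq_a = np.sum(A ** 2, axis=1)[:, None]
        sum_sq_b = np.sum(B ** 2, axis=1)[None, :]
        sq_ed = np.maximum(sum_sq_a + sum_sq_b - 2 * vec_prod, 0.0)
        return np.sqrt(sq_ed)

    A = np.random.randn(4, 8)
    B = np.random.randn(5, 8)
    brute = np.linalg.norm(A[:, None, :] - B[None, :, :], axis=2)
    assert np.allclose(pairwise_l2(A, B), brute)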