Spaces:
Running
Running
# Lint as: python3 | |
# Copyright 2018 The TensorFlow Authors. All Rights Reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
# ============================================================================== | |
"""Executes Keras benchmarks and accuracy tests.""" | |
# pylint: disable=line-too-long | |
from __future__ import print_function | |
import json | |
import os | |
import time | |
from typing import Any, MutableMapping, Optional | |
from absl import flags | |
import tensorflow as tf # pylint: disable=g-bad-import-order | |
from official.benchmark import benchmark_wrappers | |
from official.benchmark import keras_benchmark | |
from official.benchmark.models import resnet_imagenet_main | |
from official.vision.image_classification import classifier_trainer | |
# Default top-1 accuracy window used by the ResNet50 benchmarks in this file.
MIN_TOP_1_ACCURACY = 0.76
MAX_TOP_1_ACCURACY = 0.77

# Default top-1 accuracy window used by the MobileNet V1 accuracy benchmark.
MOBILENET_V1_MIN_TOP_1_ACCURACY = 0.65
MOBILENET_V1_MAX_TOP_1_ACCURACY = 0.68

# Range of top-1 accuracies for model optimization techniques.
# Each item maps a technique name to (MIN_TOP_1_ACCURACY, MAX_TOP_1_ACCURACY).
MODEL_OPTIMIZATION_TOP_1_ACCURACY = dict(
    RESNET50_FINETUNE_PRUNING=(0.76, 0.77),
    MOBILENET_V1_FINETUNE_PRUNING=(0.67, 0.68),
)
FLAGS = flags.FLAGS | |
def _get_classifier_parameters( | |
num_gpus: int = 0, | |
builder: str = 'records', | |
skip_eval: bool = False, | |
distribution_strategy: str = 'mirrored', | |
per_replica_batch_size: int = 128, | |
epochs: int = 90, | |
steps: int = 0, | |
epochs_between_evals: int = 1, | |
dtype: str = 'float32', | |
enable_xla: bool = False, | |
run_eagerly: bool = False, | |
gpu_thread_mode: Optional[str] = None, | |
dataset_num_private_threads: Optional[int] = None, | |
loss_scale: Optional[str] = None, | |
report_metrics: bool = True, | |
batchnorm_spatial_persistent: bool = False) -> MutableMapping[str, Any]: | |
"""Gets classifier trainer's ResNet parameters.""" | |
return { | |
'runtime': { | |
'num_gpus': num_gpus, | |
'distribution_strategy': distribution_strategy, | |
'run_eagerly': run_eagerly, | |
'enable_xla': enable_xla, | |
'dataset_num_private_threads': dataset_num_private_threads, | |
'gpu_thread_mode': gpu_thread_mode, | |
'loss_scale': loss_scale, | |
'batchnorm_spatial_persistent': batchnorm_spatial_persistent, | |
}, | |
'train_dataset': { | |
'builder': builder, | |
'use_per_replica_batch_size': True, | |
'batch_size': per_replica_batch_size, | |
'image_size': 224, | |
'dtype': dtype, | |
}, | |
'validation_dataset': { | |
'builder': builder, | |
'batch_size': per_replica_batch_size, | |
'use_per_replica_batch_size': True, | |
'image_size': 224, | |
'dtype': dtype, | |
}, | |
'train': { | |
'epochs': epochs, | |
'steps': steps, | |
'callbacks': { | |
'enable_tensorboard': False, | |
'enable_checkpoint_and_export': False, | |
'enable_time_history': True, | |
}, | |
'metrics': ['accuracy'] if report_metrics else [], | |
}, | |
'model': { | |
'loss': { | |
'label_smoothing': 0.1, | |
}, | |
}, | |
'evaluation': { | |
'epochs_between_evals': epochs_between_evals, | |
'skip_eval': skip_eval, | |
}, | |
} | |
class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
  """Benchmark accuracy tests for ResNet50 in Keras."""

  def __init__(self,
               output_dir: Optional[str] = None,
               root_data_dir: Optional[str] = None,
               **kwargs):
    """A benchmark class.

    Args:
      output_dir: directory where to output e.g. log files
      root_data_dir: directory under which to look for dataset
      **kwargs: arbitrary named arguments. This is needed to make the
        constructor forward compatible in case PerfZero provides more
        named arguments before updating the constructor.
    """
    flag_methods = [classifier_trainer.define_classifier_flags]

    # NOTE(review): os.path.join raises TypeError when root_data_dir is None,
    # so callers must supply a real path despite the Optional default.
    self.data_dir = os.path.join(root_data_dir, 'imagenet')
    super().__init__(output_dir=output_dir, flag_methods=flag_methods)

  def _run_and_report_benchmark(
      self,
      experiment_name: str,
      top_1_min: float = MIN_TOP_1_ACCURACY,
      top_1_max: float = MAX_TOP_1_ACCURACY,
      num_gpus: int = 0,
      distribution_strategy: str = 'mirrored',
      per_replica_batch_size: int = 128,
      epochs: int = 90,
      steps: int = 0,
      epochs_between_evals: int = 1,
      dtype: str = 'float32',
      enable_xla: bool = False,
      run_eagerly: bool = False,
      gpu_thread_mode: Optional[str] = None,
      dataset_num_private_threads: Optional[int] = None,
      loss_scale: Optional[str] = None):
    """Runs and reports the benchmark given the provided configuration.

    Sets the classifier trainer flags, runs `classifier_trainer.run`, and
    forwards the returned stats and the measured wall time to the parent's
    `_report_benchmark`.

    Args:
      experiment_name: Folder name (under the output directory) used as the
        model directory.
      top_1_min: Lower accuracy bound forwarded to `_report_benchmark`.
      top_1_max: Upper accuracy bound forwarded to `_report_benchmark`.
      num_gpus: Number of GPUs; also used to compute the total batch size.
      distribution_strategy: Distribution strategy name.
      per_replica_batch_size: Batch size per replica.
      epochs: Number of training epochs.
      steps: Number of training steps.
      epochs_between_evals: Number of epochs between evaluations.
      dtype: Dataset dtype name.
      enable_xla: Whether to enable XLA.
      run_eagerly: Whether to run eagerly.
      gpu_thread_mode: Optional GPU thread mode.
      dataset_num_private_threads: Optional private thread count for datasets.
      loss_scale: Optional loss scale setting.
    """
    FLAGS.model_type = 'resnet'
    FLAGS.dataset = 'imagenet'
    FLAGS.mode = 'train_and_eval'
    FLAGS.data_dir = self.data_dir
    FLAGS.model_dir = self._get_model_dir(experiment_name)
    parameters = _get_classifier_parameters(
        num_gpus=num_gpus,
        distribution_strategy=distribution_strategy,
        per_replica_batch_size=per_replica_batch_size,
        epochs=epochs,
        steps=steps,
        epochs_between_evals=epochs_between_evals,
        dtype=dtype,
        enable_xla=enable_xla,
        run_eagerly=run_eagerly,
        gpu_thread_mode=gpu_thread_mode,
        dataset_num_private_threads=dataset_num_private_threads,
        report_metrics=True,
        loss_scale=loss_scale,
        batchnorm_spatial_persistent=True)
    FLAGS.params_override = json.dumps(parameters)
    total_batch_size = num_gpus * per_replica_batch_size

    start_time_sec = time.time()
    stats = classifier_trainer.run(flags.FLAGS)
    wall_time_sec = time.time() - start_time_sec

    super()._report_benchmark(
        stats,
        wall_time_sec,
        top_1_min=top_1_min,
        top_1_max=top_1_max,
        total_batch_size=total_batch_size,
        log_steps=100)

  def benchmark_8_gpu(self):
    """Tests Keras model with eager, dist_strat and 8 GPUs."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_8_gpu',
        num_gpus=8,
        per_replica_batch_size=128,
        epochs=90,
        epochs_between_evals=10,
        dtype='float32')

  def benchmark_8_gpu_fp16(self):
    """Tests Keras model with eager, dist_strat, 8 GPUs, and fp16."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_8_gpu_fp16',
        num_gpus=8,
        per_replica_batch_size=256,
        epochs=90,
        epochs_between_evals=10,
        dtype='float16')

  def benchmark_xla_8_gpu_fp16(self):
    """Tests Keras model with XLA, eager, dist_strat, 8 GPUs and fp16."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_xla_8_gpu_fp16',
        num_gpus=8,
        per_replica_batch_size=256,
        epochs=90,
        epochs_between_evals=10,
        dtype='float16',
        enable_xla=True)

  def benchmark_xla_8_gpu_fp16_dynamic(self):
    """Tests Keras model with XLA, eager, dist_strat, 8 GPUs, dynamic fp16."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_xla_8_gpu_fp16_dynamic',
        top_1_min=0.736,
        num_gpus=8,
        per_replica_batch_size=256,
        epochs=90,
        epochs_between_evals=10,
        dtype='float16',
        # The benchmark name and docstring promise XLA, so request it
        # explicitly, as benchmark_xla_8_gpu_fp16 does.
        enable_xla=True,
        loss_scale='dynamic')

  def _get_model_dir(self, folder_name):
    """Returns `folder_name` joined onto this benchmark's output directory."""
    return os.path.join(self.output_dir, folder_name)
class MobilenetV1KerasAccuracy(keras_benchmark.KerasBenchmark):
  """Accuracy benchmark for MobilenetV1 driven by `resnet_imagenet_main`."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """Configures flag definitions, data directory and MobileNet defaults.

    Args:
      output_dir: directory where to output e.g. log files
      root_data_dir: directory under which to look for dataset
      **kwargs: arbitrary named arguments. This is needed to make the
        constructor forward compatible in case PerfZero provides more
        named arguments before updating the constructor.
    """
    flag_definers = [resnet_imagenet_main.define_imagenet_keras_flags]

    self.data_dir = os.path.join(root_data_dir, 'imagenet')

    mobilenet_defaults = {
        'model': 'mobilenet',
        'optimizer': 'mobilenet_default',
        'initial_learning_rate_per_sample': 0.00039,
    }
    super().__init__(
        output_dir=output_dir,
        flag_methods=flag_definers,
        default_flags=mobilenet_defaults)

  def benchmark_8_gpu(self):
    """Runs the 8-GPU fp32 accuracy benchmark with eager enabled."""
    self._setup()

    gpu_count = 8
    FLAGS.num_gpus = gpu_count
    FLAGS.data_dir = self.data_dir
    FLAGS.batch_size = 128 * gpu_count
    FLAGS.train_epochs = 90
    FLAGS.epochs_between_evals = 10
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
    FLAGS.dtype = 'fp32'
    FLAGS.enable_eager = True
    self._run_and_report_benchmark()

  def _run_and_report_benchmark(self,
                                top_1_min=MOBILENET_V1_MIN_TOP_1_ACCURACY,
                                top_1_max=MOBILENET_V1_MAX_TOP_1_ACCURACY):
    """Times `resnet_imagenet_main.run` and reports the resulting stats.

    Args:
      top_1_min: Lower accuracy bound forwarded to `_report_benchmark`.
      top_1_max: Upper accuracy bound forwarded to `_report_benchmark`.
    """
    started_at = time.time()
    run_stats = resnet_imagenet_main.run(FLAGS)
    elapsed_sec = time.time() - started_at

    super()._report_benchmark(
        run_stats,
        elapsed_sec,
        top_1_min=top_1_min,
        top_1_max=top_1_max,
        total_batch_size=FLAGS.batch_size,
        log_steps=100)

  def _get_model_dir(self, folder_name):
    """Returns `folder_name` joined onto this benchmark's output directory."""
    return os.path.join(self.output_dir, folder_name)
class Resnet50KerasClassifierBenchmarkBase(keras_benchmark.KerasBenchmark):
  """Resnet50 (classifier_trainer) benchmarks."""

  def __init__(self, output_dir=None, default_flags=None,
               tpu=None, dataset_builder='records', train_epochs=1,
               train_steps=110, data_dir=None):
    """Stores the dataset/training settings and initializes the parent.

    Args:
      output_dir: Forwarded to the parent constructor.
      default_flags: Forwarded to the parent constructor.
      tpu: Forwarded to the parent constructor.
      dataset_builder: Dataset builder name passed to the trainer parameters.
      train_epochs: Number of training epochs for every benchmark run.
      train_steps: Default number of training steps for every benchmark run.
      data_dir: Value assigned to `FLAGS.data_dir` before each run.
    """
    flag_methods = [classifier_trainer.define_classifier_flags]

    self.dataset_builder = dataset_builder
    self.train_epochs = train_epochs
    self.train_steps = train_steps
    self.data_dir = data_dir

    super().__init__(
        output_dir=output_dir,
        flag_methods=flag_methods,
        default_flags=default_flags,
        tpu=tpu)

  def _run_and_report_benchmark(
      self,
      experiment_name: str,
      skip_steps: Optional[int] = None,
      top_1_min: float = MIN_TOP_1_ACCURACY,
      top_1_max: float = MAX_TOP_1_ACCURACY,
      num_gpus: int = 0,
      num_tpus: int = 0,
      distribution_strategy: str = 'mirrored',
      per_replica_batch_size: int = 128,
      epochs_between_evals: int = 1,
      dtype: str = 'float32',
      enable_xla: bool = False,
      run_eagerly: bool = False,
      gpu_thread_mode: Optional[str] = None,
      dataset_num_private_threads: Optional[int] = None,
      loss_scale: Optional[str] = None,
      steps: Optional[int] = None):
    """Runs and reports the benchmark given the provided configuration.

    Args:
      experiment_name: Folder name passed to `_get_model_dir`.
      skip_steps: Number of steps to exclude from the performance report. When
        falsy, all but the last 100 training steps are excluded.
      top_1_min: Unused here; evaluation is skipped for these benchmarks.
      top_1_max: Unused here; evaluation is skipped for these benchmarks.
      num_gpus: Number of GPUs; used for the total batch size unless the
        strategy is 'tpu'.
      num_tpus: Number of TPU replicas; used for the total batch size when the
        strategy is 'tpu'.
      distribution_strategy: Distribution strategy name.
      per_replica_batch_size: Batch size per replica.
      epochs_between_evals: Number of epochs between evaluations.
      dtype: Dataset dtype name.
      enable_xla: Whether to enable XLA.
      run_eagerly: Whether to run eagerly.
      gpu_thread_mode: Optional GPU thread mode.
      dataset_num_private_threads: Optional private thread count for datasets.
      loss_scale: Optional loss scale setting.
      steps: Optional per-run override of `self.train_steps`.
    """
    # A per-call override lets a single benchmark train longer without
    # mutating the instance-wide default.
    train_steps = self.train_steps if steps is None else steps

    FLAGS.model_type = 'resnet'
    FLAGS.dataset = 'imagenet'
    FLAGS.mode = 'train_and_eval'
    FLAGS.data_dir = self.data_dir
    FLAGS.model_dir = self._get_model_dir(experiment_name)
    parameters = _get_classifier_parameters(
        builder=self.dataset_builder,
        skip_eval=True,
        num_gpus=num_gpus,
        distribution_strategy=distribution_strategy,
        per_replica_batch_size=per_replica_batch_size,
        epochs=self.train_epochs,
        steps=train_steps,
        epochs_between_evals=epochs_between_evals,
        dtype=dtype,
        enable_xla=enable_xla,
        # Previously dropped, which left the *_run_eagerly benchmarks running
        # with the default run_eagerly=False.
        run_eagerly=run_eagerly,
        gpu_thread_mode=gpu_thread_mode,
        dataset_num_private_threads=dataset_num_private_threads,
        loss_scale=loss_scale,
        report_metrics=False,
        batchnorm_spatial_persistent=True)
    FLAGS.params_override = json.dumps(parameters)
    if distribution_strategy == 'tpu':
      total_batch_size = num_tpus * per_replica_batch_size
    else:
      total_batch_size = num_gpus * per_replica_batch_size

    start_time_sec = time.time()
    stats = classifier_trainer.run(flags.FLAGS)
    wall_time_sec = time.time() - start_time_sec
    # Number of logged step time entries that are excluded in performance
    # report. We keep results from last 100 batches, or skip the steps based on
    # input skip_steps.
    warmup = (skip_steps or (train_steps - 100)) // FLAGS.log_steps

    super()._report_benchmark(
        stats,
        wall_time_sec,
        total_batch_size=total_batch_size,
        log_steps=FLAGS.log_steps,
        warmup=warmup,
        start_time_sec=start_time_sec)

  def benchmark_1_gpu_no_dist_strat(self):
    """Tests Keras model with 1 GPU, no distribution strategy."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_1_gpu_no_dist_strat',
        num_gpus=1,
        distribution_strategy='off',
        per_replica_batch_size=128)

  def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
    """Tests Keras model with 1 GPU, no distribution strategy, run eagerly."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_1_gpu_no_dist_strat_run_eagerly',
        num_gpus=1,
        run_eagerly=True,
        distribution_strategy='off',
        per_replica_batch_size=64)

  def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16(self):
    """Tests with 1 GPU, no distribution strategy, fp16, run eagerly."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_1_gpu_no_dist_strat_run_eagerly_fp16',
        num_gpus=1,
        run_eagerly=True,
        distribution_strategy='off',
        dtype='float16',
        per_replica_batch_size=128)

  def benchmark_1_gpu(self):
    """Tests Keras model with 1 GPU."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_1_gpu',
        num_gpus=1,
        distribution_strategy='one_device',
        per_replica_batch_size=128)

  def benchmark_xla_1_gpu(self):
    """Tests Keras model with XLA and 1 GPU."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_xla_1_gpu',
        num_gpus=1,
        enable_xla=True,
        distribution_strategy='one_device',
        per_replica_batch_size=128)

  def benchmark_1_gpu_fp16(self):
    """Tests Keras model with 1 GPU and fp16."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_1_gpu_fp16',
        num_gpus=1,
        distribution_strategy='one_device',
        dtype='float16',
        per_replica_batch_size=256)

  def benchmark_1_gpu_fp16_dynamic(self):
    """Tests Keras model with 1 GPU, fp16, and dynamic loss scaling."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_1_gpu_fp16_dynamic',
        num_gpus=1,
        distribution_strategy='one_device',
        dtype='float16',
        per_replica_batch_size=256,
        loss_scale='dynamic')

  def benchmark_xla_1_gpu_fp16(self):
    """Tests Keras model with XLA, 1 GPU and fp16."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_xla_1_gpu_fp16',
        num_gpus=1,
        enable_xla=True,
        distribution_strategy='one_device',
        dtype='float16',
        per_replica_batch_size=256)

  def benchmark_xla_1_gpu_fp16_tweaked(self):
    """Tests Keras model with XLA, 1 GPU, fp16, and manual config tuning."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_xla_1_gpu_fp16_tweaked',
        num_gpus=1,
        enable_xla=True,
        distribution_strategy='one_device',
        dtype='float16',
        per_replica_batch_size=256,
        gpu_thread_mode='gpu_private')

  def benchmark_xla_1_gpu_fp16_dynamic(self):
    """Tests Keras model with XLA, 1 GPU, fp16, and dynamic loss scaling."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_xla_1_gpu_fp16_dynamic',
        num_gpus=1,
        enable_xla=True,
        distribution_strategy='one_device',
        dtype='float16',
        per_replica_batch_size=256,
        loss_scale='dynamic')

  def benchmark_8_gpu(self):
    """Tests Keras model with 8 GPUs."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_8_gpu',
        num_gpus=8,
        distribution_strategy='mirrored',
        per_replica_batch_size=128)

  def benchmark_8_gpu_tweaked(self):
    """Tests Keras model with manual config tuning and 8 GPUs."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_8_gpu_tweaked',
        num_gpus=8,
        distribution_strategy='mirrored',
        per_replica_batch_size=128,
        dataset_num_private_threads=14)

  def benchmark_xla_8_gpu(self):
    """Tests Keras model with XLA and 8 GPUs."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_xla_8_gpu',
        num_gpus=8,
        enable_xla=True,
        distribution_strategy='mirrored',
        per_replica_batch_size=128)

  def benchmark_xla_8_gpu_tweaked(self):
    """Tests Keras model with manual config tuning, 8 GPUs, and XLA."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_xla_8_gpu_tweaked',
        num_gpus=8,
        enable_xla=True,
        distribution_strategy='mirrored',
        per_replica_batch_size=128,
        gpu_thread_mode='gpu_private',
        dataset_num_private_threads=24)

  def benchmark_8_gpu_fp16(self):
    """Tests Keras model with 8 GPUs and fp16."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_8_gpu_fp16',
        num_gpus=8,
        dtype='float16',
        distribution_strategy='mirrored',
        per_replica_batch_size=256)

  def benchmark_8_gpu_fp16_tweaked(self):
    """Tests Keras model with 8 GPUs, fp16, and manual config tuning."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_8_gpu_fp16_tweaked',
        num_gpus=8,
        dtype='float16',
        distribution_strategy='mirrored',
        per_replica_batch_size=256,
        gpu_thread_mode='gpu_private',
        dataset_num_private_threads=40)

  def benchmark_8_gpu_fp16_dynamic_tweaked(self):
    """Tests Keras model with 8 GPUs, fp16, dynamic loss scaling, and tuned."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_8_gpu_fp16_dynamic_tweaked',
        num_gpus=8,
        dtype='float16',
        distribution_strategy='mirrored',
        per_replica_batch_size=256,
        loss_scale='dynamic',
        gpu_thread_mode='gpu_private',
        dataset_num_private_threads=40)

  def benchmark_xla_8_gpu_fp16(self):
    """Tests Keras model with XLA, 8 GPUs and fp16."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_xla_8_gpu_fp16',
        dtype='float16',
        num_gpus=8,
        enable_xla=True,
        distribution_strategy='mirrored',
        per_replica_batch_size=256)

  def benchmark_xla_8_gpu_fp16_tweaked(self):
    """Test Keras model with manual config tuning, XLA, 8 GPUs and fp16."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_xla_8_gpu_fp16_tweaked',
        dtype='float16',
        num_gpus=8,
        enable_xla=True,
        distribution_strategy='mirrored',
        per_replica_batch_size=256,
        gpu_thread_mode='gpu_private',
        dataset_num_private_threads=48)

  def benchmark_xla_8_gpu_fp16_tweaked_delay_measure(self):
    """Tests with manual config tuning, XLA, 8 GPUs and fp16.

    Delay performance measurement for stable performance on 96 vCPU platforms.
    """
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_xla_8_gpu_fp16_tweaked_delay_measure',
        dtype='float16',
        num_gpus=8,
        enable_xla=True,
        distribution_strategy='mirrored',
        per_replica_batch_size=256,
        gpu_thread_mode='gpu_private',
        dataset_num_private_threads=48,
        # Train for 310 steps so the reported window (last 100 steps) starts
        # later than with the default step count.
        steps=310)

  def benchmark_xla_8_gpu_fp16_dynamic_tweaked(self):
    """Tests Keras model with config tuning, XLA, 8 GPUs and dynamic fp16."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_xla_8_gpu_fp16_dynamic_tweaked',
        dtype='float16',
        num_gpus=8,
        enable_xla=True,
        distribution_strategy='mirrored',
        per_replica_batch_size=256,
        gpu_thread_mode='gpu_private',
        loss_scale='dynamic',
        dataset_num_private_threads=48)

  def benchmark_2x2_tpu_bf16(self):
    """Test Keras model with 2x2 TPU, bf16."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_2x2_tpu_bf16',
        dtype='bfloat16',
        num_tpus=8,
        distribution_strategy='tpu',
        per_replica_batch_size=128)

  def benchmark_4x4_tpu_bf16(self):
    """Test Keras model with 4x4 TPU, bf16."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_4x4_tpu_bf16',
        dtype='bfloat16',
        num_tpus=32,
        distribution_strategy='tpu',
        per_replica_batch_size=128)

  def benchmark_8x8_tpu_bf16(self):
    """Test Keras model with 8x8 TPU, bf16."""
    self._setup()
    self._run_and_report_benchmark(
        experiment_name='benchmark_8x8_tpu_bf16',
        dtype='bfloat16',
        num_tpus=128,
        distribution_strategy='tpu',
        per_replica_batch_size=64)

  def fill_report_object(self, stats):
    """Delegates to the parent with the flag batch size and log steps."""
    super().fill_report_object(
        stats,
        total_batch_size=FLAGS.batch_size,
        log_steps=FLAGS.log_steps)
class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark): | |
"""Resnet50 benchmarks.""" | |
def __init__(self, output_dir=None, default_flags=None, tpu=None):
  """Initializes the parent with the ImageNet Keras flag definitions.

  Args:
    output_dir: Forwarded to the parent constructor.
    default_flags: Forwarded to the parent constructor.
    tpu: Forwarded to the parent constructor.
  """
  parent = super(Resnet50KerasBenchmarkBase, self)
  parent.__init__(
      output_dir=output_dir,
      flag_methods=[resnet_imagenet_main.define_imagenet_keras_flags],
      default_flags=default_flags,
      tpu=tpu)
def _run_and_report_benchmark(self, skip_steps=None):
  """Times `resnet_imagenet_main.run(FLAGS)` and reports the results.

  Args:
    skip_steps: Number of steps to exclude from the performance report. When
      falsy, all but the last 100 training steps are excluded.
  """
  began_at = time.time()
  run_stats = resnet_imagenet_main.run(FLAGS)
  elapsed_sec = time.time() - began_at

  # Convert the number of excluded steps into a count of logged step-time
  # entries, since one entry is logged every FLAGS.log_steps steps.
  excluded_steps = skip_steps or (FLAGS.train_steps - 100)
  warmup_entries = excluded_steps // FLAGS.log_steps

  super(Resnet50KerasBenchmarkBase, self)._report_benchmark(
      run_stats,
      elapsed_sec,
      total_batch_size=FLAGS.batch_size,
      log_steps=FLAGS.log_steps,
      warmup=warmup_entries,
      start_time_sec=began_at)
def benchmark_1_gpu_no_dist_strat(self):
  """Runs on 1 GPU with the distribution strategy set to 'off'."""
  experiment = 'benchmark_1_gpu_no_dist_strat'
  self._setup()

  FLAGS.num_gpus = 1
  FLAGS.enable_eager = True
  FLAGS.distribution_strategy = 'off'
  FLAGS.model_dir = self._get_model_dir(experiment)
  FLAGS.batch_size = 128
  self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
  """Runs on 1 GPU, strategy 'off', with `run_eagerly` turned on."""
  experiment = 'benchmark_1_gpu_no_dist_strat_run_eagerly'
  self._setup()

  FLAGS.num_gpus = 1
  FLAGS.enable_eager = True
  FLAGS.run_eagerly = True
  FLAGS.distribution_strategy = 'off'
  FLAGS.model_dir = self._get_model_dir(experiment)
  FLAGS.batch_size = 64
  self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked(self):
  """Runs on 1 GPU, strategy 'off', run eagerly, explicit GPU placement."""
  experiment = 'benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked'
  self._setup()

  FLAGS.num_gpus = 1
  FLAGS.enable_eager = True
  FLAGS.run_eagerly = True
  FLAGS.explicit_gpu_placement = True
  FLAGS.distribution_strategy = 'off'
  FLAGS.model_dir = self._get_model_dir(experiment)
  FLAGS.batch_size = 64
  self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16(self):
  """Runs on 1 GPU, strategy 'off', run eagerly, with fp16."""
  experiment = 'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16'
  self._setup()

  FLAGS.num_gpus = 1
  FLAGS.enable_eager = True
  FLAGS.run_eagerly = True
  FLAGS.distribution_strategy = 'off'
  FLAGS.model_dir = self._get_model_dir(experiment)
  FLAGS.dtype = 'fp16'
  FLAGS.batch_size = 128
  self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked(self):
  """Runs on 1 GPU, strategy 'off', eagerly, fp16, explicit GPU placement."""
  experiment = 'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked'
  self._setup()

  FLAGS.num_gpus = 1
  FLAGS.enable_eager = True
  FLAGS.run_eagerly = True
  FLAGS.explicit_gpu_placement = True
  FLAGS.distribution_strategy = 'off'
  FLAGS.model_dir = self._get_model_dir(experiment)
  FLAGS.dtype = 'fp16'
  FLAGS.batch_size = 128
  self._run_and_report_benchmark()
def benchmark_1_gpu(self):
  """Runs on 1 GPU with the 'one_device' distribution strategy."""
  experiment = 'benchmark_1_gpu'
  self._setup()

  FLAGS.num_gpus = 1
  FLAGS.enable_eager = True
  FLAGS.distribution_strategy = 'one_device'
  FLAGS.model_dir = self._get_model_dir(experiment)
  FLAGS.batch_size = 128
  self._run_and_report_benchmark()
def benchmark_1_gpu_amp(self):
  """Runs on 1 GPU with automatic mixed precision (fp16 graph rewrite)."""
  experiment = 'benchmark_1_gpu_amp'
  self._setup()

  FLAGS.num_gpus = 1
  FLAGS.enable_eager = True
  # dtype is assigned before fp16_implementation, as in the original order.
  FLAGS.dtype = 'fp16'
  FLAGS.fp16_implementation = 'graph_rewrite'
  FLAGS.distribution_strategy = 'one_device'
  FLAGS.model_dir = self._get_model_dir(experiment)
  FLAGS.batch_size = 256
  self._run_and_report_benchmark()
def benchmark_xla_1_gpu(self):
  """Runs on 1 GPU with XLA enabled and the 'one_device' strategy."""
  experiment = 'benchmark_xla_1_gpu'
  self._setup()

  FLAGS.num_gpus = 1
  FLAGS.enable_eager = True
  FLAGS.enable_xla = True
  FLAGS.distribution_strategy = 'one_device'
  FLAGS.model_dir = self._get_model_dir(experiment)
  FLAGS.batch_size = 128
  self._run_and_report_benchmark()
def benchmark_xla_1_gpu_amp(self):
  """Runs on 1 GPU with XLA and automatic mixed precision (graph rewrite)."""
  experiment = 'benchmark_xla_1_gpu_amp'
  self._setup()

  FLAGS.num_gpus = 1
  FLAGS.enable_eager = True
  # dtype is assigned before fp16_implementation, as in the original order.
  FLAGS.dtype = 'fp16'
  FLAGS.fp16_implementation = 'graph_rewrite'
  FLAGS.enable_xla = True
  FLAGS.distribution_strategy = 'one_device'
  FLAGS.model_dir = self._get_model_dir(experiment)
  FLAGS.batch_size = 256
  self._run_and_report_benchmark()
def benchmark_1_gpu_fp16(self):
  """Runs on 1 GPU with fp16 and the 'one_device' strategy."""
  experiment = 'benchmark_1_gpu_fp16'
  self._setup()

  FLAGS.num_gpus = 1
  FLAGS.enable_eager = True
  FLAGS.distribution_strategy = 'one_device'
  FLAGS.model_dir = self._get_model_dir(experiment)
  FLAGS.dtype = 'fp16'
  FLAGS.batch_size = 256
  self._run_and_report_benchmark()
def benchmark_1_gpu_fp16_dynamic(self):
  """Runs on 1 GPU with fp16 and dynamic loss scaling."""
  experiment = 'benchmark_1_gpu_fp16_dynamic'
  self._setup()

  FLAGS.num_gpus = 1
  FLAGS.enable_eager = True
  FLAGS.distribution_strategy = 'one_device'
  FLAGS.model_dir = self._get_model_dir(experiment)
  FLAGS.dtype = 'fp16'
  FLAGS.batch_size = 256
  FLAGS.loss_scale = 'dynamic'
  self._run_and_report_benchmark()
def benchmark_xla_1_gpu_fp16(self):
  """Runs on 1 GPU with XLA enabled and fp16."""
  experiment = 'benchmark_xla_1_gpu_fp16'
  self._setup()

  FLAGS.num_gpus = 1
  FLAGS.enable_eager = True
  FLAGS.enable_xla = True
  FLAGS.distribution_strategy = 'one_device'
  FLAGS.model_dir = self._get_model_dir(experiment)
  FLAGS.dtype = 'fp16'
  FLAGS.batch_size = 256
  self._run_and_report_benchmark()
def benchmark_xla_1_gpu_fp16_tweaked(self):
  """Runs on 1 GPU with XLA, fp16 and the 'gpu_private' thread mode."""
  experiment = 'benchmark_xla_1_gpu_fp16_tweaked'
  self._setup()

  FLAGS.num_gpus = 1
  FLAGS.enable_eager = True
  FLAGS.enable_xla = True
  FLAGS.distribution_strategy = 'one_device'
  FLAGS.model_dir = self._get_model_dir(experiment)
  FLAGS.dtype = 'fp16'
  FLAGS.batch_size = 256
  FLAGS.tf_gpu_thread_mode = 'gpu_private'
  self._run_and_report_benchmark()
def benchmark_xla_1_gpu_fp16_dynamic(self):
  """Runs on 1 GPU with XLA, fp16 and dynamic loss scaling."""
  experiment = 'benchmark_xla_1_gpu_fp16_dynamic'
  self._setup()

  FLAGS.num_gpus = 1
  FLAGS.enable_eager = True
  FLAGS.enable_xla = True
  FLAGS.distribution_strategy = 'one_device'
  FLAGS.model_dir = self._get_model_dir(experiment)
  FLAGS.dtype = 'fp16'
  FLAGS.batch_size = 256
  FLAGS.loss_scale = 'dynamic'
  self._run_and_report_benchmark()
def benchmark_8_gpu(self):
  """Runs on 8 GPUs with the 'mirrored' distribution strategy."""
  experiment = 'benchmark_8_gpu'
  gpu_count = 8
  self._setup()

  FLAGS.num_gpus = gpu_count
  FLAGS.enable_eager = True
  FLAGS.distribution_strategy = 'mirrored'
  FLAGS.model_dir = self._get_model_dir(experiment)
  # Global batch size: 128 per GPU.
  FLAGS.batch_size = 128 * gpu_count
  self._run_and_report_benchmark()
def benchmark_8_gpu_amp(self):
  """Runs on 8 GPUs with automatic mixed precision (fp16 graph rewrite)."""
  experiment = 'benchmark_8_gpu_amp'
  gpu_count = 8
  self._setup()

  FLAGS.num_gpus = gpu_count
  FLAGS.enable_eager = True
  # dtype is assigned before fp16_implementation, as in the original order.
  FLAGS.dtype = 'fp16'
  FLAGS.fp16_implementation = 'graph_rewrite'
  FLAGS.distribution_strategy = 'mirrored'
  FLAGS.model_dir = self._get_model_dir(experiment)
  # Global batch size: 256 per GPU.
  FLAGS.batch_size = 256 * gpu_count
  self._run_and_report_benchmark()
def benchmark_8_gpu_tweaked(self):
  """Runs on 8 GPUs with 14 private dataset threads."""
  experiment = 'benchmark_8_gpu_tweaked'
  gpu_count = 8
  self._setup()

  FLAGS.num_gpus = gpu_count
  FLAGS.enable_eager = True
  FLAGS.distribution_strategy = 'mirrored'
  FLAGS.model_dir = self._get_model_dir(experiment)
  # Global batch size: 128 per GPU.
  FLAGS.batch_size = 128 * gpu_count
  FLAGS.datasets_num_private_threads = 14
  self._run_and_report_benchmark()
def benchmark_xla_8_gpu(self):
  """Runs on 8 GPUs with XLA enabled and the 'mirrored' strategy."""
  experiment = 'benchmark_xla_8_gpu'
  gpu_count = 8
  self._setup()

  FLAGS.num_gpus = gpu_count
  FLAGS.enable_eager = True
  FLAGS.enable_xla = True
  FLAGS.distribution_strategy = 'mirrored'
  FLAGS.model_dir = self._get_model_dir(experiment)
  # Global batch size: 128 per GPU.
  FLAGS.batch_size = 128 * gpu_count
  self._run_and_report_benchmark()
def benchmark_xla_8_gpu_amp(self):
  """Runs on 8 GPUs with XLA and automatic mixed precision (graph rewrite)."""
  experiment = 'benchmark_xla_8_gpu_amp'
  gpu_count = 8
  self._setup()

  FLAGS.num_gpus = gpu_count
  FLAGS.enable_eager = True
  # dtype is assigned before fp16_implementation, as in the original order.
  FLAGS.dtype = 'fp16'
  FLAGS.fp16_implementation = 'graph_rewrite'
  FLAGS.enable_xla = True
  FLAGS.distribution_strategy = 'mirrored'
  FLAGS.model_dir = self._get_model_dir(experiment)
  # Global batch size: 256 per GPU.
  FLAGS.batch_size = 256 * gpu_count
  self._run_and_report_benchmark()
def benchmark_xla_8_gpu_tweaked(self):
  """Runs on 8 GPUs with XLA, 'gpu_private' threads, 24 dataset threads."""
  experiment = 'benchmark_xla_8_gpu_tweaked'
  gpu_count = 8
  self._setup()

  FLAGS.num_gpus = gpu_count
  FLAGS.enable_eager = True
  FLAGS.enable_xla = True
  FLAGS.distribution_strategy = 'mirrored'
  FLAGS.model_dir = self._get_model_dir(experiment)
  # Global batch size: 128 per GPU.
  FLAGS.batch_size = 128 * gpu_count
  FLAGS.tf_gpu_thread_mode = 'gpu_private'
  FLAGS.datasets_num_private_threads = 24
  self._run_and_report_benchmark()
def benchmark_8_gpu_fp16(self):
  """Runs on 8 GPUs with fp16 and the 'mirrored' strategy."""
  experiment = 'benchmark_8_gpu_fp16'
  gpu_count = 8
  self._setup()

  FLAGS.num_gpus = gpu_count
  FLAGS.dtype = 'fp16'
  FLAGS.enable_eager = True
  FLAGS.distribution_strategy = 'mirrored'
  FLAGS.model_dir = self._get_model_dir(experiment)
  # Global batch size: 256 per GPU.
  FLAGS.batch_size = 256 * gpu_count
  self._run_and_report_benchmark()
def benchmark_8_gpu_fp16_tweaked(self):
  """Test Keras model with 8 GPUs, fp16, and manual config tuning.

  The tuning consists of the 'gpu_private' GPU thread mode and 40 private
  dataset threads.
  """
  self._setup()

  FLAGS.num_gpus = 8
  FLAGS.dtype = 'fp16'
  FLAGS.enable_eager = True
  FLAGS.distribution_strategy = 'mirrored'
  FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16_tweaked')
  FLAGS.batch_size = 256 * 8  # 8 GPUs
  FLAGS.tf_gpu_thread_mode = 'gpu_private'
  # Fixed flag name: every other benchmark in this class sets
  # `datasets_num_private_threads` (plural); the singular spelling used here
  # before does not match it.
  FLAGS.datasets_num_private_threads = 40
  self._run_and_report_benchmark()
def benchmark_8_gpu_fp16_dynamic_tweaked(self):
  """Test Keras model with 8 GPUs, fp16, dynamic loss scaling, and tuned.

  The tuning consists of the 'gpu_private' GPU thread mode and 40 private
  dataset threads.
  """
  self._setup()

  FLAGS.num_gpus = 8
  FLAGS.dtype = 'fp16'
  FLAGS.enable_eager = True
  FLAGS.distribution_strategy = 'mirrored'
  FLAGS.model_dir = self._get_model_dir(
      'benchmark_8_gpu_fp16_dynamic_tweaked')
  FLAGS.batch_size = 256 * 8  # 8 GPUs
  FLAGS.loss_scale = 'dynamic'
  FLAGS.tf_gpu_thread_mode = 'gpu_private'
  # Fixed flag name: every other benchmark in this class sets
  # `datasets_num_private_threads` (plural); the singular spelling used here
  # before does not match it.
  FLAGS.datasets_num_private_threads = 40
  self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16(self):
  """Benchmark the Keras model on 8 GPUs with XLA and fp16."""
  self._setup()

  gpu_count = 8
  FLAGS.num_gpus = gpu_count
  FLAGS.dtype = 'fp16'
  FLAGS.enable_eager = True
  FLAGS.enable_xla = True
  FLAGS.distribution_strategy = 'mirrored'
  FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16')
  # Batch size is scaled by the number of GPUs.
  FLAGS.batch_size = 256 * gpu_count
  self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16_tweaked(self):
  """Benchmark 8 GPUs with XLA, fp16, and manual config tuning."""
  self._setup()

  gpu_count = 8
  FLAGS.num_gpus = gpu_count
  FLAGS.dtype = 'fp16'
  FLAGS.enable_eager = True
  FLAGS.enable_xla = True
  FLAGS.distribution_strategy = 'mirrored'
  FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16_tweaked')
  # Batch size is scaled by the number of GPUs.
  FLAGS.batch_size = 256 * gpu_count
  # Manual config tuning.
  FLAGS.tf_gpu_thread_mode = 'gpu_private'
  FLAGS.datasets_num_private_threads = 48
  self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16_tweaked_delay_measure(self):
  """Benchmark 8 GPUs with XLA, fp16, and manual config tuning.

  Performance measurement is delayed to get stable performance on 96 vCPU
  platforms.
  """
  self._setup()

  gpu_count = 8
  test_name = 'benchmark_xla_8_gpu_fp16_tweaked_delay_measure'
  FLAGS.num_gpus = gpu_count
  FLAGS.dtype = 'fp16'
  FLAGS.enable_eager = True
  FLAGS.enable_xla = True
  FLAGS.distribution_strategy = 'mirrored'
  FLAGS.model_dir = self._get_model_dir(test_name)
  # Batch size is scaled by the number of GPUs.
  FLAGS.batch_size = 256 * gpu_count
  # Manual config tuning.
  FLAGS.tf_gpu_thread_mode = 'gpu_private'
  FLAGS.datasets_num_private_threads = 48
  FLAGS.train_steps = 310
  self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16_dynamic_tweaked(self):
  """Benchmark 8 GPUs with XLA, fp16 with dynamic loss scale, and tuning."""
  self._setup()

  gpu_count = 8
  test_name = 'benchmark_xla_8_gpu_fp16_dynamic_tweaked'
  FLAGS.num_gpus = gpu_count
  FLAGS.dtype = 'fp16'
  FLAGS.enable_eager = True
  FLAGS.enable_xla = True
  FLAGS.distribution_strategy = 'mirrored'
  FLAGS.model_dir = self._get_model_dir(test_name)
  # Batch size is scaled by the number of GPUs.
  FLAGS.batch_size = 256 * gpu_count
  FLAGS.loss_scale = 'dynamic'
  # Manual config tuning.
  FLAGS.tf_gpu_thread_mode = 'gpu_private'
  FLAGS.datasets_num_private_threads = 48
  self._run_and_report_benchmark()
def benchmark_2x2_tpu_bf16(self):
  """Benchmark the Keras model on a 2x2 TPU using bf16."""
  self._setup()

  test_name = 'benchmark_2x2_tpu_bf16'
  FLAGS.dtype = 'bf16'
  FLAGS.distribution_strategy = 'tpu'
  FLAGS.model_dir = self._get_model_dir(test_name)
  FLAGS.batch_size = 1024
  self._run_and_report_benchmark()
def benchmark_4x4_tpu_bf16(self):
  """Benchmark the Keras model on a 4x4 TPU using bf16."""
  self._setup()

  test_name = 'benchmark_4x4_tpu_bf16'
  FLAGS.dtype = 'bf16'
  FLAGS.distribution_strategy = 'tpu'
  FLAGS.model_dir = self._get_model_dir(test_name)
  FLAGS.batch_size = 4096
  self._run_and_report_benchmark()
def benchmark_8x8_tpu_bf16(self):
  """Benchmark the Keras model on an 8x8 TPU using bf16."""
  self._setup()

  test_name = 'benchmark_8x8_tpu_bf16'
  FLAGS.dtype = 'bf16'
  FLAGS.distribution_strategy = 'tpu'
  FLAGS.model_dir = self._get_model_dir(test_name)
  FLAGS.batch_size = 8192
  self._run_and_report_benchmark()
def fill_report_object(self, stats):
  """Delegates to the parent, passing batch size and log steps from FLAGS."""
  parent = super(Resnet50KerasBenchmarkBase, self)
  parent.fill_report_object(
      stats, total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)
class Resnet50KerasBenchmarkSynth(Resnet50KerasClassifierBenchmarkBase):
  """Resnet50 benchmark tests that run on synthetic data."""

  def __init__(self, output_dir=None, root_data_dir=None, tpu=None, **kwargs):
    # root_data_dir and **kwargs are accepted but not used here.
    super().__init__(
        output_dir=output_dir,
        default_flags={'log_steps': 10},
        tpu=tpu,
        dataset_builder='synthetic',
        train_epochs=1,
        train_steps=110)
class Resnet50KerasBenchmarkReal(Resnet50KerasClassifierBenchmarkBase):
  """Resnet50 benchmark tests that run on real data."""

  def __init__(self, output_dir=None, root_data_dir=None, tpu=None, **kwargs):
    # The dataset is read from the 'imagenet' folder under root_data_dir.
    imagenet_dir = os.path.join(root_data_dir, 'imagenet')
    super().__init__(
        output_dir=output_dir,
        default_flags={'log_steps': 10},
        tpu=tpu,
        dataset_builder='records',
        train_epochs=1,
        train_steps=110,
        data_dir=imagenet_dir)
class Resnet50KerasBenchmarkRemoteData(Resnet50KerasBenchmarkBase):
  """Resnet50 benchmark tests on real data stored in remote storage."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    # Single GPU and pure eager tests are less likely to be input bound and
    # are more stable, so their benchmark methods shorten the run by
    # overriding FLAGS.train_epochs, train_steps and log_steps, and
    # _run_and_report_benchmark() leaves skip_steps at its default for them.
    remote_flags = {
        'skip_eval': True,
        'report_accuracy_metrics': False,
        'data_dir': os.path.join(root_data_dir, 'imagenet'),
        # Defining multiple epochs overrides the train_steps setting in
        # benchmarks.
        'train_epochs': 2,
        # Cache dataset so performance is stable after the first epoch.
        'training_dataset_cache': True,
        'log_steps': 100,
    }
    super().__init__(output_dir=output_dir, default_flags=remote_flags)

  def _override_flags_to_run_test_shorter(self):
    """Sets train_epochs=1, train_steps=300 and log_steps=10."""
    FLAGS.train_epochs = 1
    FLAGS.train_steps = 300
    FLAGS.log_steps = 10

  def benchmark_1_gpu_no_dist_strat(self):
    """Benchmark 1 GPU without a distribution strategy."""
    self._setup()

    test_name = 'benchmark_1_gpu_no_dist_strat'
    FLAGS.num_gpus = 1
    FLAGS.enable_eager = True
    FLAGS.distribution_strategy = 'off'
    FLAGS.model_dir = self._get_model_dir(test_name)
    FLAGS.batch_size = 128
    self._override_flags_to_run_test_shorter()
    self._run_and_report_benchmark()

  def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
    """Benchmark 1 GPU, no distribution strategy, run eagerly."""
    self._setup()

    test_name = 'benchmark_1_gpu_no_dist_strat_run_eagerly'
    FLAGS.num_gpus = 1
    FLAGS.enable_eager = True
    FLAGS.run_eagerly = True
    FLAGS.distribution_strategy = 'off'
    FLAGS.model_dir = self._get_model_dir(test_name)
    FLAGS.batch_size = 64
    self._override_flags_to_run_test_shorter()
    self._run_and_report_benchmark()

  def benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked(self):
    """Benchmark 1 GPU, no strategy, run eagerly, explicit GPU placement."""
    self._setup()

    test_name = 'benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked'
    FLAGS.num_gpus = 1
    FLAGS.enable_eager = True
    FLAGS.run_eagerly = True
    FLAGS.explicit_gpu_placement = True
    FLAGS.distribution_strategy = 'off'
    FLAGS.model_dir = self._get_model_dir(test_name)
    FLAGS.batch_size = 64
    self._override_flags_to_run_test_shorter()
    self._run_and_report_benchmark()

  def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16(self):
    """Benchmark 1 GPU, no distribution strategy, fp16, run eagerly."""
    self._setup()

    test_name = 'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16'
    FLAGS.num_gpus = 1
    FLAGS.enable_eager = True
    FLAGS.run_eagerly = True
    FLAGS.distribution_strategy = 'off'
    FLAGS.model_dir = self._get_model_dir(test_name)
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = 128
    self._override_flags_to_run_test_shorter()
    self._run_and_report_benchmark()

  def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked(self):
    """Benchmark 1 GPU, no strategy, fp16, eager, explicit GPU placement."""
    self._setup()

    test_name = 'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked'
    FLAGS.num_gpus = 1
    FLAGS.enable_eager = True
    FLAGS.run_eagerly = True
    FLAGS.explicit_gpu_placement = True
    FLAGS.distribution_strategy = 'off'
    FLAGS.model_dir = self._get_model_dir(test_name)
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = 128
    self._override_flags_to_run_test_shorter()
    self._run_and_report_benchmark()

  def benchmark_1_gpu(self):
    """Benchmark the Keras model on 1 GPU."""
    self._setup()

    test_name = 'benchmark_1_gpu'
    FLAGS.num_gpus = 1
    FLAGS.enable_eager = True
    FLAGS.distribution_strategy = 'one_device'
    FLAGS.model_dir = self._get_model_dir(test_name)
    FLAGS.batch_size = 128
    self._override_flags_to_run_test_shorter()
    self._run_and_report_benchmark()

  def benchmark_1_gpu_amp(self):
    """Benchmark 1 GPU with automatic mixed precision."""
    self._setup()

    test_name = 'benchmark_1_gpu_amp'
    FLAGS.num_gpus = 1
    FLAGS.enable_eager = True
    FLAGS.dtype = 'fp16'
    FLAGS.fp16_implementation = 'graph_rewrite'
    FLAGS.distribution_strategy = 'one_device'
    FLAGS.model_dir = self._get_model_dir(test_name)
    FLAGS.batch_size = 256
    self._override_flags_to_run_test_shorter()
    self._run_and_report_benchmark()

  def benchmark_xla_1_gpu(self):
    """Benchmark 1 GPU with XLA."""
    self._setup()

    test_name = 'benchmark_xla_1_gpu'
    FLAGS.num_gpus = 1
    FLAGS.enable_eager = True
    FLAGS.enable_xla = True
    FLAGS.distribution_strategy = 'one_device'
    FLAGS.model_dir = self._get_model_dir(test_name)
    FLAGS.batch_size = 128
    self._override_flags_to_run_test_shorter()
    self._run_and_report_benchmark()

  def benchmark_xla_1_gpu_amp(self):
    """Benchmark 1 GPU with XLA and automatic mixed precision."""
    self._setup()

    test_name = 'benchmark_xla_1_gpu_amp'
    FLAGS.num_gpus = 1
    FLAGS.enable_eager = True
    FLAGS.dtype = 'fp16'
    FLAGS.fp16_implementation = 'graph_rewrite'
    FLAGS.enable_xla = True
    FLAGS.distribution_strategy = 'one_device'
    FLAGS.model_dir = self._get_model_dir(test_name)
    FLAGS.batch_size = 256
    self._override_flags_to_run_test_shorter()
    self._run_and_report_benchmark()

  def benchmark_1_gpu_fp16(self):
    """Benchmark 1 GPU with fp16."""
    self._setup()

    test_name = 'benchmark_1_gpu_fp16'
    FLAGS.num_gpus = 1
    FLAGS.enable_eager = True
    FLAGS.distribution_strategy = 'one_device'
    FLAGS.model_dir = self._get_model_dir(test_name)
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = 256
    self._override_flags_to_run_test_shorter()
    self._run_and_report_benchmark()

  def benchmark_1_gpu_fp16_dynamic(self):
    """Benchmark 1 GPU with fp16 and dynamic loss scaling."""
    self._setup()

    test_name = 'benchmark_1_gpu_fp16_dynamic'
    FLAGS.num_gpus = 1
    FLAGS.enable_eager = True
    FLAGS.distribution_strategy = 'one_device'
    FLAGS.model_dir = self._get_model_dir(test_name)
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = 256
    FLAGS.loss_scale = 'dynamic'
    self._override_flags_to_run_test_shorter()
    self._run_and_report_benchmark()

  def benchmark_xla_1_gpu_fp16(self):
    """Benchmark 1 GPU with XLA and fp16."""
    self._setup()

    test_name = 'benchmark_xla_1_gpu_fp16'
    FLAGS.num_gpus = 1
    FLAGS.enable_eager = True
    FLAGS.enable_xla = True
    FLAGS.distribution_strategy = 'one_device'
    FLAGS.model_dir = self._get_model_dir(test_name)
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = 256
    self._override_flags_to_run_test_shorter()
    self._run_and_report_benchmark()

  def benchmark_xla_1_gpu_fp16_tweaked(self):
    """Benchmark 1 GPU with XLA, fp16, and manual config tuning."""
    self._setup()

    test_name = 'benchmark_xla_1_gpu_fp16_tweaked'
    FLAGS.num_gpus = 1
    FLAGS.enable_eager = True
    FLAGS.enable_xla = True
    FLAGS.distribution_strategy = 'one_device'
    FLAGS.model_dir = self._get_model_dir(test_name)
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = 256
    FLAGS.tf_gpu_thread_mode = 'gpu_private'
    self._override_flags_to_run_test_shorter()
    self._run_and_report_benchmark()

  def benchmark_xla_1_gpu_fp16_dynamic(self):
    """Benchmark 1 GPU with XLA, fp16, and dynamic loss scaling."""
    self._setup()

    test_name = 'benchmark_xla_1_gpu_fp16_dynamic'
    FLAGS.num_gpus = 1
    FLAGS.enable_eager = True
    FLAGS.enable_xla = True
    FLAGS.distribution_strategy = 'one_device'
    FLAGS.model_dir = self._get_model_dir(test_name)
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = 256
    FLAGS.loss_scale = 'dynamic'
    self._override_flags_to_run_test_shorter()
    self._run_and_report_benchmark()

  def _run_and_report_benchmark(self):
    """Runs the parent benchmark, choosing skip_steps from the flags."""
    # Single GPU and pure eager tests are less likely to be input bound and
    # are more stable, so they run for a shorter time and use the default
    # skip_steps. Every other test skips the first epoch for performance
    # measurement.
    short_run = FLAGS.num_gpus == 1 or FLAGS.run_eagerly
    skip_steps = None if short_run else 600
    super()._run_and_report_benchmark(skip_steps=skip_steps)
class TrivialKerasBenchmarkReal(keras_benchmark.KerasBenchmark):
  """Benchmark tests of the trivial model on real data."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
    trivial_flags = {
        'use_trivial_model': True,
        'skip_eval': True,
        'report_accuracy_metrics': False,
        'dtype': 'fp16',
        'data_dir': os.path.join(root_data_dir, 'imagenet'),
        'train_steps': 600,
        'log_steps': 100,
        'distribution_strategy': 'mirrored',
    }
    super().__init__(
        output_dir=output_dir,
        flag_methods=flag_methods,
        default_flags=trivial_flags)

  def _run_and_report_benchmark(self):
    """Times one resnet_imagenet_main run and reports its stats."""
    begin_sec = time.time()
    run_stats = resnet_imagenet_main.run(FLAGS)
    elapsed_sec = time.time() - begin_sec

    super()._report_benchmark(
        run_stats,
        elapsed_sec,
        total_batch_size=FLAGS.batch_size,
        log_steps=FLAGS.log_steps)

  def benchmark_8_gpu_warmup(self):
    """Dummy test that runs over an epoch to warm up the machine."""
    self._setup()

    gpu_count = 8
    FLAGS.num_gpus = gpu_count
    FLAGS.enable_eager = True
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_warmup')
    # Batch size is scaled by the number of GPUs.
    FLAGS.batch_size = 256 * gpu_count
    FLAGS.train_steps = 700
    self._run_and_report_benchmark()

  def fill_report_object(self, stats):
    """Delegates to the parent, passing batch size and log steps from FLAGS."""
    super().fill_report_object(
        stats, total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)
class Resnet50MultiWorkerKerasAccuracy(keras_benchmark.KerasBenchmark):
  """Resnet50 distributed accuracy tests that use multiple workers."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    flag_methods = [classifier_trainer.define_imagenet_keras_flags]

    # Kept on the instance; _benchmark_common() copies it into FLAGS.data_dir.
    self.data_dir = os.path.join(root_data_dir, 'imagenet')
    super().__init__(output_dir=output_dir, flag_methods=flag_methods)

  def _benchmark_common(self, eager, num_workers, all_reduce_alg):
    """Sets the flags shared by all benchmarks in this class and runs one.

    Args:
      eager: value for FLAGS.enable_eager; also picks the 'eager' or 'graph'
        part of the model directory name.
      num_workers: number of workers; scales the batch size and appears in the
        model directory name.
      all_reduce_alg: value for FLAGS.all_reduce_alg; also appears in the model
        directory name.
    """
    self._setup()

    gpus_per_worker = 8
    FLAGS.num_gpus = gpus_per_worker
    FLAGS.data_dir = self.data_dir
    FLAGS.train_epochs = 90
    FLAGS.epochs_between_evals = 10
    FLAGS.dtype = 'fp16'
    FLAGS.enable_eager = eager
    FLAGS.enable_xla = False
    FLAGS.distribution_strategy = 'multi_worker_mirrored'
    FLAGS.tf_gpu_thread_mode = 'gpu_private'
    FLAGS.datasets_num_private_threads = 32
    mode = 'eager' if eager else 'graph'
    FLAGS.model_dir = self._get_model_dir(
        'benchmark_{}_8_gpu_{}_worker_fp16_{}_tweaked'.format(
            mode, num_workers, all_reduce_alg))
    FLAGS.batch_size = 256 * gpus_per_worker * num_workers
    FLAGS.all_reduce_alg = all_reduce_alg

    self._run_and_report_benchmark()

  def _run_and_report_benchmark(self,
                                top_1_min=MIN_TOP_1_ACCURACY,
                                top_1_max=MAX_TOP_1_ACCURACY):
    """Times one classifier_trainer run and reports it with accuracy bounds.

    Args:
      top_1_min: lower top-1 accuracy bound passed to _report_benchmark.
      top_1_max: upper top-1 accuracy bound passed to _report_benchmark.
    """
    begin_sec = time.time()
    run_stats = classifier_trainer.run(FLAGS)
    elapsed_sec = time.time() - begin_sec

    super()._report_benchmark(
        run_stats,
        elapsed_sec,
        top_1_min=top_1_min,
        top_1_max=top_1_max,
        total_batch_size=FLAGS.batch_size,
        log_steps=100)

  def _get_model_dir(self, folder_name):
    """Returns folder_name joined onto this benchmark's output directory."""
    return os.path.join(self.output_dir, folder_name)

  def benchmark_eager_8_gpu_2_workers_fp16_ring_tweaked(self):
    """Eager, 8 GPUs per worker, 2 workers, fp16, ring all-reduce."""
    self._benchmark_common(eager=True, num_workers=2, all_reduce_alg='ring')

  def benchmark_eager_8_gpu_2_workers_fp16_nccl_tweaked(self):
    """Eager, 8 GPUs per worker, 2 workers, fp16, nccl all-reduce."""
    self._benchmark_common(eager=True, num_workers=2, all_reduce_alg='nccl')

  def benchmark_eager_8_gpu_8_workers_fp16_ring_tweaked(self):
    """Eager, 8 GPUs per worker, 8 workers, fp16, ring all-reduce."""
    self._benchmark_common(eager=True, num_workers=8, all_reduce_alg='ring')

  def benchmark_eager_8_gpu_8_workers_fp16_nccl_tweaked(self):
    """Eager, 8 GPUs per worker, 8 workers, fp16, nccl all-reduce."""
    self._benchmark_common(eager=True, num_workers=8, all_reduce_alg='nccl')
class Resnet50MultiWorkerKerasBenchmark(Resnet50KerasBenchmarkBase):
  """Resnet50 distributed benchmark tests that use multiple workers."""

  def __init__(self, output_dir=None, default_flags=None):
    super().__init__(output_dir=output_dir, default_flags=default_flags)

  def _benchmark_common(self, eager, num_workers, all_reduce_alg):
    """Sets the flags shared by all benchmarks in this class and runs one.

    Args:
      eager: value for FLAGS.enable_eager; also picks the 'eager' or 'graph'
        part of the model directory name.
      num_workers: number of workers; scales the batch size and appears in the
        model directory name.
      all_reduce_alg: value for FLAGS.all_reduce_alg; also appears in the model
        directory name.
    """
    self._setup()

    gpus_per_worker = 8
    FLAGS.num_gpus = gpus_per_worker
    FLAGS.dtype = 'fp16'
    FLAGS.enable_eager = eager
    FLAGS.enable_xla = False
    FLAGS.distribution_strategy = 'multi_worker_mirrored'
    FLAGS.tf_gpu_thread_mode = 'gpu_private'
    FLAGS.datasets_num_private_threads = 32
    mode = 'eager' if eager else 'graph'
    FLAGS.model_dir = self._get_model_dir(
        'benchmark_{}_8_gpu_{}_worker_fp16_{}_tweaked'.format(
            mode, num_workers, all_reduce_alg))
    FLAGS.batch_size = 256 * gpus_per_worker * num_workers
    FLAGS.all_reduce_alg = all_reduce_alg

    self._run_and_report_benchmark()

  def benchmark_eager_8_gpu_1_worker_fp16_ring_tweaked(self):
    """Eager, 8 GPUs per worker, 1 worker, fp16, ring all-reduce."""
    self._benchmark_common(eager=True, num_workers=1, all_reduce_alg='ring')

  def benchmark_eager_8_gpu_1_worker_fp16_nccl_tweaked(self):
    """Eager, 8 GPUs per worker, 1 worker, fp16, nccl all-reduce."""
    self._benchmark_common(eager=True, num_workers=1, all_reduce_alg='nccl')

  def benchmark_eager_8_gpu_2_workers_fp16_ring_tweaked(self):
    """Eager, 8 GPUs per worker, 2 workers, fp16, ring all-reduce."""
    self._benchmark_common(eager=True, num_workers=2, all_reduce_alg='ring')

  def benchmark_eager_8_gpu_2_workers_fp16_nccl_tweaked(self):
    """Eager, 8 GPUs per worker, 2 workers, fp16, nccl all-reduce."""
    self._benchmark_common(eager=True, num_workers=2, all_reduce_alg='nccl')

  def benchmark_eager_8_gpu_8_workers_fp16_ring_tweaked(self):
    """Eager, 8 GPUs per worker, 8 workers, fp16, ring all-reduce."""
    self._benchmark_common(eager=True, num_workers=8, all_reduce_alg='ring')

  def benchmark_eager_8_gpu_8_workers_fp16_nccl_tweaked(self):
    """Eager, 8 GPUs per worker, 8 workers, fp16, nccl all-reduce."""
    self._benchmark_common(eager=True, num_workers=8, all_reduce_alg='nccl')
class Resnet50MultiWorkerKerasBenchmarkSynth(Resnet50MultiWorkerKerasBenchmark):
  """Resnet50 multi-worker benchmark tests on synthetic data."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    # root_data_dir and **kwargs are accepted but not used here.
    synth_flags = {
        'skip_eval': True,
        'report_accuracy_metrics': False,
        'use_synthetic_data': True,
        'train_steps': 110,
        'log_steps': 10,
    }
    super().__init__(output_dir=output_dir, default_flags=synth_flags)
class Resnet50MultiWorkerKerasBenchmarkReal(Resnet50MultiWorkerKerasBenchmark):
  """Resnet50 multi-worker benchmark tests on real data."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    real_flags = {
        'skip_eval': True,
        'report_accuracy_metrics': False,
        'data_dir': os.path.join(root_data_dir, 'imagenet'),
        'train_steps': 110,
        'log_steps': 10,
    }
    super().__init__(output_dir=output_dir, default_flags=real_flags)
# TODO(kimjaehong): This should also cover other methods of model optimization
# techniques. At that time, this class will change to something like
# 'KerasModelOptimizationAccuracyBase'.
class KerasPruningAccuracyBase(keras_benchmark.KerasBenchmark):
  """Benchmark accuracy tests for pruning method."""

  def __init__(self,
               output_dir=None,
               root_data_dir=None,
               default_flags=None,
               **kwargs):
    """An accuracy benchmark class for pruning method.

    Args:
      output_dir: directory where to output e.g. log files
      root_data_dir: directory under which to look for dataset; it is joined
        with 'imagenet' to form the `data_dir` default flag.
      default_flags: default flags. The mapping is copied before the pruning
        defaults are added, so the caller's object is left untouched.
      **kwargs: arbitrary named arguments. This is needed to make the
        constructor forward compatible in case PerfZero provides more
        named arguments before updating the constructor.
    """
    # Work on a copy so the entries added below do not leak into a dict owned
    # by the caller.
    default_flags = {} if default_flags is None else dict(default_flags)
    default_flags['pruning_method'] = 'polynomial_decay'
    default_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')

    flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]

    super().__init__(
        output_dir=output_dir,
        flag_methods=flag_methods,
        default_flags=default_flags,
        **kwargs)

  def benchmark_8_gpu(self):
    """Test Keras model with eager, dist_strat and 8 GPUs."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.batch_size = 32 * 8
    FLAGS.train_epochs = 90
    FLAGS.epochs_between_evals = 10
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
    FLAGS.dtype = 'fp32'
    FLAGS.enable_eager = True
    self._run_and_report_benchmark()

  def _run_and_report_benchmark(self,
                                top_1_min=MODEL_OPTIMIZATION_TOP_1_ACCURACY[
                                    'RESNET50_FINETUNE_PRUNING'][0],
                                top_1_max=MODEL_OPTIMIZATION_TOP_1_ACCURACY[
                                    'RESNET50_FINETUNE_PRUNING'][1]):
    """Times one resnet_imagenet_main run and reports it with accuracy bounds.

    Args:
      top_1_min: lower top-1 accuracy bound passed to _report_benchmark;
        defaults to the RESNET50_FINETUNE_PRUNING minimum.
      top_1_max: upper top-1 accuracy bound passed to _report_benchmark;
        defaults to the RESNET50_FINETUNE_PRUNING maximum.
    """
    start_time_sec = time.time()
    stats = resnet_imagenet_main.run(flags.FLAGS)
    wall_time_sec = time.time() - start_time_sec

    super()._report_benchmark(
        stats,
        wall_time_sec,
        top_1_min=top_1_min,
        top_1_max=top_1_max,
        total_batch_size=FLAGS.batch_size,
        log_steps=100)
class MobilenetV1KerasPruningAccuracy(KerasPruningAccuracyBase):
  """Accuracy benchmark tests for MobilenetV1 with the pruning method."""

  def __init__(self, root_data_dir=None, **kwargs):
    # Pruning starts from the latest checkpoint found under
    # <root_data_dir>/mobilenet_v1.
    checkpoint_dir = os.path.join(root_data_dir, 'mobilenet_v1')
    mobilenet_flags = {
        'model': 'mobilenet',
        'optimizer': 'mobilenet_default',
        'initial_learning_rate_per_sample': 0.00007,
        'pretrained_filepath': tf.train.latest_checkpoint(checkpoint_dir),
        'pruning_begin_step': 0,
        'pruning_end_step': 100000,
        'pruning_initial_sparsity': 0.0,
        'pruning_final_sparsity': 0.5,
        'pruning_frequency': 100,
    }
    super().__init__(
        root_data_dir=root_data_dir,
        default_flags=mobilenet_flags,
        **kwargs)

  def _run_and_report_benchmark(self):
    """Runs the parent benchmark with the MobilenetV1 pruning accuracy range."""
    top_1_min, top_1_max = MODEL_OPTIMIZATION_TOP_1_ACCURACY[
        'MOBILENET_V1_FINETUNE_PRUNING']
    super()._run_and_report_benchmark(top_1_min=top_1_min, top_1_max=top_1_max)
class Resnet50KerasPruningAccuracy(KerasPruningAccuracyBase):
  """Accuracy benchmark tests for resnet50 with the pruning method."""

  def __init__(self, root_data_dir=None, **kwargs):
    # Pruning starts from the latest checkpoint found under
    # <root_data_dir>/resnet50.
    checkpoint_dir = os.path.join(root_data_dir, 'resnet50')
    resnet_flags = {
        'model': 'resnet50_v1.5',
        'optimizer': 'mobilenet_default',
        'initial_learning_rate_per_sample': 0.0000039,
        'pretrained_filepath': tf.train.latest_checkpoint(checkpoint_dir),
        'pruning_begin_step': 0,
        'pruning_end_step': 50000,
        'pruning_initial_sparsity': 0.0,
        'pruning_final_sparsity': 0.5,
        'pruning_frequency': 100,
    }
    super().__init__(
        root_data_dir=root_data_dir,
        default_flags=resnet_flags,
        **kwargs)

  def _run_and_report_benchmark(self):
    """Runs the parent benchmark with the resnet50 pruning accuracy range."""
    top_1_min, top_1_max = MODEL_OPTIMIZATION_TOP_1_ACCURACY[
        'RESNET50_FINETUNE_PRUNING']
    super()._run_and_report_benchmark(top_1_min=top_1_min, top_1_max=top_1_max)
class KerasPruningBenchmarkRealBase(Resnet50KerasBenchmarkBase):
  """Pruning method benchmarks."""

  def __init__(self, root_data_dir=None, default_flags=None, **kwargs):
    """Builds the default flags for pruning benchmarks on real data.

    Args:
      root_data_dir: directory under which to look for dataset; it is joined
        with 'imagenet' to form the `data_dir` default flag.
      default_flags: optional flags supplied by a subclass. The mapping is
        copied, and the pruning benchmark settings below take precedence over
        any entry with the same key.
      **kwargs: arbitrary named arguments forwarded to the parent constructor.
    """
    # Copy first so the update below never modifies a dict owned by the
    # caller.
    merged_flags = {} if default_flags is None else dict(default_flags)
    merged_flags.update({
        'skip_eval': True,
        'report_accuracy_metrics': False,
        'data_dir': os.path.join(root_data_dir, 'imagenet'),
        'train_steps': 110,
        'log_steps': 10,
        'pruning_method': 'polynomial_decay',
        'pruning_begin_step': 0,
        'pruning_end_step': 50000,
        'pruning_initial_sparsity': 0,
        'pruning_final_sparsity': 0.5,
        'pruning_frequency': 100,
    })
    super().__init__(default_flags=merged_flags, **kwargs)
class MobilenetV1KerasPruningBenchmarkReal(KerasPruningBenchmarkRealBase):
  """Benchmarks of the pruning method for MobilenetV1."""

  def __init__(self, **kwargs):
    # Only the model and optimizer are set here; the parent constructor
    # receives them as default_flags.
    model_flags = {
        'model': 'mobilenet',
        'optimizer': 'mobilenet_default',
    }
    super().__init__(default_flags=model_flags, **kwargs)
class Resnet50KerasPruningBenchmarkReal(KerasPruningBenchmarkRealBase):
  """Benchmarks of the pruning method for resnet50."""

  def __init__(self, **kwargs):
    # Only the model and optimizer are set here; the parent constructor
    # receives them as default_flags.
    model_flags = {
        'model': 'resnet50_v1.5',
        'optimizer': 'mobilenet_default',
    }
    super().__init__(default_flags=model_flags, **kwargs)
# Hand control to tf.test.main() when this module is executed as a script.
if __name__ == '__main__':
  tf.test.main()