# Lint as: python3
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Keras benchmarks and accuracy tests."""
# pylint: disable=line-too-long
from __future__ import print_function
import json
import os
import time
from typing import Any, MutableMapping, Optional
from absl import flags
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.benchmark import benchmark_wrappers
from official.benchmark import keras_benchmark
from official.benchmark.models import resnet_imagenet_main
from official.vision.image_classification import classifier_trainer
MIN_TOP_1_ACCURACY = 0.76
MAX_TOP_1_ACCURACY = 0.77
MOBILENET_V1_MIN_TOP_1_ACCURACY = 0.65
MOBILENET_V1_MAX_TOP_1_ACCURACY = 0.68
# Range of top-1 accuracies for model optimization techniques.
# Each item indicates (MIN_TOP_1_ACCURACY, MAX_TOP_1_ACCURACY).
MODEL_OPTIMIZATION_TOP_1_ACCURACY = {
'RESNET50_FINETUNE_PRUNING': (0.76, 0.77),
'MOBILENET_V1_FINETUNE_PRUNING': (0.67, 0.68),
}
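# The ranges are consumed as (min, max) pairs, e.g.:
#   top_1_min, top_1_max = MODEL_OPTIMIZATION_TOP_1_ACCURACY[
#       'RESNET50_FINETUNE_PRUNING']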
FLAGS = flags.FLAGS
def _get_classifier_parameters(
num_gpus: int = 0,
builder: str = 'records',
skip_eval: bool = False,
distribution_strategy: str = 'mirrored',
per_replica_batch_size: int = 128,
epochs: int = 90,
steps: int = 0,
epochs_between_evals: int = 1,
dtype: str = 'float32',
enable_xla: bool = False,
run_eagerly: bool = False,
gpu_thread_mode: Optional[str] = None,
dataset_num_private_threads: Optional[int] = None,
loss_scale: Optional[str] = None,
report_metrics: bool = True,
batchnorm_spatial_persistent: bool = False) -> MutableMapping[str, Any]:
"""Gets classifier trainer's ResNet parameters."""
return {
'runtime': {
'num_gpus': num_gpus,
'distribution_strategy': distribution_strategy,
'run_eagerly': run_eagerly,
'enable_xla': enable_xla,
'dataset_num_private_threads': dataset_num_private_threads,
'gpu_thread_mode': gpu_thread_mode,
'loss_scale': loss_scale,
'batchnorm_spatial_persistent': batchnorm_spatial_persistent,
},
'train_dataset': {
'builder': builder,
'use_per_replica_batch_size': True,
'batch_size': per_replica_batch_size,
'image_size': 224,
'dtype': dtype,
},
'validation_dataset': {
'builder': builder,
'batch_size': per_replica_batch_size,
'use_per_replica_batch_size': True,
'image_size': 224,
'dtype': dtype,
},
'train': {
'epochs': epochs,
'steps': steps,
'callbacks': {
'enable_tensorboard': False,
'enable_checkpoint_and_export': False,
'enable_time_history': True,
},
'metrics': ['accuracy'] if report_metrics else [],
},
'model': {
'loss': {
'label_smoothing': 0.1,
},
},
'evaluation': {
'epochs_between_evals': epochs_between_evals,
'skip_eval': skip_eval,
},
}
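# A minimal usage sketch (illustrative; not invoked by the benchmark suite):
# the dictionary built above is serialized with json.dumps() and handed to the
# trainer through FLAGS.params_override, exactly as the benchmark methods
# below do. `_example_params_override` is a hypothetical helper added here
# only for illustration.
def _example_params_override() -> str:
  """Builds the JSON override string for an 8-GPU fp16 run (illustrative)."""
  parameters = _get_classifier_parameters(
      num_gpus=8,
      per_replica_batch_size=256,
      dtype='float16',
      loss_scale='dynamic')
  return json.dumps(parameters)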
class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
"""Benchmark accuracy tests for ResNet50 in Keras."""
def __init__(self,
output_dir: Optional[str] = None,
root_data_dir: Optional[str] = None,
**kwargs):
"""A benchmark class.
Args:
output_dir: directory where to output e.g. log files
root_data_dir: directory under which to look for dataset
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
"""
flag_methods = [classifier_trainer.define_classifier_flags]
self.data_dir = os.path.join(root_data_dir, 'imagenet')
super(Resnet50KerasAccuracy, self).__init__(
output_dir=output_dir, flag_methods=flag_methods)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(
self,
experiment_name: str,
top_1_min: float = MIN_TOP_1_ACCURACY,
top_1_max: float = MAX_TOP_1_ACCURACY,
num_gpus: int = 0,
distribution_strategy: str = 'mirrored',
per_replica_batch_size: int = 128,
epochs: int = 90,
steps: int = 0,
epochs_between_evals: int = 1,
dtype: str = 'float32',
enable_xla: bool = False,
run_eagerly: bool = False,
gpu_thread_mode: Optional[str] = None,
dataset_num_private_threads: Optional[int] = None,
loss_scale: Optional[str] = None):
"""Runs and reports the benchmark given the provided configuration."""
FLAGS.model_type = 'resnet'
FLAGS.dataset = 'imagenet'
FLAGS.mode = 'train_and_eval'
FLAGS.data_dir = self.data_dir
FLAGS.model_dir = self._get_model_dir(experiment_name)
parameters = _get_classifier_parameters(
num_gpus=num_gpus,
distribution_strategy=distribution_strategy,
per_replica_batch_size=per_replica_batch_size,
epochs=epochs,
steps=steps,
epochs_between_evals=epochs_between_evals,
dtype=dtype,
enable_xla=enable_xla,
run_eagerly=run_eagerly,
gpu_thread_mode=gpu_thread_mode,
dataset_num_private_threads=dataset_num_private_threads,
report_metrics=True,
loss_scale=loss_scale,
batchnorm_spatial_persistent=True)
FLAGS.params_override = json.dumps(parameters)
total_batch_size = num_gpus * per_replica_batch_size
start_time_sec = time.time()
stats = classifier_trainer.run(flags.FLAGS)
wall_time_sec = time.time() - start_time_sec
super(Resnet50KerasAccuracy, self)._report_benchmark(
stats,
wall_time_sec,
top_1_min=top_1_min,
top_1_max=top_1_max,
total_batch_size=total_batch_size,
log_steps=100)
def benchmark_8_gpu(self):
"""Tests Keras model with eager, dist_strat and 8 GPUs."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_8_gpu',
num_gpus=8,
per_replica_batch_size=128,
epochs=90,
epochs_between_evals=10,
dtype='float32')
def benchmark_8_gpu_fp16(self):
"""Tests Keras model with eager, dist_strat, 8 GPUs, and fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_8_gpu_fp16',
num_gpus=8,
per_replica_batch_size=256,
epochs=90,
epochs_between_evals=10,
dtype='float16')
def benchmark_xla_8_gpu_fp16(self):
"""Tests Keras model with XLA, eager, dist_strat, 8 GPUs and fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_8_gpu_fp16',
num_gpus=8,
per_replica_batch_size=256,
epochs=90,
epochs_between_evals=10,
dtype='float16',
enable_xla=True)
def benchmark_xla_8_gpu_fp16_dynamic(self):
"""Tests Keras model with XLA, eager, dist_strat, 8 GPUs, dynamic fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_8_gpu_fp16_dynamic',
top_1_min=0.736,
num_gpus=8,
per_replica_batch_size=256,
epochs=90,
epochs_between_evals=10,
dtype='float16',
loss_scale='dynamic')
def _get_model_dir(self, folder_name):
return os.path.join(self.output_dir, folder_name)
class MobilenetV1KerasAccuracy(keras_benchmark.KerasBenchmark):
"""Benchmark accuracy tests for MobilenetV1 in Keras."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
"""A benchmark class.
Args:
output_dir: directory where to output e.g. log files
root_data_dir: directory under which to look for dataset
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
"""
flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
self.data_dir = os.path.join(root_data_dir, 'imagenet')
super(MobilenetV1KerasAccuracy, self).__init__(
output_dir=output_dir,
flag_methods=flag_methods,
default_flags={
'model': 'mobilenet',
'optimizer': 'mobilenet_default',
'initial_learning_rate_per_sample': 0.00039,
})
def benchmark_8_gpu(self):
"""Test Keras model with eager, dist_strat and 8 GPUs."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128 * 8
FLAGS.train_epochs = 90
FLAGS.epochs_between_evals = 10
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
self._run_and_report_benchmark()
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
top_1_min=MOBILENET_V1_MIN_TOP_1_ACCURACY,
top_1_max=MOBILENET_V1_MAX_TOP_1_ACCURACY):
start_time_sec = time.time()
stats = resnet_imagenet_main.run(flags.FLAGS)
wall_time_sec = time.time() - start_time_sec
super(MobilenetV1KerasAccuracy, self)._report_benchmark(
stats,
wall_time_sec,
top_1_min=top_1_min,
top_1_max=top_1_max,
total_batch_size=FLAGS.batch_size,
log_steps=100)
def _get_model_dir(self, folder_name):
return os.path.join(self.output_dir, folder_name)
class Resnet50KerasClassifierBenchmarkBase(keras_benchmark.KerasBenchmark):
"""Resnet50 (classifier_trainer) benchmarks."""
def __init__(self, output_dir=None, default_flags=None,
tpu=None, dataset_builder='records', train_epochs=1,
train_steps=110, data_dir=None):
flag_methods = [classifier_trainer.define_classifier_flags]
self.dataset_builder = dataset_builder
self.train_epochs = train_epochs
self.train_steps = train_steps
self.data_dir = data_dir
super(Resnet50KerasClassifierBenchmarkBase, self).__init__(
output_dir=output_dir,
flag_methods=flag_methods,
default_flags=default_flags,
tpu=tpu)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(
self,
experiment_name: str,
skip_steps: Optional[int] = None,
top_1_min: float = MIN_TOP_1_ACCURACY,
top_1_max: float = MAX_TOP_1_ACCURACY,
num_gpus: int = 0,
num_tpus: int = 0,
distribution_strategy: str = 'mirrored',
per_replica_batch_size: int = 128,
epochs_between_evals: int = 1,
dtype: str = 'float32',
enable_xla: bool = False,
run_eagerly: bool = False,
gpu_thread_mode: Optional[str] = None,
dataset_num_private_threads: Optional[int] = None,
loss_scale: Optional[str] = None):
"""Runs and reports the benchmark given the provided configuration."""
FLAGS.model_type = 'resnet'
FLAGS.dataset = 'imagenet'
FLAGS.mode = 'train_and_eval'
FLAGS.data_dir = self.data_dir
FLAGS.model_dir = self._get_model_dir(experiment_name)
parameters = _get_classifier_parameters(
builder=self.dataset_builder,
skip_eval=True,
num_gpus=num_gpus,
distribution_strategy=distribution_strategy,
per_replica_batch_size=per_replica_batch_size,
epochs=self.train_epochs,
steps=self.train_steps,
epochs_between_evals=epochs_between_evals,
dtype=dtype,
        enable_xla=enable_xla,
        run_eagerly=run_eagerly,
gpu_thread_mode=gpu_thread_mode,
dataset_num_private_threads=dataset_num_private_threads,
loss_scale=loss_scale,
report_metrics=False,
batchnorm_spatial_persistent=True)
FLAGS.params_override = json.dumps(parameters)
if distribution_strategy == 'tpu':
total_batch_size = num_tpus * per_replica_batch_size
else:
total_batch_size = num_gpus * per_replica_batch_size
start_time_sec = time.time()
stats = classifier_trainer.run(flags.FLAGS)
wall_time_sec = time.time() - start_time_sec
    # Number of logged step-time entries to exclude from the performance
    # report: keep results from the last 100 batches, or skip `skip_steps`
    # steps when it is provided.
warmup = (skip_steps or (self.train_steps - 100)) // FLAGS.log_steps
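    # Worked example with this class's defaults (train_steps=110 from the
    # constructor, log_steps=10 from the synthetic/real subclasses):
    # warmup = (110 - 100) // 10 = 1, i.e. the first logged entry is dropped.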
super(Resnet50KerasClassifierBenchmarkBase, self)._report_benchmark(
stats,
wall_time_sec,
total_batch_size=total_batch_size,
log_steps=FLAGS.log_steps,
warmup=warmup,
start_time_sec=start_time_sec)
def benchmark_1_gpu_no_dist_strat(self):
"""Tests Keras model with 1 GPU, no distribution strategy."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_1_gpu_no_dist_strat',
num_gpus=1,
distribution_strategy='off',
per_replica_batch_size=128)
def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
"""Tests Keras model with 1 GPU, no distribution strategy, run eagerly."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_1_gpu_no_dist_strat_run_eagerly',
num_gpus=1,
run_eagerly=True,
distribution_strategy='off',
per_replica_batch_size=64)
def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16(self):
"""Tests with 1 GPU, no distribution strategy, fp16, run eagerly."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_1_gpu_no_dist_strat_run_eagerly_fp16',
num_gpus=1,
run_eagerly=True,
distribution_strategy='off',
dtype='float16',
per_replica_batch_size=128)
def benchmark_1_gpu(self):
"""Tests Keras model with 1 GPU."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_1_gpu',
num_gpus=1,
distribution_strategy='one_device',
per_replica_batch_size=128)
def benchmark_xla_1_gpu(self):
"""Tests Keras model with XLA and 1 GPU."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_1_gpu',
num_gpus=1,
enable_xla=True,
distribution_strategy='one_device',
per_replica_batch_size=128)
def benchmark_1_gpu_fp16(self):
"""Tests Keras model with 1 GPU and fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_1_gpu_fp16',
num_gpus=1,
distribution_strategy='one_device',
dtype='float16',
per_replica_batch_size=256)
def benchmark_1_gpu_fp16_dynamic(self):
"""Tests Keras model with 1 GPU, fp16, and dynamic loss scaling."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_1_gpu_fp16_dynamic',
num_gpus=1,
distribution_strategy='one_device',
dtype='float16',
per_replica_batch_size=256,
loss_scale='dynamic')
def benchmark_xla_1_gpu_fp16(self):
"""Tests Keras model with XLA, 1 GPU and fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_1_gpu_fp16',
num_gpus=1,
enable_xla=True,
distribution_strategy='one_device',
dtype='float16',
per_replica_batch_size=256)
def benchmark_xla_1_gpu_fp16_tweaked(self):
"""Tests Keras model with XLA, 1 GPU, fp16, and manual config tuning."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_1_gpu_fp16_tweaked',
num_gpus=1,
enable_xla=True,
distribution_strategy='one_device',
dtype='float16',
per_replica_batch_size=256,
gpu_thread_mode='gpu_private')
def benchmark_xla_1_gpu_fp16_dynamic(self):
"""Tests Keras model with XLA, 1 GPU, fp16, and dynamic loss scaling."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_1_gpu_fp16_dynamic',
num_gpus=1,
enable_xla=True,
distribution_strategy='one_device',
dtype='float16',
per_replica_batch_size=256,
loss_scale='dynamic')
def benchmark_8_gpu(self):
"""Tests Keras model with 8 GPUs."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_8_gpu',
num_gpus=8,
distribution_strategy='mirrored',
per_replica_batch_size=128)
def benchmark_8_gpu_tweaked(self):
"""Tests Keras model with manual config tuning and 8 GPUs."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_8_gpu_tweaked',
num_gpus=8,
distribution_strategy='mirrored',
per_replica_batch_size=128,
dataset_num_private_threads=14)
def benchmark_xla_8_gpu(self):
"""Tests Keras model with XLA and 8 GPUs."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_8_gpu',
num_gpus=8,
enable_xla=True,
distribution_strategy='mirrored',
per_replica_batch_size=128)
def benchmark_xla_8_gpu_tweaked(self):
"""Tests Keras model with manual config tuning, 8 GPUs, and XLA."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_8_gpu_tweaked',
num_gpus=8,
enable_xla=True,
distribution_strategy='mirrored',
per_replica_batch_size=128,
gpu_thread_mode='gpu_private',
dataset_num_private_threads=24)
def benchmark_8_gpu_fp16(self):
"""Tests Keras model with 8 GPUs and fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_8_gpu_fp16',
num_gpus=8,
dtype='float16',
distribution_strategy='mirrored',
per_replica_batch_size=256)
def benchmark_8_gpu_fp16_tweaked(self):
"""Tests Keras model with 8 GPUs, fp16, and manual config tuning."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_8_gpu_fp16_tweaked',
num_gpus=8,
dtype='float16',
distribution_strategy='mirrored',
per_replica_batch_size=256,
gpu_thread_mode='gpu_private',
dataset_num_private_threads=40)
def benchmark_8_gpu_fp16_dynamic_tweaked(self):
"""Tests Keras model with 8 GPUs, fp16, dynamic loss scaling, and tuned."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_8_gpu_fp16_dynamic_tweaked',
num_gpus=8,
dtype='float16',
distribution_strategy='mirrored',
per_replica_batch_size=256,
loss_scale='dynamic',
gpu_thread_mode='gpu_private',
dataset_num_private_threads=40)
def benchmark_xla_8_gpu_fp16(self):
"""Tests Keras model with XLA, 8 GPUs and fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_8_gpu_fp16',
dtype='float16',
num_gpus=8,
enable_xla=True,
distribution_strategy='mirrored',
per_replica_batch_size=256)
def benchmark_xla_8_gpu_fp16_tweaked(self):
"""Test Keras model with manual config tuning, XLA, 8 GPUs and fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_8_gpu_fp16_tweaked',
dtype='float16',
num_gpus=8,
enable_xla=True,
distribution_strategy='mirrored',
per_replica_batch_size=256,
gpu_thread_mode='gpu_private',
dataset_num_private_threads=48)
def benchmark_xla_8_gpu_fp16_tweaked_delay_measure(self):
"""Tests with manual config tuning, XLA, 8 GPUs and fp16.
Delay performance measurement for stable performance on 96 vCPU platforms.
"""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_8_gpu_fp16_tweaked_delay_measure',
dtype='float16',
num_gpus=8,
enable_xla=True,
distribution_strategy='mirrored',
per_replica_batch_size=256,
gpu_thread_mode='gpu_private',
dataset_num_private_threads=48,
steps=310)
def benchmark_xla_8_gpu_fp16_dynamic_tweaked(self):
"""Tests Keras model with config tuning, XLA, 8 GPUs and dynamic fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_8_gpu_fp16_dynamic_tweaked',
dtype='float16',
num_gpus=8,
enable_xla=True,
distribution_strategy='mirrored',
per_replica_batch_size=256,
gpu_thread_mode='gpu_private',
loss_scale='dynamic',
dataset_num_private_threads=48)
def benchmark_2x2_tpu_bf16(self):
"""Test Keras model with 2x2 TPU, bf16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_2x2_tpu_bf16',
dtype='bfloat16',
num_tpus=8,
distribution_strategy='tpu',
per_replica_batch_size=128)
def benchmark_4x4_tpu_bf16(self):
"""Test Keras model with 4x4 TPU, bf16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_4x4_tpu_bf16',
dtype='bfloat16',
num_tpus=32,
distribution_strategy='tpu',
per_replica_batch_size=128)
def benchmark_8x8_tpu_bf16(self):
"""Test Keras model with 8x8 TPU, bf16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_8x8_tpu_bf16',
dtype='bfloat16',
num_tpus=128,
distribution_strategy='tpu',
per_replica_batch_size=64)
def fill_report_object(self, stats):
super(Resnet50KerasClassifierBenchmarkBase, self).fill_report_object(
stats,
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
"""Resnet50 benchmarks."""
def __init__(self, output_dir=None, default_flags=None, tpu=None):
flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
super(Resnet50KerasBenchmarkBase, self).__init__(
output_dir=output_dir,
flag_methods=flag_methods,
default_flags=default_flags,
tpu=tpu)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self, skip_steps=None):
start_time_sec = time.time()
stats = resnet_imagenet_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
    # Number of logged step-time entries to exclude from the performance
    # report: keep results from the last 100 batches, or skip `skip_steps`
    # steps when it is provided.
warmup = (skip_steps or (FLAGS.train_steps - 100)) // FLAGS.log_steps
super(Resnet50KerasBenchmarkBase, self)._report_benchmark(
stats,
wall_time_sec,
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps,
warmup=warmup,
start_time_sec=start_time_sec)
def benchmark_1_gpu_no_dist_strat(self):
"""Test Keras model with 1 GPU, no distribution strategy."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
"""Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly')
FLAGS.batch_size = 64
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked(self):
"""Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.explicit_gpu_placement = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked')
FLAGS.batch_size = 64
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16(self):
"""Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked(self):
"""Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.explicit_gpu_placement = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu(self):
"""Test Keras model with 1 GPU."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_amp(self):
"""Test Keras model with 1 GPU with automatic mixed precision."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.dtype = 'fp16'
FLAGS.fp16_implementation = 'graph_rewrite'
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_amp')
FLAGS.batch_size = 256
self._run_and_report_benchmark()
def benchmark_xla_1_gpu(self):
"""Test Keras model with XLA and 1 GPU."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_amp(self):
"""Test Keras model with XLA and 1 GPU with automatic mixed precision."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.dtype = 'fp16'
FLAGS.fp16_implementation = 'graph_rewrite'
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_amp')
FLAGS.batch_size = 256
self._run_and_report_benchmark()
def benchmark_1_gpu_fp16(self):
"""Test Keras model with 1 GPU and fp16."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
self._run_and_report_benchmark()
def benchmark_1_gpu_fp16_dynamic(self):
"""Test Keras model with 1 GPU, fp16, and dynamic loss scaling."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16_dynamic')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
FLAGS.loss_scale = 'dynamic'
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_fp16(self):
"""Test Keras model with XLA, 1 GPU and fp16."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_fp16_tweaked(self):
"""Test Keras model with XLA, 1 GPU, fp16, and manual config tuning."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_tweaked')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
FLAGS.tf_gpu_thread_mode = 'gpu_private'
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_fp16_dynamic(self):
"""Test Keras model with XLA, 1 GPU, fp16, and dynamic loss scaling."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_dynamic')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
FLAGS.loss_scale = 'dynamic'
self._run_and_report_benchmark()
def benchmark_8_gpu(self):
"""Test Keras model with 8 GPUs."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
FLAGS.batch_size = 128 * 8 # 8 GPUs
self._run_and_report_benchmark()
def benchmark_8_gpu_amp(self):
"""Test Keras model with 8 GPUs with automatic mixed precision."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_eager = True
FLAGS.dtype = 'fp16'
FLAGS.fp16_implementation = 'graph_rewrite'
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_amp')
FLAGS.batch_size = 256 * 8 # 8 GPUs
self._run_and_report_benchmark()
def benchmark_8_gpu_tweaked(self):
"""Test Keras model with manual config tuning and 8 GPUs."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_tweaked')
FLAGS.batch_size = 128 * 8 # 8 GPUs
FLAGS.datasets_num_private_threads = 14
self._run_and_report_benchmark()
def benchmark_xla_8_gpu(self):
"""Test Keras model with XLA and 8 GPUs."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu')
FLAGS.batch_size = 128 * 8 # 8 GPUs
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_amp(self):
"""Test Keras model with XLA and 8 GPUs with automatic mixed precision."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_eager = True
FLAGS.dtype = 'fp16'
FLAGS.fp16_implementation = 'graph_rewrite'
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_amp')
FLAGS.batch_size = 256 * 8 # 8 GPUs
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_tweaked(self):
"""Test Keras model with manual config tuning, 8 GPUs, and XLA."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_tweaked')
FLAGS.batch_size = 128 * 8
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 24
self._run_and_report_benchmark()
def benchmark_8_gpu_fp16(self):
"""Test Keras model with 8 GPUs and fp16."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
FLAGS.batch_size = 256 * 8 # 8 GPUs
self._run_and_report_benchmark()
def benchmark_8_gpu_fp16_tweaked(self):
"""Test Keras model with 8 GPUs, fp16, and manual config tuning."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16_tweaked')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.tf_gpu_thread_mode = 'gpu_private'
    FLAGS.datasets_num_private_threads = 40
self._run_and_report_benchmark()
def benchmark_8_gpu_fp16_dynamic_tweaked(self):
"""Test Keras model with 8 GPUs, fp16, dynamic loss scaling, and tuned."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir(
'benchmark_8_gpu_fp16_dynamic_tweaked')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.loss_scale = 'dynamic'
FLAGS.tf_gpu_thread_mode = 'gpu_private'
    FLAGS.datasets_num_private_threads = 40
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16(self):
"""Test Keras model with XLA, 8 GPUs and fp16."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16')
FLAGS.batch_size = 256 * 8 # 8 GPUs
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16_tweaked(self):
"""Test Keras model with manual config tuning, XLA, 8 GPUs and fp16."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16_tweaked')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 48
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16_tweaked_delay_measure(self):
"""Test with manual config tuning, XLA, 8 GPUs and fp16.
Delay performance measurement for stable performance on 96 vCPU platforms.
"""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir(
'benchmark_xla_8_gpu_fp16_tweaked_delay_measure')
FLAGS.batch_size = 256 * 8
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 48
FLAGS.train_steps = 310
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16_dynamic_tweaked(self):
"""Test Keras model with config tuning, XLA, 8 GPUs and dynamic fp16."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir(
'benchmark_xla_8_gpu_fp16_dynamic_tweaked')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.loss_scale = 'dynamic'
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 48
self._run_and_report_benchmark()
def benchmark_2x2_tpu_bf16(self):
"""Test Keras model with 2x2 TPU, bf16."""
self._setup()
FLAGS.dtype = 'bf16'
FLAGS.distribution_strategy = 'tpu'
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_bf16')
FLAGS.batch_size = 1024
self._run_and_report_benchmark()
def benchmark_4x4_tpu_bf16(self):
"""Test Keras model with 4x4 TPU, bf16."""
self._setup()
FLAGS.dtype = 'bf16'
FLAGS.distribution_strategy = 'tpu'
FLAGS.model_dir = self._get_model_dir('benchmark_4x4_tpu_bf16')
FLAGS.batch_size = 4096
self._run_and_report_benchmark()
def benchmark_8x8_tpu_bf16(self):
"""Test Keras model with 8x8 TPU, bf16."""
self._setup()
FLAGS.dtype = 'bf16'
FLAGS.distribution_strategy = 'tpu'
FLAGS.model_dir = self._get_model_dir('benchmark_8x8_tpu_bf16')
FLAGS.batch_size = 8192
self._run_and_report_benchmark()
def fill_report_object(self, stats):
super(Resnet50KerasBenchmarkBase, self).fill_report_object(
stats,
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
class Resnet50KerasBenchmarkSynth(Resnet50KerasClassifierBenchmarkBase):
"""Resnet50 synthetic benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, tpu=None, **kwargs):
def_flags = {}
def_flags['log_steps'] = 10
super(Resnet50KerasBenchmarkSynth, self).__init__(
output_dir=output_dir, default_flags=def_flags, tpu=tpu,
dataset_builder='synthetic', train_epochs=1, train_steps=110)
class Resnet50KerasBenchmarkReal(Resnet50KerasClassifierBenchmarkBase):
"""Resnet50 real data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, tpu=None, **kwargs):
data_dir = os.path.join(root_data_dir, 'imagenet')
def_flags = {}
def_flags['log_steps'] = 10
super(Resnet50KerasBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=def_flags, tpu=tpu,
dataset_builder='records', train_epochs=1, train_steps=110,
data_dir=data_dir)
class Resnet50KerasBenchmarkRemoteData(Resnet50KerasBenchmarkBase):
"""Resnet50 real data (stored in remote storage) benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
def_flags = {}
def_flags['skip_eval'] = True
def_flags['report_accuracy_metrics'] = False
def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
# Defining multiple epochs overrides the train_steps setting in benchmarks.
def_flags['train_epochs'] = 2
# Cache dataset so performance is stable after the first epoch.
def_flags['training_dataset_cache'] = True
def_flags['log_steps'] = 100
    # Note: single-GPU and pure-eager tests, which are less likely to be input
    # bound and are more stable, run for a shorter time by overriding
    # FLAGS.train_epochs, train_steps, and log_steps in the benchmark methods,
    # and skip_steps in _run_and_report_benchmark().
super(Resnet50KerasBenchmarkRemoteData, self).__init__(
output_dir=output_dir, default_flags=def_flags)
def _override_flags_to_run_test_shorter(self):
FLAGS.train_epochs = 1
FLAGS.train_steps = 300
FLAGS.log_steps = 10
def benchmark_1_gpu_no_dist_strat(self):
"""Test Keras model with 1 GPU, no distribution strategy."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
FLAGS.batch_size = 128
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
"""Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly')
FLAGS.batch_size = 64
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked(self):
"""Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.explicit_gpu_placement = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked')
FLAGS.batch_size = 64
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16(self):
"""Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 128
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked(self):
"""Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.explicit_gpu_placement = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 128
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_1_gpu(self):
"""Test Keras model with 1 GPU."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.batch_size = 128
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_1_gpu_amp(self):
"""Test Keras model with 1 GPU with automatic mixed precision."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.dtype = 'fp16'
FLAGS.fp16_implementation = 'graph_rewrite'
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_amp')
FLAGS.batch_size = 256
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_xla_1_gpu(self):
"""Test Keras model with XLA and 1 GPU."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu')
FLAGS.batch_size = 128
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_amp(self):
"""Test Keras model with XLA and 1 GPU with automatic mixed precision."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.dtype = 'fp16'
FLAGS.fp16_implementation = 'graph_rewrite'
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_amp')
FLAGS.batch_size = 256
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_1_gpu_fp16(self):
"""Test Keras model with 1 GPU and fp16."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_1_gpu_fp16_dynamic(self):
"""Test Keras model with 1 GPU, fp16, and dynamic loss scaling."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16_dynamic')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
FLAGS.loss_scale = 'dynamic'
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_fp16(self):
"""Test Keras model with XLA, 1 GPU and fp16."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_fp16_tweaked(self):
"""Test Keras model with XLA, 1 GPU, fp16, and manual config tuning."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_tweaked')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
FLAGS.tf_gpu_thread_mode = 'gpu_private'
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_fp16_dynamic(self):
"""Test Keras model with XLA, 1 GPU, fp16, and dynamic loss scaling."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_dynamic')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
FLAGS.loss_scale = 'dynamic'
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self):
if FLAGS.num_gpus == 1 or FLAGS.run_eagerly:
      # Single-GPU and pure-eager tests are less likely to be input bound and
      # are more stable, so they run for a shorter time and use the default
      # skip_steps.
skip_steps = None
else:
      # Skip the first epoch for performance measurement.
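      # With the multi-GPU settings in this class (global batch size
      # 256 * 8 = 2048), one ImageNet epoch is roughly 1.28M / 2048 ≈ 625
      # steps, so skipping 600 steps discards approximately the first epoch.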
skip_steps = 600
super(Resnet50KerasBenchmarkRemoteData,
self)._run_and_report_benchmark(skip_steps=skip_steps)
class TrivialKerasBenchmarkReal(keras_benchmark.KerasBenchmark):
"""Trivial model with real data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
def_flags = {}
def_flags['use_trivial_model'] = True
def_flags['skip_eval'] = True
def_flags['report_accuracy_metrics'] = False
def_flags['dtype'] = 'fp16'
def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
def_flags['train_steps'] = 600
def_flags['log_steps'] = 100
def_flags['distribution_strategy'] = 'mirrored'
super(TrivialKerasBenchmarkReal, self).__init__(
output_dir=output_dir,
flag_methods=flag_methods,
default_flags=def_flags)
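  # With a trivial model, per-step computation is negligible, so these
  # benchmarks effectively measure input-pipeline throughput.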
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = resnet_imagenet_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
super(TrivialKerasBenchmarkReal, self)._report_benchmark(
stats,
wall_time_sec,
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_8_gpu_warmup(self):
"""Dummy test that runs over an epoch to warmup the machine."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_eager = True
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_warmup')
FLAGS.batch_size = 256 * 8
FLAGS.train_steps = 700
self._run_and_report_benchmark()
def fill_report_object(self, stats):
super(TrivialKerasBenchmarkReal, self).fill_report_object(
stats,
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
class Resnet50MultiWorkerKerasAccuracy(keras_benchmark.KerasBenchmark):
"""Resnet50 distributed accuracy tests with multiple workers."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
self.data_dir = os.path.join(root_data_dir, 'imagenet')
super(Resnet50MultiWorkerKerasAccuracy, self).__init__(
output_dir=output_dir, flag_methods=flag_methods)
def _benchmark_common(self, eager, num_workers, all_reduce_alg):
"""Common to all benchmarks in this class."""
self._setup()
num_gpus = 8
FLAGS.num_gpus = num_gpus
FLAGS.data_dir = self.data_dir
FLAGS.train_epochs = 90
FLAGS.epochs_between_evals = 10
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = eager
FLAGS.enable_xla = False
FLAGS.distribution_strategy = 'multi_worker_mirrored'
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 32
FLAGS.model_dir = self._get_model_dir(
'benchmark_{}_8_gpu_{}_worker_fp16_{}_tweaked'.format(
'eager' if eager else 'graph', num_workers, all_reduce_alg))
FLAGS.batch_size = 256 * num_gpus * num_workers
FLAGS.all_reduce_alg = all_reduce_alg
self._run_and_report_benchmark()
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
top_1_min=MIN_TOP_1_ACCURACY,
top_1_max=MAX_TOP_1_ACCURACY):
start_time_sec = time.time()
    stats = resnet_imagenet_main.run(flags.FLAGS)
wall_time_sec = time.time() - start_time_sec
super(Resnet50MultiWorkerKerasAccuracy, self)._report_benchmark(
stats,
wall_time_sec,
top_1_min=top_1_min,
top_1_max=top_1_max,
total_batch_size=FLAGS.batch_size,
log_steps=100)
def _get_model_dir(self, folder_name):
return os.path.join(self.output_dir, folder_name)
def benchmark_eager_8_gpu_2_workers_fp16_ring_tweaked(self):
"""Eager, 8 GPUs per worker, 2 workers, fp16, ring all-reduce."""
self._benchmark_common(eager=True, num_workers=2, all_reduce_alg='ring')
def benchmark_eager_8_gpu_2_workers_fp16_nccl_tweaked(self):
"""Eager, 8 GPUs per worker, 2 workers, fp16, nccl all-reduce."""
self._benchmark_common(eager=True, num_workers=2, all_reduce_alg='nccl')
def benchmark_eager_8_gpu_8_workers_fp16_ring_tweaked(self):
"""Eager, 8 GPUs per worker, 8 workers, fp16, ring all-reduce."""
self._benchmark_common(eager=True, num_workers=8, all_reduce_alg='ring')
def benchmark_eager_8_gpu_8_workers_fp16_nccl_tweaked(self):
"""Eager, 8 GPUs per worker, 8 workers, fp16, nccl all-reduce."""
self._benchmark_common(eager=True, num_workers=8, all_reduce_alg='nccl')
class Resnet50MultiWorkerKerasBenchmark(Resnet50KerasBenchmarkBase):
"""Resnet50 distributed benchmark tests with multiple workers."""
def __init__(self, output_dir=None, default_flags=None):
super(Resnet50MultiWorkerKerasBenchmark, self).__init__(
output_dir=output_dir, default_flags=default_flags)
def _benchmark_common(self, eager, num_workers, all_reduce_alg):
"""Common to all benchmarks in this class."""
self._setup()
num_gpus = 8
FLAGS.num_gpus = num_gpus
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = eager
FLAGS.enable_xla = False
FLAGS.distribution_strategy = 'multi_worker_mirrored'
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 32
FLAGS.model_dir = self._get_model_dir(
'benchmark_{}_8_gpu_{}_worker_fp16_{}_tweaked'.format(
'eager' if eager else 'graph', num_workers, all_reduce_alg))
FLAGS.batch_size = 256 * num_gpus * num_workers
FLAGS.all_reduce_alg = all_reduce_alg
self._run_and_report_benchmark()
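  # Note (a deployment assumption, not configured in this file): the
  # 'multi_worker_mirrored' strategy requires each worker process to export a
  # TF_CONFIG environment variable describing the cluster before these
  # benchmarks run. An illustrative two-worker sketch, with placeholder
  # addresses:
  #
  #   os.environ['TF_CONFIG'] = json.dumps({
  #       'cluster': {'worker': ['host1:port', 'host2:port']},
  #       'task': {'type': 'worker', 'index': 0},
  #   })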
def benchmark_eager_8_gpu_1_worker_fp16_ring_tweaked(self):
"""Eager, 8 GPUs per worker, 1 worker, fp16, ring all-reduce."""
self._benchmark_common(eager=True, num_workers=1, all_reduce_alg='ring')
def benchmark_eager_8_gpu_1_worker_fp16_nccl_tweaked(self):
"""Eager, 8 GPUs per worker, 1 worker, fp16, nccl all-reduce."""
self._benchmark_common(eager=True, num_workers=1, all_reduce_alg='nccl')
def benchmark_eager_8_gpu_2_workers_fp16_ring_tweaked(self):
"""Eager, 8 GPUs per worker, 2 workers, fp16, ring all-reduce."""
self._benchmark_common(eager=True, num_workers=2, all_reduce_alg='ring')
def benchmark_eager_8_gpu_2_workers_fp16_nccl_tweaked(self):
"""Eager, 8 GPUs per worker, 2 workers, fp16, nccl all-reduce."""
self._benchmark_common(eager=True, num_workers=2, all_reduce_alg='nccl')
def benchmark_eager_8_gpu_8_workers_fp16_ring_tweaked(self):
"""Eager, 8 GPUs per worker, 8 workers, fp16, ring all-reduce."""
self._benchmark_common(eager=True, num_workers=8, all_reduce_alg='ring')
def benchmark_eager_8_gpu_8_workers_fp16_nccl_tweaked(self):
"""Eager, 8 GPUs per worker, 8 workers, fp16, nccl all-reduce."""
self._benchmark_common(eager=True, num_workers=8, all_reduce_alg='nccl')
class Resnet50MultiWorkerKerasBenchmarkSynth(Resnet50MultiWorkerKerasBenchmark):
"""Resnet50 multi-worker synthetic data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
def_flags = {}
def_flags['skip_eval'] = True
def_flags['report_accuracy_metrics'] = False
def_flags['use_synthetic_data'] = True
def_flags['train_steps'] = 110
def_flags['log_steps'] = 10
super(Resnet50MultiWorkerKerasBenchmarkSynth, self).__init__(
output_dir=output_dir, default_flags=def_flags)
class Resnet50MultiWorkerKerasBenchmarkReal(Resnet50MultiWorkerKerasBenchmark):
"""Resnet50 multi-worker real data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
def_flags = {}
def_flags['skip_eval'] = True
def_flags['report_accuracy_metrics'] = False
def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
def_flags['train_steps'] = 110
def_flags['log_steps'] = 10
super(Resnet50MultiWorkerKerasBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=def_flags)
# TODO(kimjaehong): This should also cover other model optimization
# techniques. At that point, this class will be renamed to something like
# 'KerasModelOptimizationAccuracyBase'.
class KerasPruningAccuracyBase(keras_benchmark.KerasBenchmark):
"""Benchmark accuracy tests for pruning method."""
def __init__(self,
output_dir=None,
root_data_dir=None,
default_flags=None,
**kwargs):
"""A accuracy benchmark class for pruning method.
Args:
output_dir: directory where to output e.g. log files
root_data_dir: directory under which to look for dataset
default_flags: default flags
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
"""
if default_flags is None:
default_flags = {}
default_flags['pruning_method'] = 'polynomial_decay'
default_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
super(KerasPruningAccuracyBase, self).__init__(
output_dir=output_dir,
flag_methods=flag_methods,
default_flags=default_flags,
**kwargs)
def benchmark_8_gpu(self):
"""Test Keras model with eager, dist_strat and 8 GPUs."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.batch_size = 32 * 8
FLAGS.train_epochs = 90
FLAGS.epochs_between_evals = 10
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
self._run_and_report_benchmark()
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
top_1_min=MODEL_OPTIMIZATION_TOP_1_ACCURACY[
'RESNET50_FINETUNE_PRUNING'][0],
top_1_max=MODEL_OPTIMIZATION_TOP_1_ACCURACY[
'RESNET50_FINETUNE_PRUNING'][1]):
start_time_sec = time.time()
stats = resnet_imagenet_main.run(flags.FLAGS)
wall_time_sec = time.time() - start_time_sec
super(KerasPruningAccuracyBase, self)._report_benchmark(
stats,
wall_time_sec,
top_1_min=top_1_min,
top_1_max=top_1_max,
total_batch_size=FLAGS.batch_size,
log_steps=100)
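# The pruning benchmarks in this file all use the 'polynomial_decay' schedule.
# Assuming it maps to TensorFlow Model Optimization's PolynomialDecay with its
# default exponent of 3, sparsity ramps between begin_step and end_step as
#   sparsity(t) = final + (initial - final) *
#                 (1 - (t - begin_step) / (end_step - begin_step)) ** 3
# re-evaluated every `pruning_frequency` steps; e.g. the ResNet50 settings
# grow sparsity from 0.0 to 0.5 over the first 50000 steps.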
class MobilenetV1KerasPruningAccuracy(KerasPruningAccuracyBase):
"""Benchmark accuracy tests for MobilenetV1 with pruning method."""
def __init__(self, root_data_dir=None, **kwargs):
default_flags = {
'model': 'mobilenet',
'optimizer': 'mobilenet_default',
'initial_learning_rate_per_sample': 0.00007,
'pretrained_filepath': tf.train.latest_checkpoint(
os.path.join(root_data_dir, 'mobilenet_v1')),
'pruning_begin_step': 0,
'pruning_end_step': 100000,
'pruning_initial_sparsity': 0.0,
'pruning_final_sparsity': 0.5,
'pruning_frequency': 100,
}
super(MobilenetV1KerasPruningAccuracy, self).__init__(
root_data_dir=root_data_dir,
default_flags=default_flags,
**kwargs)
def _run_and_report_benchmark(self):
    super(MobilenetV1KerasPruningAccuracy, self)._run_and_report_benchmark(
        top_1_min=MODEL_OPTIMIZATION_TOP_1_ACCURACY[
            'MOBILENET_V1_FINETUNE_PRUNING'][0],
        top_1_max=MODEL_OPTIMIZATION_TOP_1_ACCURACY[
            'MOBILENET_V1_FINETUNE_PRUNING'][1])
class Resnet50KerasPruningAccuracy(KerasPruningAccuracyBase):
"""Benchmark accuracy tests for resnet50 with pruning method."""
def __init__(self, root_data_dir=None, **kwargs):
default_flags = {
'model': 'resnet50_v1.5',
'optimizer': 'mobilenet_default',
'initial_learning_rate_per_sample': 0.0000039,
'pretrained_filepath': tf.train.latest_checkpoint(
os.path.join(root_data_dir, 'resnet50')),
'pruning_begin_step': 0,
'pruning_end_step': 50000,
'pruning_initial_sparsity': 0.0,
'pruning_final_sparsity': 0.5,
'pruning_frequency': 100,
}
super(Resnet50KerasPruningAccuracy, self).__init__(
root_data_dir=root_data_dir,
default_flags=default_flags,
**kwargs)
def _run_and_report_benchmark(self):
    super(Resnet50KerasPruningAccuracy, self)._run_and_report_benchmark(
        top_1_min=MODEL_OPTIMIZATION_TOP_1_ACCURACY[
            'RESNET50_FINETUNE_PRUNING'][0],
        top_1_max=MODEL_OPTIMIZATION_TOP_1_ACCURACY[
            'RESNET50_FINETUNE_PRUNING'][1])
class KerasPruningBenchmarkRealBase(Resnet50KerasBenchmarkBase):
"""Pruning method benchmarks."""
def __init__(self, root_data_dir=None, default_flags=None, **kwargs):
if default_flags is None:
default_flags = {}
default_flags.update({
'skip_eval': True,
'report_accuracy_metrics': False,
'data_dir': os.path.join(root_data_dir, 'imagenet'),
'train_steps': 110,
'log_steps': 10,
'pruning_method': 'polynomial_decay',
'pruning_begin_step': 0,
'pruning_end_step': 50000,
'pruning_initial_sparsity': 0,
'pruning_final_sparsity': 0.5,
'pruning_frequency': 100,
})
super(KerasPruningBenchmarkRealBase, self).__init__(
default_flags=default_flags, **kwargs)
class MobilenetV1KerasPruningBenchmarkReal(KerasPruningBenchmarkRealBase):
"""Pruning method benchmarks for MobilenetV1."""
def __init__(self, **kwargs):
default_flags = {
'model': 'mobilenet',
'optimizer': 'mobilenet_default',
}
super(MobilenetV1KerasPruningBenchmarkReal, self).__init__(
default_flags=default_flags, **kwargs)
class Resnet50KerasPruningBenchmarkReal(KerasPruningBenchmarkRealBase):
"""Pruning method benchmarks for resnet50."""
def __init__(self, **kwargs):
default_flags = {
'model': 'resnet50_v1.5',
'optimizer': 'mobilenet_default',
}
super(Resnet50KerasPruningBenchmarkReal, self).__init__(
default_flags=default_flags, **kwargs)
if __name__ == '__main__':
tf.test.main()