# Lint as: python3
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Dataset utilities for vision tasks using TFDS and tf.data.Dataset."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import os
from typing import Any, List, Optional, Tuple, Mapping, Union
from absl import logging
from dataclasses import dataclass
import tensorflow as tf
import tensorflow_datasets as tfds
from official.modeling.hyperparams import base_config
from official.vision.image_classification import augment
from official.vision.image_classification import preprocessing
AUGMENTERS = {
'autoaugment': augment.AutoAugment,
'randaugment': augment.RandAugment,
}
@dataclass
class AugmentConfig(base_config.Config):
"""Configuration for image augmenters.
Attributes:
name: The name of the image augmentation to use. Possible options are
None (default), 'autoaugment', or 'randaugment'.
    params: Any parameters used to initialize the augmenter.
"""
name: Optional[str] = None
params: Optional[Mapping[str, Any]] = None
  def build(self) -> Optional[augment.ImageAugment]:
"""Build the augmenter using this config."""
params = self.params or {}
augmenter = AUGMENTERS.get(self.name, None)
return augmenter(**params) if augmenter is not None else None
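# Illustrative sketch (not part of the original module): constructing an
# augmenter from an AugmentConfig. The `num_layers`/`magnitude` parameters are
# assumed RandAugment arguments; check `augment.RandAugment` for the exact
# signature before relying on them.
#
#   config = AugmentConfig(name='randaugment',
#                          params={'num_layers': 2, 'magnitude': 10})
#   augmenter = config.build()  # an augment.RandAugment instance
#   # image = augmenter.distort(image)  # applied per image during preprocessing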
@dataclass
class DatasetConfig(base_config.Config):
"""The base configuration for building datasets.
Attributes:
name: The name of the Dataset. Usually should correspond to a TFDS dataset.
data_dir: The path where the dataset files are stored, if available.
filenames: Optional list of strings representing the TFRecord names.
builder: The builder type used to load the dataset. Value should be one of
'tfds' (load using TFDS), 'records' (load from TFRecords), or 'synthetic'
(generate dummy synthetic data without reading from files).
split: The split of the dataset. Usually 'train', 'validation', or 'test'.
image_size: The size of the image in the dataset. This assumes that
`width` == `height`. Set to 'infer' to infer the image size from TFDS
info. This requires `name` to be a registered dataset in TFDS.
    num_classes: The number of classes given by the dataset. Set to 'infer'
      to infer the number of classes from TFDS info. This requires `name` to
      be a registered dataset in TFDS.
    num_channels: The number of channels given by the dataset. Set to 'infer'
      to infer the number of channels from TFDS info. This requires `name` to
      be a registered dataset in TFDS.
    num_examples: The number of examples given by the dataset. Set to 'infer'
      to infer the number of examples from TFDS info. This requires `name` to
      be a registered dataset in TFDS.
batch_size: The base batch size for the dataset.
use_per_replica_batch_size: Whether to scale the batch size based on
available resources. If set to `True`, the dataset builder will return
batch_size multiplied by `num_devices`, the number of device replicas
(e.g., the number of GPUs or TPU cores). This setting should be `True` if
the strategy argument is passed to `build()` and `num_devices > 1`.
num_devices: The number of replica devices to use. This should be set by
`strategy.num_replicas_in_sync` when using a distribution strategy.
dtype: The desired dtype of the dataset. This will be set during
preprocessing.
one_hot: Whether to apply one hot encoding. Set to `True` to be able to use
label smoothing.
augmenter: The augmenter config to use. No augmentation is used by default.
download: Whether to download data using TFDS.
shuffle_buffer_size: The buffer size used for shuffling training data.
file_shuffle_buffer_size: The buffer size used for shuffling raw training
files.
skip_decoding: Whether to skip image decoding when loading from TFDS.
    cache: Whether to cache dataset examples. Can be used to avoid re-reading
      from disk on subsequent epochs. Requires significant memory overhead.
tf_data_service: The URI of a tf.data service to offload preprocessing onto
during training. The URI should be in the format "protocol://address",
e.g. "grpc://tf-data-service:5050".
    mean_subtract: Whether to apply mean subtraction to the dataset.
    standardize: Whether to apply standardization to the dataset.
"""
name: Optional[str] = None
data_dir: Optional[str] = None
filenames: Optional[List[str]] = None
builder: str = 'tfds'
split: str = 'train'
image_size: Union[int, str] = 'infer'
num_classes: Union[int, str] = 'infer'
num_channels: Union[int, str] = 'infer'
num_examples: Union[int, str] = 'infer'
batch_size: int = 128
use_per_replica_batch_size: bool = True
num_devices: int = 1
dtype: str = 'float32'
one_hot: bool = True
augmenter: AugmentConfig = AugmentConfig()
download: bool = False
shuffle_buffer_size: int = 10000
file_shuffle_buffer_size: int = 1024
skip_decoding: bool = True
cache: bool = False
tf_data_service: Optional[str] = None
mean_subtract: bool = False
standardize: bool = False
@property
def has_data(self):
"""Whether this dataset is has any data associated with it."""
return self.name or self.data_dir or self.filenames
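# Illustrative sketch (assumption, not original code): a DatasetConfig for
# TFRecords on disk, with the normally-inferred fields set explicitly since no
# TFDS metadata is available. The data path is hypothetical.
#
#   config = DatasetConfig(builder='records',
#                          data_dir='/path/to/imagenet/tfrecords',
#                          split='train',
#                          image_size=224,
#                          num_classes=1000,
#                          num_channels=3,
#                          num_examples=1281167,
#                          batch_size=128)
#   assert config.has_data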
@dataclass
class ImageNetConfig(DatasetConfig):
"""The base ImageNet dataset config."""
name: str = 'imagenet2012'
# Note: for large datasets like ImageNet, using records is faster than tfds
builder: str = 'records'
image_size: int = 224
batch_size: int = 128
@dataclass
class Cifar10Config(DatasetConfig):
"""The base CIFAR-10 dataset config."""
name: str = 'cifar10'
image_size: int = 224
batch_size: int = 128
download: bool = True
cache: bool = True
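# Illustrative sketch (assumption, not original code): configs are dataclasses,
# so variants can be derived with `replace`, the same mechanism DatasetBuilder
# uses to apply keyword overrides in its constructor.
#
#   train_config = Cifar10Config(split='train')
#   eval_config = train_config.replace(split='test', one_hot=False)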
class DatasetBuilder:
"""An object for building datasets.
  Allows building various pipelines for fetching, preprocessing, and batching
  examples. Maintains additional state information derived from the dataset,
  such as the training split, batch size, and number of steps (batches).
"""
def __init__(self, config: DatasetConfig, **overrides: Any):
"""Initialize the builder from the config."""
self.config = config.replace(**overrides)
self.builder_info = None
if self.config.augmenter is not None:
logging.info('Using augmentation: %s', self.config.augmenter.name)
self.augmenter = self.config.augmenter.build()
else:
self.augmenter = None
@property
def is_training(self) -> bool:
"""Whether this is the training set."""
return self.config.split == 'train'
@property
def batch_size(self) -> int:
"""The batch size, multiplied by the number of replicas (if configured)."""
if self.config.use_per_replica_batch_size:
return self.config.batch_size * self.config.num_devices
else:
return self.config.batch_size
@property
def global_batch_size(self):
"""The global batch size across all replicas."""
return self.batch_size
@property
def local_batch_size(self):
"""The base unscaled batch size."""
if self.config.use_per_replica_batch_size:
return self.config.batch_size
else:
return self.config.batch_size // self.config.num_devices
@property
def num_steps(self) -> int:
"""The number of steps (batches) to exhaust this dataset."""
# Always divide by the global batch size to get the correct # of steps
return self.num_examples // self.global_batch_size
@property
def dtype(self) -> tf.dtypes.DType:
"""Converts the config's dtype string to a tf dtype.
Returns:
      The `tf.dtypes.DType` corresponding to the config's dtype string.
Raises:
ValueError if the config's dtype is not supported.
"""
dtype_map = {
'float32': tf.float32,
'bfloat16': tf.bfloat16,
'float16': tf.float16,
'fp32': tf.float32,
'bf16': tf.bfloat16,
}
try:
return dtype_map[self.config.dtype]
    except KeyError:
raise ValueError('Invalid DType provided. Supported types: {}'.format(
dtype_map.keys()))
@property
def image_size(self) -> int:
"""The size of each image (can be inferred from the dataset)."""
if self.config.image_size == 'infer':
return self.info.features['image'].shape[0]
else:
return int(self.config.image_size)
@property
def num_channels(self) -> int:
"""The number of image channels (can be inferred from the dataset)."""
if self.config.num_channels == 'infer':
return self.info.features['image'].shape[-1]
else:
return int(self.config.num_channels)
@property
def num_examples(self) -> int:
"""The number of examples (can be inferred from the dataset)."""
if self.config.num_examples == 'infer':
return self.info.splits[self.config.split].num_examples
else:
return int(self.config.num_examples)
@property
def num_classes(self) -> int:
"""The number of classes (can be inferred from the dataset)."""
if self.config.num_classes == 'infer':
return self.info.features['label'].num_classes
else:
return int(self.config.num_classes)
@property
def info(self) -> tfds.core.DatasetInfo:
"""The TFDS dataset info, if available."""
if self.builder_info is None:
self.builder_info = tfds.builder(self.config.name).info
return self.builder_info
  def build(self, strategy: Optional[tf.distribute.Strategy] = None
           ) -> tf.data.Dataset:
"""Construct a dataset end-to-end and return it using an optional strategy.
Args:
strategy: a strategy that, if passed, will distribute the dataset
according to that strategy. If passed and `num_devices > 1`,
`use_per_replica_batch_size` must be set to `True`.
Returns:
A TensorFlow dataset outputting batched images and labels.
"""
if strategy:
if strategy.num_replicas_in_sync != self.config.num_devices:
        logging.warning(
            'Passed a strategy with %d devices, but expected %d devices.',
            strategy.num_replicas_in_sync, self.config.num_devices)
dataset = strategy.experimental_distribute_datasets_from_function(
self._build)
else:
dataset = self._build()
return dataset
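  # Illustrative usage sketch (assumption, not original code): pairing the
  # builder with a distribution strategy. `num_devices` should match
  # `strategy.num_replicas_in_sync`, and `use_per_replica_batch_size=True`
  # keeps `batch_size` as a per-replica value.
  #
  #   strategy = tf.distribute.MirroredStrategy()
  #   builder = DatasetBuilder(
  #       Cifar10Config(split='train'),
  #       num_devices=strategy.num_replicas_in_sync,
  #       use_per_replica_batch_size=True)
  #   dataset = builder.build(strategy)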
  def _build(self, input_context: Optional[tf.distribute.InputContext] = None
            ) -> tf.data.Dataset:
"""Construct a dataset end-to-end and return it.
Args:
input_context: An optional context provided by `tf.distribute` for
cross-replica training.
Returns:
A TensorFlow dataset outputting batched images and labels.
"""
builders = {
'tfds': self.load_tfds,
'records': self.load_records,
'synthetic': self.load_synthetic,
}
builder = builders.get(self.config.builder, None)
if builder is None:
raise ValueError('Unknown builder type {}'.format(self.config.builder))
self.input_context = input_context
dataset = builder()
dataset = self.pipeline(dataset)
return dataset
def load_tfds(self) -> tf.data.Dataset:
"""Return a dataset loading files from TFDS."""
logging.info('Using TFDS to load data.')
builder = tfds.builder(self.config.name,
data_dir=self.config.data_dir)
if self.config.download:
builder.download_and_prepare()
decoders = {}
if self.config.skip_decoding:
decoders['image'] = tfds.decode.SkipDecoding()
read_config = tfds.ReadConfig(
interleave_cycle_length=10,
interleave_block_length=1,
input_context=self.input_context)
dataset = builder.as_dataset(
split=self.config.split,
as_supervised=True,
shuffle_files=True,
decoders=decoders,
read_config=read_config)
return dataset
def load_records(self) -> tf.data.Dataset:
"""Return a dataset loading files with TFRecords."""
logging.info('Using TFRecords to load data.')
if self.config.filenames is None:
if self.config.data_dir is None:
raise ValueError('Dataset must specify a path for the data files.')
file_pattern = os.path.join(self.config.data_dir,
'{}*'.format(self.config.split))
dataset = tf.data.Dataset.list_files(file_pattern, shuffle=False)
else:
dataset = tf.data.Dataset.from_tensor_slices(self.config.filenames)
return dataset
def load_synthetic(self) -> tf.data.Dataset:
"""Return a dataset generating dummy synthetic data."""
logging.info('Generating a synthetic dataset.')
def generate_data(_):
image = tf.zeros([self.image_size, self.image_size, self.num_channels],
dtype=self.dtype)
label = tf.zeros([1], dtype=tf.int32)
return image, label
dataset = tf.data.Dataset.range(1)
dataset = dataset.repeat()
dataset = dataset.map(generate_data,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
return dataset
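  # Illustrative sketch (assumption, not original code): the synthetic builder
  # produces all-zero images without touching disk, which makes it useful for
  # separating input-pipeline cost from model compute when benchmarking.
  #
  #   builder = DatasetBuilder(
  #       DatasetConfig(builder='synthetic', split='train', image_size=224,
  #                     num_classes=1000, num_channels=3, num_examples=1024))
  #   ds = builder.build()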
def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
"""Build a pipeline fetching, shuffling, and preprocessing the dataset.
Args:
dataset: A `tf.data.Dataset` that loads raw files.
Returns:
A TensorFlow dataset outputting batched images and labels.
"""
if (self.config.builder != 'tfds' and self.input_context
and self.input_context.num_input_pipelines > 1):
dataset = dataset.shard(self.input_context.num_input_pipelines,
self.input_context.input_pipeline_id)
logging.info('Sharding the dataset: input_pipeline_id=%d '
'num_input_pipelines=%d',
                   self.input_context.input_pipeline_id,
                   self.input_context.num_input_pipelines)
if self.is_training and self.config.builder == 'records':
# Shuffle the input files.
      dataset = dataset.shuffle(
          buffer_size=self.config.file_shuffle_buffer_size)
if self.is_training and not self.config.cache:
dataset = dataset.repeat()
if self.config.builder == 'records':
# Read the data from disk in parallel
dataset = dataset.interleave(
tf.data.TFRecordDataset,
cycle_length=10,
block_length=1,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
if self.config.cache:
dataset = dataset.cache()
if self.is_training:
dataset = dataset.shuffle(self.config.shuffle_buffer_size)
dataset = dataset.repeat()
# Parse, pre-process, and batch the data in parallel
if self.config.builder == 'records':
preprocess = self.parse_record
else:
preprocess = self.preprocess
dataset = dataset.map(preprocess,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
if self.input_context and self.config.num_devices > 1:
if not self.config.use_per_replica_batch_size:
raise ValueError(
'The builder does not support a global batch size with more than '
'one replica. Got {} replicas. Please set a '
'`per_replica_batch_size` and enable '
'`use_per_replica_batch_size=True`.'.format(
self.config.num_devices))
# The batch size of the dataset will be multiplied by the number of
# replicas automatically when strategy.distribute_datasets_from_function
# is called, so we use local batch size here.
dataset = dataset.batch(self.local_batch_size,
drop_remainder=self.is_training)
else:
dataset = dataset.batch(self.global_batch_size,
drop_remainder=self.is_training)
# Prefetch overlaps in-feed with training
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
if self.config.tf_data_service:
if not hasattr(tf.data.experimental, 'service'):
        raise ValueError('The tf_data_service flag requires TensorFlow version '
'>= 2.3.0, but the version is {}'.format(
tf.__version__))
dataset = dataset.apply(
tf.data.experimental.service.distribute(
processing_mode='parallel_epochs',
service=self.config.tf_data_service,
job_name='resnet_train'))
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
return dataset
def parse_record(self, record: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
"""Parse an ImageNet record from a serialized string Tensor."""
keys_to_features = {
'image/encoded':
tf.io.FixedLenFeature((), tf.string, ''),
'image/format':
tf.io.FixedLenFeature((), tf.string, 'jpeg'),
'image/class/label':
tf.io.FixedLenFeature([], tf.int64, -1),
'image/class/text':
tf.io.FixedLenFeature([], tf.string, ''),
'image/object/bbox/xmin':
tf.io.VarLenFeature(dtype=tf.float32),
'image/object/bbox/ymin':
tf.io.VarLenFeature(dtype=tf.float32),
'image/object/bbox/xmax':
tf.io.VarLenFeature(dtype=tf.float32),
'image/object/bbox/ymax':
tf.io.VarLenFeature(dtype=tf.float32),
'image/object/class/label':
tf.io.VarLenFeature(dtype=tf.int64),
}
parsed = tf.io.parse_single_example(record, keys_to_features)
label = tf.reshape(parsed['image/class/label'], shape=[1])
# Subtract one so that labels are in [0, 1000)
label -= 1
image_bytes = tf.reshape(parsed['image/encoded'], shape=[])
image, label = self.preprocess(image_bytes, label)
return image, label
def preprocess(self, image: tf.Tensor, label: tf.Tensor
) -> Tuple[tf.Tensor, tf.Tensor]:
"""Apply image preprocessing and augmentation to the image and label."""
if self.is_training:
image = preprocessing.preprocess_for_train(
image,
image_size=self.image_size,
mean_subtract=self.config.mean_subtract,
standardize=self.config.standardize,
dtype=self.dtype,
augmenter=self.augmenter)
else:
image = preprocessing.preprocess_for_eval(
image,
image_size=self.image_size,
num_channels=self.num_channels,
mean_subtract=self.config.mean_subtract,
standardize=self.config.standardize,
dtype=self.dtype)
label = tf.cast(label, tf.int32)
if self.config.one_hot:
label = tf.one_hot(label, self.num_classes)
label = tf.reshape(label, [self.num_classes])
return image, label
@classmethod
def from_params(cls, *args, **kwargs):
"""Construct a dataset builder from a default config and any overrides."""
config = DatasetConfig.from_args(*args, **kwargs)
return cls(config)
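# Illustrative sketch (assumption, not original code): `from_params` forwards
# its arguments to `DatasetConfig.from_args`, so the typical usage is keyword
# overrides of the default config.
#
#   builder = DatasetBuilder.from_params(name='cifar10', split='test',
#                                        batch_size=64)
#   ds = builder.build()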