# Lint as: python3
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Dataset utilities for vision tasks using TFDS and tf.data.Dataset."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import os
from typing import Any, List, Optional, Tuple, Mapping, Union
from absl import logging
from dataclasses import dataclass
import tensorflow as tf
import tensorflow_datasets as tfds
from official.modeling.hyperparams import base_config
from official.vision.image_classification import augment
from official.vision.image_classification import preprocessing
AUGMENTERS = {
'autoaugment': augment.AutoAugment,
'randaugment': augment.RandAugment,
}
@dataclass
class AugmentConfig(base_config.Config):
"""Configuration for image augmenters.
Attributes:
name: The name of the image augmentation to use. Possible options are
None (default), 'autoaugment', or 'randaugment'.
    params: Any parameters used to initialize the augmenter.
"""
name: Optional[str] = None
params: Optional[Mapping[str, Any]] = None
  def build(self) -> Optional[augment.ImageAugment]:
"""Build the augmenter using this config."""
params = self.params or {}
augmenter = AUGMENTERS.get(self.name, None)
return augmenter(**params) if augmenter is not None else None
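# Illustrative sketch (not part of the original module): constructing an
# augmenter from an AugmentConfig. The `num_layers`/`magnitude` parameters are
# assumed RandAugment arguments; check `augment.RandAugment` for the exact
# signature before relying on them.
#
#   config = AugmentConfig(name='randaugment',
#                          params={'num_layers': 2, 'magnitude': 10})
#   augmenter = config.build()  # an augment.RandAugment instance
#   # image = augmenter.distort(image)  # applied per image during preprocessing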
@dataclass
class DatasetConfig(base_config.Config):
"""The base configuration for building datasets.
Attributes:
name: The name of the Dataset. Usually should correspond to a TFDS dataset.
data_dir: The path where the dataset files are stored, if available.
filenames: Optional list of strings representing the TFRecord names.
builder: The builder type used to load the dataset. Value should be one of
'tfds' (load using TFDS), 'records' (load from TFRecords), or 'synthetic'
(generate dummy synthetic data without reading from files).
split: The split of the dataset. Usually 'train', 'validation', or 'test'.
image_size: The size of the image in the dataset. This assumes that
`width` == `height`. Set to 'infer' to infer the image size from TFDS
info. This requires `name` to be a registered dataset in TFDS.
    num_classes: The number of classes given by the dataset. Set to 'infer'
      to infer the number of classes from TFDS info. This requires `name` to
      be a registered dataset in TFDS.
    num_channels: The number of channels given by the dataset. Set to 'infer'
      to infer the number of channels from TFDS info. This requires `name` to
      be a registered dataset in TFDS.
    num_examples: The number of examples given by the dataset. Set to 'infer'
      to infer the number of examples from TFDS info. This requires `name` to
      be a registered dataset in TFDS.
batch_size: The base batch size for the dataset.
use_per_replica_batch_size: Whether to scale the batch size based on
available resources. If set to `True`, the dataset builder will return
batch_size multiplied by `num_devices`, the number of device replicas
(e.g., the number of GPUs or TPU cores). This setting should be `True` if
the strategy argument is passed to `build()` and `num_devices > 1`.
num_devices: The number of replica devices to use. This should be set by
`strategy.num_replicas_in_sync` when using a distribution strategy.
dtype: The desired dtype of the dataset. This will be set during
preprocessing.
one_hot: Whether to apply one hot encoding. Set to `True` to be able to use
label smoothing.
augmenter: The augmenter config to use. No augmentation is used by default.
download: Whether to download data using TFDS.
shuffle_buffer_size: The buffer size used for shuffling training data.
file_shuffle_buffer_size: The buffer size used for shuffling raw training
files.
skip_decoding: Whether to skip image decoding when loading from TFDS.
    cache: Whether to cache dataset examples. Can be used to avoid re-reading
      from disk on subsequent epochs. Requires significant memory overhead.
tf_data_service: The URI of a tf.data service to offload preprocessing onto
during training. The URI should be in the format "protocol://address",
e.g. "grpc://tf-data-service:5050".
    mean_subtract: Whether to apply mean subtraction to the dataset.
    standardize: Whether to apply standardization to the dataset.
"""
name: Optional[str] = None
data_dir: Optional[str] = None
filenames: Optional[List[str]] = None
builder: str = 'tfds'
split: str = 'train'
image_size: Union[int, str] = 'infer'
num_classes: Union[int, str] = 'infer'
num_channels: Union[int, str] = 'infer'
num_examples: Union[int, str] = 'infer'
batch_size: int = 128
use_per_replica_batch_size: bool = True
num_devices: int = 1
dtype: str = 'float32'
one_hot: bool = True
augmenter: AugmentConfig = AugmentConfig()
download: bool = False
shuffle_buffer_size: int = 10000
file_shuffle_buffer_size: int = 1024
skip_decoding: bool = True
cache: bool = False
tf_data_service: Optional[str] = None
mean_subtract: bool = False
standardize: bool = False
@property
def has_data(self):
"""Whether this dataset is has any data associated with it."""
return self.name or self.data_dir or self.filenames
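# Illustrative sketch (assumption, not original code): a DatasetConfig for
# TFRecords on disk, with the normally-inferred fields set explicitly since no
# TFDS metadata is available. The data path is hypothetical.
#
#   config = DatasetConfig(builder='records',
#                          data_dir='/path/to/imagenet/tfrecords',
#                          split='train',
#                          image_size=224,
#                          num_classes=1000,
#                          num_channels=3,
#                          num_examples=1281167,
#                          batch_size=128)
#   assert config.has_data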
@dataclass
class ImageNetConfig(DatasetConfig):
"""The base ImageNet dataset config."""
name: str = 'imagenet2012'
# Note: for large datasets like ImageNet, using records is faster than tfds
builder: str = 'records'
image_size: int = 224
batch_size: int = 128
@dataclass
class Cifar10Config(DatasetConfig):
"""The base CIFAR-10 dataset config."""
name: str = 'cifar10'
image_size: int = 224
batch_size: int = 128
download: bool = True
cache: bool = True
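# Illustrative sketch (assumption, not original code): configs are dataclasses,
# so variants can be derived with `replace`, the same mechanism DatasetBuilder
# uses to apply keyword overrides in its constructor.
#
#   train_config = Cifar10Config(split='train')
#   eval_config = train_config.replace(split='test', one_hot=False)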
class DatasetBuilder:
"""An object for building datasets.
  Allows building various pipelines for fetching, preprocessing, and batching
  examples. Maintains additional state information derived from the dataset,
  such as the training split, batch size, and number of steps (batches).
"""
def __init__(self, config: DatasetConfig, **overrides: Any):
"""Initialize the builder from the config."""
self.config = config.replace(**overrides)
self.builder_info = None
if self.config.augmenter is not None:
logging.info('Using augmentation: %s', self.config.augmenter.name)
self.augmenter = self.config.augmenter.build()
else:
self.augmenter = None
@property
def is_training(self) -> bool:
"""Whether this is the training set."""
return self.config.split == 'train'
@property
def batch_size(self) -> int:
"""The batch size, multiplied by the number of replicas (if configured)."""
if self.config.use_per_replica_batch_size:
return self.config.batch_size * self.config.num_devices
else:
return self.config.batch_size
@property
def global_batch_size(self):
"""The global batch size across all replicas."""
return self.batch_size
@property
def local_batch_size(self):
"""The base unscaled batch size."""
if self.config.use_per_replica_batch_size:
return self.config.batch_size
else:
return self.config.batch_size // self.config.num_devices
@property
def num_steps(self) -> int:
"""The number of steps (batches) to exhaust this dataset."""
# Always divide by the global batch size to get the correct # of steps
return self.num_examples // self.global_batch_size
@property
def dtype(self) -> tf.dtypes.DType:
"""Converts the config's dtype string to a tf dtype.
Returns:
      The `tf.dtypes.DType` corresponding to the config's dtype string.
Raises:
ValueError if the config's dtype is not supported.
"""
dtype_map = {
'float32': tf.float32,
'bfloat16': tf.bfloat16,
'float16': tf.float16,
'fp32': tf.float32,
'bf16': tf.bfloat16,
}
try:
return dtype_map[self.config.dtype]
    except KeyError:
raise ValueError('Invalid DType provided. Supported types: {}'.format(
dtype_map.keys()))
@property
def image_size(self) -> int:
"""The size of each image (can be inferred from the dataset)."""
if self.config.image_size == 'infer':
return self.info.features['image'].shape[0]
else:
return int(self.config.image_size)
@property
def num_channels(self) -> int:
"""The number of image channels (can be inferred from the dataset)."""
if self.config.num_channels == 'infer':
return self.info.features['image'].shape[-1]
else:
return int(self.config.num_channels)
@property
def num_examples(self) -> int:
"""The number of examples (can be inferred from the dataset)."""
if self.config.num_examples == 'infer':
return self.info.splits[self.config.split].num_examples
else:
return int(self.config.num_examples)
@property
def num_classes(self) -> int:
"""The number of classes (can be inferred from the dataset)."""
if self.config.num_classes == 'infer':
return self.info.features['label'].num_classes
else:
return int(self.config.num_classes)
@property
def info(self) -> tfds.core.DatasetInfo:
"""The TFDS dataset info, if available."""
if self.builder_info is None:
self.builder_info = tfds.builder(self.config.name).info
return self.builder_info
  def build(self, strategy: Optional[tf.distribute.Strategy] = None
           ) -> tf.data.Dataset:
"""Construct a dataset end-to-end and return it using an optional strategy.
Args:
strategy: a strategy that, if passed, will distribute the dataset
according to that strategy. If passed and `num_devices > 1`,
`use_per_replica_batch_size` must be set to `True`.
Returns:
A TensorFlow dataset outputting batched images and labels.
"""
if strategy:
if strategy.num_replicas_in_sync != self.config.num_devices:
        logging.warning(
            'Passed a strategy with %d devices, but expected %d devices.',
            strategy.num_replicas_in_sync, self.config.num_devices)
dataset = strategy.experimental_distribute_datasets_from_function(
self._build)
else:
dataset = self._build()
return dataset
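  # Illustrative usage sketch (assumption, not original code): pairing the
  # builder with a distribution strategy. `num_devices` should match
  # `strategy.num_replicas_in_sync`, and `use_per_replica_batch_size=True`
  # keeps `batch_size` as a per-replica value.
  #
  #   strategy = tf.distribute.MirroredStrategy()
  #   builder = DatasetBuilder(
  #       Cifar10Config(split='train'),
  #       num_devices=strategy.num_replicas_in_sync,
  #       use_per_replica_batch_size=True)
  #   dataset = builder.build(strategy)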
  def _build(self, input_context: Optional[tf.distribute.InputContext] = None
            ) -> tf.data.Dataset:
"""Construct a dataset end-to-end and return it.
Args:
input_context: An optional context provided by `tf.distribute` for
cross-replica training.
Returns:
A TensorFlow dataset outputting batched images and labels.
"""
builders = {
'tfds': self.load_tfds,
'records': self.load_records,
'synthetic': self.load_synthetic,
}
builder = builders.get(self.config.builder, None)
if builder is None:
raise ValueError('Unknown builder type {}'.format(self.config.builder))
self.input_context = input_context
dataset = builder()
dataset = self.pipeline(dataset)
return dataset
def load_tfds(self) -> tf.data.Dataset:
"""Return a dataset loading files from TFDS."""
logging.info('Using TFDS to load data.')
builder = tfds.builder(self.config.name,
data_dir=self.config.data_dir)
if self.config.download:
builder.download_and_prepare()
decoders = {}
if self.config.skip_decoding:
decoders['image'] = tfds.decode.SkipDecoding()
read_config = tfds.ReadConfig(
interleave_cycle_length=10,
interleave_block_length=1,
input_context=self.input_context)
dataset = builder.as_dataset(
split=self.config.split,
as_supervised=True,
shuffle_files=True,
decoders=decoders,
read_config=read_config)
return dataset
def load_records(self) -> tf.data.Dataset:
"""Return a dataset loading files with TFRecords."""
logging.info('Using TFRecords to load data.')
if self.config.filenames is None:
if self.config.data_dir is None:
raise ValueError('Dataset must specify a path for the data files.')
file_pattern = os.path.join(self.config.data_dir,
'{}*'.format(self.config.split))
dataset = tf.data.Dataset.list_files(file_pattern, shuffle=False)
else:
dataset = tf.data.Dataset.from_tensor_slices(self.config.filenames)
return dataset
def load_synthetic(self) -> tf.data.Dataset:
"""Return a dataset generating dummy synthetic data."""
logging.info('Generating a synthetic dataset.')
def generate_data(_):
image = tf.zeros([self.image_size, self.image_size, self.num_channels],
dtype=self.dtype)
label = tf.zeros([1], dtype=tf.int32)
return image, label
dataset = tf.data.Dataset.range(1)
dataset = dataset.repeat()
dataset = dataset.map(generate_data,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
return dataset
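  # Illustrative sketch (assumption, not original code): the synthetic builder
  # produces all-zero images without touching disk, which makes it useful for
  # separating input-pipeline cost from model compute when benchmarking.
  #
  #   builder = DatasetBuilder(
  #       DatasetConfig(builder='synthetic', split='train', image_size=224,
  #                     num_classes=1000, num_channels=3, num_examples=1024))
  #   ds = builder.build()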
def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
"""Build a pipeline fetching, shuffling, and preprocessing the dataset.
Args:
dataset: A `tf.data.Dataset` that loads raw files.
Returns:
A TensorFlow dataset outputting batched images and labels.
"""
if (self.config.builder != 'tfds' and self.input_context
and self.input_context.num_input_pipelines > 1):
dataset = dataset.shard(self.input_context.num_input_pipelines,
self.input_context.input_pipeline_id)
logging.info('Sharding the dataset: input_pipeline_id=%d '
'num_input_pipelines=%d',
                   self.input_context.input_pipeline_id,
                   self.input_context.num_input_pipelines)
if self.is_training and self.config.builder == 'records':
# Shuffle the input files.
      dataset = dataset.shuffle(
          buffer_size=self.config.file_shuffle_buffer_size)
if self.is_training and not self.config.cache:
dataset = dataset.repeat()
if self.config.builder == 'records':
# Read the data from disk in parallel
dataset = dataset.interleave(
tf.data.TFRecordDataset,
cycle_length=10,
block_length=1,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
if self.config.cache:
dataset = dataset.cache()
if self.is_training:
dataset = dataset.shuffle(self.config.shuffle_buffer_size)
dataset = dataset.repeat()
# Parse, pre-process, and batch the data in parallel
if self.config.builder == 'records':
preprocess = self.parse_record
else:
preprocess = self.preprocess
dataset = dataset.map(preprocess,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
if self.input_context and self.config.num_devices > 1:
if not self.config.use_per_replica_batch_size:
raise ValueError(
'The builder does not support a global batch size with more than '
'one replica. Got {} replicas. Please set a '
'`per_replica_batch_size` and enable '
'`use_per_replica_batch_size=True`.'.format(
self.config.num_devices))
# The batch size of the dataset will be multiplied by the number of
# replicas automatically when strategy.distribute_datasets_from_function
# is called, so we use local batch size here.
dataset = dataset.batch(self.local_batch_size,
drop_remainder=self.is_training)
else:
dataset = dataset.batch(self.global_batch_size,
drop_remainder=self.is_training)
# Prefetch overlaps in-feed with training
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
if self.config.tf_data_service:
if not hasattr(tf.data.experimental, 'service'):
        raise ValueError('The tf_data_service flag requires TensorFlow version '
'>= 2.3.0, but the version is {}'.format(
tf.__version__))
dataset = dataset.apply(
tf.data.experimental.service.distribute(
processing_mode='parallel_epochs',
service=self.config.tf_data_service,
job_name='resnet_train'))
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
return dataset
def parse_record(self, record: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
"""Parse an ImageNet record from a serialized string Tensor."""
keys_to_features = {
'image/encoded':
tf.io.FixedLenFeature((), tf.string, ''),
'image/format':
tf.io.FixedLenFeature((), tf.string, 'jpeg'),
'image/class/label':
tf.io.FixedLenFeature([], tf.int64, -1),
'image/class/text':
tf.io.FixedLenFeature([], tf.string, ''),
'image/object/bbox/xmin':
tf.io.VarLenFeature(dtype=tf.float32),
'image/object/bbox/ymin':
tf.io.VarLenFeature(dtype=tf.float32),
'image/object/bbox/xmax':
tf.io.VarLenFeature(dtype=tf.float32),
'image/object/bbox/ymax':
tf.io.VarLenFeature(dtype=tf.float32),
'image/object/class/label':
tf.io.VarLenFeature(dtype=tf.int64),
}
parsed = tf.io.parse_single_example(record, keys_to_features)
label = tf.reshape(parsed['image/class/label'], shape=[1])
# Subtract one so that labels are in [0, 1000)
label -= 1
image_bytes = tf.reshape(parsed['image/encoded'], shape=[])
image, label = self.preprocess(image_bytes, label)
return image, label
def preprocess(self, image: tf.Tensor, label: tf.Tensor
) -> Tuple[tf.Tensor, tf.Tensor]:
"""Apply image preprocessing and augmentation to the image and label."""
if self.is_training:
image = preprocessing.preprocess_for_train(
image,
image_size=self.image_size,
mean_subtract=self.config.mean_subtract,
standardize=self.config.standardize,
dtype=self.dtype,
augmenter=self.augmenter)
else:
image = preprocessing.preprocess_for_eval(
image,
image_size=self.image_size,
num_channels=self.num_channels,
mean_subtract=self.config.mean_subtract,
standardize=self.config.standardize,
dtype=self.dtype)
label = tf.cast(label, tf.int32)
if self.config.one_hot:
label = tf.one_hot(label, self.num_classes)
label = tf.reshape(label, [self.num_classes])
return image, label
@classmethod
def from_params(cls, *args, **kwargs):
"""Construct a dataset builder from a default config and any overrides."""
config = DatasetConfig.from_args(*args, **kwargs)
return cls(config)
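# Illustrative sketch (assumption, not original code): `from_params` forwards
# its arguments to `DatasetConfig.from_args`, so the typical usage is keyword
# overrides of the default config.
#
#   builder = DatasetBuilder.from_params(name='cifar10', split='test',
#                                        batch_size=64)
#   ds = builder.build()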