Spaces:

shi-labs
/

OneFormer

Sleeping

App Files Files Community

OneFormer / oneformer /evaluation /evaluator.py

praeclarumjj3

Create Space

6e445f1 about 2 years ago

raw

history blame

8.43 kB

	# ------------------------------------------------------------------------------
	# Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/evaluator.py
	# Modified by Jitesh Jain (https://github.com/praeclarumjj3)
	# ------------------------------------------------------------------------------

	import datetime
	import logging
	import time
	from collections import OrderedDict, abc
	from contextlib import ExitStack, contextmanager
	from typing import List, Union
	import torch
	from torch import nn

	from detectron2.utils.comm import get_world_size, is_main_process
	from detectron2.utils.logger import log_every_n_seconds


	class DatasetEvaluator:
	    """
	    Interface implemented by every dataset evaluator.

	    :func:`inference_on_dataset` runs a model over all samples of a dataset and
	    passes each batch of inputs/outputs to a DatasetEvaluator.

	    An evaluator gathers what it needs from those pairs in :meth:`process`
	    and turns the gathered information into final results in :meth:`evaluate`.
	    The hooks of this base class do nothing; subclasses override them.
	    """

	    def reset(self):
	        """
	        Get ready for a fresh round of evaluation.
	        Should be invoked before each round of evaluation starts.
	        """

	    def process(self, inputs, outputs):
	        """
	        Consume one pair of model inputs and outputs.
	        When both are batches, iterate over them pairwise with `zip`:

	        .. code-block:: python

	            for input_, output in zip(inputs, outputs):
	                # do evaluation on single input/output pair
	                ...

	        Args:
	            inputs (list): the inputs the model was called with.
	            outputs (list): what `model(inputs)` returned.
	        """

	    def evaluate(self):
	        """
	        Summarize the performance once every input/output pair was processed.

	        Returns:
	            dict:
	                A custom evaluator is free to return a dict of any layout,
	                provided that the user knows how to consume it.
	                Our train_net.py expects this layout:

	                * key: the name of the task (e.g., bbox)
	                * value: a dict of {metric name: score}, e.g.: {"AP50": 80}

	            The base implementation returns None.
	        """
	        return None


	class DatasetEvaluators(DatasetEvaluator):
	    """
	    Composite evaluator bundling several :class:`DatasetEvaluator` objects.

	    Each call made on this wrapper is forwarded, in order, to every
	    :class:`DatasetEvaluator` it holds.
	    """

	    def __init__(self, evaluators):
	        """
	        Args:
	            evaluators (list): the evaluators to bundle together.
	        """
	        super().__init__()
	        self._evaluators = evaluators

	    def reset(self):
	        """Forward ``reset`` to every wrapped evaluator."""
	        for member in self._evaluators:
	            member.reset()

	    def process(self, inputs, outputs):
	        """Forward the same inputs/outputs pair to every wrapped evaluator."""
	        for member in self._evaluators:
	            member.process(inputs, outputs)

	    def evaluate(self):
	        """
	        Call ``evaluate`` on every wrapped evaluator and merge the results.

	        Results are merged only when ``is_main_process()`` is true and the
	        evaluator returned something other than None; otherwise they are skipped.

	        Returns:
	            OrderedDict: the union of all merged result dicts. Two evaluators
	            producing the same key trigger an assertion failure.
	        """
	        merged = OrderedDict()
	        for member in self._evaluators:
	            partial = member.evaluate()
	            if not is_main_process() or partial is None:
	                continue
	            for key, value in partial.items():
	                assert (
	                    key not in merged
	                ), "Different evaluators produce results with the same key {}".format(key)
	                merged[key] = value
	        return merged


	def inference_on_dataset(
	    model, data_loader, evaluator: Union[DatasetEvaluator, List[DatasetEvaluator], None]
	):
	    """
	    Feed every batch of `data_loader` to `model` and score the outputs with `evaluator`.
	    The speed of `model.__call__` is benchmarked along the way.
	    Inference runs with the model in eval mode.

	    Args:
	        model (callable): called with each object produced by `data_loader`;
	            whatever it returns is handed to the evaluator.

	            An nn.Module is switched to `eval` mode for the duration of the call.
	            To evaluate in `training` mode instead, wrap the model and override
	            what its `.eval()` and `.train()` do.
	        data_loader: an iterable that supports `len()`.
	            Its elements are used as the model inputs.
	        evaluator: a single evaluator, a list of evaluators, or `None` to only
	            benchmark without evaluating anything.

	    Returns:
	        Whatever `evaluator.evaluate()` returns, or an empty dict if that is None.
	    """
	    world_size = get_world_size()
	    logger = logging.getLogger(__name__)
	    total = len(data_loader)  # inference data loader must have a fixed length
	    logger.info("Start inference on {} batches".format(total))

	    if evaluator is None:
	        # nothing to evaluate: fall back to a wrapper holding no evaluators
	        evaluator = DatasetEvaluators([])
	    elif isinstance(evaluator, abc.MutableSequence):
	        evaluator = DatasetEvaluators(evaluator)
	    evaluator.reset()

	    # The first `num_warmup` iterations are excluded from the timing statistics.
	    num_warmup = min(5, total - 1)
	    start_time = time.perf_counter()
	    total_data_time = total_compute_time = total_eval_time = 0
	    with ExitStack() as contexts:
	        if isinstance(model, nn.Module):
	            contexts.enter_context(inference_context(model))
	        contexts.enter_context(torch.no_grad())

	        data_tick = time.perf_counter()
	        for idx, inputs in enumerate(data_loader):
	            total_data_time += time.perf_counter() - data_tick
	            if idx == num_warmup:
	                # Warm-up is over: restart the clock and discard what was accumulated.
	                start_time = time.perf_counter()
	                total_data_time = total_compute_time = total_eval_time = 0

	            compute_tick = time.perf_counter()
	            outputs = model(inputs)
	            if torch.cuda.is_available():
	                # Wait for queued GPU work before stopping the compute timer.
	                torch.cuda.synchronize()
	            total_compute_time += time.perf_counter() - compute_tick

	            eval_tick = time.perf_counter()
	            evaluator.process(inputs, outputs)
	            total_eval_time += time.perf_counter() - eval_tick

	            # Number of iterations that contributed to the current counters.
	            iters_after_start = idx + 1 - num_warmup if idx >= num_warmup else idx + 1
	            data_seconds_per_iter = total_data_time / iters_after_start
	            compute_seconds_per_iter = total_compute_time / iters_after_start
	            eval_seconds_per_iter = total_eval_time / iters_after_start
	            total_seconds_per_iter = (time.perf_counter() - start_time) / iters_after_start
	            if idx >= num_warmup * 2 or compute_seconds_per_iter > 5:
	                remaining = total - idx - 1
	                eta = datetime.timedelta(seconds=int(total_seconds_per_iter * remaining))
	                progress = (
	                    f"Inference done {idx + 1}/{total}. "
	                    f"Dataloading: {data_seconds_per_iter:.4f} s/iter. "
	                    f"Inference: {compute_seconds_per_iter:.4f} s/iter. "
	                    f"Eval: {eval_seconds_per_iter:.4f} s/iter. "
	                    f"Total: {total_seconds_per_iter:.4f} s/iter. "
	                    f"ETA={eta}"
	                )
	                log_every_n_seconds(logging.INFO, progress, n=5)
	            data_tick = time.perf_counter()

	    # Measure the time only for this worker (before the synchronization barrier)
	    elapsed = time.perf_counter() - start_time
	    elapsed_str = str(datetime.timedelta(seconds=elapsed))
	    timed_iters = total - num_warmup
	    # NOTE this format is parsed by grep
	    logger.info(
	        "Total inference time: {} ({:.6f} s / iter per device, on {} devices)".format(
	            elapsed_str, elapsed / timed_iters, world_size
	        )
	    )
	    compute_str = str(datetime.timedelta(seconds=int(total_compute_time)))
	    logger.info(
	        "Total inference pure compute time: {} ({:.6f} s / iter per device, on {} devices)".format(
	            compute_str, total_compute_time / timed_iters, world_size
	        )
	    )

	    results = evaluator.evaluate()
	    # An evaluator may return None when not in main process.
	    # Hand back an empty dict in that case so downstream code has less to handle.
	    return {} if results is None else results


	@contextmanager
	def inference_context(model):
	    """
	    A context where the model is temporarily changed to eval mode,
	    and restored to its previous mode afterwards.

	    The previous mode is restored even when the body of the ``with`` block
	    raises, so a failed inference run does not leave the model stuck in
	    eval mode.

	    Args:
	        model: a torch Module (anything exposing ``training``, ``eval()``
	            and ``train(mode)``)
	    """
	    training_mode = model.training
	    model.eval()
	    try:
	        yield
	    finally:
	        # Runs on normal exit and when an exception propagates out of the body.
	        model.train(training_mode)