#!/usr/bin/env python3
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
# This code is ported from the following implementation written in Chainer.
# https://github.com/chainer/chainer/blob/master/examples/ptb/train_ptb_custom_loop.py
"""LM training in pytorch."""
import copy
import json
import logging

import numpy as np
import torch
import torch.nn as nn
from torch.nn.parallel import data_parallel

from chainer import Chain
from chainer.dataset import convert
from chainer import reporter
from chainer import training
from chainer.training import extensions

from espnet.lm.lm_utils import count_tokens
from espnet.lm.lm_utils import load_dataset
from espnet.lm.lm_utils import MakeSymlinkToBestModel
from espnet.lm.lm_utils import ParallelSentenceIterator
from espnet.lm.lm_utils import read_tokens
from espnet.nets.lm_interface import dynamic_import_lm
from espnet.nets.lm_interface import LMInterface
from espnet.optimizer.factory import dynamic_import_optimizer
from espnet.scheduler.pytorch import PyTorchScheduler
from espnet.scheduler.scheduler import dynamic_import_scheduler

from espnet.asr.asr_utils import snapshot_object
from espnet.asr.asr_utils import torch_load
from espnet.asr.asr_utils import torch_resume
from espnet.asr.asr_utils import torch_snapshot
from espnet.utils.training.tensorboard_logger import TensorboardLogger
from tensorboardX import SummaryWriter

from espnet.utils.deterministic_utils import set_deterministic_pytorch
from espnet.utils.training.evaluator import BaseEvaluator
from espnet.utils.training.iterators import ShufflingEnabler
from espnet.utils.training.train_utils import check_early_stop
from espnet.utils.training.train_utils import set_early_stop


def compute_perplexity(result):
"""Compute and add the perplexity to the LogReport.
:param dict result: The current observations
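
    Example (illustrative values, not taken from a real run)::

        result = {"main/nll": 100.0, "main/count": 20}
        compute_perplexity(result)
        # result["perplexity"] is exp(100.0 / 20) ~= 148.4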
"""
# Routine to rewrite the result dictionary of LogReport to add perplexity values
result["perplexity"] = np.exp(result["main/nll"] / result["main/count"])
if "validation/main/nll" in result:
result["val_perplexity"] = np.exp(
result["validation/main/nll"] / result["validation/main/count"]
        )


class Reporter(Chain):
"""Dummy module to use chainer's trainer."""
def report(self, loss):
"""Report nothing."""
        pass


def concat_examples(batch, device=None, padding=None):
"""Concat examples in minibatch.
:param np.ndarray batch: The batch to concatenate
:param int device: The device to send to
:param Tuple[int,int] padding: The padding to use
:return: (inputs, targets)
:rtype (torch.Tensor, torch.Tensor)
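
    Example (illustrative; actual shapes depend on the batch)::

        batch = [(np.array([1, 2, 3]), np.array([2, 3, 4])),
                 (np.array([5, 6]), np.array([6, 7]))]
        x, t = concat_examples(batch, padding=(0, -100))
        # x -> tensor([[1, 2, 3], [5, 6, 0]])
        # t -> tensor([[2, 3, 4], [6, 7, -100]])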
"""
x, t = convert.concat_examples(batch, padding=padding)
x = torch.from_numpy(x)
t = torch.from_numpy(t)
if device is not None and device >= 0:
x = x.cuda(device)
t = t.cuda(device)
    return x, t


class BPTTUpdater(training.StandardUpdater):
"""An updater for a pytorch LM."""
def __init__(
self,
train_iter,
model,
optimizer,
schedulers,
device,
gradclip=None,
use_apex=False,
accum_grad=1,
):
"""Initialize class.
Args:
train_iter (chainer.dataset.Iterator): The train iterator
model (LMInterface) : The model to update
optimizer (torch.optim.Optimizer): The optimizer for training
schedulers (espnet.scheduler.scheduler.SchedulerInterface):
The schedulers of `optimizer`
device (int): The device id
gradclip (float): The gradient clipping value to use
use_apex (bool): The flag to use Apex in backprop.
            accum_grad (int): The number of gradient accumulation steps.
"""
super(BPTTUpdater, self).__init__(train_iter, optimizer)
self.model = model
self.device = device
self.gradclip = gradclip
self.use_apex = use_apex
self.scheduler = PyTorchScheduler(schedulers, optimizer)
        self.accum_grad = accum_grad

    # The core part of the update routine can be customized by overriding.
def update_core(self):
"""Update the model."""
# When we pass one iterator and optimizer to StandardUpdater.__init__,
# they are automatically named 'main'.
train_iter = self.get_iterator("main")
optimizer = self.get_optimizer("main")
# Progress the dataset iterator for sentences at each iteration.
self.model.zero_grad() # Clear the parameter gradients
accum = {"loss": 0.0, "nll": 0.0, "count": 0}
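        # Accumulate gradients over `accum_grad` minibatches before taking a
        # single optimizer step, so the effective batch size per update is
        # batch_size * accum_grad.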
for _ in range(self.accum_grad):
batch = train_iter.__next__()
# Concatenate the token IDs to matrices and send them to the device
# self.converter does this job
# (it is chainer.dataset.concat_examples by default)
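            # Inputs are padded with 0 and targets with -100; -100 matches the
            # default ignore_index of PyTorch's cross entropy, so padded target
            # positions are expected to be excluded from the loss and counts.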
x, t = concat_examples(batch, device=self.device[0], padding=(0, -100))
if self.device[0] == -1:
loss, nll, count = self.model(x, t)
else:
# apex does not support torch.nn.DataParallel
loss, nll, count = data_parallel(self.model, (x, t), self.device)
# backward
loss = loss.mean() / self.accum_grad
if self.use_apex:
from apex import amp
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward() # Backprop
# accumulate stats
accum["loss"] += float(loss)
accum["nll"] += float(nll.sum())
accum["count"] += int(count.sum())
for k, v in accum.items():
reporter.report({k: v}, optimizer.target)
if self.gradclip is not None:
nn.utils.clip_grad_norm_(self.model.parameters(), self.gradclip)
optimizer.step() # Update the parameters
        self.scheduler.step(n_iter=self.iteration)


class LMEvaluator(BaseEvaluator):
"""A custom evaluator for a pytorch LM."""
def __init__(self, val_iter, eval_model, reporter, device):
"""Initialize class.
:param chainer.dataset.Iterator val_iter : The validation iterator
:param LMInterface eval_model : The model to evaluate
:param chainer.Reporter reporter : The observations reporter
:param int device : The device id to use
"""
super(LMEvaluator, self).__init__(val_iter, reporter, device=-1)
self.model = eval_model
        self.device = device

    def evaluate(self):
"""Evaluate the model."""
val_iter = self.get_iterator("main")
loss = 0
nll = 0
count = 0
self.model.eval()
with torch.no_grad():
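            # Iterate over a shallow copy so this evaluation pass does not
            # advance the validation iterator's internal position (the same
            # pattern chainer's Evaluator uses).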
for batch in copy.copy(val_iter):
x, t = concat_examples(batch, device=self.device[0], padding=(0, -100))
if self.device[0] == -1:
l, n, c = self.model(x, t)
else:
# apex does not support torch.nn.DataParallel
l, n, c = data_parallel(self.model, (x, t), self.device)
loss += float(l.sum())
nll += float(n.sum())
count += int(c.sum())
self.model.train()
# report validation loss
observation = {}
with reporter.report_scope(observation):
reporter.report({"loss": loss}, self.model.reporter)
reporter.report({"nll": nll}, self.model.reporter)
reporter.report({"count": count}, self.model.reporter)
        return observation


def train(args):
"""Train with the given args.
:param Namespace args: The program arguments
:param type model_class: LMInterface class for training
"""
model_class = dynamic_import_lm(args.model_module, args.backend)
assert issubclass(model_class, LMInterface), "model should implement LMInterface"
# display torch version
logging.info("torch version = " + torch.__version__)
set_deterministic_pytorch(args)
# check cuda and cudnn availability
if not torch.cuda.is_available():
logging.warning("cuda is not available")
# get special label ids
unk = args.char_list_dict["<unk>"]
eos = args.char_list_dict["<eos>"]
# read tokens as a sequence of sentences
val, n_val_tokens, n_val_oovs = load_dataset(
args.valid_label, args.char_list_dict, args.dump_hdf5_path
)
train, n_train_tokens, n_train_oovs = load_dataset(
args.train_label, args.char_list_dict, args.dump_hdf5_path
)
logging.info("#vocab = " + str(args.n_vocab))
logging.info("#sentences in the training data = " + str(len(train)))
logging.info("#tokens in the training data = " + str(n_train_tokens))
logging.info(
"oov rate in the training data = %.2f %%"
% (n_train_oovs / n_train_tokens * 100)
)
logging.info("#sentences in the validation data = " + str(len(val)))
logging.info("#tokens in the validation data = " + str(n_val_tokens))
logging.info(
"oov rate in the validation data = %.2f %%" % (n_val_oovs / n_val_tokens * 100)
)
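    # SortaGrad: serve length-sorted minibatches first (shuffling disabled) and
    # let the ShufflingEnabler extension below switch shuffling on after the
    # first `sortagrad` epochs; -1 keeps sorted batches for effectively the
    # whole run.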
use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
# Create the dataset iterators
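    # batch_size is the per-iteration minibatch size across all GPUs; with
    # gradient accumulation the effective batch size per update becomes
    # batch_size * accum_grad (see the log message below).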
batch_size = args.batchsize * max(args.ngpu, 1)
if batch_size * args.accum_grad > args.batchsize:
logging.info(
f"batch size is automatically increased "
f"({args.batchsize} -> {batch_size * args.accum_grad})"
)
train_iter = ParallelSentenceIterator(
train,
batch_size,
max_length=args.maxlen,
sos=eos,
eos=eos,
shuffle=not use_sortagrad,
)
val_iter = ParallelSentenceIterator(
val, batch_size, max_length=args.maxlen, sos=eos, eos=eos, repeat=False
)
epoch_iters = int(len(train_iter.batch_indices) / args.accum_grad)
logging.info("#iterations per epoch = %d" % epoch_iters)
logging.info("#total iterations = " + str(args.epoch * epoch_iters))
# Prepare an RNNLM model
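    # --train-dtype is either a plain tensor dtype ("float16/32/64") or an apex
    # opt level ("O0"-"O3"); in the latter case the model is built in float32
    # here and apex.amp sets up mixed precision further below.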
if args.train_dtype in ("float16", "float32", "float64"):
dtype = getattr(torch, args.train_dtype)
else:
dtype = torch.float32
model = model_class(args.n_vocab, args).to(dtype=dtype)
if args.ngpu > 0:
model.to("cuda")
gpu_id = list(range(args.ngpu))
else:
gpu_id = [-1]
# Save model conf to json
model_conf = args.outdir + "/model.json"
with open(model_conf, "wb") as f:
logging.info("writing a model config file to " + model_conf)
f.write(
json.dumps(vars(args), indent=4, ensure_ascii=False, sort_keys=True).encode(
"utf_8"
)
)
logging.warning(
"num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format(
sum(p.numel() for p in model.parameters()),
sum(p.numel() for p in model.parameters() if p.requires_grad),
sum(p.numel() for p in model.parameters() if p.requires_grad)
* 100.0
/ sum(p.numel() for p in model.parameters()),
)
)
# Set up an optimizer
opt_class = dynamic_import_optimizer(args.opt, args.backend)
optimizer = opt_class.from_args(model.parameters(), args)
if args.schedulers is None:
schedulers = []
else:
schedulers = [dynamic_import_scheduler(v)(k, args) for k, v in args.schedulers]
# setup apex.amp
if args.train_dtype in ("O0", "O1", "O2", "O3"):
try:
from apex import amp
except ImportError as e:
logging.error(
f"You need to install apex for --train-dtype {args.train_dtype}. "
"See https://github.com/NVIDIA/apex#linux"
)
raise e
model, optimizer = amp.initialize(model, optimizer, opt_level=args.train_dtype)
use_apex = True
else:
use_apex = False
# FIXME: TOO DIRTY HACK
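    # chainer's trainer and extensions expect the optimizer to expose a `target`
    # and a `serialize` method; attaching the dummy Reporter lets the torch
    # optimizer pass through chainer's training loop, reporting, and snapshots.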
reporter = Reporter()
setattr(model, "reporter", reporter)
setattr(optimizer, "target", reporter)
setattr(optimizer, "serialize", lambda s: reporter.serialize(s))
updater = BPTTUpdater(
train_iter,
model,
optimizer,
schedulers,
gpu_id,
gradclip=args.gradclip,
use_apex=use_apex,
accum_grad=args.accum_grad,
)
trainer = training.Trainer(updater, (args.epoch, "epoch"), out=args.outdir)
trainer.extend(LMEvaluator(val_iter, model, reporter, device=gpu_id))
trainer.extend(
extensions.LogReport(
postprocess=compute_perplexity,
trigger=(args.report_interval_iters, "iteration"),
)
)
trainer.extend(
extensions.PrintReport(
[
"epoch",
"iteration",
"main/loss",
"perplexity",
"val_perplexity",
"elapsed_time",
]
),
trigger=(args.report_interval_iters, "iteration"),
)
trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))
# Save best models
trainer.extend(torch_snapshot(filename="snapshot.ep.{.updater.epoch}"))
trainer.extend(snapshot_object(model, "rnnlm.model.{.updater.epoch}"))
# T.Hori: MinValueTrigger should be used, but it fails when resuming
trainer.extend(MakeSymlinkToBestModel("validation/main/loss", "rnnlm.model"))
if use_sortagrad:
trainer.extend(
ShufflingEnabler([train_iter]),
trigger=(args.sortagrad if args.sortagrad != -1 else args.epoch, "epoch"),
)
if args.resume:
logging.info("resumed from %s" % args.resume)
torch_resume(args.resume, trainer)
set_early_stop(trainer, args, is_lm=True)
if args.tensorboard_dir is not None and args.tensorboard_dir != "":
writer = SummaryWriter(args.tensorboard_dir)
trainer.extend(
TensorboardLogger(writer), trigger=(args.report_interval_iters, "iteration")
)
trainer.run()
check_early_stop(trainer, args.epoch)
# compute perplexity for test set
if args.test_label:
logging.info("test the best model")
torch_load(args.outdir + "/rnnlm.model.best", model)
test = read_tokens(args.test_label, args.char_list_dict)
n_test_tokens, n_test_oovs = count_tokens(test, unk)
logging.info("#sentences in the test data = " + str(len(test)))
logging.info("#tokens in the test data = " + str(n_test_tokens))
logging.info(
"oov rate in the test data = %.2f %%" % (n_test_oovs / n_test_tokens * 100)
)
test_iter = ParallelSentenceIterator(
test, batch_size, max_length=args.maxlen, sos=eos, eos=eos, repeat=False
)
evaluator = LMEvaluator(test_iter, model, reporter, device=gpu_id)
result = evaluator()
compute_perplexity(result)
logging.info(f"test perplexity: {result['perplexity']}")