Spaces:

hoang1007
/

wav2vec2

Running

App Files Files Community

wav2vec2 / src /model /wav2vec2.py

hoang1007

init

5381499 over 1 year ago

raw

history blame contribute delete

No virus

10.3 kB

	"""
	A wrapper around Wav2Vec2 for the pre-training phase.
	"""
	from typing import Tuple, Optional
	import torch
	from pytorch_lightning import LightningModule
	import einops
	from torchmetrics import MeanMetric

	from .modules import (
	ContextEncoder,
	FeatureExtractor,
	QuantizationModule,
	Wav2Vec2Processor,
	)
	from src.utils import init_module_weights


	class Wav2Vec2PretrainingModule(LightningModule):
	    """Lightning module that computes the Wav2Vec2 pre-training loss.

	    The forward pass extracts latent frames from raw waveforms, masks spans
	    of frames for the context encoder, quantizes the latent frames and
	    returns the contrastive loss plus the weighted diversity loss.
	    """

	    def __init__(self, config):
	        """Build the sub-modules described by ``config``.

	        Args:
	            config: Configuration object. It is stored with
	                ``save_hyperparameters`` and must expose the
	                ``context_encoder``, ``feature_extractor`` and ``quantizer``
	                sub-configs, as well as the hyper-parameters read in
	                ``forward`` (``mask_prob``, ``mask_length``, ``min_masks``,
	                ``num_negatives``, ``contrastive_logits_temperature`` and
	                ``diversity_loss_weight``).
	        """
	        super().__init__()

	        self.save_hyperparameters(config)

	        self.processor = Wav2Vec2Processor()
	        self.context_encoder = ContextEncoder(config.context_encoder)
	        self.feature_extractor = FeatureExtractor(config.feature_extractor)
	        self.quantizer = QuantizationModule(config.quantizer)

	        # Running mean of the training loss, used for logging.
	        self.train_loss = MeanMetric()

	    def forward(self, waveforms: Tuple[torch.Tensor, ...]):
	        """Compute the pre-training loss for a batch of waveforms.

	        Args:
	            waveforms (Tuple[torch.Tensor]): The waveforms. Shape: (batch_size, wave_length).

	        Returns:
	            loss: The loss of the model. Contrastive loss + Diversity loss.
	        """
	        waveforms, wave_lengths = self.processor(waveforms)

	        # features.shape == (batch_size, num_frames, hidden_size)
	        features, num_frames = self.feature_extractor(waveforms, wave_lengths)

	        # `True` marks padded (invalid) frames.
	        attention_mask = self._compute_attention_mask(num_frames)
	        mask_time_indices = self._compute_mask_span(
	            shape=features.shape[:-1],
	            mask_prob=self.hparams.mask_prob,
	            mask_length=self.hparams.mask_length,
	            attention_mask=attention_mask,
	            device=features.device,
	            min_masks=self.hparams.min_masks,
	        )

	        context_features = self.context_encoder(
	            features,
	            attention_mask=attention_mask,
	            mask_time_indices=mask_time_indices,
	        )

	        quantized_features, perplexity = self.quantizer(features, attention_mask)

	        negative_quantized_features = self._sample_negatives(
	            quantized_features,
	            num_negatives=self.hparams.num_negatives,
	            attention_mask=attention_mask,
	        )

	        # (batch_size, num_frames, num_negatives + 1), then flattened to
	        # (batch_size * num_frames, num_negatives + 1) for cross entropy.
	        contrastive_logits = self._compute_contrastive_logits(
	            context_features,
	            quantized_features,
	            negative_quantized_features,
	            self.hparams.contrastive_logits_temperature,
	        ).flatten(0, -2)

	        # The positive candidate is always at index 0, so masked frames get
	        # target class 0. Unmasked frames get -100, the default
	        # `ignore_index` of cross_entropy, and do not contribute to the loss.
	        targets = (1 - mask_time_indices.long().flatten()) * -100

	        contrastive_loss = torch.nn.functional.cross_entropy(
	            contrastive_logits, targets, reduction="sum"
	        )

	        # The diversity loss shrinks as the codebook perplexity approaches
	        # the total number of codewords.
	        diversity_loss = 1 - perplexity / self.quantizer.total_codewords

	        loss = contrastive_loss + diversity_loss * self.hparams.diversity_loss_weight

	        return loss

	    @staticmethod
	    def _sample_negatives(
	        features: torch.Tensor,
	        num_negatives: int,
	        attention_mask: Optional[torch.Tensor] = None,
	    ):
	        """
	        Sample negative features from the quantized features to compute the contrastive loss.

	        For every frame, `num_negatives` other frames of the same sample are
	        drawn uniformly (with replacement) from the valid frames. The frame
	        itself is never drawn as its own negative.

	        Args:
	            features (torch.Tensor): The quantized features. Shape: (batch_size, num_frames, d_model).
	            num_negatives (int): The number of negative samples.
	            attention_mask (Optional[torch.Tensor]): The mask for valid frames. `True` is invalid.
	                Shape: (batch_size, num_frames). When `None`, all frames are treated as valid.

	        Returns:
	            sampled_negatives (torch.Tensor): The sampled negative features.
	                Shape: (batch_size, num_frames, num_negatives, d_model).

	        Note:
	            Every sample needs at least two valid frames; otherwise the
	            sampling range passed to `torch.randint` is empty and it raises.
	        """

	        batch_size, num_frames, d_model = features.shape

	        # (batch_size * num_frames, d_model). `reshape` also accepts
	        # non-contiguous input, which `view` would reject.
	        flat_features = features.reshape(-1, d_model)

	        # Index bookkeeping is integer-only and needs no gradient tracking.
	        with torch.no_grad():
	            if attention_mask is None:
	                valid_counts = [num_frames] * batch_size
	            else:
	                # One host sync for the whole batch instead of one per sample.
	                valid_counts = (~attention_mask.bool()).sum(dim=1).tolist()

	            # Draw from `num_valid - 1` slots; the slot of the positive frame
	            # is skipped by the shift below.
	            sampled_ids = torch.stack(
	                [
	                    torch.randint(
	                        0,
	                        num_valid - 1,
	                        (num_frames * num_negatives,),
	                        device=features.device,
	                    )
	                    for num_valid in valid_counts
	                ],
	                dim=0,
	            )  # (batch_size, num_frames * num_negatives)

	            # Frame index of the positive for each sampled slot:
	            # [0, 0, ..., 1, 1, ...] with `num_negatives` repeats per frame.
	            feature_ids = einops.repeat(
	                torch.arange(num_frames, device=features.device),
	                "f -> (f n)",
	                n=num_negatives,
	            )

	            # avoid sampling the same positive vector, but keep the distribution uniform
	            sampled_ids[sampled_ids >= feature_ids] += 1

	            # correct for batch size
	            # E.g [[0, 1, 2], [0, 1, 2]] -> [0, 1, 2, 3, 4, 5]
	            sampled_ids += torch.arange(
	                0, batch_size * num_frames, num_frames, device=features.device
	            ).unsqueeze(-1)

	        sampled_negatives = flat_features[sampled_ids.view(-1)]
	        sampled_negatives = einops.rearrange(
	            sampled_negatives,
	            "(b f n) d -> b f n d",
	            b=batch_size,
	            f=num_frames,
	            n=num_negatives,
	        )

	        return sampled_negatives

	    @staticmethod
	    def _compute_contrastive_logits(
	        predicted_features: torch.Tensor,
	        target_features: torch.Tensor,
	        negative_features: torch.Tensor,
	        temperature: float = 1,
	    ):
	        """
	        Compute the logits for contrastive loss.

	        The inputs are not modified.

	        Args:
	            predicted_features (torch.Tensor): The predicted features. Shape: (batch_size, num_frames, d_model).
	            target_features (torch.Tensor): The target features. Shape: (batch_size, num_frames, d_model).
	            negative_features (torch.Tensor): The negative features. Shape: (batch_size, num_frames, num_negatives, d_model).
	            temperature (float): The temperature for contrastive loss.

	        Returns:
	            logits (torch.Tensor): The logits for contrastive loss. Shape: (batch_size, num_frames, num_negatives + 1).
	                Index 0 of the last dimension belongs to the positive target.
	        """

	        # (batch_size, num_frames, num_negatives + 1, d_model)
	        # Out-of-place `unsqueeze` so the caller's tensors keep their shape.
	        candidates = torch.cat(
	            (target_features.unsqueeze(2), negative_features), dim=2
	        )

	        # (batch_size, num_frames, 1, d_model)
	        predicted = predicted_features.unsqueeze(2)

	        # (batch_size, num_frames, num_negatives + 1)
	        logits = torch.cosine_similarity(predicted, candidates, dim=-1)

	        return logits / temperature

	    @staticmethod
	    def _compute_mask_span(
	        shape: Tuple[int, int],
	        mask_prob: float = 0.065,
	        mask_length: int = 10,
	        attention_mask: Optional[torch.Tensor] = None,
	        device: torch.device = torch.device("cpu"),
	        min_masks: int = 0,
	    ):
	        """
	        Compute the mask span for contrastive task.

	        Args:
	            shape (Tuple[int, int]): The shape of the mask span. Shape: (batch_size, num_frames).
	            mask_prob (float): The probability of choosing a frame to be the start of masking position.
	            mask_length (int): The length of the mask span.
	            attention_mask (Optional[torch.Tensor]): The mask for valid frames. `True` is invalid. Shape: (batch_size, num_frames).
	            device (torch.device): The device of the mask span.
	            min_masks (int): The minimum number of masks.

	        Returns:
	            mask_span (torch.Tensor): The mask span, `True` at masked frames. Shape: (batch_size, num_frames).
	                Invalid (padded) frames are never masked. When no span fits
	                or none is drawn, the result is all `False`.
	        """

	        batch_size, num_frames = shape

	        # NOTE: num_frames / mask_length: the number of spans in one waveform.
	        # The random term rounds the expected span count up or down at random.
	        num_masked_spans = int(
	            mask_prob * num_frames / mask_length + torch.rand(1).item()
	        )
	        num_masked_spans = max(num_masked_spans, min_masks)

	        # make sure num masked indices <= num frames
	        if num_masked_spans * mask_length > num_frames:
	            num_masked_spans = num_frames // mask_length

	        mask_span = torch.zeros(
	            (batch_size, num_frames), device=device, dtype=torch.bool
	        )

	        # Nothing to mask: torch.multinomial cannot draw zero samples, and
	        # the start-position count below would be invalid for short inputs.
	        if num_masked_spans <= 0:
	            return mask_span

	        # uniform distribution to sample from
	        # NOTE: num_frames - (mask_length - 1): the number of start positions of the span
	        uniform_dist = torch.ones(
	            (batch_size, num_frames - (mask_length - 1)), device=device
	        )

	        # (batch_size, num_masked_spans), distinct start positions per sample
	        span_starts = torch.multinomial(uniform_dist, num_masked_spans)

	        # (batch_size, num_masked_spans * mask_length)
	        span_starts = einops.repeat(span_starts, "b n -> b (n l)", l=mask_length)

	        # [0, 1, ..., mask_length - 1] repeated once per span
	        offsets = einops.repeat(
	            torch.arange(mask_length, device=device),
	            "l -> b (n l)",
	            b=batch_size,
	            n=num_masked_spans,
	        )

	        mask_span.scatter_(1, span_starts + offsets, True)

	        if attention_mask is not None:
	            # Make sure the invalid frames are not masked
	            # (`attention_mask` is `True` at invalid frames).
	            mask_span = mask_span & ~attention_mask.bool()

	        return mask_span

	    @staticmethod
	    def _compute_attention_mask(length: torch.Tensor):
	        """
	        Build the padding mask from the number of valid frames per sample.

	        Args:
	            length (Tensor): The length of valid frames. Shape: (batch)

	        Returns:
	            attention_mask (BoolTensor): The mask for the valid frames. `True` is invalid.
	                Shape: (batch, num_frames), where num_frames is `length.max()`.
	        """
	        max_length = length.max().item()

	        # Frame positions at or beyond a sample's length are padding.
	        positions = torch.arange(max_length, device=length.device).expand(
	            length.size(0), max_length
	        )
	        mask = positions >= length[:, None]

	        return mask

	    def training_step(self, batch, batch_idx):
	        """Run one training step and return the loss for optimisation.

	        The running loss metric is updated on every step and logged every
	        100 batches.
	        """
	        loss = self(batch)

	        self.train_loss(loss)

	        if batch_idx % 100 == 0:
	            self.log("train/loss", self.train_loss, on_step=True, on_epoch=True)

	        return loss

	    def configure_optimizers(self):
	        """Return an AdamW optimizer over all parameters with lr=1e-4."""
	        return torch.optim.AdamW(self.parameters(), lr=1e-4)