# -*- coding: utf-8 -*-
r"""
Translation Ranking Base Model
==============================
    Abstract base class used to build new ranking systems inside Polos.
    This task consists of ranking "good" translations above "worse" ones.
"""
from argparse import Namespace
from typing import List

import pandas as pd
import torch
import torch.nn as nn

from polos.models.model_base import ModelBase
from polos.models.utils import average_pooling, max_pooling
from polos.modules.scalar_mix import ScalarMixWithDropout
from polos.metrics import WMTKendall


class RankingBase(ModelBase):
    """
    Ranking Model base class used to fine-tune pretrained models such as XLM-R
    to produce better sentence embeddings by optimizing Triplet Margin Loss.

    :param hparams: Namespace containing the hyperparameters.
    """

    def __init__(
        self,
        hparams: Namespace,
    ) -> None:
        super().__init__(hparams)

    def read_csv(self, path: str) -> List[dict]:
        """Reads a comma separated value file.

        Only the ``src``, ``ref``, ``pos`` and ``neg`` columns are kept and
        every value in them is cast to ``str``.

        :param path: path to a csv file.

        :return: List of records as dictionaries
        """
        columns = ["src", "ref", "pos", "neg"]
        # Select and cast in one step: assigning column by column onto a
        # column-subset frame is what triggers pandas' SettingWithCopyWarning.
        df = pd.read_csv(path)[columns].astype(str)
        return df.to_dict("records")

    def _build_loss(self):
        """ Initializes the loss function/s (triplet margin, margin=1.0, p=2). """
        self.loss = nn.TripletMarginLoss(margin=1.0, p=2)

    def _build_model(self) -> ModelBase:
        """
        Initializes the ranking model architecture.

        Sets ``self.metrics`` and, for every encoder other than LASER,
        ``self.layer`` (an ``int`` or the string ``"mix"``) and
        ``self.scalar_mix`` (``None`` unless ``layer == "mix"`` and a
        non-default pooling is requested).
        """
        # NOTE(review): annotated as returning ModelBase, but nothing is
        # returned here; annotation left untouched to keep the interface as is.
        super()._build_model()
        self.metrics = WMTKendall()

        if self.hparams.encoder_model != "LASER":
            layer = self.hparams.layer
            self.layer = layer if layer == "mix" else int(layer)

            # The scalar mix is only useful when word embeddings get pooled;
            # the "default" pooling takes the encoder's own sentence embedding.
            use_scalar_mix = self.layer == "mix" and self.hparams.pool != "default"
            self.scalar_mix = (
                ScalarMixWithDropout(
                    mixture_size=self.encoder.num_layers,
                    dropout=self.hparams.scalar_mix_dropout,
                    do_layer_norm=True,
                )
                if use_scalar_mix
                else None
            )

    def get_sentence_embedding(
        self, tokens: torch.Tensor, lengths: torch.Tensor
    ) -> torch.Tensor:
        """Auxiliary function that extracts sentence embeddings for
            a single sentence.

        :param tokens: sequences [batch_size x seq_len]
        :param lengths: lengths [batch_size]

        :return: torch.Tensor [batch_size x hidden_size]

        :raises Exception: if the configured layer or pooling is invalid.
        """
        # When using just one GPU this should not change behavior
        # but when splitting batches across GPU the tokens have padding
        # from the entire original batch
        if self.trainer and self.trainer.use_dp and self.trainer.num_gpus > 1:
            tokens = tokens[:, : lengths.max()]

        encoder_out = self.encoder(tokens, lengths)

        is_laser = self.hparams.encoder_model == "LASER"
        pool = self.hparams.pool

        # Select the word embeddings that the pooling step will consume.
        if is_laser:
            # for LASER we dont care about the word embeddings
            # (self.layer / self.scalar_mix are not even defined in that case)
            embeddings = None
        elif self.scalar_mix is not None:
            embeddings = self.scalar_mix(encoder_out["all_layers"], encoder_out["mask"])
        elif self.layer == "mix" and pool == "default":
            # layer="mix" with default pooling: no scalar mix was built and no
            # word embeddings are needed. Previously this fell through to the
            # integer comparison below and crashed with a TypeError.
            embeddings = None
        elif self.layer != "mix" and 0 <= self.layer < self.encoder.num_layers:
            embeddings = encoder_out["all_layers"][self.layer]
        else:
            raise Exception("Invalid model layer {}.".format(self.layer))

        # Reduce to one vector per sentence.
        if pool == "default" or is_laser:
            sentemb = encoder_out["sentemb"]
        elif pool == "max":
            sentemb = max_pooling(
                tokens, embeddings, self.encoder.tokenizer.padding_index
            )
        elif pool == "avg":
            sentemb = average_pooling(
                tokens,
                embeddings,
                encoder_out["mask"],
                self.encoder.tokenizer.padding_index,
            )
        elif pool == "cls":
            # first token of every sequence
            sentemb = embeddings[:, 0, :]
        else:
            raise Exception("Invalid pooling technique.")

        return sentemb