Spaces:

yuwd
/

Polos-Demo

Sleeping

File size: 4,438 Bytes

03f6091

# -*- coding: utf-8 -*-
r"""
Quality Estimator Model
=======================
    Quality Estimator Model estimates a quality score for the hypothesis (e.g. the MT text)
    by looking only at source and MT.
"""
from argparse import Namespace
from typing import Dict, List, Tuple, Union

import pandas as pd
import torch

from polos.models.estimators import PolosEstimator, Estimator
from polos.modules.feedforward import FeedForward
from polos.modules.scalar_mix import ScalarMixWithDropout
from torchnlp.utils import collate_tensors


class QualityEstimator(PolosEstimator):
    """
    Estimator class that uses a pretrained encoder to extract features from
    the source and MT sequences and then passes those features to a feed
    forward estimator that produces the quality score.

    :param hparams: Namespace containing the hyperparameters.
    """

    def __init__(
        self,
        hparams: Namespace,
    ) -> None:
        super().__init__(hparams)

    def _build_model(self) -> None:
        """
        Initializes the estimator architecture.

        Calls the parent implementation first, then configures the layer
        selection / scalar mix (skipped when the encoder model is "LASER")
        and finally builds the feed-forward head stored in ``self.ff``.

        Hyperparameters read here: ``encoder_model``, ``layer``, ``pool``,
        ``scalar_mix_dropout``, ``hidden_sizes``, ``activations`` and
        ``dropout``.
        """
        super()._build_model()
        if self.hparams.encoder_model != "LASER":
            # ``layer`` is either the literal string "mix" or something that
            # can be converted to an integer layer index.
            self.layer = (
                int(self.hparams.layer)
                if self.hparams.layer != "mix"
                else self.hparams.layer
            )

            # A scalar mix over all encoder layers is only created when the
            # "mix" strategy is requested and the pooling is not "default";
            # otherwise the attribute is explicitly set to None.
            self.scalar_mix = (
                ScalarMixWithDropout(
                    mixture_size=self.encoder.num_layers,
                    dropout=self.hparams.scalar_mix_dropout,
                    do_layer_norm=True,
                )
                if self.layer == "mix" and self.hparams.pool != "default"
                else None
            )
        # NOTE: for "LASER" this method sets neither ``self.layer`` nor
        # ``self.scalar_mix``.

        # The input is 4x the encoder output size because ``forward``
        # concatenates four vectors: MT embedding, source embedding, their
        # element-wise product and their absolute difference.
        self.ff = FeedForward(
            in_dim=self.encoder.output_units * 4,
            hidden_sizes=self.hparams.hidden_sizes,
            activations=self.hparams.activations,
            dropout=self.hparams.dropout,
        )

    def read_csv(self, path: str) -> List[dict]:
        """Reads a comma separated value file.

        Only the ``mt``, ``src`` and ``score`` columns are kept; ``mt`` and
        ``src`` are cast to ``str`` and ``score`` to ``float``.

        :param path: path to a csv file.

        :return: List of records as dictionaries
        :raises KeyError: if one of the required columns is missing.
        """
        df = pd.read_csv(path)
        # Select the required columns and cast them in a single step on a
        # fresh frame instead of assigning columns back one by one.
        df = df[["mt", "src", "score"]].astype(
            {"mt": str, "src": str, "score": float}
        )
        return df.to_dict("records")

    def prepare_sample(
        self, sample: List[Dict[str, Union[str, float]]], inference: bool = False
    ) -> Union[
        Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], Dict[str, torch.Tensor]
    ]:
        """
        Function that prepares a sample to input the model.

        :param sample: list of dictionaries.
        :param inference: If set to true prepares only the model inputs.

        :returns: Tuple with 2 dictionaries (model inputs and targets).
            If `inference=True` returns only the model inputs.
        """
        # Collate the list of records into one mapping that is indexed below
        # by "mt", "src" and (when training) "score".
        sample = collate_tensors(sample)
        mt_inputs = self.encoder.prepare_sample(sample["mt"])
        src_inputs = self.encoder.prepare_sample(sample["src"])

        # Prefix the encoder output keys so MT and source entries can share
        # one dictionary. Presumably the encoder returns "tokens"/"lengths",
        # yielding the ``mt_tokens``/``src_lengths``/... arguments expected
        # by ``forward`` -- TODO confirm against the encoder implementation.
        mt_inputs = {"mt_" + k: v for k, v in mt_inputs.items()}
        src_inputs = {"src_" + k: v for k, v in src_inputs.items()}

        inputs = {**mt_inputs, **src_inputs}

        if inference:
            # "score" is not accessed on this path.
            return inputs

        targets = {"score": torch.tensor(sample["score"], dtype=torch.float)}
        return inputs, targets

    def forward(
        self,
        mt_tokens: torch.Tensor,
        src_tokens: torch.Tensor,
        mt_lengths: torch.Tensor,
        src_lengths: torch.Tensor,
        **kwargs
    ) -> Dict[str, torch.Tensor]:
        """
        Function that encodes both Source, MT and returns a quality score.

        :param mt_tokens: MT sequences [batch_size x mt_seq_len]
        :param src_tokens: SRC sequences [batch_size x src_seq_len]
        :param mt_lengths: MT lengths [batch_size]
        :param src_lengths: SRC lengths [batch_size]
        :param kwargs: additional keyword inputs; accepted but not used here.

        :return: Dictionary with model outputs to be passed to the loss function.
        """
        # ``get_sentence_embedding`` is not defined in this class; it is
        # presumably inherited from the parent estimator.
        mt_sentemb = self.get_sentence_embedding(mt_tokens, mt_lengths)
        src_sentemb = self.get_sentence_embedding(src_tokens, src_lengths)

        # Interaction features between the two sentence embeddings.
        diff_src = torch.abs(mt_sentemb - src_sentemb)
        prod_src = mt_sentemb * src_sentemb

        # Concatenate along dim 1; the four parts match the ``* 4`` input
        # size of the feed-forward head built in ``_build_model``.
        embedded_sequences = torch.cat(
            (mt_sentemb, src_sentemb, prod_src, diff_src), dim=1
        )
        return {"score": self.ff(embedded_sequences)}