Spaces:

sunwaee
/

Perceiver-Multiclass-Emotion-Classification

Sleeping

Perceiver-Multiclass-Emotion-Classification

File size: 3,587 Bytes

b438028

from typing import List

import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import PerceiverTokenizer


def _map_outputs(predictions):
    """
    Map model outputs to classes.

    :param predictions: model ouptut batch
    :return:
    """

    labels = [
        "admiration",
        "amusement",
        "anger",
        "annoyance",
        "approval",
        "caring",
        "confusion",
        "curiosity",
        "desire",
        "disappointment",
        "disapproval",
        "disgust",
        "embarrassment",
        "excitement",
        "fear",
        "gratitude",
        "grief",
        "joy",
        "love",
        "nervousness",
        "optimism",
        "pride",
        "realization",
        "relief",
        "remorse",
        "sadness",
        "surprise",
        "neutral"
    ]
    classes = []
    for i, example in enumerate(predictions):
        out_batch = []
        for j, category in enumerate(example):
            out_batch.append(labels[j]) if category > 0.5 else None
        classes.append(out_batch)
    return classes


class MultiLabelPipeline:
    """
    Multi label classification pipeline.

    Loads a serialized model with ``torch.load``, tokenizes the input text with the
    ``deepmind/language-perceiver`` tokenizer and maps the model outputs to label names.
    """

    def __init__(self, model_path):
        """
        Init MLC pipeline.

        :param model_path: model to use (path handed to ``torch.load``)
        """

        # Init attributes
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # NOTE(security): torch.load unpickles the file, which can execute arbitrary code;
        # only point this at checkpoints from a trusted source.
        # map_location puts the model on the selected device whichever device it was saved from,
        # so no separate CPU/GPU branch is needed.
        self.model = torch.load(model_path, map_location=self.device).eval().to(self.device)
        self.tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')

    def __call__(self, dataset, batch_size: int = 4):
        """
        Processing pipeline.

        :param dataset: dataset with a 'text' column
        :param batch_size: number of examples per forward pass
        :return: list holding, for each example, the list of predicted label names
        """

        # Tokenize inputs
        dataset = dataset.map(lambda row: self.tokenizer(row['text'], padding="max_length", truncation=True),
                              batched=True, remove_columns=['text'], desc='Tokenizing')
        dataset.set_format('torch', columns=['input_ids', 'attention_mask'])
        dataloader = DataLoader(dataset, batch_size=batch_size)

        # Define output classes
        classes = []
        mem_logs = []
        # CUDA memory statistics are only meaningful when the model actually runs on a GPU
        on_cuda = self.device.type == 'cuda'

        # Inference only: no_grad keeps autograd from recording the forward pass
        with torch.no_grad(), tqdm(dataloader, unit='batches') as progression:
            progression.set_description('Inference')
            for batch in progression:
                # Forward
                outputs = self.model(inputs=batch['input_ids'].to(self.device),
                                     attention_mask=batch['attention_mask'].to(self.device), )

                # Outputs
                predictions = outputs.logits.detach().cpu().numpy()

                # Map predictions to classes
                classes.extend(_map_outputs(predictions))

                # Retrieve memory usage (reserved CUDA memory, in units of 1e9 bytes)
                memory = round(torch.cuda.memory_reserved(self.device) / 1e9, 2) if on_cuda else 0.0
                mem_logs.append(memory)

                # Update pbar with the running average over the batches seen so far
                progression.set_postfix(memory=f"{round(sum(mem_logs) / len(mem_logs), 2)}Go")

        return classes


def inputs_to_dataset(inputs: List[str]):
    """
    Convert a list of strings to a dataset object.

    :param inputs: list of strings
    :return: dataset built by ``Dataset.from_dict`` with the strings under the 'text' key
    """

    # list(...) makes the same shallow copy as a pass-through comprehension, without shadowing
    # the builtin `input` or rebinding the `inputs` parameter to a different type.
    return Dataset.from_dict({'text': list(inputs)})