In [1]:
import pandas as pd

# Relative paths of the MultiNLI parquet shards inside the Hub dataset repo, keyed by split name.
splits = dict(
    train='data/train-00000-of-00001.parquet',
    validation_matched='data/validation_matched-00000-of-00001.parquet',
    validation_mismatched='data/validation_mismatched-00000-of-00001.parquet',
)

# Only the training shard is read here; the two validation shards are listed but not loaded.
df = pd.read_parquet(f"hf://datasets/nyu-mll/multi_nli/{splits['train']}")

In [2]:
# Keep only the label and the two text columns, restricted to the first 13,000 rows.
df = df.loc[:, ['label', 'premise', 'hypothesis']].head(13000)
df

Unnamed: 0,label,premise,hypothesis
0,1,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...
1,0,you know during the season and i guess at at y...,You lose the things to the following level if ...
2,0,One of our number will carry out your instruct...,A member of my team will execute your orders w...
3,0,How do you know? All this is their information...,This information belongs to them.
4,1,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.
...,...,...,...
12995,1,right you have to question you have to wonder ...,I would not mind living on an island to find out.
12996,2,Reviewers may not be familiar with the charact...,"Typically, reviewers are fully aware of an eme..."
12997,1,yeah it was Twins was good too because when i...,Twins was the best movie I saw last year.
12998,0,The Jews are Neanderthals.,Jewish people are like Neanderthals.


In [3]:
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pickle
import os
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

In [4]:
class MNLIDataBert(Dataset):
  """Encodes premise/hypothesis pairs for BERT and exposes them through DataLoaders.

  Both frames are tokenised eagerly in the constructor. Each frame must provide
  the columns 'premise', 'hypothesis' and 'label'.
  """

  # Maximum encoded pair length, special tokens included. Pairs that would
  # exceed it are truncated; bert-base-uncased cannot embed longer inputs.
  MAX_LEN = 512

  def __init__(self, train_df, val_df):
    """Keep both splits, load the tokenizer and encode the data.

    Args:
      train_df: DataFrame holding the training pairs.
      val_df: DataFrame holding the validation pairs.
    """
    self.train_df = train_df
    self.val_df = val_df

    self.base_path = '/content/'  # not read anywhere inside this class
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) # Using a pre-trained BERT tokenizer to encode sentences
    self.train_data = None
    self.val_data = None
    self.init_data()

  def init_data(self):
    """Encode the training and validation frames into TensorDatasets."""
    self.train_data = self.load_data(self.train_df)
    self.val_data = self.load_data(self.val_df)

  @staticmethod
  def _truncate_pair(premise_id, hypothesis_id, max_tokens):
    """Drop trailing tokens from whichever sequence is longer until the pair fits."""
    premise_id, hypothesis_id = list(premise_id), list(hypothesis_id)
    while len(premise_id) + len(hypothesis_id) > max_tokens:
      longer = premise_id if len(premise_id) > len(hypothesis_id) else hypothesis_id
      longer.pop()
    return premise_id, hypothesis_id

  def load_data(self, df):
    """Turn every row of `df` into padded model inputs.

    Each pair is laid out as [CLS] premise [SEP] hypothesis [SEP]. Pairs longer
    than MAX_LEN are truncated, then all rows are zero-padded to the longest row.

    Args:
      df: DataFrame with 'premise', 'hypothesis' and 'label' columns.

    Returns:
      TensorDataset of (token_ids, attention_mask, segment_ids, labels).
    """
    # Three positions are reserved for [CLS] and the two [SEP] tokens.
    max_pair_tokens = max(self.MAX_LEN - 3, 0)
    cls_id = self.tokenizer.cls_token_id
    sep_id = self.tokenizer.sep_token_id

    token_ids = []
    mask_ids = []
    seg_ids = []
    y = []

    premise_list = df['premise'].to_list()
    hypothesis_list = df['hypothesis'].to_list()
    label_list = df['label'].to_list()

    for (premise, hypothesis, label) in zip(premise_list, hypothesis_list, label_list):
      premise_id = self.tokenizer.encode(premise, add_special_tokens = False)
      hypothesis_id = self.tokenizer.encode(hypothesis, add_special_tokens = False)
      premise_id, hypothesis_id = self._truncate_pair(premise_id, hypothesis_id, max_pair_tokens)

      pair_token_ids = [cls_id] + premise_id + [sep_id] + hypothesis_id + [sep_id]
      premise_len = len(premise_id)
      hypothesis_len = len(hypothesis_id)

      # Segment 0 covers [CLS] + premise + [SEP]; segment 1 covers hypothesis + [SEP].
      segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1))
      # Real tokens get 1; the zero padding added below acts as the mask for padded positions.
      attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3))

      token_ids.append(torch.tensor(pair_token_ids))
      seg_ids.append(segment_ids)
      mask_ids.append(attention_mask_ids)
      y.append(label)

    token_ids = pad_sequence(token_ids, batch_first=True)
    mask_ids = pad_sequence(mask_ids, batch_first=True)
    seg_ids = pad_sequence(seg_ids, batch_first=True)
    y = torch.tensor(y)
    dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
    print(len(dataset))
    return dataset

  def get_data_loaders(self, batch_size=32, shuffle=True):
    """Build the training and validation DataLoaders.

    Args:
      batch_size: number of examples per batch for both loaders.
      shuffle: whether the training loader reshuffles its data each epoch.
        The validation loader always iterates in a fixed order.

    Returns:
      (train_loader, val_loader)
    """
    train_loader = DataLoader(
      self.train_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    # Evaluation does not benefit from shuffling; a fixed order keeps runs comparable.
    val_loader = DataLoader(
      self.val_data,
      shuffle=False,
      batch_size=batch_size
    )

    return train_loader, val_loader

In [5]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Inspect the held-out validation split.
val_df

Unnamed: 0,label,premise,hypothesis
3615,2,An ambitious plan for a hexagonally based chur...,"The complex plan of the hospital, came to noth..."
2536,1,for for city use and,Only the city can use it.
5397,0,The really valuable estate cannot be touched b...,The death tax is unable to reach the most impo...
9982,0,isn't that the truth it's funny in fact it's i...,I love that music from my childhood has return...
1498,0,Most drivers will be able to point out the Bok...,The Bok House is transformed into a restaurant.
...,...,...,...
11872,1,"As the road rises, the rugged countryside beco...",The hillsides are full of ferns and trees.
9264,0,The monastery rests in a fertile valley and is...,In a fertile valley surrounded by plane and pi...
7277,2,Since everyone who matters presumably knows al...,People who matter no nothing about who backs t...
3752,2,so what type of restaurant do you like,"You don't eat at restaurants at all, right?"


In [7]:
# Constructing the wrapper tokenises both splits right away and prints the size of each.
mnli_dataset = MNLIDataBert(train_df=train_df, val_df=val_df)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

10400
2600


In [8]:
# Uses the method's defaults (batch_size=32, shuffle=True).
train_loader, val_loader = mnli_dataset.get_data_loaders()

In [9]:
from transformers import BertForSequenceClassification

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Three output classes, matching the 0/1/2 values of the 'label' column.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.to(device)

# Biases and LayerNorm weights are exempt from weight decay; every other parameter decays at 0.01.
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in model.named_parameters() if 'bias' not in n and 'LayerNorm.weight' not in n],
        'weight_decay': 0.01,
    },
    {
        'params': [p for n, p in model.named_parameters() if 'bias' in n or 'LayerNorm.weight' in n],
        'weight_decay': 0.0,
    },
]

# transformers.AdamW is deprecated and missing from newer transformers releases,
# so use the PyTorch optimizer instead. eps=1e-6 mirrors the old class's default;
# torch.optim.AdamW always applies bias correction (the old call disabled it).
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-6)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def multi_acc(y_pred, y_test):
  """Fraction of rows whose highest-scoring class equals the target.

  Args:
    y_pred: 2-D tensor of per-class scores; the prediction is the argmax over dim 1.
    y_test: 1-D tensor of target class indices, one per row of `y_pred`.

  Returns:
    0-dim float tensor with the accuracy over the batch.
  """
  # (log-)softmax is monotonic, so the argmax of the raw scores already picks
  # the same class; the extra normalisation pass is unnecessary.
  predicted = y_pred.argmax(dim=1)
  acc = (predicted == y_test).sum().float() / float(y_test.size(0))
  return acc

In [11]:
import time

In [12]:
EPOCHS = 2


def _run_epoch(model, loader, optimizer=None):
  """Run one pass over `loader` and return (mean loss, accuracy), both per sample.

  With an `optimizer` the model is updated after every batch; without one only
  the forward pass runs. The caller sets train/eval mode and any no_grad context.
  """
  # Send batches to wherever the model lives rather than relying on a notebook global.
  first_param = next(model.parameters(), None)
  target = first_param.device if first_param is not None else torch.device("cpu")

  loss_sum = 0.0
  correct = 0
  seen = 0
  for pair_token_ids, mask_ids, seg_ids, y in loader:
    pair_token_ids = pair_token_ids.to(target)
    mask_ids = mask_ids.to(target)
    seg_ids = seg_ids.to(target)
    labels = y.to(target)

    if optimizer is not None:
      optimizer.zero_grad()

    outputs = model(pair_token_ids,
                    token_type_ids=seg_ids,
                    attention_mask=mask_ids,
                    labels=labels)
    # Read the fields by name; unpacking .values() breaks if the output has extra entries.
    loss, logits = outputs.loss, outputs.logits

    if optimizer is not None:
      loss.backward()
      optimizer.step()

    # Weight by batch size so a smaller final batch does not skew the averages.
    batch_size = labels.size(0)
    loss_sum += loss.item() * batch_size
    correct += (logits.argmax(dim=1) == labels).sum().item()
    seen += batch_size

  if seen == 0:
    return float('nan'), float('nan')
  return loss_sum / seen, correct / seen


def train(model, train_loader, val_loader, optimizer, epochs=None):
  """Fine-tune `model`, printing loss/accuracy and elapsed time after each epoch.

  Args:
    model: module called as model(ids, token_type_ids=..., attention_mask=..., labels=...)
      whose output exposes `.loss` and `.logits`.
    train_loader: iterable of (token_ids, attention_mask, segment_ids, labels) batches.
    val_loader: iterable with the same batch layout, used for evaluation only.
    optimizer: optimizer stepping the model's parameters.
    epochs: number of passes over the training data; defaults to the module-level EPOCHS.

  Returns:
    None. The model is left in eval mode.
  """
  if epochs is None:
    epochs = EPOCHS

  for epoch in range(epochs):
    start = time.time()

    model.train()
    train_loss, train_acc = _run_epoch(model, train_loader, optimizer)

    model.eval()
    with torch.no_grad():
      val_loss, val_acc = _run_epoch(model, val_loader)

    end = time.time()
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)

    print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
    print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

In [13]:
# Run fine-tuning; train() prints the metrics and elapsed time for every epoch.
train(model=model, train_loader=train_loader, val_loader=val_loader, optimizer=optimizer)

Epoch 1: train_loss: 0.8012 train_acc: 0.6405 | val_loss: 0.6349 val_acc: 0.7367
00:08:12.58
Epoch 2: train_loss: 0.4223 train_acc: 0.8425 | val_loss: 0.6711 val_acc: 0.7416
00:08:17.60


In [14]:
import torch
from transformers import BertTokenizer
import torch.nn.functional as F

# Tokenizer for inference, loaded with the same checkpoint name and casing option used for training.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

# Put the model in evaluation mode before running predictions.
model.eval()

# Function to predict entailment for a single premise-hypothesis pair
def predict_entailment(premise, hypothesis, max_length=512):
    """Classify one premise/hypothesis pair with the notebook's `model`.

    Args:
        premise: premise sentence as a string.
        hypothesis: hypothesis sentence as a string.
        max_length: maximum encoded length, special tokens included. Longer
            pairs are truncated so the model never receives an over-long input.

    Returns:
        (predicted_label, probs): the winning class index as an int, and the
        softmax probabilities as a tensor with a leading batch dimension of 1.
    """
    # Let the tokenizer assemble [CLS] premise [SEP] hypothesis [SEP] together with the
    # matching token_type_ids and attention_mask, instead of building them by hand.
    encoded = tokenizer(
        premise,
        hypothesis,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded).logits
        probs = F.softmax(logits, dim=1)

    predicted_label = torch.argmax(probs, dim=1).item()

    return predicted_label, probs

In [15]:
# Maps the model's predicted class index to a display name.
label_map = dict(enumerate(('Entailment', 'Neutral', 'Contradiction')))

In [16]:
# Example inputs, written as (premise, hypothesis) pairs and then split into
# two parallel lists so that premises[i] goes with hypotheses[i].
premises, hypotheses = (
    list(column)
    for column in zip(
        ("A man is playing a guitar.", "A person is making music."),
        ("Laura likes to go to restaurants every weekend.", "Laura doesn't eat at restaurants at all."),
        ("Messi is a proffesional football player.", "Akash is doing his homework."),
    )
)

In [17]:

# Classify every example pair and print one report per pair, followed by a divider line.
for premise, hypothesis in zip(premises, hypotheses):
    label, probs = predict_entailment(premise, hypothesis)
    print(
        f"Premise: {premise}",
        f"Hypothesis: {hypothesis}",
        f"Predicted label: {label_map[label]}",
        f"Probabilities: {probs}",
        '-'*80,
        sep="\n",
    )

Premise: A man is playing a guitar.
Hypothesis: A person is making music.
Predicted label: Entailment
Probabilities: tensor([[0.9668, 0.0200, 0.0132]], device='cuda:0')
--------------------------------------------------------------------------------
Premise: Laura likes to go to restaurants every weekend.
Hypothesis: Laura doesn't eat at restaurants at all.
Predicted label: Contradiction
Probabilities: tensor([[0.0016, 0.0022, 0.9962]], device='cuda:0')
--------------------------------------------------------------------------------
Premise: Messi is a proffesional football player.
Hypothesis: Akash is doing his homework.
Predicted label: Neutral
Probabilities: tensor([[0.0153, 0.6406, 0.3441]], device='cuda:0')
--------------------------------------------------------------------------------


In [18]:
# Output directories for the fine-tuned weights and the tokenizer files.
model_path, tokenizer_path = "./ema_task_model", "./ema_task_tokenizer"

# Write both to disk; the tokenizer call comes last so the cell shows the files it wrote.
model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)

('./ema_task_tokenizer/tokenizer_config.json',
 './ema_task_tokenizer/special_tokens_map.json',
 './ema_task_tokenizer/vocab.txt',
 './ema_task_tokenizer/added_tokens.json')

In [19]:
# Shell commands: recursively zip the saved model and tokenizer directories into one archive each.
!zip -r ema_task_model.zip ema_task_model
!zip -r ema_task_tokenizer.zip ema_task_tokenizer

  adding: ema_task_model/ (stored 0%)
  adding: ema_task_model/config.json (deflated 51%)
  adding: ema_task_model/model.safetensors (deflated 7%)
  adding: ema_task_tokenizer/ (stored 0%)
  adding: ema_task_tokenizer/vocab.txt (deflated 53%)
  adding: ema_task_tokenizer/special_tokens_map.json (deflated 42%)
  adding: ema_task_tokenizer/tokenizer_config.json (deflated 75%)
