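# Fine-tunes bert-base-uncased as a multi-output regressor that predicts six
# analytic writing scores from essay text. Expects a train.csv with a
# 'full_text' column plus one column per dimension listed in DIMENSIONS.
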
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer

# The six analytic dimensions the model predicts, one regression output each.
DIMENSIONS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']


class EssayDataset(Dataset):
    """Wraps essay texts and per-dimension scores as BERT-ready tensors."""

    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        target = self.targets[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.float)
        }
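

# Shape sketch for a single item (illustrative, given the settings above):
#   input_ids      -> torch.LongTensor, shape (max_len,)
#   attention_mask -> torch.LongTensor, shape (max_len,)
#   targets        -> torch.FloatTensor, shape (len(DIMENSIONS),)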


class EssayScoreRegressor(nn.Module):
    """BERT encoder with a dropout-regularized linear head for multi-output regression."""

    def __init__(self, n_outputs):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_outputs)

    def forward(self, input_ids, attention_mask):
        # 'pooler_output' is the tanh-projected [CLS] embedding.
        pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )['pooler_output']
        output = self.drop(pooled_output)
        return self.out(output)
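

# Alternative head input (an illustrative sketch, not used by this script):
# mean-pool the token embeddings instead of taking the [CLS] pooler output.
def mean_pool(last_hidden_state, attention_mask):
    # Zero out padding positions, then average over the real tokens only.
    mask = attention_mask.unsqueeze(-1).float()
    return (last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)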


def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    model = model.train()
    losses = []
    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
    return np.mean(losses)
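

# The original loop never touches the validation loader it builds; this is a
# minimal evaluation pass (an addition) so train_model can report val loss.
def eval_epoch(model, data_loader, loss_fn, device):
    model = model.eval()
    losses = []
    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            losses.append(loss_fn(outputs, targets).item())
    return np.mean(losses)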


def train_model(train_data, val_data, tokenizer, model, optimizer, scheduler, device, epochs, batch_size, max_len):
    train_dataset = EssayDataset(
        texts=train_data['full_text'].to_numpy(),
        targets=train_data[DIMENSIONS].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    val_dataset = EssayDataset(
        texts=val_data['full_text'].to_numpy(),
        targets=val_data[DIMENSIONS].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    train_data_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True
    )
    val_data_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False
    )
    loss_fn = nn.MSELoss().to(device)
    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        print('-' * 10)
        train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler
        )
        print(f'Train loss {train_loss:.4f}')
        val_loss = eval_epoch(model, val_data_loader, loss_fn, device)
        print(f'Val loss {val_loss:.4f}')
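

# Illustrative addition: collect per-dimension predictions from a fitted
# model, assuming batches come from an EssayDataset-backed DataLoader.
def get_predictions(model, data_loader, device):
    model = model.eval()
    predictions = []
    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions.append(outputs.cpu().numpy())
    return np.concatenate(predictions, axis=0)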


if __name__ == "__main__":
    df = pd.read_csv('train.csv')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = EssayScoreRegressor(n_outputs=len(DIMENSIONS))
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=2e-5)
    epochs = 5
    batch_size = 16
    total_steps = len(df) // batch_size * epochs
    # Note: scheduler.step() runs once per batch, so with step_size=total_steps
    # the decay never fires during this run; the LR stays effectively constant.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=total_steps, gamma=0.1)
    train_data = df.sample(frac=0.8, random_state=42)
    val_data = df.drop(train_data.index)
    train_model(train_data, val_data, tokenizer, model, optimizer, scheduler, device,
                epochs=epochs, batch_size=batch_size, max_len=160)
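
    # Illustrative usage of get_predictions (an addition to the original
    # script): score the held-out validation split after training.
    val_loader = DataLoader(
        EssayDataset(
            texts=val_data['full_text'].to_numpy(),
            targets=val_data[DIMENSIONS].to_numpy(),
            tokenizer=tokenizer,
            max_len=160
        ),
        batch_size=batch_size,
        shuffle=False
    )
    val_preds = get_predictions(model, val_loader, device)
    print(pd.DataFrame(val_preds, columns=DIMENSIONS).describe())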