import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
# The six analytic writing dimensions to be predicted for each essay.
DIMENSIONS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
class EssayDataset(Dataset):
    """Wraps essay texts and their six-dimension scores for BERT."""

    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        target = self.targets[item]
        # Tokenize, pad/truncate to max_len, and return PyTorch tensors.
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.float)
        }
class EssayScoreRegressor(nn.Module):
    """BERT encoder with a dropout-regularized linear head that predicts
    all six scores at once."""

    def __init__(self, n_outputs):
        super(EssayScoreRegressor, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_outputs)

    def forward(self, input_ids, attention_mask):
        # 'pooler_output' is the [CLS] token representation passed through
        # BERT's pooling layer.
        pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )['pooler_output']
        output = self.drop(pooled_output)
        return self.out(output)
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    """Run one training epoch and return the mean batch loss.

    The original signature also took an unused n_examples argument,
    dropped here.
    """
    model = model.train()
    losses = []
    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return np.mean(losses)
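

# NOTE: minimal validation helper, added as a sketch; it is not part of the
# original script, which built a validation loader but never consumed it.
# It mirrors train_epoch without gradient updates.
def eval_model(model, data_loader, loss_fn, device):
    model = model.eval()
    losses = []
    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            losses.append(loss_fn(outputs, targets).item())
    return np.mean(losses)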
def train_model(train_data, val_data, tokenizer, model, optimizer, scheduler, device, epochs, batch_size, max_len):
    train_dataset = EssayDataset(
        texts=train_data['full_text'].to_numpy(),
        targets=train_data[DIMENSIONS].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    val_dataset = EssayDataset(
        texts=val_data['full_text'].to_numpy(),
        targets=val_data[DIMENSIONS].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    train_data_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True
    )
    val_data_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False
    )

    # Mean squared error averaged over all six score dimensions.
    loss_fn = nn.MSELoss().to(device)

    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        print('-' * 10)
        train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler
        )
        print(f'Train loss {train_loss}')
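        # Per-epoch validation pass, using the eval_model sketch above
        # (added here so val_data_loader is actually consumed).
        val_loss = eval_model(model, val_data_loader, loss_fn, device)
        print(f'Val loss {val_loss}')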
if __name__ == "__main__":
    EPOCHS = 5
    BATCH_SIZE = 16
    MAX_LEN = 160

    df = pd.read_csv('train.csv')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = EssayScoreRegressor(n_outputs=len(DIMENSIONS))
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=2e-5)

    # 80/20 train/validation split.
    train_data = df.sample(frac=0.8, random_state=42)
    val_data = df.drop(train_data.index)

    # The original computed total_steps from the full dataframe even though
    # only the 80% train split is used; fixed here. Note that with
    # step_size equal to the total number of optimizer steps, StepLR fires
    # its one decay only at the very end, so the learning rate is
    # effectively constant; a warmup-and-decay schedule is a common
    # alternative for BERT fine-tuning.
    total_steps = len(train_data) // BATCH_SIZE * EPOCHS
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=total_steps, gamma=0.1)

    train_model(train_data, val_data, tokenizer, model, optimizer, scheduler,
                device, epochs=EPOCHS, batch_size=BATCH_SIZE, max_len=MAX_LEN)
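
    # Inference sketch (assumption: not in the original script): score one
    # held-out essay with the fine-tuned model to show the output format.
    sample_text = val_data['full_text'].iloc[0]
    model.eval()
    enc = tokenizer.encode_plus(
        sample_text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    with torch.no_grad():
        preds = model(
            input_ids=enc['input_ids'].to(device),
            attention_mask=enc['attention_mask'].to(device)
        )
    # Map the six predicted values back to their dimension names.
    print(dict(zip(DIMENSIONS, preds.squeeze(0).tolist())))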