import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel

# The six analytic scoring dimensions predicted for each essay.
DIMENSIONS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']


class EssayDataset(Dataset):
    """Wraps essay texts and their six-dimension score targets for BERT."""

    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        target = self.targets[item]
        # Tokenize, pad/truncate to max_len, and return PyTorch tensors.
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.float)
        }


class EssayScoreRegressor(nn.Module):
    """BERT encoder with a dropout-regularized linear head that regresses
    all six scores at once."""

    def __init__(self, n_outputs):
        super(EssayScoreRegressor, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_outputs)

    def forward(self, input_ids, attention_mask):
        # Use the pooled [CLS] representation as the essay embedding.
        pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )['pooler_output']
        output = self.drop(pooled_output)
        return self.out(output)


def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    model = model.train()
    losses = []
    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        # The scheduler is stepped per batch, matching total_steps below.
        scheduler.step()
    return np.mean(losses)


def eval_epoch(model, data_loader, loss_fn, device):
    # Evaluate on held-out data with gradients disabled.
    model = model.eval()
    losses = []
    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())
    return np.mean(losses)


def train_model(train_data, val_data, tokenizer, model, optimizer, scheduler,
                device, epochs, batch_size, max_len):
    train_dataset = EssayDataset(
        texts=train_data['full_text'].to_numpy(),
        targets=train_data[DIMENSIONS].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    val_dataset = EssayDataset(
        texts=val_data['full_text'].to_numpy(),
        targets=val_data[DIMENSIONS].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_data_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    loss_fn = nn.MSELoss().to(device)

    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        print('-' * 10)
        train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device, scheduler)
        print(f'Train loss {train_loss}')
        # Track held-out loss each epoch; the validation loader was previously
        # built but never used.
        val_loss = eval_epoch(model, val_data_loader, loss_fn, device)
        print(f'Val loss {val_loss}')


if __name__ == "__main__":
    df = pd.read_csv('train.csv')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = EssayScoreRegressor(n_outputs=len(DIMENSIONS))
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=2e-5)

    # 80/20 train/validation split.
    train_data = df.sample(frac=0.8, random_state=42)
    val_data = df.drop(train_data.index)

    epochs = 5
    batch_size = 16
    # One scheduler step is taken per batch, so derive the total step count
    # from the training split; computing it from len(df) overcounts by 25%
    # and the StepLR decay would never fire.
    total_steps = (len(train_data) // batch_size) * epochs
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=total_steps, gamma=0.1)

    train_model(train_data, val_data, tokenizer, model, optimizer, scheduler,
                device, epochs=epochs, batch_size=batch_size, max_len=160)
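
# --- Optional: linear warmup schedule ---
# StepLR with step_size equal to the total step count effectively keeps the
# learning rate constant until the very last step. A minimal sketch of a
# common alternative for BERT fine-tuning, the linear warmup/decay schedule
# shipped with transformers; `make_warmup_scheduler` and the 10% warmup
# fraction are illustrative assumptions, not part of the original script.
def make_warmup_scheduler(optimizer, total_steps, warmup_frac=0.1):
    from transformers import get_linear_schedule_with_warmup
    # Ramp the learning rate up over the first warmup_frac of steps,
    # then decay it linearly to zero over the remainder.
    return get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(warmup_frac * total_steps),
        num_training_steps=total_steps,
    )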
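
# --- Usage sketch: scoring a single essay ---
# A minimal inference helper, assuming a trained `model`, `tokenizer`, and
# `device` as set up above; `predict_scores` is a hypothetical helper added
# for illustration, not part of the original script.
def predict_scores(text, model, tokenizer, device, max_len=160):
    model.eval()
    # Mirror the training-time preprocessing for a single essay.
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )
    with torch.no_grad():
        outputs = model(
            input_ids=encoding['input_ids'].to(device),
            attention_mask=encoding['attention_mask'].to(device),
        )
    # Map the six regression outputs back to their dimension names.
    return dict(zip(DIMENSIONS, outputs.squeeze(0).tolist()))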