import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import get_linear_schedule_with_warmup, AutoTokenizer, AutoModel, logging
import warnings
import time
import pickle
warnings.filterwarnings("ignore")
logging.set_verbosity_error()
# Function to set seed for reproducibility
def seed_everything(seed_value):
    np.random.seed(seed_value)  # Set seed for numpy random numbers
    torch.manual_seed(seed_value)  # Set seed for PyTorch random numbers
    if torch.cuda.is_available():  # If CUDA is available, set CUDA seed
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True  # Ensure deterministic behavior
        torch.backends.cudnn.benchmark = True  # Improve performance by allowing cudnn benchmarking
seed_everything(86) # Set seed value for reproducibility
model_name = "bluenguyen/longformer-phobert-base-4096" # Pretrained model name
max_len = 512  # Maximum sequence length for the tokenizer (512 here; 256 is enough when using a plain PhoBERT-base checkpoint)
n_classes = 13 # Number of output classes
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) # Load tokenizer
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') # Set device to GPU if available, otherwise CPU
EPOCHS = 5 # Number of training epochs
N_SPLITS = 5 # Number of folds for cross-validation
TRAIN_PATH = "data/train_data_162k.json"
TEST_PATH = "data/test_data_162k.json"
VAL_PATH = "data/val_data_162k.json"
# Function to read data from JSON file
def get_data(path):
    df = pd.read_json(path, lines=True)  # lines=True reads JSON Lines: one JSON object per line
    return df
# Read the data from JSON files
train_df = get_data(TRAIN_PATH)
test_df = get_data(TEST_PATH)
valid_df = get_data(VAL_PATH)
# Combine train and validation data
train_df = pd.concat([train_df, valid_df], ignore_index=True)
# Apply StratifiedKFold
skf = StratifiedKFold(n_splits=N_SPLITS)
for fold, (_, val_) in enumerate(skf.split(X=train_df, y=train_df.category)):
    train_df.loc[val_, "kfold"] = fold
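# Each row now carries a "kfold" value in [0, N_SPLITS); for fold f, rows with kfold == f
# form the validation split and all remaining rows form that fold's training split.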
class NewsDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """
        To customize a dataset, inherit from the Dataset class and implement
        __len__ and __getitem__.
        __getitem__ should return a dict containing:
            input_ids
            attention_masks
            text
            targets
        """
        row = self.df.iloc[index]
        text, label = self.get_input_data(row)
        # encode_plus will:
        # (1) split the text into tokens
        # (2) add the '[CLS]' and '[SEP]' tokens to the start and end
        # (3) truncate/pad the sentence to the max length
        # (4) map tokens to their IDs
        # (5) create the attention mask
        # (6) return a dictionary of outputs
        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )
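        # With return_tensors='pt' and padding='max_length', both returned tensors have
        # shape (1, max_len); .flatten() below turns them into 1-D tensors of length max_len.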
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_masks': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(label, dtype=torch.long),
        }

    def labelencoder(self, text):
        label_map = {
            'Cong nghe': 0, 'Doi song': 1, 'Giai tri': 2, 'Giao duc': 3, 'Khoa hoc': 4,
            'Kinh te': 5, 'Nha dat': 6, 'Phap luat': 7, 'The gioi': 8, 'The thao': 9,
            'Van hoa': 10, 'Xa hoi': 11, 'Xe co': 12
        }
        return label_map.get(text, -1)
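    # Note: category strings missing from the map fall back to -1, which nn.CrossEntropyLoss
    # rejects as an out-of-range target, so label_map must cover every label in the data.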
    def get_input_data(self, row):
        text = row['processed_content']
        label = self.labelencoder(row['category'])
        return text, label
class NewsClassifier(nn.Module):
    def __init__(self, n_classes, model_name):
        super(NewsClassifier, self).__init__()
        # Load a pre-trained BERT model
        self.bert = AutoModel.from_pretrained(model_name)
        # Dropout layer to prevent overfitting
        self.drop = nn.Dropout(p=0.3)
        # Fully-connected layer mapping BERT's hidden state to the number of classes to predict
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)
        # Initialize the weights and bias of the fully-connected layer from a normal distribution
        nn.init.normal_(self.fc.weight, std=0.02)
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        # Get the outputs from the BERT model; with return_dict=False this is
        # (last_hidden_state, pooled output of the [CLS] token)
        last_hidden_state, output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False
        )
        # Apply dropout
        x = self.drop(output)
        # Pass through the fully-connected layer to get predictions
        x = self.fc(x)
        return x
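    # forward() returns raw logits of shape (batch_size, n_classes); no softmax is applied
    # here because nn.CrossEntropyLoss below expects unnormalized logits.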
def prepare_loaders(df, fold):
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    train_dataset = NewsDataset(df_train, tokenizer, max_len)
    valid_dataset = NewsDataset(df_valid, tokenizer, max_len)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)
    valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=True, num_workers=2)
    return train_loader, valid_loader
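# Note: shuffling the validation loader does not change the reported metrics,
# since accuracy and mean loss are independent of batch order.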
# Function to train the model for one epoch
def train(model, criterion, optimizer, train_loader, lr_scheduler):
    model.train()  # Set the model to training mode
    losses = []  # List to store losses during training
    correct = 0  # Variable to store number of correct predictions
    # Iterate over batches in the training data loader
    for batch_idx, data in enumerate(train_loader):
        input_ids = data['input_ids'].to(device)  # Move input_ids to GPU/CPU
        attention_mask = data['attention_masks'].to(device)  # Move attention_mask to GPU/CPU
        targets = data['targets'].to(device)  # Move targets to GPU/CPU
        optimizer.zero_grad()  # Clear gradients from the previous iteration
        outputs = model(  # Forward pass through the model
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        loss = criterion(outputs, targets)  # Calculate the loss
        _, pred = torch.max(outputs, dim=1)  # Get the predicted labels
        correct += torch.sum(pred == targets)  # Count correct predictions
        losses.append(loss.item())  # Append the current loss value to the losses list
        loss.backward()  # Backpropagation: compute gradients
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Clip gradients to prevent exploding gradients
        optimizer.step()  # Update model parameters
        lr_scheduler.step()  # Update the learning rate scheduler
        # Print training progress every 1000 batches
        if batch_idx % 1000 == 0:
            print(f'Batch {batch_idx}/{len(train_loader)} - Loss: {loss.item():.4f}, Accuracy: {correct.double() / ((batch_idx + 1) * train_loader.batch_size):.4f}')
    train_accuracy = correct.double() / len(train_loader.dataset)  # Calculate training accuracy
    avg_loss = np.mean(losses)  # Calculate average loss
    print(f'Train Accuracy: {train_accuracy:.4f} Loss: {avg_loss:.4f}')
# Function to evaluate the model
def eval(model, criterion, valid_loader, test_loader=None):
    model.eval()  # Set the model to evaluation mode
    losses = []  # List to store losses during evaluation
    correct = 0  # Variable to store number of correct predictions
    with torch.no_grad():  # Disable gradient calculation for evaluation
        data_loader = test_loader if test_loader else valid_loader  # Choose between test and validation data loader
        for batch_idx, data in enumerate(data_loader):
            input_ids = data['input_ids'].to(device)  # Move input_ids to GPU/CPU
            attention_mask = data['attention_masks'].to(device)  # Move attention_mask to GPU/CPU
            targets = data['targets'].to(device)  # Move targets to GPU/CPU
            outputs = model(  # Forward pass through the model
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            loss = criterion(outputs, targets)  # Calculate the loss
            _, pred = torch.max(outputs, dim=1)  # Get the predicted labels
            correct += torch.sum(pred == targets)  # Count correct predictions
            losses.append(loss.item())  # Append the current loss value to the losses list
    dataset_size = len(test_loader.dataset) if test_loader else len(valid_loader.dataset)  # Determine dataset size
    accuracy = correct.double() / dataset_size  # Calculate accuracy
    avg_loss = np.mean(losses)  # Calculate average loss
    # Print evaluation results (either test or validation)
    if test_loader:
        print(f'Test Accuracy: {accuracy:.4f} Loss: {avg_loss:.4f}')
    else:
        print(f'Valid Accuracy: {accuracy:.4f} Loss: {avg_loss:.4f}')
    return accuracy  # Return accuracy for further analysis or logging
total_start_time = time.time()
# Main training loop
for fold in range(skf.n_splits):
    print(f'----------- Fold: {fold + 1} ------------------')
    train_loader, valid_loader = prepare_loaders(train_df, fold=fold)
    model = NewsClassifier(n_classes=n_classes, model_name=model_name).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=2e-5)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_loader) * EPOCHS
    )
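    # With num_warmup_steps=0 there is no warmup phase: the learning rate decays linearly
    # from 2e-5 to 0 over the len(train_loader) * EPOCHS optimizer steps of this fold.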
    best_acc = 0
    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 30)
        train(model, criterion, optimizer, train_loader, lr_scheduler)
        val_acc = eval(model, criterion, valid_loader)
        if val_acc > best_acc:
            torch.save(model.state_dict(), f'phobert_fold{fold + 1}.pth')
            best_acc = val_acc
        print(f'Best Accuracy for Fold {fold + 1}: {best_acc:.4f}')
        print()
    print(f'Finished Fold {fold + 1} with Best Accuracy: {best_acc:.4f}')
    print('--------------------------------------')
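# After all folds finish there are N_SPLITS checkpoint files (phobert_fold1.pth ... phobert_fold5.pth),
# each holding the weights that reached the best validation accuracy for its fold.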
total_end_time = time.time()
total_duration = total_end_time - total_start_time
print(f'Total training time: {total_duration:.2f} seconds')