import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

# Load the preprocessed dataset; the columns used below are
# 'Description', 'SAS_Class', and 'SDS_Class'.
df = pd.read_csv("dataset/processed_new_data.csv")

def prepare_dataset(df, tokenizer, max_length=512):
    """Tokenize each Description and pack the encodings and labels into a TensorDataset."""
    input_ids = []
    attention_masks = []
    labels = []

    for _, row in df.iterrows():
        # Encode with special tokens, padded and truncated to a fixed length.
        encoded = tokenizer.encode_plus(
            row['Description'],
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        # Two targets per example: the SAS class and the SDS class.
        labels.append([row['SAS_Class'], row['SDS_Class']])

    # Stack the per-row (1, max_length) tensors into (n_rows, max_length).
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels, dtype=torch.float)

    return TensorDataset(input_ids, attention_masks, labels)
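
# NOTE (assumption): the original script never constructs `tokenizer`, which
# prepare_dataset requires. The encode_plus call implies a Hugging Face-style
# tokenizer; a minimal sketch follows, where the BertTokenizer class and the
# 'bert-base-uncased' checkpoint are assumptions, not taken from the original.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')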

# Hold out 10% of the examples for validation.
train_df, val_df = train_test_split(df, test_size=0.1)
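
# The DataLoader, RandomSampler, and SequentialSampler imports above are never
# used in the original. A minimal sketch of the intended next step, assuming
# the usual pattern of shuffled batches for training and sequential batches
# for validation; the batch size is an assumption, not from the original.
batch_size = 16  # assumed; not specified in the original

train_dataset = prepare_dataset(train_df, tokenizer)
val_dataset = prepare_dataset(val_df, tokenizer)

# Shuffle training batches; keep validation order deterministic.
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)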