import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import torch
# Load the data
df = pd.read_csv("dataset/processed_new_data.csv")
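# The steps below expect at least three columns in this CSV:
# 'Description' (free text) plus the 'SAS_Class' and 'SDS_Class' labels.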
# Prepare the dataset: tokenize every description and pack the results
# into a TensorDataset of (input_ids, attention_mask, labels).
def prepare_dataset(df, tokenizer, max_length=512):
    input_ids = []
    attention_masks = []
    labels = []
    for _, row in df.iterrows():
        encoded = tokenizer.encode_plus(
            row['Description'],
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        # Each sample carries two labels: the SAS class and the SDS class.
        labels.append([row['SAS_Class'], row['SDS_Class']])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels, dtype=torch.float)
    return TensorDataset(input_ids, attention_masks, labels)
# Split the dataset into 90% training / 10% validation
train_df, val_df = train_test_split(df, test_size=0.1)
# Create the DataLoaders
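# A minimal sketch of this step, assuming a Hugging Face transformers tokenizer;
# the model name "bert-base-chinese" and batch_size=16 are illustrative
# placeholders, not values taken from the original file.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

# Tokenize each split and wrap it in a TensorDataset.
train_dataset = prepare_dataset(train_df, tokenizer)
val_dataset = prepare_dataset(val_df, tokenizer)

# Shuffle the training data with RandomSampler; keep the validation order fixed.
batch_size = 16
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)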