from transformers import get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import torch
from sklearn.model_selection import train_test_split
from dataset.load_dataset import df, prepare_dataset
from torch.nn import BCEWithLogitsLoss
from transformers import BertConfig
from tqdm.auto import tqdm
from torch.cuda.amp import GradScaler, autocast
from torch.utils.tensorboard import SummaryWriter
import datetime
import os

# Initialize the TensorBoard SummaryWriter with a timestamped run directory
current_time = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = f'runs/train_{current_time}'
writer = SummaryWriter(log_dir)
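# (View the live training curves with: tensorboard --logdir runs)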
epochs = 10
lr = 1e-5
optimizer_name = 'AdamW'
loss_fn_name = 'BCEWithLogitsLoss'
batch_size = 16
# Build the model save path, encoding the key hyperparameters in the filename
model_save_name = f'model_{current_time}_lr{lr}_opt{optimizer_name}_loss{loss_fn_name}_batch{batch_size}_epoch{epochs}.pt'
model_save_path = f'./saved_models/{model_save_name}'
os.makedirs('./saved_models', exist_ok=True)  # make sure the output directory exists before training starts
# Tokenizer that converts raw text into the input format the model expects
tokenizer = AutoTokenizer.from_pretrained(
    "pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU if available, otherwise the CPU
# Split the dataset: 90% training, 10% validation
train_df, val_df = train_test_split(df, test_size=0.1)  # pass random_state=... for a reproducible split
# Prepare the training and validation datasets
train_dataset = prepare_dataset(train_df, tokenizer)
val_dataset = prepare_dataset(val_df, tokenizer)
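# The training loop below unpacks each batch as (input_ids, attention_mask, labels),
# so prepare_dataset (from dataset.load_dataset) is assumed to return a TensorDataset
# in that order, with labels as float multi-hot vectors over the 8 classes.
# A minimal sketch under that assumption ('text' and LABEL_COLUMNS are placeholders
# for the actual column names):
#
#   def prepare_dataset(frame, tokenizer):
#       enc = tokenizer(list(frame['text']), padding=True, truncation=True, return_tensors='pt')
#       labels = torch.tensor(frame[LABEL_COLUMNS].values, dtype=torch.float)
#       return TensorDataset(enc['input_ids'], enc['attention_mask'], labels)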
# Build the dataloaders: random sampling for training, sequential for validation
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)
# Load the model configuration and set the output size
config = BertConfig.from_pretrained("pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition")
config.num_labels = 8  # adjust to the number of labels in your dataset
model = AutoModelForSequenceClassification.from_pretrained(
"pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition", config=config, ignore_mismatched_sizes=True).to(
device)
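# ignore_mismatched_sizes lets the pretrained checkpoint load even though the
# classification head is re-initialized to match the 8-label config above.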
# Set up the optimizer and learning-rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-8)
total_steps = len(train_dataloader) * epochs  # one optimizer step per batch, over all epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
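# With num_warmup_steps=0 the learning rate starts at lr and decays linearly to 0
# over total_steps; scheduler.step() must be called once per batch (see the loop below).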
loss_fn = BCEWithLogitsLoss()
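# BCEWithLogitsLoss fuses a sigmoid with binary cross-entropy in one numerically
# stable op, treating each of the 8 labels as an independent binary decision
# (multi-label classification); it expects float targets with the same shape as the logits.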
# Fine-tune the model with mixed-precision (AMP) training
scaler = GradScaler()
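# GradScaler scales the loss before backward so small float16 gradients don't
# underflow, then unscales them before the optimizer step.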
for epoch in range(epochs):  # iterate over training epochs
print(f"\nEpoch {epoch + 1}/{epochs}")
print('-------------------------------')
model.train()
total_loss = 0
train_progress_bar = tqdm(train_dataloader, desc="Training", leave=False)
    for step, batch in enumerate(train_progress_bar):
        # Move the batch to the target device
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass under autocast so it runs in mixed precision
        with autocast():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs.logits
            loss = loss_fn(logits, b_labels.float())  # BCEWithLogitsLoss expects float targets
        # Check for a NaN loss before it pollutes the statistics. Skipping the batch
        # does not fix the root cause, so investigate why the loss became NaN.
        if torch.isnan(loss).any():
            print(f"Loss is nan in epoch {epoch + 1}, step {step}.")
            continue  # skip the backward pass and optimizer step for this batch
        total_loss += loss.item()
        # Backward pass with gradient scaling, then unscale before clipping so the
        # max_norm threshold applies to the true (unscaled) gradients
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()  # advance the linear decay schedule once per batch
        train_progress_bar.set_postfix({'loss': f"{loss.item():.2f}"})
        # Log the training loss to TensorBoard
        writer.add_scalar('Loss/train', loss.item(), epoch * len(train_dataloader) + step)
    # Average training loss for this epoch
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Training loss: {avg_train_loss:.2f}")
    # Validation phase
    model.eval()
    total_eval_accuracy = 0
    eval_progress_bar = tqdm(validation_dataloader, desc="Validation", leave=False)
    total_eval_loss = 0  # running total of the validation loss
    for batch in eval_progress_bar:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs.logits
        # Note: adjust this if your loss function and the model outputs have different shape requirements
        loss = loss_fn(logits, b_labels.float())
        total_eval_loss += loss.item()  # accumulate the batch loss
        # Convert logits to probabilities with a sigmoid
        probs = torch.sigmoid(logits)
        # Threshold at 0.5: probabilities above 0.5 are predicted positive (1), the rest negative (0)
        predictions = (probs > 0.5).int()
        # Compare predictions against the ground-truth labels
        correct_predictions = (predictions == b_labels.int()).float()  # cast labels to int for the comparison
        # Average the correct predictions per sample, then over the whole batch
        accuracy_per_sample = correct_predictions.mean(dim=1)
        accuracy = accuracy_per_sample.mean().item()
        total_eval_accuracy += accuracy
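        # Note: this is element-wise (per-label) accuracy. For multi-label tasks a
        # stricter exact-match ("subset") accuracy is sometimes reported as well, e.g.:
        #   subset_acc = (predictions == b_labels.int()).all(dim=1).float().mean().item()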
        # Update the progress bar
        eval_progress_bar.set_postfix({'accuracy': f"{accuracy:.2f}"})
    # Average the loss over the whole validation set
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print(f"Validation Loss: {avg_val_loss:.2f}")
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    writer.add_scalar('Loss/val', avg_val_loss, epoch)  # log the validation loss to TensorBoard
    print(f"Validation Accuracy: {avg_val_accuracy:.2f}")
writer.close()
# Save the fine-tuned weights
torch.save(model.state_dict(), model_save_path)
print(f"Training finished; model saved to {model_save_path}")