import datetime
import os

import torch
from sklearn.model_selection import train_test_split
from torch.cuda.amp import GradScaler, autocast
from torch.nn import BCEWithLogitsLoss
from torch.optim import AdamW  # transformers' AdamW is deprecated; torch's optimizer is the drop-in replacement
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.tensorboard import SummaryWriter
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BertConfig, get_linear_schedule_with_warmup

from dataset.load_dataset import df, prepare_dataset

# Initialize the TensorBoard SummaryWriter
current_time = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = f'runs/train_{current_time}'
writer = SummaryWriter(log_dir)
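# The logged scalars can be inspected during or after training with the standard
# TensorBoard CLI, e.g. `tensorboard --logdir runs`, then opening the printed local URL.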
epochs = 10
lr = 1e-5
optimizer_name = 'AdamW'
loss_fn_name = 'BCEWithLogitsLoss'
batch_size = 16

# Build the model save path, encoding the important hyperparameters in the filename
model_save_name = f'model_{current_time}_lr{lr}_opt{optimizer_name}_loss{loss_fn_name}_batch{batch_size}_epoch{epochs}.pt'
model_save_path = f'./saved_models/{model_save_name}'
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)  # make sure the save directory exists before training

tokenizer = AutoTokenizer.from_pretrained(
    "pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition")  # tokenizer that converts text into the model's input format
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when available, otherwise the CPU

# Split the dataset: 90% training, 10% validation
train_df, val_df = train_test_split(df, test_size=0.1)

# Prepare the training and validation datasets
train_dataset = prepare_dataset(train_df, tokenizer)
val_dataset = prepare_dataset(val_df, tokenizer)
# train_dataloader and validation_dataloader are now ready for training and validation
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)
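
# The training loop below unpacks each batch as (input_ids, attention_mask, labels), so
# prepare_dataset is assumed to return something shaped like the sketch below. This is a
# hypothetical illustration only (the column names are placeholders); the real
# implementation lives in dataset.load_dataset:
#
#     encodings = tokenizer(list(split_df['text']), truncation=True, padding=True, return_tensors='pt')
#     labels = torch.tensor(split_df[label_columns].values, dtype=torch.float)  # (N, 8) multi-hot floats
#     return TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)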

# Load the model configuration
config = BertConfig.from_pretrained("pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition")
config.num_labels = 8  # 4 SAS_Class outputs + 4 SDS_Class outputs; adjust to your label count

model = AutoModelForSequenceClassification.from_pretrained(
    "pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition", config=config, ignore_mismatched_sizes=True).to(
    device)
# Set up the optimizer and learning-rate scheduler
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
total_steps = len(train_dataloader) * epochs  # the scheduler steps once per batch
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
loss_fn = BCEWithLogitsLoss()
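
# BCEWithLogitsLoss applies an element-wise sigmoid, so it expects float targets with the
# same shape as the logits; here that means (batch_size, 8) multi-hot labels. A quick
# sanity check on dummy tensors (illustrative values, not real data):
#
#     dummy_logits = torch.randn(2, 8)
#     dummy_labels = torch.tensor([[0, 1, 0, 0, 0, 0, 1, 0],
#                                  [1, 0, 0, 0, 0, 0, 0, 1]], dtype=torch.float)
#     print(loss_fn(dummy_logits, dummy_labels))  # prints a scalar loss tensor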
# Fine-tune the model with mixed-precision training
scaler = GradScaler()

for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    print('-------------------------------')
    model.train()
    total_loss = 0
    train_progress_bar = tqdm(train_dataloader, desc="Training", leave=False)
    for step, batch in enumerate(train_progress_bar):
        # Move the batch to the target device
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()
        # Forward pass under autocast so GradScaler actually provides mixed precision
        with autocast():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs.logits
            loss = loss_fn(logits, b_labels)
        # Check for a NaN loss before accumulating or backpropagating
        if torch.isnan(loss).any():
            print(f"Loss is nan in epoch {epoch + 1}, step {step}.")
            # Note: skipping the batch only hides the symptom; inspect the data for the root cause
            continue
        total_loss += loss.item()
        # Backward pass with gradient scaling; unscale before clipping so the norm
        # is computed on the true gradients
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()  # advance the linear warmup schedule once per batch
        train_progress_bar.set_postfix({'loss': f"{loss.item():.2f}"})
        # Log the training loss to TensorBoard
        writer.add_scalar('Loss/train', loss.item(), epoch * len(train_dataloader) + step)

    # Average training loss over the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Training loss: {avg_train_loss:.2f}")

    # Validation phase
    model.eval()
    total_eval_accuracy = 0
    eval_progress_bar = tqdm(validation_dataloader, desc="Validation", leave=False)
    total_eval_loss = 0  # running total of the validation loss

    for batch in eval_progress_bar:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs.logits
        # BCEWithLogitsLoss expects labels with the same shape as the logits:
        # (batch_size, 8) multi-hot floats
        loss = loss_fn(logits, b_labels)
        total_eval_loss += loss.item()

        # The 8 logits are two independent 4-way heads:
        # the first 4 columns score SAS_Class, the last 4 score SDS_Class
        logits_sas = logits[:, :4]
        logits_sds = logits[:, 4:]
        # Softmax within each head to get a probability distribution over its 4 classes
        probs_sas = torch.softmax(logits_sas, dim=1)
        probs_sds = torch.softmax(logits_sds, dim=1)

        # Take the highest-probability class in each head as the prediction
        predictions_sas = torch.argmax(probs_sas, dim=1)
        predictions_sds = torch.argmax(probs_sds, dim=1)

        # Recover the true class indices from the multi-hot labels
        # (assumes the same layout as the logits: first 4 columns SAS, last 4 SDS)
        true_sas = b_labels[:, :4].argmax(dim=1)
        true_sds = b_labels[:, 4:].argmax(dim=1)

        # Per-head accuracy
        accuracy_sas = (predictions_sas == true_sas).float().mean()
        accuracy_sds = (predictions_sds == true_sds).float().mean()

        # Combine the two heads into a single score
        accuracy = ((accuracy_sas + accuracy_sds) / 2).item()
        total_eval_accuracy += accuracy
        # Update the progress bar
        eval_progress_bar.set_postfix({'accuracy': f"{accuracy:.2f}"})

    # Average loss and accuracy over the whole validation set
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print(f"Validation Loss: {avg_val_loss:.2f}")
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    writer.add_scalar('Loss/val', avg_val_loss, epoch)
    writer.add_scalar('Accuracy/val', avg_val_accuracy, epoch)
    print(f"Validation Accuracy: {avg_val_accuracy:.2f}")

writer.close()
# Save the fine-tuned weights
torch.save(model.state_dict(), model_save_path)
print(f"Training finished, model saved to: {model_save_path}")