Tokymin committed
Commit 28578be
1 Parent(s): fc547f0

Training loss: 0.30
Validation Loss: 0.28
Validation Accuracy: 0.90
training end, save model to: ./saved_models/model_20240302-214915_lr1e-05_optAdamW_lossBCEWithLogitsLoss_batch16_epoch10.pt
Files changed (6)
  1. app.py +29 -2
  2. dataset/CustomDataset.py +0 -40
  3. new.py +0 -108
  4. test.py +0 -18
  5. test2.py +0 -23
  6. train.py +131 -36
app.py CHANGED
@@ -1,4 +1,31 @@
  import streamlit as st
-
- x = st.slider('Select a value')
- st.write(x, 'squared is', x * x)
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import torch
+
+ # Load the model and the tokenizer
+ tokenizer = AutoTokenizer.from_pretrained("your_model_directory")
+ model = AutoModelForSequenceClassification.from_pretrained("your_model_directory", num_labels=8)
+ model.eval()
+
+ def predict(text):
+     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+     with torch.no_grad():
+         outputs = model(**inputs)
+     logits = outputs.logits
+     probabilities = torch.softmax(logits, dim=1).squeeze()
+     # Assume each class group (SAS_Class and SDS_Class) has 4 probability values
+     sas_probs = probabilities[:4]  # probabilities for SAS_Class
+     sds_probs = probabilities[4:]  # probabilities for SDS_Class
+     return sas_probs, sds_probs
+
+ # Build the Streamlit app
+ st.title("Multi-label Classification App")
+
+ # Text input from the user
+ user_input = st.text_area("Enter text here", "Type something...")
+
+ if st.button("Predict"):
+     # Show the prediction results
+     sas_probs, sds_probs = predict(user_input)
+     st.write("SAS_Class probabilities:", sas_probs.numpy())
+     st.write("SDS_Class probabilities:", sds_probs.numpy())
dataset/CustomDataset.py DELETED
@@ -1,40 +0,0 @@
- import torch
- from torch.utils.data import Dataset
-
- n_classes = 2
-
- class CustomDataset(Dataset):
-     def __init__(self, data, tokenizer, max_length=512):
-         self.data = data
-         self.tokenizer = tokenizer
-         self.max_length = max_length
-
-     def __len__(self):
-         return len(self.data)
-
-     def __getitem__(self, idx):
-         item = self.data[idx]
-         # Assume each data item is a dict containing "text" and "label"
-         text = item['Description']
-         label = item['label']
-
-         # Encode the text
-         encoded = self.tokenizer.encode_plus(
-             text,
-             add_special_tokens=True,
-             max_length=self.max_length,
-             padding='max_length',
-             truncation=True,
-             return_attention_mask=True,
-             return_tensors='pt',
-         )
-         # Add any required data checks here,
-         # e.g. verify that the label is within the expected range
-         if label < 0 or label > n_classes:  # n_classes is the number of labels
-             raise ValueError("Found an invalid label")
-
-         return {
-             'input_ids': encoded['input_ids'].flatten(),
-             'attention_mask': encoded['attention_mask'].flatten(),
-             'labels': torch.tensor(label, dtype=torch.long)
-         }
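The deleted CustomDataset returned a single integer label per sample, while the scripts below train with BCEWithLogitsLoss over 8 outputs, which expects a float multi-hot target with the same shape as the logits. The replacement, prepare_dataset in dataset/load_dataset.py, is not part of this diff; the following is only a sketch, under that assumption, of a dataset whose items would match both the new loss and the tuple unpacking used in train.py (the MultiLabelDataset name and the 'labels' field are hypothetical):

import torch
from torch.utils.data import Dataset

class MultiLabelDataset(Dataset):
    """Hypothetical sketch: yields 8-dim float targets compatible with BCEWithLogitsLoss."""
    def __init__(self, data, tokenizer, num_labels=8, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.num_labels = num_labels
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        encoded = self.tokenizer(
            item['Description'],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # Multi-hot target: one float per output logit (field name 'labels' is an assumption)
        labels = torch.tensor(item['labels'], dtype=torch.float)
        # Returning a tuple matches "b_input_ids, b_input_mask, b_labels = batch" in train.py
        return (
            encoded['input_ids'].squeeze(0),
            encoded['attention_mask'].squeeze(0),
            labels,
        )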
new.py DELETED
@@ -1,108 +0,0 @@
- from transformers import AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForSequenceClassification
- from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
- import torch
- from sklearn.model_selection import train_test_split
- from dataset.load_dataset import df, prepare_dataset
- from torch.nn import BCEWithLogitsLoss
- from transformers import BertForSequenceClassification, BertConfig
- from tqdm.auto import tqdm
- from torch.cuda.amp import GradScaler, autocast
-
-
- epochs = 10
- tokenizer = AutoTokenizer.from_pretrained(
-     "pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition")  # tokenizer that converts text into the input format the model expects
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU if one is available, otherwise the CPU
-
- # Split the dataset
- train_df, val_df = train_test_split(df, test_size=0.1)  # 90% training, 10% validation
-
- # Prepare the training and validation datasets
- train_dataset = prepare_dataset(train_df, tokenizer)
- val_dataset = prepare_dataset(val_df, tokenizer)
- # train_dataloader and validation_dataloader are now ready for training and validation
- train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=16)
- validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=16)
-
-
- # Load the configuration
- config = BertConfig.from_pretrained("pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition")
- config.num_labels = 8  # adjust to your number of labels
-
- model = AutoModelForSequenceClassification.from_pretrained(
-     "pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition", config=config, ignore_mismatched_sizes=True).to(device)
- # Set up the optimizer and the learning rate scheduler
- optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
- total_steps = len(train_dataloader) * epochs  # epochs is the number of training epochs
- scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
- loss_fn = BCEWithLogitsLoss()
- # Fine-tune the model
- scaler = GradScaler()
-
- for epoch in range(epochs):  # iterate over the epochs
-     print(f"\nEpoch {epoch + 1}/{epochs}")
-     print('-------------------------------')
-     model.train()
-     total_loss = 0
-     train_progress_bar = tqdm(train_dataloader, desc="Training", leave=False)
-     for step, batch in enumerate(train_progress_bar):
-         # Move the batch to the GPU
-         batch = tuple(t.to(device) for t in batch)
-         b_input_ids, b_input_mask, b_labels = batch
-         model.zero_grad()
-         # Forward pass
-         outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
-         logits = outputs.logits
-         # Backward pass
-         loss = loss_fn(logits, b_labels)
-         total_loss += loss.item()
-         # loss.backward()
-         # optimizer.step()
-         # scheduler.step()
-         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
-         # Check whether the loss is NaN
-         if torch.isnan(loss).any():
-             print(f"Loss is nan in epoch {epoch + 1}, step {step}.")
-             # Optional: print more information about the problematic data or take other measures
-             # Note: skipping does not fix the root cause; better to investigate why the loss is NaN
-             continue  # skip the backward pass and optimizer step for this batch
-         scaler.scale(loss).backward()
-         scaler.step(optimizer)
-         scaler.update()
-         train_progress_bar.set_postfix({'loss': f"{loss.item():.2f}"})
-
-     # Evaluation phase
-     avg_train_loss = total_loss / len(train_dataloader)
-     print(f"Training loss: {avg_train_loss:.2f}")
-
-     # Validation phase
-     model.eval()
-     total_eval_accuracy = 0
-     eval_progress_bar = tqdm(validation_dataloader, desc="Validation", leave=False)
-
-     for batch in eval_progress_bar:
-         batch = tuple(t.to(device) for t in batch)
-         b_input_ids, b_input_mask, b_labels = batch
-         with torch.no_grad():
-             outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
-
-         logits = outputs.logits
-         # predictions = torch.argmax(logits, dim=1).flatten()
-         # labels = b_labels.flatten()
-         # accuracy = (predictions == labels).cpu().numpy().mean()
-         # Convert the logits to probabilities with the sigmoid function
-         probs = torch.sigmoid(logits)
-         # Predict positive (1) for probabilities above 0.5, negative (0) otherwise
-         predictions = (probs > 0.5).int()
-
-         # Compare the predictions with the true labels
-         correct_predictions = (predictions == b_labels.int()).float()  # make sure the labels are integers too
-         # Average the correct predictions per sample, then over the whole batch
-         accuracy_per_sample = correct_predictions.mean(dim=1)
-         accuracy = accuracy_per_sample.mean().item()
-         total_eval_accuracy += accuracy
-         # Update the progress bar
-         eval_progress_bar.set_postfix({'accuracy': f"{accuracy:.2f}"})
-
-     avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
-     print(f"Validation Accuracy: {avg_val_accuracy:.2f}")
test.py DELETED
@@ -1,18 +0,0 @@
- # Load model directly
- import ast
-
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
- import torch
- # assets_path = cached_assets_path(library_name="datasets", namespace="SQuAD", subfolder="download")
- # something_path = assets_path / "config.json"  # Do anything you like in your assets folder!
-
- tokenizer = AutoTokenizer.from_pretrained("pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition")
- model = AutoModelForSequenceClassification.from_pretrained("pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition")
- input = tokenizer("I love using transformers for natural language processing.", return_tensors="pt")
-
- # Run the model to get a prediction
- with torch.no_grad():
-     logits = model(**input).logits
- # Parse the prediction
- predicted_class_id = logits.argmax().item()
- print(f"Predicted class id: {predicted_class_id}")
test2.py DELETED
@@ -1,23 +0,0 @@
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
- import torch
-
- # Specify the pretrained model
- model_name = "bert-base-uncased"
-
- # Load the tokenizer and the model
- tokenizer = AutoTokenizer.from_pretrained(model_name, force_download=True, resume_download=False)
- model = AutoModelForSequenceClassification.from_pretrained(model_name, force_download=True, resume_download=False)
-
- # Text to classify
- text = "I love using transformers for natural language processing."
-
- # Tokenize the text
- inputs = tokenizer(text, return_tensors="pt")
-
- # Run the model to get a prediction
- with torch.no_grad():
-     logits = model(**inputs).logits
-
- # Parse the prediction
- predicted_class_id = logits.argmax().item()
- print(f"Predicted class id: {predicted_class_id}")
train.py CHANGED
@@ -1,36 +1,131 @@
- import sagemaker
- import boto3
- from sagemaker.huggingface import HuggingFace
-
- try:
-     role = sagemaker.get_execution_role()
- except ValueError:
-     iam = boto3.client('iam')
-     role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']
-
- hyperparameters = {
-     'model_name_or_path': 'emilyalsentzer/Bio_ClinicalBERT',
-     'output_dir': '/opt/ml/model'
-     # add your remaining hyperparameters
-     # more info here https://github.com/huggingface/transformers/tree/v4.37.0/examples/pytorch/text-classification
- }
-
- # git configuration to download our fine-tuning script
- git_config = {'repo': 'https://github.com/huggingface/transformers.git', 'branch': 'v4.37.0'}
-
- # creates Hugging Face estimator
- huggingface_estimator = HuggingFace(
-     entry_point='run_glue.py',
-     source_dir='./examples/pytorch/text-classification',
-     instance_type='ml.p3.2xlarge',
-     instance_count=1,
-     role=role,
-     git_config=git_config,
-     transformers_version='4.37.0',
-     pytorch_version='2.1.0',
-     py_version='py310',
-     hyperparameters=hyperparameters
- )
-
- # starting the train job
- huggingface_estimator.fit()
+ from transformers import AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForSequenceClassification
+ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
+ import torch
+ from sklearn.model_selection import train_test_split
+ from dataset.load_dataset import df, prepare_dataset
+ from torch.nn import BCEWithLogitsLoss
+ from transformers import BertForSequenceClassification, BertConfig
+ from tqdm.auto import tqdm
+ from torch.cuda.amp import GradScaler, autocast
+ from torch.utils.tensorboard import SummaryWriter
+ import datetime
+
+ # Initialize the TensorBoard SummaryWriter
+ current_time = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
+ log_dir = f'runs/train_{current_time}'
+ writer = SummaryWriter(log_dir)
+ epochs = 10
+ lr = 1e-5
+ optimizer_name = 'AdamW'
+ loss_fn_name = 'BCEWithLogitsLoss'
+ batch_size = 16
+
+ # Build the model save path, including the important hyperparameters
+ model_save_name = f'model_{current_time}_lr{lr}_opt{optimizer_name}_loss{loss_fn_name}_batch{batch_size}_epoch{epochs}.pt'
+ model_save_path = f'./saved_models/{model_save_name}'
+
+ tokenizer = AutoTokenizer.from_pretrained(
+     "pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition")  # tokenizer that converts text into the input format the model expects
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU if one is available, otherwise the CPU
+
+ # Split the dataset
+ train_df, val_df = train_test_split(df, test_size=0.1)  # 90% training, 10% validation
+
+ # Prepare the training and validation datasets
+ train_dataset = prepare_dataset(train_df, tokenizer)
+ val_dataset = prepare_dataset(val_df, tokenizer)
+ # train_dataloader and validation_dataloader are now ready for training and validation
+ train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
+ validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)
+
+ # Load the configuration
+ config = BertConfig.from_pretrained("pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition")
+ config.num_labels = 8  # adjust to your number of labels
+
+ model = AutoModelForSequenceClassification.from_pretrained(
+     "pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition", config=config, ignore_mismatched_sizes=True).to(device)
+ # Set up the optimizer and the learning rate scheduler
+ optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
+ total_steps = len(train_dataloader) * epochs  # epochs is the number of training epochs
+ scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
+ loss_fn = BCEWithLogitsLoss()
+ # Fine-tune the model
+ scaler = GradScaler()
+
+ for epoch in range(epochs):  # iterate over the epochs
+     print(f"\nEpoch {epoch + 1}/{epochs}")
+     print('-------------------------------')
+     model.train()
+     total_loss = 0
+     train_progress_bar = tqdm(train_dataloader, desc="Training", leave=False)
+     for step, batch in enumerate(train_progress_bar):
+         # Move the batch to the GPU
+         batch = tuple(t.to(device) for t in batch)
+         b_input_ids, b_input_mask, b_labels = batch
+         model.zero_grad()
+         # Forward pass
+         outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
+         logits = outputs.logits
+         # Backward pass
+         loss = loss_fn(logits, b_labels)
+         total_loss += loss.item()
+         # loss.backward()
+         # optimizer.step()
+         # scheduler.step()
+         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+         # Check whether the loss is NaN
+         if torch.isnan(loss).any():
+             print(f"Loss is nan in epoch {epoch + 1}, step {step}.")
+             # Optional: print more information about the problematic data or take other measures
+             # Note: skipping does not fix the root cause; better to investigate why the loss is NaN
+             continue  # skip the backward pass and optimizer step for this batch
+         scaler.scale(loss).backward()
+         scaler.step(optimizer)
+         scaler.update()
+         train_progress_bar.set_postfix({'loss': f"{loss.item():.2f}"})
+         # Log the training loss to TensorBoard
+         writer.add_scalar('Loss/train', loss.item(), epoch * len(train_dataloader) + step)
+
+     # Evaluation phase
+     avg_train_loss = total_loss / len(train_dataloader)
+     print(f"Training loss: {avg_train_loss:.2f}")
+
+     # Validation phase
+     model.eval()
+     total_eval_accuracy = 0
+     eval_progress_bar = tqdm(validation_dataloader, desc="Validation", leave=False)
+     total_eval_loss = 0  # total validation loss
+
+     for batch in eval_progress_bar:
+         batch = tuple(t.to(device) for t in batch)
+         b_input_ids, b_input_mask, b_labels = batch
+         with torch.no_grad():
+             outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
+             logits = outputs.logits
+             # Note: if the loss function and the model output have different shape requirements,
+             # the loss computation below may need to be adjusted
+             loss = loss_fn(logits, b_labels)
+             total_eval_loss += loss.item()  # accumulate the batch loss into the total
+         # Convert the logits to probabilities with the sigmoid function
+         probs = torch.sigmoid(logits)
+         # Predict positive (1) for probabilities above 0.5, negative (0) otherwise
+         predictions = (probs > 0.5).int()
+         # Compare the predictions with the true labels
+         correct_predictions = (predictions == b_labels.int()).float()  # make sure the labels are integers too
+         # Average the correct predictions per sample, then over the whole batch
+         accuracy_per_sample = correct_predictions.mean(dim=1)
+         accuracy = accuracy_per_sample.mean().item()
+         total_eval_accuracy += accuracy
+         # Update the progress bar
+         eval_progress_bar.set_postfix({'accuracy': f"{accuracy:.2f}"})
+     # Compute the average loss over the whole validation set
+     avg_val_loss = total_eval_loss / len(validation_dataloader)
+     print(f"Validation Loss: {avg_val_loss:.2f}")
+     avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
+     writer.add_scalar('Loss/val', avg_val_loss, epoch)  # log the validation loss to TensorBoard
+     print(f"Validation Accuracy: {avg_val_accuracy:.2f}")
+
+ writer.close()
+ # Save the model
+ torch.save(model.state_dict(), model_save_path)
+ print(f"training end, save model to: {model_save_path}")