# %%
import torch

# Check that a GPU is available
if torch.cuda.is_available():
    print('gpu is available')
else:
    raise Exception('gpu is NOT available')

# %%
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import pandas as pd
import torch
import random

# %%
from pprint import pprint
from datasets import load_dataset

# Load the data from the llm-book/wrime-sentiment repository
# on the Hugging Face Hub
train_dataset = load_dataset(
    "llm-book/wrime-sentiment", split="train", remove_neutral=False
)
valid_dataset = load_dataset(
    "llm-book/wrime-sentiment", split="validation", remove_neutral=False
)
# Display the datasets in a readable form with pprint
pprint(train_dataset)
pprint(valid_dataset)

# %%
# Load the tokenizer
model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# %%
# Tokenization
def preprocess_text(batch):
    # truncation=True is required for max_length to take effect
    encoded_batch = tokenizer(batch['sentence'], max_length=512, truncation=True)
    encoded_batch['labels'] = batch['label']
    return encoded_batch

encoded_train_dataset = train_dataset.map(
    preprocess_text,
    remove_columns=train_dataset.column_names,
)
encoded_valid_dataset = valid_dataset.map(
    preprocess_text,
    remove_columns=valid_dataset.column_names,
)

# Mini-batch construction: pad each batch dynamically to its longest sequence
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# %%
# Prepare the model
from transformers import AutoModelForSequenceClassification

class_label = train_dataset.features["label"]
label2id = {label: id for id, label in enumerate(class_label.names)}
id2label = {id: label for id, label in enumerate(class_label.names)}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=class_label.num_classes,
    label2id=label2id,  # mapping from label name to ID
    id2label=id2label,  # mapping from ID to label name
)
print(type(model).__name__)

# %%
# Prepare the training configuration
from transformers import TrainingArguments

# Output directory
save_dir = 'bert-finetuned-wrime-base'
training_args = TrainingArguments(
    output_dir=save_dir,               # folder for results and checkpoints
    per_device_train_batch_size=32,    # training batch size
    per_device_eval_batch_size=32,     # evaluation batch size
    learning_rate=2e-5,                # learning rate
    lr_scheduler_type="constant",      # learning-rate scheduler type
    warmup_ratio=0.1,                  # warmup length (ignored by the "constant" scheduler)
    num_train_epochs=100,              # max epochs; early stopping ends training sooner
    save_strategy="epoch",             # when to save checkpoints
    logging_strategy="epoch",          # when to log
    evaluation_strategy="epoch",       # when to evaluate on the validation set
    load_best_model_at_end=True,       # reload the best validation-set model after training
    metric_for_best_model="accuracy",  # metric used to pick the best model
    fp16=True,                         # enable automatic mixed-precision training
)

# %%
# Define the metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

# %%
# Run the training
from transformers import Trainer
from transformers import EarlyStoppingCallback

trainer = Trainer(
    model=model,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_valid_dataset,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()

# %%
# Save the model and tokenizer
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

# Save the training history
import os

os.makedirs('base_line', exist_ok=True)  # to_csv fails if the directory is missing
history_df = pd.DataFrame(trainer.state.log_history)
history_df.to_csv('base_line/wrime_baseline_history.csv')
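# %%
# Added: report the final validation metrics of the best checkpoint.
# A minimal sketch; it assumes `trainer` still holds the best model,
# which load_best_model_at_end=True above restores after training.
eval_metrics = trainer.evaluate(encoded_valid_dataset)
pprint(eval_metrics)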
# %%
import matplotlib.pyplot as plt

def show_graph(df, suptitle, output='output.png'):
    suptitle_size = 23
    graph_title_size = 20
    legend_size = 18
    ticks_size = 13

    # Learning curves
    plt.figure(figsize=(20, 5))
    plt.suptitle(suptitle, fontsize=suptitle_size)

    # Train loss
    plt.subplot(131)
    plt.title('Train Loss', fontsize=graph_title_size)
    plt.plot(df['loss'].dropna(), label='train')
    plt.legend(fontsize=legend_size)
    plt.yticks(fontsize=ticks_size)

    # Validation loss
    plt.subplot(132)
    plt.title('Val Loss', fontsize=graph_title_size)
    y = df['eval_loss'].dropna().values
    plt.plot(y, color='tab:orange', label='val')
    plt.legend(fontsize=legend_size)
    plt.yticks(fontsize=ticks_size)

    # Validation accuracy / F1
    plt.subplot(133)
    plt.title('eval Accuracy/F1', fontsize=graph_title_size)
    plt.plot(df['eval_accuracy'].dropna(), label='accuracy')
    plt.plot(df['eval_f1'].dropna(), label='F1')
    plt.legend(fontsize=legend_size)
    plt.yticks(fontsize=ticks_size)

    plt.tight_layout()
    plt.savefig(output)

# %%
# Plot the results
suptitle = 'batch:32, lr:2e-5, type:constant'
show_graph(history_df, suptitle, 'base_line/wrime_baseline_output.png')
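# %%
# Added: sanity-check inference with the saved model. A minimal sketch,
# assuming the fine-tuned model and tokenizer were written to `save_dir`
# above; the example sentence is illustrative only.
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model=save_dir,      # directory written by trainer.save_model()
    tokenizer=save_dir,  # tokenizer saved alongside the model
    device=0,            # run on the GPU checked at the top of the script
)
print(classifier("今日はとても楽しい一日だった。"))  # expect a positive label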