# Text-to-Picto: fine-tune a French masked-LM and evaluate its predicted
# simplifications against reference sentences / pictogram IDs.
#
# NOTE(review): a fully commented-out earlier draft of this script (the
# dbmdz/bert-base-french-wwm-cased BERT version with precision/recall/F1 and
# a confusion-matrix plot) was removed from this header; recover it from
# version control if it is still needed.
import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from transformers import BertForMaskedLM, BertTokenizer
# Step 1: Prepare the dataset.

def read_data(file_path):
    """Return the lines of *file_path* as a list of strings.

    Lines keep their trailing newline, exactly as ``readlines`` produces them.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.readlines()


def read_picto_ids(file_path):
    """Return one list of integer pictogram IDs per line of *file_path*.

    Each line is expected to contain whitespace-separated integers.
    """
    # Reuse read_data() instead of duplicating the file-reading logic.
    return [list(map(int, line.split())) for line in read_data(file_path)]


# Base directories of the two dataset splits (raw strings: Windows paths).
# TODO(review): replace this hard-coded per-user absolute path with a
# configurable data directory (CLI argument or environment variable).
TRAIN_DIR = r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files'
VALID_DIR = r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files'

src_train = read_data(TRAIN_DIR + r'\src_train.txt')              # original sentences (train)
tgt_train = read_data(TRAIN_DIR + r'\tgt_train.txt')              # simplified sentences (train)
picto_train = read_picto_ids(TRAIN_DIR + r'\picto_id_train.txt')  # picto IDs (train)
src_valid = read_data(VALID_DIR + r'\src_valid.txt')              # original sentences (valid)
tgt_valid = read_data(VALID_DIR + r'\tgt_valid.txt')              # simplified sentences (valid)
picto_valid = read_picto_ids(VALID_DIR + r'\picto_id_valid.txt')  # picto IDs (valid)

# src_train, tgt_train and picto_train (and the *_valid counterparts) now hold
# the sentences and picto IDs read from the files.
# Step 2: Fine-tune the BERT model.
# TODO(review): the fine-tuning loop was never written; the evaluation below
# therefore runs on the pretrained weights as-is.


# Step 3: Evaluate the model.
def evaluate_model(model, tokenizer, src_valid, tgt_valid, picto_valid):
    """Greedy-decode every source sentence and bucket the predictions by how
    closely they match the reference (simplified) sentences.

    Parameters
    ----------
    model : a model whose forward pass returns an object with ``.logits``
        (i.e. a ``*ForMaskedLM``-style head, not a bare encoder).
    tokenizer : the matching tokenizer (must provide ``encode``/``decode``).
    src_valid, tgt_valid : parallel lists of source / reference sentences.
    picto_valid : gold pictogram-ID lists, parallel to the sentences. Kept
        for backward compatibility but not used by the scoring — see the
        NOTE below.

    Returns
    -------
    dict
        Counts of sentences per match bucket: ``"100%"`` (exact match),
        ``"70%"``, ``"50%"`` and ``"20%"`` (token-level match ratio).
    """
    predicted_sentences = []
    true_labels = []
    for src_sentence, tgt_sentence in zip(src_valid, tgt_valid):
        # Tokenize and run the model without tracking gradients.
        tokenized_sentence = tokenizer.encode(src_sentence, return_tensors='pt')
        with torch.no_grad():
            outputs = model(tokenized_sentence)
        # Greedy token choice at every position.
        predictions = outputs.logits[0].argmax(dim=-1).cpu().numpy()
        predicted_sentences.append(
            tokenizer.decode(predictions, skip_special_tokens=True))
        true_labels.append(tgt_sentence)

    # BUG FIX: the previous version zipped ``picto_valid`` against
    # ``tgt_valid``, i.e. it compared gold integer IDs with the *characters*
    # of the reference sentence, so the partial buckets could never match.
    # We now compare predicted vs. reference tokens position-wise.
    # NOTE(review): scoring predicted picto IDs properly would require a
    # text->picto mapping that this script does not have — confirm intent.
    accuracies = {"100%": 0, "70%": 0, "50%": 0, "20%": 0}
    for pred, true in zip(predicted_sentences, true_labels):
        if pred == true:
            accuracies["100%"] += 1
            continue
        pred_tokens = pred.split()
        true_tokens = true.split()
        if not pred_tokens or not true_tokens:
            continue  # nothing comparable; leave unbucketed like the original
        match_count = sum(1 for p, t in zip(pred_tokens, true_tokens) if p == t)
        match_percentage = match_count / max(len(pred_tokens), len(true_tokens))
        if match_percentage >= 0.7:
            accuracies["70%"] += 1
        elif match_percentage >= 0.5:
            accuracies["50%"] += 1
        elif match_percentage >= 0.2:
            accuracies["20%"] += 1
    return accuracies


# BUG FIX: the original code loaded ``CamembertModel`` (a bare encoder with
# no LM head, whose outputs have no ``.logits``) into a variable named
# ``camembert`` and then called ``evaluate_model`` with the *undefined* name
# ``model`` (NameError).  Load a masked-LM model under the name actually used.
# You can replace the checkpoint with any other CamemBERT variant,
# e.g. "camembert/camembert-large".
from transformers import CamembertForMaskedLM, CamembertTokenizer

tokenizer = CamembertTokenizer.from_pretrained("camembert/camembert-base-wikipedia-4gb")
model = CamembertForMaskedLM.from_pretrained("camembert/camembert-base-wikipedia-4gb")

accuracies = evaluate_model(model, tokenizer, src_valid, tgt_valid, picto_valid)
print("Accuracies based on picto IDs:")
print(accuracies)