import itertools
import json
import re

import evaluate
import spacy
import torch
from transformers import AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer

from celebbot import CelebBot
from utils import *  # provides he_regex, his_regex, she_regex, her_regex

QA_MODEL_ID = "google/flan-t5-xl"
SENTTR_MODEL_ID = "sentence-transformers/all-mpnet-base-v2"

celeb_names = ["Cate Blanchett", "David Beckham", "Emma Watson", "Lady Gaga", "Madonna", "Mark Zuckerberg"]


def evaluate_system():
    device = "cpu"

    # Load the per-celebrity questions, reference answers, and knowledge text.
    with open("data.json", encoding="utf-8") as json_file:
        celeb_data = json.load(json_file)

    # Collect the reference answers in the same celebrity order used for the
    # predictions below, then flatten them into one list.
    references = [celeb_data[name]["answers"] for name in celeb_names]
    references = list(itertools.chain.from_iterable(references))
    predictions = []

    QA_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_ID)
    QA_model = AutoModelForSeq2SeqLM.from_pretrained(QA_MODEL_ID).to(device)
    sentTr_tokenizer = AutoTokenizer.from_pretrained(SENTTR_MODEL_ID)
    sentTr_model = AutoModel.from_pretrained(SENTTR_MODEL_ID).to(device)
    spacy_model = spacy.load("en_core_web_sm")

    for name in celeb_names:
        gender = celeb_data[name]["gender"]
        knowledge = celeb_data[name]["knowledge"]

        # Regexes matching the celebrity's full name and last name,
        # with and without a possessive suffix.
        lname = name.split(" ")[-1]
        lname_regex = re.compile(rf"\b({lname})\b")
        name_regex = re.compile(rf"\b({name})\b")
        lnames = lname + "’s" if not lname.endswith("s") else lname + "’"
        lnames_regex = re.compile(rf"\b({lnames})\b")
        names = name + "’s" if not name.endswith("s") else name + "’"
        names_regex = re.compile(rf"\b({names})\b")

        # Rewrite the third-person knowledge text into first person.
        if gender == "M":
            knowledge = re.sub(he_regex, "I", knowledge)
            knowledge = re.sub(his_regex, "my", knowledge)
        elif gender == "F":
            knowledge = re.sub(she_regex, "I", knowledge)
            knowledge = re.sub(her_regex, "my", knowledge)
        knowledge = re.sub(names_regex, "my", knowledge)
        knowledge = re.sub(lnames_regex, "my", knowledge)
        knowledge = re.sub(name_regex, "I", knowledge)
        knowledge = re.sub(lname_regex, "I", knowledge)

        # Split the rewritten knowledge into sentences for retrieval.
        knowledge_sents = [i.text.strip() for i in spacy_model(knowledge).sents]

        ai = CelebBot(name, QA_tokenizer, QA_model, sentTr_tokenizer, sentTr_model, spacy_model, knowledge_sents)

        # If the current query mentions the celebrity (or addresses them in the
        # second person), answer in persona and retrieve the relevant knowledge
        # assertions; otherwise use the generic instruction and the full knowledge.
        if re.search(re.compile(rf"\b(you|your|{ai.name})\b", flags=re.IGNORECASE), ai.text) is not None:
            instruction1 = (
                f"You are a celebrity named {ai.name}. Always answer as helpfully as possible, while being safe. "
                "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or "
                "illegal content. Please ensure that your responses are socially unbiased and positive in nature. "
                "If a question does not make any sense, or is not factually coherent, explain why instead of "
                "answering something not correct. If you don't know the answer to a question, please don't share "
                "false information."
            )
            knowledge = ai.retrieve_knowledge_assertions()
        else:
            instruction1 = (
                "Always answer as helpfully as possible, while being safe. Your answers should not include any "
                "harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your "
                "responses are socially unbiased and positive in nature. If a question does not make any sense, or "
                "is not factually coherent, explain why instead of answering something not correct. If you don't "
                "know the answer to a question, please don't share false information."
            )
queries = [f"Context: {instruction1} {knowledge}\n\nQuestion: {q}\n\nAnswer:" for q in celeb_data[name]["questions"]] input_ids = ai.QA_tokenizer(f"{queries}", return_tensors="pt").input_ids.to(device) outputs = ai.QA_model.generate(input_ids, max_length=1024) predictions+= ai.QA_tokenizer.batch_decode(outputs, skip_special_tokens=True) file = open('predictions.txt','w') for prediction in predictions: file.write(prediction+"\n") file.close() bleu = evaluate.load("bleu") results = bleu.compute(predictions=predictions, references=references, max_order=4) print(f"BLEU: {round(results['bleu'], 2)}") meteor = evaluate.load("meteor") results = meteor.compute(predictions=predictions, references=references) print(f"METEOR: {round(results['meteor'], 2)}") rouge = evaluate.load("rouge") results = rouge.compute(predictions=predictions, references=references) print(f"ROUGE: {round(results['rougeL'], 2)}") bertscore = evaluate.load("bertscore") results = bertscore.compute(predictions=predictions, references=references, rescale_with_baseline=True, lang="en") print(f"F1: {round(sum(results['f1'])/len(results['f1']), 2)}") if __name__ == "__main__": evaluate_system()