"""Evaluate a fine-tuned student model with BLEU/ROUGE and log results to wandb."""

import yaml
import wandb
# NOTE(review): `datasets.load_metric` is deprecated (removed in datasets>=3.0);
# the `evaluate` package is the replacement — confirm before upgrading datasets.
from datasets import load_metric
from transformers import pipeline

# Load the project configuration (wandb project/entity come from here).
with open('config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Fine-tuned student model and its tokenizer.
model_name = "results_student"  # replace with the path to the student model
tokenizer_name = "distilbert-base-uncased"

# Evaluation metrics.
bleu = load_metric("bleu")
rouge = load_metric("rouge")

# Initialize wandb (module-level, as the script logs from evaluate_model).
wandb.init(project=config['wandb']['project'], entity=config['wandb']['entity'])


def evaluate_model(model_name, tokenizer_name):
    """Score candidate texts against references with BLEU/ROUGE and log to wandb.

    Args:
        model_name: path or hub id of the fine-tuned (student) model.
        tokenizer_name: path or hub id of the tokenizer to pair with it.

    Side effects:
        Logs ``bleu_score``, ``rouge_score`` and ``predicted_labels`` to the
        active wandb run. Requires ``wandb.init`` to have been called.
    """
    nlp = pipeline("text-classification", model=model_name, tokenizer=tokenizer_name)

    # Simulated examples for the evaluation.
    examples = [
        {"reference": "This is a great movie.", "candidate": "This is a fantastic movie."},
        {"reference": "I love this film.", "candidate": "I enjoy this movie."},
    ]

    references = [e["reference"] for e in examples]
    candidates = [e["candidate"] for e in examples]

    # Classifier outputs are labels (e.g. "LABEL_0"), not text. The original
    # fed these labels to BLEU/ROUGE, which is semantically meaningless; keep
    # them as a separate logged quantity instead.
    predicted_labels = [nlp(c)[0]["label"] for c in candidates]

    # BUG FIX: the BLEU metric expects tokenized predictions and, for each
    # prediction, a *list* of tokenized references — passing raw strings
    # raises inside compute(). Tokenize with a simple whitespace split.
    bleu_score = bleu.compute(
        predictions=[c.split() for c in candidates],
        references=[[r.split()] for r in references],
    )
    # ROUGE accepts raw strings directly.
    rouge_score = rouge.compute(predictions=candidates, references=references)

    # Record the scores on wandb.
    wandb.log({
        "bleu_score": bleu_score,
        "rouge_score": rouge_score,
        "predicted_labels": predicted_labels,
    })


# Guard the executable tail so importing this module does not re-run evaluation.
if __name__ == "__main__":
    evaluate_model(model_name, tokenizer_name)
    wandb.finish()  # flush and close the run cleanly