import json
import os
import shutil
import subprocess
import tempfile
import unittest

import numpy as np
import torch
import transformers
from datasets import load_dataset
from transformers import AutoFeatureExtractor, AutoModelForImageClassification, Trainer, TrainingArguments, pipeline

from evaluate import evaluator, load

from .utils import slow


class TestEvaluatorTrainerParity(unittest.TestCase):
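    """Parity tests: `evaluator` from `evaluate` should reproduce the metrics that the
    corresponding `transformers` example scripts / `Trainer.evaluate` report for the
    same model, data, and metric."""
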
    def setUp(self):
        self.dir_path = tempfile.mkdtemp("evaluator_trainer_parity_test")

        # Shallow, sparse clone of the transformers repo, pinned to the installed
        # version's tag unless a .dev0 (main-branch) build is installed.
        transformers_version = transformers.__version__
        branch = ""
        if not transformers_version.endswith(".dev0"):
            branch = f"--branch v{transformers_version}"
        subprocess.run(
            f"git clone --depth 3 --filter=blob:none --sparse {branch} https://github.com/huggingface/transformers",
            shell=True,
            cwd=self.dir_path,
        )

    def tearDown(self):
        shutil.rmtree(self.dir_path, ignore_errors=True)

    def test_text_classification_parity(self):
        model_name = "philschmid/tiny-bert-sst2-distilled"

        subprocess.run(
            "git sparse-checkout set examples/pytorch/text-classification",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )
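
        # Reference run: the transformers GLUE example script evaluates the model
        # on SST-2 and writes its metrics to eval_results.json.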
        subprocess.run(
            f"python examples/pytorch/text-classification/run_glue.py"
            f" --model_name_or_path {model_name}"
            f" --task_name sst2"
            f" --do_eval"
            f" --max_seq_length 9999999999"  # rely on tokenizer.model_max_length for max_length
            f" --output_dir {os.path.join(self.dir_path, 'textclassification_sst2_transformers')}"
            f" --max_eval_samples 80",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            f"{os.path.join(self.dir_path, 'textclassification_sst2_transformers', 'eval_results.json')}", "r"
        ) as f:
            transformers_results = json.load(f)
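
        # Evaluator run: the same pipeline on the same 80 validation examples, scored
        # with the "accuracy" metric, should yield an identical number.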
        eval_dataset = load_dataset("glue", "sst2", split="validation[:80]")
        pipe = pipeline(task="text-classification", model=model_name, tokenizer=model_name)

        task_evaluator = evaluator(task="text-classification")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="accuracy",
            input_column="sentence",
            label_column="label",
            label_mapping={"negative": 0, "positive": 1},
            strategy="simple",
        )

        self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])

    @slow
    def test_text_classification_parity_two_columns(self):
        model_name = "prajjwal1/bert-tiny-mnli"
        max_eval_samples = 150

        subprocess.run(
            "git sparse-checkout set examples/pytorch/text-classification",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )
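
        # Reference run: MNLI evaluation via the GLUE example script (a sentence-pair task).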
        subprocess.run(
            f"python examples/pytorch/text-classification/run_glue.py"
            f" --model_name_or_path {model_name}"
            f" --task_name mnli"
            f" --do_eval"
            f" --max_seq_length 256"
            f" --output_dir {os.path.join(self.dir_path, 'textclassification_mnli_transformers')}"
            f" --max_eval_samples {max_eval_samples}",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            f"{os.path.join(self.dir_path, 'textclassification_mnli_transformers', 'eval_results.json')}", "r"
        ) as f:
            transformers_results = json.load(f)
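
        # Evaluator run: MNLI pairs are passed as two input columns (premise + hypothesis),
        # and the pipeline's LABEL_{0,1,2} outputs are mapped back to the dataset's label ids.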
        eval_dataset = load_dataset("glue", "mnli", split=f"validation_matched[:{max_eval_samples}]")
        pipe = pipeline(task="text-classification", model=model_name, tokenizer=model_name, max_length=256)

        task_evaluator = evaluator(task="text-classification")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="accuracy",
            input_column="premise",
            second_input_column="hypothesis",
            label_column="label",
            label_mapping={"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2},
        )

        self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])

    def test_image_classification_parity(self):
        # We cannot compare against the PyTorch transformers example script here, as it
        # applies custom preprocessing to the images, so the reference metrics are
        # produced with a Trainer directly.
        model_name = "douwekiela/resnet-18-finetuned-dogfood"
        dataset_name = "beans"
        max_eval_samples = 120

        raw_dataset = load_dataset(dataset_name, split="validation")
        eval_dataset = raw_dataset.select(range(max_eval_samples))

        feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
        model = AutoModelForImageClassification.from_pretrained(model_name)

        def collate_fn(examples):
            pixel_values = torch.stack(
                [torch.tensor(feature_extractor(example["image"])["pixel_values"][0]) for example in examples]
            )
            labels = torch.tensor([example["labels"] for example in examples])
            return {"pixel_values": pixel_values, "labels": labels}

        metric = load("accuracy")
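
        # Reference run: Trainer.evaluate with a collator that applies the feature
        # extractor; save_metrics writes the results to eval_results.json.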
        trainer = Trainer(
            model=model,
            args=TrainingArguments(
                output_dir=os.path.join(self.dir_path, "imageclassification_beans_transformers"),
                remove_unused_columns=False,
            ),
            train_dataset=None,
            eval_dataset=eval_dataset,
            compute_metrics=lambda p: metric.compute(
                predictions=np.argmax(p.predictions, axis=1), references=p.label_ids
            ),
            tokenizer=None,
            data_collator=collate_fn,
        )

        metrics = trainer.evaluate()
        trainer.save_metrics("eval", metrics)

        with open(
            f"{os.path.join(self.dir_path, 'imageclassification_beans_transformers', 'eval_results.json')}", "r"
        ) as f:
            transformers_results = json.load(f)
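
        # Evaluator run: an image-classification pipeline on the same 120 examples,
        # mapping the pipeline's string labels back to ids via model.config.label2id.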
        pipe = pipeline(task="image-classification", model=model_name, feature_extractor=model_name)

        task_evaluator = evaluator(task="image-classification")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="accuracy",
            input_column="image",
            label_column="labels",
            label_mapping=model.config.label2id,
            strategy="simple",
        )

        self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])

    def test_question_answering_parity(self):
        model_name_v1 = "anas-awadalla/bert-tiny-finetuned-squad"
        model_name_v2 = "mrm8488/bert-tiny-finetuned-squadv2"

        subprocess.run(
            "git sparse-checkout set examples/pytorch/question-answering",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        # test squad_v1-like dataset
        subprocess.run(
            f"python examples/pytorch/question-answering/run_qa.py"
            f" --model_name_or_path {model_name_v1}"
            f" --dataset_name squad"
            f" --do_eval"
            f" --output_dir {os.path.join(self.dir_path, 'questionanswering_squad_transformers')}"
            f" --max_eval_samples 100"
            f" --max_seq_length 384",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            f"{os.path.join(self.dir_path, 'questionanswering_squad_transformers', 'eval_results.json')}", "r"
        ) as f:
            transformers_results = json.load(f)
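
        # Evaluator run for SQuAD v1: no column names are passed, so the evaluator's
        # SQuAD-style defaults apply; the "squad" metric reports exact_match and f1.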
        eval_dataset = load_dataset("squad", split="validation[:100]")
        pipe = pipeline(
            task="question-answering",
            model=model_name_v1,
            tokenizer=model_name_v1,
            max_answer_len=30,
            padding="max_length",
        )
        task_evaluator = evaluator(task="question-answering")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="squad",
            strategy="simple",
        )

        self.assertEqual(transformers_results["eval_f1"], evaluator_results["f1"])
        self.assertEqual(transformers_results["eval_exact_match"], evaluator_results["exact_match"])

        # test squad_v2-like dataset
        subprocess.run(
            f"python examples/pytorch/question-answering/run_qa.py"
            f" --model_name_or_path {model_name_v2}"
            f" --dataset_name squad_v2"
            f" --version_2_with_negative"
            f" --do_eval"
            f" --output_dir {os.path.join(self.dir_path, 'questionanswering_squadv2_transformers')}"
            f" --max_eval_samples 100"
            f" --max_seq_length 384",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            f"{os.path.join(self.dir_path, 'questionanswering_squadv2_transformers', 'eval_results.json')}", "r"
        ) as f:
            transformers_results = json.load(f)
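
        # Evaluator run for SQuAD v2: squad_v2_format=True lets the evaluator score
        # unanswerable questions, matching --version_2_with_negative above.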
        eval_dataset = load_dataset("squad_v2", split="validation[:100]")
        pipe = pipeline(
            task="question-answering",
            model=model_name_v2,
            tokenizer=model_name_v2,
            max_answer_len=30,
        )
        task_evaluator = evaluator(task="question-answering")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="squad_v2",
            strategy="simple",
            squad_v2_format=True,
        )

        self.assertEqual(transformers_results["eval_f1"], evaluator_results["f1"])
        self.assertEqual(transformers_results["eval_HasAns_f1"], evaluator_results["HasAns_f1"])
        self.assertEqual(transformers_results["eval_NoAns_f1"], evaluator_results["NoAns_f1"])

    def test_token_classification_parity(self):
        model_name = "hf-internal-testing/tiny-bert-for-token-classification"
        n_samples = 500

        subprocess.run(
            "git sparse-checkout set examples/pytorch/token-classification",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )
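
        # Reference run: the transformers NER example evaluates the model on CoNLL-2003.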
        subprocess.run(
            f"python examples/pytorch/token-classification/run_ner.py"
            f" --model_name_or_path {model_name}"
            f" --dataset_name conll2003"
            f" --do_eval"
            f" --output_dir {os.path.join(self.dir_path, 'tokenclassification_conll2003_transformers')}"
            f" --max_eval_samples {n_samples}",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            os.path.join(self.dir_path, "tokenclassification_conll2003_transformers", "eval_results.json"), "r"
        ) as f:
            transformers_results = json.load(f)
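
        # Evaluator run: seqeval on the pre-tokenized "tokens" column with "ner_tags"
        # labels; overall accuracy and F1 should match the example script's metrics.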
        eval_dataset = load_dataset("conll2003", split=f"validation[:{n_samples}]")
        pipe = pipeline(task="token-classification", model=model_name)

        task_evaluator = evaluator(task="token-classification")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="seqeval",
            input_column="tokens",
            label_column="ner_tags",
            strategy="simple",
        )

        self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["overall_accuracy"])
        self.assertEqual(transformers_results["eval_f1"], evaluator_results["overall_f1"])