import json
import os
import shutil
import subprocess
import tempfile
import unittest

import numpy as np
import torch
import transformers
from datasets import load_dataset
from transformers import AutoFeatureExtractor, AutoModelForImageClassification, Trainer, TrainingArguments, pipeline

from evaluate import evaluator, load

from .utils import slow


@slow
class TestEvaluatorTrainerParity(unittest.TestCase):
    def setUp(self):
        # clone the transformers repo (sparse, blob-less) so the example scripts can be checked out per test
        self.dir_path = tempfile.mkdtemp("evaluator_trainer_parity_test")

        transformers_version = transformers.__version__
        branch = ""
        if not transformers_version.endswith(".dev0"):
            branch = f"--branch v{transformers_version}"
        subprocess.run(
            f"git clone --depth 3 --filter=blob:none --sparse {branch} https://github.com/huggingface/transformers",
            shell=True,
            cwd=self.dir_path,
        )

    def tearDown(self):
        shutil.rmtree(self.dir_path, ignore_errors=True)
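
    # Run the transformers run_glue.py evaluation on SST-2 and check that the evaluator
    # reports the same accuracy for the same model and validation examples.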
    def test_text_classification_parity(self):
        model_name = "philschmid/tiny-bert-sst2-distilled"

        subprocess.run(
            "git sparse-checkout set examples/pytorch/text-classification",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        subprocess.run(
            f"python examples/pytorch/text-classification/run_glue.py"
            f" --model_name_or_path {model_name}"
            f" --task_name sst2"
            f" --do_eval"
            f" --max_seq_length 9999999999"  # rely on tokenizer.model_max_length for max_length
            f" --output_dir {os.path.join(self.dir_path, 'textclassification_sst2_transformers')}"
            f" --max_eval_samples 80",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            f"{os.path.join(self.dir_path, 'textclassification_sst2_transformers', 'eval_results.json')}", "r"
        ) as f:
            transformers_results = json.load(f)

        eval_dataset = load_dataset("glue", "sst2", split="validation[:80]")

        pipe = pipeline(task="text-classification", model=model_name, tokenizer=model_name)

        task_evaluator = evaluator(task="text-classification")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="accuracy",
            input_column="sentence",
            label_column="label",
            label_mapping={"negative": 0, "positive": 1},
            strategy="simple",
        )

        self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])
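
    # Same parity check on MNLI, where the evaluator must feed two input columns
    # (premise and hypothesis) to the pipeline.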
    def test_text_classification_parity_two_columns(self):
        model_name = "prajjwal1/bert-tiny-mnli"
        max_eval_samples = 150

        subprocess.run(
            "git sparse-checkout set examples/pytorch/text-classification",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        subprocess.run(
            f"python examples/pytorch/text-classification/run_glue.py"
            f" --model_name_or_path {model_name}"
            f" --task_name mnli"
            f" --do_eval"
            f" --max_seq_length 256"
            f" --output_dir {os.path.join(self.dir_path, 'textclassification_mnli_transformers')}"
            f" --max_eval_samples {max_eval_samples}",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            f"{os.path.join(self.dir_path, 'textclassification_mnli_transformers', 'eval_results.json')}", "r"
        ) as f:
            transformers_results = json.load(f)

        eval_dataset = load_dataset("glue", "mnli", split=f"validation_matched[:{max_eval_samples}]")

        pipe = pipeline(task="text-classification", model=model_name, tokenizer=model_name, max_length=256)

        task_evaluator = evaluator(task="text-classification")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="accuracy",
            input_column="premise",
            second_input_column="hypothesis",
            label_column="label",
            label_mapping={"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2},
        )

        self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])
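
    # Image classification parity is checked against a Trainer.evaluate() run rather than an
    # example script.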
    def test_image_classification_parity(self):
        # we cannot compare to the PyTorch transformers example, which uses custom preprocessing on the images
        model_name = "douwekiela/resnet-18-finetuned-dogfood"
        dataset_name = "beans"
        max_eval_samples = 120

        raw_dataset = load_dataset(dataset_name, split="validation")
        eval_dataset = raw_dataset.select(range(max_eval_samples))

        feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
        model = AutoModelForImageClassification.from_pretrained(model_name)

        def collate_fn(examples):
            # batch the raw images into a single pixel_values tensor the model can consume
            pixel_values = torch.stack(
                [torch.tensor(feature_extractor(example["image"])["pixel_values"][0]) for example in examples]
            )
            labels = torch.tensor([example["labels"] for example in examples])
            return {"pixel_values": pixel_values, "labels": labels}

        metric = load("accuracy")
        trainer = Trainer(
            model=model,
            args=TrainingArguments(
                output_dir=os.path.join(self.dir_path, "imageclassification_beans_transformers"),
                remove_unused_columns=False,
            ),
            train_dataset=None,
            eval_dataset=eval_dataset,
            compute_metrics=lambda p: metric.compute(
                predictions=np.argmax(p.predictions, axis=1), references=p.label_ids
            ),
            tokenizer=None,
            data_collator=collate_fn,
        )

        metrics = trainer.evaluate()
        trainer.save_metrics("eval", metrics)

        with open(
            f"{os.path.join(self.dir_path, 'imageclassification_beans_transformers', 'eval_results.json')}", "r"
        ) as f:
            transformers_results = json.load(f)

        pipe = pipeline(task="image-classification", model=model_name, feature_extractor=model_name)

        task_evaluator = evaluator(task="image-classification")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="accuracy",
            input_column="image",
            label_column="labels",
            label_mapping=model.config.label2id,
            strategy="simple",
        )

        self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])
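
    # Question-answering parity against run_qa.py, covering both SQuAD (v1) and SQuAD v2 style
    # datasets (the latter with unanswerable questions).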
    def test_question_answering_parity(self):
        model_name_v1 = "anas-awadalla/bert-tiny-finetuned-squad"
        model_name_v2 = "mrm8488/bert-tiny-finetuned-squadv2"

        subprocess.run(
            "git sparse-checkout set examples/pytorch/question-answering",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        # test squad_v1-like dataset
        subprocess.run(
            f"python examples/pytorch/question-answering/run_qa.py"
            f" --model_name_or_path {model_name_v1}"
            f" --dataset_name squad"
            f" --do_eval"
            f" --output_dir {os.path.join(self.dir_path, 'questionanswering_squad_transformers')}"
            f" --max_eval_samples 100"
            f" --max_seq_length 384",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            f"{os.path.join(self.dir_path, 'questionanswering_squad_transformers', 'eval_results.json')}", "r"
        ) as f:
            transformers_results = json.load(f)

        eval_dataset = load_dataset("squad", split="validation[:100]")

        pipe = pipeline(
            task="question-answering",
            model=model_name_v1,
            tokenizer=model_name_v1,
            max_answer_len=30,
            padding="max_length",
        )

        task_evaluator = evaluator(task="question-answering")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="squad",
            strategy="simple",
        )

        self.assertEqual(transformers_results["eval_f1"], evaluator_results["f1"])
        self.assertEqual(transformers_results["eval_exact_match"], evaluator_results["exact_match"])

        # test squad_v2-like dataset
        subprocess.run(
            f"python examples/pytorch/question-answering/run_qa.py"
            f" --model_name_or_path {model_name_v2}"
            f" --dataset_name squad_v2"
            f" --version_2_with_negative"
            f" --do_eval"
            f" --output_dir {os.path.join(self.dir_path, 'questionanswering_squadv2_transformers')}"
            f" --max_eval_samples 100"
            f" --max_seq_length 384",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            f"{os.path.join(self.dir_path, 'questionanswering_squadv2_transformers', 'eval_results.json')}", "r"
        ) as f:
            transformers_results = json.load(f)

        eval_dataset = load_dataset("squad_v2", split="validation[:100]")

        pipe = pipeline(
            task="question-answering",
            model=model_name_v2,
            tokenizer=model_name_v2,
            max_answer_len=30,
        )

        task_evaluator = evaluator(task="question-answering")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="squad_v2",
            strategy="simple",
            squad_v2_format=True,
        )

        self.assertEqual(transformers_results["eval_f1"], evaluator_results["f1"])
        self.assertEqual(transformers_results["eval_HasAns_f1"], evaluator_results["HasAns_f1"])
        self.assertEqual(transformers_results["eval_NoAns_f1"], evaluator_results["NoAns_f1"])
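
    # Token classification parity against run_ner.py on CoNLL-2003, using the seqeval metric.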
    def test_token_classification_parity(self):
        model_name = "hf-internal-testing/tiny-bert-for-token-classification"
        n_samples = 500

        subprocess.run(
            "git sparse-checkout set examples/pytorch/token-classification",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        subprocess.run(
            f"python examples/pytorch/token-classification/run_ner.py"
            f" --model_name_or_path {model_name}"
            f" --dataset_name conll2003"
            f" --do_eval"
            f" --output_dir {os.path.join(self.dir_path, 'tokenclassification_conll2003_transformers')}"
            f" --max_eval_samples {n_samples}",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            os.path.join(self.dir_path, "tokenclassification_conll2003_transformers", "eval_results.json"), "r"
        ) as f:
            transformers_results = json.load(f)

        eval_dataset = load_dataset("conll2003", split=f"validation[:{n_samples}]")

        pipe = pipeline(task="token-classification", model=model_name)

        e = evaluator(task="token-classification")
        evaluator_results = e.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="seqeval",
            input_column="tokens",
            label_column="ner_tags",
            strategy="simple",
        )

        self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["overall_accuracy"])
        self.assertEqual(transformers_results["eval_f1"], evaluator_results["overall_f1"])