"""
Source and credits: https://github.com/ZubinGou/math-evaluation-harness/blob/main/python_executor.py
"""
import argparse
import json
from concurrent.futures import TimeoutError

import numpy as np
from pebble import ProcessPool
from tqdm import tqdm

from grader import math_equal_process
def evaluate(samples: list = None, file_path: str = None):
    """Score (pred, gt) pairs with ``math_equal_process`` in a process pool.

    Args:
        samples: List of dicts, each with ``'pred'`` and ``'gt'`` keys and
            optionally an ``'idx'`` key used for deduplication.
        file_path: Path to a JSONL file of such dicts; read only when
            ``samples`` is falsy.

    Returns:
        Tuple ``(samples, result_json)`` where every sample gained a boolean
        ``'score'`` entry, and ``result_json`` summarizes counts, timeouts,
        and accuracy.

    Raises:
        ValueError: If neither ``samples`` nor ``file_path`` is provided.
    """
    # Explicit validation instead of `assert`, which is stripped under -O.
    if not samples and not file_path:
        raise ValueError("samples or file_path must be provided")
    if not samples:
        with open(file_path, 'r') as f:
            samples = [json.loads(line) for line in f]

    # dedup by idx (last occurrence wins), then restore a stable order
    if 'idx' in samples[0]:
        samples = {sample['idx']: sample for sample in samples}.values()
        samples = sorted(samples, key=lambda x: x['idx'])
    else:
        samples = [dict(idx=idx, **sample) for idx, sample in enumerate(samples)]

    params = [(idx, sample['pred'], sample['gt']) for idx, sample in enumerate(samples)]

    scores = []
    timeout_cnt = 0

    with ProcessPool() as pool:
        # 3-second per-item budget; slow comparisons are counted as wrong.
        future = pool.map(math_equal_process, params, timeout=3)
        iterator = future.result()
        with tqdm(total=len(samples), desc="Evaluate") as progress_bar:
            while True:
                try:
                    result = next(iterator)
                    scores.append(result)
                except StopIteration:
                    break
                except TimeoutError as error:
                    print(error)
                    scores.append(False)
                    timeout_cnt += 1
                except Exception as error:
                    # pebble attaches a `traceback` attribute to exceptions
                    # raised in workers; fall back to repr() so this handler
                    # itself cannot raise AttributeError. Re-raise instead of
                    # exit() so the process terminates with a real error
                    # status and callers can react.
                    print(getattr(error, 'traceback', repr(error)))
                    raise
                progress_bar.update(1)

    assert len(samples) == len(scores)
    for sample, score in zip(samples, scores):
        sample['score'] = score

    # Accuracy over ALL pairs (True counts as 1, False/timeouts as 0).
    # The previous version filtered out False before averaging, which made
    # the mean of the remaining True values always 1.0. Guard the empty
    # case so np.mean does not emit NaN.
    mean_score = np.round(np.mean([bool(s) for s in scores]), decimals=2) if scores else 0.0

    result_json = {
        "num_samples": len(samples),
        "num_scores": len(scores),
        "timeout_samples": timeout_cnt,
        "acc": mean_score
    }

    return samples, result_json
if __name__ == "__main__":
    # Run the evaluation on the default output file and report the summary
    # (replaces the leftover `print('test')` debug placeholder, which
    # discarded the computed results).
    samples, results_json = evaluate(file_path="output/MATH.jsonl")
    print(results_json)