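"""Visualize per-task pass rates of code samples on HumanEval vs. HumanEval+.

Reads every ``<experiment>/eval_results.json`` under ``--root``, prints the
relative pass@k drop from the base tests to the plus tests, and renders a
sorted per-task pass-rate bar chart (``passrate.pdf`` / ``passrate.png``).
"""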

import json
import os
import pickle
from os import PathLike
from typing import List

import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

from evalplus.data import get_human_eval_plus
from evalplus.eval import estimate_pass_at_k

SMALL_SIZE = 10
MEDIUM_SIZE = 14
BIGGER_SIZE = 18

plt.rc("font", size=SMALL_SIZE)  # default text size
plt.rc("axes", titlesize=MEDIUM_SIZE)  # axes title
plt.rc("axes", labelsize=MEDIUM_SIZE)  # x and y axis labels
plt.rc("xtick", labelsize=MEDIUM_SIZE)  # x tick labels
plt.rc("ytick", labelsize=MEDIUM_SIZE)  # y tick labels
plt.rc("legend", fontsize=MEDIUM_SIZE - 1)  # legend text
plt.rc("figure", titlesize=BIGGER_SIZE)  # figure title

plt.rc("text", usetex=True)  # requires a working LaTeX installation

SUCCESS = "success"


def passk_rel_drop(task2bvs_old, task2bvs_new):
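    # task2bvs_old/new: [task][experiment] -> boolean vector over samples,
    # with True marking a sample that passed the tests. "old" is the base
    # HumanEval suite; "new" is HumanEval+. Prints pass@k under both suites
    # and the relative drop, grouping experiments by their sample count
    # (pass@k is only defined when k <= nsamples).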
    passk_old = {}
    passk_new = {}

    # One pass per experiment (column): collect per-task sample totals
    # and pass counts under both test suites.
    for exp_i in range(len(task2bvs_old[0])):
        ntotal = []
        npass_old = []
        npass_new = []
        nsamples = None
        for task_i in range(len(task2bvs_old)):
            bv_old = task2bvs_old[task_i][exp_i]
            bv_new = task2bvs_new[task_i][exp_i]
            ntotal.append(len(bv_old))
            npass_old.append(bv_old.sum())
            npass_new.append(bv_new.sum())
            if nsamples is None:
                nsamples = len(bv_old)
            assert len(bv_old) == len(bv_new) == nsamples

        d_old = passk_old.setdefault(nsamples, {})
        d_new = passk_new.setdefault(nsamples, {})
        for k in [1, 10, 100]:
            if nsamples >= k:
                pass_at_k_old = estimate_pass_at_k(ntotal, npass_old, k).mean() * 100
                pass_at_k_new = estimate_pass_at_k(ntotal, npass_new, k).mean() * 100
                d_old.setdefault(k, []).append(pass_at_k_old)
                d_new.setdefault(k, []).append(pass_at_k_new)

    for nsamples in passk_old:
        print("=====================================")
        print(f"{nsamples = }")
        do = passk_old[nsamples]
        dn = passk_new[nsamples]
        drops = []
        for k in [1, 10, 100]:
            if k in do:
                pko = np.array(do[k])
                pkn = np.array(dn[k])
                drop = 100 * (pko - pkn) / pko
                drops.append(drop)
                print(
                    f"pass@{k}: \t{pko.mean():.1f}% -> {pkn.mean():.1f}% (drop {drop.mean():.1f}%)"
                )
        drops = np.array(drops)
        print(f"+++ {drops.mean() = :.1f}%")
        print(f"+++ {drops.max() = :.1f}%")
        print("=====================================")


def get_data(paths: List[PathLike]):
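    # Returns (task2bvs_old, task2bvs_new): task2bvs[i][j] is the boolean
    # vector of per-sample outcomes for task i in experiment j, under the
    # base tests (old) and the plus tests (new) respectively.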
    task2bvs_old = None
    task2bvs_new = None

    for path in tqdm(paths):
        with open(path, "r") as f:
            res = json.load(f)["eval"]
        ntask = len(res)

        assert ntask == 164  # HumanEval has 164 tasks

        if task2bvs_old is None and task2bvs_new is None:
            task2bvs_old = [[] for _ in range(ntask)]
            task2bvs_new = [[] for _ in range(ntask)]

        for i, v in enumerate(res.values()):
            base = v["base"]
            plus = v["plus"]
            bbv = np.array([s == SUCCESS for s, _ in base])
            # a sample passes HumanEval+ only if it also passes the base tests
            pbv = np.array([s == SUCCESS for s, _ in plus]) & bbv
            assert bbv.mean() >= pbv.mean()

            task2bvs_old[i].append(bbv)
            task2bvs_new[i].append(pbv)

    assert len(task2bvs_old) == len(task2bvs_new)
    return task2bvs_old, task2bvs_new


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    # the default path is site-specific; point --root at a directory that
    # holds one sub-directory per experiment
    parser.add_argument("--root", type=str, default="/JawTitan/EvalPlus/humaneval")
    args = parser.parse_args()

    paths = []
    for path in os.listdir(args.root):
        eval_json_path = os.path.join(args.root, path, "eval_results.json")
        # keep only experiment folders whose name ends in a digit and that
        # actually contain an eval_results.json
        if not os.path.isfile(eval_json_path) or not path[-1].isdigit():
            print(f"skip {path}")
            continue
        paths.append(eval_json_path)
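
    # Cache the parsed bitvectors so repeated runs skip re-parsing;
    # delete the cache file to pick up fresh evaluation results.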
    CACHE_PATH = "passrate.pkl"
    if os.path.isfile(CACHE_PATH):
        with open(CACHE_PATH, "rb") as f:
            task2bvs_old, task2bvs_new = pickle.load(f)
    else:
        task2bvs_old, task2bvs_new = get_data(paths)
        with open(CACHE_PATH, "wb") as f:
            pickle.dump((task2bvs_old, task2bvs_new), f)

    passk_rel_drop(task2bvs_old, task2bvs_new)
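
    # Per-task pass rate (fraction of passing samples), averaged over
    # experiments and reported in percent.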
    rate_old = [[bv.mean() for bv in task] for task in task2bvs_old]
    rate_new = [[bv.mean() for bv in task] for task in task2bvs_new]

    rate_old = 100 * np.array(rate_old).mean(axis=1)
    rate_new = 100 * np.array(rate_new).mean(axis=1)

    print(
        f"{(rate_new != rate_old).sum()} out of {len(rate_new)} tasks have a dropped pass rate"
    )

    ntask = len(rate_old)

    # sort tasks by their base HumanEval pass rate
    indices = np.array(rate_old).argsort()

    unsolved = np.where(np.array(rate_old) == 0)[0]
    print("Unsolvable: ", unsolved)

    # the 15 tasks whose pass rate drops the most under HumanEval+
    diff = np.array(rate_old) - np.array(rate_new)
    diff_indices = diff.argsort()

    tasks = get_human_eval_plus()
    for i in reversed(diff_indices[-15:]):
        print(
            f"#{i} {tasks[f'HumanEval/{i}']['entry_point']} drops {diff[i]:.1f} ~ {100 * diff[i] / rate_old[i]:.1f}%:"
            f" {rate_old[i]:.1f} -> {rate_new[i]:.1f}"
        )

    rate_old = np.array(rate_old)[indices]
    rate_new = np.array(rate_new)[indices]
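
    # Overlaid bar chart: one bar per task, sorted by base pass rate; since
    # plus-passing samples are a subset of base-passing ones, the
    # HumanEval+ bars sit at or below the HumanEval bars.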
    x = np.arange(ntask)
    width = 1  # full-width bars so the 164 tasks form a continuous band

    fig, ax = plt.subplots(figsize=(9, 3))
    HUMANEVAL = r"\textsc{HumanEval}"
    HUMANEVAL_PLUS = r"\textsc{HumanEval\textsuperscript{+}}"
    ax.bar(x, rate_old, width, color="coral", label=HUMANEVAL, alpha=0.5)
    ax.bar(x, rate_new, width, color="firebrick", label=HUMANEVAL_PLUS, alpha=0.6)

    ax.set_ylabel(r"Average Pass Rate (\%)")
    ax.set_xlabel(f"Problems (Sorted by {HUMANEVAL} pass rate)")

    # individual task indices are not meaningful, so hide the x ticks
    ax.set_xticks([])
    ax.set_xticklabels([])

    ax.set_xlim(-0.5, ntask - 0.5)

    ax.grid(linestyle="--", linewidth=1, alpha=0.8)

    # log scale keeps the hard, rarely-solved tasks visible
    ax.set_yscale("log")

    ax.legend()
    fig.tight_layout()

    plt.savefig("passrate.pdf", bbox_inches="tight")
    plt.savefig("passrate.png", bbox_inches="tight")