long-code-arena / src /get_results_for_task.py
saridormi's picture
Fix formatting
dc801c4
raw
history blame
No virus
2.65 kB
import logging
import os
import pandas as pd # type: ignore[import]
from datasets import (get_dataset_config_names, # type: ignore[import]
load_dataset)
from .leaderboard_formatting import (COLUMNS_PRETTY, METRICS_PER_TASK,
SORT_COLUMN_PER_TASK,
get_columns_per_task)
from .tasks import TASKS_PRETTY_REVERSE
# Configuration names of the dataset identified by the DATASET_ID environment
# variable. Evaluated once, when this module is imported; a missing DATASET_ID
# raises KeyError at import time. get_results_for_task() checks task ids
# against this collection to decide between real results and the stub table.
AVAILABLE_TASKS = get_dataset_config_names(os.environ["DATASET_ID"])
def _get_results_stub() -> pd.DataFrame:
stub_df = pd.DataFrame(
[
{
"Model Name": "GPT-4",
"Availability": "Proprietary",
"Context Size": "16k",
"BLEU": "X",
"ROUGE": "X",
"ChrF": "X",
"BERTScore": "X",
"BERTScore (Normalized)": "X",
"Submitted By": "🏟 Long Code Arena Team",
},
{
"Model Name": "CodeLlama-7b (instruct)",
"Availability": "Llama 2 license",
"Context Size": "16k",
"BLEU": "X",
"ROUGE": "X",
"ChrF": "X",
"BERTScore": "X",
"BERTScore (Normalized)": "X",
"Submitted By": "🏟 Long Code Arena Team",
},
]
)
return stub_df
def _get_results_dataset(task_id: str) -> pd.DataFrame:
    """Load the stored results for a task and format them for display.

    Reads the "test" split of the ``task_id`` configuration from the dataset
    named by the DATASET_ID environment variable, renames columns through
    COLUMNS_PRETTY, abbreviates context sizes, sorts rows in descending order
    of the task's sort column, and renders the task's metric columns as
    fixed-precision strings.

    Args:
        task_id: Configuration name to load; also the key used to look up
            the task's sort column, metric columns and displayed columns.

    Returns:
        A DataFrame restricted to the columns returned by
        ``get_columns_per_task(task_id)``.
    """

    def shorten_context_size(size):
        # Sizes of at least 1000 become whole thousands with a "k" suffix
        # (e.g. 16000 -> "16k"); smaller values are passed through unchanged.
        if int(size) >= 1000:
            return f"{int(size) // 1000}k"
        return size

    dataset_id = os.environ["DATASET_ID"]
    raw_results = load_dataset(dataset_id, task_id, split="test")
    table = raw_results.to_pandas().rename(columns=COLUMNS_PRETTY, errors="ignore")

    table["Context Size"] = table["Context Size"].map(shorten_context_size)

    # Sorting is done before the metric columns are converted to strings below.
    sort_column = SORT_COLUMN_PER_TASK[task_id]
    table = table.sort_values(by=sort_column, ascending=False)

    for column in METRICS_PER_TASK[task_id]:
        # BERTScore-style metrics keep five decimals; all others keep two.
        spec = ".5f" if "BERTScore" in column else ".2f"
        table[column] = table[column].map(lambda score, spec=spec: format(score, spec))

    return table[get_columns_per_task(task_id)]
def get_results_for_task(task_pretty: str) -> pd.DataFrame:
    """Return the leaderboard table for a task given its display name.

    The display name is translated to a task id via TASKS_PRETTY_REVERSE.
    If that id is among AVAILABLE_TASKS, the stored results are loaded and
    formatted; otherwise a placeholder table is returned.

    Args:
        task_pretty: Display name of the task; must be a key of
            TASKS_PRETTY_REVERSE.

    Returns:
        The formatted results DataFrame, or the stub DataFrame when no
        results are available for the task.
    """
    task_id = TASKS_PRETTY_REVERSE[task_pretty]

    # No stored results for this task: fall back to the placeholder table.
    if task_id not in AVAILABLE_TASKS:
        logging.info(f"Generating leaderboard stub for {task_pretty}...")
        return _get_results_stub()

    logging.info(f"Retrieving results for {task_pretty}...")
    return _get_results_dataset(task_id)