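"""Helpers for building 🏟 Long Code Arena leaderboard tables.

Results are loaded with the `datasets` library from the dataset referenced by the
`DATASET_ID` environment variable; tasks that have no published results yet fall
back to a placeholder table.
"""
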
import logging
import os

import pandas as pd  # type: ignore[import]
from datasets import get_dataset_config_names, load_dataset  # type: ignore[import]

from .leaderboard_formatting import (
    COLUMNS_PRETTY,
    METRICS_PER_TASK,
    SORT_COLUMN_PER_TASK,
    get_columns_per_task,
)
from .tasks_content import TASKS_PRETTY_REVERSE

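# Config names of the results dataset double as the task ids that have published results.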
AVAILABLE_TASKS = get_dataset_config_names(os.environ["DATASET_ID"])


def _get_results_stub() -> pd.DataFrame:
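    """Return a placeholder leaderboard with example rows and "X" metric values for
    tasks that do not have published results yet."""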
    stub_df = pd.DataFrame(
        [
            {
                "Model Name": "GPT-4",
                "Availability": "Proprietary",
                "Context Size": "16k",
                "BLEU": "X",
                "ROUGE": "X",
                "ChrF": "X",
                "BERTScore": "X",
                "BERTScore (Normalized)": "X",
                "Submitted By": "🏟 Long Code Arena Team",
            },
            {
                "Model Name": "CodeLlama-7b (instruct)",
                "Availability": "Llama 2 license",
                "Context Size": "16k",
                "BLEU": "X",
                "ROUGE": "X",
                "ChrF": "X",
                "BERTScore": "X",
                "BERTScore (Normalized)": "X",
                "Submitted By": "🏟 Long Code Arena Team",
            },
        ]
    )
    return stub_df


def _get_results_dataset(task_id: str) -> pd.DataFrame:
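    """Load the results for `task_id` and prepare them for display: prettify column
    names, format context sizes and metric values, sort by the task's primary metric,
    and keep only the leaderboard columns."""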
    results_df = load_dataset(os.environ["DATASET_ID"], task_id, split="test").to_pandas()
    results_df = results_df.rename(columns=COLUMNS_PRETTY, errors="ignore")
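    # Show context sizes of 1000+ as e.g. "16k"; smaller values are kept as-is.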
    results_df["Context Size"] = results_df["Context Size"].map(lambda x: f"{int(x) // 1000}k" if int(x) >= 1000 else x)

    results_df = results_df.sort_values(by=SORT_COLUMN_PER_TASK[task_id], ascending=False)

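    # Format metric values: BERTScore columns with 5 decimal places, other metrics with 2.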
    for metric_column in METRICS_PER_TASK[task_id]:
        if "BERTScore" in metric_column:
            results_df[metric_column] = results_df[metric_column].map(lambda x: f"{x:.5f}")
        else:
            results_df[metric_column] = results_df[metric_column].map(lambda x: f"{x:.2f}")

    results_df = results_df[get_columns_per_task(task_id)]
    return results_df


def get_results_for_task(task_pretty: str) -> pd.DataFrame:
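    """Return the leaderboard table for a task given its pretty (display) name,
    falling back to a stub table when the task has no results in the dataset."""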
    task_id = TASKS_PRETTY_REVERSE[task_pretty]
    if task_id in AVAILABLE_TASKS:
        logging.info(f"Retrieving results for {task_pretty}...")
        return _get_results_dataset(task_id)
    logging.info(f"Generating leaderboard stub for {task_pretty}...")
    return _get_results_stub()
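

if __name__ == "__main__":
    # Illustrative usage sketch only (not part of the leaderboard app). Because of the
    # relative imports above, run it as a module (`python -m <package>.<this_module>`)
    # with DATASET_ID set. The task name below is a hypothetical placeholder and must
    # be a key of TASKS_PRETTY_REVERSE.
    logging.basicConfig(level=logging.INFO)
    print(get_results_for_task("Example task").head())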