import os
from pprint import pprint

import pandas as pd
from datasets import load_dataset

pd.options.plotting.backend = "plotly"  # use the plotly backend for DataFrame.plot

MODELS = [
    "mistralai__Mistral-7B-Instruct-v0.2",
    # "HuggingFaceH4__zephyr-7b-beta",
    # "meta-llama__Llama-2-7b-chat-hf",
    # "01-ai__Yi-34B-Chat",
]

HF_TOKEN = os.getenv("HF_TOKEN")

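# Maps the 0-based turn index to the lighteval metric name holding that turn's score.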
score_turn = {
    1: "multi_turn",
    0: "single_turn",
}

def get_dataframe_lighteval() -> pd.DataFrame:
    """Load the lighteval MT-Bench details for each model and flatten them into one row per turn."""
    samples = []
    for model in MODELS:
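        # lighteval pushes one private details dataset per model to the Hub;
        # the "latest" split selects the most recent evaluation run.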
        details_lighteval = load_dataset(
            f"SaylorTwift/details_{model}_private",
            "extended_mt_bench_0",
            split="latest",
            token=HF_TOKEN,
        )

        for d in details_lighteval:
            judgement_prompt = d["judement_prompt"]  # key spelling follows the dataset field
            judgement = d["judgement"]
            predictions = d["predictions"][0]
            prompts = d["full_prompt"]

            # Gather (prompt, response, judge prompt, judgement) for each turn.
            turns = []
            for turn in range(len(predictions)):
                if turn == 1:
                    # The second-turn prompt template expects the first-turn response.
                    prompt = prompts[turn].format(model_response=predictions[turn - 1])
                else:
                    prompt = prompts[turn]

                turns.append([prompt, predictions[turn], judgement_prompt[turn], judgement[turn]])

            for i, turn in enumerate(turns):
                samples.append(
                    {
                        "model": model,
                        "turn": i,
                        "prompt": turn[0],
                        "response": turn[1],
                        "judgement_prompt": turn[2],
                        "judgment": turn[3],
                        "score": d["metrics"][score_turn[i]],
                        "question_id": d["specifics"]["id"],
                    }
                )

    dataframe_all_samples = pd.DataFrame(samples)

    return dataframe_all_samples


def construct_dataframe() -> pd.DataFrame:
    """
    Build a per-turn dataframe from the lighteval details, indexed by question id.
    """
    lighteval = get_dataframe_lighteval()
    # Keep only the model name, dropping the organisation prefix (e.g. "mistralai__").
    lighteval["model"] = lighteval["model"].apply(lambda x: x.split("__")[1])
    all_samples = lighteval.set_index("question_id")

    return all_samples.dropna()


def create_plot(model: str, dataframe: pd.DataFrame):
    """Bar plot of second-turn scores for one model.

    Note: expects `score_lighteval` and `score_mt_bench` columns, i.e. a dataframe
    where the lighteval scores have been merged with reference MT-Bench scores;
    `construct_dataframe` alone only provides a single `score` column.
    """
    new = dataframe[dataframe["model"] == model].dropna()
    new = new[new["turn"] == 1]
    new["score_lighteval"] = new["score_lighteval"].astype(int)
    new["score_mt_bench"] = new["score_mt_bench"].astype(int)
    new = new[["score_lighteval", "score_mt_bench"]]
    new.index = new.index.astype(str)

    fig = new.plot.bar(title="Scores", labels={"index": "Index", "value": "Score"}, barmode="group")

    return fig


def get_scores(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Average scores per model: first per turn, then across turns."""
    dataframe = dataframe.dropna().copy()
    dataframe["score"] = dataframe["score"].astype(int)
    new = dataframe[["score", "turn", "model"]]
    new = new.groupby(["model", "turn"]).mean()
    new = new.groupby(["model"]).mean()
    return new

if __name__ == "__main__":
    df = construct_dataframe()
    pprint(df)
    #print(df.iloc[130])
    # model = "zephyr-7b-beta"
    # fig = create_plot(model, df)
    # fig.show()
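    # pprint(get_scores(df))  # uncomment to inspect per-model average scores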