Nathan Habib committed on
Commit
bb3c194
1 Parent(s): 56f8b5d
Files changed (3)
  1. app.py +91 -0
  2. requirements.txt +1 -0
  3. utils.py +114 -0
app.py ADDED
@@ -0,0 +1,91 @@
+ import gradio as gr
+ import pandas as pd
+ import os
+ from utils import construct_dataframe, MODELS, get_scores
+
+ HF_TOKEN = os.getenv("HF_TOKEN")
+ DATAFRAME: pd.DataFrame = construct_dataframe()
+ MAX_LINES = 500
+ MIN_LINES = 10
+
+
+ def get_from_question_id_turn_2(model, question_id: int):
+     new = DATAFRAME.loc[question_id]
+     new = new[new["turn"] == 1]
+     new = new[new["model"] == model]
+
+     prompt_lighteval = new["prompt"].values[0]
+     response_lighteval = new["response"].values[0]
+     judgement_prompt_lighteval = new["judgement_prompt"].values[0]
+     judgement_lighteval = new["judgment"].values[0]
+     score_lighteval = new["score"].values[0]
+
+     return prompt_lighteval, response_lighteval, judgement_prompt_lighteval[1]["content"], judgement_lighteval, score_lighteval
+
+
+ def get_from_question_id_turn_1(model, question_id: int):
+     new = DATAFRAME.loc[question_id]
+     new = new[new["turn"] == 0]
+     new = new[new["model"] == model]
+
+     prompt_lighteval = new["prompt"].values[0]
+     response_lighteval = new["response"].values[0]
+     judgement_prompt_lighteval = new["judgement_prompt"].values[0]
+     judgement_lighteval = new["judgment"].values[0]
+     score_lighteval = new["score"].values[0]
+
+     return prompt_lighteval, response_lighteval, judgement_prompt_lighteval[1]["content"], judgement_lighteval, score_lighteval
+
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         model = gr.Dropdown([model.split("__")[1] for model in MODELS], label="Model")
+         index = gr.Dropdown(set(DATAFRAME.index.values.tolist()), label="Index", value=DATAFRAME.index.values.tolist()[0])
+
+     with gr.Row():
+         gr.DataFrame(get_scores(DATAFRAME).reset_index(), interactive=False)
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("## Turn 1")
+             score_lighteval = gr.Number(label="Score", interactive=False)
+             prompt_lighteval = gr.Textbox(
+                 label="Prompt", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES
+             )
+             response_lighteval = gr.Textbox(label="Response", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+             judgement_prompt_lighteval = gr.Textbox(label="Judgement Prompt", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+             judgement_lighteval = gr.Textbox(label="Judgement", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+         with gr.Column():
+             gr.Markdown("## Turn 2")
+             score_lighteval_2 = gr.Number(label="Score", interactive=False)
+             prompt_lighteval_2 = gr.Textbox(
+                 label="Prompt", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES
+             )
+             response_lighteval_2 = gr.Textbox(label="Response", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+             judgement_prompt_lighteval_2 = gr.Textbox(label="Judgement Prompt", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+             judgement_lighteval_2 = gr.Textbox(label="Judgement", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+
+
+     index.change(
+         fn=get_from_question_id_turn_1,
+         inputs=[model, index],
+         outputs=[prompt_lighteval, response_lighteval, judgement_prompt_lighteval, judgement_lighteval, score_lighteval],
+     )
+
+     index.change(
+         fn=get_from_question_id_turn_2,
+         inputs=[model, index],
+         outputs=[prompt_lighteval_2, response_lighteval_2, judgement_prompt_lighteval_2, judgement_lighteval_2, score_lighteval_2],
+     )
+     model.change(
+         fn=get_from_question_id_turn_2,
+         inputs=[model, index],
+         outputs=[prompt_lighteval_2, response_lighteval_2, judgement_prompt_lighteval_2, judgement_lighteval_2, score_lighteval_2],
+     )
+     model.change(
+         fn=get_from_question_id_turn_1,
+         inputs=[model, index],
+         outputs=[prompt_lighteval, response_lighteval, judgement_prompt_lighteval, judgement_lighteval, score_lighteval],
+     )
+
+ demo.launch()
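
Note on the two lookup helpers above: they are identical except for the turn index they filter on. A minimal sketch of how they could share one implementation (the name get_turn_data is hypothetical and not part of this commit; it assumes the same DATAFRAME columns used above):

def get_turn_data(model: str, question_id: int, turn: int):
    # Rows for this question id, restricted to one turn and one model.
    rows = DATAFRAME.loc[question_id]
    rows = rows[(rows["turn"] == turn) & (rows["model"] == model)]
    row = rows.iloc[0]
    # Same five outputs the Gradio callbacks expect, in the same order.
    return (
        row["prompt"],
        row["response"],
        row["judgement_prompt"][1]["content"],
        row["judgment"],
        row["score"],
    )

def get_from_question_id_turn_1(model, question_id: int):
    return get_turn_data(model, question_id, turn=0)

def get_from_question_id_turn_2(model, question_id: int):
    return get_turn_data(model, question_id, turn=1)

Newer Gradio releases (4.x) also expose gr.on(), which can attach one handler to both model.change and index.change and would halve the event wiring; the four separate .change calls above behave the same way.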
requirements.txt ADDED
@@ -0,0 +1 @@
+ plotly
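
requirements.txt only lists plotly, which backs the pandas plotting call in utils.py; the code also imports gradio, pandas, and datasets, which are presumably provided by the Space's default runtime. To run the app outside a Space, a fuller (unpinned) list would look roughly like:

plotly
gradio
pandas
datasets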
utils.py ADDED
@@ -0,0 +1,114 @@
+ import pandas as pd
+ from datasets import load_dataset
+ import os
+ import json
+ from pprint import pprint
+ pd.options.plotting.backend = "plotly"
+
+ MODELS = [
+     "mistralai__Mistral-7B-Instruct-v0.2",
+     # "HuggingFaceH4__zephyr-7b-beta",
+     # "meta-llama__Llama-2-7b-chat-hf",
+     # "01-ai__Yi-34B-Chat",
+ ]
+
+ HF_TOKEN = os.getenv("HF_TOKEN")
+
+ score_turn = {
+     1: "multi_turn",
+     0: "single_turn",
+ }
+
+ def get_dataframe_lighteval() -> pd.DataFrame:
+     samples = []
+     scores = []
+     for model in MODELS:
+         details_lighteval = load_dataset(
+             f"SaylorTwift/details_{model}_private",
+             "extended_mt_bench_0",
+             split="latest",
+             token=HF_TOKEN,
+         )
+
+         for d in details_lighteval:
+             judement_prompt = d["judement_prompt"]
+             judgement = d["judgement"]
+             predictions = d["predictions"][0]
+             prompts = d["full_prompt"]
+
+             turns = []
+             for turn in range(len(predictions)):
+                 if turn == 1:
+                     prompt = prompts[turn].format(model_response=predictions[turn - 1])
+                 else:
+                     prompt = prompts[turn]
+
+                 turns.append([])
+                 turns[turn].append(prompt)
+                 turns[turn].append(predictions[turn])
+                 turns[turn].append(judement_prompt[turn])
+                 turns[turn].append(judgement[turn])
+
+             for i, turn in enumerate(turns):
+                 samples.append(
+                     {
+                         "model": model,
+                         "turn": i,
+                         "prompt": turn[0],
+                         "response": turn[1],
+                         "judgement_prompt": turn[2],
+                         "judgment": turn[3],
+                         "score": d["metrics"][score_turn[i]],
+                         "question_id": d["specifics"]["id"],
+                     }
+                 )
+
+     dataframe_all_samples = pd.DataFrame(samples)
+
+     return dataframe_all_samples
+
+
+
+
+ def construct_dataframe() -> pd.DataFrame:
+     """
+     Construct a dataframe from the data in the data folder
+     """
+     lighteval = get_dataframe_lighteval()
+     lighteval["model"] = lighteval["model"].apply(lambda x: x.split("__")[1])
+     lighteval = lighteval.set_index(["question_id", "turn", "model"])
+     all_samples = lighteval.reset_index()
+     all_samples = all_samples.set_index("question_id")
+
+     return all_samples.dropna()
+
+
+ def create_plot(model: str, dataframe: pd.DataFrame):
+     new = dataframe[dataframe["model"] == model].dropna()
+     new = new[new["turn"] == 1]
+     new["score_lighteval"] = new["score_lighteval"].astype(int)
+     new["score_mt_bench"] = new["score_mt_bench"].astype(int)
+     new = new[['score_lighteval', 'score_mt_bench']]
+     new.index = new.index.astype(str)
+
+     fig = new.plot.bar(title="Scores", labels={"index": "Index", "value": "Score"}, barmode="group")
+
+     return fig
+
+
+ def get_scores(dataframe):
+     dataframe = dataframe.dropna()
+     dataframe["score"] = dataframe["score"].astype(int)
+     new = dataframe[['score', "turn", "model"]]
+     new = new.groupby(["model", "turn"]).mean()
+     new = new.groupby(["model"]).mean()
+     return new
+
+ if __name__ == "__main__":
+     df = construct_dataframe()
+     from pprint import pprint
+     pprint(df)
+     #print(df.iloc[130])
+     # model = "zephyr-7b-beta"
+     # fig = create_plot(model, df)
+     # fig.show()
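
For reference, get_dataframe_lighteval() only touches a handful of fields on each details row. A minimal sketch of that row shape, useful for exercising the per-turn loop without access to the private dataset (all values below are illustrative placeholders, not real data):

fake_row = {
    # One judge prompt (a chat-message list) and one judgement per turn.
    "judement_prompt": [
        [{"role": "system", "content": "You are a judge ..."},
         {"role": "user", "content": "Rate the turn-1 answer ..."}],
        [{"role": "system", "content": "You are a judge ..."},
         {"role": "user", "content": "Rate the turn-2 answer ..."}],
    ],
    "judgement": ["... Rating: [[8]]", "... Rating: [[7]]"],
    # predictions[0] holds the model's answers, one per turn.
    "predictions": [["answer to turn 1", "answer to turn 2"]],
    # full_prompt[1] is formatted with the turn-1 answer via {model_response}.
    "full_prompt": ["question 1", "follow-up to: {model_response}"],
    # metrics keys match score_turn: "single_turn" for turn 0, "multi_turn" for turn 1.
    "metrics": {"single_turn": 8.0, "multi_turn": 7.0},
    "specifics": {"id": 81},
}

The nested judgement-prompt lists matter because app.py reads judgement_prompt[1]["content"], i.e. the user message of each turn's judge prompt.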