Terry Zhuo committed
Commit 8fb39f8
Parent(s): c373956
update

Files changed:
- app.py +11 -4
- src/text_content.py +1 -1
- src/utils.py +2 -1
app.py CHANGED
@@ -24,7 +24,8 @@ from datasets import load_dataset
 TOKEN = os.environ.get("TOKEN", None)
 api = HfApi(TOKEN)
 df = load_dataset("bigcode/bigcodebench-results", split="train").to_pandas().sort_values("complete", ascending=False)
-
+task_elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="train").to_pandas()
+model_elo_mle_df = load_dataset("bigcode/bigcodebench-elo-model-with-tie", split="train").to_pandas()
 complete_solve_rate = load_dataset("bigcode/bigcodebench-complete-solve-rate", split="train").to_pandas()
 instruct_solve_rate = load_dataset("bigcode/bigcodebench-instruct-solve-rate", split="train").to_pandas()
 
@@ -238,9 +239,15 @@ with demo:
 
     with gr.TabItem("📊 Elo Rating", id=1):
         with gr.Column():
-
-
-
+            with gr.Group():
+                gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+                task_elo_map = gr.Plot()
+                demo.load(plot_elo_mle, [gr.Dataframe(task_elo_mle_df, visible=False)], task_elo_map)
+            with gr.Group():
+                gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+                model_elo_map = gr.Plot()
+                demo.load(plot_elo_mle, [gr.Dataframe(model_elo_mle_df, visible=False)], model_elo_map)
+
     with gr.TabItem("🧩 Solve Rate", id=2):
         with gr.Column():
             complete_map = gr.Plot()
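The new Elo tab uses Gradio's hidden-Dataframe pattern: the frame is parked in an invisible `gr.Dataframe` component and handed to the plotting callback by `demo.load` when the page opens. Below is a minimal, self-contained sketch of that pattern; the toy `ratings` frame stands in for the bigcode/bigcodebench-elo datasets and is not the leaderboard's actual data.

```python
# Minimal sketch of the hidden-Dataframe + demo.load pattern used above.
# The toy `ratings` frame is an assumption standing in for the
# bigcode/bigcodebench-elo datasets loaded in app.py.
import gradio as gr
import pandas as pd
import plotly.express as px

ratings = pd.DataFrame({
    "model": ["A", "B", "C"],
    "rating": [1100, 1020, 980],
    "error_y": [25, 30, 28],
    "error_y_minus": [22, 27, 30],
})

def plot_elo_mle(df):
    # Scatter with asymmetric error bars, mirroring src/utils.py.
    return px.scatter(df, x="model", y="rating",
                      error_y="error_y", error_y_minus="error_y_minus")

with gr.Blocks() as demo:
    with gr.Group():
        gr.Markdown("## Elo ratings (toy data)")
        elo_map = gr.Plot()
    # demo.load fires once per page load; the invisible Dataframe carries
    # `ratings` into the callback without rendering a table in the UI.
    demo.load(plot_elo_mle, [gr.Dataframe(ratings, visible=False)], elo_map)

demo.launch()
```

Keeping the data in a hidden component means it is loaded once at app start, while `demo.load` still re-renders the plot for each visitor.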
src/text_content.py CHANGED
@@ -17,7 +17,7 @@ The dataset has 2 variants:
 
 Figure below shows the example of `Complete` vs `Instruct` prompt. For `Instruct`, we only focus on instruction-tuned LLMs.
 
-<img src="https://github.com/bigcode-bench/bigcode-bench.github.io/blob/main/asset/bigcodebench_prompt.
+<img src="https://github.com/bigcode-bench/bigcode-bench.github.io/blob/main/asset/bigcodebench_prompt.svg?raw=true" alt="OctoCoder vs Base HumanEval prompt" width="800px">
 
 The specific prompt template can be found [here](https://github.com/bigcode-project/bigcodebench/blob/main/bigcodebench/model.py).
 
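The corrected tag appends `?raw=true` to the GitHub blob URL, which makes GitHub redirect to the raw file so the SVG actually renders inside the page. An equivalent rewrite, shown as a small hypothetical helper (not part of this repo), targets the raw.githubusercontent.com host directly:

```python
# Hypothetical helper: rewrite a GitHub blob URL to its raw-content form.
# github.com/<org>/<repo>/blob/<ref>/<path>
#   -> raw.githubusercontent.com/<org>/<repo>/<ref>/<path>
def to_raw_github_url(blob_url: str) -> str:
    return (blob_url
            .replace("github.com", "raw.githubusercontent.com")
            .replace("/blob/", "/"))

print(to_raw_github_url(
    "https://github.com/bigcode-bench/bigcode-bench.github.io/blob/main/asset/bigcodebench_prompt.svg"
))
# https://raw.githubusercontent.com/bigcode-bench/bigcode-bench.github.io/main/asset/bigcodebench_prompt.svg
```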
src/utils.py CHANGED
@@ -46,7 +46,8 @@ def make_clickable_names(df):
 def plot_elo_mle(df):
     fig = px.scatter(df, x="model", y="rating", error_y="error_y",
                      error_y_minus="error_y_minus",
-
+                     # title="Bootstrap of Elo MLE Estimates (BigCodeBench-Complete)"
+                     )
     fig.update_layout(xaxis_title="Model",
                       yaxis_title="Rating",
                       autosize=True,
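This hunk closes the `px.scatter` call with an explicit `)` and leaves the plot title commented out. The `error_y`/`error_y_minus` columns the function plots are asymmetric offsets above and below each model's rating, e.g. percentile half-widths from bootstrap replicates. A hypothetical sketch of how such a frame could be shaped (illustrating only the column schema, not the bigcodebench pipeline's actual bootstrap):

```python
# Hypothetical stand-in: derive rating + asymmetric error-bar offsets from
# bootstrap replicates. NOT the bigcodebench pipeline, just a frame with the
# schema plot_elo_mle expects.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
# Pretend each row of `samples` is one bootstrap replicate of Elo ratings.
samples = pd.DataFrame(
    rng.normal(loc=[1100, 1020, 980], scale=20, size=(500, 3)),
    columns=["A", "B", "C"],
)

median = samples.median()
elo_df = pd.DataFrame({
    "model": samples.columns,
    "rating": median.values,
    # Upper/lower distances from the median give asymmetric error bars.
    "error_y": (samples.quantile(0.975) - median).values,
    "error_y_minus": (median - samples.quantile(0.025)).values,
})
print(elo_df)
```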