Terry Zhuo committed on
Commit 8fb39f8
1 Parent(s): c373956
Files changed (3)
  1. app.py +11 -4
  2. src/text_content.py +1 -1
  3. src/utils.py +2 -1
app.py CHANGED
@@ -24,7 +24,8 @@ from datasets import load_dataset
 TOKEN = os.environ.get("TOKEN", None)
 api = HfApi(TOKEN)
 df = load_dataset("bigcode/bigcodebench-results", split="train").to_pandas().sort_values("complete", ascending=False)
-elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="train").to_pandas()
+task_elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="train").to_pandas()
+model_elo_mle_df = load_dataset("bigcode/bigcodebench-elo-model-with-tie", split="train").to_pandas()
 complete_solve_rate = load_dataset("bigcode/bigcodebench-complete-solve-rate", split="train").to_pandas()
 instruct_solve_rate = load_dataset("bigcode/bigcodebench-instruct-solve-rate", split="train").to_pandas()

@@ -238,9 +239,15 @@ with demo:

     with gr.TabItem("📊 Elo Rating", id=1):
         with gr.Column():
-            elo_map = gr.Plot()
-            demo.load(plot_elo_mle, [gr.Dataframe(elo_mle_df, visible=False)], elo_map)
-
+            with gr.Group():
+                gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+                task_elo_map = gr.Plot()
+                demo.load(plot_elo_mle, [gr.Dataframe(task_elo_mle_df, visible=False)], task_elo_map)
+            with gr.Group():
+                gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+                model_elo_map = gr.Plot()
+                demo.load(plot_elo_mle, [gr.Dataframe(model_elo_mle_df, visible=False)], model_elo_map)
+
     with gr.TabItem("🧩 Solve Rate", id=2):
         with gr.Column():
             complete_map = gr.Plot()
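For reference, both new groups rely on the same wiring: a `demo.load` event feeds a hidden `gr.Dataframe` into a plotting function and renders the result in a `gr.Plot`. A minimal self-contained sketch of that pattern (the toy ratings and model names below are invented for illustration; the real frames come from the `bigcode/bigcodebench-elo*` datasets):

import gradio as gr
import pandas as pd
import plotly.express as px

def plot_elo_mle(df):
    # Scatter of model ratings with asymmetric bootstrap error bars,
    # mirroring the function in src/utils.py.
    return px.scatter(df, x="model", y="rating",
                      error_y="error_y", error_y_minus="error_y_minus")

toy_elo_df = pd.DataFrame({
    "model": ["model-a", "model-b"],   # hypothetical models
    "rating": [1105.0, 995.0],         # Elo MLE point estimates
    "error_y": [12.0, 9.0],            # upper error bar
    "error_y_minus": [11.0, 10.0],     # lower error bar
})

with gr.Blocks() as demo:
    with gr.Group():
        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
        task_elo_map = gr.Plot()
        # The hidden Dataframe carries the pandas frame in as the event input.
        demo.load(plot_elo_mle, [gr.Dataframe(toy_elo_df, visible=False)], task_elo_map)

demo.launch()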
src/text_content.py CHANGED
@@ -17,7 +17,7 @@ The dataset has 2 variants:

 Figure below shows the example of `Complete` vs `Instruct` prompt. For `Instruct`, we only focus on instruction-tuned LLMs.

-<img src="https://github.com/bigcode-bench/bigcode-bench.github.io/blob/main/asset/bigcodebench_prompt.png?raw=true" alt="OctoCoder vs Base HumanEval prompt" width="800px">
+<img src="https://github.com/bigcode-bench/bigcode-bench.github.io/blob/main/asset/bigcodebench_prompt.svg?raw=true" alt="OctoCoder vs Base HumanEval prompt" width="800px">

 The specific prompt template can be found [here](https://github.com/bigcode-project/bigcodebench/blob/main/bigcodebench/model.py).
src/utils.py CHANGED
@@ -46,7 +46,8 @@ def make_clickable_names(df):
 def plot_elo_mle(df):
     fig = px.scatter(df, x="model", y="rating", error_y="error_y",
                      error_y_minus="error_y_minus",
-                     title="Bootstrap of Elo MLE Estimates (BigCodeBench-Complete)")
+                     # title="Bootstrap of Elo MLE Estimates (BigCodeBench-Complete)"
+                     )
     fig.update_layout(xaxis_title="Model",
                       yaxis_title="Rating",
                       autosize=True,
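The title is commented out rather than deleted, likely because each plot now sits under its own `gr.Markdown` heading in app.py, which a per-figure title would duplicate. A standalone sketch of the resulting call with toy data (the values and the trailing `return fig`, which the hunk does not show, are assumptions):

import pandas as pd
import plotly.express as px

def plot_elo_mle(df):
    fig = px.scatter(df, x="model", y="rating", error_y="error_y",
                     error_y_minus="error_y_minus",
                     # title="Bootstrap of Elo MLE Estimates (BigCodeBench-Complete)"
                     )
    fig.update_layout(xaxis_title="Model", yaxis_title="Rating", autosize=True)
    return fig  # assumed; the diff hunk ends before the function does

# Toy frame with the four columns plot_elo_mle expects.
demo_df = pd.DataFrame({
    "model": ["model-a", "model-b", "model-c"],
    "rating": [1210.0, 1150.0, 1090.0],
    "error_y": [15.0, 13.0, 16.0],
    "error_y_minus": [14.0, 12.0, 15.0],
})

plot_elo_mle(demo_df).show()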