lmzheng committed on
Commit
f72ce70
1 Parent(s): 72650c2

Update app.py

Browse files
Files changed (1)
  1. app.py +2 -3
app.py CHANGED
@@ -24,7 +24,7 @@ def make_leaderboard_md(elo_results):
24
  - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
25
  - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
26
 
27
- 💻 We use [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) to compute MT-bench scores (single-answer grading on a scale of 10) and win rates (against gpt-3.5). The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MMLU scores are computed by [InstructEval](https://github.com/declare-lab/instruct-eval) and [Chain-of-Thought Hub](https://github.com/FranxYao/chain-of-thought-hub). Higher values are better for all benchmarks. Empty cells mean not available.
28
  """
29
  return leaderboard_md
30
 
@@ -173,7 +173,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file):
173
  "Model",
174
  "Arena Elo rating",
175
  "MT-bench (score)",
176
- "MT-bench (win rate %)",
177
  "MMLU",
178
  "License",
179
  ]
@@ -191,7 +190,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file):
191
 
192
  gr.Dataframe(
193
  headers=headers,
194
- datatype=["markdown", "number", "number", "number", "number", "str"],
195
  value=values,
196
  elem_id="leaderboard_dataframe",
197
  )
 
24
  - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
25
  - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
26
 
27
+ 💻 We use [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) to compute MT-bench scores (single-answer grading on a scale of 10). The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MMLU scores are computed by [InstructEval](https://github.com/declare-lab/instruct-eval) and [Chain-of-Thought Hub](https://github.com/FranxYao/chain-of-thought-hub). Higher values are better for all benchmarks. Empty cells mean not available.
28
  """
29
  return leaderboard_md
30
 
 
173
  "Model",
174
  "Arena Elo rating",
175
  "MT-bench (score)",
 
176
  "MMLU",
177
  "License",
178
  ]
 
190
 
191
  gr.Dataframe(
192
  headers=headers,
193
+ datatype=["markdown", "number", "number", "number", "str"],
194
  value=values,
195
  elem_id="leaderboard_dataframe",
196
  )