Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
b33239d
1 Parent(s): d00fb74

feat: switch the default metric to ndcg_at_10

Browse files
app.py CHANGED
@@ -31,9 +31,9 @@ except Exception:
31
  raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04")
32
 
33
  original_df_qa = get_leaderboard_df(
34
- raw_data, task='qa', metric='ndcg_at_3')
35
  original_df_long_doc = get_leaderboard_df(
36
- raw_data, task='long-doc', metric='ndcg_at_3')
37
  print(f'raw data: {len(raw_data)}')
38
  print(f'QA data loaded: {original_df_qa.shape}')
39
  print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
 
31
  raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04")
32
 
33
  original_df_qa = get_leaderboard_df(
34
+ raw_data, task='qa', metric='ndcg_at_10')
35
  original_df_long_doc = get_leaderboard_df(
36
+ raw_data, task='long-doc', metric='ndcg_at_10')
37
  print(f'raw data: {len(raw_data)}')
38
  print(f'QA data loaded: {original_df_qa.shape}')
39
  print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
tests/src/leaderboard/test_read_evals.py CHANGED
@@ -41,7 +41,7 @@ def test_get_raw_eval_results():
41
  def test_get_leaderboard_df():
42
  results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
43
  raw_data = get_raw_eval_results(results_path)
44
- df = get_leaderboard_df(raw_data, 'qa', 'ndcg_at_3')
45
  assert df.shape[0] == 4
46
  # the results contain only one embedding model
47
  # for i in range(4):
 
41
  def test_get_leaderboard_df():
42
  results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
43
  raw_data = get_raw_eval_results(results_path)
44
+ df = get_leaderboard_df(raw_data, 'qa', 'ndcg_at_10')
45
  assert df.shape[0] == 4
46
  # the results contain only one embedding model
47
  # for i in range(4):