from pathlib import Path

from src.read_evals import FullEvalResult, get_raw_eval_results, get_leaderboard_df

cur_fp = Path(__file__)
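# Test fixtures live under a "toydata" directory resolved two levels above this
# file (via cur_fp.parents[2]); the exact fixture layout is assumed from the
# paths used in the tests below.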


def test_init_from_json_file():
    json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
    full_eval_result = FullEvalResult.init_from_json_file(json_fp)
    num_different_task_domain_lang_metric_dataset_combination = 6
    assert len(full_eval_result.results) == \
           num_different_task_domain_lang_metric_dataset_combination
    assert full_eval_result.retrieval_model == "bge-m3"
    assert full_eval_result.reranking_model == "bge-reranker-v2-m3"


def test_to_dict():
    json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
    full_eval_result = FullEvalResult.init_from_json_file(json_fp)
    result_list = full_eval_result.to_dict(task='qa', metric='ndcg_at_1')
    assert len(result_list) == 1
    result_dict = result_list[0]
    assert result_dict["Retrieval Model"] == "bge-m3"
    assert result_dict["Reranking Model"] == "bge-reranker-v2-m3"
    assert result_dict["wiki_en"] is not None
    assert result_dict["wiki_zh"] is not None


def test_get_raw_eval_results():
    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
    results = get_raw_eval_results(results_path)
    # only load the latest results
    assert len(results) == 4
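    # eval_name is expected to combine the retrieval and reranking model names
    # as "<retrieval_model>_<reranking_model>" (inferred from the values below)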
    assert results[0].eval_name == "bge-base-en-v1.5_NoReranker"
    assert len(results[0].results) == 70
    assert results[0].eval_name == "bge-base-en-v1.5_bge-reranker-v2-m3"
    assert len(results[1].results) == 70


def test_get_leaderboard_df():
    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
    raw_data = get_raw_eval_results(results_path)
    df = get_leaderboard_df(raw_data, 'qa', 'ndcg_at_10')
    assert df.shape[0] == 4
    # the results contain only one embedding model
    # for i in range(4):
    #     assert df["Retrieval Model"][i] == "bge-m3"
    # # the results contain only two reranking models
    # assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
    # assert df["Reranking Model"][1] == "NoReranker"
    # assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
    # assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh', ]].isnull().values.any()


def test_get_leaderboard_df_long_doc():
    results_path = cur_fp.parents[2] / "toydata" / "test_results"
    raw_data = get_raw_eval_results(results_path)
    df = get_leaderboard_df(raw_data, 'long-doc', 'ndcg_at_1')
    assert df.shape[0] == 2
    # the results contain only one embedding model
    for i in range(2):
        assert df["Retrieval Model"][i] == "bge-m3"
    # the results contain only two reranking models
    assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
    assert df["Reranking Model"][1] == "NoReranker"
    assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
    assert not df[['Average ⬆️', 'law_en_lex_files_500k_600k', ]].isnull().values.any()