add auto eval
Browse files- app.py +10 -5
- eval-results/omnieval-auto/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +34 -0
- eval-results/omnieval-auto/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +34 -0
- eval-results/omnieval-auto/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json +34 -0
- eval-results/omnieval-auto/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json +34 -0
- eval-results/omnieval-auto/bge-large-zh_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/omnieval-auto/bge-m3_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/omnieval-auto/e5-mistral-7b_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/omnieval-auto/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/omnieval-auto/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/{demo-leaderboard → omnieval-auto}/gte-qwen2-1.5b_qwen2-72b/results_2023-12-08 15:46:20.425378.json +12 -12
- eval-results/omnieval-auto/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/omnieval-auto/jina-zh_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/{demo-leaderboard → omnieval-human}/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard → omnieval-human}/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard → omnieval-human}/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard → omnieval-human}/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard/qwen2-72b_bge-large-zh → omnieval-human/bge-large-zh_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
- eval-results/{demo-leaderboard/qwen2-72b_bge-m3 → omnieval-human/bge-m3_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
- eval-results/{demo-leaderboard/qwen2-72b_e5-mistral-7b → omnieval-human/e5-mistral-7b_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
- eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard/qwen2-72b_gte-qwen2-1.5b → omnieval-human/gte-qwen2-1.5b_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
- eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard/qwen2-72b_jina-zh → omnieval-human/jina-zh_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
- src/about.py +5 -5
- src/envs.py +3 -2
- src/leaderboard/read_evals.py +1 -1
app.py
CHANGED
@@ -24,7 +24,7 @@ from src.display.utils import (
|
|
24 |
WeightType,
|
25 |
Precision
|
26 |
)
|
27 |
-
from src.envs import API, EVAL_REQUESTS_PATH,
|
28 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
29 |
from src.submission.submit import add_new_eval
|
30 |
|
@@ -41,7 +41,8 @@ try:
|
|
41 |
except Exception:
|
42 |
restart_space()
|
43 |
try:
|
44 |
-
print(
|
|
|
45 |
# snapshot_download(
|
46 |
# repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
47 |
# )
|
@@ -49,7 +50,8 @@ except Exception:
|
|
49 |
restart_space()
|
50 |
|
51 |
|
52 |
-
|
|
|
53 |
|
54 |
# (
|
55 |
# finished_eval_queue_df,
|
@@ -97,8 +99,11 @@ with demo:
|
|
97 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
98 |
|
99 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
100 |
-
with gr.TabItem("
|
101 |
-
leaderboard = init_leaderboard(
|
|
|
|
|
|
|
102 |
|
103 |
with gr.TabItem("π About", elem_id="llm-benchmark-tab-table", id=2):
|
104 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
|
24 |
WeightType,
|
25 |
Precision
|
26 |
)
|
27 |
+
from src.envs import API, EVAL_REQUESTS_PATH, AUTO_RESULTS_PATH, HUMAN_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
28 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
29 |
from src.submission.submit import add_new_eval
|
30 |
|
|
|
41 |
except Exception:
|
42 |
restart_space()
|
43 |
try:
|
44 |
+
print(AUTO_RESULTS_PATH)
|
45 |
+
print(HUMAN_RESULTS_PATH)
|
46 |
# snapshot_download(
|
47 |
# repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
48 |
# )
|
|
|
50 |
restart_space()
|
51 |
|
52 |
|
53 |
+
AUTO_LEADERBOARD_DF = get_leaderboard_df(AUTO_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
54 |
+
HUMAN_LEADERBOARD_DF = get_leaderboard_df(HUMAN_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
55 |
|
56 |
# (
|
57 |
# finished_eval_queue_df,
|
|
|
99 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
100 |
|
101 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
102 |
+
with gr.TabItem("πOmniEval-Human", elem_id="llm-benchmark-tab-table", id=0):
|
103 |
+
leaderboard = init_leaderboard(HUMAN_LEADERBOARD_DF)
|
104 |
+
|
105 |
+
with gr.TabItem("π€OmniEval-Auto", elem_id="llm-benchmark-tab-table", id=1):
|
106 |
+
leaderboard = init_leaderboard(AUTO_LEADERBOARD_DF)
|
107 |
|
108 |
with gr.TabItem("π About", elem_id="llm-benchmark-tab-table", id=2):
|
109 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
eval-results/omnieval-auto/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.0,
|
5 |
+
"map": 0.0
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.0011680767773708802,
|
9 |
+
"f1": 0.3709233008524321,
|
10 |
+
"rouge1": 0.2570830224992733,
|
11 |
+
"rouge2": 0.09085043984411759,
|
12 |
+
"rougeL": 0.1860727124152372,
|
13 |
+
"accuracy": 0.35869427958075517,
|
14 |
+
"completeness": 0.5755086661642803,
|
15 |
+
"hallucination": 0.0,
|
16 |
+
"utilization": 0.0,
|
17 |
+
"numerical_accuracy": 0.11213720316622691
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "CLOSE_deepseek-v2-chat",
|
22 |
+
"generative_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
|
23 |
+
"generative_model_args": {
|
24 |
+
"name": "deepseek-ai/DeepSeek-V2-Chat-0628",
|
25 |
+
"num_params": 236,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "CLOSE",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"num_params": 0.0,
|
31 |
+
"open_source": true
|
32 |
+
}
|
33 |
+
}
|
34 |
+
}
|
eval-results/omnieval-auto/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.0,
|
5 |
+
"map": 0.0
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.0008839499936860714,
|
9 |
+
"f1": 0.39891051266403244,
|
10 |
+
"rouge1": 0.2679937299203498,
|
11 |
+
"rouge2": 0.09293819886242284,
|
12 |
+
"rougeL": 0.19931718897529843,
|
13 |
+
"accuracy": 0.3238413941154186,
|
14 |
+
"completeness": 0.52843637454982,
|
15 |
+
"hallucination": 0.0,
|
16 |
+
"utilization": 0.0,
|
17 |
+
"numerical_accuracy": 0.06765619606489472
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "CLOSE_llama3-70b-instruct",
|
22 |
+
"generative_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
23 |
+
"generative_model_args": {
|
24 |
+
"name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
25 |
+
"num_params": 70.6,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "CLOSE",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"num_params": 0.0,
|
31 |
+
"open_source": true
|
32 |
+
}
|
33 |
+
}
|
34 |
+
}
|
eval-results/omnieval-auto/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.0,
|
5 |
+
"map": 0.0
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.0002525571410531633,
|
9 |
+
"f1": 0.32215271896313463,
|
10 |
+
"rouge1": 0.2352109086389165,
|
11 |
+
"rouge2": 0.08060449522198783,
|
12 |
+
"rougeL": 0.16073680618083347,
|
13 |
+
"accuracy": 0.37883571157974494,
|
14 |
+
"completeness": 0.6016923768159353,
|
15 |
+
"hallucination": 0.0,
|
16 |
+
"utilization": 0.0,
|
17 |
+
"numerical_accuracy": 0.1255931667193926
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "CLOSE_qwen2-72b",
|
22 |
+
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
23 |
+
"generative_model_args": {
|
24 |
+
"name": "Qwen/Qwen2.5-72B-Instruct",
|
25 |
+
"num_params": 72.7,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "CLOSE",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"num_params": 0.0,
|
31 |
+
"open_source": true
|
32 |
+
}
|
33 |
+
}
|
34 |
+
}
|
eval-results/omnieval-auto/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.0,
|
5 |
+
"map": 0.0
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.0,
|
9 |
+
"f1": 0.06725057117657031,
|
10 |
+
"rouge1": 0.1277764944666756,
|
11 |
+
"rouge2": 0.03211441875898112,
|
12 |
+
"rougeL": 0.03257144660565082,
|
13 |
+
"accuracy": 0.15734309887612072,
|
14 |
+
"completeness": 0.5063249001331558,
|
15 |
+
"hallucination": 0.0,
|
16 |
+
"utilization": 0.0,
|
17 |
+
"numerical_accuracy": 0.06932865291794647
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "CLOSE_yi15-34b",
|
22 |
+
"generative_model": "01ai/Yi-1.5-34B-Chat-16K",
|
23 |
+
"generative_model_args": {
|
24 |
+
"name": "01ai/Yi-1.5-34B-Chat-16K",
|
25 |
+
"num_params": 34.4,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "CLOSE",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"num_params": 0.0,
|
31 |
+
"open_source": true
|
32 |
+
}
|
33 |
+
}
|
34 |
+
}
|
eval-results/omnieval-auto/bge-large-zh_qwen2-72b/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.3097634381445468,
|
5 |
+
"map": 0.30402197247127166
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.0026518499810582142,
|
9 |
+
"f1": 0.2480828824153542,
|
10 |
+
"rouge1": 0.2493538725800514,
|
11 |
+
"rouge2": 0.1235656068292625,
|
12 |
+
"rougeL": 0.16098924930699862,
|
13 |
+
"accuracy": 0.3906427579239803,
|
14 |
+
"completeness": 0.5930474914396308,
|
15 |
+
"hallucination": 0.0,
|
16 |
+
"utilization": 0.5045650189122212,
|
17 |
+
"numerical_accuracy": 0.28149656401119877
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "bge-large-zh_qwen2-72b",
|
22 |
+
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
23 |
+
"generative_model_args": {
|
24 |
+
"name": "Qwen/Qwen2.5-72B-Instruct",
|
25 |
+
"num_params": 72.7,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "BAAI/bge-large-zh",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"name": "BAAI/bge-large-zh",
|
31 |
+
"num_params": 0.326,
|
32 |
+
"open_source": true
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
eval-results/omnieval-auto/bge-m3_qwen2-72b/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.33076566906595944,
|
5 |
+
"map": 0.32402765500694536
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.002525571410531633,
|
9 |
+
"f1": 0.2524796046548042,
|
10 |
+
"rouge1": 0.2542055585319881,
|
11 |
+
"rouge2": 0.12967013110722864,
|
12 |
+
"rougeL": 0.16623387811734364,
|
13 |
+
"accuracy": 0.0,
|
14 |
+
"completeness": 0.0,
|
15 |
+
"hallucination": 0.0,
|
16 |
+
"utilization": 0.0,
|
17 |
+
"numerical_accuracy": 0.0
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "bge-m3_qwen2-72b",
|
22 |
+
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
23 |
+
"generative_model_args": {
|
24 |
+
"name": "Qwen/Qwen2.5-72B-Instruct",
|
25 |
+
"num_params": 72.7,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "BAAI/bge-m3",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"name": "BAAI/bge-m3",
|
31 |
+
"num_params": 0.5,
|
32 |
+
"open_source": true
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
eval-results/omnieval-auto/e5-mistral-7b_qwen2-72b/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.26059266742433806,
|
5 |
+
"map": 0.25533526960474806
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.002146735698951888,
|
9 |
+
"f1": 0.24207930410773865,
|
10 |
+
"rouge1": 0.24073805243800728,
|
11 |
+
"rouge2": 0.1162276261848681,
|
12 |
+
"rougeL": 0.1534679545927458,
|
13 |
+
"accuracy": 0.37713095087763604,
|
14 |
+
"completeness": 0.5855007473841555,
|
15 |
+
"hallucination": 0.0,
|
16 |
+
"utilization": 0.49136152656008253,
|
17 |
+
"numerical_accuracy": 0.2582123758594347
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "e5-mistral-7b_qwen2-72b",
|
22 |
+
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
23 |
+
"generative_model_args": {
|
24 |
+
"name": "Qwen/Qwen2.5-72B-Instruct",
|
25 |
+
"num_params": 72.7,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "intfloat/e5-mistral-7b-instruct",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"name": "intfloat/e5-mistral-7b-instruct",
|
31 |
+
"num_params": 7.11,
|
32 |
+
"open_source": true
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
eval-results/omnieval-auto/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.3406848507808225,
|
5 |
+
"map": 0.3337426863661236
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.0035568464031653824,
|
9 |
+
"f1": 0.3226028700822056,
|
10 |
+
"rouge1": 0.29804464952499493,
|
11 |
+
"rouge2": 0.1619392409911174,
|
12 |
+
"rougeL": 0.21536150159516076,
|
13 |
+
"accuracy": 0.3783377209477247,
|
14 |
+
"completeness": 0.5935541629364369,
|
15 |
+
"hallucination": 0.06668379802132854,
|
16 |
+
"utilization": 0.48314821907315203,
|
17 |
+
"numerical_accuracy": 0.2761605035405193
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "gte-qwen2-1.5b_deepseek-v2-chat",
|
22 |
+
"generative_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
|
23 |
+
"generative_model_args": {
|
24 |
+
"name": "deepseek-ai/DeepSeek-V2-Chat-0628",
|
25 |
+
"num_params": 236,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
31 |
+
"num_params": 1.78,
|
32 |
+
"open_source": true
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
eval-results/omnieval-auto/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.3406848507808225,
|
5 |
+
"map": 0.3337426863661236
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.030906680136380857,
|
9 |
+
"f1": 0.4704248712273675,
|
10 |
+
"rouge1": 0.3844331865430577,
|
11 |
+
"rouge2": 0.21544656691735142,
|
12 |
+
"rougeL": 0.3082188596657867,
|
13 |
+
"accuracy": 0.4181714862987751,
|
14 |
+
"completeness": 0.586105675146771,
|
15 |
+
"hallucination": 0.0880543450397334,
|
16 |
+
"utilization": 0.45601078859491395,
|
17 |
+
"numerical_accuracy": 0.2751721876024926
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "gte-qwen2-1.5b_llama3-70b-instruct",
|
22 |
+
"generative_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
23 |
+
"generative_model_args": {
|
24 |
+
"name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
25 |
+
"num_params": 70.6,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
31 |
+
"num_params": 1.78,
|
32 |
+
"open_source": true
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
eval-results/{demo-leaderboard → omnieval-auto}/gte-qwen2-1.5b_qwen2-72b/results_2023-12-08 15:46:20.425378.json
RENAMED
@@ -1,20 +1,20 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
"retrieval": {
|
4 |
-
"mrr": 0.
|
5 |
-
"map": 0.
|
6 |
},
|
7 |
"generation": {
|
8 |
-
"em": 0.
|
9 |
-
"f1": 0.
|
10 |
-
"rouge1": 0.
|
11 |
-
"rouge2": 0.
|
12 |
-
"rougeL": 0.
|
13 |
-
"accuracy": 0.
|
14 |
-
"completeness": 0.
|
15 |
-
"hallucination": 0.
|
16 |
-
"utilization":
|
17 |
-
"numerical_accuracy": 0.
|
18 |
}
|
19 |
},
|
20 |
"config": {
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
"retrieval": {
|
4 |
+
"mrr": 0.3406848507808225,
|
5 |
+
"map": 0.3337426863661236
|
6 |
},
|
7 |
"generation": {
|
8 |
+
"em": 0.0028412678368480867,
|
9 |
+
"f1": 0.2477112059712835,
|
10 |
+
"rouge1": 0.25666135328401396,
|
11 |
+
"rouge2": 0.13256084364546591,
|
12 |
+
"rougeL": 0.1669344569228441,
|
13 |
+
"accuracy": 0.40573304710190683,
|
14 |
+
"completeness": 0.6131668895824045,
|
15 |
+
"hallucination": 0.0,
|
16 |
+
"utilization": 0.5346272891410885,
|
17 |
+
"numerical_accuracy": 0.2971301335972291
|
18 |
}
|
19 |
},
|
20 |
"config": {
|
eval-results/omnieval-auto/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.3406848507808225,
|
5 |
+
"map": 0.3337426863661236
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.0,
|
9 |
+
"f1": 0.09732568803130702,
|
10 |
+
"rouge1": 0.1642342072893325,
|
11 |
+
"rouge2": 0.06542075931397044,
|
12 |
+
"rougeL": 0.059256539829821125,
|
13 |
+
"accuracy": 0.3304375804375804,
|
14 |
+
"completeness": 0.5735068912710567,
|
15 |
+
"hallucination": 0.06555017663221248,
|
16 |
+
"utilization": 0.4132755170113409,
|
17 |
+
"numerical_accuracy": 0.175
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "gte-qwen2-1.5b_yi15-34b",
|
22 |
+
"generative_model": "01ai/Yi-1.5-34B-Chat-16K",
|
23 |
+
"generative_model_args": {
|
24 |
+
"name": "01ai/Yi-1.5-34B-Chat-16K",
|
25 |
+
"num_params": 34.4,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
31 |
+
"num_params": 1.78,
|
32 |
+
"open_source": true
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
eval-results/omnieval-auto/jina-zh_qwen2-72b/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.25315906890600665,
|
5 |
+
"map": 0.24830681483352277
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.0026518499810582142,
|
9 |
+
"f1": 0.24837825152624493,
|
10 |
+
"rouge1": 0.24111819423215256,
|
11 |
+
"rouge2": 0.11665848753826197,
|
12 |
+
"rougeL": 0.1558018779014647,
|
13 |
+
"accuracy": 0.3705644652102538,
|
14 |
+
"completeness": 0.5820335932813437,
|
15 |
+
"hallucination": 0.0,
|
16 |
+
"utilization": 0.4738984364905027,
|
17 |
+
"numerical_accuracy": 0.24648820567187915
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "jina-zh_qwen2-72b",
|
22 |
+
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
23 |
+
"generative_model_args": {
|
24 |
+
"name": "Qwen/Qwen2.5-72B-Instruct",
|
25 |
+
"num_params": 72.7,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "jinaai/jina-embeddings-v2-base-zh",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"name": "jinaai/jina-embeddings-v2-base-zh",
|
31 |
+
"num_params": 0.161,
|
32 |
+
"open_source": true
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
eval-results/{demo-leaderboard → omnieval-human}/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json
RENAMED
File without changes
|
eval-results/{demo-leaderboard → omnieval-human}/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json
RENAMED
File without changes
|
eval-results/{demo-leaderboard → omnieval-human}/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json
RENAMED
File without changes
|
eval-results/{demo-leaderboard → omnieval-human}/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json
RENAMED
File without changes
|
eval-results/{demo-leaderboard/qwen2-72b_bge-large-zh → omnieval-human/bge-large-zh_qwen2-72b}/results_2023-12-08 15:46:20.425378.json
RENAMED
@@ -18,7 +18,7 @@
|
|
18 |
}
|
19 |
},
|
20 |
"config": {
|
21 |
-
"eval_name": "
|
22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
23 |
"generative_model_args": {
|
24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
|
|
18 |
}
|
19 |
},
|
20 |
"config": {
|
21 |
+
"eval_name": "bge-large-zh_qwen2-72b",
|
22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
23 |
"generative_model_args": {
|
24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
eval-results/{demo-leaderboard/qwen2-72b_bge-m3 → omnieval-human/bge-m3_qwen2-72b}/results_2023-12-08 15:46:20.425378.json
RENAMED
@@ -18,7 +18,7 @@
|
|
18 |
}
|
19 |
},
|
20 |
"config": {
|
21 |
-
"eval_name": "
|
22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
23 |
"generative_model_args": {
|
24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
|
|
18 |
}
|
19 |
},
|
20 |
"config": {
|
21 |
+
"eval_name": "bge-m3_qwen2-72b",
|
22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
23 |
"generative_model_args": {
|
24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
eval-results/{demo-leaderboard/qwen2-72b_e5-mistral-7b → omnieval-human/e5-mistral-7b_qwen2-72b}/results_2023-12-08 15:46:20.425378.json
RENAMED
@@ -18,7 +18,7 @@
|
|
18 |
}
|
19 |
},
|
20 |
"config": {
|
21 |
-
"eval_name": "
|
22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
23 |
"generative_model_args": {
|
24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
|
|
18 |
}
|
19 |
},
|
20 |
"config": {
|
21 |
+
"eval_name": "e5-mistral-7b_qwen2-72b",
|
22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
23 |
"generative_model_args": {
|
24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json
RENAMED
File without changes
|
eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json
RENAMED
File without changes
|
eval-results/{demo-leaderboard/qwen2-72b_gte-qwen2-1.5b → omnieval-human/gte-qwen2-1.5b_qwen2-72b}/results_2023-12-08 15:46:20.425378.json
RENAMED
@@ -18,7 +18,7 @@
|
|
18 |
}
|
19 |
},
|
20 |
"config": {
|
21 |
-
"eval_name": "
|
22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
23 |
"generative_model_args": {
|
24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
|
|
18 |
}
|
19 |
},
|
20 |
"config": {
|
21 |
+
"eval_name": "gte-qwen2-1.5b_qwen2-72b",
|
22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
23 |
"generative_model_args": {
|
24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json
RENAMED
File without changes
|
eval-results/{demo-leaderboard/qwen2-72b_jina-zh → omnieval-human/jina-zh_qwen2-72b}/results_2023-12-08 15:46:20.425378.json
RENAMED
@@ -18,7 +18,7 @@
|
|
18 |
}
|
19 |
},
|
20 |
"config": {
|
21 |
-
"eval_name": "
|
22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
23 |
"generative_model_args": {
|
24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
|
|
18 |
}
|
19 |
},
|
20 |
"config": {
|
21 |
+
"eval_name": "jina-zh_qwen2-72b",
|
22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
23 |
"generative_model_args": {
|
24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
src/about.py
CHANGED
@@ -83,12 +83,12 @@ LLM_BENCHMARKS_TEXT = f"""
|
|
83 |
With FlashRAG and provided resources, you can effortlessly reproduce existing SOTA works in the RAG domain or implement your custom RAG processes and components. -->
|
84 |
|
85 |
|
86 |
-
##
|
87 |
`conda env create -f environment.yml && conda activate finrag`
|
88 |
|
89 |
-
<!-- ##
|
90 |
1. -->
|
91 |
-
##
|
92 |
Notion:
|
93 |
1. The code run path is `./OpenFinBench`
|
94 |
2. We provide our auto-generated evaluation dataset in <a href="https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets/" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-27b3b4></a>
|
@@ -136,11 +136,11 @@ Then conduct the model-based evaluate using the following codes, (change the par
|
|
136 |
sh evaluator/judgement/judger.sh
|
137 |
```
|
138 |
|
139 |
-
##
|
140 |
|
141 |
OmniEval is licensed under the [<u>MIT License</u>](./LICENSE).
|
142 |
|
143 |
-
##
|
144 |
The paper is waiting to be released!
|
145 |
|
146 |
<!-- # Check Infos
|
|
|
83 |
With FlashRAG and provided resources, you can effortlessly reproduce existing SOTA works in the RAG domain or implement your custom RAG processes and components. -->
|
84 |
|
85 |
|
86 |
+
## 🔧 Installation
|
87 |
`conda env create -f environment.yml && conda activate finrag`
|
88 |
|
89 |
+
<!-- ## ✨ Features
|
90 |
1. -->
|
91 |
+
## 🚀 Quick-Start
|
92 |
Notion:
|
93 |
1. The code run path is `./OpenFinBench`
|
94 |
2. We provide our auto-generated evaluation dataset in <a href="https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets/" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-27b3b4></a>
|
|
|
136 |
sh evaluator/judgement/judger.sh
|
137 |
```
|
138 |
|
139 |
+
## 🔖 License
|
140 |
|
141 |
OmniEval is licensed under the [<u>MIT License</u>](./LICENSE).
|
142 |
|
143 |
+
## 🌟 Citation
|
144 |
The paper is waiting to be released!
|
145 |
|
146 |
<!-- # Check Infos
|
src/envs.py
CHANGED
@@ -6,7 +6,7 @@ from huggingface_hub import HfApi
|
|
6 |
# ----------------------------------
|
7 |
TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
|
8 |
|
9 |
-
OWNER = "
|
10 |
# ----------------------------------
|
11 |
|
12 |
REPO_ID = f"{OWNER}/leaderboard"
|
@@ -18,7 +18,8 @@ CACHE_PATH=os.getenv("HF_HOME", ".")
|
|
18 |
|
19 |
# Local caches
|
20 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
21 |
-
|
|
|
22 |
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
23 |
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
24 |
|
|
|
6 |
# ----------------------------------
|
7 |
TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
|
8 |
|
9 |
+
OWNER = "RUC-NLPIR" # Change to your org - don't forget to create a results and request dataset, with the correct format!
|
10 |
# ----------------------------------
|
11 |
|
12 |
REPO_ID = f"{OWNER}/leaderboard"
|
|
|
18 |
|
19 |
# Local caches
|
20 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
21 |
+
HUMAN_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results", "omnieval-human")
|
22 |
+
AUTO_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results", "omnieval-auto")
|
23 |
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
24 |
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
25 |
|
src/leaderboard/read_evals.py
CHANGED
@@ -183,7 +183,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
|
|
183 |
def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
|
184 |
"""From the path of the results folder root, extract all needed info for results"""
|
185 |
model_result_filepaths = []
|
186 |
-
|
187 |
for root, _, files in os.walk(results_path):
|
188 |
# We should only have json files in model results
|
189 |
if len(files) == 0 or any([not f.endswith(".json") for f in files]):
|
|
|
183 |
def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
|
184 |
"""From the path of the results folder root, extract all needed info for results"""
|
185 |
model_result_filepaths = []
|
186 |
+
print(f"Reading results from {results_path}")
|
187 |
for root, _, files in os.walk(results_path):
|
188 |
# We should only have json files in model results
|
189 |
if len(files) == 0 or any([not f.endswith(".json") for f in files]):
|