Spaces:
Running
Running
ai-forever
committed on
Commit
•
1ddbee0
1
Parent(s):
95df55c
Upload 19 files
Browse files- .gitattributes +35 -35
- .gitignore +1 -13
- README.md +16 -44
- app.py +225 -204
- datasets_config.json +1 -0
- docs/description.md +63 -0
- requirements.txt +4 -16
- results/ChatGLM2-6B-32K.json +1 -0
- results/GLM4-9B-Chat.json +1 -0
- results/GPT-4o.json +1 -0
- results/LLaMA-2-7B-32k.json +1 -0
- results/LLaMA-3-8B-Instruct.json +1 -0
- results/LLaMA-3-8B.json +1 -0
- results/LongAlpaca.json +1 -0
- results/LongChat-7B-v1.5-32k.json +1 -0
- results/Mistral-7B-Instruct-v0.3.json +1 -0
- results/Mistral-7B-v0.1.json +1 -0
- results/Mistral-7B-v0.3.json +1 -0
- results/Saiga-LLaMA-3-8B.json +1 -0
.gitattributes
CHANGED
@@ -1,35 +1,35 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.
|
29 |
-
*.
|
30 |
-
*.
|
31 |
-
*.
|
32 |
-
*.
|
33 |
-
*.
|
34 |
-
|
35 |
-
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
@@ -1,13 +1 @@
|
|
1 |
-
|
2 |
-
venv/
|
3 |
-
__pycache__/
|
4 |
-
.env
|
5 |
-
.ipynb_checkpoints
|
6 |
-
*ipynb
|
7 |
-
.vscode/
|
8 |
-
|
9 |
-
eval-queue/
|
10 |
-
eval-results/
|
11 |
-
eval-queue-bk/
|
12 |
-
eval-results-bk/
|
13 |
-
logs/
|
|
|
1 |
+
hf_token
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
@@ -1,44 +1,16 @@
|
|
1 |
-
---
|
2 |
-
title: LIBRA Leaderboard
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
-
sdk: gradio
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
```json
|
18 |
-
{
|
19 |
-
"config": {
|
20 |
-
"model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
|
21 |
-
"model_name": "path of the model on the hub: org/model",
|
22 |
-
"model_sha": "revision on the hub",
|
23 |
-
},
|
24 |
-
"results": {
|
25 |
-
"task_name": {
|
26 |
-
"metric_name": score,
|
27 |
-
},
|
28 |
-
"task_name2": {
|
29 |
-
"metric_name": score,
|
30 |
-
}
|
31 |
-
}
|
32 |
-
}
|
33 |
-
```
|
34 |
-
|
35 |
-
Request files are created automatically by this tool.
|
36 |
-
|
37 |
-
If you encounter a problem on the Space, don't hesitate to restart it to remove the eval-queue, eval-queue-bk, eval-results and eval-results-bk folders it created.
|
38 |
-
|
39 |
-
# Code logic for more complex edits
|
40 |
-
|
41 |
-
You'll find
|
42 |
-
- the main table's column names and properties in `src/display/utils.py`
|
43 |
-
- the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
|
44 |
-
- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
|
|
|
1 |
+
---
|
2 |
+
title: LIBRA Leaderboard
|
3 |
+
emoji: 🏆
|
4 |
+
colorFrom: indigo
|
5 |
+
colorTo: green
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 4.36.1
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
tags:
|
12 |
+
- leaderboard
|
13 |
+
short_description: LLM extra long context benchmark
|
14 |
+
---
|
15 |
+
|
16 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
@@ -1,204 +1,225 @@
|
|
1 |
-
import
|
2 |
-
|
3 |
-
import
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
from
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
)
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
]
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import gradio as gr
|
4 |
+
import pandas as pd
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
from collections import defaultdict
|
8 |
+
|
9 |
+
|
10 |
+
# Score keys read from every per-dataset entry of a result file: the
# aggregate score first, followed by each context-length bucket.
LENGTHS = ["dataset_total_score", "4k", "8k", "16k", "32k", "64k", "128k"]

# Dataset registry loaded from datasets_config.json. The rest of this file
# reads each entry's "name" field to title columns and tabs.
# A context manager is used so the file handle is closed deterministically.
with open("datasets_config.json", "r", encoding="utf-8") as _config_file:
    datasets_params = json.load(_config_file)

# Dataset keys in configuration order; one leaderboard tab is built per key.
TASKS = datasets_params.keys()
|
13 |
+
|
14 |
+
|
15 |
+
def make_default_md():
    """Return the Markdown heading text for the leaderboard."""
    return "LeaderBoard"
|
18 |
+
|
19 |
+
|
20 |
+
def make_model_desc_md():
    """Return the contents of ``docs/description.md`` as a string.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.

    Raises:
        FileNotFoundError: If ``docs/description.md`` does not exist relative
            to the current working directory.
    """
    with open("docs/description.md", "r", encoding="utf-8") as f:
        return f.read()
|
24 |
+
|
25 |
+
|
26 |
+
def make_overall_table_by_tasks(files):
    """Build the cross-model table with one column per dataset.

    Args:
        files: File names relative to the ``results/`` directory. Entries
            that do not end in ``.json`` are ignored.

    Returns:
        A ``pandas.DataFrame`` with a ``Model`` column, a ``Total Score``
        column and one column per dataset, titled with the dataset's
        ``"name"`` from ``datasets_params``. Scores are multiplied by 100 and
        rounded to one decimal place. Rows are sorted by ``Total Score`` in
        descending order.

    Raises:
        KeyError: If a result file contains a dataset key that is missing
            from ``datasets_params``, or if no result file was read.
    """
    results = defaultdict(list)

    for file in files:
        if not file.endswith(".json"):
            continue
        # The model name is the file name without directory or extension.
        model_name = file.split("/")[-1].split(".json")[0]
        with open("results/" + file, "r", encoding="utf-8") as fh:
            data = json.load(fh)

        results["Model"].append(model_name)
        for dataset, scores in data.items():
            if dataset == "total_score":
                # The overall score is stored as a bare number.
                results["Total Score"].append(round(scores * 100, 1))
            else:
                # Per-dataset entries are dicts; only their aggregate is shown.
                column = datasets_params[dataset]["name"]
                results[column].append(
                    round(scores["dataset_total_score"] * 100, 1)
                )

    table = pd.DataFrame(results).sort_values(["Total Score"], ascending=False)
    # Move "Total Score" right after "Model" by name rather than by a fixed
    # index, so the layout keeps working when datasets are added or removed.
    leading = ["Model", "Total Score"]
    cols = leading + [col for col in table.columns if col not in leading]
    return table[cols]
|
56 |
+
|
57 |
+
|
58 |
+
def make_overall_table_by_lengths(files):
    """Build the cross-model table with one column per context length.

    For every model, the scores of all datasets are grouped by context-length
    key and averaged; each dataset's ``dataset_total_score`` entry is skipped.

    Args:
        files: File names relative to the ``results/`` directory. Entries
            that do not end in ``.json`` are ignored.

    Returns:
        A ``pandas.DataFrame`` with a ``Model`` column, a ``Total Score``
        column and one column per context-length key found in the result
        files. Values are multiplied by 100 and rounded to one decimal place.
        Rows are sorted by ``Total Score`` in descending order.
    """
    results = defaultdict(list)

    for file in files:
        if not file.endswith(".json"):
            continue
        # The model name is the file name without directory or extension.
        model_name = file.split("/")[-1].split(".json")[0]
        with open("results/" + file, "r", encoding="utf-8") as fh:
            data = json.load(fh)

        # Maps "total_score" to its scalar and every length key to the list
        # of scores collected across datasets, in first-seen order.
        collected = {}
        for dataset, values in data.items():
            if dataset == "total_score":
                collected[dataset] = values
                continue
            for length, score in values.items():
                if length == "dataset_total_score":
                    continue
                collected.setdefault(length, []).append(score)

        results["Model"].append(model_name)
        for key, value in collected.items():
            column = "Total Score" if key == "total_score" else key
            # np.mean of the scalar total score is the score itself, so one
            # formula covers both the per-length lists and the total.
            results[column].append(round(np.mean(value) * 100, 1))

    table = pd.DataFrame(results).sort_values(["Total Score"], ascending=False)
    # Move "Total Score" right after "Model" by name rather than by a fixed
    # index, so the layout does not depend on how many lengths exist.
    leading = ["Model", "Total Score"]
    cols = leading + [col for col in table.columns if col not in leading]
    return table[cols]
|
96 |
+
|
97 |
+
|
98 |
+
def load_model(files, tab_name):
    """Build the per-dataset table: one row per model, one column per length.

    Args:
        files: File names relative to the ``results/`` directory. Entries
            that do not end in ``.json`` are ignored.
        tab_name: Dataset key looked up in every result file.

    Returns:
        A ``pandas.DataFrame`` with a ``Model`` column, a ``Dataset Total
        Score`` column and one column per remaining entry of ``LENGTHS``.
        Scores are multiplied by 100 and rounded to one decimal place; ``"-"``
        marks a key the result file does not provide for this dataset. Rows
        are sorted by ``Dataset Total Score`` in descending order, with rows
        lacking a numeric total placed last.

    Raises:
        KeyError: If a result file has no entry for ``tab_name``, or if no
            result file was read.
    """
    results = defaultdict(list)

    for file in files:
        if not file.endswith(".json"):
            continue
        with open("results/" + file, "r", encoding="utf-8") as fh:
            scores = json.load(fh)[tab_name]

        # The model name is the file name without directory or extension.
        results["Model"].append(file.split("/")[-1].split(".json")[0])
        for length in LENGTHS:
            # Resolve the display column first so that a value and its "-"
            # placeholder always land in the same column.
            if length == "dataset_total_score":
                column = "Dataset Total Score"
            else:
                column = length
            if length in scores:
                results[column].append(round(scores[length] * 100, 1))
            else:
                results[column].append("-")

    # Sort on a numeric view of the column: "-" becomes NaN and goes last
    # instead of breaking the comparison between strings and floats.
    return pd.DataFrame(results).sort_values(
        ["Dataset Total Score"],
        ascending=False,
        key=lambda col: pd.to_numeric(col, errors="coerce"),
        na_position="last",
    )
|
116 |
+
|
117 |
+
|
118 |
+
def _leaderboard_dataframe(df, wrap):
    """Create a ``gr.Dataframe`` for *df* with headers taken from its columns."""
    headers = df.columns.tolist()
    return gr.Dataframe(
        headers=headers,
        # The first column holds the model name; every other column is a score.
        datatype=["markdown"] + ["str"] * (len(headers) - 1),
        value=df,
        elem_id="arena_leaderboard_dataframe",
        height=700,
        wrap=wrap,
    )


def build_leaderboard_tab(files):
    """Create the leaderboard heading and all of its tabs.

    Must be called inside an active ``gr.Blocks`` context. The tabs are, in
    order: "Results by Lengths", "Results by Tasks", one tab per key of
    ``TASKS`` (titled with the dataset's ``"name"``) and "Description".

    Headers and datatypes of every table are derived from the DataFrame being
    displayed, so they stay in sync with the actual columns instead of relying
    on hard-coded lists.

    Args:
        files: Result file names, passed unchanged to the table builders.

    Returns:
        A one-element list containing the heading ``gr.Markdown`` component.
    """
    md_1 = gr.Markdown(make_default_md(), elem_id="leaderboard_markdown")

    with gr.Tabs():
        with gr.Tab("Results by Lengths", id=0):
            _leaderboard_dataframe(make_overall_table_by_lengths(files), wrap=True)

        with gr.Tab("Results by Tasks", id=1):
            _leaderboard_dataframe(make_overall_table_by_tasks(files), wrap=False)

        # Tab ids 0 and 1 are taken above, so dataset tabs start at 2.
        task_names = list(TASKS)
        for tab_id, tab_name in enumerate(task_names, start=2):
            df = load_model(files, tab_name)
            with gr.Tab(datasets_params[tab_name]["name"], id=tab_id):
                _leaderboard_dataframe(df, wrap=True)

        # Computed from the task count rather than the loop variable, which
        # is undefined when there are no tasks.
        with gr.Tab("Description", id=len(task_names) + 2):
            gr.Markdown(make_model_desc_md(), elem_id="leaderboard_markdown")

    return [md_1]
|
211 |
+
|
212 |
+
|
213 |
+
def build_demo(files):
    """Assemble and return the Gradio ``Blocks`` app for the leaderboard.

    Args:
        files: Result file names, forwarded to ``build_leaderboard_tab``.

    Returns:
        The ``gr.Blocks`` instance, not yet launched.
    """
    theme = gr.themes.Base(text_size=gr.themes.sizes.text_lg)
    with gr.Blocks(title="LIBRA leaderboard", theme=theme) as demo:
        build_leaderboard_tab(files)
    return demo
|
220 |
+
|
221 |
+
|
222 |
+
if __name__ == "__main__":
    # Every entry of results/ is passed on; the table builders skip
    # anything that is not a JSON file.
    files = os.listdir("results")
    demo = build_demo(files)
    # Serve the app without creating a public share link.
    demo.launch(share=False)
|
datasets_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"passkey": {"name": "Passkey", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "matreshka_yes_no": {"name": "MatreshkaYesNo", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "matreshka_names": {"name": "MatreshkaNames", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "passkey_with_librusec": {"name": "PasskeyWithLibrusec", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "librusec_history": {"name": "LibrusecHistory", "lengths": ["8k", "16k", "32k", "64k"]}, "ru_gsm100": {"name": "ruGSM100", "lengths": ["16k"]}, "ru_sci_passage_count": {"name": "ruSciPassageCount", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "ru_2wikimultihopqa": {"name": "ru2WikiMultihopQA", "lengths": ["8k", "16k", "32k"]}, "long_context_multiq": {"name": "LongContextMultiQ", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "ru_sci_abstract_retrieval": {"name": "ruSciAbstractRetrieval", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "ru_trec": {"name": "ruTREC", "lengths": ["4k", "8k", "16k", "32k"]}, "ru_sci_fi": {"name": "ruSciFi", "lengths": ["32k", "64k"]}, "librusec_mhqa": {"name": "LibrusecMHQA", "lengths": ["8k"]}, "ru_babilong_qa1": {"name": "ruBABILongQA1", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "ru_babilong_qa2": {"name": "ruBABILongQA2", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "ru_babilong_qa3": {"name": "ruBABILongQA3", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "ru_babilong_qa4": {"name": "ruBABILongQA4", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "ru_babilong_qa5": {"name": "ruBABILongQA5", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "ru_quality": {"name": "ruQuALITY", "lengths": ["8k", "16k"]}, "ru_tpo": {"name": "ruTPO", "lengths": ["8k"]}, "ru_qasper": {"name": "ruQasper", "lengths": ["8k", "16k", "32k"]}}
|
docs/description.md
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# LIBRA: Long Input Benchmark for Russian Analysis
|
2 |
+
|
3 |
+
<img src="https://i.imgur.com/BNleRrG.png" width="800" />
|
4 |
+
|
5 |
+
## Dataset Summary
|
6 |
+
|
7 |
+
LIBRA (Long Input Benchmark for Russian Analysis) is designed to evaluate the capabilities of large language models (LLMs) in understanding and processing long texts in Russian. This benchmark includes 21 datasets adapted for different tasks and complexities. The tasks are divided into four complexity groups and allow evaluation across various context lengths ranging from 4,000 up to 128,000 tokens.
|
8 |
+
|
9 |
+
## Tasks and Complexity Groups
|
10 |
+
|
11 |
+
### Group I: Simple Information Retrieval
|
12 |
+
- **Passkey**: Extract a specific code number (the passkey) from a long text fragment.
|
13 |
+
- **PasskeyWithLibrusec**: Similar to Passkey but with added noise from Librusec texts.
|
14 |
+
|
15 |
+
### Group II: Question Answering and Multiple Choice
|
16 |
+
- **MatreshkaNames**: Identify the person in dialogues based on the discussed topic.
|
17 |
+
- **MatreshkaYesNo**: Indicate whether a specific topic was mentioned in the dialog.
|
18 |
+
- **LibrusecHistory**: Answer questions based on historical texts.
|
19 |
+
- **ruTREC**: Few-shot in-context learning for topic classification. Created by translating the TREC dataset from LongBench.
|
20 |
+
- **ruSciFi**: Answer true/false based on context and general world knowledge. Translation of SciFi dataset from L-Eval.
|
21 |
+
- **ruSciAbstractRetrieval**: Retrieve relevant paragraphs from scientific abstracts.
|
22 |
+
- **ruTPO**: Multiple-choice questions similar to TOEFL exams. Translation of the TPO dataset from L-Eval.
|
23 |
+
- **ruQuALITY**: Multiple-choice QA tasks based on detailed texts. Created by translating the QuALITY dataset from L-Eval.
|
24 |
+
|
25 |
+
### Group III: Multi-hop Question Answering
|
26 |
+
- **ruBABILongQA**: 5 long-context reasoning tasks for QA using facts hidden among irrelevant information.
|
27 |
+
- **LongContextMultiQ**: Multi-hop QA based on Wikidata and Wikipedia.
|
28 |
+
- **LibrusecMHQA**: Multi-hop QA requiring information distributed across several text parts.
|
29 |
+
- **ru2WikiMultihopQA**: Translation of the 2WikiMultihopQA dataset from LongBench.
|
30 |
+
|
31 |
+
### Group IV: Complex Reasoning and Mathematical Problems
|
32 |
+
- **ruSciPassageCount**: Count unique paragraphs in a long text.
|
33 |
+
- **ruQasper**: Question Answering over academic research papers. Created by translating the Qasper dataset from LongBench.
|
34 |
+
- **ruGSM100**: Solve math problems using Chain-of-Thought reasoning.
|
35 |
+
|
36 |
+
## Dataset Structure
|
37 |
+
|
38 |
+
The datasets are divided into subsets based on context lengths: 4k, 8k, 16k, 32k, 64k, and 128k tokens. Each subset contains a different number of samples depending on the task complexity.
|
39 |
+
|
40 |
+
## Usage
|
41 |
+
|
42 |
+
The LIBRA benchmark is available under the MIT license. Researchers and developers can use these datasets to evaluate the long-context understanding abilities of various LLMs. The datasets, codebase, and public leaderboard are open-source to guide forthcoming research in this area.
|
43 |
+
|
44 |
+
## Citation
|
45 |
+
|
46 |
+
_TODO_
|
47 |
+
|
48 |
+
@article{LIBRA2024,
|
49 |
+
title={Long Input Benchmark for Russian Analysis},
|
50 |
+
author={Anonymous},
|
51 |
+
journal={ACL},
|
52 |
+
year={2024}
|
53 |
+
}
|
54 |
+
|
55 |
+
## License
|
56 |
+
|
57 |
+
The datasets are published under the MIT license.
|
58 |
+
|
59 |
+
## Acknowledgments
|
60 |
+
|
61 |
+
_TODO_
|
62 |
+
|
63 |
+
For more details and code, please visit our [GitHub repository](#).
|
requirements.txt
CHANGED
@@ -1,16 +1,4 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
gradio[oauth]
|
6 |
-
gradio_leaderboard==0.0.9
|
7 |
-
gradio_client
|
8 |
-
huggingface-hub>=0.18.0
|
9 |
-
matplotlib
|
10 |
-
numpy
|
11 |
-
pandas
|
12 |
-
python-dateutil
|
13 |
-
tqdm
|
14 |
-
transformers
|
15 |
-
tokenizers>=0.15.0
|
16 |
-
sentencepiece
|
|
|
1 |
+
plotly
|
2 |
+
gradio
|
3 |
+
numpy
|
4 |
+
pandas
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/ChatGLM2-6B-32K.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"passkey": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 0.82, "64k": 0, "128k": 0, "dataset_total_score": 0.6366666666666666}, "matreshka_yes_no": {"4k": 0.5016722408026756, "8k": 0.5, "16k": 0.5, "32k": 0.5, "64k": 0, "128k": 0, "dataset_total_score": 0.33361204013377926}, "matreshka_names": {"4k": 0.04666666666666667, "8k": 0.02666666666666667, "16k": 0.006666666666666667, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.013333333333333334}, "passkey_with_librusec": {"4k": 0.99, "8k": 0.995, "16k": 0.985, "32k": 0.93, "64k": 0, "128k": 0, "dataset_total_score": 0.65}, "librusec_history": {"8k": 0.21875, "16k": 0.09375, "32k": 0.03125, "64k": 0, "dataset_total_score": 0.0859375}, "ru_gsm100": {"16k": 0.05, "dataset_total_score": 0.05}, "ru_sci_passage_count": {"4k": 0.09, "8k": 0.06, "16k": 0.07, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.03666666666666667}, "ru_2wikimultihopqa": {"8k": 0.1836734693877551, "16k": 0.21875, "32k": 0.12195121951219512, "dataset_total_score": 0.17479156296665008}, "long_context_multiq": {"8k": 0.05, "16k": 0.015, "4k": 0.005, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.011666666666666667}, "ru_sci_abstract_retrieval": {"4k": 0.4166666666666667, "8k": 0.21887180280037422, "16k": 0.1810637996020305, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.13610037817817858}, "ru_trec": {"4k": 0.05405405405405406, "8k": 0.04, "16k": 0.04395604395604396, "32k": 0.040983606557377046, "dataset_total_score": 0.04474842614186877}, "ru_sci_fi": {"32k": 0.0, "64k": 0, "dataset_total_score": 0.0}, "librusec_mhqa": {"8k": 0.06770833333333333, "dataset_total_score": 0.06770833333333333}, "ru_babilong_qa1": {"4k": 0.27, "8k": 0.23, "16k": 0.23, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.12166666666666666}, "ru_babilong_qa2": {"4k": 0.05, "8k": 0.04, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.015}, "ru_babilong_qa3": {"4k": 0.07, "8k": 0.03, "16k": 0.04, "32k": 0.01, 
"64k": 0, "128k": 0, "dataset_total_score": 0.025000000000000005}, "ru_babilong_qa4": {"4k": 0.02, "8k": 0.018000000000000002, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.006333333333333334}, "ru_babilong_qa5": {"4k": 0.2, "8k": 0.18, "16k": 0.15, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.08833333333333333}, "ru_quality": {"16k": 0.43892339544513453, "8k": 0.5447154471544716, "dataset_total_score": 0.49181942129980305}, "ru_tpo": {"8k": 0.28950863213811423, "dataset_total_score": 0.28950863213811423}, "ru_qasper": {"16k": 0.03254246612062729, "8k": 0.03544011573751087, "32k": 0.009411737687136327, "dataset_total_score": 0.025798106515091495}, "total_score": 0.15736624130349933}
|
results/GLM4-9B-Chat.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"passkey": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 1.0, "128k": 1.0, "dataset_total_score": 1.0}, "matreshka_yes_no": {"4k": 0.7926421404682275, "8k": 0.75, "16k": 0.7133333333333334, "32k": 0.67, "64k": 0.5966666666666667, "128k": 0.56, "dataset_total_score": 0.6804403567447045}, "matreshka_names": {"4k": 0.6466666666666666, "8k": 0.5066666666666667, "16k": 0.52, "32k": 0.47333333333333333, "64k": 0.37333333333333335, "128k": 0.32, "dataset_total_score": 0.47333333333333333}, "passkey_with_librusec": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 1.0, "128k": 1.0, "dataset_total_score": 1.0}, "librusec_history": {"8k": 0.84375, "16k": 0.84375, "32k": 0.84375, "64k": 0.75, "dataset_total_score": 0.8203125}, "ru_gsm100": {"16k": 0.08, "dataset_total_score": 0.08}, "ru_sci_passage_count": {"4k": 0.27, "8k": 0.08, "16k": 0.09, "32k": 0.0, "64k": 0.01, "128k": 0.0, "dataset_total_score": 0.07500000000000001}, "ru_2wikimultihopqa": {"8k": 0.5510204081632653, "16k": 0.5546875, "32k": 0.35772357723577236, "dataset_total_score": 0.4878104951330125}, "long_context_multiq": {"8k": 0.265, "16k": 0.035, "4k": 0.055, "64k": 0.005, "32k": 0.005, "128k": 0.1, "dataset_total_score": 0.0775}, "ru_sci_abstract_retrieval": {"4k": 0.9819047619047618, "8k": 0.923411865911866, "16k": 0.9122101461259002, "32k": 0.8189266620312142, "64k": 0.6411071734029656, "128k": 0.3908439729202464, "dataset_total_score": 0.7780674303828258}, "ru_trec": {"4k": 0.5675675675675675, "8k": 0.7, "16k": 0.7582417582417582, "32k": 0.7704918032786885, "dataset_total_score": 0.6990752822720037}, "ru_sci_fi": {"32k": 0.3888888888888889, "64k": 0.42857142857142855, "dataset_total_score": 0.4087301587301587}, "librusec_mhqa": {"8k": 0.4453125, "dataset_total_score": 0.4453125}, "ru_babilong_qa1": {"4k": 0.699375, "8k": 0.59, "16k": 0.6, "32k": 0.508125, "64k": 0.429375, "128k": 0.42, "dataset_total_score": 0.5411458333333333}, "ru_babilong_qa2": {"4k": 0.389375, "8k": 0.33, "16k": 0.299375, 
"32k": 0.2693333333333333, "64k": 0.2675, "128k": 0.23491666666666666, "dataset_total_score": 0.29841666666666666}, "ru_babilong_qa3": {"4k": 0.24598809523809526, "8k": 0.2792380952380953, "16k": 0.21408333333333335, "32k": 0.2264761904761905, "64k": 0.18666666666666668, "128k": 0.18545833333333334, "dataset_total_score": 0.22298511904761908}, "ru_babilong_qa4": {"4k": 0.6207142857142857, "8k": 0.5964285714285714, "16k": 0.5657142857142857, "32k": 0.58, "64k": 0.43, "128k": 0.37714285714285717, "dataset_total_score": 0.5283333333333334}, "ru_babilong_qa5": {"4k": 0.7300000000000001, "8k": 0.7350000000000001, "16k": 0.7200000000000002, "32k": 0.6683333333333334, "64k": 0.6966666666666668, "128k": 0.6700000000000002, "dataset_total_score": 0.7033333333333335}, "ru_quality": {"16k": 0.6521739130434783, "8k": 0.8292682926829268, "dataset_total_score": 0.7407211028632026}, "ru_tpo": {"8k": 0.8685258964143426, "dataset_total_score": 0.8685258964143426}, "ru_qasper": {"16k": 0.05927748156784547, "8k": 0.06532155413695329, "32k": 0.025813608477215297, "dataset_total_score": 0.050137548060671354}, "total_score": 0.5228181376023114}
|
results/GPT-4o.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"passkey": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 1.0, "128k": 1.0, "dataset_total_score": 1.0}, "matreshka_yes_no": {"4k": 0.8, "8k": 0.6, "16k": 1.0, "32k": 0.8, "64k": 0.7, "128k": 0.9, "dataset_total_score": 0.8000000000000002}, "matreshka_names": {"4k": 0.6, "8k": 0.6, "16k": 0.5, "32k": 0.4, "64k": 0.5, "128k": 0.5, "dataset_total_score": 0.5166666666666667}, "passkey_with_librusec": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 1.0, "128k": 1.0, "dataset_total_score": 1.0}, "librusec_history": {"8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 0.9, "dataset_total_score": 0.975}, "ru_gsm100": {"16k": 1.0, "dataset_total_score": 1.0}, "ru_sci_passage_count": {"4k": 1.0, "8k": 0.5, "16k": 0.3, "32k": 0.0, "64k": 0.2, "128k": 0.1, "dataset_total_score": 0.35000000000000003}, "ru_2wikimultihopqa": {"8k": 0.8, "16k": 0.8, "32k": 0.7, "dataset_total_score": 0.7666666666666666}, "long_context_multiq": {"4k": 0.3, "8k": 1.0, "16k": 0.7, "32k": 0.0, "64k": 0.1, "128k": 0.1, "dataset_total_score": 0.3666666666666667}, "ru_sci_abstract_retrieval": {"4k": 0.99, "8k": 0.9541666666666668, "16k": 0.9254479578392623, "32k": 0.9562564463343153, "64k": 0.590978869808793, "128k": 0.19764315322255238, "dataset_total_score": 0.7690821823119315}, "ru_trec": {"4k": 0.6, "8k": 0.8, "16k": 0.9, "32k": 0.7, "dataset_total_score": 0.75}, "ru_sci_fi": {"32k": 0.6, "64k": 0.9, "dataset_total_score": 0.75}, "librusec_mhqa": {"8k": 0.5, "dataset_total_score": 0.5}, "ru_babilong_qa1": {"4k": 0.9, "8k": 0.8, "16k": 0.7, "32k": 0.9, "64k": 0.8, "128k": 0.6, "dataset_total_score": 0.7833333333333333}, "ru_babilong_qa2": {"4k": 0.4, "8k": 0.3, "16k": 0.4, "32k": 0.4, "64k": 0.5, "128k": 0.2, "dataset_total_score": 0.3666666666666667}, "ru_babilong_qa3": {"4k": 0.2, "8k": 0.3, "16k": 0.1, "32k": 0.2, "64k": 0.2, "128k": 0.2866666666666667, "dataset_total_score": 0.21444444444444444}, "ru_babilong_qa4": {"4k": 0.8800000000000001, "8k": 0.8, "16k": 0.8, "32k": 0.5714285714285715, 
"64k": 0.8857142857142858, "128k": 0.8, "dataset_total_score": 0.7895238095238096}, "ru_babilong_qa5": {"4k": 0.8666666666666666, "8k": 0.8666666666666666, "16k": 0.9333333333333333, "32k": 0.9666666666666666, "64k": 0.8666666666666668, "128k": 0.9, "dataset_total_score": 0.9}, "ru_quality": {"8k": 0.8, "16k": 0.8666666666666668, "dataset_total_score": 0.8333333333333335}, "ru_tpo": {"8k": 1.0, "dataset_total_score": 1.0}, "ru_qasper": {"8k": 0.2865100250626566, "16k": 0.3184757236227824, "32k": 0.3465384615384616, "dataset_total_score": 0.3171747367413002}, "total_score": 0.70231230982642}
|
results/LLaMA-2-7B-32k.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"passkey": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 0, "128k": 0, "dataset_total_score": 0.6666666666666666}, "matreshka_yes_no": {"4k": 0.5016722408026756, "8k": 0.5, "16k": 0.5, "32k": 0.5, "64k": 0, "128k": 0, "dataset_total_score": 0.33361204013377926}, "matreshka_names": {"4k": 0.08, "8k": 0.06666666666666667, "16k": 0.02, "32k": 0.04, "64k": 0, "128k": 0, "dataset_total_score": 0.034444444444444444}, "passkey_with_librusec": {"4k": 1.0, "8k": 0.975, "16k": 0.985, "32k": 0.97, "64k": 0, "128k": 0, "dataset_total_score": 0.6549999999999999}, "librusec_history": {"8k": 0.6875, "16k": 0.5, "32k": 0.4375, "64k": 0, "dataset_total_score": 0.40625}, "ru_gsm100": {"16k": 0.07, "dataset_total_score": 0.07}, "ru_sci_passage_count": {"4k": 0.18, "8k": 0.05, "16k": 0.05, "32k": 0.005, "64k": 0, "128k": 0, "dataset_total_score": 0.047499999999999994}, "ru_2wikimultihopqa": {"8k": 0.4489795918367347, "16k": 0.3984375, "32k": 0.2682926829268293, "dataset_total_score": 0.37190325825452136}, "long_context_multiq": {"8k": 0.33, "16k": 0.1, "4k": 0.045, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.07916666666666666}, "ru_sci_abstract_retrieval": {"4k": 0.8519047619047619, "8k": 0.7612582259010829, "16k": 0.4675868475624726, "32k": 0.2665782908189807, "64k": 0, "128k": 0, "dataset_total_score": 0.39122135436454974}, "ru_trec": {"4k": 0.24324324324324326, "8k": 0.18, "16k": 0.24175824175824176, "32k": 0.28688524590163933, "dataset_total_score": 0.2379716827257811}, "ru_sci_fi": {"32k": 0.1111111111111111, "64k": 0, "dataset_total_score": 0.05555555555555555}, "librusec_mhqa": {"8k": 0.2760416666666667, "dataset_total_score": 0.2760416666666667}, "ru_babilong_qa1": {"4k": 0.6, "8k": 0.66, "16k": 0.66, "32k": 0.5, "64k": 0, "128k": 0, "dataset_total_score": 0.4033333333333333}, "ru_babilong_qa2": {"4k": 0.25, "8k": 0.3, "16k": 0.25875, "32k": 0.19, "64k": 0, "128k": 0, "dataset_total_score": 0.16645833333333335}, "ru_babilong_qa3": {"4k": 
0.22933333333333333, "8k": 0.28933333333333333, "16k": 0.26, "32k": 0.2, "64k": 0, "128k": 0, "dataset_total_score": 0.1631111111111111}, "ru_babilong_qa4": {"4k": 0.31, "8k": 0.34, "16k": 0.23, "32k": 0.12, "64k": 0, "128k": 0, "dataset_total_score": 0.16666666666666666}, "ru_babilong_qa5": {"4k": 0.59, "8k": 0.66, "16k": 0.64, "32k": 0.69, "64k": 0, "128k": 0, "dataset_total_score": 0.43}, "ru_quality": {"16k": 0.13871635610766048, "8k": 0.17073170731707318, "dataset_total_score": 0.15472403171236682}, "ru_tpo": {"8k": 0.5431606905710492, "dataset_total_score": 0.5431606905710492}, "ru_qasper": {"16k": 0.05999038960490889, "8k": 0.0580459343880765, "32k": 0.022401950361811175, "dataset_total_score": 0.04681275811826552}, "total_score": 0.2714095362059408}
|
results/LLaMA-3-8B-Instruct.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"passkey": {"4k": 1.0, "8k": 1.0, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.3333333333333333}, "matreshka_yes_no": {"4k": 0.8394648829431438, "8k": 0.8, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.27324414715719064}, "matreshka_names": {"4k": 0.5333333333333333, "8k": 0.46, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.16555555555555557}, "passkey_with_librusec": {"4k": 1.0, "8k": 1.0, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.3333333333333333}, "librusec_history": {"8k": 0.90625, "16k": 0, "32k": 0, "64k": 0, "dataset_total_score": 0.2265625}, "ru_gsm100": {"16k": 0, "dataset_total_score": 0.0}, "ru_sci_passage_count": {"4k": 0.31, "8k": 0.08, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.065}, "ru_2wikimultihopqa": {"8k": 0.5306122448979592, "16k": 0, "32k": 0, "dataset_total_score": 0.17687074829931973}, "long_context_multiq": {"8k": 0.245, "4k": 0.05, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.049166666666666664}, "ru_sci_abstract_retrieval": {"4k": 0.9663095238095238, "8k": 0.9151886869744013, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.31358303513065416}, "ru_trec": {"4k": 0.5945945945945946, "8k": 0.5, "16k": 0, "32k": 0, "dataset_total_score": 0.2736486486486487}, "ru_sci_fi": {"32k": 0, "64k": 0, "dataset_total_score": 0.0}, "librusec_mhqa": {"8k": 0.4609375, "dataset_total_score": 0.4609375}, "ru_babilong_qa1": {"4k": 0.6862083333333334, "8k": 0.7335416666666665, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.236625}, "ru_babilong_qa2": {"4k": 0.14, "8k": 0.10866666666666668, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.04144444444444445}, "ru_babilong_qa3": {"4k": 0.09, "8k": 0.1798095238095238, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.04496825396825397}, "ru_babilong_qa4": {"4k": 0.5725714285714285, "8k": 0.6034285714285714, "16k": 0, 
"32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.19599999999999998}, "ru_babilong_qa5": {"4k": 0.7666666666666667, "8k": 0.7516666666666667, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.2530555555555556}, "ru_quality": {"8k": 0.6910569105691058, "16k": 0, "dataset_total_score": 0.3455284552845529}, "ru_tpo": {"8k": 0.7808764940239044, "dataset_total_score": 0.7808764940239044}, "ru_qasper": {"8k": 0.06525418503550595, "16k": 0, "32k": 0, "dataset_total_score": 0.021751395011835317}, "total_score": 0.21864214601967852}
|
results/LLaMA-3-8B.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"passkey": {"4k": 1.0, "8k": 1.0, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.3333333333333333}, "matreshka_yes_no": {"4k": 0.6220735785953178, "8k": 0.59, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.2020122630992196}, "matreshka_names": {"4k": 0.4, "8k": 0.2, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.10000000000000002}, "passkey_with_librusec": {"4k": 1.0, "8k": 1.0, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.3333333333333333}, "librusec_history": {"8k": 0.90625, "16k": 0, "32k": 0, "64k": 0, "dataset_total_score": 0.2265625}, "ru_gsm100": {"16k": 0, "dataset_total_score": 0.0}, "ru_sci_passage_count": {"4k": 0.15, "8k": 0.05, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.03333333333333333}, "ru_2wikimultihopqa": {"8k": 0.5510204081632653, "16k": 0, "32k": 0, "dataset_total_score": 0.18367346938775508}, "long_context_multiq": {"8k": 0.325, "4k": 0.095, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.07}, "ru_sci_abstract_retrieval": {"4k": 0.9711111111111111, "8k": 0.8806912531912532, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.3086337273837274}, "ru_trec": {"4k": 0.3783783783783784, "8k": 0.38, "16k": 0, "32k": 0, "dataset_total_score": 0.1895945945945946}, "ru_sci_fi": {"32k": 0, "64k": 0, "dataset_total_score": 0.0}, "librusec_mhqa": {"8k": 0.4140625, "dataset_total_score": 0.4140625}, "ru_babilong_qa1": {"4k": 0.68, "8k": 0.57, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.20833333333333334}, "ru_babilong_qa2": {"4k": 0.27, "8k": 0.19, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.07666666666666667}, "ru_babilong_qa3": {"4k": 0.28470833333333334, "8k": 0.25866666666666666, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.09056249999999999}, "ru_babilong_qa4": {"4k": 0.5838571428571429, "8k": 0.5642857142857143, "16k": 0, "32k": 0, "64k": 0, "128k": 0, 
"dataset_total_score": 0.19135714285714286}, "ru_babilong_qa5": {"4k": 0.6716666666666667, "8k": 0.6866666666666668, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.2263888888888889}, "ru_quality": {"8k": 0.17073170731707318, "16k": 0, "dataset_total_score": 0.08536585365853659}, "ru_tpo": {"8k": 0.5816733067729084, "dataset_total_score": 0.5816733067729084}, "ru_qasper": {"8k": 0.06526998039125843, "16k": 0, "32k": 0, "dataset_total_score": 0.021756660130419478}, "total_score": 0.18460206698919968}
|
results/LongAlpaca.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"passkey": {"4k": 0.775, "8k": 0.825, "16k": 0.575, "32k": 0.37, "64k": 0, "128k": 0, "dataset_total_score": 0.42416666666666664}, "matreshka_yes_no": {"4k": 0.4782608695652174, "8k": 0.3933333333333333, "16k": 0.48, "32k": 0.4766666666666667, "64k": 0, "128k": 0, "dataset_total_score": 0.30471014492753623}, "matreshka_names": {"4k": 0.013333333333333334, "8k": 0.006666666666666667, "16k": 0.006666666666666667, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.0044444444444444444}, "passkey_with_librusec": {"4k": 0.71, "8k": 0.7, "16k": 0.56, "32k": 0.465, "64k": 0, "128k": 0, "dataset_total_score": 0.4058333333333333}, "librusec_history": {"8k": 0.1875, "16k": 0.15625, "32k": 0.1875, "64k": 0, "dataset_total_score": 0.1328125}, "ru_gsm100": {"16k": 0.02, "dataset_total_score": 0.02}, "ru_sci_passage_count": {"4k": 0.13, "8k": 0.05, "16k": 0.02, "32k": 0.03, "64k": 0, "128k": 0, "dataset_total_score": 0.03833333333333333}, "ru_2wikimultihopqa": {"8k": 0.40816326530612246, "16k": 0.2890625, "32k": 0.21138211382113822, "dataset_total_score": 0.3028692930424202}, "long_context_multiq": {"8k": 0.015, "16k": 0.0, "4k": 0.03, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.0075}, "ru_sci_abstract_retrieval": {"4k": 0.6496428571428572, "8k": 0.44704965669251384, "16k": 0.2043098561917367, "32k": 0.11199263409419845, "64k": 0, "128k": 0, "dataset_total_score": 0.23549916735355103}, "ru_trec": {"4k": 0.0, "8k": 0.02, "16k": 0.0, "32k": 0.0, "dataset_total_score": 0.005}, "ru_sci_fi": {"32k": 0.027777777777777776, "64k": 0, "dataset_total_score": 0.013888888888888888}, "librusec_mhqa": {"8k": 0.078125, "dataset_total_score": 0.078125}, "ru_babilong_qa1": {"4k": 0.09, "8k": 0.06, "16k": 0.06, "32k": 0.02, "64k": 0, "128k": 0, "dataset_total_score": 0.03833333333333333}, "ru_babilong_qa2": {"4k": 0.01, "8k": 0.01, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.0033333333333333335}, "ru_babilong_qa3": {"4k": 0.05, "8k": 0.09, "16k": 
0.04, "32k": 0.029333333333333336, "64k": 0, "128k": 0, "dataset_total_score": 0.03488888888888889}, "ru_babilong_qa4": {"4k": 0.0, "8k": 0.01, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.0016666666666666668}, "ru_babilong_qa5": {"4k": 0.4416666666666667, "8k": 0.44, "16k": 0.475, "32k": 0.40666666666666673, "64k": 0, "128k": 0, "dataset_total_score": 0.2938888888888889}, "ru_quality": {"16k": 0.4824016563146999, "8k": 0.39837398373983746, "dataset_total_score": 0.4403878200272687}, "ru_tpo": {"8k": 0.06772908366533864, "dataset_total_score": 0.06772908366533864}, "ru_qasper": {"16k": 0.02179956634280568, "8k": 0.02260857442083522, "32k": 0.015666306859159988, "dataset_total_score": 0.020024815874266962}, "total_score": 0.13683026679372187}
|
results/LongChat-7B-v1.5-32k.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"passkey": {"4k": 0.995, "8k": 1.0, "16k": 1.0, "32k": 0.995, "64k": 0, "128k": 0, "dataset_total_score": 0.665}, "matreshka_yes_no": {"4k": 0.5016722408026756, "8k": 0.5, "16k": 0.5, "32k": 0.5, "64k": 0, "128k": 0, "dataset_total_score": 0.33361204013377926}, "matreshka_names": {"4k": 0.17333333333333334, "8k": 0.06666666666666667, "16k": 0.08, "32k": 0.03333333333333333, "64k": 0, "128k": 0, "dataset_total_score": 0.058888888888888886}, "passkey_with_librusec": {"4k": 1.0, "8k": 1.0, "16k": 0.985, "32k": 0.975, "64k": 0, "128k": 0, "dataset_total_score": 0.66}, "librusec_history": {"8k": 0.5625, "16k": 0.34375, "32k": 0.15625, "64k": 0, "dataset_total_score": 0.265625}, "ru_gsm100": {"16k": 0.05, "dataset_total_score": 0.05}, "ru_sci_passage_count": {"4k": 0.18, "8k": 0.08, "16k": 0.01, "32k": 0.02, "64k": 0, "128k": 0, "dataset_total_score": 0.04833333333333334}, "ru_2wikimultihopqa": {"8k": 0.42857142857142855, "16k": 0.3984375, "32k": 0.22764227642276422, "dataset_total_score": 0.3515504016647309}, "long_context_multiq": {"8k": 0.14, "16k": 0.025, "4k": 0.025, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.03166666666666667}, "ru_sci_abstract_retrieval": {"4k": 0.8742063492063491, "8k": 0.7618964368964368, "16k": 0.6055101098106439, "32k": 0.22205033016504383, "64k": 0, "128k": 0, "dataset_total_score": 0.4106105376797456}, "ru_trec": {"4k": 0.05405405405405406, "8k": 0.1, "16k": 0.07692307692307693, "32k": 0.06557377049180328, "dataset_total_score": 0.07413772536723356}, "ru_sci_fi": {"32k": 0.05555555555555555, "64k": 0, "dataset_total_score": 0.027777777777777776}, "librusec_mhqa": {"8k": 0.24739583333333334, "dataset_total_score": 0.24739583333333334}, "ru_babilong_qa1": {"4k": 0.26, "8k": 0.29, "16k": 0.31, "32k": 0.19, "64k": 0, "128k": 0, "dataset_total_score": 0.17500000000000002}, "ru_babilong_qa2": {"4k": 0.11, "8k": 0.08, "16k": 0.16, "32k": 0.08, "64k": 0, "128k": 0, "dataset_total_score": 0.07166666666666667}, "ru_babilong_qa3": 
{"4k": 0.09, "8k": 0.05, "16k": 0.04, "32k": 0.06, "64k": 0, "128k": 0, "dataset_total_score": 0.04}, "ru_babilong_qa4": {"4k": 0.25214285714285717, "8k": 0.2921428571428571, "16k": 0.15642857142857142, "32k": 0.05928571428571429, "64k": 0, "128k": 0, "dataset_total_score": 0.12666666666666665}, "ru_babilong_qa5": {"4k": 0.5133333333333333, "8k": 0.5, "16k": 0.48333333333333334, "32k": 0.5, "64k": 0, "128k": 0, "dataset_total_score": 0.3327777777777778}, "ru_quality": {"16k": 0.1780538302277433, "8k": 0.28455284552845533, "dataset_total_score": 0.2313033378780993}, "ru_tpo": {"8k": 0.3957503320053121, "dataset_total_score": 0.3957503320053121}, "ru_qasper": {"16k": 0.06475047107877817, "8k": 0.060803436943138944, "32k": 0.023538832049622788, "dataset_total_score": 0.04969758002384664}, "total_score": 0.22130764599351707}
|
results/Mistral-7B-Instruct-v0.3.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"passkey": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 0, "128k": 0, "dataset_total_score": 0.6666666666666666}, "matreshka_yes_no": {"4k": 0.5652173913043478, "8k": 0.5066666666666667, "16k": 0.5466666666666666, "32k": 0.5, "64k": 0, "128k": 0, "dataset_total_score": 0.35309178743961356}, "matreshka_names": {"4k": 0.38, "8k": 0.32, "16k": 0.16666666666666666, "32k": 0.11333333333333333, "64k": 0, "128k": 0, "dataset_total_score": 0.1633333333333333}, "passkey_with_librusec": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 0.995, "64k": 0, "128k": 0, "dataset_total_score": 0.6658333333333334}, "librusec_history": {"8k": 0.71875, "16k": 0.625, "32k": 0.6875, "64k": 0, "dataset_total_score": 0.5078125}, "ru_gsm100": {"16k": 0.11, "dataset_total_score": 0.11}, "ru_sci_passage_count": {"4k": 0.26, "8k": 0.14, "16k": 0.07, "32k": 0.02, "64k": 0, "128k": 0, "dataset_total_score": 0.08166666666666668}, "ru_2wikimultihopqa": {"8k": 0.5510204081632653, "16k": 0.46875, "32k": 0.2764227642276423, "dataset_total_score": 0.4320643907969692}, "long_context_multiq": {"8k": 0.22, "16k": 0.035, "4k": 0.035, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.04833333333333334}, "ru_sci_abstract_retrieval": {"4k": 0.9824603174603175, "8k": 0.8690578865578865, "16k": 0.7111051699917106, "32k": 0.05148203921970241, "64k": 0, "128k": 0, "dataset_total_score": 0.43568423553826957}, "ru_trec": {"4k": 0.5675675675675675, "8k": 0.38, "16k": 0.4065934065934066, "32k": 0.3442622950819672, "dataset_total_score": 0.4246058173107353}, "ru_sci_fi": {"32k": 0.3055555555555556, "64k": 0, "dataset_total_score": 0.1527777777777778}, "librusec_mhqa": {"8k": 0.3359375, "dataset_total_score": 0.3359375}, "ru_babilong_qa1": {"4k": 0.25, "8k": 0.15, "16k": 0.22, "32k": 0.24, "64k": 0, "128k": 0, "dataset_total_score": 0.14333333333333334}, "ru_babilong_qa2": {"4k": 0.08, "8k": 0.05, "16k": 0.02, "32k": 0.02, "64k": 0, "128k": 0, "dataset_total_score": 0.028333333333333332}, "ru_babilong_qa3": 
{"4k": 0.1, "8k": 0.08, "16k": 0.1, "32k": 0.08, "64k": 0, "128k": 0, "dataset_total_score": 0.060000000000000005}, "ru_babilong_qa4": {"4k": 0.5178571428571428, "8k": 0.4428571428571429, "16k": 0.39285714285714285, "32k": 0.3028571428571429, "64k": 0, "128k": 0, "dataset_total_score": 0.2760714285714286}, "ru_babilong_qa5": {"4k": 0.5466666666666667, "8k": 0.62, "16k": 0.5533333333333333, "32k": 0.5333333333333333, "64k": 0, "128k": 0, "dataset_total_score": 0.3755555555555556}, "ru_quality": {"16k": 0.22981366459627334, "8k": 0.3821138211382114, "dataset_total_score": 0.30596374286724237}, "ru_tpo": {"8k": 0.6640106241699868, "dataset_total_score": 0.6640106241699868}, "ru_qasper": {"16k": 0.06576022201401649, "8k": 0.06619134922698901, "32k": 0.02936483912182809, "dataset_total_score": 0.05377213678761119}, "total_score": 0.2992784522292948}
|
results/Mistral-7B-v0.1.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"passkey": {"4k": 1.0, "8k": 0.975, "16k": 0.125, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.35000000000000003}, "matreshka_yes_no": {"4k": 0.5016722408026756, "8k": 0.5, "16k": 0.0033333333333333335, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.1675009290226682}, "matreshka_names": {"4k": 0.32666666666666666, "8k": 0.16, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.08111111111111112}, "passkey_with_librusec": {"4k": 1.0, "8k": 1.0, "16k": 0.3, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.3833333333333333}, "librusec_history": {"8k": 0.78125, "16k": 0.15625, "32k": 0.0, "64k": 0, "dataset_total_score": 0.234375}, "ru_gsm100": {"16k": 0.13, "dataset_total_score": 0.13}, "ru_sci_passage_count": {"4k": 0.04, "8k": 0.04, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.013333333333333334}, "ru_2wikimultihopqa": {"8k": 0.42857142857142855, "16k": 0.1796875, "32k": 0.08130081300813008, "dataset_total_score": 0.22985324719318623}, "long_context_multiq": {"8k": 0.22, "16k": 0.005, "4k": 0.04, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.04416666666666667}, "ru_sci_abstract_retrieval": {"4k": 0.9484126984126985, "8k": 0.7607802118516404, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.28486548504405645}, "ru_trec": {"4k": 0.02702702702702703, "8k": 0.1, "16k": 0.03296703296703297, "32k": 0.0, "dataset_total_score": 0.039998514998515}, "ru_sci_fi": {"32k": 0.027777777777777776, "64k": 0, "dataset_total_score": 0.013888888888888888}, "librusec_mhqa": {"8k": 0.3411458333333333, "dataset_total_score": 0.3411458333333333}, "ru_babilong_qa1": {"4k": 0.63, "8k": 0.63, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.21}, "ru_babilong_qa2": {"4k": 0.21, "8k": 0.25, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.07666666666666666}, "ru_babilong_qa3": {"4k": 0.29, "8k": 0.25, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, 
"dataset_total_score": 0.09000000000000001}, "ru_babilong_qa4": {"4k": 0.4292857142857143, "8k": 0.3157142857142857, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.12416666666666669}, "ru_babilong_qa5": {"4k": 0.7, "8k": 0.6933333333333335, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.23222222222222225}, "ru_quality": {"16k": 0.11801242236024845, "8k": 0.22764227642276424, "dataset_total_score": 0.17282734939150635}, "ru_tpo": {"8k": 0.3957503320053121, "dataset_total_score": 0.3957503320053121}, "ru_qasper": {"16k": 0.011042882576489372, "8k": 0.0625419691096683, "32k": 0.0008322260797508323, "dataset_total_score": 0.024805692588636172}, "total_score": 0.17333387011743345}
|
results/Mistral-7B-v0.3.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"passkey": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 0, "128k": 0, "dataset_total_score": 0.6666666666666666}, "matreshka_yes_no": {"4k": 0.44816053511705684, "8k": 0.47, "16k": 0.5, "32k": 0.5, "64k": 0, "128k": 0, "dataset_total_score": 0.31969342251950944}, "matreshka_names": {"4k": 0.2866666666666667, "8k": 0.16, "16k": 0.10666666666666667, "32k": 0.04666666666666667, "64k": 0, "128k": 0, "dataset_total_score": 0.09999999999999999}, "passkey_with_librusec": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 0, "128k": 0, "dataset_total_score": 0.6666666666666666}, "librusec_history": {"8k": 0.9375, "16k": 0.9375, "32k": 0.84375, "64k": 0, "dataset_total_score": 0.6796875}, "ru_gsm100": {"16k": 0.09, "dataset_total_score": 0.09}, "ru_sci_passage_count": {"4k": 0.0, "8k": 0.0, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.0}, "ru_2wikimultihopqa": {"8k": 0.46938775510204084, "16k": 0.4921875, "32k": 0.2682926829268293, "dataset_total_score": 0.40995597934295674}, "long_context_multiq": {"8k": 0.24, "16k": 0.035, "4k": 0.04, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.05249999999999999}, "ru_sci_abstract_retrieval": {"4k": 0.8735714285714286, "8k": 0.5658155733155733, "16k": 0.3690303371278, "32k": 0.01873920061760887, "64k": 0, "128k": 0, "dataset_total_score": 0.30452608993873514}, "ru_trec": {"4k": 0.0, "8k": 0.08, "16k": 0.04395604395604396, "32k": 0.09016393442622951, "dataset_total_score": 0.05352999459556837}, "ru_sci_fi": {"32k": 0.0, "64k": 0, "dataset_total_score": 0.0}, "librusec_mhqa": {"8k": 0.390625, "dataset_total_score": 0.390625}, "ru_babilong_qa1": {"4k": 0.6, "8k": 0.63, "16k": 0.58, "32k": 0.43, "64k": 0, "128k": 0, "dataset_total_score": 0.37333333333333335}, "ru_babilong_qa2": {"4k": 0.35, "8k": 0.23, "16k": 0.18, "32k": 0.24, "64k": 0, "128k": 0, "dataset_total_score": 0.16666666666666666}, "ru_babilong_qa3": {"4k": 0.29, "8k": 0.23, "16k": 0.23, "32k": 0.19, "64k": 0, "128k": 0, 
"dataset_total_score": 0.15666666666666665}, "ru_babilong_qa4": {"4k": 0.4628571428571429, "8k": 0.3442857142857143, "16k": 0.36214285714285716, "32k": 0.24857142857142858, "64k": 0, "128k": 0, "dataset_total_score": 0.23630952380952383}, "ru_babilong_qa5": {"4k": 0.7033333333333335, "8k": 0.6866666666666668, "16k": 0.7533333333333334, "32k": 0.6833333333333335, "64k": 0, "128k": 0, "dataset_total_score": 0.4711111111111112}, "ru_quality": {"16k": 0.06832298136645963, "8k": 0.23577235772357727, "dataset_total_score": 0.15204766954501844}, "ru_tpo": {"8k": 0.3970783532536521, "dataset_total_score": 0.3970783532536521}, "ru_qasper": {"16k": 0.0648885639017482, "8k": 0.08898968989798027, "32k": 0.01870864419884028, "dataset_total_score": 0.05752896599952292}, "total_score": 0.27355207667217124}
|
results/Saiga-LLaMA-3-8B.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"passkey": {"4k": 1.0, "8k": 1.0, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.3333333333333333}, "matreshka_yes_no": {"4k": 0.8729096989966555, "8k": 0.81, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.28048494983277594}, "matreshka_names": {"4k": 0.5333333333333333, "8k": 0.4, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.15555555555555556}, "passkey_with_librusec": {"4k": 1.0, "8k": 0.995, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.3325}, "librusec_history": {"8k": 0.96875, "16k": 0, "32k": 0, "64k": 0, "dataset_total_score": 0.2421875}, "ru_gsm100": {"16k": 0, "dataset_total_score": 0.0}, "ru_sci_passage_count": {"4k": 0.195, "8k": 0.035, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.03833333333333334}, "ru_2wikimultihopqa": {"8k": 0.5306122448979592, "16k": 0, "32k": 0, "dataset_total_score": 0.17687074829931973}, "long_context_multiq": {"8k": 0.235, "4k": 0.055, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.04833333333333333}, "ru_sci_abstract_retrieval": {"4k": 0.9772222222222221, "8k": 0.9258564054992626, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.31717977128691416}, "ru_trec": {"4k": 0.5135135135135135, "8k": 0.54, "16k": 0, "32k": 0, "dataset_total_score": 0.2633783783783784}, "ru_sci_fi": {"32k": 0, "64k": 0, "dataset_total_score": 0.0}, "librusec_mhqa": {"8k": 0.4505208333333333, "dataset_total_score": 0.4505208333333333}, "ru_babilong_qa1": {"4k": 0.7629583333333333, "8k": 0.758125, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.25351388888888887}, "ru_babilong_qa2": {"4k": 0.195625, "8k": 0.06875, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.044062500000000004}, "ru_babilong_qa3": {"4k": 0.14733333333333334, "8k": 0.21585714285714286, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.06053174603174603}, "ru_babilong_qa4": {"4k": 0.6347142857142858, 
"8k": 0.5821428571428572, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.20280952380952386}, "ru_babilong_qa5": {"4k": 0.7466666666666667, "8k": 0.7633333333333334, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.2516666666666667}, "ru_quality": {"8k": 0.35772357723577236, "16k": 0, "dataset_total_score": 0.17886178861788618}, "ru_tpo": {"8k": 0.7569721115537849, "dataset_total_score": 0.7569721115537849}, "ru_qasper": {"8k": 0.07413702213069599, "16k": 0, "32k": 0, "dataset_total_score": 0.024712340710231998}, "total_score": 0.21008610966500027}
|