ai-forever committed on
Commit 1ddbee0
1 Parent(s): 95df55c

Upload 19 files

.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tflite filter=lfs diff=lfs merge=lfs -text
29
- *.tgz filter=lfs diff=lfs merge=lfs -text
30
- *.wasm filter=lfs diff=lfs merge=lfs -text
31
- *.xz filter=lfs diff=lfs merge=lfs -text
32
- *.zip filter=lfs diff=lfs merge=lfs -text
33
- *.zst filter=lfs diff=lfs merge=lfs -text
34
- *tfevents* filter=lfs diff=lfs merge=lfs -text
35
- scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,13 +1 @@
1
- auto_evals/
2
- venv/
3
- __pycache__/
4
- .env
5
- .ipynb_checkpoints
6
- *ipynb
7
- .vscode/
8
-
9
- eval-queue/
10
- eval-results/
11
- eval-queue-bk/
12
- eval-results-bk/
13
- logs/
 
1
+ hf_token
 
README.md CHANGED
@@ -1,44 +1,16 @@
1
- ---
2
- title: LIBRA Leaderboard
3
- emoji: 🥇
4
- colorFrom: green
5
- colorTo: indigo
6
- sdk: gradio
7
- app_file: app.py
8
- pinned: true
9
- license: mit
10
- ---
11
-
12
- # Start the configuration
13
-
14
- Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
15
-
16
- Results files should have the following format and be stored as json files:
17
- ```json
18
- {
19
- "config": {
20
- "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
21
- "model_name": "path of the model on the hub: org/model",
22
- "model_sha": "revision on the hub",
23
- },
24
- "results": {
25
- "task_name": {
26
- "metric_name": score,
27
- },
28
- "task_name2": {
29
- "metric_name": score,
30
- }
31
- }
32
- }
33
- ```
34
-
35
- Request files are created automatically by this tool.
36
-
37
- If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
38
-
39
- # Code logic for more complex edits
40
-
41
- You'll find
42
- - the main table' columns names and properties in `src/display/utils.py`
43
- - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
44
- - teh logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
 
1
+ ---
2
+ title: LIBRA Leaderboard
3
+ emoji: 🏆
4
+ colorFrom: indigo
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 4.36.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ tags:
12
+ - leaderboard
13
+ short_description: LLM extra long context benchmark
14
+ ---
15
+
16
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
app.py CHANGED
@@ -1,204 +1,225 @@
1
- import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
- import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.about import (
8
- CITATION_BUTTON_LABEL,
9
- CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
- )
15
- from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- AutoEvalColumn,
22
- ModelType,
23
- fields,
24
- WeightType,
25
- Precision
26
- )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
30
-
31
-
32
- def restart_space():
33
- API.restart_space(repo_id=REPO_ID)
34
-
35
- ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
43
- try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
50
-
51
-
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
-
54
- (
55
- finished_eval_queue_df,
56
- running_eval_queue_df,
57
- pending_eval_queue_df,
58
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
-
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
- value=dataframe,
65
- datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
- interactive=False,
89
- )
90
-
91
-
92
- demo = gr.Blocks(css=custom_css)
93
- with demo:
94
- gr.HTML(TITLE)
95
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
-
97
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
-
101
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
-
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
- with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
- with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
-
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
- )
190
-
191
- with gr.Row():
192
- with gr.Accordion("📙 Citation", open=False):
193
- citation_button = gr.Textbox(
194
- value=CITATION_BUTTON_TEXT,
195
- label=CITATION_BUTTON_LABEL,
196
- lines=20,
197
- elem_id="citation-button",
198
- show_copy_button=True,
199
- )
200
-
201
- scheduler = BackgroundScheduler()
202
- scheduler.add_job(restart_space, "interval", seconds=1800)
203
- scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
 
1
+ import os
2
+ import json
3
+ import gradio as gr
4
+ import pandas as pd
5
+ import numpy as np
6
+
7
+ from collections import defaultdict
8
+
9
+
10
+ LENGTHS = ["dataset_total_score", "4k", "8k", "16k", "32k", "64k", "128k"]
11
+ datasets_params = json.load(open("datasets_config.json", "r"))
12
+ TASKS = datasets_params.keys()
13
+
14
+
15
+ def make_default_md():
16
+ leaderboard_md = "LeaderBoard"
17
+ return leaderboard_md
18
+
19
+
20
+ def make_model_desc_md():
21
+ with open("docs/description.md", "r") as f:
22
+ description = f.read()
23
+ return description
24
+
25
+
26
+ def make_overall_table_by_tasks(files):
27
+ results = defaultdict(list)
28
+
29
+ result_dct = {}
30
+ for file in files:
31
+ if not file.endswith("json"): continue
32
+ path = "results/" + file
33
+ data = json.load(open(path))
34
+ model_name = file.split('/')[-1].split(".json")[0]
35
+ result_dct[model_name] = {}
36
+ for dataset in data.keys():
37
+ if dataset == "total_score":
38
+ result_dct[model_name][dataset] = round(data[dataset] * 100, 1)
39
+ continue
40
+ result_dct[model_name][dataset] = round(data[dataset]["dataset_total_score"] * 100, 1)
41
+
42
+ for file in files:
43
+ if not file.endswith("json"): continue
44
+ model_name = file.split('/')[-1].split(".json")[0]
45
+ results['Model'].append(model_name)
46
+ for key in result_dct[model_name].keys():
47
+ if key == "total_score":
48
+ results["Total Score"].append(result_dct[model_name][key])
49
+ else:
50
+ results[datasets_params[key]["name"]].append(result_dct[model_name][key])
51
+
52
+ table = pd.DataFrame(results).sort_values(['Total Score'], ascending=False)
53
+ cols = table.columns.tolist()
54
+ cols = [cols[0]] + [cols[22]] + cols[1:22]
55
+ return table[cols]
56
+
57
+
58
+ def make_overall_table_by_lengths(files):
59
+ results = defaultdict(list)
60
+
61
+ result_dct = {}
62
+ for file in files:
63
+ if not file.endswith("json"): continue
64
+ path = "results/" + file
65
+ data = json.load(open(path))
66
+ model_name = file.split('/')[-1].split(".json")[0]
67
+ result_dct[model_name] = {}
68
+ for dataset in data.keys():
69
+ if dataset == "total_score":
70
+ result_dct[model_name][dataset] = data[dataset]
71
+ continue
72
+ for length in data[dataset].keys():
73
+ if length == "dataset_total_score": continue
74
+ if length not in result_dct[model_name]:
75
+ result_dct[model_name][length] = []
76
+ result_dct[model_name][length].append(data[dataset][length])
77
+
78
+ for model_name in result_dct.keys():
79
+ for length in result_dct[model_name].keys():
80
+ result_dct[model_name][length] = round(np.mean(result_dct[model_name][length]) * 100, 1)
81
+
82
+ for file in files:
83
+ if not file.endswith("json"): continue
84
+ model_name = file.split('/')[-1].split(".json")[0]
85
+ results['Model'].append(model_name)
86
+ for key in result_dct[model_name].keys():
87
+ if key == "total_score":
88
+ results["Total Score"].append(result_dct[model_name][key])
89
+ else:
90
+ results[key].append(result_dct[model_name][key])
91
+
92
+ table = pd.DataFrame(results).sort_values(['Total Score'], ascending=False)
93
+ cols = table.columns.tolist()
94
+ cols = [cols[0]] + [cols[7]] + cols[1:7]
95
+ return table[cols]
96
+
97
+
98
+ def load_model(files, tab_name):
99
+ results = defaultdict(list)
100
+
101
+ for file in files:
102
+ if not file.endswith("json"): continue
103
+ model_name = file.split('/')[-1].split(".json")[0]
104
+ results['Model'].append(model_name)
105
+ result = json.load(open("results/" + file, "r"))
106
+ for length in LENGTHS:
107
+ if length in result[tab_name].keys():
108
+ if length == "dataset_total_score":
109
+ results["Dataset Total Score"].append(round(result[tab_name][length] * 100, 1))
110
+ continue
111
+ results[length].append(round(result[tab_name][length] * 100, 1))
112
+ else:
113
+ results[length].append("-")
114
+
115
+ return pd.DataFrame(results).sort_values(['Dataset Total Score'], ascending=False)
116
+
117
+
118
+ def build_leaderboard_tab(files):
119
+ default_md = make_default_md()
120
+ md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
121
+
122
+ with gr.Tabs() as tabs:
123
+
124
+ with gr.Tab("Results by Lengths", id=0):
125
+ df = make_overall_table_by_lengths(files)
126
+ gr.Dataframe(
127
+ headers=[
128
+ "Model",
129
+ ] + LENGTHS,
130
+ datatype=[
131
+ "markdown",
132
+ "str",
133
+ "str",
134
+ "str",
135
+ "str",
136
+ "str",
137
+ "str",
138
+ "str",
139
+ ],
140
+ value=df,
141
+ elem_id="arena_leaderboard_dataframe",
142
+ height=700,
143
+ wrap=True,
144
+ )
145
+
146
+ with gr.Tab("Results by Tasks", id=1):
147
+ df = make_overall_table_by_tasks(files)
148
+ gr.Dataframe(
149
+ headers=[
150
+ "Model",
151
+ ] + LENGTHS,
152
+ datatype=[
153
+ "markdown",
154
+ "str",
155
+ "str",
156
+ "str",
157
+ "str",
158
+ "str",
159
+ "str",
160
+ "str",
161
+ "str",
162
+ "str",
163
+ "str",
164
+ "str",
165
+ "str",
166
+ "str",
167
+ "str",
168
+ "str",
169
+ "str",
170
+ "str",
171
+ "str",
172
+ "str",
173
+ "str",
174
+ "str",
175
+ "str"
176
+ ],
177
+ value=df,
178
+ elem_id="arena_leaderboard_dataframe",
179
+ height=700,
180
+ wrap=False,
181
+ )
182
+
183
+ for tab_id, tab_name in enumerate(TASKS):
184
+ df = load_model(files, tab_name)
185
+ with gr.Tab(datasets_params[tab_name]["name"], id=tab_id+2):
186
+ gr.Dataframe(
187
+ headers=[
188
+ "Model",
189
+ ] + LENGTHS,
190
+ datatype=[
191
+ "markdown",
192
+ "str",
193
+ "str",
194
+ "str",
195
+ "str",
196
+ "str",
197
+ "str",
198
+ "str",
199
+ ],
200
+ value=df,
201
+ elem_id="arena_leaderboard_dataframe",
202
+ height=700,
203
+ wrap=True,
204
+ )
205
+
206
+ with gr.Tab("Description", id=tab_id + 3):
207
+ desc_md = make_model_desc_md()
208
+ gr.Markdown(desc_md, elem_id="leaderboard_markdown")
209
+
210
+ return [md_1]
211
+
212
+
213
+ def build_demo(files):
214
+ text_size = gr.themes.sizes.text_lg
215
+
216
+ with gr.Blocks(title="LIBRA leaderboard",
217
+ theme=gr.themes.Base(text_size=text_size)) as demo:
218
+ build_leaderboard_tab(files)
219
+ return demo
220
+
221
+
222
+ if __name__ == "__main__":
223
+ files = os.listdir("results")
224
+ demo = build_demo(files)
225
+ demo.launch(share=False)
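
The new `app.py` builds every table from the JSON files in `results/`: `make_overall_table_by_lengths` averages each context length across datasets, `make_overall_table_by_tasks` collects the per-dataset `dataset_total_score` values, and `load_model` fills the per-task tabs. As a quick reference, here is a condensed, standalone sketch of the by-length aggregation for a single results file; it mirrors the logic above without the Gradio plumbing.

```python
import json
from collections import defaultdict

import numpy as np


def by_length_averages(path: str) -> dict:
    """Average every context length across all datasets of one results file,
    reported in percent (mirrors make_overall_table_by_lengths above)."""
    with open(path) as f:
        data = json.load(f)

    per_length = defaultdict(list)
    for dataset, scores in data.items():
        if dataset == "total_score":  # overall model score, handled separately in app.py
            continue
        for length, score in scores.items():
            if length == "dataset_total_score":  # per-dataset aggregate, not a length bucket
                continue
            per_length[length].append(score)

    return {length: round(float(np.mean(vals)) * 100, 1) for length, vals in per_length.items()}


# Example: by_length_averages("results/GPT-4o.json") -> {"4k": ..., "8k": ..., ..., "128k": ...}
```
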
datasets_config.json ADDED
@@ -0,0 +1 @@
1
+ {"passkey": {"name": "Passkey", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "matreshka_yes_no": {"name": "MatreshkaYesNo", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "matreshka_names": {"name": "MatreshkaNames", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "passkey_with_librusec": {"name": "PasskeyWithLibrusec", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "librusec_history": {"name": "LibrusecHistory", "lengths": ["8k", "16k", "32k", "64k"]}, "ru_gsm100": {"name": "ruGSM100", "lengths": ["16k"]}, "ru_sci_passage_count": {"name": "ruSciPassageCount", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "ru_2wikimultihopqa": {"name": "ru2WikiMultihopQA", "lengths": ["8k", "16k", "32k"]}, "long_context_multiq": {"name": "LongContextMultiQ", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "ru_sci_abstract_retrieval": {"name": "ruSciAbstractRetrieval", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "ru_trec": {"name": "ruTREC", "lengths": ["4k", "8k", "16k", "32k"]}, "ru_sci_fi": {"name": "ruSciFi", "lengths": ["32k", "64k"]}, "librusec_mhqa": {"name": "LibrusecMHQA", "lengths": ["8k"]}, "ru_babilong_qa1": {"name": "ruBABILongQA1", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "ru_babilong_qa2": {"name": "ruBABILongQA2", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "ru_babilong_qa3": {"name": "ruBABILongQA3", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "ru_babilong_qa4": {"name": "ruBABILongQA4", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "ru_babilong_qa5": {"name": "ruBABILongQA5", "lengths": ["4k", "8k", "16k", "32k", "64k", "128k"]}, "ru_quality": {"name": "ruQuALITY", "lengths": ["8k", "16k"]}, "ru_tpo": {"name": "ruTPO", "lengths": ["8k"]}, "ru_qasper": {"name": "ruQasper", "lengths": ["8k", "16k", "32k"]}}
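
`datasets_config.json` is the task registry the app reads at startup: each key is a task id, `name` is the display name used for the per-task tabs, and `lengths` lists the context-length buckets the task is evaluated on. A short sketch that prints this mapping, using only the file shown above:

```python
import json

# Load the task registry added in this commit and list each task's
# display name together with the context lengths it covers.
with open("datasets_config.json") as f:
    datasets_params = json.load(f)

for task_id, params in datasets_params.items():
    print(f"{params['name']:<24} ({task_id}): {', '.join(params['lengths'])}")
```
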
docs/description.md ADDED
@@ -0,0 +1,63 @@
1
+ # LIBRA: Long Input Benchmark for Russian Analysis
2
+
3
+ <img src="https://i.imgur.com/BNleRrG.png" width="800" />
4
+
5
+ ## Dataset Summary
6
+
7
+ LIBRA (Long Input Benchmark for Russian Analysis) is designed to evaluate the capabilities of large language models (LLMs) in understanding and processing long texts in Russian. This benchmark includes 21 datasets adapted for different tasks and complexities. The tasks are divided into four complexity groups and allow evaluation across various context lengths ranging from 4,000 up to 128,000 tokens.
8
+
9
+ ## Tasks and Complexity Groups
10
+
11
+ ### Group I: Simple Information Retrieval
12
+ - **Passkey**: Extract a hidden passkey (a number) from a long text fragment.
13
+ - **PasskeyWithLibrusec**: Similar to Passkey but with added noise from Librusec texts.
14
+
15
+ ### Group II: Question Answering and Multiple Choice
16
+ - **MatreshkaNames**: Identify the person in dialogues based on the discussed topic.
17
+ - **MatreshkaYesNo**: Indicate whether a specific topic was mentioned in the dialogue.
18
+ - **LibrusecHistory**: Answer questions based on historical texts.
19
+ - **ruTREC**: Few-shot in-context learning for topic classification. Created by translating the TREC dataset from LongBench.
20
+ - **ruSciFi**: Answer true/false based on context and general world knowledge. Translation of SciFi dataset from L-Eval.
21
+ - **ruSciAbstractRetrieval**: Retrieve relevant paragraphs from scientific abstracts.
22
+ - **ruTPO**: Multiple-choice questions similar to TOEFL exams. Translation of the TPO dataset from L-Eval.
23
+ - **ruQuALITY**: Multiple-choice QA tasks based on detailed texts. Created by translating the QuALITY dataset from L-Eval.
24
+
25
+ ### Group III: Multi-hop Question Answering
26
+ - **ruBABILongQA**: 5 long-context reasoning tasks for QA using facts hidden among irrelevant information.
27
+ - **LongContextMultiQ**: Multi-hop QA based on Wikidata and Wikipedia.
28
+ - **LibrusecMHQA**: Multi-hop QA requiring information distributed across several text parts.
29
+ - **ru2WikiMultihopQA**: Translation of the 2WikiMultihopQA dataset from LongBench.
30
+
31
+ ### Group IV: Complex Reasoning and Mathematical Problems
32
+ - **ruSciPassageCount**: Count unique paragraphs in a long text.
33
+ - **ruQasper**: Question Answering over academic research papers. Created by translating the Qasper dataset from LongBench.
34
+ - **ruGSM100**: Solve math problems using Chain-of-Thought reasoning.
35
+
36
+ ## Dataset Structure
37
+
38
+ The datasets are divided into subsets based on context lengths: 4k, 8k, 16k, 32k, 64k, and 128k tokens. Each subset contains a different number of samples depending on the task complexity.
39
+
40
+ ## Usage
41
+
42
+ The LIBRA benchmark is available under the MIT license. Researchers and developers can use these datasets to evaluate the long-context understanding abilities of various LLMs. The datasets, codebase, and public leaderboard are open-sourced to support future research in this area.
43
+
44
+ ## Citation
45
+
46
+ _TODO_
47
+
48
+ @article{LIBRA2024,
49
+ title={Long Input Benchmark for Russian Analysis},
50
+ author={Anonymous},
51
+ journal={ACL},
52
+ year={2024}
53
+ }
54
+
55
+ ## License
56
+
57
+ The datasets are published under the MIT license.
58
+
59
+ ## Acknowledgments
60
+
61
+ _TODO_
62
+
63
+ For more details and code, please visit our [GitHub repository](#).
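
Following up on the Usage section above: to appear on this leaderboard, a model needs a single JSON file in `results/` (see the files added below). Here is a minimal sketch of producing such a file from already-computed per-length scores. The aggregation rule, each `dataset_total_score` being the plain mean of its per-length scores and `total_score` the mean of the dataset totals, is inferred from the sample files in `results/` rather than documented, and the model name and scores here are hypothetical.

```python
import json

import numpy as np

# Hypothetical per-length scores, already computed elsewhere; task keys and
# length buckets follow datasets_config.json.
scores = {
    "passkey": {"4k": 1.0, "8k": 0.98, "16k": 0.95, "32k": 0.9, "64k": 0.8, "128k": 0.7},
    "ru_gsm100": {"16k": 0.42},
}

results = {}
for task, by_length in scores.items():
    entry = dict(by_length)
    # Inferred convention: dataset_total_score is the mean of the per-length scores.
    entry["dataset_total_score"] = float(np.mean(list(by_length.values())))
    results[task] = entry

# Inferred convention: total_score is the mean of all dataset_total_score values.
results["total_score"] = float(np.mean([v["dataset_total_score"] for v in results.values()]))

with open("results/MyModel.json", "w") as f:  # file name is hypothetical
    json.dump(results, f)
```
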
requirements.txt CHANGED
@@ -1,16 +1,4 @@
1
- APScheduler
2
- black
3
- datasets
4
- gradio
5
- gradio[oauth]
6
- gradio_leaderboard==0.0.9
7
- gradio_client
8
- huggingface-hub>=0.18.0
9
- matplotlib
10
- numpy
11
- pandas
12
- python-dateutil
13
- tqdm
14
- transformers
15
- tokenizers>=0.15.0
16
- sentencepiece
 
1
+ plotly
2
+ gradio
3
+ numpy
4
+ pandas
 
results/ChatGLM2-6B-32K.json ADDED
@@ -0,0 +1 @@
1
+ {"passkey": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 0.82, "64k": 0, "128k": 0, "dataset_total_score": 0.6366666666666666}, "matreshka_yes_no": {"4k": 0.5016722408026756, "8k": 0.5, "16k": 0.5, "32k": 0.5, "64k": 0, "128k": 0, "dataset_total_score": 0.33361204013377926}, "matreshka_names": {"4k": 0.04666666666666667, "8k": 0.02666666666666667, "16k": 0.006666666666666667, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.013333333333333334}, "passkey_with_librusec": {"4k": 0.99, "8k": 0.995, "16k": 0.985, "32k": 0.93, "64k": 0, "128k": 0, "dataset_total_score": 0.65}, "librusec_history": {"8k": 0.21875, "16k": 0.09375, "32k": 0.03125, "64k": 0, "dataset_total_score": 0.0859375}, "ru_gsm100": {"16k": 0.05, "dataset_total_score": 0.05}, "ru_sci_passage_count": {"4k": 0.09, "8k": 0.06, "16k": 0.07, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.03666666666666667}, "ru_2wikimultihopqa": {"8k": 0.1836734693877551, "16k": 0.21875, "32k": 0.12195121951219512, "dataset_total_score": 0.17479156296665008}, "long_context_multiq": {"8k": 0.05, "16k": 0.015, "4k": 0.005, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.011666666666666667}, "ru_sci_abstract_retrieval": {"4k": 0.4166666666666667, "8k": 0.21887180280037422, "16k": 0.1810637996020305, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.13610037817817858}, "ru_trec": {"4k": 0.05405405405405406, "8k": 0.04, "16k": 0.04395604395604396, "32k": 0.040983606557377046, "dataset_total_score": 0.04474842614186877}, "ru_sci_fi": {"32k": 0.0, "64k": 0, "dataset_total_score": 0.0}, "librusec_mhqa": {"8k": 0.06770833333333333, "dataset_total_score": 0.06770833333333333}, "ru_babilong_qa1": {"4k": 0.27, "8k": 0.23, "16k": 0.23, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.12166666666666666}, "ru_babilong_qa2": {"4k": 0.05, "8k": 0.04, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.015}, "ru_babilong_qa3": {"4k": 0.07, "8k": 0.03, "16k": 0.04, "32k": 0.01, "64k": 0, "128k": 0, "dataset_total_score": 0.025000000000000005}, "ru_babilong_qa4": {"4k": 0.02, "8k": 0.018000000000000002, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.006333333333333334}, "ru_babilong_qa5": {"4k": 0.2, "8k": 0.18, "16k": 0.15, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.08833333333333333}, "ru_quality": {"16k": 0.43892339544513453, "8k": 0.5447154471544716, "dataset_total_score": 0.49181942129980305}, "ru_tpo": {"8k": 0.28950863213811423, "dataset_total_score": 0.28950863213811423}, "ru_qasper": {"16k": 0.03254246612062729, "8k": 0.03544011573751087, "32k": 0.009411737687136327, "dataset_total_score": 0.025798106515091495}, "total_score": 0.15736624130349933}
results/GLM4-9B-Chat.json ADDED
@@ -0,0 +1 @@
1
+ {"passkey": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 1.0, "128k": 1.0, "dataset_total_score": 1.0}, "matreshka_yes_no": {"4k": 0.7926421404682275, "8k": 0.75, "16k": 0.7133333333333334, "32k": 0.67, "64k": 0.5966666666666667, "128k": 0.56, "dataset_total_score": 0.6804403567447045}, "matreshka_names": {"4k": 0.6466666666666666, "8k": 0.5066666666666667, "16k": 0.52, "32k": 0.47333333333333333, "64k": 0.37333333333333335, "128k": 0.32, "dataset_total_score": 0.47333333333333333}, "passkey_with_librusec": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 1.0, "128k": 1.0, "dataset_total_score": 1.0}, "librusec_history": {"8k": 0.84375, "16k": 0.84375, "32k": 0.84375, "64k": 0.75, "dataset_total_score": 0.8203125}, "ru_gsm100": {"16k": 0.08, "dataset_total_score": 0.08}, "ru_sci_passage_count": {"4k": 0.27, "8k": 0.08, "16k": 0.09, "32k": 0.0, "64k": 0.01, "128k": 0.0, "dataset_total_score": 0.07500000000000001}, "ru_2wikimultihopqa": {"8k": 0.5510204081632653, "16k": 0.5546875, "32k": 0.35772357723577236, "dataset_total_score": 0.4878104951330125}, "long_context_multiq": {"8k": 0.265, "16k": 0.035, "4k": 0.055, "64k": 0.005, "32k": 0.005, "128k": 0.1, "dataset_total_score": 0.0775}, "ru_sci_abstract_retrieval": {"4k": 0.9819047619047618, "8k": 0.923411865911866, "16k": 0.9122101461259002, "32k": 0.8189266620312142, "64k": 0.6411071734029656, "128k": 0.3908439729202464, "dataset_total_score": 0.7780674303828258}, "ru_trec": {"4k": 0.5675675675675675, "8k": 0.7, "16k": 0.7582417582417582, "32k": 0.7704918032786885, "dataset_total_score": 0.6990752822720037}, "ru_sci_fi": {"32k": 0.3888888888888889, "64k": 0.42857142857142855, "dataset_total_score": 0.4087301587301587}, "librusec_mhqa": {"8k": 0.4453125, "dataset_total_score": 0.4453125}, "ru_babilong_qa1": {"4k": 0.699375, "8k": 0.59, "16k": 0.6, "32k": 0.508125, "64k": 0.429375, "128k": 0.42, "dataset_total_score": 0.5411458333333333}, "ru_babilong_qa2": {"4k": 0.389375, "8k": 0.33, "16k": 0.299375, "32k": 0.2693333333333333, "64k": 0.2675, "128k": 0.23491666666666666, "dataset_total_score": 0.29841666666666666}, "ru_babilong_qa3": {"4k": 0.24598809523809526, "8k": 0.2792380952380953, "16k": 0.21408333333333335, "32k": 0.2264761904761905, "64k": 0.18666666666666668, "128k": 0.18545833333333334, "dataset_total_score": 0.22298511904761908}, "ru_babilong_qa4": {"4k": 0.6207142857142857, "8k": 0.5964285714285714, "16k": 0.5657142857142857, "32k": 0.58, "64k": 0.43, "128k": 0.37714285714285717, "dataset_total_score": 0.5283333333333334}, "ru_babilong_qa5": {"4k": 0.7300000000000001, "8k": 0.7350000000000001, "16k": 0.7200000000000002, "32k": 0.6683333333333334, "64k": 0.6966666666666668, "128k": 0.6700000000000002, "dataset_total_score": 0.7033333333333335}, "ru_quality": {"16k": 0.6521739130434783, "8k": 0.8292682926829268, "dataset_total_score": 0.7407211028632026}, "ru_tpo": {"8k": 0.8685258964143426, "dataset_total_score": 0.8685258964143426}, "ru_qasper": {"16k": 0.05927748156784547, "8k": 0.06532155413695329, "32k": 0.025813608477215297, "dataset_total_score": 0.050137548060671354}, "total_score": 0.5228181376023114}
results/GPT-4o.json ADDED
@@ -0,0 +1 @@
1
+ {"passkey": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 1.0, "128k": 1.0, "dataset_total_score": 1.0}, "matreshka_yes_no": {"4k": 0.8, "8k": 0.6, "16k": 1.0, "32k": 0.8, "64k": 0.7, "128k": 0.9, "dataset_total_score": 0.8000000000000002}, "matreshka_names": {"4k": 0.6, "8k": 0.6, "16k": 0.5, "32k": 0.4, "64k": 0.5, "128k": 0.5, "dataset_total_score": 0.5166666666666667}, "passkey_with_librusec": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 1.0, "128k": 1.0, "dataset_total_score": 1.0}, "librusec_history": {"8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 0.9, "dataset_total_score": 0.975}, "ru_gsm100": {"16k": 1.0, "dataset_total_score": 1.0}, "ru_sci_passage_count": {"4k": 1.0, "8k": 0.5, "16k": 0.3, "32k": 0.0, "64k": 0.2, "128k": 0.1, "dataset_total_score": 0.35000000000000003}, "ru_2wikimultihopqa": {"8k": 0.8, "16k": 0.8, "32k": 0.7, "dataset_total_score": 0.7666666666666666}, "long_context_multiq": {"4k": 0.3, "8k": 1.0, "16k": 0.7, "32k": 0.0, "64k": 0.1, "128k": 0.1, "dataset_total_score": 0.3666666666666667}, "ru_sci_abstract_retrieval": {"4k": 0.99, "8k": 0.9541666666666668, "16k": 0.9254479578392623, "32k": 0.9562564463343153, "64k": 0.590978869808793, "128k": 0.19764315322255238, "dataset_total_score": 0.7690821823119315}, "ru_trec": {"4k": 0.6, "8k": 0.8, "16k": 0.9, "32k": 0.7, "dataset_total_score": 0.75}, "ru_sci_fi": {"32k": 0.6, "64k": 0.9, "dataset_total_score": 0.75}, "librusec_mhqa": {"8k": 0.5, "dataset_total_score": 0.5}, "ru_babilong_qa1": {"4k": 0.9, "8k": 0.8, "16k": 0.7, "32k": 0.9, "64k": 0.8, "128k": 0.6, "dataset_total_score": 0.7833333333333333}, "ru_babilong_qa2": {"4k": 0.4, "8k": 0.3, "16k": 0.4, "32k": 0.4, "64k": 0.5, "128k": 0.2, "dataset_total_score": 0.3666666666666667}, "ru_babilong_qa3": {"4k": 0.2, "8k": 0.3, "16k": 0.1, "32k": 0.2, "64k": 0.2, "128k": 0.2866666666666667, "dataset_total_score": 0.21444444444444444}, "ru_babilong_qa4": {"4k": 0.8800000000000001, "8k": 0.8, "16k": 0.8, "32k": 0.5714285714285715, "64k": 0.8857142857142858, "128k": 0.8, "dataset_total_score": 0.7895238095238096}, "ru_babilong_qa5": {"4k": 0.8666666666666666, "8k": 0.8666666666666666, "16k": 0.9333333333333333, "32k": 0.9666666666666666, "64k": 0.8666666666666668, "128k": 0.9, "dataset_total_score": 0.9}, "ru_quality": {"8k": 0.8, "16k": 0.8666666666666668, "dataset_total_score": 0.8333333333333335}, "ru_tpo": {"8k": 1.0, "dataset_total_score": 1.0}, "ru_qasper": {"8k": 0.2865100250626566, "16k": 0.3184757236227824, "32k": 0.3465384615384616, "dataset_total_score": 0.3171747367413002}, "total_score": 0.70231230982642}
results/LLaMA-2-7B-32k.json ADDED
@@ -0,0 +1 @@
1
+ {"passkey": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 0, "128k": 0, "dataset_total_score": 0.6666666666666666}, "matreshka_yes_no": {"4k": 0.5016722408026756, "8k": 0.5, "16k": 0.5, "32k": 0.5, "64k": 0, "128k": 0, "dataset_total_score": 0.33361204013377926}, "matreshka_names": {"4k": 0.08, "8k": 0.06666666666666667, "16k": 0.02, "32k": 0.04, "64k": 0, "128k": 0, "dataset_total_score": 0.034444444444444444}, "passkey_with_librusec": {"4k": 1.0, "8k": 0.975, "16k": 0.985, "32k": 0.97, "64k": 0, "128k": 0, "dataset_total_score": 0.6549999999999999}, "librusec_history": {"8k": 0.6875, "16k": 0.5, "32k": 0.4375, "64k": 0, "dataset_total_score": 0.40625}, "ru_gsm100": {"16k": 0.07, "dataset_total_score": 0.07}, "ru_sci_passage_count": {"4k": 0.18, "8k": 0.05, "16k": 0.05, "32k": 0.005, "64k": 0, "128k": 0, "dataset_total_score": 0.047499999999999994}, "ru_2wikimultihopqa": {"8k": 0.4489795918367347, "16k": 0.3984375, "32k": 0.2682926829268293, "dataset_total_score": 0.37190325825452136}, "long_context_multiq": {"8k": 0.33, "16k": 0.1, "4k": 0.045, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.07916666666666666}, "ru_sci_abstract_retrieval": {"4k": 0.8519047619047619, "8k": 0.7612582259010829, "16k": 0.4675868475624726, "32k": 0.2665782908189807, "64k": 0, "128k": 0, "dataset_total_score": 0.39122135436454974}, "ru_trec": {"4k": 0.24324324324324326, "8k": 0.18, "16k": 0.24175824175824176, "32k": 0.28688524590163933, "dataset_total_score": 0.2379716827257811}, "ru_sci_fi": {"32k": 0.1111111111111111, "64k": 0, "dataset_total_score": 0.05555555555555555}, "librusec_mhqa": {"8k": 0.2760416666666667, "dataset_total_score": 0.2760416666666667}, "ru_babilong_qa1": {"4k": 0.6, "8k": 0.66, "16k": 0.66, "32k": 0.5, "64k": 0, "128k": 0, "dataset_total_score": 0.4033333333333333}, "ru_babilong_qa2": {"4k": 0.25, "8k": 0.3, "16k": 0.25875, "32k": 0.19, "64k": 0, "128k": 0, "dataset_total_score": 0.16645833333333335}, "ru_babilong_qa3": {"4k": 0.22933333333333333, "8k": 0.28933333333333333, "16k": 0.26, "32k": 0.2, "64k": 0, "128k": 0, "dataset_total_score": 0.1631111111111111}, "ru_babilong_qa4": {"4k": 0.31, "8k": 0.34, "16k": 0.23, "32k": 0.12, "64k": 0, "128k": 0, "dataset_total_score": 0.16666666666666666}, "ru_babilong_qa5": {"4k": 0.59, "8k": 0.66, "16k": 0.64, "32k": 0.69, "64k": 0, "128k": 0, "dataset_total_score": 0.43}, "ru_quality": {"16k": 0.13871635610766048, "8k": 0.17073170731707318, "dataset_total_score": 0.15472403171236682}, "ru_tpo": {"8k": 0.5431606905710492, "dataset_total_score": 0.5431606905710492}, "ru_qasper": {"16k": 0.05999038960490889, "8k": 0.0580459343880765, "32k": 0.022401950361811175, "dataset_total_score": 0.04681275811826552}, "total_score": 0.2714095362059408}
results/LLaMA-3-8B-Instruct.json ADDED
@@ -0,0 +1 @@
1
+ {"passkey": {"4k": 1.0, "8k": 1.0, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.3333333333333333}, "matreshka_yes_no": {"4k": 0.8394648829431438, "8k": 0.8, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.27324414715719064}, "matreshka_names": {"4k": 0.5333333333333333, "8k": 0.46, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.16555555555555557}, "passkey_with_librusec": {"4k": 1.0, "8k": 1.0, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.3333333333333333}, "librusec_history": {"8k": 0.90625, "16k": 0, "32k": 0, "64k": 0, "dataset_total_score": 0.2265625}, "ru_gsm100": {"16k": 0, "dataset_total_score": 0.0}, "ru_sci_passage_count": {"4k": 0.31, "8k": 0.08, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.065}, "ru_2wikimultihopqa": {"8k": 0.5306122448979592, "16k": 0, "32k": 0, "dataset_total_score": 0.17687074829931973}, "long_context_multiq": {"8k": 0.245, "4k": 0.05, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.049166666666666664}, "ru_sci_abstract_retrieval": {"4k": 0.9663095238095238, "8k": 0.9151886869744013, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.31358303513065416}, "ru_trec": {"4k": 0.5945945945945946, "8k": 0.5, "16k": 0, "32k": 0, "dataset_total_score": 0.2736486486486487}, "ru_sci_fi": {"32k": 0, "64k": 0, "dataset_total_score": 0.0}, "librusec_mhqa": {"8k": 0.4609375, "dataset_total_score": 0.4609375}, "ru_babilong_qa1": {"4k": 0.6862083333333334, "8k": 0.7335416666666665, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.236625}, "ru_babilong_qa2": {"4k": 0.14, "8k": 0.10866666666666668, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.04144444444444445}, "ru_babilong_qa3": {"4k": 0.09, "8k": 0.1798095238095238, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.04496825396825397}, "ru_babilong_qa4": {"4k": 0.5725714285714285, "8k": 0.6034285714285714, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.19599999999999998}, "ru_babilong_qa5": {"4k": 0.7666666666666667, "8k": 0.7516666666666667, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.2530555555555556}, "ru_quality": {"8k": 0.6910569105691058, "16k": 0, "dataset_total_score": 0.3455284552845529}, "ru_tpo": {"8k": 0.7808764940239044, "dataset_total_score": 0.7808764940239044}, "ru_qasper": {"8k": 0.06525418503550595, "16k": 0, "32k": 0, "dataset_total_score": 0.021751395011835317}, "total_score": 0.21864214601967852}
results/LLaMA-3-8B.json ADDED
@@ -0,0 +1 @@
1
+ {"passkey": {"4k": 1.0, "8k": 1.0, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.3333333333333333}, "matreshka_yes_no": {"4k": 0.6220735785953178, "8k": 0.59, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.2020122630992196}, "matreshka_names": {"4k": 0.4, "8k": 0.2, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.10000000000000002}, "passkey_with_librusec": {"4k": 1.0, "8k": 1.0, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.3333333333333333}, "librusec_history": {"8k": 0.90625, "16k": 0, "32k": 0, "64k": 0, "dataset_total_score": 0.2265625}, "ru_gsm100": {"16k": 0, "dataset_total_score": 0.0}, "ru_sci_passage_count": {"4k": 0.15, "8k": 0.05, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.03333333333333333}, "ru_2wikimultihopqa": {"8k": 0.5510204081632653, "16k": 0, "32k": 0, "dataset_total_score": 0.18367346938775508}, "long_context_multiq": {"8k": 0.325, "4k": 0.095, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.07}, "ru_sci_abstract_retrieval": {"4k": 0.9711111111111111, "8k": 0.8806912531912532, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.3086337273837274}, "ru_trec": {"4k": 0.3783783783783784, "8k": 0.38, "16k": 0, "32k": 0, "dataset_total_score": 0.1895945945945946}, "ru_sci_fi": {"32k": 0, "64k": 0, "dataset_total_score": 0.0}, "librusec_mhqa": {"8k": 0.4140625, "dataset_total_score": 0.4140625}, "ru_babilong_qa1": {"4k": 0.68, "8k": 0.57, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.20833333333333334}, "ru_babilong_qa2": {"4k": 0.27, "8k": 0.19, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.07666666666666667}, "ru_babilong_qa3": {"4k": 0.28470833333333334, "8k": 0.25866666666666666, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.09056249999999999}, "ru_babilong_qa4": {"4k": 0.5838571428571429, "8k": 0.5642857142857143, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.19135714285714286}, "ru_babilong_qa5": {"4k": 0.6716666666666667, "8k": 0.6866666666666668, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.2263888888888889}, "ru_quality": {"8k": 0.17073170731707318, "16k": 0, "dataset_total_score": 0.08536585365853659}, "ru_tpo": {"8k": 0.5816733067729084, "dataset_total_score": 0.5816733067729084}, "ru_qasper": {"8k": 0.06526998039125843, "16k": 0, "32k": 0, "dataset_total_score": 0.021756660130419478}, "total_score": 0.18460206698919968}
results/LongAlpaca.json ADDED
@@ -0,0 +1 @@
1
+ {"passkey": {"4k": 0.775, "8k": 0.825, "16k": 0.575, "32k": 0.37, "64k": 0, "128k": 0, "dataset_total_score": 0.42416666666666664}, "matreshka_yes_no": {"4k": 0.4782608695652174, "8k": 0.3933333333333333, "16k": 0.48, "32k": 0.4766666666666667, "64k": 0, "128k": 0, "dataset_total_score": 0.30471014492753623}, "matreshka_names": {"4k": 0.013333333333333334, "8k": 0.006666666666666667, "16k": 0.006666666666666667, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.0044444444444444444}, "passkey_with_librusec": {"4k": 0.71, "8k": 0.7, "16k": 0.56, "32k": 0.465, "64k": 0, "128k": 0, "dataset_total_score": 0.4058333333333333}, "librusec_history": {"8k": 0.1875, "16k": 0.15625, "32k": 0.1875, "64k": 0, "dataset_total_score": 0.1328125}, "ru_gsm100": {"16k": 0.02, "dataset_total_score": 0.02}, "ru_sci_passage_count": {"4k": 0.13, "8k": 0.05, "16k": 0.02, "32k": 0.03, "64k": 0, "128k": 0, "dataset_total_score": 0.03833333333333333}, "ru_2wikimultihopqa": {"8k": 0.40816326530612246, "16k": 0.2890625, "32k": 0.21138211382113822, "dataset_total_score": 0.3028692930424202}, "long_context_multiq": {"8k": 0.015, "16k": 0.0, "4k": 0.03, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.0075}, "ru_sci_abstract_retrieval": {"4k": 0.6496428571428572, "8k": 0.44704965669251384, "16k": 0.2043098561917367, "32k": 0.11199263409419845, "64k": 0, "128k": 0, "dataset_total_score": 0.23549916735355103}, "ru_trec": {"4k": 0.0, "8k": 0.02, "16k": 0.0, "32k": 0.0, "dataset_total_score": 0.005}, "ru_sci_fi": {"32k": 0.027777777777777776, "64k": 0, "dataset_total_score": 0.013888888888888888}, "librusec_mhqa": {"8k": 0.078125, "dataset_total_score": 0.078125}, "ru_babilong_qa1": {"4k": 0.09, "8k": 0.06, "16k": 0.06, "32k": 0.02, "64k": 0, "128k": 0, "dataset_total_score": 0.03833333333333333}, "ru_babilong_qa2": {"4k": 0.01, "8k": 0.01, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.0033333333333333335}, "ru_babilong_qa3": {"4k": 0.05, "8k": 0.09, "16k": 0.04, "32k": 0.029333333333333336, "64k": 0, "128k": 0, "dataset_total_score": 0.03488888888888889}, "ru_babilong_qa4": {"4k": 0.0, "8k": 0.01, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.0016666666666666668}, "ru_babilong_qa5": {"4k": 0.4416666666666667, "8k": 0.44, "16k": 0.475, "32k": 0.40666666666666673, "64k": 0, "128k": 0, "dataset_total_score": 0.2938888888888889}, "ru_quality": {"16k": 0.4824016563146999, "8k": 0.39837398373983746, "dataset_total_score": 0.4403878200272687}, "ru_tpo": {"8k": 0.06772908366533864, "dataset_total_score": 0.06772908366533864}, "ru_qasper": {"16k": 0.02179956634280568, "8k": 0.02260857442083522, "32k": 0.015666306859159988, "dataset_total_score": 0.020024815874266962}, "total_score": 0.13683026679372187}
results/LongChat-7B-v1.5-32k.json ADDED
@@ -0,0 +1 @@
1
+ {"passkey": {"4k": 0.995, "8k": 1.0, "16k": 1.0, "32k": 0.995, "64k": 0, "128k": 0, "dataset_total_score": 0.665}, "matreshka_yes_no": {"4k": 0.5016722408026756, "8k": 0.5, "16k": 0.5, "32k": 0.5, "64k": 0, "128k": 0, "dataset_total_score": 0.33361204013377926}, "matreshka_names": {"4k": 0.17333333333333334, "8k": 0.06666666666666667, "16k": 0.08, "32k": 0.03333333333333333, "64k": 0, "128k": 0, "dataset_total_score": 0.058888888888888886}, "passkey_with_librusec": {"4k": 1.0, "8k": 1.0, "16k": 0.985, "32k": 0.975, "64k": 0, "128k": 0, "dataset_total_score": 0.66}, "librusec_history": {"8k": 0.5625, "16k": 0.34375, "32k": 0.15625, "64k": 0, "dataset_total_score": 0.265625}, "ru_gsm100": {"16k": 0.05, "dataset_total_score": 0.05}, "ru_sci_passage_count": {"4k": 0.18, "8k": 0.08, "16k": 0.01, "32k": 0.02, "64k": 0, "128k": 0, "dataset_total_score": 0.04833333333333334}, "ru_2wikimultihopqa": {"8k": 0.42857142857142855, "16k": 0.3984375, "32k": 0.22764227642276422, "dataset_total_score": 0.3515504016647309}, "long_context_multiq": {"8k": 0.14, "16k": 0.025, "4k": 0.025, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.03166666666666667}, "ru_sci_abstract_retrieval": {"4k": 0.8742063492063491, "8k": 0.7618964368964368, "16k": 0.6055101098106439, "32k": 0.22205033016504383, "64k": 0, "128k": 0, "dataset_total_score": 0.4106105376797456}, "ru_trec": {"4k": 0.05405405405405406, "8k": 0.1, "16k": 0.07692307692307693, "32k": 0.06557377049180328, "dataset_total_score": 0.07413772536723356}, "ru_sci_fi": {"32k": 0.05555555555555555, "64k": 0, "dataset_total_score": 0.027777777777777776}, "librusec_mhqa": {"8k": 0.24739583333333334, "dataset_total_score": 0.24739583333333334}, "ru_babilong_qa1": {"4k": 0.26, "8k": 0.29, "16k": 0.31, "32k": 0.19, "64k": 0, "128k": 0, "dataset_total_score": 0.17500000000000002}, "ru_babilong_qa2": {"4k": 0.11, "8k": 0.08, "16k": 0.16, "32k": 0.08, "64k": 0, "128k": 0, "dataset_total_score": 0.07166666666666667}, "ru_babilong_qa3": {"4k": 0.09, "8k": 0.05, "16k": 0.04, "32k": 0.06, "64k": 0, "128k": 0, "dataset_total_score": 0.04}, "ru_babilong_qa4": {"4k": 0.25214285714285717, "8k": 0.2921428571428571, "16k": 0.15642857142857142, "32k": 0.05928571428571429, "64k": 0, "128k": 0, "dataset_total_score": 0.12666666666666665}, "ru_babilong_qa5": {"4k": 0.5133333333333333, "8k": 0.5, "16k": 0.48333333333333334, "32k": 0.5, "64k": 0, "128k": 0, "dataset_total_score": 0.3327777777777778}, "ru_quality": {"16k": 0.1780538302277433, "8k": 0.28455284552845533, "dataset_total_score": 0.2313033378780993}, "ru_tpo": {"8k": 0.3957503320053121, "dataset_total_score": 0.3957503320053121}, "ru_qasper": {"16k": 0.06475047107877817, "8k": 0.060803436943138944, "32k": 0.023538832049622788, "dataset_total_score": 0.04969758002384664}, "total_score": 0.22130764599351707}
results/Mistral-7B-Instruct-v0.3.json ADDED
@@ -0,0 +1 @@
1
+ {"passkey": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 0, "128k": 0, "dataset_total_score": 0.6666666666666666}, "matreshka_yes_no": {"4k": 0.5652173913043478, "8k": 0.5066666666666667, "16k": 0.5466666666666666, "32k": 0.5, "64k": 0, "128k": 0, "dataset_total_score": 0.35309178743961356}, "matreshka_names": {"4k": 0.38, "8k": 0.32, "16k": 0.16666666666666666, "32k": 0.11333333333333333, "64k": 0, "128k": 0, "dataset_total_score": 0.1633333333333333}, "passkey_with_librusec": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 0.995, "64k": 0, "128k": 0, "dataset_total_score": 0.6658333333333334}, "librusec_history": {"8k": 0.71875, "16k": 0.625, "32k": 0.6875, "64k": 0, "dataset_total_score": 0.5078125}, "ru_gsm100": {"16k": 0.11, "dataset_total_score": 0.11}, "ru_sci_passage_count": {"4k": 0.26, "8k": 0.14, "16k": 0.07, "32k": 0.02, "64k": 0, "128k": 0, "dataset_total_score": 0.08166666666666668}, "ru_2wikimultihopqa": {"8k": 0.5510204081632653, "16k": 0.46875, "32k": 0.2764227642276423, "dataset_total_score": 0.4320643907969692}, "long_context_multiq": {"8k": 0.22, "16k": 0.035, "4k": 0.035, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.04833333333333334}, "ru_sci_abstract_retrieval": {"4k": 0.9824603174603175, "8k": 0.8690578865578865, "16k": 0.7111051699917106, "32k": 0.05148203921970241, "64k": 0, "128k": 0, "dataset_total_score": 0.43568423553826957}, "ru_trec": {"4k": 0.5675675675675675, "8k": 0.38, "16k": 0.4065934065934066, "32k": 0.3442622950819672, "dataset_total_score": 0.4246058173107353}, "ru_sci_fi": {"32k": 0.3055555555555556, "64k": 0, "dataset_total_score": 0.1527777777777778}, "librusec_mhqa": {"8k": 0.3359375, "dataset_total_score": 0.3359375}, "ru_babilong_qa1": {"4k": 0.25, "8k": 0.15, "16k": 0.22, "32k": 0.24, "64k": 0, "128k": 0, "dataset_total_score": 0.14333333333333334}, "ru_babilong_qa2": {"4k": 0.08, "8k": 0.05, "16k": 0.02, "32k": 0.02, "64k": 0, "128k": 0, "dataset_total_score": 0.028333333333333332}, "ru_babilong_qa3": {"4k": 0.1, "8k": 0.08, "16k": 0.1, "32k": 0.08, "64k": 0, "128k": 0, "dataset_total_score": 0.060000000000000005}, "ru_babilong_qa4": {"4k": 0.5178571428571428, "8k": 0.4428571428571429, "16k": 0.39285714285714285, "32k": 0.3028571428571429, "64k": 0, "128k": 0, "dataset_total_score": 0.2760714285714286}, "ru_babilong_qa5": {"4k": 0.5466666666666667, "8k": 0.62, "16k": 0.5533333333333333, "32k": 0.5333333333333333, "64k": 0, "128k": 0, "dataset_total_score": 0.3755555555555556}, "ru_quality": {"16k": 0.22981366459627334, "8k": 0.3821138211382114, "dataset_total_score": 0.30596374286724237}, "ru_tpo": {"8k": 0.6640106241699868, "dataset_total_score": 0.6640106241699868}, "ru_qasper": {"16k": 0.06576022201401649, "8k": 0.06619134922698901, "32k": 0.02936483912182809, "dataset_total_score": 0.05377213678761119}, "total_score": 0.2992784522292948}
results/Mistral-7B-v0.1.json ADDED
@@ -0,0 +1 @@
1
+ {"passkey": {"4k": 1.0, "8k": 0.975, "16k": 0.125, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.35000000000000003}, "matreshka_yes_no": {"4k": 0.5016722408026756, "8k": 0.5, "16k": 0.0033333333333333335, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.1675009290226682}, "matreshka_names": {"4k": 0.32666666666666666, "8k": 0.16, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.08111111111111112}, "passkey_with_librusec": {"4k": 1.0, "8k": 1.0, "16k": 0.3, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.3833333333333333}, "librusec_history": {"8k": 0.78125, "16k": 0.15625, "32k": 0.0, "64k": 0, "dataset_total_score": 0.234375}, "ru_gsm100": {"16k": 0.13, "dataset_total_score": 0.13}, "ru_sci_passage_count": {"4k": 0.04, "8k": 0.04, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.013333333333333334}, "ru_2wikimultihopqa": {"8k": 0.42857142857142855, "16k": 0.1796875, "32k": 0.08130081300813008, "dataset_total_score": 0.22985324719318623}, "long_context_multiq": {"8k": 0.22, "16k": 0.005, "4k": 0.04, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.04416666666666667}, "ru_sci_abstract_retrieval": {"4k": 0.9484126984126985, "8k": 0.7607802118516404, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.28486548504405645}, "ru_trec": {"4k": 0.02702702702702703, "8k": 0.1, "16k": 0.03296703296703297, "32k": 0.0, "dataset_total_score": 0.039998514998515}, "ru_sci_fi": {"32k": 0.027777777777777776, "64k": 0, "dataset_total_score": 0.013888888888888888}, "librusec_mhqa": {"8k": 0.3411458333333333, "dataset_total_score": 0.3411458333333333}, "ru_babilong_qa1": {"4k": 0.63, "8k": 0.63, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.21}, "ru_babilong_qa2": {"4k": 0.21, "8k": 0.25, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.07666666666666666}, "ru_babilong_qa3": {"4k": 0.29, "8k": 0.25, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.09000000000000001}, "ru_babilong_qa4": {"4k": 0.4292857142857143, "8k": 0.3157142857142857, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.12416666666666669}, "ru_babilong_qa5": {"4k": 0.7, "8k": 0.6933333333333335, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.23222222222222225}, "ru_quality": {"16k": 0.11801242236024845, "8k": 0.22764227642276424, "dataset_total_score": 0.17282734939150635}, "ru_tpo": {"8k": 0.3957503320053121, "dataset_total_score": 0.3957503320053121}, "ru_qasper": {"16k": 0.011042882576489372, "8k": 0.0625419691096683, "32k": 0.0008322260797508323, "dataset_total_score": 0.024805692588636172}, "total_score": 0.17333387011743345}
results/Mistral-7B-v0.3.json ADDED
@@ -0,0 +1 @@
1
+ {"passkey": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 0, "128k": 0, "dataset_total_score": 0.6666666666666666}, "matreshka_yes_no": {"4k": 0.44816053511705684, "8k": 0.47, "16k": 0.5, "32k": 0.5, "64k": 0, "128k": 0, "dataset_total_score": 0.31969342251950944}, "matreshka_names": {"4k": 0.2866666666666667, "8k": 0.16, "16k": 0.10666666666666667, "32k": 0.04666666666666667, "64k": 0, "128k": 0, "dataset_total_score": 0.09999999999999999}, "passkey_with_librusec": {"4k": 1.0, "8k": 1.0, "16k": 1.0, "32k": 1.0, "64k": 0, "128k": 0, "dataset_total_score": 0.6666666666666666}, "librusec_history": {"8k": 0.9375, "16k": 0.9375, "32k": 0.84375, "64k": 0, "dataset_total_score": 0.6796875}, "ru_gsm100": {"16k": 0.09, "dataset_total_score": 0.09}, "ru_sci_passage_count": {"4k": 0.0, "8k": 0.0, "16k": 0.0, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.0}, "ru_2wikimultihopqa": {"8k": 0.46938775510204084, "16k": 0.4921875, "32k": 0.2682926829268293, "dataset_total_score": 0.40995597934295674}, "long_context_multiq": {"8k": 0.24, "16k": 0.035, "4k": 0.04, "32k": 0.0, "64k": 0, "128k": 0, "dataset_total_score": 0.05249999999999999}, "ru_sci_abstract_retrieval": {"4k": 0.8735714285714286, "8k": 0.5658155733155733, "16k": 0.3690303371278, "32k": 0.01873920061760887, "64k": 0, "128k": 0, "dataset_total_score": 0.30452608993873514}, "ru_trec": {"4k": 0.0, "8k": 0.08, "16k": 0.04395604395604396, "32k": 0.09016393442622951, "dataset_total_score": 0.05352999459556837}, "ru_sci_fi": {"32k": 0.0, "64k": 0, "dataset_total_score": 0.0}, "librusec_mhqa": {"8k": 0.390625, "dataset_total_score": 0.390625}, "ru_babilong_qa1": {"4k": 0.6, "8k": 0.63, "16k": 0.58, "32k": 0.43, "64k": 0, "128k": 0, "dataset_total_score": 0.37333333333333335}, "ru_babilong_qa2": {"4k": 0.35, "8k": 0.23, "16k": 0.18, "32k": 0.24, "64k": 0, "128k": 0, "dataset_total_score": 0.16666666666666666}, "ru_babilong_qa3": {"4k": 0.29, "8k": 0.23, "16k": 0.23, "32k": 0.19, "64k": 0, "128k": 0, "dataset_total_score": 0.15666666666666665}, "ru_babilong_qa4": {"4k": 0.4628571428571429, "8k": 0.3442857142857143, "16k": 0.36214285714285716, "32k": 0.24857142857142858, "64k": 0, "128k": 0, "dataset_total_score": 0.23630952380952383}, "ru_babilong_qa5": {"4k": 0.7033333333333335, "8k": 0.6866666666666668, "16k": 0.7533333333333334, "32k": 0.6833333333333335, "64k": 0, "128k": 0, "dataset_total_score": 0.4711111111111112}, "ru_quality": {"16k": 0.06832298136645963, "8k": 0.23577235772357727, "dataset_total_score": 0.15204766954501844}, "ru_tpo": {"8k": 0.3970783532536521, "dataset_total_score": 0.3970783532536521}, "ru_qasper": {"16k": 0.0648885639017482, "8k": 0.08898968989798027, "32k": 0.01870864419884028, "dataset_total_score": 0.05752896599952292}, "total_score": 0.27355207667217124}
results/Saiga-LLaMA-3-8B.json ADDED
@@ -0,0 +1 @@
1
+ {"passkey": {"4k": 1.0, "8k": 1.0, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.3333333333333333}, "matreshka_yes_no": {"4k": 0.8729096989966555, "8k": 0.81, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.28048494983277594}, "matreshka_names": {"4k": 0.5333333333333333, "8k": 0.4, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.15555555555555556}, "passkey_with_librusec": {"4k": 1.0, "8k": 0.995, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.3325}, "librusec_history": {"8k": 0.96875, "16k": 0, "32k": 0, "64k": 0, "dataset_total_score": 0.2421875}, "ru_gsm100": {"16k": 0, "dataset_total_score": 0.0}, "ru_sci_passage_count": {"4k": 0.195, "8k": 0.035, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.03833333333333334}, "ru_2wikimultihopqa": {"8k": 0.5306122448979592, "16k": 0, "32k": 0, "dataset_total_score": 0.17687074829931973}, "long_context_multiq": {"8k": 0.235, "4k": 0.055, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.04833333333333333}, "ru_sci_abstract_retrieval": {"4k": 0.9772222222222221, "8k": 0.9258564054992626, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.31717977128691416}, "ru_trec": {"4k": 0.5135135135135135, "8k": 0.54, "16k": 0, "32k": 0, "dataset_total_score": 0.2633783783783784}, "ru_sci_fi": {"32k": 0, "64k": 0, "dataset_total_score": 0.0}, "librusec_mhqa": {"8k": 0.4505208333333333, "dataset_total_score": 0.4505208333333333}, "ru_babilong_qa1": {"4k": 0.7629583333333333, "8k": 0.758125, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.25351388888888887}, "ru_babilong_qa2": {"4k": 0.195625, "8k": 0.06875, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.044062500000000004}, "ru_babilong_qa3": {"4k": 0.14733333333333334, "8k": 0.21585714285714286, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.06053174603174603}, "ru_babilong_qa4": {"4k": 0.6347142857142858, "8k": 0.5821428571428572, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.20280952380952386}, "ru_babilong_qa5": {"4k": 0.7466666666666667, "8k": 0.7633333333333334, "16k": 0, "32k": 0, "64k": 0, "128k": 0, "dataset_total_score": 0.2516666666666667}, "ru_quality": {"8k": 0.35772357723577236, "16k": 0, "dataset_total_score": 0.17886178861788618}, "ru_tpo": {"8k": 0.7569721115537849, "dataset_total_score": 0.7569721115537849}, "ru_qasper": {"8k": 0.07413702213069599, "16k": 0, "32k": 0, "dataset_total_score": 0.024712340710231998}, "total_score": 0.21008610966500027}