Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
36c5a0c
·
1 Parent(s): f30cbcc

feat: implement the submission part

Browse files
Files changed (4) hide show
  1. app.py +53 -9
  2. src/about.py +5 -20
  3. src/populate.py +54 -32
  4. utils.py +16 -0
app.py CHANGED
@@ -6,6 +6,7 @@ from src.about import (
6
  INTRODUCTION_TEXT,
7
  LLM_BENCHMARKS_TEXT,
8
  TITLE,
 
9
  )
10
  from src.display.css_html_js import custom_css
11
  from src.display.utils import (
@@ -13,13 +14,14 @@ from src.display.utils import (
13
  LONG_DOC_BENCHMARK_COLS,
14
  COLS_QA,
15
  COLS_LONG_DOC,
 
16
  TYPES,
17
  AutoEvalColumnQA,
18
  fields
19
  )
20
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
21
- from src.populate import get_leaderboard_df
22
- from utils import update_table, update_metric, update_table_long_doc
23
  from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, metric_list
24
 
25
 
@@ -75,11 +77,11 @@ def update_metric_long_doc(
75
  return update_metric(raw_data_qa, 'long_doc', metric, domains, langs, reranking_model, query)
76
 
77
 
78
- # (
79
- # finished_eval_queue_df,
80
- # running_eval_queue_df,
81
- # pending_eval_queue_df,
82
- # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
83
 
84
 
85
  demo = gr.Blocks(css=custom_css)
@@ -305,8 +307,50 @@ with demo:
305
  queue=True
306
  )
307
 
308
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
309
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
 
311
  scheduler = BackgroundScheduler()
312
  scheduler.add_job(restart_space, "interval", seconds=1800)
 
6
  INTRODUCTION_TEXT,
7
  LLM_BENCHMARKS_TEXT,
8
  TITLE,
9
+ EVALUATION_QUEUE_TEXT
10
  )
11
  from src.display.css_html_js import custom_css
12
  from src.display.utils import (
 
14
  LONG_DOC_BENCHMARK_COLS,
15
  COLS_QA,
16
  COLS_LONG_DOC,
17
+ EVAL_COLS,
18
  TYPES,
19
  AutoEvalColumnQA,
20
  fields
21
  )
22
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
23
+ from src.populate import get_leaderboard_df, get_evaluation_queue_df
24
+ from utils import update_table, update_metric, update_table_long_doc, upload_file
25
  from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, metric_list
26
 
27
 
 
77
  return update_metric(raw_data_qa, 'long_doc', metric, domains, langs, reranking_model, query)
78
 
79
 
80
+ (
81
+ finished_eval_queue_df,
82
+ running_eval_queue_df,
83
+ pending_eval_queue_df,
84
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
85
 
86
 
87
  demo = gr.Blocks(css=custom_css)
 
307
  queue=True
308
  )
309
 
310
+ with gr.TabItem("🚀Submit here!", elem_id="submit-tab-table", id=2):
311
+ with gr.Column():
312
+ with gr.Row():
313
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
314
+ with gr.Row():
315
+ with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
316
+ with gr.Row():
317
+ finished_eval_table = gr.components.Dataframe(
318
+ value=finished_eval_queue_df,
319
+ row_count=5,
320
+ )
321
+ with gr.Row():
322
+ with gr.Accordion(
323
+ f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
324
+ open=False,
325
+ ):
326
+ with gr.Row():
327
+ running_eval_table = gr.components.Dataframe(
328
+ value=running_eval_queue_df,
329
+ row_count=5,
330
+ )
331
+ with gr.Row():
332
+ with gr.Accordion(
333
+ f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
334
+ open=False,
335
+ ):
336
+ with gr.Row():
337
+ pending_eval_table = gr.components.Dataframe(
338
+ value=pending_eval_queue_df,
339
+ row_count=5,
340
+ )
341
+ with gr.Row():
342
+ gr.Markdown("## ✉️Submit your model here!", elem_classes="markdown-text")
343
+ # with gr.Row():
344
+ # with gr.Column():
345
+ # model_name_textbox = gr.Textbox(label="Model name")
346
+ # with gr.Column():
347
+ # model_url = gr.Textbox(label="Model URL")
348
+ file_output = gr.File()
349
+ upload_button = gr.UploadButton("Click to submit evaluation", file_count="multiple")
350
+ upload_button.upload(upload_file, upload_button, file_output)
351
+
352
+ # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
353
+ # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
354
 
355
  scheduler = BackgroundScheduler()
356
  scheduler.add_job(restart_space, "interval", seconds=1800)
src/about.py CHANGED
@@ -57,26 +57,11 @@ To reproduce our results, here is the commands you can run:
57
  EVALUATION_QUEUE_TEXT = """
58
  ## Some good practices before submitting a model
59
 
60
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
61
- ```python
62
- from transformers import AutoConfig, AutoModel, AutoTokenizer
63
- config = AutoConfig.from_pretrained("your model name", revision=revision)
64
- model = AutoModel.from_pretrained("your model name", revision=revision)
65
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
66
- ```
67
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
68
-
69
- Note: make sure your model is public!
70
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
71
-
72
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
73
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
74
-
75
- ### 3) Make sure your model has an open license!
76
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
77
-
78
- ### 4) Fill up your model card
79
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
80
 
81
  ## In case of model failure
82
  If your model is displayed in the `FAILED` category, its execution stopped.
 
57
  EVALUATION_QUEUE_TEXT = """
58
  ## Some good practices before submitting a model
59
 
60
+ ### 1)
61
+ ### 2)
62
+ ### 3)
63
+ ### 4)
64
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  ## In case of model failure
67
  If your model is displayed in the `FAILED` category, its execution stopped.
src/populate.py CHANGED
@@ -38,35 +38,57 @@ def get_leaderboard_df(raw_data: List[FullEvalResult], cols: list, benchmark_col
38
 
39
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
40
  """Creates the different dataframes for the evaluation queues requests"""
41
- entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
42
- all_evals = []
43
-
44
- for entry in entries:
45
- if ".json" in entry:
46
- file_path = os.path.join(save_path, entry)
47
- with open(file_path) as fp:
48
- data = json.load(fp)
49
-
50
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
51
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
52
-
53
- all_evals.append(data)
54
- elif ".md" not in entry:
55
- # this is a folder
56
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
57
- for sub_entry in sub_entries:
58
- file_path = os.path.join(save_path, entry, sub_entry)
59
- with open(file_path) as fp:
60
- data = json.load(fp)
61
-
62
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
63
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
64
- all_evals.append(data)
65
-
66
- pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
67
- running_list = [e for e in all_evals if e["status"] == "RUNNING"]
68
- finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
69
- df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
70
- df_running = pd.DataFrame.from_records(running_list, columns=cols)
71
- df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
72
- return df_finished[cols], df_running[cols], df_pending[cols]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
40
  """Creates the different dataframes for the evaluation queues requests"""
41
+ # entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
42
+ # all_evals = []
43
+ #
44
+ # for entry in entries:
45
+ # if ".json" in entry:
46
+ # file_path = os.path.join(save_path, entry)
47
+ # with open(file_path) as fp:
48
+ # data = json.load(fp)
49
+ #
50
+ # data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
51
+ # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
52
+ #
53
+ # all_evals.append(data)
54
+ # elif ".md" not in entry:
55
+ # # this is a folder
56
+ # sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
57
+ # for sub_entry in sub_entries:
58
+ # file_path = os.path.join(save_path, entry, sub_entry)
59
+ # with open(file_path) as fp:
60
+ # data = json.load(fp)
61
+ #
62
+ # data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
63
+ # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
64
+ # all_evals.append(data)
65
+ #
66
+ # pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
67
+ # running_list = [e for e in all_evals if e["status"] == "RUNNING"]
68
+ # finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
69
+ # df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
70
+ # df_running = pd.DataFrame.from_records(running_list, columns=cols)
71
+ # df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
72
+ cols = ["Retrieval Model", "Submitted Time", "Status"]
73
+ df_finished = pd.DataFrame(
74
+ {
75
+ "Retrieval Model": ["bge-m3", "jina-embeddings-v2"],
76
+ "Submitted Time": ["2024-05-01 12:34:20", "2024-05-02 12:34:20"],
77
+ "Status": ["FINISHED", "FINISHED"]
78
+ }
79
+ )
80
+ df_running = pd.DataFrame(
81
+ {
82
+ "Retrieval Model": ["bge-m3", "jina-embeddings-v2"],
83
+ "Submitted Time": ["2024-05-01 12:34:20", "2024-05-02 12:34:20"],
84
+ "Status": ["RUNNING", "RUNNING"]
85
+ }
86
+ )
87
+ df_pending = pd.DataFrame(
88
+ {
89
+ "Retrieval Model": ["bge-m3", "jina-embeddings-v2"],
90
+ "Submitted Time": ["2024-05-01 12:34:20", "2024-05-02 12:34:20"],
91
+ "Status": ["PENDING", "PENDING"]
92
+ }
93
+ )
94
+ return df_finished, df_running, df_pending
utils.py CHANGED
@@ -1,4 +1,9 @@
1
  import pandas as pd
 
 
 
 
 
2
 
3
  from src.display.utils import AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS_QA, COLS_LONG_DOC, QA_BENCHMARK_COLS, LONG_DOC_BENCHMARK_COLS
4
  from src.benchmarks import BENCHMARK_COLS_QA, BENCHMARK_COLS_LONG_DOC, BenchmarksQA, BenchmarksLongDoc
@@ -124,3 +129,14 @@ def update_metric(
124
  reranking_model,
125
  query
126
  )
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pandas as pd
2
+ import os
3
+
4
+ from src.display.formatting import styled_error, styled_message, styled_warning
5
+
6
+ from huggingface_hub import HfApi
7
 
8
  from src.display.utils import AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS_QA, COLS_LONG_DOC, QA_BENCHMARK_COLS, LONG_DOC_BENCHMARK_COLS
9
  from src.benchmarks import BENCHMARK_COLS_QA, BENCHMARK_COLS_LONG_DOC, BenchmarksQA, BenchmarksLongDoc
 
129
  reranking_model,
130
  query
131
  )
132
+
133
+
134
+ def upload_file(files):
135
+ file_paths = [file.name for file in files]
136
+ print(f"file uploaded: {file_paths}")
137
+ # for fp in file_paths:
138
+ # # upload the file
139
+ # print(file_paths)
140
+ # HfApi(token="").upload_file(...)
141
+ # os.remove(fp)
142
+ return file_paths