mgyigit committed on
Commit
1cc2077
·
verified ·
1 Parent(s): 4260f48

Upload 5 files

Browse files
Files changed (5) hide show
  1. Makefile +13 -0
  2. README.md +41 -7
  3. app.py +129 -0
  4. pyproject.toml +13 -0
  5. requirements.txt +19 -0
Makefile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: style format
2
+
3
+
4
+ style:
5
+ python -m black --line-length 119 .
6
+ python -m isort .
7
+ ruff check --fix .
8
+
9
+
10
+ quality:
11
+ python -m black --check --line-length 119 .
12
+ python -m isort --check-only .
13
+ ruff check .
README.md CHANGED
@@ -1,11 +1,45 @@
1
  ---
2
- title: Probe3
3
- emoji: 🐢
4
- colorFrom: indigo
5
- colorTo: purple
6
- sdk: static
 
7
  pinned: false
8
- license: mit
 
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: PROBE
3
+ emoji: 🥇
4
+ colorFrom: green
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ app_file: app.py
8
  pinned: false
9
+ license: gpl
10
+ python_version: 3.8.1
11
  ---
12
 
13
+ # Start the configuration
14
+
15
+ Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
16
+
17
+ Results files should have the following format and be stored as json files:
18
+ ```json
19
+ {
20
+ "config": {
21
+ "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
22
+ "model_name": "path of the model on the hub: org/model",
23
+ "model_sha": "revision on the hub",
24
+ },
25
+ "results": {
26
+ "task_name": {
27
+ "metric_name": score,
28
+ },
29
+ "task_name2": {
30
+ "metric_name": score,
31
+ }
32
+ }
33
+ }
34
+ ```
35
+
36
+ Request files are created automatically by this tool.
37
+
38
+ If you encounter a problem on the Space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
39
+
40
+ # Code logic for more complex edits
41
+
42
+ You'll find
43
+ - the main table's column names and properties in `src/display/utils.py`
44
+ - the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
45
+ - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+ import re
6
+ import pandas as pd
7
+ import os
8
+ import json
9
+
10
+ from src.about import *
11
+
12
+ global data_component, filter_component
13
+
14
+
15
def get_baseline_df():
    """Load the results CSV restricted to "Method" plus the selected columns.

    Reads the module-level ``CSV_RESULT_PATH`` and the current ``value`` of
    the module-level ``checkbox_group`` component.
    """
    results = pd.read_csv(CSV_RESULT_PATH)
    selected_columns = ["Method"] + checkbox_group.value
    return results[selected_columns]
20
+
21
def _load_representation_frame(upload):
    """Parse one uploaded representation CSV into a DataFrame.

    The upload widgets are declared with ``type='binary'``, so the callback
    receives raw ``bytes``; ``pd.read_csv`` cannot read bytes directly, so they
    are wrapped in an in-memory buffer. Paths and file-like objects are passed
    through unchanged.
    """
    import io

    if isinstance(upload, (bytes, bytearray)):
        upload = io.BytesIO(upload)
    return pd.read_csv(upload)


def add_new_eval(
    human_file,
    skempi_file,
    model_name_textbox: str,
    revision_name_textbox: str,
    benchmark_type: str,
):
    """Store the uploaded representation files for a submitted model.

    Args:
        human_file: Uploaded CSV for the Human dataset (bytes, path or
            file-like object), or ``None`` when nothing was uploaded.
        skempi_file: Uploaded CSV for the SKEMPI dataset, same conventions.
        model_name_textbox: Model name typed by the user.
        revision_name_textbox: Optional revision name; when non-empty it is
            used instead of the model name to label the stored files.
        benchmark_type: Selected benchmark types (currently unused here).

    Returns:
        None.

    Raises:
        ValueError: If no usable name remains after sanitisation.
    """
    representation_name = model_name_textbox if revision_name_textbox == '' else revision_name_textbox
    print(representation_name)

    # The name comes straight from a text box and ends up in a file path:
    # collapse anything that is not a plain filename character (including path
    # separators, e.g. "org/model") so the files cannot escape the target folder.
    safe_name = re.sub(r"[^A-Za-z0-9._-]+", "_", representation_name or "").strip("._")
    if not safe_name:
        raise ValueError("A model name or revision name is required to store the representation files.")

    # Save human and skempi files under ./src/data/representation_vectors using pandas
    target_dir = os.path.join(".", "src", "data", "representation_vectors")
    os.makedirs(target_dir, exist_ok=True)

    for upload, dataset in ((human_file, "human"), (skempi_file, "skempi")):
        if upload is None:
            continue
        frame = _load_representation_frame(upload)
        frame.to_csv(os.path.join(target_dir, f"{safe_name}_{dataset}.csv"), index=False)

    return None
36
+
37
+ block = gr.Blocks()
38
+
39
+ with block:
40
+ gr.Markdown(
41
+ LEADERBOARD_INTRODUCTION
42
+ )
43
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
44
+ # PROBE benchmark results table
45
+ with gr.TabItem("🏅 PROBE Benchmark", elem_id="probe-benchmark-tab-table", id=1):
46
+ # selection for column part:
47
+ checkbox_group = gr.CheckboxGroup(
48
+ choices=TASK_INFO,
49
+ label="Benchmark Type",
50
+ interactive=True,
51
+ ) # user can select the evaluation dimension
52
+
53
+ baseline_value = get_baseline_df()
54
+ baseline_header = ["Method"] + checkbox_group.value
55
+ baseline_datatype = ['markdown'] + ['number'] * len(checkbox_group.value)
56
+
57
+ data_component = gr.components.Dataframe(
58
+ value=baseline_value,
59
+ headers=baseline_header,
60
+ type="pandas",
61
+ datatype=baseline_datatype,
62
+ interactive=False,
63
+ visible=True,
64
+ )
65
+
66
+ # About tab
67
+ with gr.TabItem("📝 About", elem_id="probe-benchmark-tab-table", id=2):
68
+ with gr.Row():
69
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
70
+
71
+ with gr.TabItem("🚀 Submit here! ", elem_id="probe-benchmark-tab-table", id=3):
72
+ with gr.Row():
73
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
74
+
75
+ with gr.Row():
76
+ gr.Markdown("# ✉️✨ Submit your model's representation files here!", elem_classes="markdown-text")
77
+
78
+ with gr.Row():
79
+ with gr.Column():
80
+ model_name_textbox = gr.Textbox(
81
+ label="Model name",
82
+ )
83
+ revision_name_textbox = gr.Textbox(
84
+ label="Revision Model Name",
85
+ )
86
+ # Selection of benchmark types (similarity, family, function, affinity) used to evaluate the representations (checkbox)
87
+ benchmark_type = gr.CheckboxGroup(
88
+ choices=TASK_INFO,
89
+ label="Benchmark Type",
90
+ interactive=True,
91
+ )
92
+
93
+ with gr.Column():
94
+ human_file = gr.components.File(label="Click to Upload the representation file (csv) for Human dataset", file_count="single", type='binary')
95
+ skempi_file = gr.components.File(label="Click to Upload the representation file (csv) for SKEMPI dataset", file_count="single", type='binary')
96
+
97
+ submit_button = gr.Button("Submit Eval")
98
+ submission_result = gr.Markdown()
99
+ submit_button.click(
100
+ add_new_eval,
101
+ inputs = [
102
+ human_file,
103
+ skempi_file,
104
+ model_name_textbox,
105
+ revision_name_textbox,
106
+ benchmark_type
107
+ ],
108
+ )
109
+
110
def refresh_data():
    """Return a freshly loaded leaderboard table from ``get_baseline_df``."""
    return get_baseline_df()
114
+
115
+ with gr.Row():
116
+ data_run = gr.Button("Refresh")
117
+ data_run.click(
118
+ refresh_data, outputs=[data_component]
119
+ )
120
+
121
+ with gr.Accordion("Citation", open=False):
122
+ citation_button = gr.Textbox(
123
+ value=CITATION_BUTTON_TEXT,
124
+ label=CITATION_BUTTON_LABEL,
125
+ elem_id="citation-button",
126
+ show_copy_button=True,
127
+ )
128
+
129
+ block.launch()
pyproject.toml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ APScheduler
2
+ black
3
+ datasets
4
+ gradio
5
+ gradio[oauth]
6
+ gradio_leaderboard==0.0.9
7
+ gradio_client
8
+ huggingface-hub>=0.18.0
9
+ python-dateutil
10
+ tqdm
11
+ transformers
12
+ tokenizers>=0.15.0
13
+ sentencepiece
14
+ matplotlib
15
+ numpy
16
+ pandas==1.1.4
17
+ pyyaml==5.1
18
+ scikit-learn==0.22
19
+ scikit-multilearn==0.2.0