add data
Files changed:
- .DS_Store +0 -0
- README.md +5 -4
- app.py +8 -3
- app_empty.py +1 -2
- requirements.txt +1 -0
- src/backend/envs.py +1 -1
- src/backend/run_eval_suite.py +1 -1
- src/display/utils.py +25 -105
- src/envs.py +4 -2
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
README.md
CHANGED
@@ -1,12 +1,13 @@
 ---
-title:
-emoji:
+title: Test Leaderboard
+emoji: 🐢
 colorFrom: pink
-colorTo:
+colorTo: red
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.15.0
 app_file: app.py
 pinned: false
+license: apache-2.0
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -39,7 +39,7 @@ from src.display.utils import Tasks
 
 from huggingface_hub import snapshot_download
 
-## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ##
+## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## -------
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
@@ -100,6 +100,8 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):
 
 def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool) -> pd.DataFrame:
 
+
+    print("aa this is an example", df)
     print(f"filter_models()'s df: {df}\n")
     # Show all models
     if show_deleted:
@@ -108,7 +110,10 @@ def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precisio
     filtered_df = df[df[AutoEvalColumn.still_on_hub.name] is True]
 
     type_emoji = [t[0] for t in type_query]
+    print("aa this is an example", df, AutoEvalColumn.model_type_symbol.name, "thhhthht")
+    print("type", type_emoji)
     filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
+    print("bb", filtered_df)
     filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
 
     numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
@@ -353,7 +358,7 @@ scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=6 * 60 * 60)
 
 scheduler.start()
-
+demo.queue().launch()
 
 # demo.launch(show_api=False, enable_queue=False)
-demo.launch() # TypeError: Blocks.launch() got an unexpected keyword argument 'enable_queue'
+# demo.launch(enable_queue=False).queue() # TypeError: Blocks.launch() got an unexpected keyword argument 'enable_queue'
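Note on the launch change: Gradio 4 removed the `enable_queue` keyword from `Blocks.launch()` (the TypeError recorded in the trailing comment), and queuing is now enabled by calling `.queue()` on the Blocks object, which returns the same instance and therefore chains with `launch()`. A minimal sketch of the Gradio 4 idiom follows; the placeholder UI is illustrative, not the leaderboard's actual interface. Separately, the untouched context line `df[df[AutoEvalColumn.still_on_hub.name] is True]` compares a pandas Series to True by identity, which is always False; a boolean mask would use the column directly, e.g. `df[df[AutoEvalColumn.still_on_hub.name]]`.

import gradio as gr

# In Gradio 4, queuing is configured on the Blocks object itself;
# passing enable_queue=... to launch() raises the TypeError noted above.
with gr.Blocks() as demo:
    name = gr.Textbox(label="Name")
    greeting = gr.Textbox(label="Greeting")
    gr.Button("Greet").click(fn=lambda n: f"Hello {n}!", inputs=name, outputs=greeting)

demo.queue().launch()  # .queue() returns the Blocks instance, so chaining works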
app_empty.py
CHANGED
@@ -4,5 +4,4 @@ def greet(name):
     return "Hello " + name + "!!"
 
 # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-# iface.launch()
-# autocomplete
+# iface.launch()
requirements.txt
CHANGED
@@ -22,6 +22,7 @@ accelerate
 sentencepiece
 langdetect
 sacrebleu
+cchardet
 rouge_score
 bert-score
 evaluate
src/backend/envs.py
CHANGED
@@ -27,7 +27,7 @@ class Tasks(Enum):
     task5 = Task("college_medicine (mmlu)", "MMLU College Medicine", 0)
     task6 = Task("medical_genetics (mmlu)", "MMLU Medical Genetics", 0)
     task7 = Task("professional_medicine (mmlu)", "MMLU Professional Medicine", 0)
-    task8 = Task("pubmedqa", "PubMedQA", 0)
+    task8 = Task("pubmedqa", "PubMedQA", 0)
 
 
 
src/backend/run_eval_suite.py
CHANGED
@@ -33,7 +33,7 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
     # indexes all tasks from the `lm_eval/tasks` subdirectory.
     # Alternatively, you can set `TaskManager(include_path="path/to/my/custom/task/configs")`
     # to include a set of tasks in a separate directory.
-    task_manager = TaskManager(include_path="src/backend/
+    task_manager = TaskManager(include_path="src/backend/probing_tasks")
 
     if "gpt" in eval_request.model:
         model = "openai-chat-completions"
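For context, `TaskManager` comes from EleutherAI's lm-evaluation-harness (lm-eval 0.4+); `include_path` registers a directory of custom task configs on top of the built-in `lm_eval/tasks` registry. A hedged sketch of how such a manager is typically passed to an evaluation call; the backend, model, and task list below are illustrative assumptions, not this repo's exact invocation:

from lm_eval import simple_evaluate
from lm_eval.tasks import TaskManager

# Register custom task YAMLs in addition to the tasks bundled with lm-eval.
task_manager = TaskManager(include_path="src/backend/probing_tasks")

results = simple_evaluate(
    model="hf",                    # Hugging Face model backend (assumed here)
    model_args="pretrained=gpt2",  # illustrative model choice
    tasks=["pubmedqa"],            # one of this leaderboard's tasks
    num_fewshot=0,
    task_manager=task_manager,
)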
src/display/utils.py
CHANGED
@@ -1,13 +1,11 @@
-from dataclasses import dataclass,
+from dataclasses import dataclass, make_dataclass
 from enum import Enum
 
 import pandas as pd
 
 
 def fields(raw_class):
-    return [
-        v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"
-    ]
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
 
 @dataclass
@@ -28,13 +26,10 @@ class Tasks(Enum):
     mmlu_mg = Task("medical_genetics (mmlu)", "acc", "MMLU Medical Genetics")
     mmlu_pm = Task("professional_medicine (mmlu)", "acc", "MMLU Professional Medicine")
     pubmedqa = Task("pubmedqa", "acc", "PubMedQA")
-
-
+
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
 # when a modif is needed
-
-
 @dataclass
 class ColumnContent:
     name: str
@@ -45,103 +40,29 @@ class ColumnContent:
     dummy: bool = False
     is_task: bool = False
 
-
-# Define a function to generate ColumnContent instances
-def column_content_factory(
-    name: str,
-    type: str,
-    displayed_by_default: bool,
-    hidden: bool = False,
-    never_hidden: bool = False,
-    dummy: bool = False,
-    is_task: bool = False,
-):
-    return lambda: ColumnContent(
-        name=name,
-        type=type,
-        displayed_by_default=displayed_by_default,
-        hidden=hidden,
-        never_hidden=never_hidden,
-        dummy=dummy,
-        is_task=is_task,
-    )
-
-
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(
-    [
-        "model_type_symbol",
-        ColumnContent,
-        ColumnContent("T", "str", True, never_hidden=True),
-    ]
-)
-auto_eval_column_dict.append(
-    [
-        "model",
-        ColumnContent,
-        ColumnContent("Model", "markdown", True, never_hidden=True),
-    ]
-)
-# Scores
-auto_eval_column_dict.append(
-    ["average", ColumnContent, ColumnContent("Avg", "number", True)]
-)
+auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+#Scores
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append(
-        [
-            task.name,
-            ColumnContent,
-            ColumnContent(task.value.col_name, "number", True, is_task=True),
-        ]
-    )  # hidden was true by default
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, is_task=True)]) # hidden was true by default
 # Model information
-auto_eval_column_dict.append(
-    ["model_type", ColumnContent, ColumnContent("Type", "str", False)]
-)
-auto_eval_column_dict.append(
-    ["architecture", ColumnContent, ColumnContent("Architecture", "str", False)]
-)
-auto_eval_column_dict.append(
-    ["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)]
-)
-auto_eval_column_dict.append(
-    ["precision", ColumnContent, ColumnContent("Precision", "str", False)]
-)
-auto_eval_column_dict.append(
-    ["license", ColumnContent, ColumnContent("Hub License", "str", False)]
-)
-auto_eval_column_dict.append(
-    ["params", ColumnContent, ColumnContent("#Params (B)", "number", False)]
-)
-auto_eval_column_dict.append(
-    ["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)]
-)
-auto_eval_column_dict.append(
-    [
-        "still_on_hub",
-        ColumnContent,
-        ColumnContent("Available on the hub", "bool", False),
-    ]
-)
-auto_eval_column_dict.append(
-    ["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)]
-)
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 # Dummy column for the search bar (hidden by the custom CSS)
-
-auto_eval_column_fields = [
-    (
-        "model_type_symbol",
-        ColumnContent,
-        field(
-            default_factory=column_content_factory("T", "str", True, never_hidden=True)
-        ),
-    ),
-    # Add other fields similarly...
-]
+auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn",
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 
 @dataclass(frozen=True)
@@ -189,6 +110,9 @@ class WeightType(Enum):
     Delta = ModelDetails("Delta")
 
 
+
+
+
 class Precision(Enum):
     float32 = ModelDetails("float32")
     float16 = ModelDetails("float16")
@@ -213,17 +137,13 @@ class Precision(Enum):
     if precision in ["GPTQ", "None"]:
         return Precision.qt_GPTQ
     return Precision.Unknown
-
+
 
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
-COLS_LITE = [
-    c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden
-]
-TYPES_LITE = [
-    c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden
-]
+COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
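The rewrite above leans on `dataclasses.make_dataclass`, which accepts (attribute name, annotation, default) triples, so each `auto_eval_column_dict` entry becomes one attribute of the generated frozen `AutoEvalColumn` class, and the module-level `fields()` helper then walks those class attributes. A minimal self-contained sketch of the idiom, with a reduced two-column schema rather than the full leaderboard one; the demo `ColumnContent` is declared frozen so its instances are hashable, which Python 3.11+ requires of dataclass field defaults:

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False
    is_task: bool = False

# Each entry is an (attribute name, annotation, default value) triple,
# the three-element form make_dataclass accepts.
columns = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["average", ColumnContent, ColumnContent("Avg", "number", True)],
]
DemoColumn = make_dataclass("DemoColumn", columns, frozen=True)

print(DemoColumn.model.name)    # -> Model
print(DemoColumn.average.type)  # -> number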
src/envs.py
CHANGED
@@ -5,6 +5,7 @@ from huggingface_hub import HfApi
 
 H4_TOKEN = os.environ.get("HF_SECRET", None)
 
+# REPO_ID = "pminervini/hallucinations-leaderboard"
 REPO_ID = "openlifescienceai/open_medical_llm_leaderboard"
 
 QUEUE_REPO = "openlifescienceai/test_requests"
@@ -16,7 +17,7 @@ PRIVATE_RESULTS_REPO = "openlifescienceai/test_private-results"
 
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
 
-
+# CACHE_PATH = "/Users/chaeeunlee/Documents/VSC_workspaces/test_leaderboard" #
 CACHE_PATH = os.getenv("HF_HOME", ".")
 
 print(f"CACHE_PATH = {CACHE_PATH}")
@@ -27,6 +28,7 @@ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
 EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
 
+# PATH_TO_COLLECTION = "hallucinations-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03" # ??
 
 # Rate limit variables
 RATE_LIMIT_PERIOD = 7
@@ -34,4 +36,4 @@ RATE_LIMIT_QUOTA = 5
 HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
 
 API = HfApi(token=H4_TOKEN)
-# API = HfApi()
+# API = HfApi()