update
Browse files
app.py
CHANGED
@@ -3,8 +3,10 @@ import json
|
|
3 |
import pandas as pd
|
4 |
from collections import defaultdict
|
5 |
import copy as cp
|
6 |
-
from urllib.request import urlopen
|
7 |
import re
|
|
|
|
|
8 |
|
9 |
# Constants
|
10 |
CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
|
@@ -22,11 +24,37 @@ GITHUB_REPO = 'https://github.com/open-compass/opencompass'
|
|
22 |
GITHUB_RAW = 'https://raw.githubusercontent.com/open-compass/opencompass'
|
23 |
GITHUB_BLOB = 'https://github.com/open-compass/opencompass/blob'
|
24 |
|
25 |
-
# URL for the JSON data
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
# Markdown content
|
29 |
-
|
|
|
|
|
30 |
MAIN_LEADERBOARD_DESCRIPTION = """## Main Evaluation Results
|
31 |
The CompassAcademic currently focuses on the comprehensive reasoning abilities of LLMs.
|
32 |
- The datasets selected so far include General Knowledge Reasoning (MMLU-Pro/GPQA-Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Completion (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
|
@@ -34,7 +62,6 @@ The CompassAcademic currently focuses on the comprehensive reasoning abilities o
|
|
34 |
- Prompts and reproduction scripts can be found in [**OpenCompass**: A Toolkit for Evaluation of LLMs](https://github.com/open-compass/opencompass)π.
|
35 |
"""
|
36 |
|
37 |
-
|
38 |
def fix_image_urls(content):
|
39 |
"""Fix image URLs in markdown content."""
|
40 |
# Handle the specific logo.svg path
|
@@ -57,8 +84,8 @@ MODEL_SIZE = ['<10B', '10B-70B', '>70B', 'Unknown']
|
|
57 |
MODEL_TYPE = ['API', 'OpenSource']
|
58 |
|
59 |
|
60 |
-
def load_data():
|
61 |
-
response = urlopen(
|
62 |
data = json.loads(response.read().decode('utf-8'))
|
63 |
return data
|
64 |
|
@@ -141,7 +168,6 @@ def filter_table(df, size_ranges, model_types):
|
|
141 |
type_mask |= filtered_df['OpenSource'] == 'Yes'
|
142 |
filtered_df = filtered_df[type_mask]
|
143 |
|
144 |
-
# Return the filtered DataFrame directly
|
145 |
return filtered_df
|
146 |
|
147 |
|
@@ -172,11 +198,13 @@ def calculate_column_widths(df):
|
|
172 |
|
173 |
|
174 |
def create_interface():
|
175 |
-
|
|
|
176 |
df = build_main_table(data)
|
|
|
177 |
|
178 |
with gr.Blocks() as demo:
|
179 |
-
gr.Markdown(
|
180 |
|
181 |
with gr.Tabs() as tabs:
|
182 |
with gr.TabItem("π
Main Leaderboard", elem_id='main'):
|
@@ -206,6 +234,22 @@ def create_interface():
|
|
206 |
column_widths=calculate_column_widths(df),
|
207 |
)
|
208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
def update_table(size_ranges, model_types):
|
210 |
filtered_df = filter_table(df, size_ranges, model_types)
|
211 |
return filtered_df.sort_values(
|
@@ -224,10 +268,8 @@ def create_interface():
|
|
224 |
outputs=table,
|
225 |
)
|
226 |
|
227 |
-
|
228 |
-
|
229 |
-
# fixed_content = fix_image_urls(readme_content)
|
230 |
-
# gr.Markdown(fixed_content)
|
231 |
|
232 |
with gr.Row():
|
233 |
with gr.Accordion("Citation", open=False):
|
|
|
3 |
import pandas as pd
|
4 |
from collections import defaultdict
|
5 |
import copy as cp
|
6 |
+
from urllib.request import urlopen, URLError
|
7 |
import re
|
8 |
+
from datetime import datetime
|
9 |
+
import time
|
10 |
|
11 |
# Constants
|
12 |
CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
|
|
|
24 |
GITHUB_RAW = 'https://raw.githubusercontent.com/open-compass/opencompass'
|
25 |
GITHUB_BLOB = 'https://github.com/open-compass/opencompass/blob'
|
26 |
|
27 |
+
# Base URL for the JSON data
|
28 |
+
DATA_URL_BASE = "http://opencompass.oss-cn-shanghai.aliyuncs.com/assets/research-rank/research-data.REALTIME."
|
29 |
+
|
30 |
+
def find_latest_data_url(base_url=None, max_days=365):
    """Find the newest published data file by probing dated URLs.

    Starting from today and walking backwards one day at a time, build
    ``<base_url><YYYYMMDD>.json`` and return the first URL that opens.

    Args:
        base_url: URL prefix placed in front of the date stamp. Defaults to
            the module-level ``DATA_URL_BASE``.
        max_days: Number of days (today included) to probe before giving up.

    Returns:
        ``(url, date_str)`` for the first reachable file, where ``date_str``
        is formatted ``YYYYMMDD``; ``(None, None)`` when nothing is reachable.
    """
    # Local import: the module itself only imports ``datetime`` from datetime.
    from datetime import timedelta

    if base_url is None:
        base_url = DATA_URL_BASE

    today = datetime.now()
    for offset in range(max_days):
        # Use timedelta so the walk crosses month and year boundaries;
        # ``today.replace(day=today.day - offset)`` raises ValueError as soon
        # as the offset reaches the current day of the month.
        date_str = (today - timedelta(days=offset)).strftime("%Y%m%d")
        url = f"{base_url}{date_str}.json"
        try:
            # Only reachability matters here; close the response right away.
            with urlopen(url):
                return url, date_str
        except URLError:
            continue
    # No candidate date produced a reachable URL.
    return None, None
|
45 |
+
|
46 |
+
def get_latest_data():
    """Resolve the newest data file and its date for display.

    Returns:
        A ``(url, date)`` tuple; ``date`` is the ``YYYYMMDD`` stamp returned
        by ``find_latest_data_url`` reformatted as ``YYYY-MM-DD``.

    Raises:
        Exception: If ``find_latest_data_url`` returns no URL.
    """
    latest_url, stamp = find_latest_data_url()
    if latest_url:
        parsed = datetime.strptime(stamp, "%Y%m%d")
        return latest_url, parsed.strftime("%Y-%m-%d")
    raise Exception("Could not find valid data URL")
|
53 |
|
54 |
# Markdown content
|
55 |
+
def get_leaderboard_title(update_time):
    """Build the top-level Markdown heading for the leaderboard.

    Args:
        update_time: Value interpolated after "Last Updated:".

    Returns:
        The heading string, starting with ``# ``.
    """
    heading = "# CompassAcademic Leaderboard (Last Updated: {})"
    return heading.format(update_time)
|
57 |
+
|
58 |
MAIN_LEADERBOARD_DESCRIPTION = """## Main Evaluation Results
|
59 |
The CompassAcademic currently focuses on the comprehensive reasoning abilities of LLMs.
|
60 |
- The datasets selected so far include General Knowledge Reasoning (MMLU-Pro/GPQA-Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Completion (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
|
|
|
62 |
- Prompts and reproduction scripts can be found in [**OpenCompass**: A Toolkit for Evaluation of LLMs](https://github.com/open-compass/opencompass)π.
|
63 |
"""
|
64 |
|
|
|
65 |
def fix_image_urls(content):
|
66 |
"""Fix image URLs in markdown content."""
|
67 |
# Handle the specific logo.svg path
|
|
|
84 |
MODEL_TYPE = ['API', 'OpenSource']
|
85 |
|
86 |
|
87 |
+
def load_data(data_url):
    """Download and parse the leaderboard JSON document.

    Args:
        data_url: URL of the JSON file to fetch.

    Returns:
        The decoded JSON payload (body is decoded as UTF-8 before parsing).
    """
    # The context manager closes the response even if reading/decoding fails.
    with urlopen(data_url) as response:
        payload = response.read().decode('utf-8')
    return json.loads(payload)
|
91 |
|
|
|
168 |
type_mask |= filtered_df['OpenSource'] == 'Yes'
|
169 |
filtered_df = filtered_df[type_mask]
|
170 |
|
|
|
171 |
return filtered_df
|
172 |
|
173 |
|
|
|
198 |
|
199 |
|
200 |
def create_interface():
|
201 |
+
data_url, update_time = get_latest_data()
|
202 |
+
data = load_data(data_url)
|
203 |
df = build_main_table(data)
|
204 |
+
title = gr.Markdown(get_leaderboard_title(update_time))
|
205 |
|
206 |
with gr.Blocks() as demo:
|
207 |
+
title_comp = gr.Markdown(get_leaderboard_title(update_time))
|
208 |
|
209 |
with gr.Tabs() as tabs:
|
210 |
with gr.TabItem("π
Main Leaderboard", elem_id='main'):
|
|
|
234 |
column_widths=calculate_column_widths(df),
|
235 |
)
|
236 |
|
237 |
+
def update_data():
    """Periodically check for new data and update the interface.

    Loops forever: sleeps five minutes, resolves the latest data URL and,
    when it differs from the one last applied, reloads the data, rebuilds and
    filters the table, and assigns the new title and table values. Any error
    is printed and the loop carries on with the next cycle.
    """
    # NOTE(review): this function never returns, so each invocation keeps its
    # caller occupied indefinitely -- confirm that is acceptable for the way
    # it is registered. Also verify that assigning ``.value`` on components
    # actually refreshes what connected clients see.
    #
    # Track the URL last applied in a local. Comparing against the enclosing
    # scope's initial ``data_url`` on every cycle would re-download and
    # rebuild the table every five minutes once a newer file is published.
    current_url = data_url
    while True:
        time.sleep(300)  # Check every 5 minutes
        try:
            new_data_url, new_update_time = get_latest_data()
            if new_data_url == current_url:
                continue
            new_data = load_data(new_data_url)
            new_df = build_main_table(new_data)
            filtered_df = filter_table(new_df, size_filter.value, type_filter.value)
            title_comp.value = get_leaderboard_title(new_update_time)
            table.value = filtered_df.sort_values("Average Score", ascending=False)
            # Only mark the URL as applied after the whole refresh succeeded,
            # so a failed attempt is retried on the next cycle.
            current_url = new_data_url
        except Exception as e:
            print(f"Error updating data: {e}")
|
252 |
+
|
253 |
def update_table(size_ranges, model_types):
|
254 |
filtered_df = filter_table(df, size_ranges, model_types)
|
255 |
return filtered_df.sort_values(
|
|
|
268 |
outputs=table,
|
269 |
)
|
270 |
|
271 |
+
# Set up periodic data update
|
272 |
+
demo.load(update_data)
|
|
|
|
|
273 |
|
274 |
with gr.Row():
|
275 |
with gr.Accordion("Citation", open=False):
|