Spaces:
Running
Running
Try analyzing winscore with bokeh
Browse files- analyze_winscore.py +181 -0
- app.py +4 -8
- server.py +37 -0
analyze_winscore.py
ADDED
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
import csv
|
4 |
+
import random
|
5 |
+
import numpy as np
|
6 |
+
from bokeh.plotting import figure
|
7 |
+
from bokeh.models import LabelSet, LogScale
|
8 |
+
from bokeh.palettes import Turbo256 # A color palette with enough colors
|
9 |
+
from bokeh.models import ColumnDataSource
|
10 |
+
|
11 |
+
# Fit a polynomial trend line and sample it on a dense, even grid.
def fit_curve(x, y, degree=1):
    """Fit a polynomial of the given degree to (x, y).

    Returns (x_fit, y_fit): 100 evenly spaced x positions spanning
    [min(x), max(x)] and the fitted polynomial evaluated at them.
    """
    model = np.poly1d(np.polyfit(x, y, degree))
    x_fit = np.linspace(min(x), max(x), 100)
    return x_fit, model(x_fit)
|
19 |
+
|
20 |
+
# Split points into inliers/outliers with the classic 1.5*IQR (Tukey) rule.
def remove_outliers(x, y):
    """Partition (x, y) into inliers and outliers using the 1.5*IQR rule.

    The rule is applied independently per axis; a point is an outlier if
    EITHER coordinate falls outside its axis' fences. Returns numpy arrays
    (x_inliers, y_inliers, x_outliers, y_outliers).
    """
    x = np.array(x)
    y = np.array(y)

    def _inside_fences(values):
        # Tukey fences: [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
        q1, q3 = np.percentile(values, [25, 75])
        iqr = q3 - q1
        return (values >= q1 - 1.5 * iqr) & (values <= q3 + 1.5 * iqr)

    keep = _inside_fences(x) & _inside_fences(y)
    return x[keep], y[keep], x[~keep], y[~keep]
|
44 |
+
|
45 |
+
def get_ldb_records(name_map, csv_file_path):
    """Load leaderboard rows from a CSV file, keyed by model title.

    `name_map` maps submission ids to model titles; its values define the
    set of known titles. Rows whose 'Model' starts with "Qwen/Qwen2.5" are
    skipped; a row naming an unknown model raises KeyError.
    """
    # Identity map over known titles — indexing it validates that every
    # CSV row refers to a model we actually know about.
    model_mapping = {title: title for title in name_map.values()}

    ldb_records = {}
    with open(csv_file_path, mode='r') as file:
        for row in csv.DictReader(file):
            if row['Model'].startswith("Qwen/Qwen2.5"):
                continue  # excluded model family
            ldb_records[model_mapping[row['Model']]] = row
    return ldb_records
|
58 |
+
|
59 |
+
def create_scatter_plot_with_curve_with_variances_named(category, variance_across_categories, x, y, sizes, model_names, ldb_records):
    """Build a bokeh scatter plot of model performance vs. model size.

    Marker size encodes each model's variance across task categories;
    marker shape encodes model type ('chat' -> circle, else triangle);
    outliers (per the 1.5*IQR rule) are plotted alongside inliers, and a
    degree-1 polynomial trend line is fitted through the inliers only.

    Parameters:
        category: label used for the y-axis title.
        variance_across_categories: dict {model_name: variance}.
        x, y: data coordinates (here: model sizes and performance scores).
        sizes: raw model sizes, shown in the hover tooltip.
        model_names: model names aligned index-wise with x/y/sizes.
        ldb_records: dict {model_name: leaderboard row}; 'Type' is read.

    Returns the configured bokeh figure.
    """
    FONTSIZE = 10

    # Remove outliers
    x_filtered, y_filtered, x_outliers, y_outliers = remove_outliers(x, y)

    # Scale the variance to a range suitable for marker sizes (e.g., between 5 and 30)
    min_marker_size = 5
    max_marker_size = 30

    def scale_variance_to_size(variance):
        # Scale variance to marker size (linear mapping)
        # NOTE(review): divides by (max - min) of the variances — raises
        # ZeroDivisionError if all models have identical variance; confirm
        # that cannot happen upstream.
        return min_marker_size + (variance - min(variance_across_categories.values())) * (max_marker_size - min_marker_size) / (max(variance_across_categories.values()) - min(variance_across_categories.values()))

    # Function to get the variance for a given model name
    def get_variance_for_model(model_name):
        # NOTE(review): debug print left in — consider removing/logging.
        print(model_name)
        return variance_across_categories.get(model_name, 0)  # Default to 0 if model not found

    # NOTE(review): np.in1d matches by VALUE, so if two models share the
    # same x (size), names/variances can misalign with points — verify.
    # Get marker sizes and variances for the filtered data
    filtered_variances = [get_variance_for_model(mname) for mname in np.array(model_names)[np.in1d(x, x_filtered)]]
    marker_sizes_filtered = [scale_variance_to_size(var) for var in filtered_variances]

    # Get marker sizes and variances for the outlier data
    outlier_variances = [get_variance_for_model(mname) for mname in np.array(model_names)[np.in1d(x, x_outliers)]]
    marker_sizes_outliers = [scale_variance_to_size(var) for var in outlier_variances]

    # Randomly assign symbols to the filtered data points
    filtered_symbols = ['circle' if ldb_records[mname]['Type'] == 'chat' else 'triangle' for mname in np.array(model_names)[np.in1d(x, x_filtered)]]

    # Randomly assign symbols to the outlier data points
    outlier_symbols = ['circle' if ldb_records[mname]['Type'] == 'chat' else 'triangle' for mname in np.array(model_names)[np.in1d(x, x_outliers)]]

    # Define a color palette with enough colors
    stride = len(Turbo256) // len(model_names)
    color_palette = list(Turbo256[::stride])  # Adjust this palette size based on the number of data points
    random.shuffle(color_palette)

    # Create unique colors for filtered data
    filtered_colors = [color_palette[i % len(color_palette)] for i in range(len(x_filtered))]

    # Create unique colors for outliers
    outlier_colors = [color_palette[(i + len(x_filtered)) % len(color_palette)] for i in range(len(x_outliers))]

    # Create ColumnDataSource with filtered data
    source_filtered = ColumnDataSource(data={
        'x': x_filtered,
        'y': y_filtered,
        'sizes': np.array(sizes)[np.in1d(x, x_filtered)],  # Keep original model sizes
        'marker_sizes': marker_sizes_filtered,  # New field for marker sizes based on variance
        'model_names': np.array(model_names)[np.in1d(x, x_filtered)],
        'variance': filtered_variances,  # New field for variance
        'color': filtered_colors,
        'symbol': filtered_symbols
    })

    # Create ColumnDataSource with outlier data
    source_outliers = ColumnDataSource(data={
        'x': x_outliers,
        'y': y_outliers,
        'sizes': np.array(sizes)[np.in1d(x, x_outliers)],  # Keep original model sizes
        'marker_sizes': marker_sizes_outliers,  # New field for marker sizes based on variance
        'model_names': np.array(model_names)[np.in1d(x, x_outliers)],
        'variance': outlier_variances,  # New field for variance
        'color': outlier_colors,
        'symbol': outlier_symbols
    })

    # Create a figure for the category
    p = figure(#width=900, height=800, #title=f"{category} vs Model Size vs Variance Across Categories",
               #tools="pan,wheel_zoom,box_zoom,reset,save",
               tooltips=[("Model", "@model_names"),
                         ("Model Size (B parameters)", "@sizes"),
                         ("Variance", "@variance"),  # Added variance to the tooltip
                         ("Performance", "@y")])

    # Plot filtered data with unique colors and scaled marker sizes
    p.scatter('x', 'y', size='marker_sizes', source=source_filtered, fill_alpha=0.6, color='color', marker='symbol')

    # Plot outliers with unique colors and scaled marker sizes
    p.scatter('x', 'y', size='marker_sizes', source=source_outliers, fill_alpha=0.6, color='color', marker='symbol')

    # Fit and plot a curve
    x_fit, y_fit = fit_curve(x_filtered, y_filtered, degree=1)  # You can adjust the degree of the polynomial

    p.line(x_fit, y_fit, line_color='gray', line_width=2, line_dash='dashed')

    # Add labels (with slight offset to avoid overlap)
    p.add_layout(LabelSet(x='x', y='y', text='model_names', source=source_filtered,
                          x_offset=5, y_offset=8, text_font_size=f"{FONTSIZE-4}pt", text_color='black'))

    p.add_layout(LabelSet(x='x', y='y', text='model_names', source=source_outliers,
                          x_offset=5, y_offset=8, text_font_size=f"{FONTSIZE-4}pt", text_color='black'))

    # Set axis labels
    p.xaxis.axis_label = 'Model Size (B parameters)'
    p.yaxis.axis_label = f'{category}'

    # Set axis label font sizes
    p.xaxis.axis_label_text_font_size = f"{FONTSIZE}pt"  # Set font size for x-axis label
    p.yaxis.axis_label_text_font_size = f"{FONTSIZE}pt"  # Set font size for y-axis label

    # Increase tick label font sizes
    p.xaxis.major_label_text_font_size = f"{FONTSIZE}pt"  # Increase x-axis tick label size
    p.yaxis.major_label_text_font_size = f"{FONTSIZE}pt"  # Increase y-axis tick label size

    #p.x_range.start = 1
    #p.x_range.end = 18

    #p.y_range.end = 60

    p.x_scale = LogScale()

    # Fixed tick positions chosen for the expected model-size range (B params).
    p.xaxis.ticker = [1,2,4,7,12,15]
    p.xaxis.axis_label_text_font_style = "normal"
    p.yaxis.axis_label_text_font_style = "normal"

    return p
|
180 |
+
|
181 |
+
# EOF
|
app.py
CHANGED
@@ -6,8 +6,6 @@ import gradio as gr
|
|
6 |
from gradio.themes.utils.sizes import text_md
|
7 |
from gradio_modal import Modal
|
8 |
|
9 |
-
from bokeh.plotting import figure
|
10 |
-
|
11 |
from content import (
|
12 |
HEADER_MARKDOWN,
|
13 |
LEADERBOARD_TAB_TITLE_MARKDOWN,
|
@@ -628,12 +626,10 @@ def gradio_app():
|
|
628 |
gr.Markdown(LEADERBOARD_TAB_TITLE_MARKDOWN)
|
629 |
|
630 |
with gr.Row():
|
631 |
-
|
632 |
-
|
633 |
-
|
634 |
-
|
635 |
-
fig.circle(x, y0, size=10, color="navy", alpha=0.5)
|
636 |
-
p1 = gr.Plot(value=fig, label='Plot 1')
|
637 |
|
638 |
with gr.Row():
|
639 |
leaderboard_category_of_tasks = gr.Dropdown(
|
|
|
6 |
from gradio.themes.utils.sizes import text_md
|
7 |
from gradio_modal import Modal
|
8 |
|
|
|
|
|
9 |
from content import (
|
10 |
HEADER_MARKDOWN,
|
11 |
LEADERBOARD_TAB_TITLE_MARKDOWN,
|
|
|
626 |
gr.Markdown(LEADERBOARD_TAB_TITLE_MARKDOWN)
|
627 |
|
628 |
with gr.Row():
|
629 |
+
gr.Plot(
|
630 |
+
value=leaderboard_server.get_bokeh_figure(),
|
631 |
+
label='Foo',
|
632 |
+
)
|
|
|
|
|
633 |
|
634 |
with gr.Row():
|
635 |
leaderboard_category_of_tasks = gr.Dropdown(
|
server.py
CHANGED
@@ -622,6 +622,43 @@ class LeaderboardServer:
|
|
622 |
dataframe.to_csv(filepath, index=False)
|
623 |
return filepath
|
624 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
625 |
def get_leaderboard_csv(self, pre_submit=None, category=None):
|
626 |
if pre_submit == None:
|
627 |
category = category if category else self.TASKS_CATEGORY_OVERALL
|
|
|
622 |
dataframe.to_csv(filepath, index=False)
|
623 |
return filepath
|
624 |
|
625 |
+
def get_bokeh_figure(self):
    """Build the leaderboard's bokeh scatter figure (win score vs. model size).

    Reads the already-exported overall-category CSV, derives each model's
    variance across task categories, and delegates plotting to
    analyze_winscore.create_scatter_plot_with_curve_with_variances_named.

    Returns the bokeh figure object, suitable for gr.Plot.
    """
    # Local imports keep the bokeh/analysis dependencies off the server's
    # module import path.
    import numpy as np
    from analyze_winscore import get_ldb_records, create_scatter_plot_with_curve_with_variances_named

    name_map = self.submission_id_to_model_title

    category = self.TASKS_CATEGORY_OVERALL
    csv_file_path = self.leaderboard_dataframes_csv[category]
    ldb_records = get_ldb_records(name_map, csv_file_path)
    categories = self.TASKS_CATEGORIES
    model_names = list(ldb_records.keys())
    # NOTE(review): these column names must match the exported CSV headers
    # exactly (including the emoji/symbol characters).
    sizes = [float(ldb_records[model]['# θ (B)']) for model in model_names]
    average_performance = [float(ldb_records[model]['Average ⬆️']) for model in model_names]

    # Per-model variance of category scores drives the marker size.
    variance_across_categories = {
        model: np.var([float(record[cat]) for cat in categories])
        for model, record in ldb_records.items()
    }

    return create_scatter_plot_with_curve_with_variances_named(
        'Overall Duel Win Score',
        variance_across_categories,
        sizes,                 # x axis: model size (B parameters)
        average_performance,   # y axis: overall win score
        sizes,                 # raw sizes, kept for the hover tooltip
        model_names,
        ldb_records,
    )
|
661 |
+
|
662 |
def get_leaderboard_csv(self, pre_submit=None, category=None):
|
663 |
if pre_submit == None:
|
664 |
category = category if category else self.TASKS_CATEGORY_OVERALL
|