# -*- coding: utf-8 -*-
# author: Martin Fajčík
# modified by: Jan Doležal
import csv
import random
import numpy as np
from bokeh.plotting import figure
from bokeh.models import LabelSet, LogScale, ColumnDataSource, tickers
from bokeh.palettes import Turbo256 # A color palette with enough colors
# Fit a polynomial to the points and sample the fitted curve across the x range
def fit_curve(x, y, degree=1):
    """Least-squares fit a polynomial to ``(x, y)`` and sample it.

    Args:
        x: Sequence of x coordinates.
        y: Sequence of y coordinates, parallel to ``x``.
        degree: Degree of the fitted polynomial (1 gives a straight line).

    Returns:
        A ``(x_fit, y_fit)`` pair: 100 evenly spaced x positions running from
        the smallest to the largest value in ``x``, and the fitted polynomial
        evaluated at those positions.
    """
    coefficients = np.polyfit(x, y, degree)
    lowest, highest = min(x), max(x)
    grid = np.linspace(lowest, highest, 100)
    # Evaluate the fitted coefficients (highest power first) on the grid
    return grid, np.polyval(coefficients, grid)
# Split paired samples into inliers and outliers using the 1.5 * IQR rule
def remove_outliers(x, y):
    """Separate ``(x, y)`` points into inliers and IQR outliers.

    A point is an inlier only when its x value lies within
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` of all x values AND its y value lies
    within the corresponding interval of all y values.

    Args:
        x: Sequence of x coordinates.
        y: Sequence of y coordinates, parallel to ``x``.

    Returns:
        A 4-tuple of NumPy arrays
        ``(x_inliers, y_inliers, x_outliers, y_outliers)``, each keeping the
        original relative order of the points.
    """
    def within_fences(values):
        # Quartiles -> interquartile range -> inclusive 1.5 * IQR fences
        first_quartile, third_quartile = np.percentile(values, [25, 75])
        spread = third_quartile - first_quartile
        low_fence = first_quartile - 1.5 * spread
        high_fence = third_quartile + 1.5 * spread
        return (values >= low_fence) & (values <= high_fence)

    xs = np.array(x)
    ys = np.array(y)
    keep = within_fences(xs) & within_fences(ys)
    drop = ~keep
    return xs[keep], ys[keep], xs[drop], ys[drop]
def get_ldb_records(name_map, csv_file_path):
    """Load leaderboard rows from a CSV file, keyed by model title.

    Args:
        name_map: Mapping whose VALUES are the accepted model titles.
        csv_file_path: Path to a CSV file with a header row that contains a
            ``Model`` column.

    Returns:
        A dict mapping each model title to its CSV row (the dict produced by
        ``csv.DictReader``). If a title occurs in several rows, the last row
        wins.

    Raises:
        KeyError: If a row's ``Model`` value is not among ``name_map``'s
            values, or the CSV has no ``Model`` column.
    """
    # Identity lookup: it only admits titles that occur among name_map's values
    model_mapping = {model_title: model_title for model_title in name_map.values()}

    # The result dict must exist before the loop assigns into it
    ldb_records = {}
    # newline='' lets the csv module handle line endings inside quoted fields
    with open(csv_file_path, mode='r', newline='') as file:
        for row in csv.DictReader(file):
            sanitized_name = model_mapping[row['Model']]
            ldb_records[sanitized_name] = row
    return ldb_records
def create_scatter_plot_with_curve_with_variances_named(category, variance_across_categories, x, y, sizes, model_names, ldb_records):
    """Build a Bokeh scatter plot of one category's scores against model size.

    The points are split into inliers and outliers by ``remove_outliers``.
    Both groups are drawn, but only the inliers feed the dashed degree-1 trend
    line. Marker size encodes each model's variance, marker shape encodes the
    model type read from ``ldb_records``, and each point gets its own colour
    sampled from the Turbo256 palette.

    Args:
        category: Text used as the y-axis label.
        variance_across_categories: Mapping of model name to variance. Models
            missing from the mapping are treated as having variance 0.
        x: x coordinate per model (axis label "Model Size (B parameters)").
        y: y coordinate per model, parallel to ``x``.
        sizes: Per-model value shown as "Model Size (B parameters)" in the
            hover tooltip, parallel to ``x``.
        model_names: Model name per point, parallel to ``x``.
        ldb_records: Mapping of model name to a record with a ``'Type'`` entry.

    Returns:
        The configured Bokeh figure.

    Raises:
        KeyError: If a model name is missing from ``ldb_records`` or its
            record has no ``'Type'`` entry.
    """
    x_values = np.asarray(x)
    y_values = np.asarray(y)
    names = np.asarray(model_names)
    size_values = np.asarray(sizes)

    # Remove outliers
    x_filtered, y_filtered, x_outliers, y_outliers = remove_outliers(x, y)

    # Recover WHICH rows were kept. Matching on x alone (as np.in1d(x, ...)
    # would) misassigns rows whenever two models share the same x value but
    # only one of them is an outlier. The inlier test depends only on a
    # point's own (x, y) pair, so membership of the pair is exact.
    inlier_points = set(zip(np.asarray(x_filtered).tolist(), np.asarray(y_filtered).tolist()))
    inlier_mask = np.array(
        [point in inlier_points for point in zip(x_values.tolist(), y_values.tolist())],
        dtype=bool,
    )
    outlier_mask = ~inlier_mask

    # Scale the variance to a range suitable for marker sizes (between 5 and 30)
    min_marker_size = 5
    max_marker_size = 30
    known_variances = list(variance_across_categories.values())
    lowest_variance = min(known_variances, default=0)
    highest_variance = max(known_variances, default=0)
    variance_span = highest_variance - lowest_variance

    def scale_variance_to_size(variance):
        # Linear mapping of variance onto the marker-size range
        if variance_span == 0:
            # All variances equal (or none known): avoid dividing by zero
            return min_marker_size
        return min_marker_size + (variance - lowest_variance) * (max_marker_size - min_marker_size) / variance_span

    def get_variance_for_model(model_name):
        return variance_across_categories.get(model_name, 0)  # Default to 0 if model not found

    # Model names of each group, aligned row-for-row with the filtered data
    filtered_markers = names[inlier_mask]
    outlier_markers = names[outlier_mask]

    # Variances and marker sizes for the filtered data
    filtered_variances = [get_variance_for_model(mname) for mname in filtered_markers]
    marker_sizes_filtered = [scale_variance_to_size(var) for var in filtered_variances]

    # Variances and marker sizes for the outlier data
    outlier_variances = [get_variance_for_model(mname) for mname in outlier_markers]
    marker_sizes_outliers = [scale_variance_to_size(var) for var in outlier_variances]

    # Assign symbols to the model types; unknown types fall back to a diamond
    _model_type2symbol = {
        'chat': 'circle',
        'pretrained': 'triangle',
        'ensemble': 'star',
    }

    def model_type2symbol(model_type):
        return _model_type2symbol.get(model_type, 'diamond')

    filtered_symbols = [model_type2symbol(ldb_records[mname]['Type']) for mname in filtered_markers]
    outlier_symbols = [model_type2symbol(ldb_records[mname]['Type']) for mname in outlier_markers]

    # Sample the palette evenly. The stride is kept >= 1 so that more models
    # than palette entries (or no models) cannot produce an invalid slice step.
    stride = max(1, len(Turbo256) // max(1, len(model_names)))
    color_palette = list(Turbo256[::stride])

    # Inliers take the first colours, outliers continue where the inliers stop
    filtered_colors = [color_palette[i % len(color_palette)] for i in range(len(x_filtered))]
    outlier_colors = [color_palette[(i + len(x_filtered)) % len(color_palette)] for i in range(len(x_outliers))]

    # Create ColumnDataSource with filtered data
    source_filtered = ColumnDataSource(data={
        'x': x_filtered,
        'y': y_filtered,
        'sizes': size_values[inlier_mask],  # Keep original model sizes
        'marker_sizes': marker_sizes_filtered,  # Marker sizes based on variance
        'model_names': filtered_markers,
        'variance': filtered_variances,
        'color': filtered_colors,
        'symbol': filtered_symbols,
    })

    # Create ColumnDataSource with outlier data
    source_outliers = ColumnDataSource(data={
        'x': x_outliers,
        'y': y_outliers,
        'sizes': size_values[outlier_mask],  # Keep original model sizes
        'marker_sizes': marker_sizes_outliers,  # Marker sizes based on variance
        'model_names': outlier_markers,
        'variance': outlier_variances,
        'color': outlier_colors,
        'symbol': outlier_symbols,
    })

    # Create a figure for the category.
    # NOTE(review): only the tooltip entries of this call were visible when
    # this block was reviewed; confirm no other figure options are expected.
    p = figure(
        tooltips=[
            ("Model", "@model_names"),
            ("Model Size (B parameters)", "@sizes"),
            ("Variance", "@variance"),
            ("Performance", "@y"),
        ],
    )

    # Plot filtered data with unique colors and scaled marker sizes
    p.scatter('x', 'y', size='marker_sizes', source=source_filtered, fill_alpha=0.6, color='color', marker='symbol')

    # Plot outliers with unique colors and scaled marker sizes
    p.scatter('x', 'y', size='marker_sizes', source=source_outliers, fill_alpha=0.6, color='color', marker='symbol')

    # Fit and plot a trend line through the inliers only
    x_fit, y_fit = fit_curve(x_filtered, y_filtered, degree=1)
    p.line(x_fit, y_fit, line_color='gray', line_width=2, line_dash='dashed')

    # Set axis labels
    p.xaxis.axis_label = 'Model Size (B parameters)'
    p.yaxis.axis_label = f'{category}'

    # Axis-label and tick-label font sizes share one point size
    p.xaxis.axis_label_text_font_size = f"{FONTSIZE}pt"
    p.yaxis.axis_label_text_font_size = f"{FONTSIZE}pt"
    p.xaxis.major_label_text_font_size = f"{FONTSIZE}pt"
    p.yaxis.major_label_text_font_size = f"{FONTSIZE}pt"

    # Logarithmic x axis with matching ticks
    p.x_scale = LogScale()
    p.xaxis.ticker = tickers.LogTicker()

    p.xaxis.axis_label_text_font_style = "normal"
    p.yaxis.axis_label_text_font_style = "normal"

    return p