# -*- coding: utf-8 -*-
# author: Martin Fajčík
# modified by: Jan Doležal
import csv
import random
import numpy as np
from bokeh.plotting import figure
from bokeh.models import LabelSet, LogScale, ColumnDataSource, tickers
from bokeh.palettes import Turbo256 # A color palette with enough colors
# Function to fit a polynomial curve and return the x and y values of the fitted curve
def fit_curve(x, y, degree=1):
# Fit a polynomial of given degree
coeffs = np.polyfit(x, y, degree)
poly = np.poly1d(coeffs)
x_fit = np.linspace(min(x), max(x), 100)
y_fit = poly(x_fit)
return x_fit, y_fit
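# Illustrative usage (not part of the original script; the numbers below are made up):
#   x_fit, y_fit = fit_curve([1.0, 7.0, 13.0, 70.0], [0.4, 0.5, 0.6, 0.7], degree=1)
#   -> x_fit and y_fit are 100-element arrays sampling the fitted line over [1.0, 70.0]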
# Function to detect and remove outliers using the IQR method
def remove_outliers(x, y):
x = np.array(x)
y = np.array(y)
# Calculate Q1 (25th percentile) and Q3 (75th percentile)
Q1_x, Q3_x = np.percentile(x, [25, 75])
Q1_y, Q3_y = np.percentile(y, [25, 75])
IQR_x = Q3_x - Q1_x
IQR_y = Q3_y - Q1_y
# Define bounds for outliers
lower_bound_x = Q1_x - 1.5 * IQR_x
upper_bound_x = Q3_x + 1.5 * IQR_x
lower_bound_y = Q1_y - 1.5 * IQR_y
upper_bound_y = Q3_y + 1.5 * IQR_y
# Filter out outliers
mask_x = (x >= lower_bound_x) & (x <= upper_bound_x)
mask_y = (y >= lower_bound_y) & (y <= upper_bound_y)
mask = mask_x & mask_y
return x[mask], y[mask], x[~mask], y[~mask]
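# Illustrative usage (made-up values): the point whose x falls outside Q3_x + 1.5 * IQR_x
# ends up in the outlier arrays, the rest in the filtered arrays:
#   kept_x, kept_y, out_x, out_y = remove_outliers([1, 2, 3, 4, 100], [1, 2, 3, 4, 5])
#   -> kept_x contains [1, 2, 3, 4] and out_x contains [100]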
def get_ldb_records(name_map, csv_file_path):
    # Sanitized model titles are the values of name_map; a model in the CSV that is
    # not covered by name_map raises a KeyError, surfacing the mismatch early
    model_mapping = {model_title: model_title for model_title in name_map.values()}
    ldb_records = {}
    with open(csv_file_path, mode='r', newline='') as file:
        reader = csv.DictReader(file)
        for row in reader:
            sanitized_name = model_mapping[row['Model']]
            ldb_records[sanitized_name] = row
    return ldb_records
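# Illustrative usage (hypothetical file name): the CSV is expected to contain at least
# a 'Model' column (whose values match the sanitized titles in name_map) and a 'Type'
# column (used below to pick scatter markers):
#   name_map = {"org/model-id": "Nice Model Name"}
#   ldb_records = get_ldb_records(name_map, "leaderboard.csv")
#   ldb_records["Nice Model Name"]["Type"]  # e.g. "chat", "pretrained" or "ensemble"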
def create_scatter_plot_with_curve_with_variances_named(category, variance_across_categories, x, y, sizes, model_names, ldb_records):
FONTSIZE = 12
# Remove outliers
x_filtered, y_filtered, x_outliers, y_outliers = remove_outliers(x, y)
    # Scale the variance to a range suitable for marker sizes (e.g., between 5 and 30)
    min_marker_size = 5
    max_marker_size = 30
    min_variance = min(variance_across_categories.values())
    max_variance = max(variance_across_categories.values())
    variance_range = max_variance - min_variance

    def scale_variance_to_size(variance):
        # Linear mapping from variance to marker size; if all variances are equal,
        # fall back to the smallest marker to avoid division by zero
        if variance_range == 0:
            return min_marker_size
        return min_marker_size + (variance - min_variance) * (max_marker_size - min_marker_size) / variance_range

    # Function to get the variance for a given model name
    def get_variance_for_model(model_name):
        return variance_across_categories.get(model_name, 0)  # Default to 0 if model not found
    # Match filtered/outlier points back to their model names by x value
    filtered_markers = np.array(model_names)[np.isin(x, x_filtered)]
    outlier_markers = np.array(model_names)[np.isin(x, x_outliers)]
# Get marker sizes and variances for the filtered data
filtered_variances = [get_variance_for_model(mname) for mname in filtered_markers]
marker_sizes_filtered = [scale_variance_to_size(var) for var in filtered_variances]
# Get marker sizes and variances for the outlier data
outlier_variances = [get_variance_for_model(mname) for mname in outlier_markers]
marker_sizes_outliers = [scale_variance_to_size(var) for var in outlier_variances]
# Assign symbols to the model types
# https://docs.bokeh.org/en/latest/docs/examples/basic/scatters/markers.html
_model_type2symbol = {
'chat': 'circle',
'pretrained': 'triangle',
'ensemble': 'star',
}
    def model_type2symbol(model_type):
        return _model_type2symbol.get(model_type, 'diamond')
# Assign symbols to the filtered data points
filtered_symbols = [model_type2symbol(ldb_records[mname]['Type']) for mname in filtered_markers]
# Assign symbols to the outlier data points
outlier_symbols = [model_type2symbol(ldb_records[mname]['Type']) for mname in outlier_markers]
    # Define a color palette with enough colors; the stride is clamped to at least 1
    # so that slicing still works when there are more models than palette entries
    stride = max(1, len(Turbo256) // len(model_names))
    color_palette = list(Turbo256[::stride])
random.shuffle(color_palette)
# Create unique colors for filtered data
filtered_colors = [color_palette[i % len(color_palette)] for i in range(len(x_filtered))]
# Create unique colors for outliers
outlier_colors = [color_palette[(i + len(x_filtered)) % len(color_palette)] for i in range(len(x_outliers))]
# Create ColumnDataSource with filtered data
    source_filtered = ColumnDataSource(data={
        'x': x_filtered,
        'y': y_filtered,
        'sizes': np.array(sizes)[np.isin(x, x_filtered)],  # Keep original model sizes
        'marker_sizes': marker_sizes_filtered,  # Marker sizes based on variance
        'model_names': filtered_markers,
        'variance': filtered_variances,  # Variance across categories
        'color': filtered_colors,
        'symbol': filtered_symbols
    })
# Create ColumnDataSource with outlier data
    source_outliers = ColumnDataSource(data={
        'x': x_outliers,
        'y': y_outliers,
        'sizes': np.array(sizes)[np.isin(x, x_outliers)],  # Keep original model sizes
        'marker_sizes': marker_sizes_outliers,  # Marker sizes based on variance
        'model_names': outlier_markers,
        'variance': outlier_variances,  # Variance across categories
        'color': outlier_colors,
        'symbol': outlier_symbols
    })
# Create a figure for the category
p = figure(
sizing_mode="stretch_width",
height=800,
#title=f"{category} vs Model Size vs Variance Across Categories",
tools="pan,wheel_zoom,box_zoom,save,reset",
active_scroll="wheel_zoom",
tooltips=[
("Model", "@model_names"),
("Model Size (B parameters)", "@sizes"),
("Variance", "@variance"), # Added variance to the tooltip
("Performance", "@y"),
]
)
# Plot filtered data with unique colors and scaled marker sizes
p.scatter('x', 'y', size='marker_sizes', source=source_filtered, fill_alpha=0.6, color='color', marker='symbol')
# Plot outliers with unique colors and scaled marker sizes
p.scatter('x', 'y', size='marker_sizes', source=source_outliers, fill_alpha=0.6, color='color', marker='symbol')
# Fit and plot a curve
x_fit, y_fit = fit_curve(x_filtered, y_filtered, degree=1) # You can adjust the degree of the polynomial
p.line(x_fit, y_fit, line_color='gray', line_width=2, line_dash='dashed')
# Add labels (with slight offset to avoid overlap)
p.add_layout(LabelSet(
x='x',
y='y',
text='model_names',
source=source_filtered,
x_offset=5,
y_offset=8,
text_font_size=f"{FONTSIZE-2}pt",
text_color='black',
))
p.add_layout(LabelSet(
x='x',
y='y',
text='model_names',
source=source_outliers,
x_offset=5,
y_offset=8,
text_font_size=f"{FONTSIZE-2}pt",
text_color='black',
))
# Set axis labels
p.xaxis.axis_label = 'Model Size (B parameters)'
p.yaxis.axis_label = f'{category}'
# Set axis label font sizes
p.xaxis.axis_label_text_font_size = f"{FONTSIZE}pt" # Set font size for x-axis label
p.yaxis.axis_label_text_font_size = f"{FONTSIZE}pt" # Set font size for y-axis label
# Increase tick label font sizes
p.xaxis.major_label_text_font_size = f"{FONTSIZE}pt" # Increase x-axis tick label size
p.yaxis.major_label_text_font_size = f"{FONTSIZE}pt" # Increase y-axis tick label size
p.x_scale = LogScale()
p.xaxis.ticker = tickers.LogTicker()
p.xaxis.axis_label_text_font_style = "normal"
p.yaxis.axis_label_text_font_style = "normal"
return p
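# Minimal smoke test with synthetic data (a sketch only, not part of the original
# BenCzechMark pipeline). All model names, sizes, scores, variances and the output
# file name below are made up; with four models the largest one typically falls
# outside the IQR bounds, so both the filtered and the outlier branches are exercised.
if __name__ == "__main__":
    from bokeh.plotting import output_file, save

    model_names = ["model-a", "model-b", "model-c", "model-d"]
    sizes = [1.1, 7.0, 13.0, 70.0]      # model sizes in billions of parameters
    scores = [0.42, 0.55, 0.61, 0.73]   # per-category win scores (synthetic)
    variance_across_categories = {       # per-model variance across categories (synthetic)
        "model-a": 0.010,
        "model-b": 0.020,
        "model-c": 0.015,
        "model-d": 0.005,
    }
    ldb_records = {                      # stand-in for the output of get_ldb_records()
        "model-a": {"Type": "pretrained"},
        "model-b": {"Type": "chat"},
        "model-c": {"Type": "chat"},
        "model-d": {"Type": "ensemble"},
    }

    p = create_scatter_plot_with_curve_with_variances_named(
        category="Average win score",
        variance_across_categories=variance_across_categories,
        x=sizes,
        y=scores,
        sizes=sizes,
        model_names=model_names,
        ldb_records=ldb_records,
    )
    output_file("winscore_demo.html")
    save(p)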
# EOF