Spaces:

CZLC
/

BenCzechMark

Running

File size: 7,952 Bytes

# -*- coding: utf-8 -*-

# author: Martin Fajčík
# modified by: Jan Doležal

import csv
import random
import numpy as np
from bokeh.plotting import figure
from bokeh.models import LabelSet, LogScale, ColumnDataSource, tickers
from bokeh.palettes import Turbo256  # A color palette with enough colors

# Function to fit a polynomial curve and return the x and y values of the fitted curve
def fit_curve(x, y, degree=1, num_points=100):
    """Fit a least-squares polynomial to the points and sample the fitted curve.

    Args:
        x: 1-D sequence of x coordinates.
        y: 1-D sequence of y coordinates, same length as ``x``.
        degree: Degree of the polynomial passed to ``np.polyfit``.
        num_points: Number of evenly spaced samples taken between the
            smallest and the largest value of ``x`` (defaults to 100).

    Returns:
        A tuple ``(x_fit, y_fit)`` of NumPy arrays with ``num_points``
        elements each: the sample positions and the polynomial evaluated
        at those positions.
    """
    x_values = np.asarray(x)
    y_values = np.asarray(y)

    # Fit a polynomial of the given degree and wrap it in a callable.
    poly = np.poly1d(np.polyfit(x_values, y_values, degree))

    # Sample the curve only over the range actually covered by the data.
    x_fit = np.linspace(np.min(x_values), np.max(x_values), num_points)
    return x_fit, poly(x_fit)

# Function to detect and remove outliers using the IQR method
def remove_outliers(x, y, iqr_factor=1.5):
    """Split paired points into inliers and outliers using the IQR rule.

    A point is kept as an inlier only when BOTH its x and its y coordinate
    lie within ``[Q1 - iqr_factor * IQR, Q3 + iqr_factor * IQR]`` of the
    respective coordinate; every other point is reported as an outlier.

    Args:
        x: 1-D sequence of x coordinates.
        y: 1-D sequence of y coordinates, same length as ``x``.
        iqr_factor: Multiplier of the inter-quartile range that defines the
            fences (defaults to the conventional 1.5).

    Returns:
        A tuple ``(x_inliers, y_inliers, x_outliers, y_outliers)`` of NumPy
        arrays. The original order of the points is preserved inside each
        group.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Percentiles are undefined for empty input; there is nothing to split.
    if x.size == 0 and y.size == 0:
        return x.copy(), y.copy(), x.copy(), y.copy()

    def within_fences(values):
        # Q1 (25th percentile) and Q3 (75th percentile) of one coordinate.
        q1, q3 = np.percentile(values, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - iqr_factor * iqr
        upper_bound = q3 + iqr_factor * iqr
        return (values >= lower_bound) & (values <= upper_bound)

    # A point must be inside the fences on both axes to count as an inlier.
    mask = within_fences(x) & within_fences(y)

    return x[mask], y[mask], x[~mask], y[~mask]

def get_ldb_records(name_map, csv_file_path):
    """Load leaderboard rows from a CSV file, keyed by model title.

    Args:
        name_map: Mapping whose values are the accepted model titles.
        csv_file_path: Path to a CSV file with a header row that contains
            at least a ``Model`` column.

    Returns:
        A dict mapping each model title found in the ``Model`` column to
        its row (as produced by ``csv.DictReader``). If a title occurs in
        several rows, the last row wins.

    Raises:
        KeyError: If a row's ``Model`` value is not among the values of
            ``name_map``.
    """
    known_titles = set(name_map.values())

    ldb_records = {}
    # Explicit UTF-8 keeps non-ASCII model names independent of the platform's
    # default encoding; newline='' is what the csv module expects so that
    # newlines embedded in quoted fields are handled correctly.
    with open(csv_file_path, mode='r', encoding='utf-8', newline='') as file:
        for row in csv.DictReader(file):
            model_title = row['Model']
            if model_title not in known_titles:
                raise KeyError(f"Model {model_title!r} from {csv_file_path!r} is not present in name_map")
            ldb_records[model_title] = row

    return ldb_records

def create_scatter_plot_with_curve_with_variances_named(category, variance_across_categories, x, y, sizes, model_names, ldb_records):
    """Build a labelled Bokeh scatter plot of ``y`` against ``x`` with a trend line.

    Every model is drawn as one marker: its shape is chosen from the model's
    ``'Type'`` in ``ldb_records``, its size is scaled from the model's entry in
    ``variance_across_categories``, and it is annotated with the model name.
    Points classified as outliers by ``remove_outliers`` are still drawn but
    are excluded from the dashed degree-1 fit.

    Args:
        category: Text used as the y-axis label.
        variance_across_categories: Mapping of model name to variance. Models
            missing from the mapping are treated as having variance 0.
        x: Sequence of x coordinates, one per model; plotted on a log-scaled
            axis labelled "Model Size (B parameters)".
        y: Sequence of y coordinates, one per model (shown as "Performance"
            in the tooltip).
        sizes: Sequence of per-model values shown in the tooltip as
            "Model Size (B parameters)".
        model_names: Sequence of model names, aligned with ``x`` and ``y``.
        ldb_records: Mapping of model name to a record providing a ``'Type'``
            key.

    Returns:
        The configured Bokeh figure.

    Raises:
        KeyError: If a model name is missing from ``ldb_records``.
    """
    FONTSIZE = 12

    # Range of marker sizes the variance is mapped onto.
    min_marker_size = 5
    max_marker_size = 30

    x_all = np.asarray(x)
    y_all = np.asarray(y)
    sizes_all = np.asarray(sizes)
    names_all = np.asarray(model_names)

    # Remove outliers
    x_inliers, y_inliers, _, _ = remove_outliers(x, y)

    # Rebuild the positional inlier mask from the (x, y) pairs. Matching on x
    # alone would misclassify models that share an x value when only some of
    # them are outliers, leaving the data-source columns with different
    # lengths. The inlier decision depends only on the pair's values, so pair
    # membership reproduces the mask exactly.
    inlier_pairs = set(zip(np.asarray(x_inliers).tolist(), np.asarray(y_inliers).tolist()))
    inlier_mask = np.array(
        [pair in inlier_pairs for pair in zip(x_all.tolist(), y_all.tolist())],
        dtype=bool,
    )
    outlier_mask = ~inlier_mask

    x_filtered = x_all[inlier_mask]
    y_filtered = y_all[inlier_mask]

    # Variance range is loop-invariant, so compute it once up front.
    known_variances = list(variance_across_categories.values())
    variance_min = min(known_variances, default=0)
    variance_max = max(known_variances, default=0)
    variance_span = variance_max - variance_min
    marker_span = max_marker_size - min_marker_size

    def scale_variance_to_size(variance):
        # Linear mapping of variance onto [min_marker_size, max_marker_size].
        if variance_span == 0:
            # All variances equal (or none known): no meaningful scale exists.
            return (min_marker_size + max_marker_size) / 2
        size = min_marker_size + (variance - variance_min) * marker_span / variance_span
        # Models defaulting to variance 0 may fall outside the known range;
        # clamp so the marker size never drops below the minimum (or negative).
        return min(max(size, min_marker_size), max_marker_size)

    # Assign symbols to the model types
    # https://docs.bokeh.org/en/latest/docs/examples/basic/scatters/markers.html
    _model_type2symbol = {
        'chat': 'circle',
        'pretrained': 'triangle',
        'ensemble': 'star',
    }

    def model_type2symbol(model_type):
        return _model_type2symbol.get(model_type, 'diamond')

    # Spread the colors over the palette; the stride must be at least 1 even
    # for zero models or for more models than palette entries.
    stride = max(1, len(Turbo256) // max(1, len(names_all)))
    color_palette = list(Turbo256[::stride])
    random.shuffle(color_palette)

    def build_source(mask, color_offset):
        # Collect every column for one subset of points through the same mask
        # so that all columns are guaranteed to have the same length.
        names = names_all[mask]
        variances = [variance_across_categories.get(mname, 0) for mname in names]  # Default to 0 if model not found
        return ColumnDataSource(data={
            'x': x_all[mask],
            'y': y_all[mask],
            'sizes': sizes_all[mask],  # Keep original model sizes
            'marker_sizes': [scale_variance_to_size(var) for var in variances],
            'model_names': names,
            'variance': variances,
            'color': [color_palette[(i + color_offset) % len(color_palette)] for i in range(len(names))],
            'symbol': [model_type2symbol(ldb_records[mname]['Type']) for mname in names],
        })

    # Outlier colors continue in the palette where the filtered points stop.
    source_filtered = build_source(inlier_mask, 0)
    source_outliers = build_source(outlier_mask, len(x_filtered))

    # Create a figure for the category
    p = figure(
        sizing_mode="stretch_width",
        height=800,
        #title=f"{category} vs Model Size vs Variance Across Categories",
        tools="pan,wheel_zoom,box_zoom,save,reset",
        active_scroll="wheel_zoom",
        tooltips=[
            ("Model", "@model_names"),
            ("Model Size (B parameters)", "@sizes"),
            ("Variance", "@variance"),
            ("Performance", "@y"),
        ]
    )

    # Plot filtered data and outliers with unique colors and scaled marker sizes
    for source in (source_filtered, source_outliers):
        p.scatter('x', 'y', size='marker_sizes', source=source, fill_alpha=0.6, color='color', marker='symbol')

    # Fit and plot a curve; a line needs at least two points to be meaningful.
    if len(x_filtered) >= 2:
        x_fit, y_fit = fit_curve(x_filtered, y_filtered, degree=1)  # You can adjust the degree of the polynomial
        p.line(x_fit, y_fit, line_color='gray', line_width=2, line_dash='dashed')

    # Add labels (with slight offset to avoid overlap)
    for source in (source_filtered, source_outliers):
        p.add_layout(LabelSet(
            x='x',
            y='y',
            text='model_names',
            source=source,
            x_offset=5,
            y_offset=8,
            text_font_size=f"{FONTSIZE-2}pt",
            text_color='black',
        ))

    # Set axis labels
    p.xaxis.axis_label = 'Model Size (B parameters)'
    p.yaxis.axis_label = f'{category}'

    # Set axis label font sizes
    p.xaxis.axis_label_text_font_size = f"{FONTSIZE}pt"
    p.yaxis.axis_label_text_font_size = f"{FONTSIZE}pt"

    # Increase tick label font sizes
    p.xaxis.major_label_text_font_size = f"{FONTSIZE}pt"
    p.yaxis.major_label_text_font_size = f"{FONTSIZE}pt"

    p.x_scale = LogScale()

    p.xaxis.ticker = tickers.LogTicker()
    p.xaxis.axis_label_text_font_style = "normal"
    p.yaxis.axis_label_text_font_style = "normal"

    return p

# EOF