Spaces: open-llm-leaderboard/open_llm_leaderboard

Running on CPU Upgrade

App Files Community

1092

parms-count

#887

by alozowski HF staff - opened Aug 20, 2024

base: refs/heads/main ← from: refs/pr/887

Discussion Files changed

+12

-166

Files changed (3) hide show

app.py +0 -8
src/submission/check_validity.py +12 -6
src/tools/plots.py +0 -152

app.py CHANGED Viewed

@@ -17,9 +17,7 @@ from src.display.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
-    FAQ_TEXT,
     INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
     TITLE,
 )
 from src.display.css_html_js import custom_css
@@ -48,7 +46,6 @@ from src.envs import (
 )
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
-from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
 from src.voting.vote_system import VoteManager, run_scheduler
 # Configure logging
@@ -169,11 +166,6 @@ LEADERBOARD_DF, eval_queue_dfs = init_space()
 finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
-# Data processing for plots now only on demand in the respective Gradio tab
-def load_and_create_plots():
-    plot_df = create_plot_df(create_scores_df(LEADERBOARD_DF))
-    return plot_df
 # Function to check if a user is logged in
 def check_login(profile: gr.OAuthProfile | None) -> bool:
     if profile is None:

     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
     TITLE,
 )
 from src.display.css_html_js import custom_css
 )
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 from src.voting.vote_system import VoteManager, run_scheduler
 # Configure logging
 finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
 # Function to check if a user is logged in
 def check_login(profile: gr.OAuthProfile | None) -> bool:
     if profile is None:

src/submission/check_validity.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import json
 import os
 import re
 from collections import defaultdict
 from datetime import datetime, timedelta, timezone
@@ -75,28 +76,33 @@ def is_model_on_hub(
         return False, f"was not found or misconfigured on the hub! Error raised was {e.args[0]}", None
-def get_model_size(model_info: ModelInfo, precision: str):
     size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
     safetensors = None
     try:
         safetensors = get_safetensors_metadata(model_info.id)
     except Exception as e:
-        print(e)
     if safetensors is not None:
         model_size = round(sum(safetensors.parameter_count.values()) / 1e9, 3)
     else:
         try:
             size_match = re.search(size_pattern, model_info.id.lower())
-            model_size = size_match.group(0)
-            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
         except AttributeError:
-            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
     size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.id.lower()) else 1
     model_size = size_factor * model_size
-    return model_size
 def get_model_arch(model_info: ModelInfo):
     return model_info.config.get("architectures", "Unknown")

 import json
 import os
 import re
+import logging
 from collections import defaultdict
 from datetime import datetime, timedelta, timezone
         return False, f"was not found or misconfigured on the hub! Error raised was {e.args[0]}", None
+def get_model_size(model_info: ModelInfo, precision: str) -> float:
     size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
     safetensors = None
     try:
         safetensors = get_safetensors_metadata(model_info.id)
     except Exception as e:
+        logging.error(f"Failed to get safetensors metadata for model {model_info.id}: {str(e)}")
     if safetensors is not None:
         model_size = round(sum(safetensors.parameter_count.values()) / 1e9, 3)
     else:
         try:
             size_match = re.search(size_pattern, model_info.id.lower())
+            if size_match:
+                model_size = size_match.group(0)
+                model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
+            else:
+                return -1  # Unknown model size
         except AttributeError:
+            logging.warning(f"Unable to parse model size from ID: {model_info.id}")
+            return -1  # Unknown model size
     size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.id.lower()) else 1
     model_size = size_factor * model_size
+    return model_size
 def get_model_arch(model_info: ModelInfo):
     return model_info.config.get("architectures", "Unknown")

src/tools/plots.py DELETED Viewed

@@ -1,152 +0,0 @@
-import numpy as np
-import pandas as pd
-import plotly.express as px
-from plotly.graph_objs import Figure
-from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
-# from src.display.utils import human_baseline_row as HUMAN_BASELINE
-from src.leaderboard.filter_models import FLAGGED_MODELS
-def create_scores_df(results_df: list[dict]) -> pd.DataFrame:
-    """
-    Generates a DataFrame containing the maximum scores until each date.
-    :param results_df: A DataFrame containing result information including metric scores and dates.
-    :return: A new DataFrame containing the maximum scores until each date for every metric.
-    """
-    # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
-    results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
-    results_df.sort_values(by="date", inplace=True)
-    # Step 2: Initialize the scores dictionary
-    scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}
-    # Step 3: Iterate over the rows of the DataFrame and update the scores dictionary
-    for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]:
-        current_max = 0
-        last_date = ""
-        column = task.col_name
-        for _, row in results_df.iterrows():
-            current_model = row[AutoEvalColumn.fullname.name]
-            # We ignore models that are flagged/no longer on the hub/not finished
-            to_ignore = (
-                not row[AutoEvalColumn.still_on_hub.name]
-                or not row[AutoEvalColumn.not_flagged.name]
-                or current_model in FLAGGED_MODELS
-            )
-            if to_ignore:
-                continue
-            current_date = row[AutoEvalColumn.date.name]
-            current_score = row[task.col_name]
-            if current_score > current_max:
-                if current_date == last_date and len(scores[column]) > 0:
-                    scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
-                else:
-                    scores[column].append({"model": current_model, "date": current_date, "score": current_score})
-                current_max = current_score
-                last_date = current_date
-    # Step 4: Return all dictionaries as DataFrames
-    return {k: pd.DataFrame(v) for k, v in scores.items()}
-def create_plot_df(scores_df: dict[str : pd.DataFrame]) -> pd.DataFrame:
-    """
-    Transforms the scores DataFrame into a new format suitable for plotting.
-    :param scores_df: A DataFrame containing metric scores and dates.
-    :return: A new DataFrame reshaped for plotting purposes.
-    """
-    # Initialize the list to store DataFrames
-    dfs = []
-    # Iterate over the cols and create a new DataFrame for each column
-    for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
-        d = scores_df[col].reset_index(drop=True)
-        d["task"] = col
-        dfs.append(d)
-    # Concatenate all the created DataFrames
-    concat_df = pd.concat(dfs, ignore_index=True)
-    # # Sort values by 'date'
-    # concat_df.sort_values(by="date", inplace=True)
-    # concat_df.reset_index(drop=True, inplace=True)
-    # return concat_df
-def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) -> Figure:
-    """
-    Create a Plotly figure object with lines representing different metrics
-    and horizontal dotted lines representing human baselines.
-    :param df: The DataFrame containing the metric values, names, and dates.
-    :param metrics: A list of strings representing the names of the metrics
-                    to be included in the plot.
-    :param title: A string representing the title of the plot.
-    :return: A Plotly figure object with lines representing metrics and
-             horizontal dotted lines representing human baselines.
-    """
-    # Filter the DataFrame based on the specified metrics
-    df = df[df["task"].isin(metrics)]
-    # Filter the human baselines based on the specified metrics
-    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
-    # Create a line figure using plotly express with specified markers and custom data
-    fig = px.line(
-        df,
-        x="date",
-        y="score",
-        color="task",
-        markers=True,
-        custom_data=["task", "score", "model"],
-        title=title,
-    )
-    # Update hovertemplate for better hover interaction experience
-    fig.update_traces(
-        hovertemplate="<br>".join(
-            [
-                "Model Name: %{customdata[2]}",
-                "Metric Name: %{customdata[0]}",
-                "Date: %{x}",
-                "Metric Value: %{y}",
-            ]
-        )
-    )
-    # Update the range of the y-axis
-    fig.update_layout(yaxis_range=[0, 100])
-    # Create a dictionary to hold the color mapping for each metric
-    metric_color_mapping = {}
-    # Map each metric name to its color in the figure
-    for trace in fig.data:
-        metric_color_mapping[trace.name] = trace.line.color
-    # Iterate over filtered human baselines and add horizontal lines to the figure
-    for metric, value in filtered_human_baselines.items():
-        color = metric_color_mapping.get(metric, "blue")  # Retrieve color from mapping; default to blue if not found
-        location = "top left" if metric == "HellaSwag" else "bottom left"  # Set annotation position
-        # Add horizontal line with matched color and positioned annotation
-        fig.add_hline(
-            y=value,
-            line_dash="dot",
-            annotation_text=f"{metric} human baseline",
-            annotation_position=location,
-            annotation_font_size=10,
-            annotation_font_color=color,
-            line_color=color,
-        )
-    return fig
-# Example Usage:
-# human_baselines dictionary is defined.
-# chart = create_metric_plot_obj(scores_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], human_baselines, "Graph Title")