update code and result files
- README.md +2 -2
- app.py +42 -17
- results/auto_arima/config.json +5 -0
- results/auto_ets/config.json +5 -0
- results/auto_theta/config.json +5 -0
- results/chronos-small/config.json +5 -0
- results/chronos_base/config.json +5 -0
- results/chronos_large/config.json +5 -0
- results/crossformer/config.json +5 -0
- results/d_linear/config.json +5 -0
- results/deepar/config.json +5 -0
- results/i_transformer/config.json +5 -0
- results/moirai_1.1_R_base_no_leak/config.json +5 -0
- results/moirai_1.1_R_large_no_leak/config.json +5 -0
- results/moirai_1.1_R_small_no_leak/config.json +5 -0
- results/n_beats/config.json +5 -0
- results/naive/config.json +5 -0
- results/patch_tst/config.json +5 -0
- results/seasonal_naive/config.json +5 -0
- results/tft/config.json +5 -0
- results/tide/config.json +5 -0
- results/timesfm/config.json +5 -0
- results/visionts/config.json +5 -0
- src/display/utils.py +24 -27
- src/envs.py +2 -1
- src/leaderboard/read_evals.py +83 -2
- src/populate.py +18 -3
- src/utils.py +1 -0
README.md
CHANGED
@@ -7,7 +7,7 @@ sdk: gradio
 app_file: app.py
 pinned: true
 license: apache-2.0
-short_description: 'GIFT-Eval: A Benchmark for General Time Series Forecasting
+short_description: 'GIFT-Eval: A Benchmark for General Time Series Forecasting'
 sdk_version: 4.44.0
 ---
 
@@ -43,4 +43,4 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
 You'll find
 - the main table' columns names and properties in `src/display/utils.py`
 - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
-- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py
CHANGED
@@ -1,4 +1,5 @@
 import gradio as gr
+import ipdb
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -15,17 +16,16 @@ from src.about import (
 from src.display.css_html_js import custom_css
 from src.display.utils import (
     BENCHMARK_COLS,
-    COLS,
     EVAL_COLS,
     EVAL_TYPES,
-
+    ModelInfoColumn,
     ModelType,
     fields,
     WeightType,
     Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_model_info_df, get_merged_df
 from src.submission.submit import add_new_eval
 from src.utils import norm_sNavie, pivot_df
 # import ipdb
@@ -83,6 +83,16 @@ term_length_df = pivot_df('results/grouped_results_by_term_length.csv', tab_name
 print(f'Term length dataframe is {term_length_df}')
 variate_type_df = pivot_df('results/grouped_results_by_univariate.csv', tab_name='univariate')
 print(f'Variate type dataframe is {variate_type_df}')
+model_info_df = get_model_info_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
+
+# domain_df = get_merged_df(domain_df, model_info_df)
+# print('Merged domain df: ', domain_df)
+# freq_df = get_merged_df(freq_df, model_info_df)
+# print('Merged freq df: ', freq_df)
+# term_length_df = get_merged_df(term_length_df, model_info_df)
+# print('Merged term length df: ', term_length_df)
+# variate_type_df = get_merged_df(variate_type_df, model_info_df)
+# print('Merged variate type df: ', variate_type_df)
 
 # (
 #     finished_eval_queue_df,
@@ -91,20 +101,32 @@ print(f'Variate type dataframe is {variate_type_df}')
 # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
+def init_leaderboard(ori_dataframe, model_info_df):
+    if ori_dataframe is None or ori_dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+    model_info_col_list = [c.name for c in fields(ModelInfoColumn) if c.displayed_by_default if c.name not in ['#Params (B)', 'available_on_hub', 'hub', 'Model sha','Hub License']]
+    default_selection_list = list(ori_dataframe.columns) + model_info_col_list
+    print('default_selection_list: ', default_selection_list)
+    # ipdb.set_trace()
+    # default_selection_list = [col for col in default_selection_list if col not in ['#Params (B)', 'available_on_hub', 'hub', 'Model sha','Hub License']]
+    merged_df = get_merged_df(ori_dataframe, model_info_df)
+    new_cols = ['T'] + [col for col in merged_df.columns if col != 'T']
+    merged_df = merged_df[new_cols]
+    print('Merged df: ', merged_df)
     return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(
+        value=merged_df,
+        # datatype=[c.type for c in fields(ModelInfoColumn)],
         select_columns=SelectColumns(
-
-            default_selection=list(dataframe.columns),
-
-
+            default_selection=default_selection_list,
+            # default_selection=[c.name for c in fields(ModelInfoColumn) if
+            #                    c.displayed_by_default and c.name not in ['params', 'available_on_hub', 'hub',
+            #                    'Model sha', 'Hub License']],
+            # default_selection=list(dataframe.columns),
+            cant_deselect=[c.name for c in fields(ModelInfoColumn) if c.never_hidden],
+            label="Select Columns to Display:",
             # How to uncheck??
         ),
-
+        hide_columns=[c.name for c in fields(ModelInfoColumn) if c.hidden],
        search_columns=['model'],
        # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        # filter_columns=[
@@ -121,7 +143,10 @@ def init_leaderboard(dataframe):
        #         AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=False
        #     ),
        # ],
-
+        filter_columns=[
+            ColumnFilter(ModelInfoColumn.model_type.name, type="checkboxgroup", label="Model types"),
+        ],
+        # bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
 
@@ -133,19 +158,19 @@ with demo:
 
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 By Domain", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(domain_df)
+            leaderboard = init_leaderboard(domain_df, model_info_df)
            print(f"FINAL Domain LEADERBOARD 1 {domain_df}")
 
        with gr.TabItem("🏅 By Frequency", elem_id="llm-benchmark-tab-table", id=1):
-            leaderboard = init_leaderboard(freq_df)
+            leaderboard = init_leaderboard(freq_df, model_info_df)
            print(f"FINAL Frequency LEADERBOARD 1 {freq_df}")
 
        with gr.TabItem("🏅 By term length", elem_id="llm-benchmark-tab-table", id=2):
-            leaderboard = init_leaderboard(term_length_df)
+            leaderboard = init_leaderboard(term_length_df, model_info_df)
            print(f"FINAL term length LEADERBOARD 1 {term_length_df}")
 
        with gr.TabItem("🏅 By variate type", elem_id="llm-benchmark-tab-table", id=3):
-            leaderboard = init_leaderboard(variate_type_df)
+            leaderboard = init_leaderboard(variate_type_df, model_info_df)
            print(f"FINAL LEADERBOARD 1 {variate_type_df}")
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
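Net effect of the init_leaderboard rewrite: the merged table puts the type-symbol column 'T' first, and the default column selection is the tab's original metric columns plus the displayed-by-default model-info columns (minus a hand-picked exclusion list). A minimal sketch of just that column handling with a toy pandas frame, where the column names come from the diff above but all rows are made up:

import pandas as pd

# Toy merged frame, as it might look after get_merged_df(ori_dataframe, model_info_df).
merged_df = pd.DataFrame({
    "model": ["naive", "chronos_large"],
    "Average": [1.000, 0.720],
    "T": ["🟣", "🟢"],
    "Type": ["🟣 statistical", "🟢 pretrained"],
})

# Move the type-symbol column to the front, as init_leaderboard does.
new_cols = ["T"] + [col for col in merged_df.columns if col != "T"]
merged_df = merged_df[new_cols]

# Default selection: the tab's own columns plus the kept model-info columns.
ori_columns = ["model", "Average"]      # stand-in for list(ori_dataframe.columns)
model_info_cols = ["T", "Type"]         # stand-in for the filtered ModelInfoColumn names
default_selection_list = ori_columns + model_info_cols

print(merged_df.columns.tolist())       # ['T', 'model', 'Average', 'Type']
print(default_selection_list)           # ['model', 'Average', 'T', 'Type']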
results/auto_arima/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "auto_arima",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/auto_ets/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "auto_ets",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/auto_theta/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "auto_theta",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/chronos-small/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "chronos-small",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/chronos_base/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "chronos_base",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/chronos_large/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "chronos_large",
+  "model_type": "pretrained",
+  "model_dtype": "float32"
+}

results/crossformer/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "crossformer",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/d_linear/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "d_linear",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/deepar/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "deepar",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/i_transformer/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "i_transformer",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/moirai_1.1_R_base_no_leak/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "moirai_1.1_R_base_no_leak",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/moirai_1.1_R_large_no_leak/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "moirai_1.1_R_large_no_leak",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/moirai_1.1_R_small_no_leak/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "moirai_1.1_R_small_no_leak",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/n_beats/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "n_beats",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/naive/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "naive",
+  "model_type": "statistical",
+  "model_dtype": "float32"
+}

results/patch_tst/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "patch_tst",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/seasonal_naive/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "seasonal_naive",
+  "model_type": "statistical",
+  "model_dtype": "float32"
+}

results/tft/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "tft",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/tide/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "tide",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/timesfm/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "timesfm",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/visionts/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "visionts",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}
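All of the new config.json files share the same three keys: model, model_type, and model_dtype. A hedged sketch of how one of these files could be loaded and mapped to the display symbol assigned by the ModelType enum in src/display/utils.py below; the helper function and the mapping dict here are illustrative, not part of the commit:

import json

# Symbols as declared by the ModelType enum added in src/display/utils.py.
TYPE_SYMBOLS = {
    "pretrained": "🟢",
    "fine-tuned": "🔶",
    "deep-learning": "🔷",
    "statistical": "🟣",
}

def load_model_config(path):
    """Read one results/<model>/config.json and attach its display symbol."""
    with open(path) as fp:
        cfg = json.load(fp)
    cfg["symbol"] = TYPE_SYMBOLS.get(cfg.get("model_type", ""), "?")
    return cfg

# e.g. load_model_config("results/naive/config.json") would give
# {"model": "naive", "model_type": "statistical", "model_dtype": "float32", "symbol": "🟣"}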
src/display/utils.py
CHANGED
@@ -21,27 +21,23 @@ class ColumnContent:
     never_hidden: bool = False
 
 ## Leaderboard columns
-
-# Init
-
-
-#Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+model_info_dict = []
+# Init column for the model properties
+model_info_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+model_info_dict.append(["model", ColumnContent, ColumnContent("model", "markdown", True, never_hidden=True)])
 # Model information
-
-
-
-
-
-
-
-
-
+model_info_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+model_info_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+model_info_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+model_info_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+model_info_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+model_info_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+model_info_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+model_info_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+model_info_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
-
+ModelInfoColumn = make_dataclass("ModelInfoColumn", model_info_dict, frozen=True)
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
@@ -62,10 +58,11 @@ class ModelDetails:
 
 
 class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-
-
+    PT = ModelDetails(name="🟢 pretrained", symbol="🟢")
+    FT = ModelDetails(name="🔶 fine-tuned", symbol="🔶")
+    DL = ModelDetails(name="🔷 deep-learning", symbol="🔷")
+    ST = ModelDetails(name="🟣 statistical", symbol="🟣")
+
     Unknown = ModelDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
@@ -77,10 +74,10 @@ class ModelType(Enum):
            return ModelType.FT
        if "pretrained" in type or "🟢" in type:
            return ModelType.PT
-        if "
-            return ModelType.
-        if "
-            return ModelType.
+        if "deep-learning" in type or "🟦" in type:
+            return ModelType.DL
+        if "statistical" in type or "🟣" in type:
+            return ModelType.ST
        return ModelType.Unknown
 
 class WeightType(Enum):
@@ -101,7 +98,7 @@ class Precision(Enum):
        return Precision.Unknown
 
 # Column selection
-
+MODEL_INFO_COLS = [c.name for c in fields(ModelInfoColumn) if not c.hidden]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
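The make_dataclass call is the key pattern in this file: the list of [field_name, type, default] triples becomes a frozen ModelInfoColumn dataclass whose field defaults are ColumnContent instances, so column metadata (display name, type, visibility) can be enumerated later. A self-contained sketch of that pattern, using a simplified stand-in for ColumnContent and the standard dataclasses.fields (the repo's own fields helper is not shown in this diff, so this is only an approximation):

from dataclasses import dataclass, fields, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    # Simplified stand-in for the ColumnContent defined earlier in utils.py.
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

model_info_dict = [
    ["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)],
    ["model", ColumnContent, ColumnContent("model", "markdown", True, never_hidden=True)],
    ["model_type", ColumnContent, ColumnContent("Type", "str", False)],
]
ModelInfoColumn = make_dataclass("ModelInfoColumn", model_info_dict, frozen=True)

# dataclasses.fields() exposes one entry per column; the defaults carry the display metadata.
print([f.default.name for f in fields(ModelInfoColumn)])                             # ['T', 'model', 'Type']
print([f.default.name for f in fields(ModelInfoColumn) if f.default.never_hidden])   # ['T', 'model']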
src/envs.py
CHANGED
@@ -18,7 +18,8 @@ CACHE_PATH=os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+# EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "results")
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
src/leaderboard/read_evals.py
CHANGED
@@ -8,10 +8,48 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import
+from src.display.utils import ModelType, Tasks, Precision, WeightType, ModelInfoColumn
 from src.submission.check_validity import is_model_on_hub
 
 
+@dataclass
+class ModelConfig:
+    """Represents the model configuration of a model"""
+    model: str
+    model_type: ModelType = ModelType.Unknown
+    precision: Precision = Precision.Unknown
+    license: str = "?"
+    likes: int = 0
+    num_params: int = 0
+
+    @classmethod
+    def init_from_json_file(cls, json_filepath):
+        """Inits the result from the specific model result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        # config = data.get("config")
+
+        # Precision
+        precision = Precision.from_str(data.get("model_dtype"))
+        model_type = ModelType.from_str(data.get("model_type", ""))
+        model = data.get("model", "")
+        return cls(model=model, model_type=model_type, precision=precision)
+
+    def to_dict(self):
+        """Converts the model info to a dict compatible with our dataframe display"""
+        data_dict = {
+            "model": self.model,  # not a column, just a save name,
+            ModelInfoColumn.precision.name: self.precision.value.name,
+            ModelInfoColumn.model_type.name: self.model_type.value.name,
+            ModelInfoColumn.model_type_symbol.name: self.model_type.value.symbol,
+            ModelInfoColumn.license.name: self.license,
+            ModelInfoColumn.likes.name: self.likes,
+            ModelInfoColumn.params.name: self.num_params,
+        }
+
+        return data_dict
+
 @dataclass
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
@@ -154,7 +192,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
     return request_file
 
 
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_model_info(results_path: str, requests_path: str) -> list[ModelConfig]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
 
@@ -172,6 +210,49 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
        for file in files:
            model_result_filepaths.append(os.path.join(root, file))
 
+    model_infos = {}
+    for model_result_filepath in model_result_filepaths:
+        # Creation of result
+        model_info = ModelConfig.init_from_json_file(model_result_filepath)
+        # eval_result.update_with_request_file(requests_path)
+
+        # Store results of same eval together
+        model_name = model_info.model
+        model_infos[model_name] = model_info
+        # if eval_name in eval_results.keys():
+        #     eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+        # else:
+        #     eval_results[eval_name] = eval_result
+
+    results = []
+    for v in model_infos.values():
+        try:
+            v.to_dict()  # we test if the dict version is complete
+            results.append(v)
+        except KeyError:  # not all eval values present
+            continue
+
+    return results
+
+
+def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+    """From the path of the results folder root, extract all needed info for results"""
+    model_result_filepaths = []
+
+    for root, _, files in os.walk(results_path):
+        # We should only have json files in model results
+        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+            continue
+
+        # # Sort the files by date
+        # try:
+        #     files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        # except dateutil.parser._parser.ParserError:
+        #     files = [files[-1]]
+
+        for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
+
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
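get_model_info mirrors get_raw_eval_results: walk the results tree, skip any directory that contains non-JSON files, parse each remaining file, and keep one record per model name. A rough standalone equivalent of that scan, assuming the results/<model>/config.json layout added in this commit:

import json
import os

def collect_model_records(results_path):
    """Walk results/, parse every JSON config, keep the last record seen per model."""
    records = {}
    for root, _, files in os.walk(results_path):
        # Same filter as get_model_info: only look at directories that hold JSON files.
        if len(files) == 0 or any(not f.endswith(".json") for f in files):
            continue
        for name in files:
            with open(os.path.join(root, name)) as fp:
                data = json.load(fp)
            records[data.get("model", name)] = data
    return list(records.values())

# e.g. collect_model_records("results") -> one dict per model directory, such as
# {"model": "tft", "model_type": "deep-learning", "model_dtype": "float32"}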
src/populate.py
CHANGED
@@ -4,12 +4,27 @@ import os
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import
-from src.leaderboard.read_evals import
-
+from src.display.utils import EvalQueueColumn
+from src.leaderboard.read_evals import get_model_info
+import ipdb
+
+def get_model_info_df(results_path: str, requests_path: str, cols: list=[], benchmark_cols: list=[]) -> pd.DataFrame:
+    """Creates a dataframe from all the individual experiment results"""
+    raw_data = get_model_info(results_path, requests_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+    print(f"The raw data is {all_data_json}")
+    df = pd.DataFrame.from_records(all_data_json)
+    print(f"DF for Model Info ********** {df}")
+    return df
+
+def get_merged_df(result_df: pd.DataFrame, model_info_df: pd.DataFrame) -> pd.DataFrame:
+    """Merges the model info dataframe with the results dataframe"""
+    merged_df = pd.merge(model_info_df, result_df, on='model', how='inner')
+    return merged_df
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
+    raw_data = get_raw_eval_results(results_path, requests_path)
     # raw_data = get_raw_eval_results(results_path, requests_path)
     # print('results_path:', results_path)
     # all_data_json = [v.to_dict() for v in raw_data]
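Taken together, get_model_info_df flattens the per-model records into a dataframe with pd.DataFrame.from_records, and get_merged_df inner-joins it with a results table on 'model', so a model with scores but no config.json drops out of the displayed table. A toy illustration of that behaviour; all rows and numbers here are made up:

import pandas as pd

# What get_model_info_df builds: one record per results/<model>/config.json.
model_records = [
    {"model": "naive", "T": "🟣", "Type": "🟣 statistical"},
    {"model": "tft", "T": "🔷", "Type": "🔷 deep-learning"},
]
model_info_df = pd.DataFrame.from_records(model_records)

# A pivoted results table; 'some_new_model' has scores but no config.json yet.
result_df = pd.DataFrame({
    "model": ["naive", "tft", "some_new_model"],
    "Average": [1.00, 0.81, 0.77],
})

# get_merged_df uses an inner join, so the unmatched row disappears.
merged_df = pd.merge(model_info_df, result_df, on="model", how="inner")
print(merged_df["model"].tolist())   # ['naive', 'tft']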
src/utils.py
CHANGED
@@ -24,4 +24,5 @@ def pivot_df(file_name, tab_name):
     # df_pivot.to_csv('pivoted_df.csv')
     # print(df_pivot)
     df_pivot = df_pivot.reset_index()
+    df_pivot = df_pivot.round(3)
     return df_pivot