File size: 4,765 Bytes
bc87bb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
064d8d7
bc87bb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import pandas as pd
from typing import List, Dict, Optional
import gradio as gr
from datasets import load_dataset
import numpy as np

class AuthorLeaderboard:
    """
    Load, rank and filter author leaderboard data for a Gradio Dataframe component.

    The raw table is loaded once in ``__init__`` (``df_raw``), turned into a
    display-ready table (``df_prettified``), and ``filter`` narrows that
    display table by an author search string.
    """

    # Class-level constants defining columns and their data types
    COLUMNS_ORDER: List[str] = [
        'Rank',
        'Author',
        'Total Artifacts',
        'Avg Artifacts per Paper',
        'Total Papers',
        'Total Models',
        'Total Datasets',
        'Total Spaces',
        'Upvotes',
        'Comments',
    ]

    # NOTE(review): Gradio documents 'str', 'number', 'bool', 'date', 'markdown'
    # and 'html' as Dataframe datatypes -- confirm 'int'/'float' behave as intended.
    DATATYPES: Dict[str, str] = {
        'Rank': 'str',
        'Author': 'markdown',
        'Total Artifacts': 'int',
        'Avg Artifacts per Paper': 'float',
        'Total Papers': 'int',
        'Total Models': 'int',
        'Total Datasets': 'int',
        'Total Spaces': 'int',
        'Upvotes': 'int',
        'Comments': 'int',
    }

    # Medal prefixes for the first three ranks
    EMOTICONS = {
        1: '🥇',
        2: '🥈',
        3: '🥉'
    }

    # Raw dataset column -> display column
    _RENAMED_COLUMNS: Dict[str, str] = {
        'name': 'Author',
        'num_papers': 'Total Papers',
        'num_models': 'Total Models',
        'num_datasets': 'Total Datasets',
        'num_spaces': 'Total Spaces',
        'upvotes': 'Upvotes',
        'num_comments': 'Comments',
    }

    # Base of every profile link; also stripped from the text used for searching
    _PROFILE_URL_PREFIX: str = "https://huggingface.co/"

    def __init__(self):
        """
        Initialize the AuthorLeaderboard class by loading and processing the dataset.
        """
        self.df_raw: pd.DataFrame = self.get_df()
        self.df_prettified: pd.DataFrame = self.prettify(self.df_raw)

    @staticmethod
    def get_df() -> pd.DataFrame:
        """
        Load and process the leaderboard dataset.

        Returns:
            pd.DataFrame: The dataset with the derived 'Total Artifacts' and
            'Avg Artifacts per Paper' columns added and the raw columns renamed
            to their display names.
        """
        # Load the dataset from the Hugging Face Hub
        dataset = load_dataset('IAMJB/paper-central-leaderboard', split='train')
        return AuthorLeaderboard._add_derived_columns(dataset.to_pandas())

    @staticmethod
    def _add_derived_columns(df: pd.DataFrame) -> pd.DataFrame:
        """
        Add the artifact metrics and rename raw columns to display names.

        Args:
            df (pd.DataFrame): Raw table with 'num_models', 'num_datasets',
                'num_spaces' and 'num_papers' columns.

        Returns:
            pd.DataFrame: A new DataFrame; the input is not modified.
        """
        df = df.copy()

        total_artifacts = df['num_models'] + df['num_datasets'] + df['num_spaces']
        df['Total Artifacts'] = total_artifacts

        # Mask out non-positive paper counts so the division cannot produce
        # inf/NaN; such rows (and rows with a missing count) report 0.0.
        papers = df['num_papers']
        average = total_artifacts / papers.where(papers > 0)
        df['Avg Artifacts per Paper'] = average.fillna(0.0).round(2)

        return df.rename(columns=AuthorLeaderboard._RENAMED_COLUMNS)

    def prettify(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Prettify the DataFrame by adding rankings, emoticons, and markdown links.

        Args:
            df (pd.DataFrame): The DataFrame to prettify (as returned by ``get_df``).

        Returns:
            pd.DataFrame: A new DataFrame restricted to ``COLUMNS_ORDER``, sorted
            by 'Total Artifacts' descending and indexed from 1.
        """
        # A stable sort keeps tied authors in their original dataset order, so
        # ranks are reproducible between runs.
        df = df.sort_values(by='Total Artifacts', ascending=False, kind='stable')
        df = df.reset_index(drop=True)
        df.index += 1  # Start ranks from 1

        # Rank label: medal prefix for the top 3, plain number otherwise
        df['Rank'] = [
            f"{self.EMOTICONS[rank]} {rank}" if rank in self.EMOTICONS else f"{rank}"
            for rank in df.index
        ]

        # Convert 'Author' to markdown with profile links if 'username' is available.
        # Built as a list (rather than DataFrame.apply) so an empty frame works too.
        df['Author'] = [self._create_author_link(row) for _, row in df.iterrows()]

        # Select columns to display
        return df[self.COLUMNS_ORDER]

    def _create_author_link(self, row: pd.Series) -> str:
        """
        Create a markdown link for the author's profile.

        Args:
            row (pd.Series): A row from the DataFrame; must contain 'Author' and
                may contain 'username'.

        Returns:
            str: ``[Author](profile url)`` when a non-empty username is present,
            otherwise the plain 'Author' value.
        """
        username = row.get('username')
        if pd.notna(username) and username:
            return f"[{row['Author']}]({self._PROFILE_URL_PREFIX}{username})"
        return row['Author']

    def _search_authors(self, author_search_input: Optional[str]) -> pd.DataFrame:
        """
        Return the rows of ``df_prettified`` whose author matches the search text.

        The match is a case-insensitive literal substring test, so characters
        such as ``*`` or ``(`` are safe to type. The constant profile URL prefix
        is removed from the searched text so that queries like "hugging" do not
        match every linked author.

        Args:
            author_search_input (Optional[str]): The search text; ``None``, empty
                or whitespace-only means "no filter".

        Returns:
            pd.DataFrame: A copy of the matching rows.
        """
        frame = self.df_prettified
        query = (author_search_input or '').strip().lower()
        if not query or frame.empty:
            return frame.copy()

        searchable = (
            frame['Author']
            .str.replace(self._PROFILE_URL_PREFIX, '', regex=False)
            .str.lower()
        )
        # na=False: rows without an author name simply never match
        matches = searchable.str.contains(query, regex=False, na=False)
        return frame[matches].copy()

    def filter(self, author_search_input: Optional[str] = None) -> dict:
        """
        Filter the DataFrame based on the author search input.

        Args:
            author_search_input (Optional[str]): The author name to search for.

        Returns:
            dict: The update produced by ``gr.update`` for the Gradio Dataframe
            component, carrying the filtered table and its column datatypes.
        """
        filtered_df: pd.DataFrame = self._search_authors(author_search_input)

        # Get the corresponding data types for the columns
        datatypes: List[str] = [self.DATATYPES.get(col, 'str') for col in filtered_df.columns]

        return gr.update(value=filtered_df, datatype=datatypes)