Spaces:

huggingface
/

paper-central

Running

paper-central / df /author_leaderboard_contrib.py

jbdel

leaderboards

064d8d7 2 months ago

4.84 kB

	import pandas as pd
	from typing import List, Dict, Optional
	import gradio as gr
	from datasets import load_dataset

	class AuthorLeaderboardContrib:
	    """
	    Manage and process author resource leaderboard data for display in a Gradio Dataframe component.

	    On construction the leaderboard dataset is loaded from the Hugging Face Hub
	    (``df_raw``) and turned into a ranked, display-ready table (``df_prettified``).
	    ``filter`` then narrows that table according to the UI inputs.
	    """

	    # Columns shown in the Dataframe component, in display order
	    COLUMNS_ORDER: List[str] = [
	        'Rank',
	        'Author',
	        'Entity Type',
	        'Total Artifacts',
	        'Total Papers',
	        'Total Models',
	        'Total Datasets',
	        'Total Spaces',
	        'Likes',
	        'Downloads',
	    ]

	    # Gradio datatype used to render each column
	    DATATYPES: Dict[str, str] = {
	        'Rank': 'str',
	        'Author': 'markdown',
	        'Entity Type': 'str',
	        'Total Artifacts': 'int',
	        'Total Papers': 'int',
	        'Total Models': 'int',
	        'Total Datasets': 'int',
	        'Total Spaces': 'int',
	        'Likes': 'int',
	        'Downloads': 'int',
	    }

	    # Medal shown next to the rank of the top three authors
	    EMOTICONS: Dict[int, str] = {
	        1: '🥇',
	        2: '🥈',
	        3: '🥉',
	    }

	    def __init__(self):
	        """
	        Initialize the AuthorLeaderboardContrib class by loading and processing the dataset.
	        """
	        self.df_raw: pd.DataFrame = self.get_df()
	        self.df_prettified: pd.DataFrame = self.prettify(self.df_raw)

	    @staticmethod
	    def get_df() -> pd.DataFrame:
	        """
	        Load the leaderboard dataset and normalise its column names.

	        Rows whose ``entity_type`` is ``'unknown'`` are dropped.

	        Returns:
	            pd.DataFrame: The leaderboard with display-friendly column names.
	        """
	        # Load the dataset from the Hugging Face Hub
	        dataset = load_dataset('IAMJB/paper-central-leaderboard-contrib', split='train')
	        df = dataset.to_pandas()

	        # Exclude entries with 'entity_type' == 'unknown'
	        known = df[df['entity_type'] != 'unknown']

	        # Rename on a fresh frame instead of mutating the filtered slice in place
	        return known.rename(columns={
	            'author': 'Author',
	            'entity_type': 'Entity Type',
	            'total_artifacts': 'Total Artifacts',
	            'total_papers': 'Total Papers',
	            'total_models': 'Total Models',
	            'total_datasets': 'Total Datasets',
	            'total_spaces': 'Total Spaces',
	            'likes': 'Likes',
	            'downloads': 'Downloads',
	        })

	    def prettify(self, df: pd.DataFrame) -> pd.DataFrame:
	        """
	        Prettify the DataFrame by adding rankings, emoticons, and markdown links.

	        The input frame is left untouched.

	        Args:
	            df (pd.DataFrame): The DataFrame to prettify (column names as produced by ``get_df``).

	        Returns:
	            pd.DataFrame: A new DataFrame restricted to ``COLUMNS_ORDER``, sorted by
	            'Total Artifacts' descending and indexed from 1.
	        """
	        # A stable sort keeps tied authors in their input order, so ranks are
	        # reproducible from one load to the next.
	        ranked = df.sort_values(by='Total Artifacts', ascending=False, kind='mergesort')

	        # Positions after sorting become the ranks, starting from 1
	        ranked = ranked.reset_index(drop=True)
	        ranked.index += 1
	        ranked['Rank'] = [self._format_rank(position) for position in ranked.index]

	        # Only the author name is needed for the link, so map over that column
	        # rather than materialising every row.
	        ranked['Author'] = ranked['Author'].map(self._author_markdown)

	        # Select columns to display
	        return ranked[self.COLUMNS_ORDER]

	    def _format_rank(self, rank: int) -> str:
	        """
	        Render a 1-based rank as text, prefixed with a medal when one is defined in EMOTICONS.
	        """
	        medal = self.EMOTICONS.get(rank)
	        return f"{medal} {rank}" if medal else f"{rank}"

	    @staticmethod
	    def _author_markdown(author) -> str:
	        """
	        Build the markdown link pointing at the author's Hugging Face profile.
	        """
	        profile_url = f"https://huggingface.co/{author}"
	        return f"[{author}]({profile_url})"

	    def _create_author_link(self, row: pd.Series) -> str:
	        """
	        Create a markdown link for the author's profile.

	        Args:
	            row (pd.Series): A row from the DataFrame; only its 'Author' entry is read.

	        Returns:
	            str: The markdown link for the author.
	        """
	        return self._author_markdown(row['Author'])

	    @staticmethod
	    def _author_names(links: pd.Series) -> pd.Series:
	        """
	        Recover the plain author names from the markdown links built by ``_author_markdown``.

	        Entries that do not look like such a link are returned unchanged.
	        """
	        names = links.str.extract(r'^\[(.*)\]\(https://huggingface\.co/', expand=False)
	        return names.fillna(links)

	    def _select_rows(
	        self,
	        author_search_input: Optional[str] = None,
	        entity_type_filter: Optional[str] = 'All'
	    ) -> pd.DataFrame:
	        """
	        Return a copy of the rows of ``df_prettified`` matching the search text and entity type.
	        """
	        rows = self.df_prettified

	        needle = (author_search_input or '').strip().lower()
	        if needle:
	            # Match against the author name only: the rendered markdown also
	            # contains the profile URL, which would make e.g. "huggingface" hit every row.
	            names = self._author_names(rows['Author'])
	            # regex=False: the input is user-typed text, not a pattern, so
	            # characters such as '(' or '.' must be taken literally.
	            rows = rows[names.str.lower().str.contains(needle, regex=False)]

	        # None is treated like 'All' (no entity-type restriction)
	        if entity_type_filter is not None and entity_type_filter != 'All':
	            rows = rows[rows['Entity Type'] == entity_type_filter]

	        return rows.copy()

	    def filter(
	        self,
	        author_search_input: Optional[str] = None,
	        entity_type_filter: Optional[str] = 'All'
	    ) -> Dict[str, object]:
	        """
	        Filter the DataFrame based on the author search input and entity type.

	        Args:
	            author_search_input (Optional[str]): Case-insensitive text to look for in the
	                author name. Surrounding whitespace is ignored; empty or None disables the search.
	            entity_type_filter (Optional[str]): The entity type to filter by ('All', 'user', 'org').
	                'All' or None keeps every entity type.

	        Returns:
	            The update payload produced by ``gr.update`` for the Gradio Dataframe component,
	            carrying the filtered table and the datatype of each of its columns.
	        """
	        filtered_df: pd.DataFrame = self._select_rows(author_search_input, entity_type_filter)

	        # Get the corresponding data types for the columns
	        datatypes: List[str] = [self.DATATYPES.get(col, 'str') for col in filtered_df.columns]

	        return gr.update(value=filtered_df, datatype=datatypes)