Spaces:
Sleeping
Sleeping
File size: 3,185 Bytes
90386c8 f6bbf83 5c5f52e 90386c8 5c5f52e 90386c8 5c5f52e 90386c8 ed37630 e222298 90386c8 eb1a198 90386c8 e222298 5c5f52e f6bbf83 90386c8 e667ed5 6ad2cd0 f6bbf83 4dde354 90386c8 43cdccf f8eede2 43cdccf f6bbf83 90386c8 5c5f52e 90386c8 e222298 90386c8 e222298 90386c8 e222298 90386c8 e222298 90386c8 e222298 90386c8 6ad2cd0 90386c8 e1b1d1b 90386c8 5c5f52e 90386c8 5c5f52e 90386c8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
import os
import gradio as gr
import pandas as pd
import time
import threading
from huggingface_hub import HfApi
from humanize import naturalsize
# Hub API client, used later to restart this Space on a timer.
api = HfApi()
# Write token read from the environment; may be None if the secret is unset.
HF_TOKEN = os.getenv('HF_TOKEN')
def clickable(x):
    """Render *x* as an HTML anchor pointing at its Hugging Face profile page."""
    url = f"https://huggingface.co/{x}"
    style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="{url}" style="{style}">{x}</a>'
def apply_headers(df, headers):
    """Return a copy of *df* relabeled with *headers* as its columns.

    The input frame is left untouched.
    """
    relabeled = df.copy()
    relabeled.columns = headers
    return relabeled
def search(search_text):
    """Filter the module-level leaderboard *df* by author.

    Case-insensitive substring match against the '👤 Author' column;
    an empty query returns the full table unchanged.
    """
    if search_text:
        mask = df['👤 Author'].str.contains(search_text, case=False, na=False)
        return df[mask]
    return df
def restart_space():
    """Sleep 36000 s (10 h), then restart this Space via the Hub API.

    Run in a background thread (see the threading.Thread call below the UI)
    so the leaderboard data is periodically rebuilt from scratch.
    """
    time.sleep(36000)
    api.restart_space(repo_id="Weyaxi/data-leaderboard", token=HF_TOKEN)
# ---- Build the leaderboard table (statement order matters: `df` is rebound). ----
# Raw per-author byte counts scraped into the repo as a CSV.
df = pd.read_csv("author_data_hf_merged.csv")
# Untouched copy of the raw data (kept for reference; not used below).
df_author_copy = df.copy()
# Turn plain author names into clickable profile links.
df["author"] = df["author"].apply(lambda x: clickable(x))
# Per-author total across the three repo categories (still raw bytes here).
df['Total Usage'] = df[['models', 'datasets', 'spaces']].sum(axis=1)
df = df.sort_values(by='Total Usage', ascending=False)
# Grand total over every author/category, formatted like "1.2 TB" for the description.
sum_all_author = naturalsize(sum(df['models'].tolist()+df['datasets'].tolist()+df['spaces'].tolist()))
naturalsize_columns = ['Total Usage', 'models', 'datasets', 'spaces']
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favor of
# DataFrame.map — consider switching once the pinned pandas version allows it.
df[naturalsize_columns] = df[naturalsize_columns].applymap(naturalsize)
# 1-based rank column, assigned after sorting.
df['Serial Number'] = [i for i in range(1, len(df)+1)]
# Final column order, then display headers with emoji.
df = df[['Serial Number', "author", "Total Usage", "models", "datasets", "spaces"]]
df = apply_headers(df, ["🔢 Serial Number", "👤 Author", "⚡️ Total Usage", "🏛️ Models", "📊 Datasets", "🚀 Spaces"])
# Markdown description rendered at the top of the app; interpolates the
# grand total (sum_all_author) computed above.
desc = f"""
🎯 The Leaderboard aims to track authors data usage in 🤗 Huggingface.
## 📄 Information
🛠️ This leaderboard consists of 125k authors scraped from [🤗 Huggingface Leaderboard](https://huggingface.co/spaces/Weyaxi/huggingface-leaderboard).
These 125k authors have been selected based on their [🤗 Huggingface Leaderboard](https://huggingface.co/spaces/Weyaxi/huggingface-leaderboard) positions:
- 🤖 Top 60k authors in the models category
- 📊 Top 60k authors in the datasets category
- 🚀 Top 50k authors in the spaces category
## 📒 Notes
Note that these numbers may not be entirely accurate due to the following reasons:
- I only calculated the data usage from the main branch and did not include deleted files that cannot be directly seen.
- There may be large datasets/models to which I don't have access (either private or gated).
# 📶 Total Data Usage From All Authors
According to this leaderboard, there is a total of {sum_all_author} of data on this platform.
"""
# TODO: add a note about data-collection date/frequency?
# NOTE(review): `title` is currently unused — the page header is re-declared
# inline in the gr.Blocks section below. Kept for reference.
title = """
<div style="text-align:center">
<h1 id="space-title">💾 Data Leaderboard 💾</h1>
</div>
"""
# ---- UI: header, description, search box, and the leaderboard table. ----
with gr.Blocks() as demo:
    gr.Markdown("""<h1 align="center" id="space-title">💾 Data Leaderboard 💾</h1>""")
    gr.Markdown(desc)
    with gr.Column(min_width=320):
        # Free-text author filter; submitting re-runs `search` on the table below.
        # (Fixed typo: "a author" -> "an author".)
        search_bar = gr.Textbox(placeholder="🔍 Search for an author", show_label=False)
    # Read-only table; the Author column is rendered as markdown so the
    # clickable() links work. Remaining columns fall back to the default type.
    gr_followers = gr.Dataframe(df, interactive=False, datatype=["number", 'markdown', 'number'])
    search_bar.submit(fn=search, inputs=search_bar, outputs=gr_followers)

# Kick off the 10-hour restart timer in the background, then serve the app.
# (Removed a stray trailing "|" artifact that made this line a syntax error.)
threading.Thread(target=restart_space).start()
demo.launch()