File size: 3,185 Bytes
90386c8
 
 
 
 
 
f6bbf83
5c5f52e
90386c8
 
 
 
5c5f52e
90386c8
 
 
5c5f52e
90386c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed37630
e222298
90386c8
 
eb1a198
90386c8
 
 
e222298
5c5f52e
f6bbf83
90386c8
e667ed5
6ad2cd0
f6bbf83
4dde354
90386c8
43cdccf
f8eede2
43cdccf
f6bbf83
90386c8
 
5c5f52e
90386c8
 
 
e222298
90386c8
e222298
90386c8
e222298
90386c8
e222298
90386c8
e222298
90386c8
6ad2cd0
 
 
 
 
 
 
 
 
 
 
90386c8
e1b1d1b
 
90386c8
 
 
5c5f52e
90386c8
 
 
 
5c5f52e
90386c8
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import os
import gradio as gr
import pandas as pd
import time
import threading
from huggingface_hub import HfApi
from humanize import naturalsize

# Hugging Face Hub API client; used by restart_space() below to restart this Space.
api = HfApi()

# Access token read from the environment, passed to api.restart_space().
# NOTE(review): may be None if the env var is unset — confirm the Space
# secrets include HF_TOKEN, otherwise the restart call will fail.
HF_TOKEN = os.getenv('HF_TOKEN')


def clickable(x):
    """Return *x* wrapped in an HTML anchor linking to its Hub profile page.

    The anchor opens in a new tab and uses a dotted-underline link style.
    """
    url = f"https://huggingface.co/{x}"
    style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="{url}" style="{style}">{x}</a>'


def apply_headers(df, headers):
    """Return a copy of *df* with its columns relabeled to *headers*.

    The input frame is left untouched.
    """
    return df.set_axis(headers, axis=1)


def search(search_text):
    """Filter the global leaderboard frame by (case-insensitive) author match.

    An empty/falsy query returns the full, unfiltered leaderboard.
    """
    if not search_text:
        return df

    mask = df['👤 Author'].str.contains(search_text, case=False, na=False)
    return df[mask]


def restart_space():
    """Sleep ten hours, then restart this Space via the Hub API.

    Run in a background thread (see module bottom) so the leaderboard
    periodically reloads fresh data.
    """
    time.sleep(36000)  # 10 hours
    api.restart_space(repo_id="Weyaxi/data-leaderboard", token=HF_TOKEN)


# ---- Build the leaderboard dataframe --------------------------------------
# Expected CSV columns: author, models, datasets, spaces (byte counts).
# NOTE(review): schema inferred from the usage below — confirm against the CSV.
df = pd.read_csv("author_data_hf_merged.csv")

# Untouched copy of the raw data (kept for parity with the module namespace;
# not referenced elsewhere in this file).
df_author_copy = df.copy()

# Render author names as links to their Hub profiles (no lambda needed —
# pass the function directly).
df["author"] = df["author"].apply(clickable)

# Total bytes per author across the three categories; sort biggest first.
df['Total Usage'] = df[['models', 'datasets', 'spaces']].sum(axis=1)
df = df.sort_values(by='Total Usage', ascending=False)

# Grand total across all authors, humanized (e.g. "1.2 TB"). Equals the sum
# of the per-author totals; must be computed before the byte columns are
# converted to display strings below.
sum_all_author = naturalsize(df['Total Usage'].sum())

# Convert raw byte counts to human-readable strings for display.
# NOTE: DataFrame.applymap is deprecated since pandas 2.1 in favor of
# DataFrame.map; kept for compatibility with older pandas versions.
naturalsize_columns = ['Total Usage', 'models', 'datasets', 'spaces']
df[naturalsize_columns] = df[naturalsize_columns].applymap(naturalsize)

# 1-based rank column; a range assigns positionally, no list comprehension needed.
df['Serial Number'] = range(1, len(df) + 1)
df = df[['Serial Number', "author", "Total Usage", "models", "datasets", "spaces"]]

df = apply_headers(df, ["🔢 Serial Number", "👤 Author", "⚡️ Total Usage", "🏛️ Models", "📊 Datasets", "🚀 Spaces"])

# Markdown description shown at the top of the Space; embeds the humanized
# grand total (sum_all_author) computed above.
desc = f"""
🎯 The Leaderboard aims to track authors data usage in 🤗 Huggingface.

## 📄 Information

🛠️ This leaderboard consists of 125k authors scraped from [🤗 Huggingface Leaderboard](https://huggingface.co/spaces/Weyaxi/huggingface-leaderboard).

These 125k authors have been selected based on their [🤗 Huggingface Leaderboard](https://huggingface.co/spaces/Weyaxi/huggingface-leaderboard) positions:

- 🤖 Top 60k authors in the models category

- 📊 Top 60k authors in the datasets category

- 🚀 Top 50k authors in the spaces category

## 📒 Notes

Note that these numbers may not be entirely accurate due to the following reasons:

- I only calculated the data usage from the main branch and did not include deleted files that cannot be directly seen.

- There may be large datasets/models to which I don't have access (either private or gated).

# 📶 Total Data Usage From All Authors

According to this leaderboard, there is a total of {sum_all_author} of data on this platform.
"""


# Pre-rendered page-title HTML.
# NOTE(review): `title` is never used — the Blocks layout below hardcodes an
# equivalent heading. Confirm whether this is dead code that can be removed.
title = """
<div style="text-align:center">
  <h1 id="space-title">💾 Data Leaderboard 💾</h1>
</div>
"""

# ---- UI layout and app entry point ----------------------------------------
with gr.Blocks() as demo:
    # Page heading (duplicates the unused module-level `title` string).
    gr.Markdown("""<h1 align="center" id="space-title">💾 Data Leaderboard 💾</h1>""")
    gr.Markdown(desc)
    with gr.Column(min_width=320):
        # Fix: "a author" -> "an author" (user-facing grammar).
        search_bar = gr.Textbox(placeholder="🔍 Search for an author", show_label=False)

    # Leaderboard table; the 'markdown' datatype on the author column renders
    # the profile links produced by clickable().
    gr_followers = gr.Dataframe(df, interactive=False, datatype=["number", 'markdown', 'number'])

    # Pressing Enter in the search box re-filters the table by author.
    search_bar.submit(fn=search, inputs=search_bar, outputs=gr_followers)


# Schedule a self-restart roughly 10 hours from launch in a background thread
# so the leaderboard data is refreshed periodically.
threading.Thread(target=restart_space).start()
demo.launch()