File size: 3,100 Bytes
ebd6079
442b62f
 
ca7532f
ebd6079
442b62f
 
5fe7079
442b62f
5fe7079
 
 
 
 
 
 
442b62f
5fe7079
 
 
 
 
 
442b62f
5fe7079
ca7532f
 
96a127d
 
5d12e20
 
 
897f09c
 
 
 
 
 
 
 
 
 
 
5d12e20
 
 
5fe7079
 
 
 
 
 
 
 
 
a1d5d83
 
 
 
442b62f
 
 
 
 
a1d5d83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442b62f
 
 
 
 
 
 
7e1d8eb
a6efc5f
 
7e1d8eb
442b62f
 
 
 
 
 
 
 
 
8b6987d
 
442b62f
5fe7079
442b62f
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import streamlit as st
import numpy as np
import pandas as pd
from datasets import load_dataset

st.set_page_config(layout="wide")

# Top row: page title on the left, data-source picker on the right (2:3 split).
title_col, source_col = st.columns([2, 3])

# Dataset pages the leaderboard can be built from; the first one is the default.
sources = [
    "https://huggingface.co/datasets/cfahlgren1/hub-stats",
    "https://huggingface.co/datasets/maxiw/hf-posts",
]

title_col.header("HuggingFace 🤗 Posts leaderboard")

selected_source = source_col.selectbox("Data Source:", options=sources, index=0)

# Load the posts table from the selected source. Both branches leave `df` with
# the "Name" (display name) and "username" (account handle) columns that the
# leaderboard groups by.
if selected_source == sources[0]:
    try:
        df = pd.read_parquet("hf://datasets/cfahlgren1/hub-stats/posts.parquet")
        # Bracket access instead of df.fullname / df.name: attribute-style
        # lookup silently resolves to a DataFrame attribute if one of the
        # same name exists.
        df["Name"] = df["fullname"]
        df["username"] = df["name"]
    except Exception as exp:
        # Deliberately broad: any failure of the primary source is reported
        # to the user and the app falls back to the second source.
        st.error(f'''
        ERROR>> in loading {selected_source}
        
        >> {exp}''', icon="🚨")
        selected_source = sources[1]
        st.info(f'''
        This can be solved by "Space Restart"
        
        Switching Sources for now...
        
        New Source: {selected_source}''', icon="ℹ️")

# A separate `if` (not `elif`) so the fallback assigned above is picked up.
if selected_source == sources[1]:
    df = pd.read_json("hf://datasets/maxiw/hf-posts/posts.jsonl", lines=True)

    # Here each "author" value is indexed by key to get the two name fields.
    df["Name"] = df["author"].apply(lambda author: author["fullname"])
    df["username"] = df["author"].apply(lambda author: author["name"])

# The date slider and filter below need real datetimes whichever source was
# loaded; converting an already-datetime column is a no-op.
df["publishedAt"] = pd.to_datetime(df["publishedAt"])

# Columns that are summed per author and offered as sort keys.
metrics = ["totalUniqueImpressions", "totalReactions", "numComments", "Num of posts"]

# The earliest and latest publication timestamps bound the date slider.
published = df["publishedAt"]
min_date = published.min().to_pydatetime()
max_date = published.max().to_pydatetime()

# Second row: wide column for the date slider, narrow one for the sort key.
range_col, sort_col = st.columns([3, 1])

# The slider starts with the full available range selected.
date_range = range_col.slider(
    "Select Date Range",
    min_value=min_date,
    max_value=max_date,
    value=(min_date, max_date),
    format="DD/MMM/YYYY",
)

selected_metric = sort_col.selectbox("Sort by:", options=metrics, index=0)


# Filter the DataFrame based on the selected date range. The explicit copy
# gives the column assignments below their own frame to write into; assigning
# into the bare boolean-mask selection triggers pandas' SettingWithCopyWarning
# whenever the mask actually drops rows.
mask = df["publishedAt"].between(*date_range)
df = df[mask].copy()

# Each "reactions" cell is iterated and the "count" field of its entries is
# summed into a single per-post total.
df["totalReactions"] = df["reactions"].apply(
    lambda reactions: sum(reaction["count"] for reaction in reactions)
)
# A constant 1 per row, so the per-author sum below is the number of posts.
df["Num of posts"] = 1

# Ensure metrics columns are integers, handling NaN values
df[metrics] = df[metrics].fillna(0).astype(int)

# One row per author with every metric summed, highest selected metric first.
data = (
    df.groupby(["username", "Name"])[metrics]
    .sum()
    .sort_values(selected_metric, ascending=False)
    .reset_index()
)
# A 1-based index doubles as the rank shown in the rendered table.
data.index = np.arange(1, len(data) + 1)
data.index.name = "Rank"

# Format metrics columns with commas (thousands separators) for display.
# Series.map is applied per column because DataFrame.applymap is deprecated
# since pandas 2.1, while its replacement DataFrame.map does not exist in
# older releases; this form works on both.
for metric in metrics:
    data[metric] = data[metric].map("{:,}".format)


def make_clickable(val):
    """Return an HTML link to the Hugging Face profile page of ``val``.

    The result is meant to be embedded in raw HTML, so the value is
    percent-encoded inside the URL and HTML-escaped in the link text. Plain
    usernames (letters, digits, ``-``, ``_``, ``.``) come out unchanged.

    Args:
        val: Username to link to; converted with ``str`` before formatting.

    Returns:
        An ``<a>`` element that opens ``https://huggingface.co/<val>`` in a
        new tab and displays the username as its text.
    """
    # Imported locally so the helper does not depend on the file's import block.
    from html import escape
    from urllib.parse import quote

    username = str(val)
    # safe="" also encodes "/", so the value cannot add extra path segments.
    href = "https://huggingface.co/" + quote(username, safe="")
    return f'<a target="_blank" href="{href}">{escape(username)}</a>'


# Usernames become profile links. Display names are free text supplied by
# users, so they are HTML-escaped before the table is injected as raw HTML.
df_styled = data.style.format({"username": make_clickable}).format(
    escape="html", subset=["Name"]
)
# Styler.to_html() has no escape/index parameters (extra keywords are only
# forwarded to its template), so none are passed; the "Rank" index is rendered
# as the first column. The <center> wrapper is closed explicitly.
st.write(
    f"""<center>{df_styled.to_html()}</center>""",
    unsafe_allow_html=True,
)