File size: 3,100 Bytes
ebd6079 442b62f ca7532f ebd6079 442b62f 5fe7079 442b62f 5fe7079 442b62f 5fe7079 442b62f 5fe7079 ca7532f 96a127d 5d12e20 897f09c 5d12e20 5fe7079 a1d5d83 442b62f a1d5d83 442b62f 7e1d8eb a6efc5f 7e1d8eb 442b62f 8b6987d 442b62f 5fe7079 442b62f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
import streamlit as st
import numpy as np
import pandas as pd
from datasets import load_dataset
# Datasets the leaderboard can be built from; the first entry is the
# default selection in the picker below.
sources = [
    "https://huggingface.co/datasets/cfahlgren1/hub-stats",
    "https://huggingface.co/datasets/maxiw/hf-posts",
]

st.set_page_config(layout="wide")

# Top row: page title on the left (2 parts), data-source picker on the
# right (3 parts).
col1, col2 = st.columns([2, 3])
col1.header("HuggingFace 🤗 Posts leaderboard")
selected_source = col2.selectbox("Data Source:", options=sources, index=0)
# Load the posts table from the selected dataset and normalise it so that
# both sources expose the same derived columns: "Name", "username" and a
# datetime-typed "publishedAt".
if selected_source == sources[0]:
    try:
        df = pd.read_parquet("hf://datasets/cfahlgren1/hub-stats/posts.parquet")
        # Bracket access: attribute access such as `df.name` can collide with
        # DataFrame attributes.
        df["Name"] = df["fullname"]
        df["username"] = df["name"]
        # The date slider further down calls `.to_pydatetime()` on this
        # column, so make sure it holds timestamps (a no-op if it already does).
        df["publishedAt"] = pd.to_datetime(df["publishedAt"])
    except Exception as exp:
        # Top-level boundary: report the failure to the user and fall back to
        # the second source instead of aborting the app.
        st.error(f'''
ERROR>> in loading {selected_source}
>> {exp}''', icon="🚨")
        selected_source = sources[1]
        st.info(f'''
This can be solved by "Space Restart"
Switching Sources for now...
New Source: {selected_source}''', icon="ℹ️")

# Deliberately a separate `if` (not `elif`) so the fallback assigned in the
# except-branch above is picked up in the same run.
if selected_source == sources[1]:
    df = pd.read_json("hf://datasets/maxiw/hf-posts/posts.jsonl", lines=True)
    df["publishedAt"] = pd.to_datetime(df["publishedAt"])
    # In this source the author is a nested record holding the display name
    # and the account name.
    df["Name"] = df["author"].apply(lambda author: author["fullname"])
    df["username"] = df["author"].apply(lambda author: author["name"])
# Columns the leaderboard aggregates per user; also the choices offered in
# the "Sort by" picker.
metrics = [
    "totalUniqueImpressions",
    "totalReactions",
    "numComments",
    "Num of posts",
]

# The slider spans the earliest to the latest publication timestamp and
# needs plain `datetime` objects for its bounds.
published_at = df["publishedAt"]
min_date = published_at.min().to_pydatetime()
max_date = published_at.max().to_pydatetime()

# Control row: wide date-range slider (3 parts) next to a narrow sort
# picker (1 part).
col1, col2 = st.columns([3, 1])
date_range = col1.slider(
    "Select Date Range",
    min_value=min_date,
    max_value=max_date,
    value=(min_date, max_date),
    format="DD/MMM/YYYY",
)
selected_metric = col2.selectbox("Sort by:", options=metrics, index=0)
# Keep only the posts published inside the selected date range. Take an
# explicit copy so the column assignments below write to an independent
# frame rather than a slice of the original (avoids SettingWithCopyWarning).
mask = df["publishedAt"].between(*date_range)
df = df.loc[mask].copy()


def _count_reactions(reactions):
    """Return the summed "count" of a post's reaction entries (0 if missing)."""
    # Missing cells surface as None or float NaN; neither is iterable.
    if reactions is None or isinstance(reactions, float):
        return 0
    return sum(reaction["count"] for reaction in reactions)


df["totalReactions"] = df["reactions"].apply(_count_reactions)
# Each row is one post, so summing this column per user yields the post count.
df["Num of posts"] = 1

# Ensure metric columns are integers, treating missing values as 0.
df[metrics] = df[metrics].fillna(0).astype(int)

# One row per user, totals for every metric, best value of the chosen
# metric first.
data = (
    df.groupby(["username", "Name"])[metrics]
    .sum()
    .sort_values(selected_metric, ascending=False)
    .reset_index()
)

# 1-based rank that follows the sort order.
data.index = np.arange(1, len(data) + 1)
data.index.name = "Rank"

# Render numbers with thousands separators. Done after sorting because this
# turns the metric columns into strings. A per-column Series.map is used
# instead of DataFrame.applymap, which is deprecated in recent pandas.
data[metrics] = data[metrics].apply(
    lambda column: column.map(lambda value: f"{value:,}")
)
def make_clickable(val):
    """Return an HTML anchor linking to the HuggingFace profile of *val*.

    The result is injected into the page as raw HTML, so the value is
    URL-quoted for the ``href`` and HTML-escaped for the link text. For
    ordinary usernames (letters, digits, ``-``, ``_``, ``.``) the output is
    unchanged by this quoting/escaping.

    Args:
        val: The username; converted with ``str()`` before use.

    Returns:
        A string of the form
        ``<a target="_blank" href="https://huggingface.co/NAME">NAME</a>``.
    """
    from html import escape
    from urllib.parse import quote

    text = str(val)
    # safe="" so that even "/" is percent-encoded and cannot alter the path.
    href = quote(text, safe="")
    label = escape(text)
    return f'<a target="_blank" href="https://huggingface.co/{href}">{label}</a>'
# Build the styled table: usernames become profile links, and the
# user-supplied display name is HTML-escaped because the table is written to
# the page as raw HTML below.
df_styled = (
    data.style
    .format({"username": make_clickable})
    .format(escape="html", subset=["Name"])
)

# NOTE(review): `escape` and `index` are not documented parameters of
# Styler.to_html; they appear to be forwarded to the template as extra
# keyword arguments and to have no effect (the "Rank" index is still
# rendered) — confirm before relying on them.
st.write(
    f"""<center>{df_styled.to_html(escape=False, index=False)}</center>""",
    unsafe_allow_html=True,
)
|