# Hugging Face Space: jsulz/spaces-ship (status: Sleeping; file size: 13,911 bytes)

import gradio as gr
import pandas as pd
import numpy as np
import plotly.express as px
# The spaces.parquet file is loaded and pre-cleaned inside the Blocks layout below.


"""
Todos:
    Clean up existing filtering code
"""


def _matches_any(cell, selected):
    """Return True when *cell* contains at least one of the *selected* values.

    *cell* is a single entry of a list-valued column (an array/list of
    strings, or ``None`` when the Space declares nothing for that field).
    A ``None`` cell never matches.
    """
    if cell is None:
        return False
    return any(value in cell for value in selected)


def filtered_df(emoji, likes, author, hardware, tags, models, datasets, space_licenses):
    """Filter the module-level ``df`` by the current widget values.

    Every argument is optional: a falsy value (``None``, ``0``, empty list)
    leaves the corresponding filter switched off.

    Args:
        emoji: selected emoji; rows are kept when ``df["emoji"]`` is one of them.
        likes: minimum number of likes (inclusive).
        author: selected authors, matched against ``df["author"]``.
        hardware: selected hardware names, matched against ``df["hardware"]``.
        tags: selected SDK/tag names; a row matches when its ``sdk_tags``
            cell contains any of them.
        models: selected model ids, matched against the ``models`` cells.
        datasets: selected dataset ids, matched against the ``datasets`` cells.
        space_licenses: selected licenses, matched against the ``licenses`` cells.

    Returns:
        A DataFrame with the columns ``URL``, ``Likes``, ``Models``,
        ``Datasets`` and ``Licenses`` for the rows passing every active filter.
    """
    _df = df

    # Scalar columns: keep rows whose value is one of the selected options.
    scalar_filters = (("emoji", emoji), ("author", author), ("hardware", hardware))
    for column, selected in scalar_filters:
        if selected:
            _df = _df[_df[column].isin(selected)]

    if likes:
        _df = _df[_df["likes"] >= likes]

    # List-valued columns: keep rows whose cell shares a value with the selection.
    list_filters = (
        ("sdk_tags", tags),
        ("models", models),
        ("datasets", datasets),
        ("licenses", space_licenses),
    )
    for column, selected in list_filters:
        if selected:
            # Force a boolean dtype: when earlier filters already removed every
            # row, ``apply`` returns an empty object-dtype Series, which pandas
            # can interpret as a (empty) column selection rather than a row
            # mask, dropping all columns and breaking the lookups below.
            mask = _df[column].apply(_matches_any, args=(selected,)).astype(bool)
            _df = _df[mask]

    # Rename the columns to make them more readable in the rendered table.
    _df = _df.rename(
        columns={
            "url": "URL",
            "likes": "Likes",
            "r_models": "Models",
            "r_datasets": "Datasets",
            "r_licenses": "Licenses",
        }
    )

    return _df[["URL", "Likes", "Models", "Datasets", "Licenses"]]


def _count_ids_and_owners(column):
    """Tally the ids in a list-valued column, and the owner prefix of each id.

    Every non-``None`` cell of *column* is concatenated into one flat
    sequence of id strings.  The "owner" of an id is the text before its
    first ``/`` (the whole id when it contains no slash).

    Returns:
        ``(id_counts, owner_counts)`` -- two dicts keyed in first-seen order.
        A column with no usable cells yields two empty dicts instead of
        failing inside ``np.concatenate``.
    """
    arrays = [cell for cell in column.values if cell is not None]
    id_counts, owner_counts = {}, {}
    if not arrays:
        return id_counts, owner_counts
    for repo_id in np.concatenate(arrays):
        owner = repo_id.split('/')[0]
        id_counts[repo_id] = id_counts.get(repo_id, 0) + 1
        owner_counts[owner] = owner_counts.get(owner, 0) + 1
    return id_counts, owner_counts


def _top_counts_bar(counts, label, title):
    """Build a dark-themed bar chart of the 20 largest entries of *counts*.

    *counts* maps a name to a number of Spaces; *label* names the x-axis
    column and *title* is the chart title.
    """
    frame = pd.DataFrame(list(counts.items()), columns=[label, 'Number of Spaces'])
    top = frame.sort_values('Number of Spaces', ascending=False).head(20)
    return px.bar(
        top,
        x=label,
        y='Number of Spaces',
        title=title,
        labels={label: label, 'Number of Spaces': 'Number of Spaces'},
        template='plotly_dark',
    )


def _space_link(target, emoji):
    """Return the HTML anchor shown in the URL column for one Space.

    *target* is either the Space id (a string containing ``/``), which links
    to huggingface.co, or the custom-domains value, whose first element is
    used as the host -- assumed to be a sequence of domain strings; TODO
    confirm against the parquet schema.
    """
    # The label is assembled without nesting double quotes inside the
    # f-string, which is a syntax error on Python versions before 3.12.
    if target is not None and "/" in target:
        return f"<a target='_blank' href=https://huggingface.co/spaces/{target}>{emoji} {target}</a>"
    domain = target[0]
    return f"<a target='_blank' href=https://{domain}>{emoji} {domain}</a>"


def _join_cells(column):
    """Render each list-valued cell as a comma-separated string ('' for None)."""
    return [', '.join(cell) if cell is not None else '' for cell in column]


def _unique_entries(column):
    """Return the sorted unique strings found across a list-valued column.

    Scalar cells (such as ``None``) are replaced by ``["None"]`` so that
    ``np.concatenate`` does not fail on them.
    """
    as_arrays = column.apply(
        lambda cell: np.array(["None"]) if np.ndim(cell) == 0 else cell
    )
    return np.unique(np.concatenate(as_arrays.values)).tolist()


# Read the dataset once; each tab derives its own frame from it.
spaces = pd.read_parquet("spaces.parquet")

with gr.Blocks(fill_width=True) as demo:
    with gr.Tab(label="Spaces Overview"):
        # Growth of Spaces over time: after sorting by creation time, the rank
        # of each row is the cumulative number of Spaces created up to it.
        overview = spaces.sort_values("created_at")
        overview['cumulative_spaces'] = overview['created_at'].rank(method='first').astype(int)
        fig1 = px.line(
            overview,
            x='created_at',
            y='cumulative_spaces',
            title='Growth of Spaces Over Time',
            labels={'created_at': 'Date', 'cumulative_spaces': 'Number of Spaces'},
            template='plotly_dark',
        )
        gr.Plot(fig1)

        # Pie chart showing the distribution of Spaces by SDK.
        fig2 = px.pie(overview, names='sdk', title='Distribution of Spaces by SDK', template='plotly_dark')
        gr.Plot(fig2)

        # Pie chart of the 10 most used emojis.  The column names are set
        # explicitly because value_counts().reset_index() names them
        # differently across pandas major versions.
        emoji_counts = overview['emoji'].value_counts().head(10).reset_index()
        emoji_counts.columns = ['emoji', 'count']
        fig3 = px.pie(emoji_counts, names='emoji', values='count', title='Distribution of Spaces by Emoji', template='plotly_dark')
        gr.Plot(fig3)

        # Table of the top 20 authors and the number of Spaces they created.
        author_counts = overview['author'].value_counts().head(20).reset_index()
        author_counts.columns = ['Author', 'Number of Spaces']
        gr.DataFrame(author_counts)

        # Scatter plot: total likes vs. number of Spaces, one point per author.
        author_likes = overview.groupby('author').agg({'likes': 'sum', 'id': 'count'}).reset_index()
        fig4 = px.scatter(
            author_likes,
            x='id',
            y='likes',
            title='Relationship between Number of Spaces Created and Number of Likes',
            labels={'id': 'Number of Spaces Created', 'likes': 'Number of Likes'},
            hover_data={'author': True},
            template='plotly_dark',
        )
        gr.Plot(fig4)

        # Same scatter plot, one point per emoji, for the 20 most liked emojis.
        # NOTE(review): the title is identical to the per-author chart above;
        # consider making it emoji-specific.
        emoji_likes = (
            overview.groupby('emoji')
            .agg({'likes': 'sum', 'id': 'count'})
            .sort_values(by='likes', ascending=False)
            .head(20)
            .reset_index()
        )
        fig10 = px.scatter(
            emoji_likes,
            x='id',
            y='likes',
            title='Relationship between Number of Spaces Created and Number of Likes',
            labels={'id': 'Number of Spaces Created', 'likes': 'Number of Likes'},
            hover_data={'emoji': True},
            template='plotly_dark',
        )
        gr.Plot(fig10)

        # Bar chart of hardware in use (log-scaled y-axis).
        hardware_counts = overview['hardware'].value_counts().reset_index()
        hardware_counts.columns = ['Hardware', 'Number of Spaces']
        fig5 = px.bar(
            hardware_counts,
            x='Hardware',
            y='Number of Spaces',
            title='Hardware in Use',
            labels={'Hardware': 'Hardware', 'Number of Spaces': 'Number of Spaces (log scale)'},
            color='Hardware',
            template='plotly_dark',
        )
        fig5.update_layout(yaxis_type='log')
        gr.Plot(fig5)

        # Most referenced model authors, then most referenced models.
        model_count, model_author_count = _count_ids_and_owners(overview['models'])
        fig8 = _top_counts_bar(model_author_count, 'Model Author', 'Most Popular Model Authors')
        gr.Plot(fig8)
        fig6 = _top_counts_bar(model_count, 'Model', 'Most Used Models')
        gr.Plot(fig6)

        # Most referenced dataset authors, then most referenced datasets.
        dataset_count, dataset_author_count = _count_ids_and_owners(overview['datasets'])
        fig9 = _top_counts_bar(dataset_author_count, 'Dataset Author', 'Most Popular Dataset Authors')
        gr.Plot(fig9)
        fig7 = _top_counts_bar(dataset_count, 'Datasets', 'Most Used Datasets')
        gr.Plot(fig7)

        # The 20 most duplicated Spaces.
        duplicated_spaces = overview['duplicated_from'].value_counts().head(20).reset_index()
        duplicated_spaces.columns = ['Space', 'Number of Duplicates']
        gr.DataFrame(duplicated_spaces)

        # The 20 most liked Spaces.
        liked_spaces = overview[['id', 'likes']].sort_values(by='likes', ascending=False).head(20)
        liked_spaces.columns = ['Space', 'Number of Likes']
        gr.DataFrame(liked_spaces)

        # The 20 Spaces with the longest READMEs.
        readme_sizes = overview[['id', 'readme_size']].sort_values(by='readme_size', ascending=False).head(20)
        readme_sizes.columns = ['Space', 'Longest READMEs']
        gr.DataFrame(readme_sizes)

    with gr.Tab(label="Spaces Search"):
        # Only running Spaces are searchable.  Copy the slice so the column
        # assignments below write to an independent frame.
        df = spaces[spaces["stage"] == "RUNNING"].copy()

        # Combine the sdk column (a single string) with the tags column (an
        # array of strings): wrap sdk in a one-element array, then concatenate.
        df["sdk"] = df["sdk"].apply(lambda sdk: np.array([str(sdk)]))
        df["licenses"] = df["license"].apply(
            lambda lic: np.array([str(lic)]) if lic is None else lic
        )
        df["sdk_tags"] = df[["sdk", "tags"]].apply(
            lambda row: np.concatenate((row.iloc[0], row.iloc[1])), axis=1
        )

        df['emoji'] = np.where(df['emoji'].isnull(), '', df['emoji'])

        # Where custom_domains is set use it as the link target, otherwise
        # fall back to the Space id.
        df["url"] = np.where(
            df["custom_domains"].isnull(),
            df["id"],
            df["custom_domains"],
        )
        df["url"] = [
            _space_link(target, icon) for target, icon in zip(df["url"], df["emoji"])
        ]

        # Human-readable versions of the list-valued columns.
        df["r_models"] = _join_cells(df["models"])
        df["r_sdk_tags"] = _join_cells(df["sdk_tags"])
        df["r_datasets"] = _join_cells(df["datasets"])
        df["r_licenses"] = _join_cells(df["licenses"])

        emoji = gr.Dropdown(
            df["emoji"].unique().tolist(), label="Search by Emoji 🤗", multiselect=True
        )
        likes = gr.Slider(
            minimum=df["likes"].min(),
            maximum=df["likes"].max(),
            step=1,
            label="Filter by Likes",
        )
        author = gr.Dropdown(
            df["author"].unique().tolist(), label="Search by Author", multiselect=True
        )

        # Dropdown over the unique strings found in the sdk_tags column.
        sdk_tags = gr.Dropdown(
            np.unique(np.concatenate(df["sdk_tags"].values)).tolist(),
            label="Filter by SDK/Tags",
            multiselect=True,
        )
        # Hardware is filtered through exactly one control.  A second widget
        # bound to the same name would be rendered but never wired to the table.
        hardware = gr.CheckboxGroup(
            df["hardware"].unique().tolist(), label="Filter by Hardware"
        )

        space_license = gr.CheckboxGroup(
            np.unique(np.concatenate(df["licenses"].values)).tolist(),
            label="Filter by license",
        )

        models = gr.Dropdown(
            _unique_entries(df["models"]),
            label="Search by Model",
            multiselect=True,
        )
        datasets = gr.Dropdown(
            _unique_entries(df["datasets"]),
            label="Search by Dataset",
            multiselect=True,
        )

        # NOTE(review): this checkbox is rendered but not connected to anything.
        devMode = gr.Checkbox(value=False, label="DevMode Enabled")
        clear = gr.ClearButton(
            components=[
                emoji,
                author,
                hardware,
                sdk_tags,
                models,
                datasets,
                space_license,
            ]
        )

        # Keep only the columns the filter callback reads or displays.
        df = pd.DataFrame(
            df[
                [
                    "id",
                    "emoji",
                    "author",
                    "url",
                    "likes",
                    "hardware",
                    "sdk_tags",
                    "models",
                    "datasets",
                    "licenses",
                    "r_sdk_tags",
                    "r_models",
                    "r_datasets",
                    "r_licenses",
                ]
            ]
        )
        # The table re-runs filtered_df whenever one of the inputs changes;
        # the five widths correspond to the five columns it returns.
        gr.DataFrame(
            filtered_df,
            inputs=[
                emoji,
                likes,
                author,
                hardware,
                sdk_tags,
                models,
                datasets,
                space_license,
            ],
            datatype="html",
            wrap=True,
            column_widths=["25%", "5%", "25%", "25%", "20%"],
        )


demo.launch()