parti-prompts-leaderboard

Runtime error

File size: 7,128 Bytes

from datasets import load_dataset
from collections import Counter, defaultdict
import pandas as pd
from huggingface_hub import list_datasets
import os
import gradio as gr

# Empty list kept at module level; not referenced in the visible part of this file.
parti_prompt_results = []

# Hub organisation that hosts the per-model image datasets.
ORG = "diffusers-parti-prompts"

# Model key -> model card URL. The insertion order here defines the model order
# used by every derived constant below.
LINKS = {
    "sd-v1-5": "https://huggingface.co/runwayml/stable-diffusion-v1-5",
    "sd-v2-1": "https://huggingface.co/stabilityai/stable-diffusion-2-1",
    "if-v1-0": "https://huggingface.co/DeepFloyd/IF-I-XL-v1.0",
    "karlo": "https://huggingface.co/kakaobrain/karlo-v1-alpha",
}

# Same keys as LINKS, each mapped to None.
SUBMISSIONS = dict.fromkeys(LINKS)

# All model keys joined with "-"; used to build the name of the Hub account
# whose datasets hold the collected votes.
MODEL_KEYS = "-".join(SUBMISSIONS)
SUBMISSION_ORG = f"results-{MODEL_KEYS}"

submission_names = list(SUBMISSIONS)
# Per-prompt metadata, indexed by prompt position in the "sd-v1-5" dataset.
# The repo id is built with "/" explicitly: it is a Hub identifier, not a
# filesystem path, and os.path.join would insert "\" on Windows.
# The split is loaded once and both columns are read from it.
_parti_prompts_train = load_dataset(f"{ORG}/sd-v1-5")["train"]
parti_prompt_categories = _parti_prompts_train["Category"]
parti_prompt_challenge = _parti_prompts_train["Challenge"]


def load_submissions():
    """Download every vote dataset and tally the recorded results.

    Each dataset published by ``SUBMISSION_ORG`` is loaded and its ``train``
    split is read row by row. A row contributes its ``result`` value
    (presumably the key of the preferred model -- confirm against the game
    that writes these datasets) to three tallies: overall, per challenge and
    per category. The challenge and category of a row are looked up by its
    ``id`` in the module-level ``parti_prompt_challenge`` and
    ``parti_prompt_categories`` columns.

    Returns:
        A 3-tuple ``(main_dict, challenges, categories)``:

        * ``main_dict`` maps each result value to its share of all rows,
          formatted as a percentage string with two decimals.
        * ``challenges`` maps each challenge to a ``Counter`` of result values.
        * ``categories`` maps each category to a ``Counter`` of result values.
    """
    dataset_ids = [info.id for info in list_datasets(author=SUBMISSION_ORG)]

    overall_wins = Counter()
    wins_by_challenge = defaultdict(Counter)
    wins_by_category = defaultdict(Counter)

    for dataset_id in dataset_ids:
        votes = load_dataset(dataset_id)["train"]
        for winner, prompt_idx in zip(votes["result"], votes["id"]):
            overall_wins[winner] += 1
            wins_by_challenge[parti_prompt_challenge[prompt_idx]][winner] += 1
            wins_by_category[parti_prompt_categories[prompt_idx]][winner] += 1

    total_votes = sum(overall_wins.values())
    main_dict = {
        model: '{:.2%}'.format(wins / total_votes)
        for model, wins in overall_wins.items()
    }

    # Hand back plain dicts so callers do not get auto-created keys on lookup.
    return main_dict, dict(wins_by_challenge), dict(wins_by_category)

def sort_by_highest_percentage(df):
    """Reorder the columns of *df* by the value in row ``0``, highest first.

    The cells of row ``0`` are expected to be percentage strings such as
    ``"45.32%"`` (plain numbers are accepted too). They are parsed to numbers
    before sorting; comparing the raw strings would rank ``"9.50%"`` above
    ``"45.32%"`` because string comparison is character by character.

    Args:
        df: Frame whose row labelled ``0`` holds one score per column.

    Returns:
        A frame with the same data and the columns in descending score order.
        An empty frame is returned unchanged.
    """
    if df.empty:
        return df

    scores = pd.to_numeric(df.loc[0].astype(str).str.rstrip("%"))
    ranked_columns = scores.sort_values(ascending=False).index

    return df[ranked_columns]

def _percentage_breakdown(counts, column_order, index_label):
    """Build one formatted share table from ``{group: Counter}`` tallies.

    Args:
        counts: Mapping of group name to a ``Counter`` of result values.
        column_order: Column labels, in the order they should appear.
        index_label: Name given to the column that holds the group names.

    Returns:
        A frame with one row per group, a leading *index_label* column and
        one percentage-string column per entry in *column_order*.
    """
    frame = pd.DataFrame.from_dict(counts).fillna(0).T
    # Normalise each row so the shares within a group add up to 1.
    frame = frame.div(frame.sum(axis=1), axis=0)
    # Column-wise Series.map gives the same element-wise result as
    # DataFrame.applymap, which is deprecated since pandas 2.1.
    frame = frame.apply(lambda column: column.map('{:.2%}'.format))
    frame = frame.reindex(columns=column_order)
    return frame.reset_index().rename(columns={'index': index_label})


def get_dataframe_all():
    """Load all votes and build the three leaderboard tables.

    Returns:
        A 3-tuple ``(main_frame, challenges_frame, categories_frame)``.
        ``main_frame`` has a single row with the overall share per result
        value, columns ordered by ``sort_by_highest_percentage``. The other
        two frames break the shares down per challenge and per category and
        use the same column order as ``main_frame``.
    """
    main, challenges, categories = load_submissions()

    main_frame = sort_by_highest_percentage(pd.DataFrame([main]))
    column_order = main_frame.columns.to_list()

    challenges_frame = _percentage_breakdown(challenges, column_order, 'Challenge')
    categories_frame = _percentage_breakdown(categories, column_order, 'Category')

    return main_frame, challenges_frame, categories_frame

# Markdown heading shown at the top of the page.
TITLE = "# Open Parti Prompts Leaderboard"

# Markdown introduction rendered directly below the title.
DESCRIPTION = """
The *Open Parti Prompts Leaderboard* compares state-of-the-art, open-source text-to-image models to each other according to **human preferences**. \n\n
Text-to-image models are notoriously difficult to evaluate. [FID](https://en.wikipedia.org/wiki/Fr%C3%A9chet_inception_distance) and 
[CLIP Score](https://arxiv.org/abs/2104.08718) are not enough to accurately state whether a text-to-image model can 
**generate "good" images**. "Good" is extremely difficult to put into numbers. \n\n
Instead, the **Open Parti Prompts Leaderboard** uses human feedback from the community to compare images from different text-to-image models to each other.

\n\n

❤️ ***Please take 3 minutes to contribute to the benchmark.*** \n
👉 ***Play one round of [Open Parti Prompts Game](https://huggingface.co/spaces/OpenGenAI/open-parti-prompts) to contribute 10 answers.*** 🤗
"""

# Markdown section explaining where the votes come from and how to read the tables.
EXPLANATION = """\n\n
## How is the data collected 📊 \n\n

In more detail, the [Open Parti Prompts Game](https://huggingface.co/spaces/OpenGenAI/open-parti-prompts) collects human preferences that state which generated image 
best fits a given prompt from the [Parti Prompts](https://huggingface.co/datasets/nateraw/parti-prompts) dataset. Parti Prompts has been designed to challenge
text-to-image models on prompts of varying categories and difficulty. The images have been pre-generated from the models that are compared in this space.
For more information of how the images were created, please refer to [Open Parti Prompts](https://huggingface.co/spaces/OpenGenAI/open-parti-prompts).
The community's answers are then stored and used in this space to give a human-preference-based comparison of the different models. \n\n

Currently the leaderboard includes the following models:
- [sd-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5)
- [sd-v2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1)
- [if-v1-0](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0)
- [karlo](https://huggingface.co/kakaobrain/karlo-v1-alpha) \n\n

In the following you can see three result tables. The first shows the overall comparison of the 4 models. The score states, 
**the percentage at which images generated from the corresponding model are preferred over the image from all other models**. The second and third tables
show you a breakdown analysis per category and per type of challenge as defined by [Parti Prompts](https://huggingface.co/datasets/nateraw/parti-prompts).
"""

# Number of entries in SUBMISSIONS; not referenced in the visible part of this file.
GALLERY_COLUMN_NUM = len(SUBMISSIONS)


def refresh():
    """Rebuild the leaderboard tables from the latest stored votes.

    Returns:
        The tuple produced by ``get_dataframe_all``, in its order:
        overall frame, per-challenge frame, per-category frame.
    """
    main_frame, challenges_frame, categories_frame = get_dataframe_all()
    return main_frame, challenges_frame, categories_frame

def _readonly_table(frame):
    """Create a non-editable, all-string Gradio table sized to *frame*.

    Must be called inside an open ``gr.Blocks`` context so the component is
    attached to the layout being built.
    """
    return gr.Dataframe(
        value=frame,
        headers=frame.columns.to_list(),
        datatype="str",
        row_count=frame.shape[0],
        col_count=frame.shape[1],
        interactive=False,
    )


# Page layout: intro text, then the overall table, the per-category table,
# the per-challenge table and a button that reloads all three.
with gr.Blocks() as demo:
    with gr.Column(visible=True) as intro_view:
        gr.Markdown(TITLE)
        gr.Markdown(DESCRIPTION)
        gr.Markdown(EXPLANATION)

    # Not used below in the visible code; kept so the module's names are unchanged.
    headers = list(SUBMISSIONS.keys())
    datatype = "str"

    # get_dataframe_all returns (overall, per-challenge, per-category).
    main_df, challenge_df, category_df = get_dataframe_all()

    with gr.Column():
        gr.Markdown("# Open Parti Prompts")
        main_dataframe = _readonly_table(main_df)

    with gr.Column():
        gr.Markdown("## per category")
        cat_dataframe = _readonly_table(category_df)

    with gr.Column():
        gr.Markdown("## per challenge")
        chal_dataframe = _readonly_table(challenge_df)

    with gr.Row():
        refresh_button = gr.Button("Refresh")
        # refresh() returns (overall, per-challenge, per-category), so the
        # outputs must be listed in that order. Listing the category table
        # second would swap the two breakdown tables on every refresh.
        refresh_button.click(
            refresh,
            inputs=[],
            outputs=[main_dataframe, chal_dataframe, cat_dataframe],
        )

demo.launch()