import gradio as gr
import pandas as pd

from dataset import get_dataframe
from markdown import GUIDELINES, PANEL_MARKDOWN

# Full contamination table, loaded once at import time. The filter functions
# below only derive new frames from it (indexing, drop, sort_values).
df = get_dataframe()

# Column that tags each row as "corpus" or "model"; it selects the tab a row
# belongs to and is dropped before the rows are displayed.
KIND_COLUMN = "Model or corpus"

# Columns formatted as percentages and inspected by the "contaminated" filter.
SPLIT_COLUMNS = ["Train Split", "Development Split", "Test Split"]

# Checkbox labels, shared by the UI and the filter so they cannot drift apart.
EXCLUDE_MODEL_BASED = "Exclude model-based evidences"
SHOW_ONLY_CONTAMINATED = "Show only contaminated"
SEARCH_OPTIONS = [EXCLUDE_MODEL_BASED, SHOW_ONLY_CONTAMINATED]

# Headers of the displayed tables: every column except the one that both
# filter_dataframe_corpus and filter_dataframe_model drop.
DISPLAY_HEADERS = [column for column in df.columns if column != KIND_COLUMN]

# One entry per displayed column.
# NOTE(review): assumes the displayed frame has exactly these 8 columns in this
# order - confirm against dataset.get_dataframe().
COLUMN_DATATYPES = [
    "markdown",
    "markdown",
    "number",
    "number",
    "number",
    "str",
    "markdown",
    "markdown",
]


def _contains_text(column, query):
    """Return a boolean mask of the cells in *column* that contain *query*.

    The match is case-insensitive and literal: the query is user-typed text,
    so it is not interpreted as a regular expression (characters such as
    "(" or "+" would otherwise raise or match unexpectedly). Missing cells
    count as non-matching so the mask never contains NaN.
    """
    return column.str.contains(query, case=False, regex=False, na=False)


def filter_dataframe(dataframe, eval_dataset, cont_source, checkboxes):
    """
    Filter, sort and format a contamination table.

    Args:
        dataframe (pandas.DataFrame): The input dataframe to filter.
        eval_dataset (str): Text that "Evaluation Dataset" must contain
            (case-insensitive, literal). Empty or non-string values disable
            this filter.
        cont_source (str): Text that "Contaminated Source" must contain
            (case-insensitive, literal). Empty or non-string values disable
            this filter.
        checkboxes (list): Selected search options; recognised entries are
            "Exclude model-based evidences" and "Show only contaminated".
            Non-list values are treated as no option selected.

    Returns:
        pandas.io.formats.style.Styler: The filtered rows sorted by
        "Test Split" in descending order, with the three split columns
        rendered as percentages and missing values shown as "Unknown".
    """
    # An empty query means "no filter"; skipping it also keeps rows whose
    # cell is missing in the unfiltered view.
    if isinstance(eval_dataset, str) and eval_dataset:
        dataframe = dataframe[
            _contains_text(dataframe["Evaluation Dataset"], eval_dataset)
        ]

    if isinstance(cont_source, str) and cont_source:
        dataframe = dataframe[
            _contains_text(dataframe["Contaminated Source"], cont_source)
        ]

    selected = checkboxes if isinstance(checkboxes, list) else []

    if EXCLUDE_MODEL_BASED in selected:
        dataframe = dataframe[dataframe["Approach"] != "model-based"]

    if SHOW_ONLY_CONTAMINATED in selected:
        # Keep rows where at least one split reports a value above zero.
        dataframe = dataframe[(dataframe[SPLIT_COLUMNS] > 0.0).any(axis=1)]

    dataframe = dataframe.sort_values("Test Split", ascending=False)

    return dataframe.style.format(
        {column: "{:.1%}" for column in SPLIT_COLUMNS},
        na_rep="Unknown",
    )


def _filter_by_kind(kind, *args, **kwargs):
    """Filter the rows of ``df`` whose "Model or corpus" value equals *kind*.

    The "Model or corpus" column is dropped from the selection, and the
    remaining arguments are forwarded unchanged to ``filter_dataframe``.
    """
    subset = df[df[KIND_COLUMN] == kind].drop(columns=[KIND_COLUMN])
    return filter_dataframe(subset, *args, **kwargs)


def filter_dataframe_corpus(*args, **kwargs) -> "pd.io.formats.style.Styler":
    """
    Filter the dataframe for corpus contamination.

    Accepts the same arguments as ``filter_dataframe`` after ``dataframe``
    (eval_dataset, cont_source, checkboxes).

    Returns:
        pandas.io.formats.style.Styler: The filtered and formatted rows whose
        "Model or corpus" value is "corpus".
    """
    return _filter_by_kind("corpus", *args, **kwargs)


def filter_dataframe_model(*args, **kwargs) -> "pd.io.formats.style.Styler":
    """
    Filter the dataframe for model contamination.

    Accepts the same arguments as ``filter_dataframe`` after ``dataframe``
    (eval_dataset, cont_source, checkboxes).

    Returns:
        pandas.io.formats.style.Styler: The filtered and formatted rows whose
        "Model or corpus" value is "model".
    """
    return _filter_by_kind("model", *args, **kwargs)


theme = gr.themes.Soft(
    primary_hue="emerald",
    secondary_hue="cyan",
    text_size="md",
    spacing_size="lg",
    font=[
        gr.themes.GoogleFont("Poppins"),
        gr.themes.GoogleFont("Poppins"),
        gr.themes.GoogleFont("Poppins"),
        gr.themes.GoogleFont("Poppins"),
    ],
).set(
    block_background_fill="*neutral_50",
    block_background_fill_dark="*neutral_950",
    section_header_text_size="*text_lg",
    section_header_text_weight="800",
)

with gr.Blocks(
    theme=theme,
    title="💨 Data Contamination Database",
    analytics_enabled=False,
    fill_height=True,
) as demo:
    gr.Markdown(PANEL_MARKDOWN)

    with gr.Tab("Corpus contamination") as tab_corpus:
        with gr.Row(variant="compact"):
            with gr.Column():
                eval_dataset_corpus = gr.Textbox(
                    placeholder="Evaluation dataset",
                    label="Evaluation dataset",
                    value="",
                )
                cont_corpora = gr.Textbox(
                    placeholder="Pre-training corpora",
                    label="Pre-training corpora",
                    value="",
                )
            with gr.Column():
                checkboxes_corpus = gr.CheckboxGroup(
                    SEARCH_OPTIONS,
                    label="Search options",
                    value=[],
                )
                filter_corpus_btn = gr.Button("Filter")

        corpus_dataframe = gr.DataFrame(
            # Initial, unfiltered view: pass empty filter values explicitly
            # rather than the component objects themselves.
            value=filter_dataframe_corpus("", "", []),
            headers=DISPLAY_HEADERS,
            datatype=COLUMN_DATATYPES,
        )

    with gr.Tab("Model contamination") as tab_model:
        with gr.Row(variant="compact"):
            with gr.Column():
                eval_dataset_model = gr.Textbox(
                    placeholder="Evaluation dataset",
                    label="Evaluation dataset",
                    value="",
                )
                cont_model = gr.Textbox(
                    placeholder="Model", label="Pre-trained model", value=""
                )
            with gr.Column():
                checkboxes_model = gr.CheckboxGroup(
                    SEARCH_OPTIONS,
                    label="Search options",
                    value=[],
                )
                filter_model_btn = gr.Button("Filter")

        model_dataframe = gr.DataFrame(
            # Initial, unfiltered view (see the corpus table above).
            value=filter_dataframe_model("", "", []),
            headers=DISPLAY_HEADERS,
            datatype=COLUMN_DATATYPES,
        )

    # Each "Filter" button recomputes its own table from that tab's inputs.
    filter_corpus_btn.click(
        filter_dataframe_corpus,
        inputs=[eval_dataset_corpus, cont_corpora, checkboxes_corpus],
        outputs=corpus_dataframe,
    )
    filter_model_btn.click(
        filter_dataframe_model,
        inputs=[eval_dataset_model, cont_model, checkboxes_model],
        outputs=model_dataframe,
    )

    with gr.Tab("Contribution Guidelines") as tab_guidelines:
        gr.Markdown(GUIDELINES)

demo.launch()