Spaces:

Yeyito
/

llm_contamination_detector

Runtime error

File size: 15,713 Bytes

import gradio as gr
import subprocess
import os
import sys
import time
import pandas as pd
from threading import Thread
import numpy as np

# Put the "src" directory of the detect-pretrain-code-contamination checkout
# (expected next to this file) at the front of sys.path.
project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "detect-pretrain-code-contamination")
)
src_dir = os.path.join(project_root, "src")
sys.path[:0] = [src_dir]

import run as evaluator  # Import the run module
from src.css_html import custom_css
from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT, SUBMISSION_TEXT_2
from src.envs import API, H4_TOKEN, REPO_ID
from huggingface_hub import HfApi
from src.utils import (
    AutoEvalColumn,
    fields,
    is_model_on_hub,
    make_clickable_names,
    styled_error,
    styled_message,
    EVAL_COLS,
    EVAL_TYPES
)

# Column names/types shown by the leaderboard, derived from AutoEvalColumn.
# NOTE(review): assumes fields() is a side-effect-free helper that yields the
# same descriptors on every call — TODO confirm in src/utils.
_visible_fields = [c for c in fields(AutoEvalColumn) if not c.hidden]
_default_fields = [c for c in _visible_fields if c.displayed_by_default]
COLS = [c.name for c in _visible_fields]
TYPES = [c.type for c in _visible_fields]
COLS_LITE = [c.name for c in _default_fields]
TYPES_LITE = [c.type for c in _default_fields]

# CONFIGURATION:
# evaluate() picks datasets from this list by position, so keep the order.
test_datasets = [
    "truthful_qa",
    "cais/mmlu",
    "ai2_arc",
    "gsm8k",
    "Rowan/hellaswag",
    "winogrande",
]
# Pending submissions loaded from disk, one list per CSV row.
_queue_frame = pd.read_csv('data/queue.csv')
modelQueue = _queue_frame.values.tolist()
print(modelQueue)

def restart_space():
    """Restart this Space by calling ``API.restart_space`` with ``REPO_ID`` and ``H4_TOKEN``."""
    # Blunt workaround kept from the original author: restarting the whole
    # Space was the only way found to make Gradio refresh the leaderboard.
    API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)

def formatr(result):
    """Return the third comma-separated piece of ``str(result)``, cleaned up.

    The value is stringified and split on ",". The piece at index 2 is
    returned with every ")" and space character removed. An IndexError is
    raised when the text has fewer than three comma-separated pieces.
    """
    third_piece = str(result).split(",")[2]
    # Drop closing parentheses and spaces in a single pass.
    return third_piece.translate(str.maketrans("", "", ") "))

def save_to_txt(model, results, model_type,ref_model):
    """Append one evaluation row to ``data/code_eval_board.csv`` and log it.

    Args:
        model: Name of the evaluated model (second column).
        results: Mapping with the keys "arc", "hellaswag", "mmlu",
            "truthfulQA", "winogrande" and "gsm8k"; each value is reduced to
            a single field by ``formatr``.
        model_type: Value written in the first column.
        ref_model: Name of the reference model (last column).

    Raises:
        KeyError: If ``results`` lacks one of the score keys.
        OSError: If the CSV file cannot be opened for appending.
    """
    file_path = "data/code_eval_board.csv"
    score_keys = ("arc", "hellaswag", "mmlu", "truthfulQA", "winogrande", "gsm8k")

    # Build the row once; it is both written to disk and echoed to the log.
    scores = ",".join(str(formatr(results[key])) for key in score_keys)
    row = f"\n{model_type},{model},{scores},{ref_model}"

    # Explicit UTF-8: model type values used in this file contain emoji, which
    # a non-UTF-8 locale default encoding could fail to write.
    with open(file_path, "a", encoding="utf-8") as f:
        f.write(row)

    print(f"Finished evaluation of model: {model} using ref_model: {ref_model}")
    print(row)

def run_test(model,ref_model,data):
    """Run the contamination evaluator on one dataset and return its result.

    ``model``, ``ref_model`` and ``data`` are passed on as strings; the
    remaining evaluator settings are fixed here.
    """
    print(f"|| TESTING {data} ||")
    # Keyword arguments for main() in detect-pretrain-code-contamination/src/run.py
    evaluator_args = {
        "target_model": f"{model}",
        "ref_model": f"{ref_model}",
        "output_dir": "out",
        "data": f"{data}",
        "length": 64,
        "key_name": "input",
        "ratio_gen": 0.4,
    }
    return evaluator.main(**evaluator_args)

def evaluate(model,model_type,ref_model):
    """Run every benchmark test for ``model`` and record the outcome.

    Each dataset is passed to ``run_test`` in turn, the collected results are
    handed to ``save_to_txt``, and a newline-separated ``key:value`` summary
    (including the ``ref_model`` entry) is returned.
    """
    print(f"|| EVALUATING {model} ||")

    # (result key, position in test_datasets), listed in execution order.
    benchmark_slots = (
        ("arc", 2),
        ("hellaswag", 4),
        ("mmlu", 1),
        ("truthfulQA", 0),
        ("winogrande", 5),
        ("gsm8k", 3),
    )
    results = {}
    for key, slot in benchmark_slots:
        results[key] = run_test(model, ref_model, test_datasets[slot])
    results["ref_model"] = ref_model

    # Persist the scores through save_to_txt.
    save_to_txt(model, results, model_type,ref_model)

    summary_lines = [f"{key}:{value}" for key, value in results.items()]
    return "\n".join(summary_lines)

def worker_thread():
    """Background loop over the submission queue.

    With the evaluation lines below commented out, this only walks
    ``modelQueue`` forever, sleeping one second per entry and one more second
    after each pass; nothing is evaluated.
    """
    # NOTE(review): `server` is not defined anywhere in the visible file, and
    # neither name is assigned in this function — confirm the global statement
    # is still needed.
    global modelQueue, server
    while True:
        for submission in modelQueue:
            #evaluate(submission[1],submission[0].split(" ")[0],submission[2])
            #modelQueue.pop(modelQueue.index(submission))
            #exit()

            # The exit() above is temporary while I figure out how to unload a model from a thread or similar.
            # Uncomment those lines in order to begin testing; I test these models outside of this space and later commit the results back.
            # I highly encourage you to try to reproduce the results I get using your own implementation.
            # Do NOT take anything listed here as fact, as I'm not 100% sure my implementation works as intended.
            # Take whatever you see in the leaderboard with a grain of salt; do NOT accuse models of cheating just because of their placement here alone.
            # NOTE(review): if the pop() is re-enabled without the exit(), removing
            # entries from modelQueue while iterating it will skip the next entry.

            time.sleep(1)

        time.sleep(1)

def queue(model,model_type,ref_model):
    """Add a submission to the in-memory queue and append it to ``data/queue.csv``.

    Args:
        model: Name of the model to evaluate; surrounding whitespace is removed.
        model_type: Model type label, stored as given.
        ref_model: Name of the reference model; surrounding whitespace is removed.

    Raises:
        OSError: If the queue file cannot be opened for appending.
    """
    # Normalise the names first so the in-memory entry and the CSV row agree.
    model = model.strip()
    ref_model = ref_model.strip()
    modelQueue.append([model_type,model,ref_model])

    file_path = "data/queue.csv"
    # Explicit UTF-8: the model type labels used in this file contain emoji.
    with open(file_path, "a", encoding="utf-8") as f:
        f.write(f"\n{model_type},{model},{ref_model}")
    print(f"QUEUE:\n{modelQueue}")

### bigcode/bigcode-models-leaderboard
def add_new_eval(
    model: str,
    revision: str,
    ref_model: str,
    model_type: str,
):
    """Validate a submission and place it on the evaluation queue.

    Args:
        model: Model name typed by the user; surrounding whitespace is ignored.
        revision: Revision to check; empty or blank means "main".
        ref_model: Reference model to evaluate against.
        model_type: Selected model type; must not be empty.

    Returns:
        The result of ``styled_error`` when validation fails, otherwise the
        result of ``styled_message`` confirming the submission.
    """
    if not model_type:
        return styled_error("Please select a model type.")
    print(model_type)

    # Trim before validating so the hub check sees the same name that queue()
    # writes to the queue file.
    model = (model or "").strip()
    if not model:
        return styled_error("Please enter a model name.")

    # A blank revision field means the default branch.
    revision = (revision or "").strip() or "main"

    # check the model actually exists before adding the eval
    model_on_hub, error = is_model_on_hub(model, revision)
    if not model_on_hub:
        return styled_error(f'Model "{model}" {error}')

    print("Adding new eval")
    queue(model,model_type,ref_model)
    return styled_message("Your request has been submitted to the evaluation queue!\n")

def select_columns(df, columns):
    """Return ``df`` reduced to the pinned columns plus the selected ones.

    The type-symbol and model columns always come first. The remaining
    columns are those present in both ``df`` and ``columns``, in ``COLS``
    order.
    """
    pinned = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]

    # Walk COLS rather than `columns` so the table keeps its canonical order.
    chosen = []
    for col in COLS:
        if col in df.columns and col in columns:
            chosen.append(col)

    return df[pinned + chosen]


def filter_items(df, leaderboard_table, query):
    """Filter ``df`` by model type and keep the columns of ``leaderboard_table``.

    ``query`` is either "All" (no filtering) or a label whose first character
    is compared with the "T" column.
    """
    if query == "All":
        return df[leaderboard_table.columns]

    type_symbol = query[0]  # take only the emoji character
    same_type = df["T"] == type_symbol
    return df[same_type][leaderboard_table.columns]

def search_table(df, leaderboard_table, query):
    """Return the rows of ``df`` whose "Models" value contains ``query``.

    Matching is a case-insensitive plain-substring test. An empty query
    returns every row. The result is limited to the columns of
    ``leaderboard_table``.
    """
    visible_columns = leaderboard_table.columns
    if not query:
        return df[visible_columns]

    # regex=False: the search box holds literal text, so characters such as
    # "+" or "(" must not be parsed as a (possibly invalid) regular expression.
    # na=False: rows without a model name never match instead of breaking the
    # boolean mask.
    matches = df["Models"].str.contains(query, case=False, regex=False, na=False)
    return df[matches][visible_columns]

# Build the Gradio UI: a page header followed by the "Evaluations", "About"
# and "Submit models" tabs. Everything in this block runs once, when the
# module is executed.
demo = gr.Blocks(css=custom_css)
with demo:
    # Page header (raw HTML inside a Markdown component).
    with gr.Row():
        gr.Markdown(
            """<div style="text-align: center;"><h1> 📄 LLM Contamination Detector </h1></div>\
            <br>\
            <p>Inspired from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a> and <a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">🤗 Big Code Models Leaderboard ⭐</a>, we use an implementation of <a href="https://huggingface.co/papers/2310.16789">Detecting Pretraining Data from Large Language Models</a> paper found in <a href="https://github.com/swj0419/detect-pretrain-code-contamination/tree/master">this github repo</a>, to provide contamination scores for LLMs on the datasets used by Open LLM Leaderboard.\
            This space should NOT be used to flag or accuse models of cheating / being contamined, instead, it should form part of a holistic assesment by the parties involved.</p>""",
            elem_classes="markdown-text",
        )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.Column():
            with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
                # ---- Tab: leaderboard with column picker, search and type filter ----
                with gr.TabItem("🔍 Evaluations", id=0):
                    with gr.Column():
                        with gr.Accordion("➡️ See filters", open=False):
                            # Column picker: offers every entry of COLS except the
                            # dummy, model and type-symbol columns; the COLS_LITE
                            # entries start out selected.
                            shown_columns = gr.CheckboxGroup(
                                choices=[
                                    c
                                    for c in COLS
                                    if c
                                    not in [
                                        AutoEvalColumn.dummy.name,
                                        AutoEvalColumn.model.name,
                                        AutoEvalColumn.model_type_symbol.name,
                                    ]
                                ],
                                value=[
                                    c
                                    for c in COLS_LITE
                                    if c
                                    not in [
                                        AutoEvalColumn.dummy.name,
                                        AutoEvalColumn.model.name,
                                        AutoEvalColumn.model_type_symbol.name,
                                    ]
                                ],
                                label="",
                                elem_id="column-select",
                                interactive=True,
                            )
                        # with gr.Column(min_width=780):
                        with gr.Row():
                            search_bar = gr.Textbox(
                                placeholder="🔍 Search for a model and press ENTER...",
                                show_label=False,
                                elem_id="search-bar",
                            )
                            filter_columns = gr.Radio(
                                label="⏚ Filter model types",
                                choices=["All", "🟢 Base", "🔶 Finetuned"],
                                value="All",
                                elem_id="filter-columns",
                            )

                    # Leaderboard data is read from disk once, while the UI is built.
                    df = pd.read_csv("data/code_eval_board.csv")
                    # Visible table: type symbol, model name and the columns
                    # selected in the picker at build time.
                    leaderboard_df = gr.components.Dataframe(
                        value=df[
                            [
                                AutoEvalColumn.model_type_symbol.name,
                                AutoEvalColumn.model.name,
                            ]
                            + shown_columns.value
                        ],
                        headers=[
                            AutoEvalColumn.model_type_symbol.name,
                            AutoEvalColumn.model.name,
                        ]
                        + shown_columns.value,
                        datatype=TYPES,
                        elem_id="leaderboard-table",
                        interactive=False,
                    )

                    # Invisible copy of the full table; the callbacks below take it
                    # as their data source and write their result to leaderboard_df.
                    hidden_leaderboard_df = gr.components.Dataframe(
                        value=df,
                        headers=COLS,
                        datatype=["str" for _ in range(len(COLS))],
                        visible=False,
                    )

                    # Pressing ENTER in the search box filters by model name.
                    search_bar.submit(
                        search_table,
                        [hidden_leaderboard_df, leaderboard_df, search_bar],
                        leaderboard_df,
                    )

                    # Changing the radio selection filters by model type.
                    filter_columns.change(
                        filter_items,
                        [hidden_leaderboard_df, leaderboard_df, filter_columns],
                        leaderboard_df,
                    )

                    # Changing the column picker rebuilds the visible column set.
                    shown_columns.change(
                        select_columns,
                        [hidden_leaderboard_df, shown_columns],
                        leaderboard_df,
                    )

                    gr.Markdown(
                        """
                    **Notes:**
                    - The Huggingface team is working on their own implementation of this paper as a space, I'll be leaving this space up until that's available.
                    - Some scores may not be entirely accurate according to the paper cited as I still work out the kinks and innacuracies of this implementation.
                    - For any issues, questions, or comments either open a discussion in this space's community tab or message me directly to my discord: yeyito777.
                    - Make sure to check the pinned discussion in this space's community tab for implementation details I'm not 100% about.
                    """,
                        elem_classes="markdown-text",
                    )

                # ---- Tab: static "about" text ----
                with gr.TabItem("📝 About", id=2):
                    gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
                # ---- Tab: submission form and current queue ----
                with gr.TabItem("🛠️ Submit models", id=3):
                    gr.Markdown(SUBMISSION_TEXT)
                    gr.Markdown(
                        "## 📤  Submit a model here:", elem_classes="markdown-text"
                    )
                    with gr.Column():
                        with gr.Column():
                            # The queue length in the title and the table contents are
                            # computed from modelQueue when the UI is built.
                            with gr.Accordion(
                                f"⏳ Evaluation Queue ({len(modelQueue)})",
                                open=False,
                            ):
                                with gr.Row():
                                    finished_eval_table = gr.components.Dataframe(
                                            value=pd.DataFrame(modelQueue, columns=['Type','Model','Reference Model']),
                                    )
                        with gr.Row():
                            model_name = gr.Textbox(label="Model name")
                            revision_name = gr.Textbox(
                                label="revision", placeholder="main"
                            )
                        with gr.Row():
                            ref_model = gr.Dropdown(
                                choices=[
                                    "mistralai/Mistral-7B-v0.1",
                                    "huggyllama/llama-7b",
                                    "NousResearch/Llama-2-7b-hf",
                                    "upstage/SOLAR-10.7B-v1.0",
                                ],
                                label="Reference Model",
                                multiselect=False,
                                value="mistralai/Mistral-7B-v0.1",
                                interactive=True,
                            )
                            # No default type: add_new_eval rejects an empty selection.
                            model_type = gr.Dropdown(
                                choices=["🟢 base", "🔶 finetuned"],
                                label="Model type",
                                multiselect=False,
                                value=None,
                                interactive=True,
                            )
                        submit_button = gr.Button("Submit Eval")
                        submission_result = gr.Markdown()
                        # The form values go to add_new_eval; its return value is
                        # rendered in submission_result.
                        submit_button.click(
                            add_new_eval,
                            inputs=[model_name, revision_name, ref_model, model_type],
                            outputs=[submission_result],
                        )
                        gr.Markdown(SUBMISSION_TEXT_2)

# Start the queue worker as a daemon thread: its loop never ends, so a
# non-daemon thread would keep the process alive after the main thread returns
# from demo.launch() (for example on shutdown).
thread = Thread(target=worker_thread, daemon=True)
thread.start()
demo.launch(share=True)

# Some worries:
# 1. Am I testing things correctly in eval.py, following the template format?

# 2. Am I choosing the correct splits in run.py? The hierarchy I use is: test > val > train
#   (As in: if test exists, I go with that, then validation, then default)

# 3. I decided to go with winogrande_debiased instead of winogrande_l arbitrarily.
#   (Not sure which one open llm leaderboard uses, or what is the standard)

# 4. I'm unsure why in eval.py we append the output at the end of the input.

# 5. Currently I'm using huggyllama/llama-7b as ref_model, should I switch to llama2-7B? Maybe Mistral-7B?