import gradio as gr
import subprocess
import os
import sys
import time
import pandas as pd
from threading import Thread
# Add the path to the "src" directory of detect-pretrain-code-contamination to the sys.path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "detect-pretrain-code-contamination"))
src_dir = os.path.join(project_root, "src")
sys.path.insert(0, src_dir)
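# With src_dir on sys.path, the "run" module imported below resolves to detect-pretrain-code-contamination/src/run.py.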
import run as evaluator # Import the run module
from src.css_html import custom_css
from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT, SUBMISSION_TEXT_2
from src.envs import API, H4_TOKEN, REPO_ID
from huggingface_hub import HfApi
from src.utils import (
AutoEvalColumn,
fields,
is_model_on_hub,
make_clickable_names,
styled_error,
styled_message,
)
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
# CONFIGURATION:
ref_model = "huggyllama/llama-7b"
test_datasets = ["truthful_qa","cais/mmlu","ai2_arc","gsm8k","Rowan/hellaswag","winogrande"]
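# These datasets mirror the benchmarks used by the Open LLM Leaderboard; evaluate() below reports one contamination score per dataset.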
modelQueue = []
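# Pending submissions as [model, model_type]; drained by the background worker_thread below.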
def restart_space(): # Workaround: restarting the Space is the only reliable way I've found to make Gradio reload the leaderboard.
    API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
def save_results(model, results, model_type):
    # Append one row to the CSV backing the leaderboard Dataframe, then restart the Space to refresh it.
    file_path = "data/code_eval_board.csv"
    with open(file_path, "a") as f:
        f.write(
            f"\n{model_type},{model},"
            + ",".join(str(results[k]) for k in ["arc", "hellaswag", "mmlu", "truthfulQA", "winogrande", "gsm8k"])
        )
    restart_space()
def run_test(model, ref_model, data):
    print(f"|| TESTING {data} ||")
    # Call the main function in detect-pretrain-code-contamination/src/run.py
    return evaluator.main(
        target_model=model,
        ref_model=ref_model,
        output_dir="out",
        data=data,
        length=64,
        key_name="input",
        ratio_gen=0.4,
    )
def evaluate(model, model_type):
    global ref_model
    print(f"|| EVALUATING {model} ||")
    results = {
        "arc": run_test(model, ref_model, test_datasets[2]),  # ai2_arc
        "hellaswag": run_test(model, ref_model, test_datasets[4]),  # Rowan/hellaswag
        "mmlu": run_test(model, ref_model, test_datasets[1]),  # cais/mmlu
        "truthfulQA": run_test(model, ref_model, test_datasets[0]),  # truthful_qa
        "winogrande": run_test(model, ref_model, test_datasets[5]),  # winogrande
        "gsm8k": run_test(model, ref_model, test_datasets[3]),  # gsm8k
        "ref_model": ref_model,
    }
    # Append the scores to data/code_eval_board.csv and restart the Space so the leaderboard refreshes.
    save_results(model, results, model_type)
    return "\n".join([f"{k}:{results[k]}" for k in results])
def worker_thread():
    global modelQueue
    while True:
        # Process submissions one at a time; pop after evaluating so we never
        # mutate the list while iterating over it.
        while modelQueue:
            model, model_type = modelQueue[0]
            evaluate(model, model_type.split(" ")[0])  # keep only the emoji marker (🟢 / 🔶)
            modelQueue.pop(0)
        time.sleep(1)
def queue(model, model_type):
    # Enqueue a submission for the background worker; results appear once the Space restarts.
    global modelQueue
    modelQueue.append([model, model_type])
    print(f"QUEUE:\n{modelQueue}")
### Adapted from bigcode/bigcode-models-leaderboard
def add_new_eval(
    model: str,
    revision: str,
    precision: str,
    model_type: str,
):
    if model_type is None or model_type == "":
        return styled_error("Please select a model type.")
    print(model_type)
    # Check that the model actually exists on the Hub before queuing the eval.
    if revision == "":
        revision = "main"
    model_on_hub, error = is_model_on_hub(model, revision)
    if not model_on_hub:
        return styled_error(f'Model "{model}" {error}')
    print("Adding new eval")
    queue(model, model_type)
    return styled_message("Your request has been submitted to the evaluation queue!\n")
def select_columns(df, columns):
always_here_cols = [
AutoEvalColumn.model_type_symbol.name,
AutoEvalColumn.model.name,
]
    # Iterate over COLS so the leaderboard keeps its original column ordering
filtered_df = df[
always_here_cols + [c for c in COLS if c in df.columns and c in columns]
]
return filtered_df
def filter_items(df, leaderboard_table, query):
if query == "All":
return df[leaderboard_table.columns]
else:
query = query[0] # take only the emoji character
filtered_df = df[(df["T"] == query)]
return filtered_df[leaderboard_table.columns]
def search_table(df, leaderboard_table, query):
filtered_df = df[(df["Models"].str.contains(query, case=False))]
return filtered_df[leaderboard_table.columns]
demo = gr.Blocks(css=custom_css)
with demo:
with gr.Row():
gr.Markdown(
"""<div style="text-align: center;"><h1> πŸ“„ LLM Contamination Detector </h1></div>\
<br>\
<p>Inspired from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">πŸ€— Open LLM Leaderboard</a> and <a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">πŸ€— Big Code Models Leaderboard ⭐</a>, we use an implementation of <a href="https://huggingface.co/papers/2310.16789">Detecting Pretraining Data from Large Language Models</a> paper found in <a href="https://github.com/swj0419/detect-pretrain-code-contamination/tree/master">this github repo</a>, to provide contamination scores for LLMs on the datasets used by Open LLM Leaderboard.\
This space should NOT be used to flag or accuse models of cheating / being contamined, instead, it should form part of a holistic assesment by the parties involved.</p>""",
elem_classes="markdown-text",
)
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.Column():
with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
                with gr.TabItem("🔍 Evaluations", id=0):
with gr.Column():
                        with gr.Accordion("➡️ See filters", open=False):
shown_columns = gr.CheckboxGroup(
choices=[
c
for c in COLS
if c
not in [
AutoEvalColumn.dummy.name,
AutoEvalColumn.model.name,
AutoEvalColumn.model_type_symbol.name,
]
],
value=[
c
for c in COLS_LITE
if c
not in [
AutoEvalColumn.dummy.name,
AutoEvalColumn.model.name,
AutoEvalColumn.model_type_symbol.name,
]
],
label="",
elem_id="column-select",
interactive=True,
)
# with gr.Column(min_width=780):
with gr.Row():
search_bar = gr.Textbox(
                                placeholder="🔍 Search for a model and press ENTER...",
show_label=False,
elem_id="search-bar",
)
filter_columns = gr.Radio(
label="⏚ Filter model types",
choices=["All", "🟒 Base", "πŸ”Ά Finetuned"],
value="All",
elem_id="filter-columns",
)
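                        # Leaderboard data; rows are appended by save_results() after each evaluation.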
df = pd.read_csv("data/code_eval_board.csv")
leaderboard_df = gr.components.Dataframe(
value=df[
[
AutoEvalColumn.model_type_symbol.name,
AutoEvalColumn.model.name,
]
+ shown_columns.value
],
headers=[
AutoEvalColumn.model_type_symbol.name,
AutoEvalColumn.model.name,
]
+ shown_columns.value,
datatype=TYPES,
elem_id="leaderboard-table",
interactive=False,
)
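                        # Hidden copy of the full dataframe, used as the source for the search/filter/column callbacks below.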
hidden_leaderboard_df = gr.components.Dataframe(
value=df,
headers=COLS,
datatype=["str" for _ in range(len(COLS))],
visible=False,
)
search_bar.submit(
search_table,
[hidden_leaderboard_df, leaderboard_df, search_bar],
leaderboard_df,
)
filter_columns.change(
filter_items,
[hidden_leaderboard_df, leaderboard_df, filter_columns],
leaderboard_df,
)
shown_columns.change(
select_columns,
[hidden_leaderboard_df, shown_columns],
leaderboard_df,
)
gr.Markdown(
"""
**Notes:**
- The Huggingface team is working on their own implementation of this paper as a space, I'll be leaving this space up until that's available.
- Some scores may not be entirely accurate according to the paper cited as I still work out the kinks and innacuracies of this implementation.
- For any issues, questions, or comments either open a discussion in this space's community tab or message me directly to my discord: yeyito777.
- Make sure to check the pinned discussion in this space's community tab for implementation details I'm not 100% about.
""",
elem_classes="markdown-text",
)
                with gr.TabItem("📝 About", id=2):
gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
                with gr.TabItem("🛠️ Submit models", id=3):
gr.Markdown(SUBMISSION_TEXT)
gr.Markdown(
"## πŸ“€ Submit a model here:", elem_classes="markdown-text"
)
with gr.Column():
with gr.Row():
model_name = gr.Textbox(label="Model name")
revision_name = gr.Textbox(
label="revision", placeholder="main"
)
with gr.Row():
precision = gr.Dropdown(
choices=[
"float16",
"bfloat16",
"8bit",
"4bit",
],
label="Precision",
multiselect=False,
value="float16",
interactive=True,
)
model_type = gr.Dropdown(
choices=["🟒 base", "πŸ”Ά instruction-tuned"],
label="Model type",
multiselect=False,
value=None,
interactive=True,
)
submit_button = gr.Button("Submit Eval")
submission_result = gr.Markdown()
submit_button.click(
add_new_eval,
inputs=[model_name, revision_name, precision, model_type],
outputs=[submission_result],
)
gr.Markdown(SUBMISSION_TEXT_2)
# Background worker that processes queued submissions; daemon so it won't block process exit.
thread = Thread(target=worker_thread, daemon=True)
thread.start()
demo.launch()
# Some worries:
# 1. Am I testing things correctly in eval.py, following the template format?
# 2. Am I choosing the correct splits in run.py? The hierarchy I use is: test > val > train
#    (As in: if a test split exists I go with that, then validation, then the default train split.) See the sketch below.
# 3. I decided to go with winogrande_debiased instead of winogrande_l arbitrarily.
#    (Not sure which one the Open LLM Leaderboard uses, or what the standard is.)
# 4. I'm unsure why in eval.py we append the output at the end of the input.
# 5. Currently I'm using huggyllama/llama-7b as ref_model, should I switch to llama2-7B? Maybe Mistral-7B?
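# A minimal sketch of the split-selection order described in (2), assuming a
# datasets.DatasetDict named `ds` (illustrative only, not used by this app):
#
#     def pick_split(ds):
#         # Prefer held-out data: test, then validation, then the default train split.
#         for split in ("test", "validation", "train"):
#             if split in ds:
#                 return ds[split]
#         raise ValueError("no usable split found")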