import gradio as gr
import subprocess
import os
import sys
import time
import pandas as pd
from threading import Thread
import numpy as np

# Add the path to the "src" directory of detect-pretrain-code-contamination to the sys.path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "detect-pretrain-code-contamination"))
src_dir = os.path.join(project_root, "src")
sys.path.insert(0, src_dir)

import run as evaluator  # Import the run module
from src.css_html import custom_css
from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT, SUBMISSION_TEXT_2
from src.envs import API, H4_TOKEN, REPO_ID
from huggingface_hub import HfApi
from src.utils import (
    AutoEvalColumn,
    fields,
    is_model_on_hub,
    make_clickable_names,
    styled_error,
    styled_message,
    EVAL_COLS,
    EVAL_TYPES,
)

COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]

# CONFIGURATION:
test_datasets = ["truthful_qa", "cais/mmlu", "ai2_arc", "gsm8k", "Rowan/hellaswag", "winogrande"]
modelQueue = pd.read_csv("data/queue.csv").values.tolist()
print(modelQueue)

def restart_space():  # Dumbest update function to ever exist; I've tried to make Gradio refresh the leaderboard literally any other way, with no luck.
    API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)

def formatr(result):
    result = str(result)
    result = result.split(",")[2].replace(")", "")
    result = result.replace(" ", "")
    return result
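
# Illustration (the exact shape of run.main's return value is an assumption here): formatr() pulls the
# third comma-separated field out of str(result) and strips spaces and the closing parenthesis, so a
# hypothetical call formatr("(0.4, 0.62, 0.85)") returns "0.85".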

def save_to_txt(model, results, model_type, ref_model):
    file_path = "data/code_eval_board.csv"
    scores = ",".join(str(formatr(results[k])) for k in ["arc", "hellaswag", "mmlu", "truthfulQA", "winogrande", "gsm8k"])
    row = f"\n{model_type},{model},{scores},{ref_model}"
    with open(file_path, "a") as f:
        f.write(row)
    print(f"Finished evaluation of model: {model} using ref_model: {ref_model}")
    print(row)
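
# Hypothetical row appended by save_to_txt (made-up scores, placeholder model name):
# 🔶,org/candidate-model,0.12,0.34,0.56,0.78,0.21,0.43,mistralai/Mistral-7B-v0.1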

def run_test(model, ref_model, data):
    print(f"|| TESTING {data} ||")
    return evaluator.main(
        target_model=f"{model}",
        ref_model=f"{ref_model}",
        output_dir="out",
        data=f"{data}",
        length=64,
        key_name="input",
        ratio_gen=0.4,
    )  # Call the main function in detect-pretrain-code-contamination/src/run.py
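
# Hypothetical usage, mirroring how evaluate() calls this below (placeholder model name):
# result = run_test("org/candidate-model", "mistralai/Mistral-7B-v0.1", "cais/mmlu")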

def evaluate(model, model_type, ref_model):
    print(f"|| EVALUATING {model} ||")
    results = {
        "arc": run_test(model, ref_model, test_datasets[2]),
        "hellaswag": run_test(model, ref_model, test_datasets[4]),
        "mmlu": run_test(model, ref_model, test_datasets[1]),
        "truthfulQA": run_test(model, ref_model, test_datasets[0]),
        "winogrande": run_test(model, ref_model, test_datasets[5]),
        "gsm8k": run_test(model, ref_model, test_datasets[3]),
        "ref_model": ref_model,
    }
    # Append the results as a new row of data/code_eval_board.csv
    save_to_txt(model, results, model_type, ref_model)
    return "\n".join([f"{k}:{results[k]}" for k in results])

def worker_thread():
    global modelQueue, server
    while True:
        for submission in modelQueue:
            #evaluate(submission[1],submission[0].split(" ")[0],submission[2])
            #modelQueue.pop(modelQueue.index(submission))
            #exit()
            # The exit() above is temporary while I figure out how to unload a model from a thread or similar.

            # Uncomment the lines above to begin testing. I test these models outside of this space and later commit the results back.
            # I highly encourage you to try to reproduce the results I get using your own implementation.
            # Do NOT take anything listed here as fact, as I'm not 100% sure my implementation works as intended.
            # Take whatever you see in the leaderboard with a grain of salt, and do NOT accuse models of cheating because of their placement here alone.
            time.sleep(1)
        time.sleep(1)

def queue(model, model_type, ref_model):
    global modelQueue
    modelQueue.append([model_type, model, ref_model])

    file_path = "data/queue.csv"
    with open(file_path, "a") as f:
        model = model.strip()
        ref_model = ref_model.strip()
        f.write(f"\n{model_type},{model},{ref_model}")
    print(f"QUEUE:\n{modelQueue}")

### bigcode/bigcode-models-leaderboard
def add_new_eval(
    model: str,
    revision: str,
    ref_model: str,
    model_type: str,
):
    if model_type is None or model_type == "" or model_type == []:
        return styled_error("Please select a model type.")
    print(model_type)

    # check the model actually exists before adding the eval
    if revision == "":
        revision = "main"
    model_on_hub, error = is_model_on_hub(model, revision)
    if not model_on_hub:
        return styled_error(f'Model "{model}" {error}')

    print("Adding new eval")
    queue(model, model_type, ref_model)
    return styled_message("Your request has been submitted to the evaluation queue!\n")

def select_columns(df, columns):
    always_here_cols = [
        AutoEvalColumn.model_type_symbol.name,
        AutoEvalColumn.model.name,
    ]
    # We use COLS to maintain sorting
    filtered_df = df[
        always_here_cols + [c for c in COLS if c in df.columns and c in columns]
    ]
    return filtered_df

def filter_items(df, leaderboard_table, query):
    if query == "All":
        return df[leaderboard_table.columns]
    else:
        query = query[0]  # take only the emoji character
    filtered_df = df[(df["T"] == query)]
    return filtered_df[leaderboard_table.columns]


def search_table(df, leaderboard_table, query):
    filtered_df = df[(df["Models"].str.contains(query, case=False))]
    return filtered_df[leaderboard_table.columns]
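
# Illustration of the two helpers above (hypothetical data; assumes the board's "T" column holds the
# model-type emoji and "Models" holds the model names, as the column accesses above suggest):
# filter_items(df, leaderboard_df, "🟢 Base") keeps rows where df["T"] == "🟢" (query[0] is the emoji);
# search_table(df, leaderboard_df, "mistral") keeps rows whose "Models" value contains "mistral" (case-insensitive).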

demo = gr.Blocks(css=custom_css)
with demo:
    with gr.Row():
        gr.Markdown(
            """<div style="text-align: center;"><h1> 🔍 LLM Contamination Detector </h1></div>\
            <br>\
            <p>Inspired by the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a> and the <a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">🤗 Big Code Models Leaderboard</a>, we use an implementation of the paper <a href="https://huggingface.co/papers/2310.16789">Detecting Pretraining Data from Large Language Models</a> found in <a href="https://github.com/swj0419/detect-pretrain-code-contamination/tree/master">this GitHub repo</a> to provide contamination scores for LLMs on the datasets used by the Open LLM Leaderboard.\
            This space should NOT be used to flag or accuse models of cheating / being contaminated; instead, it should form part of a holistic assessment by the parties involved.</p>""",
            elem_classes="markdown-text",
        )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.Column():
            with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
                with gr.TabItem("🔍 Evaluations", id=0):
                    with gr.Column():
                        with gr.Accordion("➡️ See filters", open=False):
                            shown_columns = gr.CheckboxGroup(
                                choices=[
                                    c
                                    for c in COLS
                                    if c
                                    not in [
                                        AutoEvalColumn.dummy.name,
                                        AutoEvalColumn.model.name,
                                        AutoEvalColumn.model_type_symbol.name,
                                    ]
                                ],
                                value=[
                                    c
                                    for c in COLS_LITE
                                    if c
                                    not in [
                                        AutoEvalColumn.dummy.name,
                                        AutoEvalColumn.model.name,
                                        AutoEvalColumn.model_type_symbol.name,
                                    ]
                                ],
                                label="",
                                elem_id="column-select",
                                interactive=True,
                            )
                        # with gr.Column(min_width=780):
                        with gr.Row():
                            search_bar = gr.Textbox(
                                placeholder="🔍 Search for a model and press ENTER...",
                                show_label=False,
                                elem_id="search-bar",
                            )
                            filter_columns = gr.Radio(
                                label="⏚ Filter model types",
                                choices=["All", "🟢 Base", "🔶 Finetuned"],
                                value="All",
                                elem_id="filter-columns",
                            )
                        df = pd.read_csv("data/code_eval_board.csv")
                        leaderboard_df = gr.components.Dataframe(
                            value=df[
                                [
                                    AutoEvalColumn.model_type_symbol.name,
                                    AutoEvalColumn.model.name,
                                ]
                                + shown_columns.value
                            ],
                            headers=[
                                AutoEvalColumn.model_type_symbol.name,
                                AutoEvalColumn.model.name,
                            ]
                            + shown_columns.value,
                            datatype=TYPES,
                            elem_id="leaderboard-table",
                            interactive=False,
                        )
                        hidden_leaderboard_df = gr.components.Dataframe(
                            value=df,
                            headers=COLS,
                            datatype=["str" for _ in range(len(COLS))],
                            visible=False,
                        )
                        search_bar.submit(
                            search_table,
                            [hidden_leaderboard_df, leaderboard_df, search_bar],
                            leaderboard_df,
                        )
                        filter_columns.change(
                            filter_items,
                            [hidden_leaderboard_df, leaderboard_df, filter_columns],
                            leaderboard_df,
                        )
                        shown_columns.change(
                            select_columns,
                            [hidden_leaderboard_df, shown_columns],
                            leaderboard_df,
                        )
                        gr.Markdown(
                            """
**Notes:**
- The Hugging Face team is working on their own implementation of this paper as a space; I'll keep this space up until that's available.
- Some scores may not be entirely accurate with respect to the cited paper while I work out the kinks and inaccuracies of this implementation.
- For any issues, questions, or comments, either open a discussion in this space's community tab or message me directly on Discord: yeyito777.
- Make sure to check the pinned discussion in this space's community tab for implementation details I'm not 100% sure about.
""",
                            elem_classes="markdown-text",
                        )
                with gr.TabItem("📝 About", id=2):
                    gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
                with gr.TabItem("🛠️ Submit models", id=3):
                    gr.Markdown(SUBMISSION_TEXT)
                    gr.Markdown(
                        "## 🤗 Submit a model here:", elem_classes="markdown-text"
                    )
                    with gr.Column():
                        with gr.Column():
                            with gr.Accordion(
                                f"⏳ Evaluation Queue ({len(modelQueue)})",
                                open=False,
                            ):
                                with gr.Row():
                                    finished_eval_table = gr.components.Dataframe(
                                        value=pd.DataFrame(modelQueue, columns=['Type', 'Model', 'Reference Model']),
                                    )
                            with gr.Row():
                                model_name = gr.Textbox(label="Model name")
                                revision_name = gr.Textbox(
                                    label="revision", placeholder="main"
                                )
                            with gr.Row():
                                ref_model = gr.Dropdown(
                                    choices=[
                                        "mistralai/Mistral-7B-v0.1",
                                        "huggyllama/llama-7b",
                                        "NousResearch/Llama-2-7b-hf",
                                        "upstage/SOLAR-10.7B-v1.0",
                                    ],
                                    label="Reference Model",
                                    multiselect=False,
                                    value="mistralai/Mistral-7B-v0.1",
                                    interactive=True,
                                )
                                model_type = gr.Dropdown(
                                    choices=["🟢 base", "🔶 finetuned"],
                                    label="Model type",
                                    multiselect=False,
                                    value=None,
                                    interactive=True,
                                )
                            submit_button = gr.Button("Submit Eval")
                            submission_result = gr.Markdown()
                            submit_button.click(
                                add_new_eval,
                                inputs=[model_name, revision_name, ref_model, model_type],
                                outputs=[submission_result],
                            )
                            gr.Markdown(SUBMISSION_TEXT_2)

thread = Thread(target=worker_thread)
thread.start()

demo.launch(share=True)

# Some worries:
# 1. Am I testing things correctly in eval.py, following the template format?
# 2. Am I choosing the correct splits in run.py? The hierarchy I use is: test > val > train
#    (As in: if a test split exists, I go with that, then validation, then train.)
# 3. I decided to go with winogrande_debiased instead of winogrande_l arbitrarily.
#    (Not sure which one the Open LLM Leaderboard uses, or what the standard is.)
# 4. I'm unsure why in eval.py we append the output at the end of the input.
# 5. Currently I'm using huggyllama/llama-7b as ref_model; should I switch to Llama-2-7B? Maybe Mistral-7B?
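
# Sketch of the split priority described in worry 2 (illustrative assumption, not the actual logic in run.py):
# def pick_split(available_splits):
#     for split in ("test", "validation", "train"):
#         if split in available_splits:
#             return split
#     return None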