import gradio as gr
import subprocess
import os
import sys
import time
import pandas as pd
from threading import Thread
# Add the path to the "src" directory of detect-pretrain-code-contamination to the sys.path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "detect-pretrain-code-contamination"))
src_dir = os.path.join(project_root, "src")
sys.path.insert(0, src_dir)
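# With src_dir on sys.path, the "run" module imported below resolves to detect-pretrain-code-contamination/src/run.py.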
import run as evaluator # Import the run module
from src.css_html import custom_css
from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT, SUBMISSION_TEXT_2
from src.envs import API, H4_TOKEN, REPO_ID
from huggingface_hub import HfApi
from src.utils import (
AutoEvalColumn,
fields,
is_model_on_hub,
make_clickable_names,
styled_error,
styled_message,
)
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
# CONFIGURATION:
ref_model = "huggyllama/llama-7b"
test_datasets = ["truthful_qa","cais/mmlu","ai2_arc","gsm8k","Rowan/hellaswag","winogrande"]
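# These datasets mirror the benchmarks used by the Open LLM Leaderboard; evaluate() below reports one contamination score per dataset.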
modelQueue = []
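# Pending submissions as [model, model_type]; drained by the background worker_thread below.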
def restart_space(): # Workaround: restarting the Space is the only reliable way I've found to make Gradio reload the leaderboard.
    API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
def save_results(model, results, model_type):
    # Append one row to the CSV backing the leaderboard Dataframe, then restart the Space to refresh it.
    file_path = "data/code_eval_board.csv"
    with open(file_path, "a") as f:
        f.write(
            f"\n{model_type},{model},"
            + ",".join(str(results[k]) for k in ["arc", "hellaswag", "mmlu", "truthfulQA", "winogrande", "gsm8k"])
        )
    restart_space()
def run_test(model, ref_model, data):
    print(f"|| TESTING {data} ||")
    # Call the main function in detect-pretrain-code-contamination/src/run.py
    return evaluator.main(
        target_model=model,
        ref_model=ref_model,
        output_dir="out",
        data=data,
        length=64,
        key_name="input",
        ratio_gen=0.4,
    )
def evaluate(model, model_type):
    global ref_model
    print(f"|| EVALUATING {model} ||")
    results = {
        "arc": run_test(model, ref_model, test_datasets[2]),  # ai2_arc
        "hellaswag": run_test(model, ref_model, test_datasets[4]),  # Rowan/hellaswag
        "mmlu": run_test(model, ref_model, test_datasets[1]),  # cais/mmlu
        "truthfulQA": run_test(model, ref_model, test_datasets[0]),  # truthful_qa
        "winogrande": run_test(model, ref_model, test_datasets[5]),  # winogrande
        "gsm8k": run_test(model, ref_model, test_datasets[3]),  # gsm8k
        "ref_model": ref_model,
    }
    # Append the scores to data/code_eval_board.csv and restart the Space so the leaderboard refreshes.
    save_results(model, results, model_type)
    return "\n".join([f"{k}:{results[k]}" for k in results])
def worker_thread():
    global modelQueue
    while True:
        # Process submissions one at a time; pop after evaluating so we never
        # mutate the list while iterating over it.
        while modelQueue:
            model, model_type = modelQueue[0]
            evaluate(model, model_type.split(" ")[0])  # keep only the emoji marker (🟢 / 🔶)
            modelQueue.pop(0)
        time.sleep(1)
def queue(model, model_type):
    # Enqueue a submission for the background worker; results appear once the Space restarts.
    global modelQueue
    modelQueue.append([model, model_type])
    print(f"QUEUE:\n{modelQueue}")
### Adapted from bigcode/bigcode-models-leaderboard
def add_new_eval(
    model: str,
    revision: str,
    precision: str,
    model_type: str,
):
    if model_type is None or model_type == "":
        return styled_error("Please select a model type.")
    print(model_type)
    # Check that the model actually exists on the Hub before queuing the eval.
    if revision == "":
        revision = "main"
    model_on_hub, error = is_model_on_hub(model, revision)
    if not model_on_hub:
        return styled_error(f'Model "{model}" {error}')
    print("Adding new eval")
    queue(model, model_type)
    return styled_message("Your request has been submitted to the evaluation queue!\n")
def select_columns(df, columns):
always_here_cols = [
AutoEvalColumn.model_type_symbol.name,
AutoEvalColumn.model.name,
]
    # Iterate over COLS so the leaderboard keeps its original column ordering
filtered_df = df[
always_here_cols + [c for c in COLS if c in df.columns and c in columns]
]
return filtered_df
def filter_items(df, leaderboard_table, query):
if query == "All":
return df[leaderboard_table.columns]
else:
query = query[0] # take only the emoji character
filtered_df = df[(df["T"] == query)]
return filtered_df[leaderboard_table.columns]
def search_table(df, leaderboard_table, query):
filtered_df = df[(df["Models"].str.contains(query, case=False))]
return filtered_df[leaderboard_table.columns]
demo = gr.Blocks(css=custom_css)
with demo:
with gr.Row():
gr.Markdown(
"""<div style="text-align: center;"><h1> πŸ“„ LLM Contamination Detector </h1></div>\
<br>\
<p>Inspired from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">πŸ€— Open LLM Leaderboard</a> and <a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">πŸ€— Big Code Models Leaderboard ⭐</a>, we use an implementation of <a href="https://huggingface.co/papers/2310.16789">Detecting Pretraining Data from Large Language Models</a> paper found in <a href="https://github.com/swj0419/detect-pretrain-code-contamination/tree/master">this github repo</a>, to provide contamination scores for LLMs on the datasets used by Open LLM Leaderboard.\
This space should NOT be used to flag or accuse models of cheating / being contamined, instead, it should form part of a holistic assesment by the parties involved.</p>""",
elem_classes="markdown-text",
)
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.Column():
with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
                with gr.TabItem("🔍 Evaluations", id=0):
with gr.Column():
                        with gr.Accordion("➡️ See filters", open=False):
shown_columns = gr.CheckboxGroup(
choices=[
c
for c in COLS
if c
not in [
AutoEvalColumn.dummy.name,
AutoEvalColumn.model.name,
AutoEvalColumn.model_type_symbol.name,
]
],
value=[
c
for c in COLS_LITE
if c
not in [
AutoEvalColumn.dummy.name,
AutoEvalColumn.model.name,
AutoEvalColumn.model_type_symbol.name,
]
],
label="",
elem_id="column-select",
interactive=True,
)
# with gr.Column(min_width=780):
with gr.Row():
search_bar = gr.Textbox(
                                placeholder="🔍 Search for a model and press ENTER...",
show_label=False,
elem_id="search-bar",
)
filter_columns = gr.Radio(
label="⏚ Filter model types",
choices=["All", "🟒 Base", "πŸ”Ά Finetuned"],
value="All",
elem_id="filter-columns",
)
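                        # Leaderboard data; rows are appended by save_results() after each evaluation.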
df = pd.read_csv("data/code_eval_board.csv")
leaderboard_df = gr.components.Dataframe(
value=df[
[
AutoEvalColumn.model_type_symbol.name,
AutoEvalColumn.model.name,
]
+ shown_columns.value
],
headers=[
AutoEvalColumn.model_type_symbol.name,
AutoEvalColumn.model.name,
]
+ shown_columns.value,
datatype=TYPES,
elem_id="leaderboard-table",
interactive=False,
)
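                        # Hidden copy of the full dataframe, used as the source for the search/filter/column callbacks below.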
hidden_leaderboard_df = gr.components.Dataframe(
value=df,
headers=COLS,
datatype=["str" for _ in range(len(COLS))],
visible=False,
)
search_bar.submit(
search_table,
[hidden_leaderboard_df, leaderboard_df, search_bar],
leaderboard_df,
)
filter_columns.change(
filter_items,
[hidden_leaderboard_df, leaderboard_df, filter_columns],
leaderboard_df,
)
shown_columns.change(
select_columns,
[hidden_leaderboard_df, shown_columns],
leaderboard_df,
)
gr.Markdown(
"""
**Notes:**
- The Huggingface team is working on their own implementation of this paper as a space, I'll be leaving this space up until that's available.
- Some scores may not be entirely accurate according to the paper cited as I still work out the kinks and innacuracies of this implementation.
- For any issues, questions, or comments either open a discussion in this space's community tab or message me directly to my discord: yeyito777.
- Make sure to check the pinned discussion in this space's community tab for implementation details I'm not 100% about.
""",
elem_classes="markdown-text",
)
                with gr.TabItem("📝 About", id=2):
gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
                with gr.TabItem("🛠️ Submit models", id=3):
gr.Markdown(SUBMISSION_TEXT)
gr.Markdown(
"## πŸ“€ Submit a model here:", elem_classes="markdown-text"
)
with gr.Column():
with gr.Row():
model_name = gr.Textbox(label="Model name")
revision_name = gr.Textbox(
label="revision", placeholder="main"
)
with gr.Row():
precision = gr.Dropdown(
choices=[
"float16",
"bfloat16",
"8bit",
"4bit",
],
label="Precision",
multiselect=False,
value="float16",
interactive=True,
)
model_type = gr.Dropdown(
choices=["🟒 base", "πŸ”Ά instruction-tuned"],
label="Model type",
multiselect=False,
value=None,
interactive=True,
)
submit_button = gr.Button("Submit Eval")
submission_result = gr.Markdown()
submit_button.click(
add_new_eval,
inputs=[model_name, revision_name, precision, model_type],
outputs=[submission_result],
)
gr.Markdown(SUBMISSION_TEXT_2)
# Background worker that processes queued submissions; daemon so it won't block process exit.
thread = Thread(target=worker_thread, daemon=True)
thread.start()
demo.launch()
# Some worries:
# 1. Am I testing things correctly in eval.py, following the template format?
# 2. Am I choosing the correct splits in run.py? The hierarchy I use is: test > val > train
#    (As in: if a test split exists I go with that, then validation, then the default train split.) See the sketch below.
# 3. I decided to go with winogrande_debiased instead of winogrande_l arbitrarily.
#    (Not sure which one the Open LLM Leaderboard uses, or what the standard is.)
# 4. I'm unsure why in eval.py we append the output at the end of the input.
# 5. Currently I'm using huggyllama/llama-7b as ref_model, should I switch to llama2-7B? Maybe Mistral-7B?
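# A minimal sketch of the split-selection order described in (2), assuming a
# datasets.DatasetDict named `ds` (illustrative only, not used by this app):
#
#     def pick_split(ds):
#         # Prefer held-out data: test, then validation, then the default train split.
#         for split in ("test", "validation", "train"):
#             if split in ds:
#                 return ds[split]
#         raise ValueError("no usable split found")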