Yeyito committed on
Commit
b56563d
•
1 Parent(s): c13858c

Meant to do last commit on run.py not app.py

Files changed (1)
  1. app.py +307 -231
app.py CHANGED
@@ -1,235 +1,311 @@
- import logging
- logging.basicConfig(level='ERROR')
- import numpy as np
- from pathlib import Path
- import openai
- import torch
- import zlib
- import statistics
- from torch.utils.data import DataLoader
- from transformers import AutoTokenizer, AutoModelForCausalLM
- from tqdm import tqdm
- import math
- import numpy as np
- from datasets import load_dataset
- from options import Options
- from ipdb import set_trace as bp
- from eval import *
- from utils import evaluate_model
- from analyze import analyze_data
- import argparse
  import os
  import sys
- import gc
- import pickle
-
- models = {}
-
- def save_data(filename, data):
-     with open(filename, 'wb') as filehandle:
-         # store the data as binary data stream
-         pickle.dump(data, filehandle)
-
- def load_data(filename):
-     with open(filename, 'rb') as filehandle:
-         # read the data as binary data stream
-         loaded_data = pickle.load(filehandle)
-
-     return loaded_data
-
- def unload_model(model,tokenizer):
-     print("[X] Cannot unload model! Functionality not implemented!")
-
- def load_model(name1):
-     if name1 not in models:
-         model1 = AutoModelForCausalLM.from_pretrained(name1, return_dict=True, device_map='auto')
-         model1.eval()
-         tokenizer1 = AutoTokenizer.from_pretrained(name1)
-
-         tokenizer1.pad_token = tokenizer1.eos_token
-         models[name1] = model1
-         models[name1 + "_tokenizer"] = tokenizer1
-     return models[name1], models[name1 + "_tokenizer"]
-
- def calculatePerplexity(sentence, model, tokenizer, gpu):
-     """
-     exp(loss)
-     """
-     input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
-     input_ids = input_ids.to(gpu)
-     with torch.no_grad():
-         outputs = model(input_ids, labels=input_ids)
-     loss, logits = outputs[:2]
-
-     '''
-     extract logits:
-     '''
-     # Apply softmax to the logits to get probabilities
-     probabilities = torch.nn.functional.log_softmax(logits, dim=-1)
-     # probabilities = torch.nn.functional.softmax(logits, dim=-1)
-     all_prob = []
-     input_ids_processed = input_ids[0][1:]
-
-     for i, token_id in enumerate(input_ids_processed):
-         probability = probabilities[0, i, token_id].item()
-         all_prob.append(probability)
-     return torch.exp(loss).item(), all_prob, loss.item()
-
- def sample_generation(sentence, model, tokenizer, args,data_name):
-     half_sentence_index = math.ceil(len(sentence.split())*args['prefix_length'])
-
-     if half_sentence_index > 0:
-         prefix = " ".join(sentence.split()[:half_sentence_index])
-     else:
-         prefix = '<|startoftext|> '
-
-     input_ids = torch.tensor(tokenizer.encode(prefix)).unsqueeze(0)
-     input_ids = input_ids.to(model.device)
-
-     output = None
-     if data_name != "cais/mmlu" or data_name != "gsm8k":
-         output = model.generate(input_ids, max_new_tokens=len(sentence.split())-half_sentence_index, min_new_tokens=1, num_return_sequences=args['num_z'], pad_token_id=tokenizer.eos_token_id, **args['generate_args'])
      else:
-         output = model.generate(input_ids, max_new_tokens=(len(sentence.split())-half_sentence_index)/2, min_new_tokens=1, num_return_sequences=args['num_z'], pad_token_id=tokenizer.eos_token_id, **args['generate_args'])
-     # print(output)
-     complete_generated_text = tokenizer.batch_decode(output, skip_special_tokens=True)
-
-     return complete_generated_text
-
-
- def RMIA_1(text,target_loss,ref_loss,model1,tokenizer1,ratio_gen,neighbors_dl):
-     target_losses_z = evaluate_model(model1,tokenizer1,neighbors_dl)
-     result = torch.count_nonzero(target_losses_z < target_loss).item() / len(target_losses_z)
-     return result
-
- def get_neighbors(text,ref_loss,model2,tokenizer2,ratio_gen,data_name):
-     cur_args = {'prefix_length': ratio_gen, 'num_z': 100, 'generate_args': {'do_sample': True}}
-     neighbors = sample_generation(text, model2, tokenizer2, cur_args,data_name)
-     neighbors_dl = DataLoader(neighbors, batch_size=32, shuffle=False)
-     return neighbors_dl
-
- def evaluate_data(test_data, col_name, target_model, ref_model, ratio_gen, data_name):
-     global model1,model2,tokenizer1,tokenizer2
-     print(f"all data size: {len(test_data)}")
-     random.seed(0)
-     random.shuffle(test_data)
-     test_data = test_data[:100]
-
-     inference2_pass = None
-     neighbors_dls = None
-     ref_model_clean = ref_model.replace("/","-")
-     data_name_clean = data_name.replace("/","-")
-     os.makedirs(os.path.join(f"saves/{ref_model_clean}",f"{data_name_clean}"),exist_ok=True)
-     try:
-         inference2_pass = load_data(f'saves/{ref_model_clean}/{data_name_clean}/inference2_pass.txt')
-         neighbors_dls = load_data(f'saves/{ref_model_clean}/{data_name_clean}/neighbors_dls.txt')
-     except:
-         ### MODEL 2 likelihoods
-         model2, tokenizer2 = load_model(ref_model)
-         inference2_pass = [] #0: p_ref, #1: all_prob_ref, #2: p_ref_likelihood
-         for ex in tqdm(test_data):
-             text = ex[col_name]
-             new_ex = inference_model2(model2, tokenizer2, text)
-             inference2_pass.append(new_ex)
-         # Invariant. Doesn't take in model1 so I'm good
-
-         ### Neighbors:
-         neighbors_dls = []
-         counter = 0
-         for ex in tqdm(test_data):
-             text = ex[col_name]
-             new_ex = get_neighbors(text,inference2_pass[counter][2],model2,tokenizer2,ratio_gen,data_name)
-             counter = counter + 1
-             neighbors_dls.append(new_ex)
-         unload_model(model2,tokenizer2)
-         # Because it uses temp it is not invariant, however taking a snapshot in time should be just fine.
-         save_data(f'saves/{ref_model_clean}/{data_name_clean}/inference2_pass.txt',inference2_pass)
-         save_data(f'saves/{ref_model_clean}/{data_name_clean}/neighbors_dls.txt',neighbors_dls)
-         print("Saved ref data, exiting.")
-
-     ### MODEL 1 likelihoods
-     model1, tokenizer1 = load_model(target_model)
-     inference1_pass = [] #0: p1, #1: all_prob, #2: p1_likelihood, #3: p_lower, #4: p_lower_likelihood
-     for ex in tqdm(test_data):
-         text = ex[col_name]
-         new_ex = inference_model1(model1,tokenizer1,text)
-         inference1_pass.append(new_ex)
-
-     ### RIMA results
-     model1, tokenizer1 = load_model(target_model)
-     counter = 0
-     results = []
-     for ex in tqdm(test_data):
-         text = ex[col_name]
-         new_ex = RMIA_1(text,inference1_pass[counter][2],inference2_pass[counter][2],model1,tokenizer1,ratio_gen,neighbors_dls[counter])
-         counter = counter + 1
-         results.append(new_ex)
-     unload_model(model1,tokenizer1)
-
-     ### Inference ex
-     all_output = []
-     counter = 0
-     for ex in tqdm(test_data):
-         text = ex[col_name]
-         pred = {}
-         pred["minkprob_w/_ref"] = results[counter]
-         pred["ppl"] = inference1_pass[counter][0]
-         pred["ppl/Ref_ppl (calibrate PPL to the reference model)"] = inference1_pass[counter][2]-inference2_pass[counter][2]
-         pred["ppl/lowercase_ppl"] = -(np.log(inference1_pass[counter][3]) / np.log(inference1_pass[counter][0])).item()
-         zlib_entropy = len(zlib.compress(bytes(text, 'utf-8')))
-         pred["ppl/zlib"] = np.log(inference1_pass[counter][0])/zlib_entropy
-         ex["pred"] = pred
-         counter = counter + 1
-         all_output.append(ex)
-     return all_output
-
- def inference_model1 (model1, tokenizer1, text):
-     p1, all_prob, p1_likelihood = calculatePerplexity(text, model1, tokenizer1, gpu=model1.device)
-     p_lower, _, p_lower_likelihood = calculatePerplexity(text.lower(), model1, tokenizer1, gpu=model1.device)
-     return [p1, all_prob, p1_likelihood, p_lower, p_lower_likelihood]
-
- def inference_model2 (model2, tokenizer2, text):
-     p_ref, all_prob_ref, p_ref_likelihood = calculatePerplexity(text, model2, tokenizer2, gpu=model2.device)
-     return [p_ref,all_prob_ref,p_ref_likelihood]
-
- def main(target_model,ref_model,output_dir,data,length,key_name,ratio_gen):
-     output_dir = f"{output_dir}/{target_model}_{ref_model}/{key_name}"
-     Path(output_dir).mkdir(parents=True, exist_ok=True)
-     # load model and data
-     data_name = data
-     if "jsonl" in data:
-         data = load_jsonl(f"{data}")
-     elif data == "truthful_qa":
-         # bp()
-         dataset = load_dataset(data, "multiple_choice", split="validation")
-         data = convert_huggingface_data_to_list_dic(dataset)
-         data = process_truthful_qa(data)
-     elif data == "cais/mmlu":
-         dataset = load_dataset(data, "all", split="test")
-         data = convert_huggingface_data_to_list_dic(dataset)
-         data = process_mmlu(data)
-     elif data == "ai2_arc":
-         dataset = load_dataset(data, "ARC-Challenge", split="test")
-         data = convert_huggingface_data_to_list_dic(dataset)
-         data = process_arc(data)
-     elif data == "gsm8k":
-         dataset = load_dataset(data, "main", split="test")
-         data = convert_huggingface_data_to_list_dic(dataset)
-         data = process_gsm8k(data)
-     elif data == "Rowan/hellaswag":
-         dataset = load_dataset(data, "default", split="validation")
-         # We use validation since labels for the test set are not available?
-         data = convert_huggingface_data_to_list_dic(dataset)
-         data = process_hellaswag(data)
-     elif data == "winogrande":
-         dataset = load_dataset(data,"winogrande_debiased", split="validation")
-         data = convert_huggingface_data_to_list_dic(dataset)
-         data = process_winogrande(data)
-
-     #model1, model2, tokenizer1, tokenizer2 = load_model(target_model, ref_model)
-
-     all_output = evaluate_data(data,key_name, target_model, ref_model,ratio_gen,data_name)
-     dump_jsonl(all_output, f"{output_dir}/all_output.jsonl")
-     return analyze_data(all_output)
-     # fig_fpr_tpr(all_output, output_dir)
+ import gradio as gr
+ import subprocess
  import os
  import sys
+ import time
+ import pandas as pd
+ from threading import Thread
+
+ # Add the path to the "src" directory of detect-pretrain-code-contamination to the sys.path
+ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "detect-pretrain-code-contamination"))
+ src_dir = os.path.join(project_root, "src")
+ sys.path.insert(0, src_dir)
+
+ import run as evaluator # Import the run module
+ from src.css_html import custom_css
+ from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT, SUBMISSION_TEXT_2
+ from src.envs import API, H4_TOKEN, REPO_ID
+ from huggingface_hub import HfApi
+ from src.utils import (
+     AutoEvalColumn,
+     fields,
+     is_model_on_hub,
+     make_clickable_names,
+     styled_error,
+     styled_message,
+ )
+
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
+ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+
+ # CONFIGURATION:
+ ref_model = "huggyllama/llama-7b"
+ test_datasets = ["truthful_qa","cais/mmlu","ai2_arc","gsm8k","Rowan/hellaswag","winogrande"]
+ modelQueue = []
+
+ def restart_space(): # Dumbest update function to ever exist; I'm sobbing in tears as I've tried to make gradio update the leaderboard literally any other way.
+     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
+
+
+ def save_to_txt(model, results, model_type):
+     file_path = "data/code_eval_board.csv"
+
+     with open(file_path, "a") as f:
+         f.write(f"\n{model_type},{model}," + str(results["arc"]) + "," + str(results["hellaswag"]) + "," + str(results["mmlu"]) + "," + str(results["truthfulQA"]) + "," + str(results["winogrande"]) + "," + str(results["gsm8k"]))
+         f.close()
+
+     restart_space()
+
+ def run_test(model,ref_model,data):
+     print(f"|| TESTING {data} ||")
+     return evaluator.main(
+         target_model=f"{model}",
+         ref_model=f"{ref_model}",
+         output_dir="out",
+         data=f"{data}",
+         length=64,
+         key_name="input",
+         ratio_gen=0.4
+     ) # Call the main function in detect-pretrain-code-contamination/src/run.py
+
+ def evaluate(model,model_type):
+     global ref_model
+     print(f"|| EVALUATING {model} ||")
+     results = {
+         "arc": run_test(model, ref_model, test_datasets[2]),
+         "hellaswag": run_test(model, ref_model, test_datasets[4]),
+         "mmlu": run_test(model, ref_model, test_datasets[1]),
+         "truthfulQA": run_test(model, ref_model, test_datasets[0]),
+         "winogrande": run_test(model, ref_model, test_datasets[5]),
+         "gsm8k": run_test(model, ref_model, test_datasets[3]),
+         "ref_model": ref_model,
+     }
+
+     # Save to .txt file in /Evaluations/{model}
+     save_to_txt(model, results, model_type)
+     return "\n".join([f"{k}:{results[k]}" for k in results])
+
+ def worker_thread():
+     global modelQueue, server
+     while True:
+         for submission in modelQueue:
+             evaluate(submission[0],submission[1].split(" ")[0])
+             modelQueue.pop(modelQueue.index(submission))
+             time.sleep(1)
+         time.sleep(1)
+
+ def queue(model,model_type):
+     global modelQueue
+     modelQueue.append([model,model_type])
+     print(f"QUEUE:\n{modelQueue}")
+
+
+ ### bigcode/bigcode-models-leaderboard
+ def add_new_eval(
+     model: str,
+     revision: str,
+     precision: str,
+     model_type: str,
+ ):
+     precision = precision
+
+     if model_type is None or model_type == "" or model_type == []:
+         return styled_error("Please select a model type.")
+     print(model_type)
+     # check the model actually exists before adding the eval
+     if revision == "":
+         revision = "main"
+
+     model_on_hub, error = is_model_on_hub(model, revision)
+     if not model_on_hub:
+         return styled_error(f'Model "{model}" {error}')
+
+     print("Adding new eval")
+     queue(model,model_type)
+     return styled_message("Your request has been submitted to the evaluation queue!\n")
+
+ def select_columns(df, columns):
+     always_here_cols = [
+         AutoEvalColumn.model_type_symbol.name,
+         AutoEvalColumn.model.name,
+     ]
+     # We use COLS to maintain sorting
+     filtered_df = df[
+         always_here_cols + [c for c in COLS if c in df.columns and c in columns]
+     ]
+     return filtered_df
+
+
+ def filter_items(df, leaderboard_table, query):
+     if query == "All":
+         return df[leaderboard_table.columns]
      else:
+         query = query[0] # take only the emoji character
+     filtered_df = df[(df["T"] == query)]
+     return filtered_df[leaderboard_table.columns]
+
+ def search_table(df, leaderboard_table, query):
+     filtered_df = df[(df["Models"].str.contains(query, case=False))]
+     return filtered_df[leaderboard_table.columns]
+
+ demo = gr.Blocks(css=custom_css)
+ with demo:
+     with gr.Row():
+         gr.Markdown(
+             """<div style="text-align: center;"><h1> 📄 LLM Contamination Detector </h1></div>\
+             <br>\
+             <p>Inspired by the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a> and the <a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">🤗 Big Code Models Leaderboard</a>, we use the implementation of the <a href="https://huggingface.co/papers/2310.16789">Detecting Pretraining Data from Large Language Models</a> paper found in <a href="https://github.com/swj0419/detect-pretrain-code-contamination/tree/master">this GitHub repo</a> to provide contamination scores for LLMs on the datasets used by the Open LLM Leaderboard.\
+             This space should NOT be used to flag or accuse models of cheating / being contaminated; instead, it should form part of a holistic assessment by the parties involved.</p>""",
+             elem_classes="markdown-text",
+         )
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.Column():
+             with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
+                 with gr.TabItem("🔍 Evaluations", id=0):
+                     with gr.Column():
+                         with gr.Accordion("➡️ See filters", open=False):
+                             shown_columns = gr.CheckboxGroup(
+                                 choices=[
+                                     c
+                                     for c in COLS
+                                     if c
+                                     not in [
+                                         AutoEvalColumn.dummy.name,
+                                         AutoEvalColumn.model.name,
+                                         AutoEvalColumn.model_type_symbol.name,
+                                     ]
+                                 ],
+                                 value=[
+                                     c
+                                     for c in COLS_LITE
+                                     if c
+                                     not in [
+                                         AutoEvalColumn.dummy.name,
+                                         AutoEvalColumn.model.name,
+                                         AutoEvalColumn.model_type_symbol.name,
+                                     ]
+                                 ],
+                                 label="",
+                                 elem_id="column-select",
+                                 interactive=True,
+                             )
+                         # with gr.Column(min_width=780):
+                         with gr.Row():
+                             search_bar = gr.Textbox(
+                                 placeholder="🔍 Search for a model and press ENTER...",
+                                 show_label=False,
+                                 elem_id="search-bar",
+                             )
+                             filter_columns = gr.Radio(
+                                 label="⏚ Filter model types",
+                                 choices=["All", "🟢 Base", "🔶 Finetuned"],
+                                 value="All",
+                                 elem_id="filter-columns",
+                             )
+
+                         df = pd.read_csv("data/code_eval_board.csv")
+                         leaderboard_df = gr.components.Dataframe(
+                             value=df[
+                                 [
+                                     AutoEvalColumn.model_type_symbol.name,
+                                     AutoEvalColumn.model.name,
+                                 ]
+                                 + shown_columns.value
+                             ],
+                             headers=[
+                                 AutoEvalColumn.model_type_symbol.name,
+                                 AutoEvalColumn.model.name,
+                             ]
+                             + shown_columns.value,
+                             datatype=TYPES,
+                             elem_id="leaderboard-table",
+                             interactive=False,
+                         )
+
+                         hidden_leaderboard_df = gr.components.Dataframe(
+                             value=df,
+                             headers=COLS,
+                             datatype=["str" for _ in range(len(COLS))],
+                             visible=False,
+                         )
+
+                         search_bar.submit(
+                             search_table,
+                             [hidden_leaderboard_df, leaderboard_df, search_bar],
+                             leaderboard_df,
+                         )
+
+                         filter_columns.change(
+                             filter_items,
+                             [hidden_leaderboard_df, leaderboard_df, filter_columns],
+                             leaderboard_df,
+                         )
+
+                         shown_columns.change(
+                             select_columns,
+                             [hidden_leaderboard_df, shown_columns],
+                             leaderboard_df,
+                         )
+
+                         gr.Markdown(
+                             """
+                             **Notes:**
+                             - The Hugging Face team is working on their own implementation of this paper as a space; I'll be leaving this space up until that's available.
+                             - Some scores may not be entirely accurate according to the paper cited while I work out the kinks and inaccuracies of this implementation.
+                             - For any issues, questions, or comments, either open a discussion in this space's community tab or message me directly on Discord: yeyito777.
+                             - Make sure to check the pinned discussion in this space's community tab for implementation details I'm not 100% sure about.
+                             """,
+                             elem_classes="markdown-text",
+                         )
+
+                 with gr.TabItem("📝 About", id=2):
+                     gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
+                 with gr.TabItem("🛠️ Submit models", id=3):
+                     gr.Markdown(SUBMISSION_TEXT)
+                     gr.Markdown(
+                         "## 📤 Submit a model here:", elem_classes="markdown-text"
+                     )
+                     with gr.Column():
+                         with gr.Row():
+                             model_name = gr.Textbox(label="Model name")
+                             revision_name = gr.Textbox(
+                                 label="revision", placeholder="main"
+                             )
+                         with gr.Row():
+                             precision = gr.Dropdown(
+                                 choices=[
+                                     "float16",
+                                     "bfloat16",
+                                     "8bit",
+                                     "4bit",
+                                 ],
+                                 label="Precision",
+                                 multiselect=False,
+                                 value="float16",
+                                 interactive=True,
+                             )
+                             model_type = gr.Dropdown(
+                                 choices=["🟢 base", "🔶 instruction-tuned"],
+                                 label="Model type",
+                                 multiselect=False,
+                                 value=None,
+                                 interactive=True,
+                             )
+                         submit_button = gr.Button("Submit Eval")
+                         submission_result = gr.Markdown()
+                         submit_button.click(
+                             add_new_eval,
+                             inputs=[model_name, revision_name, precision, model_type],
+                             outputs=[submission_result],
+                         )
+                     gr.Markdown(SUBMISSION_TEXT_2)
+
+ thread = Thread(target=worker_thread)
+ thread.start()
+ demo.launch(share=True)
+
+ # Some worries:
+ # 1. Am I testing things correctly in eval.py, following the template format?
+
+ # 2. Am I choosing the correct splits in run.py? The hierarchy I use is: test > val > train
+ # (As in: if test exists, I go with that, then validation, then the default split. See the sketch after these notes.)
+
+ # 3. I decided to go with winogrande_debiased instead of winogrande_l arbitrarily.
+ # (Not sure which one the Open LLM Leaderboard uses, or what the standard is.)
+
+ # 4. I'm unsure why in eval.py we append the output at the end of the input.

+ # 5. Currently I'm using huggyllama/llama-7b as ref_model. Should I switch to llama2-7B? Maybe Mistral-7B?
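
A minimal sketch of the split preference described in worry #2 (prefer test, then validation, then train). This is an illustration only, not the actual logic in run.py; the helper name pick_split and the use of datasets.load_dataset to inspect the available splits are assumptions.

# Hypothetical sketch of the "test > validation > train" preference from worry #2.
# Not the actual run.py code; pick_split is an assumed helper name.
from typing import Optional

from datasets import load_dataset


def pick_split(dataset_name: str, config_name: Optional[str] = None) -> str:
    """Return the preferred split name, checking test, then validation, then train."""
    splits = load_dataset(dataset_name, config_name)  # DatasetDict keyed by split name
    for preferred in ("test", "validation", "train"):
        if preferred in splits:
            return preferred
    return next(iter(splits))  # fall back to whatever split the dataset exposes


# Example (assumed config): pick_split("gsm8k", "main") would return "test".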