Initial commit
- README.md +9 -4
- app.py +181 -0
- contamination_report.csv +4 -0
- dataset.py +260 -0
- markdown.py +69 -0
- requirements.txt +7 -0
- utils.py +181 -0
README.md
CHANGED
@@ -1,13 +1,18 @@
 ---
-title: Data Contamination Report
-emoji:
+title: 🚨 Data Contamination Report
+emoji: 🚨
 colorFrom: green
 colorTo: blue
 sdk: gradio
-
+python_version: 3.10
+sdk_version: 4.19.1
 app_file: app.py
+app_port: 7860
+fullWidth: true
 pinned: false
-license:
+license: mit
+suggested_hardware: cpu-upgrade
+
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,181 @@
import gradio as gr
import pandas as pd

from dataset import get_dataframe
from markdown import GUIDELINES, PANEL_MARKDOWN


df = get_dataframe()


def filter_dataframe(dataframe, eval_dataset, cont_source, checkboxes):
    """
    Filter the dataframe based on the provided evaluation dataset, contaminated source, and checkboxes.

    Args:
        dataframe (pandas.DataFrame): The input dataframe to filter.
        eval_dataset (str): The evaluation dataset to filter by.
        cont_source (str): The contaminated source to filter by.
        checkboxes (list): The checkboxes to filter by.

    Returns:
        pandas.DataFrame: The filtered dataframe.
    """
    if isinstance(eval_dataset, str):
        dataframe = dataframe[
            dataframe["Evaluation Dataset"].str.contains(eval_dataset)
        ]
    if isinstance(cont_source, str):
        dataframe = dataframe[
            dataframe["Contaminated Source"].str.contains(cont_source)
        ]
    if isinstance(checkboxes, list) and "Exclude model-based evidences" in checkboxes:
        dataframe = dataframe[dataframe["Approach"] != "model-based"]
    if isinstance(checkboxes, list) and "Show only contaminated" in checkboxes:
        dataframe = dataframe[
            (dataframe["Train Split"] > 0.0)
            | (dataframe["Development Split"] > 0.0)
            | (dataframe["Test Split"] > 0.0)
        ]

    return dataframe


def filter_dataframe_corpus(*args, **kwargs) -> pd.DataFrame:
    """
    Filter the dataframe for corpus contamination.

    Returns:
        pandas.DataFrame: The filtered dataframe for corpus contamination.
    """
    # Keep rows in which the column "Model or corpus" is equal to "corpus"
    filtered_df = df[df["Model or corpus"] == "corpus"]
    filtered_df = filtered_df.drop(columns=["Model or corpus"])
    return filter_dataframe(filtered_df, *args, **kwargs)


def filter_dataframe_model(*args, **kwargs) -> pd.DataFrame:
    """
    Filter the dataframe for model contamination.

    Returns:
        pandas.DataFrame: The filtered dataframe for model contamination.
    """
    # Keep rows in which the column "Model or corpus" is equal to "model"
    filtered_df = df[df["Model or corpus"] == "model"]
    filtered_df = filtered_df.drop(columns=["Model or corpus"])
    return filter_dataframe(filtered_df, *args, **kwargs)


theme = gr.themes.Soft(
    primary_hue="emerald",
    secondary_hue="red",
    text_size="sm",
    spacing_size="sm",
    font=[
        gr.themes.GoogleFont("Poppins"),
        gr.themes.GoogleFont("Poppins"),
        gr.themes.GoogleFont("Poppins"),
        gr.themes.GoogleFont("Poppins"),
    ],
).set(block_background_fill="*neutral_50", block_background_fill_dark="*neutral_950")


with gr.Blocks(
    theme=theme,
    title="🚨 Data Contamination Report",
    analytics_enabled=False,
) as demo:
    gr.Markdown(PANEL_MARKDOWN)
    with gr.Tab("Corpus contamination") as tab_corpus:
        with gr.Row(variant="compact"):
            with gr.Column():
                eval_dataset_corpus = gr.Textbox(
                    placeholder="Evaluation dataset",
                    label="Evaluation dataset",
                    value="",
                )
                cont_corpora = gr.Textbox(
                    placeholder="Pre-training corpora",
                    label="Pre-training corpora",
                    value="",
                )
            with gr.Column():
                checkboxes_corpus = gr.CheckboxGroup(
                    ["Exclude model-based evidences", "Show only contaminated"],
                    label="Search options",
                    value=[],
                )

        filter_corpus_btn = gr.Button("Filter")

        corpus_dataframe = gr.DataFrame(
            value=filter_dataframe_corpus(
                eval_dataset_corpus, cont_corpora, checkboxes_corpus
            ).style.format(precision=2),
            headers=df.columns.to_list(),
            datatype=[
                "markdown",
                "markdown",
                "number",
                "number",
                "number",
                "str",
                "markdown",
                "markdown",
            ],
        )

    with gr.Tab("Model contamination") as tab_model:
        with gr.Row(variant="compact"):
            with gr.Column():
                eval_dataset_model = gr.Textbox(
                    placeholder="Evaluation dataset",
                    label="Evaluation dataset",
                    value="",
                )
                cont_model = gr.Textbox(
                    placeholder="Model", label="Model", value=""
                )
            with gr.Column():
                checkboxes_model = gr.CheckboxGroup(
                    ["Exclude model-based evidences", "Show only contaminated"],
                    label="Search options",
                    value=[],
                )

        filter_model_btn = gr.Button("Filter")

        model_dataframe = gr.DataFrame(
            value=filter_dataframe_model(
                eval_dataset_model, cont_model, checkboxes_model
            ),
            headers=df.columns.to_list(),
            datatype=[
                "markdown",
                "markdown",
                "number",
                "number",
                "number",
                "str",
                "markdown",
                "markdown",
            ],
        )

    filter_corpus_btn.click(
        filter_dataframe_corpus,
        inputs=[eval_dataset_corpus, cont_corpora, checkboxes_corpus],
        outputs=corpus_dataframe,
    )
    filter_model_btn.click(
        filter_dataframe_model,
        inputs=[eval_dataset_model, cont_model, checkboxes_model],
        outputs=model_dataframe,
    )

    with gr.Tab("Guidelines") as tab_guidelines:
        gr.Markdown(GUIDELINES)


demo.launch()
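A note on the filtering above: `filter_dataframe` only applies the text and checkbox filters when it receives plain Python values, so passing the Gradio component objects at build time simply yields the unfiltered initial tables. As a minimal standalone sketch (not part of this commit, assuming it is run from the repository root next to contamination_report.csv), the "Show only contaminated" option corresponds to:

```python
# Sketch: reproduce the "Show only contaminated" filter from app.py on the raw CSV.
import pandas as pd

data = pd.read_csv("contamination_report.csv", delimiter=";", header=0)
contaminated = data[
    (data["Train Split"] > 0.0)
    | (data["Development Split"] > 0.0)
    | (data["Test Split"] > 0.0)
]
print(contaminated[["Evaluation Dataset", "Contaminated Source", "Approach"]])
```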
contamination_report.csv
ADDED
@@ -0,0 +1,4 @@
Evaluation Dataset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Citation;PR Link
conll2003;google/gemma-7b;model;1.0;1.0;1.0;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;
conll2003;EleutherAI/the_pile_deduplicated;corpus;1.0;1.0;1.0;data-based;https://aclanthology.org/2023.findings-emnlp.722/;www.google.com
Test;lololol;corpus;1.0;1.0;1.0;data-based;https://arxiv.org/abs/2310.03668;
dataset.py
ADDED
@@ -0,0 +1,260 @@
import json
import os

import filelock
import huggingface_hub
import pandas as pd

from utils import (
    build_datasets_urls,
    build_models_urls,
    build_text_icon,
    download_favicons,
    get_base_url,
    get_domain_name,
)


HF_ICON = "https://huggingface.co/front/assets/huggingface_logo.svg"
CROSS_ICON = "https://upload.wikimedia.org/wikipedia/commons/4/4e/Cross.png"

DISABLE_ONLINE_CACHE = False
ONLINE_CACHE = "CONDA-Workshop/RequestCache"


def save_cache(cache_data, cache_file, initial_timestamp):
    print(f"Saving cache to {cache_file}")
    # Acquire lock before reading and updating the file to prevent race conditions
    with filelock.FileLock(f"{cache_file}.lock"):
        # Check if the file has been modified since the initial read
        current_timestamp = (
            os.path.getmtime(cache_file) if os.path.exists(cache_file) else None
        )
        if current_timestamp is None or initial_timestamp != current_timestamp:
            # File has been modified or created since initial read, re-read the file
            try:
                with open(cache_file, "r", encoding="utf8") as f:
                    # Update the dictionary with newly added entries
                    cache_dict = json.load(f)
                    # Test if cache_dict and cache_data are different
                    if cache_dict != cache_data:
                        cache_data.update(cache_dict)

            except FileNotFoundError:
                pass  # If the file doesn't exist at this point, continue with the current dictionary

        # Write the updated dictionary back to the file
        with open(cache_file, "w", encoding="utf8") as f:
            json.dump(cache_data, f, ensure_ascii=False, indent=4)

        if not DISABLE_ONLINE_CACHE:
            try:
                huggingface_hub.upload_file(
                    repo_id=ONLINE_CACHE,
                    repo_type="dataset",
                    token=os.environ.get("TOKEN") or True,
                    path_in_repo=cache_file,
                    path_or_fileobj=cache_file,
                )
            except Exception as e:
                print(f"Unable to upload {cache_file}: {e}")

    return cache_data


def update_favicon_cache(sources):
    # Load the favicon dictionary if it exists
    favicon_dict = {}
    favicon_file_path = "favicons.json"
    initial_timestamp = None

    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=favicon_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download favicons.json: {e}")

    # Attempt to load the favicon dictionary and record its last modification time
    if os.path.exists(favicon_file_path):
        initial_timestamp = os.path.getmtime(favicon_file_path)
        try:
            with open(favicon_file_path, "r", encoding="utf8") as f:
                favicon_dict = json.load(f)
        except FileNotFoundError:
            pass  # File not found, proceed with an empty dictionary

    # Determine which favicons need to be downloaded
    missing_domains = [domain for domain in sources if domain not in favicon_dict]

    # Download missing favicons in batch
    if missing_domains:
        new_favicon_urls = download_favicons(missing_domains)
        favicon_dict.update(new_favicon_urls)
        favicon_dict = save_cache(
            cache_data=favicon_dict,
            cache_file=favicon_file_path,
            initial_timestamp=initial_timestamp,
        )

    return favicon_dict


def update_model_url_cache(models):
    models = [x for x in models if x is not None]
    models = list(set(models))

    # Load the model url dictionary if it exists
    model_url_dict = {}
    model_url_file_path = "model_urls.json"
    initial_timestamp = None

    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=model_url_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download model_urls.json: {e}")

    # Attempt to load the model url dictionary and record its last modification time
    if os.path.exists(model_url_file_path):
        initial_timestamp = os.path.getmtime(model_url_file_path)
        try:
            with open(model_url_file_path, "r", encoding="utf8") as f:
                model_url_dict = json.load(f)
        except FileNotFoundError:
            pass  # File not found, proceed with an empty dictionary

    # Determine which model urls need to be downloaded
    missing_model_urls = [model for model in models if model not in model_url_dict]

    # Download missing model urls in batch
    if missing_model_urls:
        new_model_urls = build_models_urls(missing_model_urls)
        model_url_dict.update(new_model_urls)
        model_url_dict = save_cache(
            cache_data=model_url_dict,
            cache_file=model_url_file_path,
            initial_timestamp=initial_timestamp,
        )

    return model_url_dict


def update_dataset_url_cache(datasets):
    datasets = [x for x in datasets if x is not None]
    datasets = list(set(datasets))

    # Load the dataset url dictionary if it exists
    dataset_url_dict = {}
    dataset_url_file_path = "dataset_urls.json"
    initial_timestamp = None

    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=dataset_url_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download dataset_urls.json: {e}")

    # Attempt to load the dataset url dictionary and record its last modification time
    if os.path.exists(dataset_url_file_path):
        initial_timestamp = os.path.getmtime(dataset_url_file_path)
        try:
            with open(dataset_url_file_path, "r", encoding="utf8") as f:
                dataset_url_dict = json.load(f)
        except FileNotFoundError:
            pass  # File not found, proceed with an empty dictionary

    # Determine which dataset urls need to be downloaded
    missing_dataset_urls = [
        dataset for dataset in datasets if dataset not in dataset_url_dict
    ]

    # Download missing dataset urls in batch
    if missing_dataset_urls:
        new_dataset_urls = build_datasets_urls(missing_dataset_urls)
        dataset_url_dict.update(new_dataset_urls)
        dataset_url_dict = save_cache(
            cache_data=dataset_url_dict,
            cache_file=dataset_url_file_path,
            initial_timestamp=initial_timestamp,
        )

    return dataset_url_dict


def get_dataframe():
    # Load the contamination_report.csv file
    data = pd.read_csv("contamination_report.csv", delimiter=";", header=0)

    # Load the favicon dictionary if it exists
    favicon_dict = {}

    # Update the favicon dictionary
    favicon_dict = update_favicon_cache([get_base_url(x) for x in data["Citation"]])

    # Update the model url dictionary
    model_url_dict = update_model_url_cache(
        data[data["Model or corpus"] == "model"]["Contaminated Source"]
    )

    # Update the dataset url dictionary
    dataset_url_dict = update_dataset_url_cache(
        list(data["Evaluation Dataset"])
        + list(data[data["Model or corpus"] == "corpus"]["Contaminated Source"])
    )

    # Add favicons URLs to the dataframe in a vectorized manner
    data["Citation"] = data["Citation"].apply(
        lambda x: build_text_icon(
            text=get_domain_name(x),
            url=x,
            icon_url=favicon_dict.get(get_base_url(x), ""),
        )
    )

    data["PR Link"] = data["PR Link"].apply(
        lambda x: build_text_icon(
            text="",
            url=x if x == x else "no link",
            icon_url=HF_ICON if x == x else CROSS_ICON,
        )
    )

    data["Evaluation Dataset"] = data["Evaluation Dataset"].apply(
        lambda x: build_text_icon(
            text=x,
            url=dataset_url_dict.get(x, ""),
            icon_url=HF_ICON,
        )
    )

    # For "Contaminated Source" use build_dataset_url if "Model or corpus" is "corpus" and build_model_url if "Model or corpus" is "model"
    data["Contaminated Source"] = data.apply(
        lambda x: build_text_icon(
            text=x["Contaminated Source"],
            url=dataset_url_dict.get(x["Contaminated Source"], "")
            if x["Model or corpus"] == "corpus"
            else model_url_dict.get(x["Contaminated Source"], ""),
            icon_url=HF_ICON,
        ),
        axis=1,
    )

    return data
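The cache helpers above merge concurrent writes by locking the JSON file and re-reading it when its modification time has changed since the initial load. A minimal local sketch (not part of this commit, assuming dataset.py is importable from the repository root): without write access to the CONDA-Workshop/RequestCache dataset, the Hub upload inside `save_cache` fails and is only printed, while the JSON file is still written locally.

```python
# Sketch: exercise save_cache locally with a toy favicon entry.
from dataset import save_cache

cache = save_cache(
    cache_data={"https://example.org": "https://example.org/favicon.ico"},
    cache_file="favicons.json",
    initial_timestamp=None,  # forces a re-read/merge if favicons.json already exists
)
print(cache)
```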
markdown.py
ADDED
@@ -0,0 +1,69 @@
GUIDELINES = """
# Contribution Guidelines

The 🚨 Data Contamination Report is a community-driven project and we welcome contributions from everyone. The objective of this project is to provide a comprehensive list of data contamination cases, for both models and datasets. We aim to provide a tool for the community for avoiding evaluating
models on contaminated datasets. We also expect to generate a dataset that will help researchers
to develop algorithms to automatically detect contaminated datasets in the future.

If you wish to contribute to the project by reporting a data contamination case, please open a pull request
in the [Community Tab](https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Report/discussions). Your pull request should edit the [contamination_report.csv](https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Report/blob/main/contamination_report.csv)
file and add a new row with the details of the contamination case. Please fill in the following template with the details of the contamination case. ***Pull Requests that do not follow the template won't be accepted.***

# Template for reporting data contamination

```markdown
## What are you reporting:
- [ ] Evaluation dataset(s) found in a pre-training corpus. (e.g. COPA found in ThePile)
- [ ] Evaluation dataset(s) found in a pre-trained model. (e.g. FLAN T5 has been trained on ANLI)

**Evaluation dataset(s)**: Name(s) of the evaluation dataset(s). If available in the HuggingFace Hub please write the path (e.g. `uonlp/CulturaX`), otherwise provide a link to a paper, GitHub or dataset-card.

**Contaminated model(s)**: Name of the model(s) (if any) that have been contaminated with the evaluation dataset. If available in the HuggingFace Hub please list the corresponding paths (e.g. `allenai/OLMo-7B`).

**Contaminated corpora**: Name of the corpora used to pretrain models (if any) that have been contaminated with the evaluation dataset. If available in the HuggingFace hub please write the path (e.g. `CohereForAI/aya_dataset`)

**Contaminated split(s)**: If the dataset has Train, Development and/or Test splits please report the contaminated split(s). You can report a percentage of the dataset contaminated.


## Briefly describe your method to detect data contamination

- [ ] Data-based approach
- [ ] Model-based approach

Description of your method, 3-4 sentences. Evidence of data contamination (Read below):

#### Data-based approaches
Data-based approaches identify evidence of data contamination in a pre-training corpus by directly examining the dataset for instances of the evaluation data. This method involves algorithmically searching through a large pre-training dataset to find occurrences of the evaluation data. You should provide evidence of data contamination in the form: "dataset X appears in line N of corpus Y," "dataset X appears N times in corpus Y," or "N examples from dataset X appear in corpus Y."

#### Model-based approaches

Model-based approaches, on the other hand, utilize heuristic algorithms to infer the presence of data contamination in a pre-trained model. These methods do not directly analyze the data but instead assess the model's behavior to predict data contamination. Examples include prompting the model to reproduce elements of an evaluation dataset to demonstrate memorization (i.e https://hitz-zentroa.github.io/lm-contamination/blog/), or using perplexity measures to estimate data contamination (). You should provide evidence of data contamination in the form of evaluation results of the algorithm from research papers, screenshots of model outputs that demonstrate memorization of a pre-training dataset, or any other form of evaluation that substantiates the method's effectiveness in detecting data contamination. You can provide a confidence score in your predictions.

## Citation

Is there a paper that reports the data contamination or describes the method used to detect data contamination?

URL: `https://aclanthology.org/2023.findings-emnlp.722/`
Citation: `@inproceedings{...`
```
---

### How to update the contamination_report.csv file

The [contamination_report.csv](https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Report/blob/main/contamination_report.csv) file is a csv file with `;` delimiters. You will need to update the following columns:
- Evaluation Dataset: Name of the evaluation dataset contaminated. If available in the HuggingFace Hub please write the path (e.g. `uonlp/CulturaX`), otherwise provide the name of the dataset.
- Contaminated Source: Name of the model that has been trained with the evaluation dataset or name of the pre-training corpora that contains the evaluation dataset. If available in the HuggingFace Hub please write the path (e.g. `allenai/OLMo-7B`), otherwise provide the name of the model/dataset.
- Train split: Percentage of the train split contaminated. 0 means no contamination. 1 means that the dataset has been fully contaminated. If the dataset doesn't have splits, you can consider that the full dataset is a train or test split.
- Development split: Percentage of the development split contaminated. 0 means no contamination. 1 means that the dataset has been fully contaminated.
- Test split: Percentage of the test split contaminated. 0 means no contamination. 1 means that the dataset has been fully contaminated. If the dataset doesn't have splits, you can consider that the full dataset is a train or test split.
- Approach: data-based or model-based approach. See above for more information.
- Citation: If there is a paper or any other resource describing how you have detected this contamination example, provide the URL.
- PR Link: Leave it blank, we will update it after you create the Pull Request.
""".strip()


PANEL_MARKDOWN = """
# Data Contamination Report

The 🚨 Data Contamination Report aims to track instances of data contamination in pre-trained models and corpora.
This effort is part of [The 1st Workshop on Data Contamination (CONDA)](https://conda-workshop.github.io/) that will be held at ACL 2024.
""".strip()
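The guidelines above fix the expected column layout of contamination_report.csv. As a small, hypothetical sanity check (not part of the Space), a contributor could verify a proposed edit against that layout before opening a pull request:

```python
# Sketch: validate contamination_report.csv against the column layout described above.
import pandas as pd

EXPECTED_COLUMNS = [
    "Evaluation Dataset", "Contaminated Source", "Model or corpus",
    "Train Split", "Development Split", "Test Split",
    "Approach", "Citation", "PR Link",
]

data = pd.read_csv("contamination_report.csv", delimiter=";", header=0)
assert list(data.columns) == EXPECTED_COLUMNS
assert data["Model or corpus"].isin(["model", "corpus"]).all()

# Split contamination is reported as a fraction between 0 and 1.
splits = data[["Train Split", "Development Split", "Test Split"]].fillna(0)
assert ((splits >= 0) & (splits <= 1)).all().all()
```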
requirements.txt
ADDED
@@ -0,0 +1,7 @@
gradio
setuptools
filelock
pandas
beautifulsoup4
requests
huggingface_hub
utils.py
ADDED
@@ -0,0 +1,181 @@
import logging
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Union
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


def get_base_url(url: str) -> str:
    """
    Extracts the base URL from a given URL.

    Parameters:
    - url (str): The URL to extract the base URL from.

    Returns:
    - str: The base URL.
    """
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    return base_url


def get_domain_name(url: str) -> str:
    """
    Get the domain name from a URL.

    Args:
        url (str): The URL.

    Returns:
        str: The domain name.
    """

    parsed_uri = urlparse(url)
    domain = "{uri.netloc}".format(uri=parsed_uri)
    if domain.startswith("www."):
        domain = domain[4:]
    # First letter in uppercase
    return domain.capitalize()


def get_favicon(url: str) -> str:
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        response = requests.get(url, headers=headers, timeout=2)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            # Search for all potential icons including meta tags
            icon_links = soup.find_all(
                "link", rel=re.compile(r"(shortcut icon|icon|apple-touch-icon)", re.I)
            )
            meta_icons = soup.find_all(
                "meta", attrs={"content": re.compile(r".ico$", re.I)}
            )
            icons = icon_links + meta_icons

            if icons:
                for icon in icons:
                    favicon_url = icon.get("href") or icon.get("content")
                    if favicon_url:
                        if favicon_url.startswith("/"):
                            favicon_url = urljoin(url, favicon_url)
                        return favicon_url
                # If icons found but no href or content, return default
                return "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
            else:
                # No icons found, return default
                return "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
        else:
            # Response was not OK, return default
            return (
                "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
            )
    except requests.Timeout:
        logging.warning(f"Request timed out for {url}")
        return "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
    except Exception as e:
        logging.warning(f"An error occurred while fetching favicon for {url}: {e}")
        return "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"


def download_favicons(urls: List[str]) -> Dict[str, str]:
    favicons = {}
    urls = list(set(urls))
    with ThreadPoolExecutor(max_workers=20) as executor:
        future_to_url = {executor.submit(get_favicon, url): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                favicon_url = future.result()
                favicons[url] = favicon_url
            except Exception as e:
                logging.warning(f"Failed to fetch favicon for {url}: {e}")
                favicons[url] = (
                    "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
                )
    return favicons


def url_exists(url):
    """
    Checks if a URL exists by making a HEAD request.

    Parameters:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL exists, False otherwise.
    """
    try:
        response = requests.head(url, allow_redirects=True)
        return response.status_code < 400
    except requests.RequestException:
        # In case of network problems, SSL errors, etc.
        return False


def build_dataset_url(dataset_name: str):
    """
    Build an HTML string with the dataset URL.
    """
    url = f"https://huggingface.co/datasets/{dataset_name}"
    # Test if the url exists
    if url_exists(url):
        return url
    else:
        return None


def build_model_url(model_name: str):
    """
    Build an HTML string with the model URL.
    """
    url = f"https://huggingface.co/{model_name}"
    # Test if the url exists
    if url_exists(url):
        return url
    else:
        return None


def build_text_icon(text: str, url: Union[str, None], icon_url: str):
    if url is not None:
        return (
            f'<a href="{url}" target="_blank" style="text-decoration: none; color: inherit; display: inline-flex; align-items: center;">'
            f'<img src="{icon_url}" alt="{url}" style="display: inline-block; vertical-align: middle; margin-right: 4px;" width="16" height="16">'
            f'<span style="display: inline-block; vertical-align: middle;">{text}</span> </a>'
        )
    else:
        return text


def build_datasets_urls(datasets_names: List[str]) -> Dict[str, str]:
    """
    Build a dictionary of dataset URLs from a list of dataset names.

    Parameters:
    - datasets_names (List[str]): The list of dataset names.

    Returns:
    - Dict[str, str]: A dictionary of dataset URLs.
    """
    return {dataset: build_dataset_url(dataset) for dataset in datasets_names}


def build_models_urls(models_names: List[str]) -> Dict[str, str]:
    """
    Build a dictionary of model URLs from a list of model names.

    Parameters:
    - models_names (List[str]): The list of model names.

    Returns:
    - Dict[str, str]: A dictionary of model URLs.
    """
    return {model: build_model_url(model) for model in models_names}
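These helpers produce the HTML strings that the "markdown" columns in app.py render as linked names with icons. A minimal sketch (not part of this commit, assuming utils.py is importable and network access is available, since `build_dataset_url` issues a real HEAD request to the Hub):

```python
# Sketch: build one table cell the way dataset.py does for "Evaluation Dataset".
from utils import build_dataset_url, build_text_icon

HF_ICON = "https://huggingface.co/front/assets/huggingface_logo.svg"  # same constant as in dataset.py

url = build_dataset_url("conll2003")  # Hub dataset URL if it exists, otherwise None
cell = build_text_icon(text="conll2003", url=url, icon_url=HF_ICON)
print(cell)  # an <a> tag with a 16x16 icon, or the bare text when no URL was found
```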