Spaces:

CONDA-Workshop
/

Data-Contamination-Database

Sleeping

App Files Files Community

File fixes and cleaning

#17

by OSainz - opened Apr 29, 2024

base: refs/heads/main

←

from: refs/pr/17

Discussion Files changed

-56

Files changed (5) hide show

README.md +2 -2
app.py +1 -6
contamination_report.csv +0 -41
markdown.py +4 -3
postprocessing.py +0 -4

README.md CHANGED Viewed

@@ -4,8 +4,8 @@ emoji: 🏭
 colorFrom: green
 colorTo: blue
 sdk: gradio
-python_version: 3.11
-sdk_version: 4.36.0
 app_file: app.py
 app_port: 7860
 fullWidth: true

 colorFrom: green
 colorTo: blue
 sdk: gradio
+python_version: 3.10
+sdk_version: 4.19.1
 app_file: app.py
 app_port: 7860
 fullWidth: true

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import gradio as gr
 import pandas as pd
 from dataset import get_dataframe
-from markdown import COLUMN_DESC_MARKDOWN, GUIDELINES, PANEL_MARKDOWN
 df = get_dataframe()
@@ -101,11 +101,6 @@ with gr.Blocks(
     fill_height=True,
 ) as demo:
     gr.Markdown(PANEL_MARKDOWN)
-    with gr.Accordion("Column descriptions (See details)", open=False) as accordion:
-        gr.Markdown(COLUMN_DESC_MARKDOWN)
-    gr.Markdown(f"### Total contributions: {len(df)}")
     with gr.Tab("Corpus contamination") as tab_corpus:
         with gr.Row(variant="compact"):
             with gr.Column():

 import pandas as pd
 from dataset import get_dataframe
+from markdown import GUIDELINES, PANEL_MARKDOWN
 df = get_dataframe()
     fill_height=True,
 ) as demo:
     gr.Markdown(PANEL_MARKDOWN)
     with gr.Tab("Corpus contamination") as tab_corpus:
         with gr.Row(variant="compact"):
             with gr.Column():

contamination_report.csv CHANGED Viewed

@@ -6,9 +6,6 @@ Anagrams 1;;GPT-3;;model;;3.0;;data-based;https://arxiv.org/abs/2005.14165;13
 Anagrams 2;;GPT-3;;model;;7.0;;data-based;https://arxiv.org/abs/2005.14165;13
-CodeForces2305;;GPT-3.5-turbo;0613;model;;;0.0;model-based;https://arxiv.org/abs/2402.15938;28
-CodeForces2305;;GPT-3.5-turbo;1106;model;;;0.0;model-based;https://arxiv.org/abs/2402.15938;28
 Cycled Letters;;GPT-3;;model;;1.0;;data-based;https://arxiv.org/abs/2005.14165;13
 EdinburghNLP/xsum;;GPT-3.5;;model;0.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
@@ -20,9 +17,6 @@ EdinburghNLP/xsum;;allenai/c4;;corpus;;;15.49;data-based;https://arxiv.org/abs/2
 EleutherAI/hendrycks_math;;GPT-4;;model;100.0;;;data-based;https://arxiv.org/abs/2303.08774;11
-HumanEval_R;;GPT-3.5-turbo;0613;model;;;9.76;model-based;https://arxiv.org/abs/2402.15938;28
-HumanEval_R;;GPT-3.5-turbo;1106;model;;;10.97;model-based;https://arxiv.org/abs/2402.15938;28
 RadNLI;;GPT-3.5;;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
 RadNLI;;GPT-4;;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
@@ -149,34 +143,13 @@ facebook/anli;test_r2;GPT-3;;model;;;18.0;data-based;https://arxiv.org/abs/2005.
 facebook/anli;test_r3;GPT-3;;model;;;16.0;data-based;https://arxiv.org/abs/2005.14165;13
-facebook/flores;;Claude 3 Opus;;model;;100.0;;model-based;https://arxiv.org/abs/2404.13813;29
-facebook/flores;;bigscience/bloomz;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
-facebook/flores;;bigscience/bloomz-1b1;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
-facebook/flores;;bigscience/bloomz-1b7;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
-facebook/flores;;bigscience/bloomz-3b;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
-facebook/flores;;bigscience/bloomz-560m;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
-facebook/flores;;bigscience/bloomz-7b1;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
-facebook/flores;;bigscience/mt0-base;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
-facebook/flores;;bigscience/mt0-large;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
-facebook/flores;;bigscience/mt0-small;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
-facebook/flores;;bigscience/mt0-xl;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
-facebook/flores;;bigscience/mt0-xxl;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
-facebook/flores;;bigscience/xP3;;corpus;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
 gigaword;;EleutherAI/pile;;corpus;;;1.18;data-based;https://arxiv.org/abs/2310.20707;2
 gigaword;;allenai/c4;;corpus;;;0.15;data-based;https://arxiv.org/abs/2310.20707;2
 gigaword;;oscar-corpus/OSCAR-2301;;corpus;;;0.36;data-based;https://arxiv.org/abs/2310.20707;2
 gigaword;;togethercomputer/RedPajama-Data-V2;;corpus;;;2.82;data-based;https://arxiv.org/abs/2310.20707;2
-gsm8k;;BAAI/Aquila2-34B;;model;;;100.0;model-based;https://huggingface.co/BAAI/Aquila2-34B/blob/main/README.md;21
-gsm8k;;BAAI/AquilaChat2-34B;;model;;;100.0;model-based;https://huggingface.co/BAAI/AquilaChat2-34B/blob/main/README.md;21
-gsm8k;;EleutherAI/llemma_34b;;model;;;0.15;data-based;https://openreview.net/forum?id=4WnqRR915j;23
-gsm8k;;EleutherAI/llemma_7b;;model;;;0.15;data-based;https://openreview.net/forum?id=4WnqRR915j;23
-gsm8k;;EleutherAI/proof-pile-2;;corpus;;;0.15;data-based;https://openreview.net/forum?id=4WnqRR915j;23
 gsm8k;;GPT-4;;model;100.0;;1.0;data-based;https://arxiv.org/abs/2303.08774;11
 gsm8k;;GPT-4;;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
-gsm8k;;Qwen/Qwen-14B;;model;0.5;;;model-based;https://arxiv.org/abs/2404.18824;27
-gsm8k;;Qwen/Qwen-1_8B;;model;12.8;;0.075;model-based;https://arxiv.org/abs/2404.18824;27
 head_qa;en;EleutherAI/pile;;corpus;;;5.11;data-based;https://arxiv.org/abs/2310.20707;2
 head_qa;en;allenai/c4;;corpus;;;5.22;data-based;https://arxiv.org/abs/2310.20707;2
@@ -188,18 +161,6 @@ health_fact;;allenai/c4;;corpus;;;7.53;data-based;https://arxiv.org/abs/2310.207
 health_fact;;oscar-corpus/OSCAR-2301;;corpus;;;3.4;data-based;https://arxiv.org/abs/2310.20707;2
 health_fact;;togethercomputer/RedPajama-Data-V2;;corpus;;;18.7;data-based;https://arxiv.org/abs/2310.20707;2
-hendrycks/competition_math;;BAAI/Aquila2-34B;;model;3.366;;1.166;model-based;https://arxiv.org/abs/2404.18824;27
-hendrycks/competition_math;;BAAI/Aquila2-7B;;model;1;;0.133;model-based;https://arxiv.org/abs/2404.18824;27
-hendrycks/competition_math;;EleutherAI/llemma_34b;;model;;;7.72;data-based;https://openreview.net/forum?id=4WnqRR915j;23
-hendrycks/competition_math;;EleutherAI/llemma_7b;;model;;;7.72;data-based;https://openreview.net/forum?id=4WnqRR915j;23
-hendrycks/competition_math;;EleutherAI/proof-pile-2;;corpus;;;7.72;data-based;https://openreview.net/forum?id=4WnqRR915j;23
-hendrycks/competition_math;;Qwen/Qwen-14B;;model;1.766;;1.6;model-based;https://arxiv.org/abs/2404.18824;27
-hendrycks/competition_math;;Qwen/Qwen-1_8B;;model;4.533;;1.70;model-based;https://arxiv.org/abs/2404.18824;27
-hendrycks/competition_math;;Qwen/Qwen-7B;;model;1.266;;0.766;model-based;https://arxiv.org/abs/2404.18824;27
-hendrycks/competition_math;;THUDM/chatglm3-6b;;model;0.70;;0.4;model-based;https://arxiv.org/abs/2404.18824;27
-hendrycks/competition_math;;internlm/internlm2-20b;;model;4.733;;0.666;model-based;https://arxiv.org/abs/2404.18824;27
-hendrycks/competition_math;;internlm/internlm2-7b;;model;3.033;;0.433;model-based;https://arxiv.org/abs/2404.18824;27
 hlgd;;EleutherAI/pile;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
 hlgd;;allenai/c4;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
 hlgd;;oscar-corpus/OSCAR-2301;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
@@ -703,8 +664,6 @@ wmt/wmt16;fr-en;GPT-3;;model;;;14.0;data-based;https://arxiv.org/abs/2005.14165;
 wmt/wmt16;ro-en;FLAN;;model;;;12.4;data-based;https://arxiv.org/abs/2109.01652;13
 wmt/wmt16;ro-en;GPT-3;;model;;;21.0;data-based;https://arxiv.org/abs/2005.14165;13
-xlangai/spider;;GPT-3.5;;model;;11.3;;model-based;https://arxiv.org/abs/2402.08100;18
 xnli;en;EleutherAI/pile;;corpus;;;0.36;data-based;https://arxiv.org/abs/2310.20707;2
 xnli;en;allenai/c4;;corpus;;;0.12;data-based;https://arxiv.org/abs/2310.20707;2
 xnli;en;oscar-corpus/OSCAR-2301;;corpus;;;0.24;data-based;https://arxiv.org/abs/2310.20707;2

 Anagrams 2;;GPT-3;;model;;7.0;;data-based;https://arxiv.org/abs/2005.14165;13
 Cycled Letters;;GPT-3;;model;;1.0;;data-based;https://arxiv.org/abs/2005.14165;13
 EdinburghNLP/xsum;;GPT-3.5;;model;0.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
 EleutherAI/hendrycks_math;;GPT-4;;model;100.0;;;data-based;https://arxiv.org/abs/2303.08774;11
 RadNLI;;GPT-3.5;;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
 RadNLI;;GPT-4;;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
 facebook/anli;test_r3;GPT-3;;model;;;16.0;data-based;https://arxiv.org/abs/2005.14165;13
 gigaword;;EleutherAI/pile;;corpus;;;1.18;data-based;https://arxiv.org/abs/2310.20707;2
 gigaword;;allenai/c4;;corpus;;;0.15;data-based;https://arxiv.org/abs/2310.20707;2
 gigaword;;oscar-corpus/OSCAR-2301;;corpus;;;0.36;data-based;https://arxiv.org/abs/2310.20707;2
 gigaword;;togethercomputer/RedPajama-Data-V2;;corpus;;;2.82;data-based;https://arxiv.org/abs/2310.20707;2
 gsm8k;;GPT-4;;model;100.0;;1.0;data-based;https://arxiv.org/abs/2303.08774;11
 gsm8k;;GPT-4;;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
 head_qa;en;EleutherAI/pile;;corpus;;;5.11;data-based;https://arxiv.org/abs/2310.20707;2
 head_qa;en;allenai/c4;;corpus;;;5.22;data-based;https://arxiv.org/abs/2310.20707;2
 health_fact;;oscar-corpus/OSCAR-2301;;corpus;;;3.4;data-based;https://arxiv.org/abs/2310.20707;2
 health_fact;;togethercomputer/RedPajama-Data-V2;;corpus;;;18.7;data-based;https://arxiv.org/abs/2310.20707;2
 hlgd;;EleutherAI/pile;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
 hlgd;;allenai/c4;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
 hlgd;;oscar-corpus/OSCAR-2301;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
 wmt/wmt16;ro-en;FLAN;;model;;;12.4;data-based;https://arxiv.org/abs/2109.01652;13
 wmt/wmt16;ro-en;GPT-3;;model;;;21.0;data-based;https://arxiv.org/abs/2005.14165;13
 xnli;en;EleutherAI/pile;;corpus;;;0.36;data-based;https://arxiv.org/abs/2310.20707;2
 xnli;en;allenai/c4;;corpus;;;0.12;data-based;https://arxiv.org/abs/2310.20707;2
 xnli;en;oscar-corpus/OSCAR-2301;;corpus;;;0.24;data-based;https://arxiv.org/abs/2310.20707;2

markdown.py CHANGED Viewed

@@ -79,9 +79,9 @@ The Data Contamination Database is a community-driven project and we welcome con
 We are organizing a community effort on centralized data contamination evidence collection. While the problem of data contamination is prevalent and serious, the breadth and depth of this contamination are still largely unknown. The concrete evidence of contamination is scattered across papers, blog posts, and social media, and it is suspected that the true scope of data contamination in NLP is significantly larger than reported. With this shared task we aim to provide a structured, centralized platform for contamination evidence collection to help the community understand the extent of the problem and to help researchers avoid repeating the same mistakes.
 If you wish to contribute to the project by reporting a data contamination case, please read the Contribution Guidelines tab.
-""".strip()
-COLUMN_DESC_MARKDOWN = """
 - **Evaluation Dataset:** Name of the evaluation dataset that has (not) been compromised.
 - **Contaminated Source:** Name of the model that has been trained with the evaluation dataset or name of the pre-training corpus that contains the evaluation dataset.
 - **Train Split:** Percentage of the train split contaminated. 0 means no contamination; 100 means that the dataset has been fully compromised.
@@ -90,4 +90,5 @@ COLUMN_DESC_MARKDOWN = """
 - **Approach:** Data-based or model-based approach. Data-based approaches search in publicly available data instances of evaluation benchmarks. Model-based approaches attempt to detect data contamination in already pre-trained models.
 - **Reference:** Paper or any other resource describing how this contamination case has been detected.
 - **PR Link:** Link to the PR in which the contamination case was described.
-"""

 We are organizing a community effort on centralized data contamination evidence collection. While the problem of data contamination is prevalent and serious, the breadth and depth of this contamination are still largely unknown. The concrete evidence of contamination is scattered across papers, blog posts, and social media, and it is suspected that the true scope of data contamination in NLP is significantly larger than reported. With this shared task we aim to provide a structured, centralized platform for contamination evidence collection to help the community understand the extent of the problem and to help researchers avoid repeating the same mistakes.
 If you wish to contribute to the project by reporting a data contamination case, please read the Contribution Guidelines tab.
+Here is a description of each column in the table below:
 - **Evaluation Dataset:** Name of the evaluation dataset that has (not) been compromised.
 - **Contaminated Source:** Name of the model that has been trained with the evaluation dataset or name of the pre-training corpus that contains the evaluation dataset.
 - **Train Split:** Percentage of the train split contaminated. 0 means no contamination; 100 means that the dataset has been fully compromised.
 - **Approach:** Data-based or model-based approach. Data-based approaches search in publicly available data instances of evaluation benchmarks. Model-based approaches attempt to detect data contamination in already pre-trained models.
 - **Reference:** Paper or any other resource describing how this contamination case has been detected.
 - **PR Link:** Link to the PR in which the contamination case was described.
+""".strip()

postprocessing.py CHANGED Viewed

@@ -17,9 +17,6 @@ def remove_duplicates(data):
 def fix_arxiv_links(data):
     """Return new rows whose second-to-last field points at the arXiv abstract page.

     Every occurrence of "arxiv.org/pdf" in the second-to-last field of a row
     is replaced by "arxiv.org/abs". All other fields are copied unchanged, and
     each output row is a fresh list.
     """
     fixed_rows = []
     for row in data:
         leading_fields = row[:-2]
         link = row[-2].replace("arxiv.org/pdf", "arxiv.org/abs")
         last_field = row[-1]
         fixed_rows.append([*leading_fields, link, last_field])
     return fixed_rows
-def fix_openreview_links(data):
-    return [[*item[:-2], item[-2].replace("openreview.net/pdf", "openreview.net/forum"), item[-1]] for item in data]
 def sort_data(data):
     """Return a new list with the rows of *data* in ascending order.

     Rows are compared on their first four fields, in order, and then on
     their last field. The input sequence itself is not modified.
     """
     def row_key(row):
         return (row[0], row[1], row[2], row[3], row[-1])

     return sorted(data, key=row_key)
@@ -28,7 +25,6 @@ def main():
     data = sort_data(data)
     data = remove_duplicates(data)
     data = fix_arxiv_links(data)
-    data = fix_openreview_links(data)
     print("Total datapoints:", len(data))
     with open("contamination_report.csv", 'w') as f:

 def fix_arxiv_links(data):
     """Return new rows whose second-to-last field points at the arXiv abstract page.

     Every occurrence of "arxiv.org/pdf" in the second-to-last field of a row
     is replaced by "arxiv.org/abs". All other fields are copied unchanged, and
     each output row is a fresh list.
     """
     fixed_rows = []
     for row in data:
         leading_fields = row[:-2]
         link = row[-2].replace("arxiv.org/pdf", "arxiv.org/abs")
         last_field = row[-1]
         fixed_rows.append([*leading_fields, link, last_field])
     return fixed_rows
 def sort_data(data):
     """Return a new list with the rows of *data* in ascending order.

     Rows are compared on their first four fields, in order, and then on
     their last field. The input sequence itself is not modified.
     """
     def row_key(row):
         return (row[0], row[1], row[2], row[3], row[-1])

     return sorted(data, key=row_key)
     data = sort_data(data)
     data = remove_duplicates(data)
     data = fix_arxiv_links(data)
     print("Total datapoints:", len(data))
     with open("contamination_report.csv", 'w') as f: