File fixes and cleaning

#17
by OSainz - opened
Files changed (5) hide show
  1. README.md +2 -2
  2. app.py +1 -6
  3. contamination_report.csv +0 -41
  4. markdown.py +4 -3
  5. postprocessing.py +0 -4
README.md CHANGED
@@ -4,8 +4,8 @@ emoji: 🏭
4
  colorFrom: green
5
  colorTo: blue
6
  sdk: gradio
7
- python_version: 3.11
8
- sdk_version: 4.36.0
9
  app_file: app.py
10
  app_port: 7860
11
  fullWidth: true
 
4
  colorFrom: green
5
  colorTo: blue
6
  sdk: gradio
7
+ python_version: 3.10
8
+ sdk_version: 4.19.1
9
  app_file: app.py
10
  app_port: 7860
11
  fullWidth: true
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import pandas as pd
3
 
4
  from dataset import get_dataframe
5
- from markdown import COLUMN_DESC_MARKDOWN, GUIDELINES, PANEL_MARKDOWN
6
 
7
  df = get_dataframe()
8
 
@@ -101,11 +101,6 @@ with gr.Blocks(
101
  fill_height=True,
102
  ) as demo:
103
  gr.Markdown(PANEL_MARKDOWN)
104
- with gr.Accordion("Column descriptions (See details)", open=False) as accordion:
105
- gr.Markdown(COLUMN_DESC_MARKDOWN)
106
-
107
- gr.Markdown(f"### Total contributions: {len(df)}")
108
-
109
  with gr.Tab("Corpus contamination") as tab_corpus:
110
  with gr.Row(variant="compact"):
111
  with gr.Column():
 
2
  import pandas as pd
3
 
4
  from dataset import get_dataframe
5
+ from markdown import GUIDELINES, PANEL_MARKDOWN
6
 
7
  df = get_dataframe()
8
 
 
101
  fill_height=True,
102
  ) as demo:
103
  gr.Markdown(PANEL_MARKDOWN)
 
 
 
 
 
104
  with gr.Tab("Corpus contamination") as tab_corpus:
105
  with gr.Row(variant="compact"):
106
  with gr.Column():
contamination_report.csv CHANGED
@@ -6,9 +6,6 @@ Anagrams 1;;GPT-3;;model;;3.0;;data-based;https://arxiv.org/abs/2005.14165;13
6
 
7
  Anagrams 2;;GPT-3;;model;;7.0;;data-based;https://arxiv.org/abs/2005.14165;13
8
 
9
- CodeForces2305;;GPT-3.5-turbo;0613;model;;;0.0;model-based;https://arxiv.org/abs/2402.15938;28
10
- CodeForces2305;;GPT-3.5-turbo;1106;model;;;0.0;model-based;https://arxiv.org/abs/2402.15938;28
11
-
12
  Cycled Letters;;GPT-3;;model;;1.0;;data-based;https://arxiv.org/abs/2005.14165;13
13
 
14
  EdinburghNLP/xsum;;GPT-3.5;;model;0.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
@@ -20,9 +17,6 @@ EdinburghNLP/xsum;;allenai/c4;;corpus;;;15.49;data-based;https://arxiv.org/abs/2
20
 
21
  EleutherAI/hendrycks_math;;GPT-4;;model;100.0;;;data-based;https://arxiv.org/abs/2303.08774;11
22
 
23
- HumanEval_R;;GPT-3.5-turbo;0613;model;;;9.76;model-based;https://arxiv.org/abs/2402.15938;28
24
- HumanEval_R;;GPT-3.5-turbo;1106;model;;;10.97;model-based;https://arxiv.org/abs/2402.15938;28
25
-
26
  RadNLI;;GPT-3.5;;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
27
  RadNLI;;GPT-4;;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
28
 
@@ -149,34 +143,13 @@ facebook/anli;test_r2;GPT-3;;model;;;18.0;data-based;https://arxiv.org/abs/2005.
149
 
150
  facebook/anli;test_r3;GPT-3;;model;;;16.0;data-based;https://arxiv.org/abs/2005.14165;13
151
 
152
- facebook/flores;;Claude 3 Opus;;model;;100.0;;model-based;https://arxiv.org/abs/2404.13813;29
153
- facebook/flores;;bigscience/bloomz;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
154
- facebook/flores;;bigscience/bloomz-1b1;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
155
- facebook/flores;;bigscience/bloomz-1b7;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
156
- facebook/flores;;bigscience/bloomz-3b;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
157
- facebook/flores;;bigscience/bloomz-560m;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
158
- facebook/flores;;bigscience/bloomz-7b1;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
159
- facebook/flores;;bigscience/mt0-base;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
160
- facebook/flores;;bigscience/mt0-large;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
161
- facebook/flores;;bigscience/mt0-small;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
162
- facebook/flores;;bigscience/mt0-xl;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
163
- facebook/flores;;bigscience/mt0-xxl;;model;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
164
- facebook/flores;;bigscience/xP3;;corpus;;100.0;;data-based;https://aclanthology.org/2023.acl-long.891/;20
165
-
166
  gigaword;;EleutherAI/pile;;corpus;;;1.18;data-based;https://arxiv.org/abs/2310.20707;2
167
  gigaword;;allenai/c4;;corpus;;;0.15;data-based;https://arxiv.org/abs/2310.20707;2
168
  gigaword;;oscar-corpus/OSCAR-2301;;corpus;;;0.36;data-based;https://arxiv.org/abs/2310.20707;2
169
  gigaword;;togethercomputer/RedPajama-Data-V2;;corpus;;;2.82;data-based;https://arxiv.org/abs/2310.20707;2
170
 
171
- gsm8k;;BAAI/Aquila2-34B;;model;;;100.0;model-based;https://huggingface.co/BAAI/Aquila2-34B/blob/main/README.md;21
172
- gsm8k;;BAAI/AquilaChat2-34B;;model;;;100.0;model-based;https://huggingface.co/BAAI/AquilaChat2-34B/blob/main/README.md;21
173
- gsm8k;;EleutherAI/llemma_34b;;model;;;0.15;data-based;https://openreview.net/forum?id=4WnqRR915j;23
174
- gsm8k;;EleutherAI/llemma_7b;;model;;;0.15;data-based;https://openreview.net/forum?id=4WnqRR915j;23
175
- gsm8k;;EleutherAI/proof-pile-2;;corpus;;;0.15;data-based;https://openreview.net/forum?id=4WnqRR915j;23
176
  gsm8k;;GPT-4;;model;100.0;;1.0;data-based;https://arxiv.org/abs/2303.08774;11
177
  gsm8k;;GPT-4;;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
178
- gsm8k;;Qwen/Qwen-14B;;model;0.5;;;model-based;https://arxiv.org/abs/2404.18824;27
179
- gsm8k;;Qwen/Qwen-1_8B;;model;12.8;;0.075;model-based;https://arxiv.org/abs/2404.18824;27
180
 
181
  head_qa;en;EleutherAI/pile;;corpus;;;5.11;data-based;https://arxiv.org/abs/2310.20707;2
182
  head_qa;en;allenai/c4;;corpus;;;5.22;data-based;https://arxiv.org/abs/2310.20707;2
@@ -188,18 +161,6 @@ health_fact;;allenai/c4;;corpus;;;7.53;data-based;https://arxiv.org/abs/2310.207
188
  health_fact;;oscar-corpus/OSCAR-2301;;corpus;;;3.4;data-based;https://arxiv.org/abs/2310.20707;2
189
  health_fact;;togethercomputer/RedPajama-Data-V2;;corpus;;;18.7;data-based;https://arxiv.org/abs/2310.20707;2
190
 
191
- hendrycks/competition_math;;BAAI/Aquila2-34B;;model;3.366;;1.166;model-based;https://arxiv.org/abs/2404.18824;27
192
- hendrycks/competition_math;;BAAI/Aquila2-7B;;model;1;;0.133;model-based;https://arxiv.org/abs/2404.18824;27
193
- hendrycks/competition_math;;EleutherAI/llemma_34b;;model;;;7.72;data-based;https://openreview.net/forum?id=4WnqRR915j;23
194
- hendrycks/competition_math;;EleutherAI/llemma_7b;;model;;;7.72;data-based;https://openreview.net/forum?id=4WnqRR915j;23
195
- hendrycks/competition_math;;EleutherAI/proof-pile-2;;corpus;;;7.72;data-based;https://openreview.net/forum?id=4WnqRR915j;23
196
- hendrycks/competition_math;;Qwen/Qwen-14B;;model;1.766;;1.6;model-based;https://arxiv.org/abs/2404.18824;27
197
- hendrycks/competition_math;;Qwen/Qwen-1_8B;;model;4.533;;1.70;model-based;https://arxiv.org/abs/2404.18824;27
198
- hendrycks/competition_math;;Qwen/Qwen-7B;;model;1.266;;0.766;model-based;https://arxiv.org/abs/2404.18824;27
199
- hendrycks/competition_math;;THUDM/chatglm3-6b;;model;0.70;;0.4;model-based;https://arxiv.org/abs/2404.18824;27
200
- hendrycks/competition_math;;internlm/internlm2-20b;;model;4.733;;0.666;model-based;https://arxiv.org/abs/2404.18824;27
201
- hendrycks/competition_math;;internlm/internlm2-7b;;model;3.033;;0.433;model-based;https://arxiv.org/abs/2404.18824;27
202
-
203
  hlgd;;EleutherAI/pile;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
204
  hlgd;;allenai/c4;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
205
  hlgd;;oscar-corpus/OSCAR-2301;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
@@ -703,8 +664,6 @@ wmt/wmt16;fr-en;GPT-3;;model;;;14.0;data-based;https://arxiv.org/abs/2005.14165;
703
  wmt/wmt16;ro-en;FLAN;;model;;;12.4;data-based;https://arxiv.org/abs/2109.01652;13
704
  wmt/wmt16;ro-en;GPT-3;;model;;;21.0;data-based;https://arxiv.org/abs/2005.14165;13
705
 
706
- xlangai/spider;;GPT-3.5;;model;;11.3;;model-based;https://arxiv.org/abs/2402.08100;18
707
-
708
  xnli;en;EleutherAI/pile;;corpus;;;0.36;data-based;https://arxiv.org/abs/2310.20707;2
709
  xnli;en;allenai/c4;;corpus;;;0.12;data-based;https://arxiv.org/abs/2310.20707;2
710
  xnli;en;oscar-corpus/OSCAR-2301;;corpus;;;0.24;data-based;https://arxiv.org/abs/2310.20707;2
 
6
 
7
  Anagrams 2;;GPT-3;;model;;7.0;;data-based;https://arxiv.org/abs/2005.14165;13
8
 
 
 
 
9
  Cycled Letters;;GPT-3;;model;;1.0;;data-based;https://arxiv.org/abs/2005.14165;13
10
 
11
  EdinburghNLP/xsum;;GPT-3.5;;model;0.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
 
17
 
18
  EleutherAI/hendrycks_math;;GPT-4;;model;100.0;;;data-based;https://arxiv.org/abs/2303.08774;11
19
 
 
 
 
20
  RadNLI;;GPT-3.5;;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
21
  RadNLI;;GPT-4;;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
22
 
 
143
 
144
  facebook/anli;test_r3;GPT-3;;model;;;16.0;data-based;https://arxiv.org/abs/2005.14165;13
145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  gigaword;;EleutherAI/pile;;corpus;;;1.18;data-based;https://arxiv.org/abs/2310.20707;2
147
  gigaword;;allenai/c4;;corpus;;;0.15;data-based;https://arxiv.org/abs/2310.20707;2
148
  gigaword;;oscar-corpus/OSCAR-2301;;corpus;;;0.36;data-based;https://arxiv.org/abs/2310.20707;2
149
  gigaword;;togethercomputer/RedPajama-Data-V2;;corpus;;;2.82;data-based;https://arxiv.org/abs/2310.20707;2
150
 
 
 
 
 
 
151
  gsm8k;;GPT-4;;model;100.0;;1.0;data-based;https://arxiv.org/abs/2303.08774;11
152
  gsm8k;;GPT-4;;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
 
 
153
 
154
  head_qa;en;EleutherAI/pile;;corpus;;;5.11;data-based;https://arxiv.org/abs/2310.20707;2
155
  head_qa;en;allenai/c4;;corpus;;;5.22;data-based;https://arxiv.org/abs/2310.20707;2
 
161
  health_fact;;oscar-corpus/OSCAR-2301;;corpus;;;3.4;data-based;https://arxiv.org/abs/2310.20707;2
162
  health_fact;;togethercomputer/RedPajama-Data-V2;;corpus;;;18.7;data-based;https://arxiv.org/abs/2310.20707;2
163
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  hlgd;;EleutherAI/pile;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
165
  hlgd;;allenai/c4;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
166
  hlgd;;oscar-corpus/OSCAR-2301;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
 
664
  wmt/wmt16;ro-en;FLAN;;model;;;12.4;data-based;https://arxiv.org/abs/2109.01652;13
665
  wmt/wmt16;ro-en;GPT-3;;model;;;21.0;data-based;https://arxiv.org/abs/2005.14165;13
666
 
 
 
667
  xnli;en;EleutherAI/pile;;corpus;;;0.36;data-based;https://arxiv.org/abs/2310.20707;2
668
  xnli;en;allenai/c4;;corpus;;;0.12;data-based;https://arxiv.org/abs/2310.20707;2
669
  xnli;en;oscar-corpus/OSCAR-2301;;corpus;;;0.24;data-based;https://arxiv.org/abs/2310.20707;2
markdown.py CHANGED
@@ -79,9 +79,9 @@ The Data Contamination Database is a community-driven project and we welcome con
79
  We are organizing a community effort on centralized data contamination evidence collection. While the problem of data contamination is prevalent and serious, the breadth and depth of this contamination are still largely unknown. The concrete evidence of contamination is scattered across papers, blog posts, and social media, and it is suspected that the true scope of data contamination in NLP is significantly larger than reported. With this shared task we aim to provide a structured, centralized platform for contamination evidence collection to help the community understand the extent of the problem and to help researchers avoid repeating the same mistakes.
80
 
81
  If you wish to contribute to the project by reporting a data contamination case, please read the Contribution Guidelines tab.
82
- """.strip()
83
 
84
- COLUMN_DESC_MARKDOWN = """
 
85
  - **Evaluation Dataset:** Name of the evaluation dataset that has (not) been compromised.
86
  - **Contaminated Source:** Name of the model that has been trained with the evaluation dataset or name of the pre-training corpora that contains the evaluation dataset.
87
  - **Train Split:** Percentage of the train split contaminated. 0 means no contamination; 100 means that the dataset has been fully compromised.
@@ -90,4 +90,5 @@ COLUMN_DESC_MARKDOWN = """
90
  - **Approach:** Data-based or model-based approach. Data-based approaches search in publicly available data instances of evaluation benchmarks. Model-based approaches attempt to detect data contamination in already pre-trained models.
91
  - **Reference:** Paper or any other resource describing how this contamination case has been detected.
92
  - **PR Link:** Link to the PR in which the contamination case was described.
93
- """
 
 
79
  We are organizing a community effort on centralized data contamination evidence collection. While the problem of data contamination is prevalent and serious, the breadth and depth of this contamination are still largely unknown. The concrete evidence of contamination is scattered across papers, blog posts, and social media, and it is suspected that the true scope of data contamination in NLP is significantly larger than reported. With this shared task we aim to provide a structured, centralized platform for contamination evidence collection to help the community understand the extent of the problem and to help researchers avoid repeating the same mistakes.
80
 
81
  If you wish to contribute to the project by reporting a data contamination case, please read the Contribution Guidelines tab.
 
82
 
83
+ Here is a description of each column in the table below:
84
+
85
  - **Evaluation Dataset:** Name of the evaluation dataset that has (not) been compromised.
86
  - **Contaminated Source:** Name of the model that has been trained with the evaluation dataset or name of the pre-training corpora that contains the evaluation dataset.
87
  - **Train Split:** Percentage of the train split contaminated. 0 means no contamination; 100 means that the dataset has been fully compromised.
 
90
  - **Approach:** Data-based or model-based approach. Data-based approaches search in publicly available data instances of evaluation benchmarks. Model-based approaches attempt to detect data contamination in already pre-trained models.
91
  - **Reference:** Paper or any other resource describing how this contamination case has been detected.
92
  - **PR Link:** Link to the PR in which the contamination case was described.
93
+
94
+ """.strip()
postprocessing.py CHANGED
@@ -17,9 +17,6 @@ def remove_duplicates(data):
17
  def fix_arxiv_links(data):
18
  return [[*item[:-2], item[-2].replace("arxiv.org/pdf", "arxiv.org/abs"), item[-1]] for item in data]
19
 
20
- def fix_openreview_links(data):
21
- return [[*item[:-2], item[-2].replace("openreview.net/pdf", "openreview.net/forum"), item[-1]] for item in data]
22
-
23
  def sort_data(data):
24
  return sorted(data, key=lambda x: (x[0], x[1], x[2], x[3], x[-1]))
25
 
@@ -28,7 +25,6 @@ def main():
28
  data = sort_data(data)
29
  data = remove_duplicates(data)
30
  data = fix_arxiv_links(data)
31
- data = fix_openreview_links(data)
32
  print("Total datapoints:", len(data))
33
 
34
  with open("contamination_report.csv", 'w') as f:
 
17
  def fix_arxiv_links(data):
18
  return [[*item[:-2], item[-2].replace("arxiv.org/pdf", "arxiv.org/abs"), item[-1]] for item in data]
19
 
 
 
 
20
  def sort_data(data):
21
  return sorted(data, key=lambda x: (x[0], x[1], x[2], x[3], x[-1]))
22
 
 
25
  data = sort_data(data)
26
  data = remove_duplicates(data)
27
  data = fix_arxiv_links(data)
 
28
  print("Total datapoints:", len(data))
29
 
30
  with open("contamination_report.csv", 'w') as f: