OSainz committed on
Commit
99a8650
β€’
1 Parent(s): 888fb82

File fixes and cleaning (#17)


- Add changes (23add198b9aed3461771ec64c740e7c2f6789dd1)
- Add info about the changes in the markdown. (4a1e5cc01386ce466b5172d77f8d97e0792609f9)

Files changed (4)
  1. contamination_report.csv +0 -0
  2. dataset.py +2 -1
  3. markdown.py +2 -1
  4. postprocessing.py +43 -0
contamination_report.csv CHANGED
The diff for this file is too large to render. See raw diff
 
dataset.py CHANGED
@@ -256,7 +256,7 @@ def get_dataframe():
     # For "Contaminated Source" use build_dataset_url if "Model or corpus" is "corpus" and build_model_url if "Model or corpus" is "model"
     data["Contaminated Source"] = data.apply(
         lambda x: build_text_icon(
-            text=x["Contaminated Source"],
+            text=x["Contaminated Source"] + f" ({x['Version']})" if pd.notna(x["Version"]) else x["Contaminated Source"],
             url=dataset_url_dict.get(x["Contaminated Source"], "")
             if x["Model or corpus"] == "corpus"
             else model_url_dict.get(x["Contaminated Source"], ""),
@@ -264,6 +264,7 @@ def get_dataframe():
         ),
         axis=1,
     )
+    del data["Version"]
 
     data["Train Split"] = data["Train Split"].apply(lambda x: x/100 if x else x)
     data["Development Split"] = data["Development Split"].apply(lambda x: x/100 if x else x)
markdown.py CHANGED
@@ -60,8 +60,9 @@ Citation: `@inproceedings{...`
 
 The [contamination_report.csv](https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Database/blob/main/contamination_report.csv) file is a csv file with `;` delimiters. You will need to update the following columns:
 - **Evaluation Dataset**: Name of the evaluation dataset that has (not) been compromised. If available in the HuggingFace Hub please write the path (e.g. `uonlp/CulturaX`), otherwise provide the name of the dataset.
-- **Subset**: Many HuggingFace datasets have different subsets or splits within a single dataset. This field defines a particular subset of a given dataset. For example, the `qnli` subset of `glue`.
+- **Subset**: (Optional) Many HuggingFace datasets have different subsets or splits within a single dataset. This field defines a particular subset of a given dataset. For example, the `qnli` subset of `glue`.
 - **Contaminated Source**: Name of the model that has been trained with the evaluation dataset, or name of the pre-training corpus that contains the evaluation dataset. If available in the HuggingFace Hub please write the path (e.g. `allenai/OLMo-7B`), otherwise provide the name of the model/dataset.
+- **Version**: (Optional) Any information relevant to identify the version of the model or dataset. This information will be shown between parentheses in the Contaminated Source column.
 - **Train split**: Percentage of the train split contaminated. 0 means no contamination. 100 means that the dataset has been fully compromised. If the dataset doesn't have splits, you can consider that the full dataset is a train or test split.
 - **Development split**: Percentage of the development split contaminated. 0 means no contamination. 100 means that the dataset has been fully compromised.
 - **Test split**: Percentage of the test split contaminated. 0 means no contamination. 100 means that the dataset has been fully compromised. If the dataset doesn't have splits, you can consider that the full dataset is a train or test split.
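
As a rough illustration of the documented format, the report can be loaded with the `;` delimiter and the 0-100 percentage columns rescaled to fractions, much like get_dataframe() does. The column names below are assumed from the list above and may not match the CSV header exactly:

```python
import pandas as pd

# Read the semicolon-delimited report; blank separator lines between groups
# are skipped by pandas by default.
report = pd.read_csv("contamination_report.csv", sep=";")

# Convert the 0-100 contamination percentages to fractions, mirroring the
# rescaling done in dataset.py. Column names are assumed, not verified.
for column in ["Train Split", "Development Split", "Test Split"]:
    if column in report.columns:
        report[column] = report[column].apply(lambda x: x / 100 if x else x)

print(report.head())
```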
postprocessing.py ADDED
@@ -0,0 +1,43 @@
+def load_file(filename):
+    with open(filename, 'r') as f:
+        header = f.readline().strip().split(";")
+        return header, [line.strip().split(";") for line in f if line.strip()]
+
+def remove_duplicates(data):
+    keys = set()
+    _data = []
+    for item in data:
+        key = tuple((item[0], item[1], item[2], item[3], item[-1]))
+        if key in keys:
+            continue
+        _data += [item]
+        keys.add(key)
+    return _data
+
+def fix_arxiv_links(data):
+    return [[*item[:-2], item[-2].replace("arxiv.org/pdf", "arxiv.org/abs"), item[-1]] for item in data]
+
+def sort_data(data):
+    return sorted(data, key=lambda x: (x[0], x[1], x[2], x[3], x[-1]))
+
+def main():
+    header, data = load_file("contamination_report.csv")
+    data = sort_data(data)
+    data = remove_duplicates(data)
+    data = fix_arxiv_links(data)
+    print("Total datapoints:", len(data))
+
+    with open("contamination_report.csv", 'w') as f:
+        f.write(";".join(header) + "\n")
+        past_key = None
+        for line in data:
+            key = tuple((line[0], line[1]))
+            if key != past_key:
+                f.write("\n")
+                past_key = key
+            line = line[:3] + [""] + line[3:]
+            f.write(";".join(line) + "\n")
+
+
+if __name__ == "__main__":
+    main()
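
Run as `python postprocessing.py` from the repository root, the script rewrites `contamination_report.csv` in place. Below is a toy sketch of the individual cleanup steps; the field values are invented and do not reproduce the real CSV schema, the only structural assumption being that the reference URL sits in the second-to-last field, as the script's code implies:

```python
from postprocessing import fix_arxiv_links, remove_duplicates, sort_data

# Invented rows for illustration only; they do not follow the real column layout.
rows = [
    ["glue", "qnli", "allenai/OLMo-7B", "https://arxiv.org/pdf/1234.56789", "note"],
    ["glue", "qnli", "allenai/OLMo-7B", "https://arxiv.org/pdf/1234.56789", "note"],
]

rows = sort_data(rows)          # sorted on the same fields used as the dedup key
rows = remove_duplicates(rows)  # the second row repeats the key, so it is dropped
rows = fix_arxiv_links(rows)    # PDF link rewritten to the abstract page

print(len(rows), rows[0][-2])   # 1 https://arxiv.org/abs/1234.56789
```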