# Last commit not found
def load_file(filename):
    """Read a semicolon-delimited file.

    Returns a ``(header, rows)`` pair: ``header`` is the first line split
    on ``;``; ``rows`` is a list of the remaining non-blank lines, each
    split on ``;``.
    """
    with open(filename, 'r') as f:
        header = f.readline().strip().split(";")
        rows = []
        for raw in f:
            stripped = raw.strip()
            # Skip blank separator lines; keep everything else.
            if stripped:
                rows.append(stripped.split(";"))
        return header, rows
def remove_duplicates(data):
    """Drop rows that repeat an earlier row's identifying fields.

    Two rows are duplicates when columns 0-3 and the last column all
    match; the first occurrence wins and input order is preserved.
    """
    seen = set()
    unique = []
    for row in data:
        fingerprint = (row[0], row[1], row[2], row[3], row[-1])
        if fingerprint not in seen:
            seen.add(fingerprint)
            unique.append(row)
    return unique
def fix_arxiv_links(data):
    """Rewrite arXiv PDF URLs to their abstract-page form.

    The URL lives in the second-to-last column of each row; every
    ``arxiv.org/pdf`` substring becomes ``arxiv.org/abs``. Returns a new
    list of rows; other columns are untouched.
    """
    fixed = []
    for row in data:
        url = row[-2].replace("arxiv.org/pdf", "arxiv.org/abs")
        fixed.append([*row[:-2], url, row[-1]])
    return fixed
def fix_openreview_links(data):
    """Rewrite OpenReview PDF URLs to their forum-page form.

    The URL lives in the second-to-last column of each row; every
    ``openreview.net/pdf`` substring becomes ``openreview.net/forum``.
    Returns a new list of rows; other columns are untouched.
    """
    fixed = []
    for row in data:
        url = row[-2].replace("openreview.net/pdf", "openreview.net/forum")
        fixed.append([*row[:-2], url, row[-1]])
    return fixed
def sort_data(data):
    """Return the rows sorted by columns 0-3 and the final column."""
    def sort_key(row):
        # Same field order as the dedup fingerprint: first four
        # columns, then the last one as a tie-breaker.
        return (row[0], row[1], row[2], row[3], row[-1])
    return sorted(data, key=sort_key)
def main():
    """Normalize ``contamination_report.csv`` in place.

    Pipeline: load the semicolon-delimited report, sort it, drop
    duplicate rows, rewrite arXiv/OpenReview PDF links to their
    abstract/forum forms, then rewrite the file with a blank line
    inserted between groups that share the first two columns.
    """
    header, data = load_file("contamination_report.csv")
    data = sort_data(data)
    data = remove_duplicates(data)
    data = fix_arxiv_links(data)
    data = fix_openreview_links(data)
    print("Total datapoints:", len(data))
    with open("contamination_report.csv", 'w') as f:
        f.write(";".join(header) + "\n")
        past_key = None
        for line in data:
            # Emit a blank separator line whenever the (col0, col1)
            # group changes. Data is already sorted, so each group is
            # contiguous.
            key = (line[0], line[1])
            if key != past_key:
                f.write("\n")
                past_key = key
            # NOTE: removed dead no-op `line = line[:3] + line[3:]`
            # (it rebuilt the row unchanged).
            f.write(";".join(line) + "\n")
# Script entry point: rewrite the report only when run directly.
if __name__ == "__main__":
    main()