import json
import os

import filelock
import huggingface_hub
import pandas as pd

from utils import (
    build_datasets_urls,
    build_models_urls,
    build_text_icon,
    download_favicons,
    get_base_url,
    get_domain_name,
)

HF_ICON = "https://huggingface.co/front/assets/huggingface_logo.svg"
CROSS_ICON = "https://upload.wikimedia.org/wikipedia/commons/4/4e/Cross.png"

DISABLE_ONLINE_CACHE = False
ONLINE_CACHE = "CONDA-Workshop/RequestCache"


def save_cache(cache_data, cache_file, initial_timestamp):
    print(f"Saving cache to {cache_file}")
    # Acquire lock before reading and updating the file to prevent race conditions
    with filelock.FileLock(f"{cache_file}.lock"):
        # Check if the file has been modified since the initial read
        current_timestamp = (
            os.path.getmtime(cache_file) if os.path.exists(cache_file) else None
        )

        if current_timestamp is None or initial_timestamp != current_timestamp:
            # File has been modified or created since the initial read, so re-read it
            try:
                with open(cache_file, "r", encoding="utf8") as f:
                    # Merge entries added by other processes into the current data
                    cache_dict = json.load(f)
                    if cache_dict != cache_data:
                        cache_data.update(cache_dict)
            except FileNotFoundError:
                pass  # If the file doesn't exist at this point, continue with the current dictionary

        # Write the updated dictionary back to the file
        with open(cache_file, "w", encoding="utf8") as f:
            json.dump(cache_data, f, ensure_ascii=False, indent=4)

        if not DISABLE_ONLINE_CACHE:
            try:
                huggingface_hub.upload_file(
                    repo_id=ONLINE_CACHE,
                    repo_type="dataset",
                    token=os.environ.get("TOKEN") or True,
                    path_in_repo=cache_file,
                    path_or_fileobj=cache_file,
                )
            except Exception as e:
                print(f"Unable to upload {cache_file}: {e}")

    return cache_data


def update_favicon_cache(sources):
    # Load the favicon dictionary if it exists
    favicon_dict = {}
    favicon_file_path = "favicons.json"
    initial_timestamp = None

    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=favicon_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download favicons.json: {e}")

    # Attempt to load the favicon dictionary and record its last modification time
    if os.path.exists(favicon_file_path):
        initial_timestamp = os.path.getmtime(favicon_file_path)
        try:
            with open(favicon_file_path, "r", encoding="utf8") as f:
                favicon_dict = json.load(f)
        except FileNotFoundError:
            pass  # File not found, proceed with an empty dictionary

    # Determine which favicons need to be downloaded
    missing_domains = [domain for domain in sources if domain not in favicon_dict]

    # Download missing favicons in batch
    if missing_domains:
        new_favicon_urls = download_favicons(missing_domains)
        favicon_dict.update(new_favicon_urls)
        favicon_dict = save_cache(
            cache_data=favicon_dict,
            cache_file=favicon_file_path,
            initial_timestamp=initial_timestamp,
        )

    return favicon_dict


def update_model_url_cache(models):
    models = [x for x in models if x is not None]
    models = list(set(models))

    # Load the model url dictionary if it exists
    model_url_dict = {}
    model_url_file_path = "model_urls.json"
    initial_timestamp = None

    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=model_url_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download model_urls.json: {e}")

    # Attempt to load the model url dictionary and record its last modification time
    if os.path.exists(model_url_file_path):
        initial_timestamp = os.path.getmtime(model_url_file_path)
        try:
            with open(model_url_file_path, "r", encoding="utf8") as f:
                model_url_dict = json.load(f)
        except FileNotFoundError:
            pass  # File not found, proceed with an empty dictionary

    # Determine which model urls need to be downloaded
    missing_model_urls = [model for model in models if model not in model_url_dict]

    # Download missing model urls in batch
    if missing_model_urls:
        new_model_urls = build_models_urls(missing_model_urls)
        model_url_dict.update(new_model_urls)
        model_url_dict = save_cache(
            cache_data=model_url_dict,
            cache_file=model_url_file_path,
            initial_timestamp=initial_timestamp,
        )

    return model_url_dict


def update_dataset_url_cache(datasets):
    datasets = [x for x in datasets if x is not None]
    datasets = list(set(datasets))

    # Load the dataset url dictionary if it exists
    dataset_url_dict = {}
    dataset_url_file_path = "dataset_urls.json"
    initial_timestamp = None

    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=dataset_url_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download dataset_urls.json: {e}")

    # Attempt to load the dataset url dictionary and record its last modification time
    if os.path.exists(dataset_url_file_path):
        initial_timestamp = os.path.getmtime(dataset_url_file_path)
        try:
            with open(dataset_url_file_path, "r", encoding="utf8") as f:
                dataset_url_dict = json.load(f)
        except FileNotFoundError:
            pass  # File not found, proceed with an empty dictionary

    # Determine which dataset urls need to be downloaded
    missing_dataset_urls = [
        dataset for dataset in datasets if dataset not in dataset_url_dict
    ]

    # Download missing dataset urls in batch
    if missing_dataset_urls:
        new_dataset_urls = build_datasets_urls(missing_dataset_urls)
        dataset_url_dict.update(new_dataset_urls)
        dataset_url_dict = save_cache(
            cache_data=dataset_url_dict,
            cache_file=dataset_url_file_path,
            initial_timestamp=initial_timestamp,
        )

    return dataset_url_dict


def get_dataframe():
    # Load the contamination_report.csv file
    data = pd.read_csv("contamination_report.csv", delimiter=";", header=0)

    # Update the favicon dictionary
    favicon_dict = update_favicon_cache([get_base_url(x) for x in data["Reference"]])

    # Update the model url dictionary
    model_url_dict = update_model_url_cache(
        data[data["Model or corpus"] == "model"]["Contaminated Source"]
    )

    # Update the dataset url dictionary
    dataset_url_dict = update_dataset_url_cache(
        list(data["Evaluation Dataset"])
        + list(data[data["Model or corpus"] == "corpus"]["Contaminated Source"])
    )

    # Render each reference as a domain-name link with its favicon
    data["Reference"] = data["Reference"].apply(
        lambda x: build_text_icon(
            text=get_domain_name(x),
            url=x,
            icon_url=favicon_dict.get(get_base_url(x), ""),
        )
    )

    PR_URL_FORMAT = "https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Report/discussions/{}"

    # Link each entry to its pull request; rows without a PR number get a cross icon
    data["PR"] = data["PR"].apply(
        lambda x: build_text_icon(
            text="",
            url=PR_URL_FORMAT.format(int(x)) if not pd.isna(x) else "no link",
            icon_url=HF_ICON if x == x else CROSS_ICON,  # x == x is False only for NaN
        )
    )

    data["Evaluation Dataset"] = data["Evaluation Dataset"].apply(
        lambda x: build_text_icon(
            text=x,
            url=dataset_url_dict.get(x, ""),
            icon_url=HF_ICON,
        )
    )

    # Append the subset name to the evaluation dataset, then drop the column
    data["Evaluation Dataset"] = data.apply(
        lambda x: x["Evaluation Dataset"] + f" ({x['Subset']})"
        if pd.notna(x["Subset"])
        else x["Evaluation Dataset"],
        axis=1,
    )

    del data["Subset"]

    # For "Contaminated Source", look up the dataset url cache when
    # "Model or corpus" is "corpus" and the model url cache when it is "model"
    data["Contaminated Source"] = data.apply(
        lambda x: build_text_icon(
            text=x["Contaminated Source"] + f" ({x['Version']})"
            if pd.notna(x["Version"])
            else x["Contaminated Source"],
            url=dataset_url_dict.get(x["Contaminated Source"], "")
            if x["Model or corpus"] == "corpus"
            else model_url_dict.get(x["Contaminated Source"], ""),
            icon_url=HF_ICON,
        ),
        axis=1,
    )

    del data["Version"]

    # Convert the contamination percentages to fractions
    data["Train Split"] = data["Train Split"].apply(lambda x: x / 100 if x else x)
    data["Development Split"] = data["Development Split"].apply(
        lambda x: x / 100 if x else x
    )
    data["Test Split"] = data["Test Split"].apply(lambda x: x / 100 if x else x)

    return data