dataset-creator-reddit-uwaterloo

Running

App Files Files Community

dataset-creator-reddit-uwaterloo / utilities /readme_update.py

derek-thomas HF staff

Removing old repo

41daa3d 11 months ago

raw

history blame

3.34 kB

	import os
	from datetime import datetime
	from pathlib import Path
	from shutil import rmtree

	import pytz
	from huggingface_hub import HfApi, Repository

	# How often the job runs (lower-cased for use in README prose); empty string when FREQUENCY is unset.
	frequency = os.getenv("FREQUENCY", "").lower()
	# Text after this marker in the README is the generated section that gets replaced on each update.
	GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"
	# Required setting: indexing os.environ raises KeyError at import time when the variable is missing.
	hf_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
	# Working directory the dataset repository is cloned into.
	local_repo_path = "./readme_repo"


	def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
	    """
	    Update the README file of a specified dataset repository with new information.

	    The repository is cloned into the module-level ``local_repo_path`` (any
	    directory already present there is deleted first), authenticated with the
	    module-level ``hf_token``. The README is rewritten via ``append_to_readme``
	    and the result is pushed back to the Hub.

	    Args:
	        dataset_name (str): Name of the dataset repository.
	        subreddit (str): Name of the subreddit being used for dataset creation.
	        new_rows (int): Number of new rows added in the latest update.
	    """
	    # Remove any clone left over from a previous run before cloning again.
	    if Path(local_repo_path).exists():
	        rmtree(local_repo_path)

	    # Clone the repository locally
	    repo = Repository(local_repo_path, clone_from=dataset_name, repo_type='dataset', use_auth_token=hf_token)

	    readme_path = Path(local_repo_path) / "README.md"

	    # Read and write explicitly as UTF-8: relying on the platform default
	    # encoding can corrupt or fail on non-ASCII README content.
	    old_readme = readme_path.read_text(encoding="utf-8")

	    # Modify the README
	    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)

	    # Write the updated README back to the repository
	    readme_path.write_text(new_readme, encoding="utf-8")

	    # Push the changes
	    repo.push_to_hub(blocking=True, commit_message=f'Pushing {new_rows} new rows')


	def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
	    """
	    Produce README content whose generated section reflects the latest update.

	    If ``old_readme`` already contains ``GENERATED_BELOW_MARKER``, everything
	    after the first occurrence of the marker is dropped and replaced with a
	    freshly rendered section. Otherwise the marker and the rendered section
	    are appended to the end of ``old_readme``.

	    Args:
	        subreddit (str): Name of the subreddit.
	        new_rows (int): Number of new rows added.
	        old_readme (str): Existing README content.

	    Returns:
	        str: Updated README content.
	    """
	    # Current UTC time truncated to the top of the hour, used as the update stamp.
	    hour_start = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
	    latest_hour_str = hour_start.strftime('%Y-%m-%d %H:00:00 %Z%z')

	    readme_text = f"""
	## Dataset Overview
	The goal is to have an open dataset of [r/{subreddit}](https://www.reddit.com/r/{subreddit}/) submissions. I'm leveraging PRAW and the Reddit API to get downloads.

	There is a limit of 1000 in an API call and limited search functionality, so this is run {frequency} to get new submissions.

	## Creation Details
	This dataset was created by [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-{subreddit})

	## Update Frequency
	The dataset is updated {frequency} with the most recent update being `{latest_hour_str}` where we added {new_rows} new rows.

	## Licensing
	[Reddit Licensing terms](https://www.redditinc.com/policies/data-api-terms) as accessed on October 25:
	[License information]

	## Opt-out
	To opt-out of this dataset please make a request in the community tab
	"""

	    # Split once at the first marker; `found` is empty when the marker is absent.
	    kept, found, _ = old_readme.partition(GENERATED_BELOW_MARKER)

	    if not found:
	        # No generated section yet: append the marker followed by the new section.
	        return old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"

	    # Keep everything up to and including the marker, then swap in the new section.
	    return kept + found + "\n\n" + readme_text