import os
from datetime import datetime
from pathlib import Path
from shutil import rmtree

import pytz
from huggingface_hub import HfApi, Repository

GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"

hf_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
local_repo_path = "./readme_repo"


def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
    """
    Update the README file of a specified dataset repository with new information.

    Authentication and the local clone location come from the module-level
    `hf_token` and `local_repo_path`.

    Args:
        dataset_name (str): Name of the dataset repository.
        subreddit (str): Name of the subreddit being used for dataset creation.
        new_rows (int): Number of new rows added in the latest update.
    """
    # Remove any stale local clone from a previous run
    if Path(local_repo_path).exists():
        rmtree(local_repo_path)

    # Clone the dataset repository locally
    repo = Repository(local_repo_path, clone_from=dataset_name, repo_type='dataset', use_auth_token=hf_token)

    # Read the existing README file
    with open(f"{local_repo_path}/README.md", "r") as file:
        old_readme = file.read()

    # Modify the README
    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)

    # Write the updated README back to the repository
    with open(f"{local_repo_path}/README.md", "w") as file:
        file.write(new_readme)

    # Push the changes
    repo.push_to_hub(blocking=True, commit_message=f'Pushing {new_rows} new rows')


def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
    """
    Append new information to the existing README content.

    Args:
        subreddit (str): Name of the subreddit.
        new_rows (int): Number of new rows added.
        old_readme (str): Existing README content.

    Returns:
        str: Updated README content.
    """
    latest_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
    latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')

    readme_text = f"""
## Dataset Overview
This dataset is based on [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-{subreddit})
and adds [nomic-ai/nomic-embed-text-v1](https://huggingface.co/nomic-ai/nomic-embed-text-v1) embeddings based on the
`content` field.

The goal is to provide an automatic, free semantic/neural search tool for any subreddit.

The last run was on {latest_hour_str} and added {new_rows} new rows.

## Creation Details
The embeddings are calculated by [derek-thomas/processing-bestofredditorupdates](https://huggingface.co/spaces/derek-thomas/processing-bestofredditorupdates),
which is triggered by a repository update [webhook](https://huggingface.co/docs/hub/en/webhooks) and also refreshes the [Nomic Atlas](https://docs.nomic.ai)
visualization.

## Update Frequency
The dataset is updated based on a [webhook](https://huggingface.co/docs/hub/en/webhooks) trigger, so each time [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-{subreddit})
is updated, this dataset will be updated as well.

## Opt-out
To opt out of this dataset, please make a request in the Community tab.
"""

    if GENERATED_BELOW_MARKER in old_readme:
        # Replace everything after the marker with the freshly generated text
        index = old_readme.index(GENERATED_BELOW_MARKER) + len(GENERATED_BELOW_MARKER)
        new_readme = old_readme[:index] + "\n\n" + readme_text
    else:
        # First run: append the marker, then the generated text
        new_readme = old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"

    return new_readme
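

# Minimal usage sketch (an assumption, not part of the original script): the target
# dataset id below is hypothetical, and HUGGINGFACE_AUTH_TOKEN must be set in the
# environment before running.
if __name__ == "__main__":
    update_dataset_readme(
        dataset_name="derek-thomas/some-embedding-dataset",  # hypothetical dataset repo id
        subreddit="bestofredditorupdates",
        new_rows=10,
    )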