# Spaces: Runtime error -- appears to be a status banner captured along with this file; not part of the module.
import os | |
from datetime import datetime | |
from pathlib import Path | |
from shutil import rmtree | |
import pytz | |
from huggingface_hub import HfApi, Repository | |
# Sentinel line written into the README: everything after it is regenerated
# on each run, everything before it is left untouched.
GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"

# Token handed to the Hub client when cloning the dataset repository.
# Read eagerly: a missing variable raises KeyError at import time.
hf_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]

# Working directory the dataset repository is cloned into.
local_repo_path = "./readme_repo"
def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
    """
    Update the README file of a specified dataset repository with new information.

    The repository is cloned into the module-level ``local_repo_path`` using the
    module-level ``hf_token``; its ``README.md`` is rewritten via
    ``append_to_readme`` and the change is pushed back with a blocking push.

    Args:
        dataset_name (str): Name of the dataset repository.
        subreddit (str): Name of the subreddit being used for dataset creation.
        new_rows (int): Number of new rows added in the latest update.

    Raises:
        FileNotFoundError: If the cloned repository has no ``README.md``.
    """
    repo_dir = Path(local_repo_path)
    readme_path = repo_dir / "README.md"

    # Remove any leftover directory from a previous run so the clone starts
    # from an empty target.
    if repo_dir.exists():
        rmtree(local_repo_path)

    # Clone the repository locally
    repo = Repository(local_repo_path, clone_from=dataset_name, repo_type='dataset', use_auth_token=hf_token)

    # Read the README file. The encoding is pinned to UTF-8 so the result does
    # not depend on the platform's locale default.
    with open(readme_path, "r", encoding="utf-8") as file:
        old_readme = file.read()

    # Modify the README
    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)

    # Write the updated README back to the repository
    with open(readme_path, "w", encoding="utf-8") as file:
        file.write(new_readme)

    # Push the changes
    repo.push_to_hub(blocking=True, commit_message=f'Pushing {new_rows} new rows')
def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
    """
    Build the README text with a freshly generated section.

    The generated section is placed after ``GENERATED_BELOW_MARKER``. When the
    marker is already present in ``old_readme``, everything following its first
    occurrence is replaced; otherwise the marker and the section are appended
    to the end of the existing text.

    Args:
        subreddit (str): Name of the subreddit, interpolated into the dataset links.
        new_rows (int): Number of new rows reported for the latest run.
        old_readme (str): Existing README content.

    Returns:
        str: Updated README content.
    """
    # Timestamp of the current UTC hour (minutes and below zeroed out).
    run_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
    latest_hour_str = run_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')

    generated_section = f"""
## Dataset Overview
This dataset is based on [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-{subreddit})
and will add [nomic-ai/nomic-embed-text-v1](https://huggingface.co/nomic-ai/nomic-embed-text-v1) embeddings based on the
`content` field.
The goal is to be able to have an automatic and free semantic/neural tool for any subreddit.
The last run was on {latest_hour_str} and updated {new_rows} new rows.
## Creation Details
This is done by triggering [derek-thomas/processing-bestofredditorupdates](https://huggingface.co/spaces/derek-thomas/processing-bestofredditorupdates)
based on a repository update [webhook](https://huggingface.co/docs/hub/en/webhooks) to calculate the embeddings and update the [nomic atlas](https://docs.nomic.ai)
visualization. This is done by this [processing space](https://huggingface.co/spaces/derek-thomas/processing-bestofredditorupdates).
## Update Frequency
The dataset is updated based on a [webhook](https://huggingface.co/docs/hub/en/webhooks) trigger, so each time [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-{subreddit})
is updated, this dataset will be updated.
## Opt-out
To opt-out of this dataset please make a request in the community tab
"""

    marker_start = old_readme.find(GENERATED_BELOW_MARKER)

    # No marker yet: keep the whole README and append marker + section.
    if marker_start == -1:
        return old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + generated_section + "\n"

    # Marker present: keep everything up to and including it, drop the rest.
    marker_end = marker_start + len(GENERATED_BELOW_MARKER)
    return old_readme[:marker_end] + "\n\n" + generated_section