derek-thomas's picture
derek-thomas HF staff
Update readme to not use Repository
9188762
raw
history blame
3.23 kB
import os
from datetime import datetime
from pathlib import Path
from shutil import rmtree
import pytz
from huggingface_hub import HfApi, Repository
frequency = os.environ.get("FREQUENCY", '').lower()
GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"
username = os.environ["USERNAME"]
hf_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
local_repo_path = "./readme_repo"
def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
"""
Update the README file of a specified dataset repository with new information.
Args:
dataset_name (str): Name of the dataset repository.
subreddit (str): Name of the subreddit being used for dataset creation.
new_rows (int): Number of new rows added in the latest update.
hf_token (str): Hugging Face authentication token.
local_repo_path (str): Local path to clone the repository.
"""
# Initialize HfApi
api = HfApi()
# Download README file
readme_path = api.hf_hub_download(repo_id=dataset_name, repo_type="dataset", filename="README.md")
# Read it
with open(readme_path, "r") as file:
old_readme = file.read()
# Modify it
new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)
# Commit modifications
api.upload_file(
path_or_fileobj=new_readme.encode(),
path_in_repo="README.md",
repo_id=dataset_name,
repo_type="dataset",
commit_message=f'Pushing {new_rows} new rows'
)
def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
"""
Append new information to the existing README content.
Args:
subreddit (str): Name of the subreddit.
new_rows (int): Number of new rows added.
old_readme (str): Existing README content.
Returns:
str: Updated README content.
"""
latest_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')
readme_text = f"""
## Dataset Overview
The goal is to have an open dataset of [r/{subreddit}](https://www.reddit.com/r/{subreddit}/) submissions. I'm leveraging PRAW and the Reddit API to get downloads.
There is a limit of 1000 in an API call and limited search functionality, so this is run {frequency} to get new submissions.
## Creation Details
This dataset was created by [{username}/dataset-creator-reddit-{subreddit}](https://huggingface.co/spaces/{username}/dataset-creator-reddit-{subreddit})
## Update Frequency
The dataset is updated {frequency} with the most recent update being `{latest_hour_str}` where we added **{new_rows} new rows**.
## Licensing
[Reddit Licensing terms](https://www.redditinc.com/policies/data-api-terms) as accessed on October 25:
[License information]
## Opt-out
To opt-out of this dataset please make a request in the community tab
"""
if GENERATED_BELOW_MARKER in old_readme:
index = old_readme.index(GENERATED_BELOW_MARKER) + len(GENERATED_BELOW_MARKER)
new_readme = old_readme[:index] + "\n\n" + readme_text
else:
new_readme = old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"
return new_readme