import os
from datetime import datetime
from pathlib import Path
from shutil import rmtree

import pytz
from huggingface_hub import Repository

frequency = os.environ.get("FREQUENCY", '').lower()
GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"
hf_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
local_repo_path = "./readme_repo"


def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
    """
    Update the README file of a specified dataset repository with new information.

    Args:
        dataset_name (str): Name of the dataset repository.
        subreddit (str): Name of the subreddit being used for dataset creation.
        new_rows (int): Number of new rows added in the latest update.

    Uses the module-level ``hf_token`` (Hugging Face authentication token) and
    ``local_repo_path`` (local path to clone the repository into).
    """
    # Start from a fresh clone so stale local state never leaks into the push.
    if Path(local_repo_path).exists():
        rmtree(local_repo_path)

    repo = Repository(local_repo_path, clone_from=dataset_name, repo_type='dataset', use_auth_token=hf_token)

    with open(f"{local_repo_path}/README.md", "r") as file:
        old_readme = file.read()

    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)

    with open(f"{local_repo_path}/README.md", "w") as file:
        file.write(new_readme)

    # blocking=True waits until the commit has landed on the Hub before returning.
    repo.push_to_hub(blocking=True, commit_message=f'Pushing {new_rows} new rows')


def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
    """
    Append new information to the existing README content.

    Args:
        subreddit (str): Name of the subreddit.
        new_rows (int): Number of new rows added.
        old_readme (str): Existing README content.

    Returns:
        str: Updated README content.
    """
    # Truncate to the top of the current UTC hour for a stable, readable timestamp.
    latest_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
    latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')

    readme_text = f"""
## Dataset Overview
The goal is to have an open dataset of [r/{subreddit}](https://www.reddit.com/r/{subreddit}/) submissions. I'm leveraging PRAW and the Reddit API to download them.

Each API call is limited to 1000 results and search functionality is limited, so this runs {frequency} to pick up new submissions.

## Creation Details
This dataset was created by [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-{subreddit})

## Update Frequency
The dataset is updated {frequency} with the most recent update being `{latest_hour_str}` where we added **{new_rows} new rows**.

## Licensing
[Reddit Licensing terms](https://www.redditinc.com/policies/data-api-terms) as accessed on October 25:
[License information]

## Opt-out
To opt out of this dataset, please make a request in the community tab.
"""

    # Regenerate everything after the marker if it already exists; otherwise
    # append the marker and the generated section to the end of the README.
    if GENERATED_BELOW_MARKER in old_readme:
        index = old_readme.index(GENERATED_BELOW_MARKER) + len(GENERATED_BELOW_MARKER)
        new_readme = old_readme[:index] + "\n\n" + readme_text
    else:
        new_readme = old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"

    return new_readme
|
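
# Minimal usage sketch (an assumption, not part of the original script): run the
# module directly with FREQUENCY and HUGGINGFACE_AUTH_TOKEN set in the environment.
# The repo id and subreddit below are hypothetical placeholders.
if __name__ == "__main__":
    update_dataset_readme(
        dataset_name="derek-thomas/dataset-creator-reddit-example",  # hypothetical repo id
        subreddit="example",  # hypothetical subreddit
        new_rows=100,
    )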