import os
from datetime import datetime
from pathlib import Path
from shutil import rmtree

import pytz
from huggingface_hub import HfApi, Repository

# How often the dataset is refreshed (e.g. "hourly"), read from the environment
frequency = os.environ.get("FREQUENCY", '').lower()

# Everything after this marker in the README is regenerated on each update
GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"

hf_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
local_repo_path = "./readme_repo"

def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
    """
    Update the README file of a specified dataset repository with new information.

    The Hugging Face token (hf_token) and the local clone path (local_repo_path)
    are read from module-level configuration rather than passed as arguments.

    Args:
        dataset_name (str): Name of the dataset repository.
        subreddit (str): Name of the subreddit being used for dataset creation.
        new_rows (int): Number of new rows added in the latest update.
    """
    # Initialize HfApi
    api = HfApi()

    # Remove any stale local clone before re-cloning
    if Path(local_repo_path).exists():
        rmtree(local_repo_path)

    # Clone the repository locally
    repo = Repository(local_repo_path, clone_from=dataset_name, repo_type='dataset', use_auth_token=hf_token)

    # Read the README file
    with open(f"{local_repo_path}/README.md", "r") as file:
        old_readme = file.read()

    # Modify the README
    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)

    # Write the updated README back to the repository
    with open(f"{local_repo_path}/README.md", "w") as file:
        file.write(new_readme)

    # Push the changes
    repo.push_to_hub(blocking=True, commit_message=f'Pushing {new_rows} new rows')
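
# Illustrative note (not part of the original logic): the README is treated as two parts,
# a hand-written section followed by a generated section, separated by
# GENERATED_BELOW_MARKER. append_to_readme() below keeps everything up to and including
# the marker and rewrites whatever follows it, so an updated README roughly looks like:
#
#   <manually maintained description>
#   --- Generated Part of README Below ---
#   <regenerated overview, update time, licensing, and opt-out sections>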

def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
    """
    Append new information to the existing README content.

    Args:
        subreddit (str): Name of the subreddit.
        new_rows (int): Number of new rows added.
        old_readme (str): Existing README content.

    Returns:
        str: Updated README content.
    """
    latest_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
    latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')

    readme_text = f"""
## Dataset Overview
The goal is to have an open dataset of [r/{subreddit}](https://www.reddit.com/r/{subreddit}/) submissions. I'm using PRAW and the Reddit API to download them.

The API limits each call to 1000 submissions and offers limited search functionality, so this is run {frequency} to pick up new submissions.

## Creation Details
This dataset was created by [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-{subreddit})

## Update Frequency
The dataset is updated {frequency}, with the most recent update at `{latest_hour_str}`, when **{new_rows} new rows** were added.

## Licensing
[Reddit Licensing terms](https://www.redditinc.com/policies/data-api-terms) as accessed on October 25:
[License information]

## Opt-out
To opt out of this dataset, please make a request in the community tab
"""

    if GENERATED_BELOW_MARKER in old_readme:
        # Keep everything up to and including the marker, then replace the generated part
        index = old_readme.index(GENERATED_BELOW_MARKER) + len(GENERATED_BELOW_MARKER)
        new_readme = old_readme[:index] + "\n\n" + readme_text
    else:
        # No marker yet: append the marker and the generated part to the existing README
        new_readme = old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"

    return new_readme
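

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal, hypothetical demonstration of the marker-splicing behaviour of
# append_to_readme(). It assumes HUGGINGFACE_AUTH_TOKEN is set in the environment
# (the module reads it at import time) and uses a made-up subreddit name. It does
# not touch the Hub, since append_to_readme() is pure string manipulation.
if __name__ == "__main__":
    sample_readme = (
        "# My Reddit dataset\n"
        "Hand-written description that should be preserved.\n\n"
        f"{GENERATED_BELOW_MARKER}\n\n"
        "Old generated content that should be replaced.\n"
    )
    updated = append_to_readme(subreddit="example_subreddit", new_rows=123, old_readme=sample_readme)
    print(updated)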