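"""Update a Hugging Face dataset README with details of the latest data pull.

Clones the dataset repository locally, regenerates everything below
GENERATED_BELOW_MARKER, and pushes the updated README back to the Hub.
"""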
import os
from datetime import datetime
from pathlib import Path
from shutil import rmtree

import pytz
from huggingface_hub import Repository

frequency = os.environ.get("FREQUENCY", "").lower()  # e.g. "daily" or "hourly"; interpolated into the generated README
GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"
hf_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
local_repo_path = "./readme_repo"


def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
    """
    Update the README file of a specified dataset repository with new information.

    Args:
        dataset_name (str): Name of the dataset repository.
        subreddit (str): Name of the subreddit being used for dataset creation.
        new_rows (int): Number of new rows added in the latest update.

    The Hugging Face token and local clone path come from the module-level
    `hf_token` and `local_repo_path` values.
    """

    # Start from a fresh clone on every run
    if Path(local_repo_path).exists():
        rmtree(local_repo_path)

    # Clone the repository locally
    repo = Repository(local_repo_path, clone_from=dataset_name, repo_type='dataset', use_auth_token=hf_token)

    # Read the README file
    with open(f"{local_repo_path}/README.md", "r", encoding="utf-8") as file:
        old_readme = file.read()

    # Modify the README
    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)

    # Write the updated README back to the repository
    with open(f"{local_repo_path}/README.md", "w", encoding="utf-8") as file:
        file.write(new_readme)

    # Push the changes
    repo.push_to_hub(blocking=True, commit_message=f'Pushing {new_rows} new rows')
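
# Example invocation (hypothetical repo id and values; needs network access and
# a valid HUGGINGFACE_AUTH_TOKEN):
#   update_dataset_readme(
#       dataset_name="your-user/reddit-askreddit",  # hypothetical dataset repo id
#       subreddit="askreddit",
#       new_rows=42,
#   )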


def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
    """
    Append new information to the existing README content.

    Args:
        subreddit (str): Name of the subreddit.
        new_rows (int): Number of new rows added.
        old_readme (str): Existing README content.

    Returns:
        str: Updated README content.
    """
    latest_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
    latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')

    readme_text = f"""
## Dataset Overview
The goal is to have an open dataset of [r/{subreddit}](https://www.reddit.com/r/{subreddit}/) submissions. I'm leveraging PRAW and the Reddit API to download the submissions.

The Reddit API caps each call at 1000 results and offers only limited search functionality, so this job runs {frequency} to pick up new submissions.

## Creation Details
This dataset was created by [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-{subreddit})

## Update Frequency
The dataset is updated {frequency} with the most recent update being `{latest_hour_str}` where we added **{new_rows} new rows**.

## Licensing
[Reddit Licensing terms](https://www.redditinc.com/policies/data-api-terms) as accessed on October 25:
[License information]

## Opt-out
To opt out of this dataset, please make a request in the Community tab.
"""

    if GENERATED_BELOW_MARKER in old_readme:
        index = old_readme.index(GENERATED_BELOW_MARKER) + len(GENERATED_BELOW_MARKER)
        new_readme = old_readme[:index] + "\n\n" + readme_text
    else:
        new_readme = old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"

    return new_readme
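

if __name__ == "__main__":
    # Quick local sanity check of append_to_readme with made-up example values.
    # No network access is needed, though HUGGINGFACE_AUTH_TOKEN must be set
    # since it is read at import time above.
    sample_readme = "# Sample Dataset\n\nHand-written intro that must be preserved."
    updated = append_to_readme(subreddit="askreddit", new_rows=42, old_readme=sample_readme)
    assert GENERATED_BELOW_MARKER in updated
    # A second run replaces the generated section instead of appending again.
    updated = append_to_readme(subreddit="askreddit", new_rows=7, old_readme=updated)
    assert updated.count(GENERATED_BELOW_MARKER) == 1
    print(updated)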