dataset-creator-reddit-uwaterloo

Running

File size: 5,268 Bytes

8e8a9fc
 
 
749d1d8
5d9e0b8
fceefe7
749d1d8
 
 
fceefe7
 
8e8a9fc
 
fceefe7
 
 
8e8a9fc
fceefe7
 
5d9e0b8
 
c1f39f8
fceefe7
 
 
749d1d8
 
 
e014498
749d1d8
e6a15ab
749d1d8
52bca1a
 
 
d8d1956
52bca1a
 
 
 
749d1d8
5d9e0b8
749d1d8
 
5d9e0b8
 
 
 
 
 
 
 
 
 
 
e6a15ab
 
 
5d9e0b8
 
 
 
 
 
 
749d1d8
5d9e0b8
fc00c85
5d9e0b8
 
 
fceefe7
 
8e8a9fc
24c9f40
8e8a9fc
749d1d8
5d9e0b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fceefe7
5d9e0b8
 
9a66c2f
 
 
fceefe7
 
8f11653
 
 
 
fc00c85
5d9e0b8
 
 
 
075c34d
 
 
 
 
 
 
f0e56b8
fceefe7
 
962f45f
fceefe7
 
 
 
749d1d8
 
f3c4357
fceefe7

import os
from pathlib import Path

import gradio as gr
from bs4 import BeautifulSoup
from huggingface_hub import WebhookPayload, WebhooksServer
from rich.console import Console
from rich.syntax import Syntax

from utilities.my_logger import setup_logger

# Directory that contains this file; used to locate bundled assets (e.g. media/).
# NOTE: this must be built from __file__ -- Path(__name__) would be a path made
# from the module *name* ("__main__"), whose parent is always ".", i.e. the
# current working directory rather than the project directory.
proj_dir = Path(__file__).resolve().parent

# Required settings: a missing variable raises KeyError at import time.
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
DATASET_NAME = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"

# Update cadence; compared case-insensitively and validated eagerly so a
# misconfiguration is reported at startup.
FREQUENCY = os.environ.get("FREQUENCY", '').lower()
if FREQUENCY not in ("daily", "hourly"):
    raise gr.Error("FREQUENCY environment variable must be 'daily' or 'hourly'")

# NOTE(review): falls back to the well-known value 'secret' when
# HF_WEBHOOK_SECRET is unset -- make sure it is set in any real deployment.
WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')

logger = setup_logger(__name__)


def log_file_to_html_string():
    """Render the tail of the log file as a styled, scrollable HTML document.

    The last ``num_lines_visualize`` lines of ``mylog.log`` are highlighted
    with rich (Python lexer, Monokai theme) and exported to HTML. The exported
    ``<pre>`` tag then has its inline style replaced by a ``scrollable`` CSS
    class, whose rules are appended to the document's ``<style>`` tag.

    A log file that does not exist yet is rendered as an empty log instead of
    raising, since this function is polled repeatedly by the UI.

    Returns:
        str: The prettified HTML document.
    """
    from collections import deque

    log_file = "mylog.log"
    num_lines_visualize = 50

    try:
        # Undecodable bytes are replaced rather than raising, so one bad
        # character in the log cannot break the log view.
        with open(log_file, "rt", encoding="utf-8", errors="replace") as f:
            # A bounded deque keeps only the last `num_lines_visualize` lines
            # while streaming, so the whole file is never held in memory.
            tail = deque(f, maxlen=num_lines_visualize)
    except FileNotFoundError:
        tail = ()

    # Syntax-highlight the tail using the Python lexer and Monokai style
    output = "".join(tail)
    syntax = Syntax(output, "python", theme="monokai", word_wrap=True)

    console = Console(record=True, width=150, style="#272822")
    console.print(syntax)
    html_content = console.export_html(inline_styles=True)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'lxml')

    # Swap the <pre> tag's inline style for the .scrollable class
    pre_tag = soup.pre
    pre_tag['class'] = 'scrollable'
    del pre_tag['style']

    # Add the custom styles and the .scrollable CSS to the <style> tag
    style_tag = soup.style
    style_content = """
pre, code {
    background-color: #272822;
}
    .scrollable {
        font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace;
        height: 500px;
        overflow: auto;
    }
    """
    style_tag.append(style_content)

    return soup.prettify()


# Markdown shown at the top of the "Application" tab; links to the target
# dataset (DATASET_NAME) and the source subreddit (SUBREDDIT).
intro_md = f"""
# Reddit Dataset Creator
This is a reddit dataset creator which builds and updates [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}) 
which pulls from [/r/{SUBREDDIT}](http://www.reddit.com/r/{SUBREDDIT}). Check the dataset for more details. 

As shown in the below diagram this space pulls data from reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and puts it in a corresponding dataset.
"""

# Markdown for the "How to Create?" tab. This is a plain string: the text has
# no placeholders, so no f-string interpolation is needed.
how_to_md = """
# How to make your own space and dataset
1. Create a [reddit application](https://www.reddit.com/prefs/apps), use 'Script for personal use'
    - Redirect URI can be anything, I use 'http://www.example.com/unused/redirect/uri'
    - You need the `secret` and the `Client ID` from the reddit application.
    - `REDDIT_USER_AGENT` can be any descriptive string, probably any undescriptive string too.
2. Get your writable [huggingface token](https://huggingface.co/settings/tokens)
3. <a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-amitheasshole?duplicate=true"><img style="margin-top:0;margin-bottom:0" src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg" alt="Duplicate Space"></a>
and fill in the information
"""

# Markdown for the "How does it work?" tab. This is a plain string: the text
# has no placeholders, so no f-string interpolation is needed.
how_does_it_work_md = """
# Core Components
There are 2 core components [main](main.py) and [app](app.py).
Main does a few things: 
- Pulls from a datasource 
- Updates a dataset on the hub
- Updates the README of the dataset
- Writes a local log file (inaccessible outside the spaces container)

App
- Visualizes the log file from Main

# Running it
This uses a docker space so that I can execute supervisor. Supervisor allows me to kick off 2 processes and manage the
log files. I use gradio for `app` and map that to the open port of huggingface spaces. 

The only communication between `app` and `main` is the log file.
"""

# Build the Gradio UI. Components are created in order inside the nested
# context managers, so statement order here determines the page layout.
with gr.Blocks() as ui:
    # Main tab: intro text, architecture diagram and the log view.
    with gr.Tab("Application"):
        gr.Markdown(intro_md)
        gr.Image(proj_dir / 'media' / 'reddit_scraper.drawio.png')
        gr.Markdown("# Logs")
        # The value is a callable combined with `every=1`; presumably Gradio
        # re-invokes it every second to refresh the log view -- confirm against
        # the installed Gradio version.
        output = gr.HTML(log_file_to_html_string, every=1)
        # JS-only load hook (no Python fn): toggles the 'dark' class on <body>
        # and sets the gradio-app background colour.
        # NOTE(review): the `_js` keyword is specific to some Gradio versions
        # -- confirm it matches the pinned version.
        ui.load(None,
                _js="""
        () => {
            document.body.classList.toggle('dark');
            document.querySelector('gradio-app').style.backgroundColor = 'var(--color-background-primary)'
        }
        """, )
    with gr.Tab("How to Create?"):
        gr.Markdown(how_to_md)
    with gr.Tab("How does it work?"):
        gr.Markdown(how_does_it_work_md)
    # Extra tab with an input box, a "Health" button and an output box; no
    # event handlers are attached to these components in this file.
    with gr.Tab("Hidden"):
        with gr.Column():
            input_text = gr.Textbox(label="Input Text")
            health_btn = gr.Button(value="Health")
        with gr.Column():
            output_text = gr.Textbox(label="Output Text")

# Wrap the (queued) UI in a webhook server so webhook routes can be registered
# on `app`; requests are checked against WEBHOOK_SECRET.
app = WebhooksServer(ui=ui.queue(), webhook_secret=WEBHOOK_SECRET)


@app.add_webhook("/dataset_repo")
async def community(payload: WebhookPayload):
    """Log repo-scoped webhook events received on the ``/dataset_repo`` route.

    Events whose scope does not start with ``"repo"`` are ignored.
    """
    event = payload.event
    if not event.scope.startswith("repo"):
        return

    logger.info(f"Webhook received from {DATASET_NAME} indicating a repo {event.action}")


if __name__ == "__main__":
    # Launch the webhook-enabled app (not the bare Gradio UI) on all
    # interfaces at port 7860.
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )