|
import os |
|
from pathlib import Path |
|
|
|
import gradio as gr |
|
from bs4 import BeautifulSoup |
|
from huggingface_hub import WebhookPayload, WebhooksServer |
|
from rich.console import Console |
|
from rich.syntax import Syntax |
|
|
|
from utilities.my_logger import setup_logger |
|
|
|
# Resolve the project root from this file's location so relative asset paths
# (e.g. media/reddit_scraper.drawio.png below) work regardless of the current
# working directory.
# FIX: original used Path(__name__), which yields a path relative to the CWD
# (e.g. Path('app').parent == Path('.')) — Path(__file__) is the intended form.
proj_dir = Path(__file__).parent

# Required configuration: fail fast at import time if these are missing.
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
DATASET_NAME = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"

# How often the scraper runs; normalized to lowercase before validation.
FREQUENCY = os.environ.get("FREQUENCY", '').lower()
if FREQUENCY not in ["daily", "hourly"]:
    raise gr.Error("FREQUENCY environment variable must be 'daily' or 'hourly'")

# Secret used to authenticate incoming Hugging Face webhook calls.
# 'secret' is a development fallback only — set HF_WEBHOOK_SECRET in production.
WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')

logger = setup_logger(__name__)
|
|
|
|
|
def log_file_to_html_string(log_file: str = "mylog.log", num_lines_visualize: int = 50) -> str:
    """Render the tail of a log file as scrollable, syntax-highlighted HTML.

    Reads the last ``num_lines_visualize`` lines of ``log_file``, highlights
    them with rich, and post-processes rich's exported HTML so the output fits
    a dark-themed Gradio page.

    Args:
        log_file: Path of the log file written by the scraper process.
        num_lines_visualize: Number of trailing lines to display.

    Returns:
        Prettified HTML suitable for a ``gr.HTML`` component.
    """
    console = Console(record=True, width=150, style="#272822")

    # The log file is produced by a separate process (main.py via supervisor);
    # it may not exist yet right after startup, so degrade gracefully instead
    # of crashing the every-second polling UI.
    try:
        with open(log_file, "rt", encoding="utf-8") as f:
            lines = f.readlines()[-num_lines_visualize:]
    except FileNotFoundError:
        lines = ["Waiting for log file...\n"]

    output = "".join(lines)
    syntax = Syntax(output, "python", theme="monokai", word_wrap=True)
    console.print(syntax)
    html_content = console.export_html(inline_styles=True)

    soup = BeautifulSoup(html_content, 'lxml')

    # Swap rich's inline style on the <pre> block for our scrollable CSS class.
    pre_tag = soup.pre
    pre_tag['class'] = 'scrollable'
    del pre_tag['style']

    # Append our own rules to the <style> tag rich emitted in the export.
    style_tag = soup.style
    style_content = """
    pre, code {
        background-color: #272822;
    }
    .scrollable {
        font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace;
        height: 500px;
        overflow: auto;
    }
    """
    style_tag.append(style_content)

    return soup.prettify()
|
|
|
|
|
# Markdown for the main "Application" tab; interpolates the dataset name and
# subreddit that were read from environment variables at module load.
intro_md = f"""
# Reddit Dataset Creator
This is a reddit dataset creator which builds and updates [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME})
which pulls from [/r/{SUBREDDIT}](http://www.reddit.com/r/{SUBREDDIT}). Check the dataset for more details.

As shown in the below diagram this space pulls data from reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and puts it in a corresponding dataset.
"""
|
|
|
# Markdown for the "How to Create?" tab.
# FIX: dropped the extraneous f-prefix — the literal contains no placeholders,
# and a stray f-string would silently interpolate any braces added later.
how_to_md = """
# How to make your own space and dataset
1. Create a [reddit application](https://www.reddit.com/prefs/apps), use 'Script for personal use'
    - Redirect URI can be anything, I use 'http://www.example.com/unused/redirect/uri'
    - You need the `secret` and the `Client ID` from the reddit application.
    - `REDDIT_USER_AGENT` can be any descriptive string, probably any undescriptive string too.
2. Get your writable [huggingface token](https://huggingface.co/settings/tokens)
3. <a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-amitheasshole?duplicate=true"><img style="margin-top:0;margin-bottom:0" src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg" alt="Duplicate Space"></a>
and fill in the information
"""
|
|
|
# Markdown for the "How does it work?" tab.
# FIX: dropped the extraneous f-prefix — the literal contains no placeholders,
# and a stray f-string would silently interpolate any braces added later.
how_does_it_work_md = """
# Core Components
There are 2 core components [main](main.py) and [app](app.py).
Main does a few things:
- Pulls from a datasource
- Updates a dataset on the hub
- Updates the README of the dataset
- Writes a local log file (inaccessible outside the spaces container)

App
- Visualizes the log file from Main

# Running it
This uses a docker space so that I can execute supervisor. Supervisor allows me to kick off 2 processes and manage the
log files. I use gradio for `app` and map that to the open port of huggingface spaces.

The only communication between `app` and `main` is the log file.
"""
|
|
|
# Assemble the Gradio UI: the first tab shows the live log viewer; the other
# tabs are static documentation rendered from the markdown constants above.
with gr.Blocks() as ui:
    with gr.Tab("Application"):
        gr.Markdown(intro_md)
        gr.Image(proj_dir / 'media' / 'reddit_scraper.drawio.png')
        gr.Markdown("# Logs")
        # Re-invokes log_file_to_html_string every second to refresh the view.
        output = gr.HTML(log_file_to_html_string, every=1)
        # On page load, force dark mode and match the app background to the
        # dark log panel.
        ui.load(None,
                _js="""
                () => {
                document.body.classList.toggle('dark');
                document.querySelector('gradio-app').style.backgroundColor = 'var(--color-background-primary)'
                }
                """, )
    with gr.Tab("How to Create?"):
        gr.Markdown(how_to_md)
    with gr.Tab("How does it work?"):
        gr.Markdown(how_does_it_work_md)
    # NOTE(review): these components are not wired to any event handler in this
    # file — presumably a placeholder/debug tab; confirm before removing.
    with gr.Tab("Hidden"):
        with gr.Column():
            input_text = gr.Textbox(label="Input Text")
            health_btn = gr.Button(value="Health")
        with gr.Column():
            output_text = gr.Textbox(label="Output Text")

# Wrap the queued Gradio app in a WebhooksServer so the same process serves
# both the UI and the /dataset_repo webhook endpoint below.
app = WebhooksServer(ui=ui.queue(), webhook_secret=WEBHOOK_SECRET)
|
|
|
|
|
@app.add_webhook("/dataset_repo")
async def community(payload: WebhookPayload):
    """Receive Hugging Face webhook calls for the dataset repository.

    Logs repo-scoped events (scope starting with "repo"); any other scope is
    ignored.
    """
    if payload.event.scope.startswith("repo"):
        # Lazy %-args logging: the message is only formatted if INFO is enabled.
        logger.info("Webhook received from %s indicating a repo %s",
                    DATASET_NAME, payload.event.action)
|
|
|
|
|
if __name__ == '__main__':
    # Bind on all interfaces at port 7860, the port Hugging Face Spaces exposes.
    app.launch(server_name="0.0.0.0", show_error=True, server_port=7860)
|
|
|
|