import os
from pathlib import Path

import gradio as gr
from bs4 import BeautifulSoup
from huggingface_hub import WebhookPayload, WebhooksServer
from rich.console import Console
from rich.syntax import Syntax

from utilities.my_logger import setup_logger
proj_dir = Path(__file__).parent  # directory containing this file
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
DATASET_NAME = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"

FREQUENCY = os.environ.get("FREQUENCY", "").lower()
if FREQUENCY not in ["daily", "hourly"]:
    raise gr.Error("FREQUENCY environment variable must be 'daily' or 'hourly'")

WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", "secret")

logger = setup_logger(__name__)
def log_file_to_html_string():
    """Return the tail of the local log file as syntax-highlighted, scrollable HTML."""
    log_file = "mylog.log"
    num_lines_visualize = 50

    console = Console(record=True, width=150, style="#272822")
    with open(log_file, "rt") as f:
        # Read the file and keep only the last `num_lines_visualize` lines
        lines = f.readlines()
        lines = lines[-num_lines_visualize:]

    # Syntax-highlight the tail of the log using the Python lexer and Monokai theme
    output = "".join(lines)
    syntax = Syntax(output, "python", theme="monokai", word_wrap=True)
    console.print(syntax)
    html_content = console.export_html(inline_styles=True)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, "lxml")

    # Modify the <pre> tag: scroll via the .scrollable class instead of inline styles
    pre_tag = soup.pre
    pre_tag["class"] = "scrollable"
    del pre_tag["style"]

    # Add the custom styles and the .scrollable CSS to the <style> tag
    style_tag = soup.style
    style_content = """
    pre, code {
        background-color: #272822;
    }
    .scrollable {
        font-family: Menlo, 'DejaVu Sans Mono', consolas, 'Courier New', monospace;
        height: 500px;
        overflow: auto;
    }
    """
    style_tag.append(style_content)

    return soup.prettify()
intro_md = f"""
# Reddit Dataset Creator
This is a Reddit dataset creator that builds and updates [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME})
by pulling from [/r/{SUBREDDIT}](http://www.reddit.com/r/{SUBREDDIT}). Check the dataset for more details.

As shown in the diagram below, this Space pulls data from Reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and pushes it to the corresponding dataset.
"""
how_to_md = f"""
# How to make your own space and dataset
1. Create a [reddit application](https://www.reddit.com/prefs/apps) and choose 'Script for personal use'
    - The redirect URI can be anything; I use 'http://www.example.com/unused/redirect/uri'
    - You need the `secret` and the `Client ID` from the reddit application
    - `REDDIT_USER_AGENT` can be any descriptive string (probably any undescriptive string too)
2. Get a writable [Hugging Face token](https://huggingface.co/settings/tokens)
3. <a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-amitheasshole?duplicate=true"><img style="margin-top:0;margin-bottom:0" src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg" alt="Duplicate Space"></a>
   and fill in the information
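
For reference, here is a sketch of the secrets/environment variables involved. The first four are read by this app (see the top of `app.py`); of the Reddit credentials, only `REDDIT_USER_AGENT` is a confirmed name, so match the others to what `main.py` actually reads:

```bash
SUBREDDIT=askreddit           # subreddit to pull from
USERNAME=your-hf-username     # Hugging Face account that owns the dataset
FREQUENCY=daily               # 'daily' or 'hourly'
HF_WEBHOOK_SECRET=secret      # shared secret for the webhook endpoint

REDDIT_USER_AGENT=my-dataset-creator  # any descriptive string
CLIENT_ID=...                 # illustrative name; from your reddit application
CLIENT_SECRET=...             # illustrative name; from your reddit application
```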
"""
how_does_it_work_md = f"""
# Core Components
There are two core components: [main](main.py) and [app](app.py).

Main does a few things:
- Pulls from a data source
- Updates a dataset on the hub
- Updates the README of the dataset
- Writes a local log file (inaccessible outside the Spaces container)

App:
- Visualizes the log file from Main

# Running it
This uses a Docker Space so that I can run supervisor. Supervisor allows me to kick off two processes and manage their
log files. I use Gradio for `app` and map it to the open port of Hugging Face Spaces.

The only communication between `app` and `main` is the log file; a sketch of such a supervisor setup is shown below.
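
For illustration, a minimal `supervisord.conf` along these lines could run both processes (the paths and filenames here are assumptions, not the exact config in this repo):

```ini
[supervisord]
nodaemon=true

; the scraper / dataset updater
[program:main]
command=python main.py
redirect_stderr=true

; this Gradio UI
[program:app]
command=python app.py
redirect_stderr=true
```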
"""
with gr.Blocks() as ui:
    with gr.Tab("Application"):
        gr.Markdown(intro_md)
        gr.Image(proj_dir / 'media' / 'reddit_scraper.drawio.png')
        gr.Markdown("# Logs")
        output = gr.HTML(log_file_to_html_string, every=1)
        # Switch the page to dark mode and match the app background to the theme
        ui.load(None,
                _js="""
                () => {
                    document.body.classList.toggle('dark');
                    document.querySelector('gradio-app').style.backgroundColor = 'var(--color-background-primary)';
                }
                """, )
    with gr.Tab("How to Create?"):
        gr.Markdown(how_to_md)
    with gr.Tab("How does it work?"):
        gr.Markdown(how_does_it_work_md)
    with gr.Tab("Hidden"):
        with gr.Column():
            input_text = gr.Textbox(label="Input Text")
            health_btn = gr.Button(value="Health")
        with gr.Column():
            output_text = gr.Textbox(label="Output Text")
app = WebhooksServer(ui=ui.queue(), webhook_secret=WEBHOOK_SECRET)
@app.add_webhook("/dataset_repo")
async def community(payload: WebhookPayload):
    # Log repo-level events (e.g. updates) that the Hub sends for the dataset
    if payload.event.scope.startswith("repo"):
        logger.info(f"Webhook received from {DATASET_NAME} indicating a repo {payload.event.action}")
if __name__ == '__main__':
    app.launch(server_name="0.0.0.0", show_error=True, server_port=7860)
    # ui.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860)