dataset-creator-reddit-uwaterloo

Running

App Files Files Community

dataset-creator-reddit-uwaterloo / app.py

derek-thomas HF staff

Upgrading huggingface-hub to work with webhooks

962f45f 10 months ago

raw

history blame

5.27 kB

	import os
	from pathlib import Path

	import gradio as gr
	from bs4 import BeautifulSoup
	from huggingface_hub import WebhookPayload, WebhooksServer
	from rich.console import Console
	from rich.syntax import Syntax

	from utilities.my_logger import setup_logger

	# Directory containing this script, used to locate bundled assets.
	# `__file__` is the module's path on disk; `__name__` is only the module's
	# name (e.g. "__main__"), whose "parent" is always "." and therefore
	# silently depends on the current working directory.
	proj_dir = Path(__file__).parent

	# Required settings: a missing variable raises KeyError at import time.
	SUBREDDIT = os.environ["SUBREDDIT"]
	USERNAME = os.environ["USERNAME"]
	DATASET_NAME = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"

	# FREQUENCY is compared case-insensitively; anything other than the two
	# supported values (including an unset variable) aborts start-up.
	FREQUENCY = os.environ.get("FREQUENCY", '').lower()
	if FREQUENCY not in ("daily", "hourly"):
	    raise gr.Error("FREQUENCY environment variable must be 'daily' or 'hourly'")

	# NOTE(review): when HF_WEBHOOK_SECRET is unset this falls back to the
	# literal 'secret', which is guessable. Kept for backward compatibility;
	# set the environment variable in any real deployment.
	WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')

	logger = setup_logger(__name__)


	def log_file_to_html_string():
	    """Render the tail of the local log file as a scrollable HTML snippet.

	    The last ``num_lines_visualize`` lines of ``mylog.log`` are
	    syntax-highlighted with Rich (Python lexer, Monokai theme), exported to
	    HTML, and post-processed with BeautifulSoup so the ``<pre>`` element
	    gets the ``scrollable`` class and the matching CSS rules.

	    If the log file does not exist yet, an empty log is rendered rather
	    than raising.

	    Returns:
	        str: The prettified HTML document.
	    """
	    from collections import deque

	    log_file = "mylog.log"
	    num_lines_visualize = 50

	    # Keep only the last `num_lines_visualize` lines while streaming the
	    # file, so the whole log is never held in memory at once.
	    try:
	        # errors="replace" keeps a stray undecodable byte from breaking the view.
	        with open(log_file, "rt", errors="replace") as f:
	            tail = deque(f, maxlen=num_lines_visualize)
	    except FileNotFoundError:
	        # The log may not have been created yet; show an empty log instead.
	        tail = []

	    # Syntax-highlight the retained lines using the Python lexer and Monokai style.
	    # NOTE(review): a recording Console presumably still echoes to stdout on
	    # print(); confirm whether that duplicate output is wanted.
	    console = Console(record=True, width=150, style="#272822")
	    syntax = Syntax("".join(tail), "python", theme="monokai", word_wrap=True)
	    console.print(syntax)
	    html_content = console.export_html(inline_styles=True)

	    # Parse the exported HTML so it can be adjusted.
	    soup = BeautifulSoup(html_content, 'lxml')

	    # Replace the <pre> tag's inline style with the .scrollable class.
	    pre_tag = soup.pre
	    pre_tag['class'] = 'scrollable'
	    del pre_tag['style']

	    # Append the background colour and the .scrollable rules to the <style> tag.
	    style_tag = soup.style
	    style_content = """
	pre, code {
	background-color: #272822;
	}
	.scrollable {
	font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace;
	height: 500px;
	overflow: auto;
	}
	"""
	    style_tag.append(style_content)

	    return soup.prettify()


	# Markdown for the "Application" tab; interpolates the dataset and subreddit names.
	intro_md = f"""
	# Reddit Dataset Creator
	This is a reddit dataset creator which builds and updates [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME})
	which pulls from [/r/{SUBREDDIT}](http://www.reddit.com/r/{SUBREDDIT}). Check the dataset for more details.

	As shown in the below diagram this space pulls data from reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and puts it in a corresponding dataset.
	"""

	# Markdown for the "How to Create?" tab. Plain literal: nothing is interpolated.
	how_to_md = """
	# How to make your own space and dataset
	1. Create a [reddit application](https://www.reddit.com/prefs/apps), use 'Script for personal use'
	- Redirect URI can be anything, I use 'http://www.example.com/unused/redirect/uri'
	- You need the `secret` and the `Client ID` from the reddit application.
	- `REDDIT_USER_AGENT` can be any descriptive string, probably any undescriptive string too.
	2. Get your writable [huggingface token](https://huggingface.co/settings/tokens)
	3. <a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-amitheasshole?duplicate=true"><img style="margin-top:0;margin-bottom:0" src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg" alt="Duplicate Space"></a>
	and fill in the information
	"""

	# Markdown for the "How does it work?" tab. Plain literal: nothing is interpolated.
	how_does_it_work_md = """
	# Core Components
	There are 2 core components [main](main.py) and [app](app.py).
	Main does a few things:
	- Pulls from a datasource
	- Updates a dataset on the hub
	- Updates the README of the dataset
	- Writes a local log file (inaccessible outside the spaces container)

	App
	- Visualizes the log file from Main

	# Running it
	This uses a docker space so that I can execute supervisor. Supervisor allows me to kick off 2 processes and manage the
	log files. I use gradio for `app` and map that to the open port of huggingface spaces.

	The only communication between `app` and `main` is the log file.
	"""

	# JavaScript run on page load: toggles the 'dark' class on <body> and sets
	# the gradio-app background colour.
	_TOGGLE_DARK_JS = """
	() => {
	document.body.classList.toggle('dark');
	document.querySelector('gradio-app').style.backgroundColor = 'var(--color-background-primary)'
	}
	"""

	# Architecture diagram displayed on the first tab.
	_DIAGRAM_PATH = proj_dir / 'media' / 'reddit_scraper.drawio.png'

	# NOTE(review): the nesting of tabs/columns below is reconstructed from the
	# statement order; confirm against the deployed layout.
	with gr.Blocks() as ui:
	    # Main tab: introduction, diagram, and the log view refreshed every second.
	    with gr.Tab("Application"):
	        gr.Markdown(intro_md)
	        gr.Image(_DIAGRAM_PATH)
	        gr.Markdown("# Logs")
	        output = gr.HTML(log_file_to_html_string, every=1)
	        ui.load(None, _js=_TOGGLE_DARK_JS)

	    # Static documentation tabs.
	    with gr.Tab("How to Create?"):
	        gr.Markdown(how_to_md)
	    with gr.Tab("How does it work?"):
	        gr.Markdown(how_does_it_work_md)

	    # Extra tab holding a text input, a "Health" button, and a text output;
	    # no event handlers are attached to them in this file.
	    with gr.Tab("Hidden"):
	        with gr.Column():
	            input_text = gr.Textbox(label="Input Text")
	            health_btn = gr.Button(value="Health")
	        with gr.Column():
	            output_text = gr.Textbox(label="Output Text")

	# Wrap the queued UI in a webhook server so the webhook endpoints registered
	# on `app` are served alongside it.
	queued_ui = ui.queue()
	app = WebhooksServer(ui=queued_ui, webhook_secret=WEBHOOK_SECRET)


	@app.add_webhook("/dataset_repo")
	async def community(payload: WebhookPayload):
	    """Log repo-scoped webhook events received on ``/dataset_repo``.

	    Events whose scope does not start with ``"repo"`` are ignored.
	    """
	    is_repo_event = payload.event.scope.startswith("repo")
	    if not is_repo_event:
	        return
	    logger.info(f"Webhook received from {DATASET_NAME} indicating a repo {payload.event.action}")


	if __name__ == '__main__':
	    # Launch options for the webhook server (and the Gradio UI it wraps):
	    # host 0.0.0.0, port 7860, with errors shown in the UI.
	    launch_options = dict(server_name="0.0.0.0", show_error=True, server_port=7860)
	    app.launch(**launch_options)
	    # Alternative without webhooks:
	    # ui.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860)