dataset-creator-reddit-uwaterloo

Building

App Files Files Community

dataset-creator-reddit-uwaterloo / app.py

derek-thomas HF staff

Major updates, moving away from pushshift.io into PRAW

285612d about 1 year ago

raw

history blame

1.79 kB

	import os
	from pathlib import Path

	import gradio as gr
	from rich.console import Console
	from rich.syntax import Syntax

	# Directory containing this script. Derived from __file__ (the module's path)
	# rather than __name__ (the module's *name*, e.g. "__main__"), so bundled
	# assets are found regardless of the current working directory.
	proj_dir = Path(__file__).resolve().parent

	# Required configuration, read from the environment at import time; a missing
	# variable raises KeyError here.
	subreddit = os.environ["SUBREDDIT"]
	# NOTE(review): presumably the Hugging Face account that owns the dataset --
	# confirm; the name USERNAME can collide with an OS-provided variable.
	username = os.environ["USERNAME"]
	# Dataset identifier "<username>/dataset-creator-reddit-<subreddit>"; it is
	# interpolated into the huggingface.co/datasets/ link in the page text below.
	dataset_name = f"{username}/dataset-creator-reddit-{subreddit}"


	def log_file_to_html_string(log_file: str = "mylog.log", num_lines_visualize: int = 50) -> str:
	    """Render the tail of a log file as syntax-highlighted HTML.

	    Args:
	        log_file: Path of the log file to display.
	        num_lines_visualize: Maximum number of trailing lines to show;
	            values below 1 show nothing.

	    Returns:
	        An HTML document (inline styles) exported from a recording rich
	        ``Console`` that contains the highlighted log tail. If the log file
	        does not exist yet, the rendered content is empty.
	    """
	    from collections import deque

	    try:
	        # errors="replace": the log may be appended to while it is read, so a
	        # multi-byte character can be cut off at the end of the file; show a
	        # replacement character instead of raising UnicodeDecodeError.
	        with open(log_file, "rt", errors="replace") as f:
	            # A bounded deque keeps only the last N lines while streaming the
	            # file, instead of materialising every line in a list first.
	            tail = deque(f, maxlen=max(num_lines_visualize, 0))
	    except FileNotFoundError:
	        # Nothing has been logged yet; render an empty panel rather than
	        # failing the periodic refresh.
	        tail = ()

	    # Highlight the tail using the Python lexer and the Monokai theme.
	    output = "".join(tail)
	    syntax = Syntax(output, "python", theme="monokai", word_wrap=True)

	    # record=True lets the console export what was printed as HTML.
	    console = Console(record=True, width=150)
	    console.print(syntax)
	    return console.export_html(inline_styles=True)


	# Introductory Markdown passed to gr.Markdown in the layout below; the heading
	# link points at the dataset named by dataset_name.
	markdown = f"""
	# Reddit Scraper
	This is a reddit scraper which builds [{dataset_name}](https://huggingface.co/datasets/{dataset_name}).

	As shown below this space pulls data from reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and puts it in a corresponding dataset.
	"""

	# Page layout: intro text, architecture diagram, then the live log view.
	with gr.Blocks() as demo:
	    gr.Markdown(markdown)
	    gr.Image(proj_dir / 'media' / 'reddit_scraper.drawio.png')
	    # The function itself (not its result) is passed as the value together
	    # with every=1, so the HTML is recomputed periodically rather than once.
	    output = gr.HTML(log_file_to_html_string, every=1)
	    # No Python callback on page load, only client-side JS: it toggles the
	    # 'dark' class on <body> and sets the gradio-app background colour.
	    demo.load(
	        None,
	        _js="""
	() => {
	document.body.classList.toggle('dark');
	document.querySelector('gradio-app').style.backgroundColor = 'var(--color-background-primary)'
	}
	""",
	    )

	if __name__ == '__main__':
	    # The periodic (`every=`) refresh of the log view requires the queue.
	    # Enable it via demo.queue(); the `enable_queue` argument of launch() is
	    # deprecated in favour of this method.
	    demo.queue()
	    # Bind to all interfaces on port 7860 and surface errors in the UI.
	    demo.launch(server_name="0.0.0.0", show_error=True, server_port=7860)