derek-thomas's picture
derek-thomas HF staff
Major updates, moving away from pushshift.io into PRAW
285612d
raw
history blame
1.79 kB
import os
from pathlib import Path
import gradio as gr
from rich.console import Console
from rich.syntax import Syntax
proj_dir = Path(__name__).parent
subreddit = os.environ["SUBREDDIT"]
username = os.environ["USERNAME"]
dataset_name = f"{username}/dataset-creator-reddit-{subreddit}"
def log_file_to_html_string():
log_file = "mylog.log"
num_lines_visualize = 50
console = Console(record=True, width=150)
with open(log_file, "rt") as f:
# Seek to the end of the file minus 300 lines
# Read the last 300 lines of the file
lines = f.readlines()
lines = lines[-num_lines_visualize:]
# Syntax-highlight the last 300 lines of the file using the Python lexer and Monokai style
output = "".join(lines)
syntax = Syntax(output, "python", theme="monokai", word_wrap=True)
console.print(syntax)
html_content = console.export_html(inline_styles=True)
return html_content
markdown = f"""
# Reddit Scraper
This is a reddit scraper which builds [{dataset_name}](https://huggingface.co/datasets/{dataset_name}).
As shown below this space pulls data from reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and puts it in a corresponding dataset.
"""
with gr.Blocks() as demo:
gr.Markdown(markdown)
gr.Image(proj_dir / 'media' / 'reddit_scraper.drawio.png')
output = gr.HTML(log_file_to_html_string, every=1)
demo.load(None,
_js="""
() => {
document.body.classList.toggle('dark');
document.querySelector('gradio-app').style.backgroundColor = 'var(--color-background-primary)'
}
""", )
if __name__ == '__main__':
demo.launch(server_name="0.0.0.0", show_error=True, server_port=7860, enable_queue=True)