# taesiri's picture
# Update app.py
# 798ee13 verified
import gradio as gr
from datasets import load_dataset
import json
import random
from datetime import datetime
import os
from PIL import Image
import io
import numpy as np
# Hugging Face access token, read from the environment (None when unset).
access_token = os.getenv("HUGGINGFACE_TOKEN")
class DatasetViewer:
    """Serve random samples from the PhotoshopRequest sample dataset.

    The dataset is loaded once when the instance is created. Afterwards the
    instance hands out random samples (post text plus source/edited images)
    and a short HTML status snippet describing the loaded data.
    """

    def __init__(self):
        """Initialise the viewer state and load the dataset immediately."""
        self.dataset = None
        self.dataset_size = 0
        self.last_refresh_time = None
        # Maximum (width, height) in pixels for displayed images.
        self.max_display_size = (800, 600)
        self.load_dataset()

    def resize_image(self, image):
        """Shrink *image* to fit ``max_display_size``, keeping aspect ratio.

        Args:
            image: A numpy array, raw encoded image bytes, or an object that
                already exposes ``width``, ``height`` and ``resize`` the way
                a PIL image does.

        Returns:
            The image as a numpy array. Images that already fit within the
            maximum size are returned at their original size (no upscaling).
        """
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        elif isinstance(image, bytes):
            image = Image.open(io.BytesIO(image))

        max_width, max_height = self.max_display_size
        scale_factor = min(max_width / image.width, max_height / image.height)

        # Only shrink; a factor >= 1 means the image already fits.
        if scale_factor < 1:
            # Clamp each side to at least 1 pixel: for extreme aspect ratios
            # the scaled short side would otherwise truncate to 0.
            new_size = (
                max(1, int(image.width * scale_factor)),
                max(1, int(image.height * scale_factor)),
            )
            image = image.resize(new_size, Image.Resampling.LANCZOS)

        # Convert back to a numpy array for gradio.
        return np.array(image)

    def load_dataset(self):
        """Load the full (non-streaming) train split and record when it was loaded."""
        # Local import: the module itself only imports ``datetime``.
        from datetime import timezone

        self.dataset = load_dataset(
            "taesiri/PhotoshopRequest-DailyDump-January-2025-RandomSample",
            split="train",
            token=access_token,
        )
        self.dataset_size = len(self.dataset)
        # get_info() prints this timestamp with a literal "UTC" suffix, so it
        # must really be taken in UTC rather than in the host's local time.
        self.last_refresh_time = datetime.now(timezone.utc)

    def get_next_samples(self, num_samples=5):
        """Return random samples, flattened for the UI output components.

        Args:
            num_samples: Number of samples to draw. Capped at the dataset
                size, so fewer samples are returned for a smaller dataset.

        Returns:
            A flat tuple with three consecutive entries per sample: the
            markdown text for the post, the resized source image and the
            resized edited image.
        """
        sample_count = min(num_samples, self.dataset_size)
        indices = random.sample(range(self.dataset_size), sample_count)

        results = []
        for idx in indices:
            sample = self.dataset[idx]

            post_id = sample["post_id"]
            title = sample["title"]
            reddit_url = f"https://www.reddit.com/r/PhotoshopRequest/comments/{post_id}"

            # The selftext is optional: fall back to an empty string when the
            # JSON payload is missing, malformed, or lacks the expected keys.
            selftext = ""
            try:
                # ``or ""`` keeps a JSON null from rendering as "None".
                selftext = json.loads(sample["json_data"])["post"]["selftext"] or ""
            except (KeyError, TypeError, ValueError):
                print(f"No selftext found for post {post_id}")

            markdown_text = f"# {title}\n\n{selftext}\n\n[View post on r/PhotoshopRequest]({reddit_url})"

            # Append the triple (post_info, source_image, edited_image).
            results.append(markdown_text)
            results.append(self.resize_image(sample["source_image"]))
            results.append(self.resize_image(sample["edited_image"]))

        return tuple(results)

    def get_info(self):
        """Return an HTML snippet with the dataset size and last refresh time."""
        refreshed = self.last_refresh_time.strftime('%Y-%m-%d %H:%M:%S UTC')
        return (
            "\n"
            '<div style="text-align: center;">\n'
            "<hr>\n"
            f"Dataset Size: {self.dataset_size} items<br>\n"
            f"Last Refreshed: {refreshed}\n"
            "</div>\n"
        )
def create_interface():
    """Assemble the Gradio Blocks app for browsing random dataset samples.

    Returns:
        The ``gr.Blocks`` instance; launching it is left to the caller.
    """
    viewer = DatasetViewer()

    intro_text = (
        "\n"
        "This is a viewer for the PhotoshopRequest dataset. Each sample shows a Photoshop editing request post.\n"
        "Click the 'Show New Samples' button to see **5 random samples** from the dataset.\n"
        "**Layout**: For each sample, you'll see:\n"
        "1. The post title and description\n"
        "2. The source image (left) and edited result (right)\n"
    )

    with gr.Blocks() as demo:
        gr.Markdown("# PhotoshopRequest Dataset Viewer")
        gr.Markdown(intro_text)

        # Five sample slots, each made of a markdown panel followed by a row
        # holding the source and edited images. Components are collected in
        # the same flat order that get_next_samples() returns its values.
        outputs = []
        for slot in range(1, 6):
            outputs.append(gr.Markdown())
            with gr.Row():
                outputs += [
                    gr.Image(label=f"Source Image {slot}"),
                    gr.Image(label=f"Edited Image {slot}"),
                ]

        sample_button = gr.Button("Show New Samples")
        info_md = gr.Markdown()

        # On click: fill the sample slots first, then update the info footer.
        click_event = sample_button.click(viewer.get_next_samples, outputs=outputs)
        click_event.then(viewer.get_info, outputs=[info_md])

    return demo
if __name__ == "__main__":
    # Runs only when the file is executed as a script: build the Blocks app
    # via create_interface() and launch it.
    demo = create_interface()
    demo.launch()