Spaces: Runtime error
Update app.py
Browse files

app.py CHANGED
@@ -1,104 +1,106 @@
-)
-    with gr.Tabs():
-        with gr.Tab("URL Scrape/Screenshot"):
-            # Existing components for URL processing
-            pass
-        with gr.Tab("Monitoring"):
-            monitor_urls_input = gr.Textbox(label="Enter URLs to Monitor (separated by newline)")
-            interval_input = gr.Slider(label="Monitoring Interval (seconds)", minimum=1, maximum=3600, value=300)
-            start_monitoring_button = gr.Button("Start Monitoring")
-            stop_monitoring_button = gr.Button("Stop Monitoring")
-
-# Monitoring Manager
-class MonitoringManager:
-    def __init__(self):
-        self.monitored_urls = []
-        self.interval = 300  # default interval in seconds
-        self.is_monitoring = False
-        self.connections = set()
-        self.url_data = {}  # Stores latest HTML and screenshot for each URL
-
-    async def start_monitoring(self):
-        if not self.is_monitoring:
-            self.is_monitoring = True
-            while self.is_monitoring:
-                await asyncio.sleep(self.interval)
-                for url in self.monitored_urls:
-                    if await self.check_url_for_changes(url):
-                        message = f"Change detected at {url}"
-                        await self.notify_clients(message)
-
-    def stop_monitoring(self):
-        self.is_monitoring = False
-
-    async def check_url_for_changes(self, url):
-        # Fetch latest HTML content
-        async with httpx.AsyncClient() as client:
-            response = await client.get(url)
-            new_html = response.text
-
-        # Compare with stored HTML
-        if url in self.url_data:
-            if self.url_data[url] != new_html:
-                self.url_data[url] = new_html
-                return True
-        else:
-            self.url_data[url] = new_html
-        return False
-
-    async def notify_clients(self, message):
-        for websocket in self.connections:
-            await websocket.send_text(message)
-
-# WebSocket endpoint
-@app.websocket_route("/ws")
-async def websocket_endpoint(websocket: WebSocket):
-    await websocket.accept()
-    monitor_manager.connections.add(websocket)
-    try:
-        while True:
-            await websocket.receive_text()  # Keep the connection alive
-            await asyncio.sleep(0)
-    finally:
-        monitor_manager.connections.remove(websocket)
-
-# API endpoint to receive monitoring parameters
-@app.post("/start_monitoring")
-async def start_monitoring_endpoint(urls: list[str], interval: int):
-    monitor_manager.monitored_urls = urls
-    monitor_manager.interval = interval
-    asyncio.create_task(monitor_manager.start_monitoring())
-    return {"status": "started"}
-
-@app.post("/stop_monitoring")
-async def stop_monitoring_endpoint():
-    monitor_manager.stop_monitoring()
-    return {"status": "stopped"}
-
-# Gradio queue setup
-queue(app, port=8000)
-
-# Run the Gradio app
-demo.launch(server_name="0.0.0.0", server_port=8000)
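Review note: the removed code relies on several names whose definitions sit in the earlier lines of the old file, which this view does not render: asyncio, httpx, WebSocket, an ASGI app object, a shared monitor_manager, and queue. Below is a rough sketch of the preamble it appears to assume; the framework choice and every name here are guesses, not part of the shown hunk. The old queue(app, port=8000) call is not a standard Gradio API; the intent was presumably Gradio's own demo.queue() together with mounting the Blocks app onto the ASGI server.

# Hypothetical preamble the removed code appears to rely on; none of these
# lines are visible in the diff, so the exact names and framework are guesses.
import asyncio

import gradio as gr
import httpx
from fastapi import FastAPI, WebSocket

app = FastAPI()                        # target of @app.websocket_route and @app.post
monitor_manager = MonitoringManager()  # shared instance used by the endpoints above
# (the old file also defined `demo`, probably via `with gr.Blocks() as demo:`,
#  before the Tabs block shown in the removed hunk)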
+def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, progress=gr.Progress()):
+    """Process URLs with crawl depth and change detection."""
+    # Validate URLs first
+    urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
+    urls = [url.strip() for url in urls if url.strip()]  # Remove empty entries
+    urls = urls[:int(max_urls)]
+
+    # Validate all URLs
+    invalid_urls = [url for url in urls if not validate_url(url)]
+    if invalid_urls:
+        return None, f"Invalid URLs detected: {', '.join(invalid_urls)}"
+
+    scraped_data = []
+    screenshots = []
+
+    # Initialize progress tracking
+    total_urls = len(urls)
+
+    # Create memory file for ZIP archive
+    memory_file = io.BytesIO()
+    with zipfile.ZipFile(memory_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
+        for idx, url in enumerate(urls):
+            # Update progress
+            progress((idx + 1) / total_urls)  # Remove label argument
+
+            if not url.startswith(('http://', 'https://')):
+                url = f'https://{url}'
+
+            if action_radio in ['Scrape data', 'Both']:
+                try:
+                    response = requests.get(url, timeout=10)
+                    scraped_data.append({url: response.text})
+                except Exception as e:
+                    logging.error(f"Scraping error for {url}: {str(e)}")
+                    scraped_data.append({url: f"Error: {str(e)}"})
+
+            if action_radio in ['Capture image', 'Both']:
+                # Crawl the URL up to the specified depth
+                screenshots = crawl_url(url, 1, int(crawl_depth))
+                for screenshot_url, screenshot in screenshots:
+                    # Save the screenshot to a temporary file
+                    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
+                        temp_file.write(screenshot)
+                        temp_file_path = temp_file.name
+
+                    # Add the temporary file to the ZIP archive
+                    filename = f"screenshot_{idx}_{screenshot_url.split('//')[1].replace('/', '_')}.png"
+                    zipf.write(temp_file_path, filename)
+
+                    # Clean up the temporary file
+                    os.unlink(temp_file_path)
+
+    # Prepare return values
+    memory_file.seek(0)
+    zip_bytes = memory_file.getvalue()
+    scraped_data_json = json.dumps(scraped_data, indent=2)
+
+    return zip_bytes, scraped_data_json

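Review note: process_urls leans on names that are not part of this hunk: validate_url, crawl_url, and the re/io/zipfile/json/tempfile/os/logging/requests/gradio imports, all of which must already exist elsewhere in app.py for the new code to run. The sketch below shows the imports it appears to assume plus an illustrative validate_url; the real helpers may differ. Also note that the screenshot branch rebinds the outer screenshots list on each URL, so after the loop it holds only the last URL's captures; the ZIP itself is unaffected because files are written as they are produced.

# Sketch of the imports and a URL validator that process_urls appears to assume;
# validate_url and crawl_url are expected to be defined elsewhere in app.py,
# so this version is illustrative only.
import io
import json
import logging
import os
import re
import tempfile
import zipfile
from urllib.parse import urlparse

import gradio as gr
import requests


def validate_url(url: str) -> bool:
    """Accept only http(s) URLs that have a network location."""
    candidate = url if url.startswith(("http://", "https://")) else f"https://{url}"
    parsed = urlparse(candidate)
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)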
+def create_interface():
+    """Create the Gradio interface."""
+    with gr.Blocks() as demo:
+        gr.Markdown("<h1 style='text-align: center; color: white;'>Smart Scraper with Change Detection</h1>")
+
+        with gr.Tabs():
+            with gr.Tab("URL Scrape/Screenshot"):
+                url_input = gr.Textbox(
+                    label="Enter URL(s)",
+                    value="https://example.com",
+                    placeholder="Enter single URL or multiple URLs separated by commas"
+                )
+                bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
+                action_radio = gr.Radio(
+                    ["Scrape data", "Capture image", "Both"],
+                    label="Select Action",
+                    value="Both"
+                )
+                max_urls = gr.Slider(
+                    minimum=1,
+                    maximum=20,
+                    value=5,
+                    step=1,
+                    label="Max URLs to process"
+                )
+                crawl_depth = gr.Slider(
+                    minimum=1,
+                    maximum=3,
+                    value=1,
+                    step=1,
+                    label="Crawl Depth"
+                )
+                screenshot_zip = gr.File(label="Download Screenshots", file_name='screenshots.zip')
+                scraped_data_output = gr.Textbox(label="Scraped Data")
+
+                process_button = gr.Button("Process URLs")
+                process_button.click(
+                    fn=process_urls,
+                    inputs=[url_input, bulk_toggle, action_radio, max_urls, crawl_depth],
+                    outputs=[screenshot_zip, scraped_data_output]
+                )
+
+    return demo

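Review note: two points on the interface wiring, both hedged since the Space's runtime error is only reported at the top of this page and not traced to a line. First, file_name does not appear to be a keyword that gr.File accepts in current Gradio releases, so it may be rejected or ignored depending on the version; gr.File(label="Download Screenshots") is the safer form. Second, a gr.File output is normally given a file path rather than raw bytes, so returning zip_bytes from process_urls may not download cleanly. One possible adjustment, assuming a path-based return is acceptable:

# Sketch: write the in-memory ZIP to a named temporary file and hand Gradio a
# path instead of raw bytes (assumes the rest of process_urls stays as above).
import tempfile

def zip_to_temp_path(zip_bytes: bytes) -> str:
    """Persist ZIP bytes to a temp file and return its path for gr.File."""
    with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as tmp:
        tmp.write(zip_bytes)
        return tmp.name

# e.g. at the end of process_urls:
#     return zip_to_temp_path(zip_bytes), scraped_data_json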
+if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch()