Spaces: Runtime error
Update app.py
app.py
CHANGED
@@ -1,8 +1,143 @@
+import gradio as gr
+import requests
+import re
+import logging
+import json
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from PIL import Image
+import io
+import zipfile
+import os
+import tempfile
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+from datetime import datetime
+
+# Configure logging
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(levelname)s - %(message)s')
+
+def validate_url(url):
+    """Validate if the URL is properly formatted."""
+    try:
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])
+    except:
+        return False
+
+def get_latest_data(url):
+    """Get the latest HTML content of a webpage."""
+    try:
+        response = requests.get(url, timeout=10)
+        return response.text
+    except Exception as e:
+        logging.error(f"Error fetching latest data from {url}: {str(e)}")
+        return None
+
+def compare_html(old_html, new_html):
+    """Compare two HTML contents to detect changes."""
+    if not old_html or not new_html:
+        return False
+    return old_html.strip() != new_html.strip()
+
+def compare_screenshot(old_screenshot, new_screenshot):
+    """Compare two screenshots to detect changes."""
+    try:
+        if not old_screenshot or not new_screenshot:
+            return False
+        old_img = Image.open(io.BytesIO(old_screenshot))
+        new_img = Image.open(io.BytesIO(new_screenshot))
+        return not (old_img == new_img)
+    except Exception as e:
+        logging.error(f"Error comparing screenshots: {str(e)}")
+        return False
+
+def alert_changes(url, change_type):
+    """Log detected changes."""
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    logging.warning(f"[{timestamp}] Changes detected at {url}: {change_type}")
+    return f"[{timestamp}] {change_type}"
+
+def extract_links_from_page(url):
+    """Extract all links from a webpage."""
+    try:
+        response = requests.get(url, timeout=10)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        links = [a['href'] for a in soup.find_all('a', href=True)]
+        return links
+    except Exception as e:
+        logging.error(f"Error extracting links from {url}: {str(e)}")
+        return []
+
+def take_screenshot(url):
+    """Take a screenshot of a webpage."""
+    try:
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        chrome_options.add_argument("--window-size=1920,1080")
+
+        driver = webdriver.Chrome(options=chrome_options)
+        driver.get(url)
+
+        screenshot = driver.get_screenshot_as_png()
+        driver.quit()
+
+        image = Image.open(io.BytesIO(screenshot))
+        max_size = (1024, 1024)
+        image.thumbnail(max_size, Image.LANCZOS)
+
+        img_byte_arr = io.BytesIO()
+        image.save(img_byte_arr, format='PNG')
+        return img_byte_arr.getvalue()
+    except Exception as e:
+        logging.error(f"Screenshot error for {url}: {str(e)}")
+        return None
+
+def is_webpage(url):
+    """Check if the URL points to a webpage (HTML)."""
+    try:
+        response = requests.head(url, timeout=10)
+        content_type = response.headers.get('Content-Type', '').lower()
+        return 'text/html' in content_type
+    except Exception as e:
+        logging.error(f"Error checking content type for {url}: {str(e)}")
+        return False
+
+def crawl_url(url, depth, max_depth, visited=None):
+    """Recursively crawl a URL up to a specified depth."""
+    if visited is None:
+        visited = set()
+
+    if depth > max_depth or url in visited:
+        return []
+
+    visited.add(url)
+    screenshots = []
+
+    if is_webpage(url):
+        links = extract_links_from_page(url)
+        screenshot = take_screenshot(url)
+        if screenshot:
+            screenshots.append((url, screenshot))
+
+        if depth < max_depth:
+            for link in links:
+                if not link.startswith(('http://', 'https://')):
+                    link = f"https://{link}"
+                screenshots.extend(crawl_url(link, depth + 1, max_depth, visited))
+    else:
+        logging.info(f"Skipping non-webpage content: {url}")
+
+    return screenshots
+
 def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, progress=gr.Progress()):
     """Process URLs with crawl depth and change detection."""
     # Validate URLs first
     urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
-    urls = [url.strip() for url in urls if url.strip()]
+    urls = [url.strip() for url in urls if url.strip()]
     urls = urls[:int(max_urls)]
 
     # Validate all URLs
@@ -12,55 +147,105 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, progress=gr.Progress()):
 
     scraped_data = []
     screenshots = []
+    changes_log = []
 
     # Initialize progress tracking
     total_urls = len(urls)
+    progress(0)
 
-    #
+    # Directory to store scraped data
+    data_dir = 'scraped_data'
+    os.makedirs(data_dir, exist_ok=True)
+
+    # Process each URL
+    for idx, url in enumerate(urls):
+        if not url.startswith(('http://', 'https://')):
+            url = f'https://{url}'
+
+        # Check for changes
+        old_html_path = os.path.join(data_dir, f"{url.replace('/', '_')}_html.txt")
+        old_screenshot_path = os.path.join(data_dir, f"{url.replace('/', '_')}_screenshot.png")
+
+        # Fetch latest data
+        latest_html = get_latest_data(url)
+        latest_screenshot = take_screenshot(url)
+
+        # Compare with previous data if available
+        if os.path.exists(old_html_path):
+            with open(old_html_path, 'r', encoding='utf-8') as f:
+                old_html = f.read()
+            if compare_html(old_html, latest_html):
+                changes_log.append(alert_changes(url, "HTML content has changed"))
+
+        if os.path.exists(old_screenshot_path):
+            with open(old_screenshot_path, 'rb') as f:
+                old_screenshot = f.read()
+            if latest_screenshot and compare_screenshot(old_screenshot, latest_screenshot):
+                changes_log.append(alert_changes(url, "Visual content has changed"))
+
+        # Store latest data
+        if latest_html:
+            with open(old_html_path, 'w', encoding='utf-8') as f:
+                f.write(latest_html)
+        if latest_screenshot:
+            with open(old_screenshot_path, 'wb') as f:
+                f.write(latest_screenshot)
+
+        # Prepare output data
+        if action_radio in ['Scrape data', 'Both']:
+            scraped_data.append({
+                'url': url,
+                'content': latest_html,
+                'timestamp': datetime.now().isoformat(),
+                'changes_detected': changes_log
+            })
+
+        if action_radio in ['Capture image', 'Both']:
+            crawled_screenshots = crawl_url(url, depth=1, max_depth=int(crawl_depth))
+            screenshots.extend(crawled_screenshots)
+
+        # Update progress
+        progress((idx + 1) / total_urls)
+
+    # Create ZIP file in memory
     memory_file = io.BytesIO()
     with zipfile.ZipFile(memory_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
-
-
-
-
-
-
-
-
-
-
-
-
-
-            scraped_data.append({url: f"Error: {str(e)}"})
-
-        if action_radio in ['Capture image', 'Both']:
-            # Crawl the URL up to the specified depth
-            screenshots = crawl_url(url, 1, int(crawl_depth))
-            for screenshot_url, screenshot in screenshots:
-                # Save the screenshot to a temporary file
-                with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
-                    temp_file.write(screenshot)
-                    temp_file_path = temp_file.name
-
-                # Add the temporary file to the ZIP archive
-                filename = f"screenshot_{idx}_{screenshot_url.split('//')[1].replace('/', '_')}.png"
-                zipf.write(temp_file_path, filename)
-
-                # Clean up the temporary file
-                os.unlink(temp_file_path)
+        # Add screenshots to ZIP
+        for screenshot_url, screenshot_data in screenshots:
+            filename = f"{screenshot_url.split('//')[1].replace('/', '_')}.png"
+            zipf.writestr(filename, screenshot_data)
+
+        # Add scraped data and changes log to ZIP
+        if scraped_data:
+            data_to_save = {
+                'scraped_data': scraped_data,
+                'changes_log': changes_log,
+                'timestamp': datetime.now().isoformat()
+            }
+            zipf.writestr('data.json', json.dumps(data_to_save, indent=2))
 
     # Prepare return values
     memory_file.seek(0)
     zip_bytes = memory_file.getvalue()
-    scraped_data_json = json.dumps(scraped_data, indent=2)
 
-
+    # Prepare display data
+    display_data = {
+        'scraped_urls': len(scraped_data),
+        'screenshots_taken': len(screenshots),
+        'changes_detected': changes_log
+    }
+
+    return zip_bytes, json.dumps(display_data, indent=2)
 
 def create_interface():
     """Create the Gradio interface."""
-    with gr.Blocks() as demo:
-        gr.Markdown(
+    with gr.Blocks(theme=gr.themes.Soft()) as demo:
+        gr.Markdown(
+            """
+            # Smart Web Scraper with Change Detection
+            Monitor and capture changes in web content automatically.
+            """
+        )
 
         with gr.Tabs():
             with gr.Tab("URL Scrape/Screenshot"):
@@ -69,38 +254,75 @@ def create_interface():
                     value="https://example.com",
                     placeholder="Enter single URL or multiple URLs separated by commas"
                 )
-                bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
-                action_radio = gr.Radio(
-                    ["Scrape data", "Capture image", "Both"],
-                    label="Select Action",
-                    value="Both"
-                )
-                max_urls = gr.Slider(
-                    minimum=1,
-                    maximum=20,
-                    value=5,
-                    step=1,
-                    label="Max URLs to process"
-                )
-                crawl_depth = gr.Slider(
-                    minimum=1,
-                    maximum=3,
-                    value=1,
-                    step=1,
-                    label="Crawl Depth"
-                )
-                screenshot_zip = gr.File(label="Download Screenshots", file_name='screenshots.zip')
-                scraped_data_output = gr.Textbox(label="Scraped Data")
 
-
+                with gr.Row():
+                    bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
+                    action_radio = gr.Radio(
+                        ["Scrape data", "Capture image", "Both"],
+                        label="Select Action",
+                        value="Both"
+                    )
+
+                with gr.Row():
+                    max_urls = gr.Slider(
+                        minimum=1,
+                        maximum=20,
+                        value=5,
+                        step=1,
+                        label="Max URLs to process"
+                    )
+                    crawl_depth = gr.Slider(
+                        minimum=1,
+                        maximum=3,
+                        value=1,
+                        step=1,
+                        label="Crawl Depth"
+                    )
+
+                process_button = gr.Button("Process URLs", variant="primary")
+
+                with gr.Column():
+                    screenshot_zip = gr.File(
+                        label="Download Results",
+                        file_count="single",
+                        file_types=[".zip"]
+                    )
+                    scraped_data_output = gr.JSON(label="Results Summary")
+
                 process_button.click(
                     fn=process_urls,
-                    inputs=[
-
+                    inputs=[
+                        url_input,
+                        bulk_toggle,
+                        action_radio,
+                        max_urls,
+                        crawl_depth
+                    ],
+                    outputs=[
+                        screenshot_zip,
+                        scraped_data_output
+                    ],
+                    show_progress=True
                 )
+
+                gr.Markdown(
+                    """
+                    ### Features
+                    - Bulk URL processing
+                    - Screenshot capture
+                    - Content change detection
+                    - Recursive crawling
+                    - Automatic data storage
+                    """
+                )
 
     return demo
 
 if __name__ == "__main__":
     demo = create_interface()
-    demo.launch(
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7861,
+        share=True,
+        debug=True
+    )
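Note on the "Runtime error" status above: process_urls() returns raw ZIP bytes as the value for a gr.File output, but gr.File generally expects a filesystem path, so this return step is a plausible culprit. That diagnosis is my assumption, not something the diff confirms. Below is a minimal sketch of the return step rewritten to hand Gradio a path; finalize_outputs is a hypothetical helper, not part of this commit.

import io
import json
import tempfile
import zipfile

def finalize_outputs(memory_file, display_data):
    """Persist the in-memory ZIP so gr.File can serve it from disk.

    Returns (zip_path, summary_json): the same pair of outputs the
    committed process_urls() produces, but with a path in place of bytes.
    """
    memory_file.seek(0)
    # gr.File serves files from disk, so write the archive out first.
    with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as tmp:
        tmp.write(memory_file.getvalue())
        zip_path = tmp.name
    return zip_path, json.dumps(display_data, indent=2)

# Usage sketch, mirroring how process_urls() builds its archive:
if __name__ == "__main__":
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zipf:
        zipf.writestr("data.json", json.dumps({"demo": True}))
    path, summary = finalize_outputs(buf, {"screenshots_taken": 0})
    print(path, summary)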
|