Update app.py

app.py CHANGED
@@ -9,15 +9,15 @@ from PIL import Image
 import io
 import zipfile
 import os
-import
-from bs4 import BeautifulSoup
-from urllib.parse import urlparse
-from datetime import datetime
+import datetime
 
 # Configure logging
 logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s - %(levelname)s - %(message)s')
 
+def sanitize_filename(filename):
+    return re.sub(r'[<>:"/\\|?*\n]+', '_', filename)
+
 def validate_url(url):
     """Validate if the URL is properly formatted."""
     try:
@@ -55,7 +55,7 @@ def compare_screenshot(old_screenshot, new_screenshot):
 
 def alert_changes(url, change_type):
     """Log detected changes."""
-    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     logging.warning(f"[{timestamp}] Changes detected at {url}: {change_type}")
     return f"[{timestamp}] {change_type}"
 
@@ -143,7 +143,7 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
     # Validate all URLs
     invalid_urls = [url for url in urls if not validate_url(url)]
     if invalid_urls:
-        return
+        return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)
 
     scraped_data = []
     screenshots = []
@@ -162,9 +162,12 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         if not url.startswith(('http://', 'https://')):
             url = f'https://{url}'
 
+        # Sanitize URL for file naming
+        sanitized_url = sanitize_filename(url)
+
         # Check for changes
-        old_html_path = os.path.join(data_dir, f"{
-        old_screenshot_path = os.path.join(data_dir, f"{
+        old_html_path = os.path.join(data_dir, f"{sanitized_url}_html.txt")
+        old_screenshot_path = os.path.join(data_dir, f"{sanitized_url}_screenshot.png")
 
         # Fetch latest data
         latest_html = get_latest_data(url)
@@ -196,7 +199,7 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         scraped_data.append({
             'url': url,
             'content': latest_html,
-            'timestamp': datetime.now().isoformat(),
+            'timestamp': datetime.datetime.now().isoformat(),
             'changes_detected': changes_log
         })
 
@@ -212,7 +215,8 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
     with zipfile.ZipFile(memory_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
         # Add screenshots to ZIP
         for screenshot_url, screenshot_data in screenshots:
-
+            sanitized_screenshot_url = sanitize_filename(screenshot_url)
+            filename = f"{sanitized_screenshot_url}.png"
             zipf.writestr(filename, screenshot_data)
 
         # Add scraped data and changes log to ZIP
@@ -220,7 +224,7 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         data_to_save = {
             'scraped_data': scraped_data,
             'changes_log': changes_log,
-            'timestamp': datetime.now().isoformat()
+            'timestamp': datetime.datetime.now().isoformat()
         }
         zipf.writestr('data.json', json.dumps(data_to_save, indent=2))
 
@@ -235,8 +239,8 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         'changes_detected': changes_log
     }
 
-    # Return ZIP
-    return
+    # Return ZIP bytes and display data
+    return zip_bytes, json.dumps(display_data, indent=2)
 
 def create_interface():
     """Create the Gradio interface."""
@@ -283,11 +287,7 @@ def create_interface():
         process_button = gr.Button("Process URLs", variant="primary")
 
         with gr.Column():
-            screenshot_zip = gr.File(
-                label="Download Results",
-                file_count="single",
-                file_types=[".zip"]
-            )
+            screenshot_zip = gr.File(label="Download Results", file_name="results.zip")
             scraped_data_output = gr.JSON(label="Results Summary")
 
         process_button.click(
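The new sanitize_filename helper collapses each run of filesystem-unsafe characters into a single underscore, so a full URL can double as a file name stem. A minimal standalone sketch of its behavior (the input URL is illustrative, not one from the app, and the sketch imports re explicitly since the app's own import of re is not visible in these hunks):

import re

def sanitize_filename(filename):
    # Replace runs of characters that are invalid in file names with "_".
    return re.sub(r'[<>:"/\\|?*\n]+', '_', filename)

print(sanitize_filename("https://example.com/path?q=1"))
# https_example.com_path_q=1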
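Switching from "from datetime import datetime" to "import datetime" means every call site must be qualified one level deeper, which is exactly what the three timestamp edits above do. For illustration:

import datetime

# With "import datetime", the datetime class is an attribute of the module.
now = datetime.datetime.now()
print(now.strftime("%Y-%m-%d %H:%M:%S"))  # as in alert_changes
print(now.isoformat())                    # as in the scraped-data records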
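The archive in the hunk at old line 212 is assembled entirely in memory, and the reworked return statement hands those bytes back to the caller. A self-contained sketch of that stdlib pattern, with placeholder entry names and contents; deriving zip_bytes via getvalue() is an assumption, since that line falls outside the hunks shown:

import io
import zipfile

memory_file = io.BytesIO()
with zipfile.ZipFile(memory_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # writestr() adds an entry from in-memory data; no temp files involved.
    zipf.writestr("example_screenshot.png", b"placeholder image bytes")
    zipf.writestr("data.json", '{"example": true}')

zip_bytes = memory_file.getvalue()  # assumed source of the returned zip_bytes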