acecalisto3 committed (verified)
Commit 7730e40 · 1 Parent(s): ed3955d

Update app.py

Files changed (1)
  1. app.py +84 -19
app.py CHANGED
@@ -135,7 +135,7 @@ def crawl_url(url, depth, max_depth, visited=None):
         logging.info(f"Skipping non-webpage content: {url}")
 
     return screenshots
-
+
 def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, progress=gr.Progress()):
     """Process URLs with crawl depth and change detection."""
     # Validate URLs first
@@ -146,7 +146,7 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
     # Validate all URLs
     invalid_urls = [url for url in urls if not validate_url(url)]
     if invalid_urls:
-        return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2), []
+        return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)
 
     scraped_data = []
     screenshots = []
@@ -199,16 +199,11 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
 
         # Prepare output data
         if action_radio in ['Scrape data', 'Both']:
-            cleaned_content = BeautifulSoup(latest_html, 'html.parser').get_text(separator="\n").strip()
             scraped_data.append({
                 'url': url,
-                'content': cleaned_content,
+                'content': latest_html,  # Include full HTML content
                 'timestamp': datetime.datetime.now().isoformat(),
-                'changes_detected': changes_log.copy(),  # Ensure changes_log is a copy, not a reference
-                'metadata': {
-                    'html_length': len(cleaned_content),
-                    'screenshot_available': latest_screenshot is not None
-                }
+                'changes_detected': changes_log
             })
 
         if action_radio in ['Capture image', 'Both']:
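
This hunk stores the raw HTML in 'content' rather than the BeautifulSoup-cleaned text the removed cleaned_content line produced. Plain text can still be recovered from a saved record downstream with the same call the deleted line used; a minimal sketch, where the record literal is only illustrative and not output produced by app.py:

    from bs4 import BeautifulSoup

    # Illustrative record shaped like the entries process_urls now appends
    record = {
        "url": "https://example.com",
        "content": "<html><body><h1>Example</h1><p>Hello world</p></body></html>",
    }

    # The same cleaning the removed line did inline: strip tags, keep visible text
    text = BeautifulSoup(record["content"], "html.parser").get_text(separator="\n").strip()
    print(text)  # "Example" and "Hello world" on separate lines
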
@@ -231,7 +226,7 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
     if scraped_data:
         data_to_save = {
             'scraped_data': scraped_data,
-            'changes_log': changes_log.copy(),  # Ensure changes_log is a copy, not a reference
+            'changes_log': changes_log,
             'timestamp': datetime.datetime.now().isoformat()
         }
         zipf.writestr('data.json', json.dumps(data_to_save, indent=2))
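
The data.json written into the archive here can be read back with the standard library alone. A small sketch, assuming the downloaded archive is saved as results.zip (a placeholder name; the real path is whatever the app returns):

    import json
    import zipfile

    # "results.zip" stands in for the ZIP file the app hands back
    with zipfile.ZipFile("results.zip") as zf:
        with zf.open("data.json") as fh:
            data = json.load(fh)

    print(data["timestamp"])
    for entry in data["scraped_data"]:
        print(entry["url"], len(entry["content"]))
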
@@ -244,15 +239,46 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         'total_scraped_urls': len(scraped_data),
         'total_screenshots_taken': len(screenshots),
         'changes_detected': changes_log,
-        'screenshots': [screenshot_data for _, screenshot_data in screenshots]
+        'scraped_data': scraped_data  # Include full scraped data
     }
 
-    # Convert screenshots to a format suitable for Gradio
-    screenshot_display = [io.BytesIO(screenshot_data) for _, screenshot_data in screenshots]
+    # Return the path to the temporary ZIP file and display data
+    return zip_file_path, json.dumps(display_data, indent=2)
+
+def chat_based_scrape(instruction, url_input, bulk_toggle, max_urls, crawl_depth):
+    """Handle chat-based instructions for scraping."""
+    # Example: Parse instructions like "Scrape all links" or "Extract all images"
+    if "scrape all links" in instruction.lower():
+        # Extract links from the provided URL(s)
+        urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
+        urls = [url.strip() for url in urls if url.strip()]
+        urls = urls[:int(max_urls)]
+
+        all_links = []
+        for url in urls:
+            links = extract_links_from_page(url)
+            all_links.extend(links)
+
+        return f"Extracted links: {', '.join(all_links)}"
 
-    # Return the path to the temporary ZIP file, display data, and screenshots
-    return zip_file_path, json.dumps(display_data, indent=2), screenshot_display
+    elif "extract all images" in instruction.lower():
+        # Extract image URLs from the provided URL(s)
+        urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
+        urls = [url.strip() for url in urls if url.strip()]
+        urls = urls[:int(max_urls)]
+
+        all_images = []
+        for url in urls:
+            response = requests.get(url, timeout=10)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            images = [img['src'] for img in soup.find_all('img', src=True)]
+            all_images.extend(images)
+
+        return f"Extracted images: {', '.join(all_images)}"
 
+    else:
+        return "Instruction not recognized. Please try again."
+
 def create_interface():
     """Create the Gradio interface."""
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
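
The new chat_based_scrape above calls extract_links_from_page, which this diff does not show. The sketch below is only a guess at such a helper, mirroring the requests + BeautifulSoup pattern of the image branch; it is not the definition in app.py:

    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin

    def extract_links_from_page(url):
        """Hypothetical helper: return absolute hrefs found on a page."""
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Resolve relative hrefs against the page URL
        return [urljoin(url, a["href"]) for a in soup.find_all("a", href=True)]
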
@@ -300,7 +326,6 @@ def create_interface():
         with gr.Column():
             screenshot_zip = gr.File(label="Download Results")
             scraped_data_output = gr.JSON(label="Results Summary")
-            screenshot_gallery = gr.Gallery(label="Screenshots", show_label=True, scale=2)
 
         process_button.click(
             fn=process_urls,
@@ -313,11 +338,51 @@ def create_interface():
             ],
             outputs=[
                 screenshot_zip,
-                scraped_data_output,
-                screenshot_gallery
+                scraped_data_output
             ],
             show_progress=True
         )
+
+        with gr.Tab("Chat-Based Scrape"):
+            chat_instruction = gr.Textbox(
+                label="Enter Instruction",
+                placeholder="e.g., 'Scrape all links' or 'Extract all images'"
+            )
+            chat_url_input = gr.Textbox(
+                label="Enter URL(s)",
+                value="https://example.com",
+                placeholder="Enter single URL or multiple URLs separated by commas"
+            )
+            chat_bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
+            chat_max_urls = gr.Slider(
+                minimum=1,
+                maximum=20,
+                value=5,
+                step=1,
+                label="Max URLs to process"
+            )
+            chat_crawl_depth = gr.Slider(
+                minimum=1,
+                maximum=3,
+                value=1,
+                step=1,
+                label="Crawl Depth"
+            )
+            chat_output = gr.Textbox(label="Chat Output")
+
+            chat_button = gr.Button("Submit Instruction", variant="primary")
+
+            chat_button.click(
+                fn=chat_based_scrape,
+                inputs=[
+                    chat_instruction,
+                    chat_url_input,
+                    chat_bulk_toggle,
+                    chat_max_urls,
+                    chat_crawl_depth
+                ],
+                outputs=chat_output
+            )
 
         gr.Markdown(
             """
@@ -326,7 +391,7 @@ def create_interface():
             - Screenshot capture
             - Content change detection
             - Recursive crawling
-            - Automatic data storage
+            - Chat-based instructions
             """
         )
 