Spaces: Runtime error
Rename app.tsx to 2app.py
2app.py
ADDED
@@ -0,0 +1,410 @@
import gradio as gr
import requests
import re
import logging
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from PIL import Image
import io
import zipfile
import os
import datetime
from urllib.parse import urlparse, urljoin
import tempfile
import nltk

# Configure logging before the first logging call so the format applies
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

try:
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('averaged_perceptron_tagger')
except Exception as e:
    logging.error(f"Error downloading NLTK data: {str(e)}")
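
# Note: take_screenshot() below drives headless Chrome through Selenium, so the
# runtime also needs a Chrome/Chromium binary; with Selenium 4.6+ a matching
# chromedriver is resolved automatically by Selenium Manager.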

def sanitize_filename(filename):
    return re.sub(r'[<>:"/\\|?*\n]+', '_', filename)
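
# Example (illustrative): sanitize_filename("https://example.com/a?b=1")
# collapses each run of forbidden characters into one underscore,
# yielding "https_example.com_a_b=1".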

def validate_url(url):
    """Validate if the URL is properly formatted."""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False
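
# Example (illustrative): validate_url("https://example.com") -> True, while
# validate_url("example.com") -> False (no scheme), which is why process_urls()
# prepends "https://" to bare domains before validating.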

def get_latest_data(url):
    """Get the latest HTML content of a webpage."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # treat HTTP error pages as fetch failures
        return response.text
    except Exception as e:
        logging.error(f"Error fetching latest data from {url}: {str(e)}")
        return None

def compare_html(old_html, new_html):
    """Compare two HTML contents to detect changes."""
    if not old_html or not new_html:
        return False
    return old_html.strip() != new_html.strip()

def compare_screenshot(old_screenshot, new_screenshot):
    """Compare two screenshots to detect changes."""
    try:
        if not old_screenshot or not new_screenshot:
            return False
        old_img = Image.open(io.BytesIO(old_screenshot))
        new_img = Image.open(io.BytesIO(new_screenshot))
        # PIL Images compare by object identity, so `old_img == new_img` would
        # always report a change; compare size and raw pixel data instead.
        if old_img.size != new_img.size:
            return True
        return old_img.tobytes() != new_img.tobytes()
    except Exception as e:
        logging.error(f"Error comparing screenshots: {str(e)}")
        return False

def alert_changes(url, change_type):
    """Log detected changes."""
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logging.warning(f"[{timestamp}] Changes detected at {url}: {change_type}")
    return f"[{timestamp}] {change_type}"

def extract_links_from_page(url):
    """Extract all links from a webpage."""
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', href=True)]
        return links
    except Exception as e:
        logging.error(f"Error extracting links from {url}: {str(e)}")
        return []

def take_screenshot(url):
    """Take a screenshot of a webpage."""
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(url)
            screenshot = driver.get_screenshot_as_png()
        finally:
            driver.quit()  # always release the browser, even if navigation fails
        image = Image.open(io.BytesIO(screenshot))
        max_size = (1024, 1024)
        image.thumbnail(max_size, Image.LANCZOS)
        img_byte_arr = io.BytesIO()
        image.save(img_byte_arr, format='PNG')
        return img_byte_arr.getvalue()
    except Exception as e:
        logging.error(f"Screenshot error for {url}: {str(e)}")
        return None
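
# Example (illustrative): png_bytes = take_screenshot("https://example.com")
# returns PNG bytes downscaled to fit within 1024x1024, or None on failure.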

def is_webpage(url):
    """Check if the URL points to a webpage (HTML)."""
    try:
        # Follow redirects; requests.head() does not by default, and a bare
        # redirect response usually carries no useful Content-Type.
        response = requests.head(url, timeout=10, allow_redirects=True)
        content_type = response.headers.get('Content-Type', '').lower()
        return 'text/html' in content_type
    except Exception as e:
        logging.error(f"Error checking content type for {url}: {str(e)}")
        return False

def crawl_url(url, depth, max_depth, visited=None):
    """Recursively crawl a URL up to a specified depth."""
    if visited is None:
        visited = set()
    if depth > max_depth or url in visited:
        return []
    visited.add(url)
    screenshots = []
    if is_webpage(url):
        links = extract_links_from_page(url)
        screenshot = take_screenshot(url)
        if screenshot:
            screenshots.append((url, screenshot))
        if depth < max_depth:
            for link in links:
                # Resolve relative hrefs (e.g. "/about") against the current
                # page instead of blindly prefixing "https://".
                link = urljoin(url, link)
                if not link.startswith(('http://', 'https://')):
                    continue  # skip mailto:, javascript:, etc.
                screenshots.extend(crawl_url(link, depth + 1, max_depth, visited))
    else:
        logging.info(f"Skipping non-webpage content: {url}")
    return screenshots
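
# Example (illustrative): crawl_url("https://example.com", depth=1, max_depth=2)
# returns a list of (url, png_bytes) pairs covering the page and, one level
# deeper, the pages it links to.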

def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, mode='standard', progress=gr.Progress()):
    """Process URLs with crawl depth and change detection."""
    # Split the input into candidate URLs
    if bulk_toggle:
        urls = [url.strip() for url in re.split(r'[,\n]+', url_input.strip()) if url.strip()]
    else:
        urls = [url_input.strip()]
    urls = urls[:int(max_urls)]

    # Normalize schemes first so bare domains like "example.com" validate
    urls = [url if url.startswith(('http://', 'https://')) else f'https://{url}' for url in urls]

    # Validate all URLs
    invalid_urls = [url for url in urls if not validate_url(url)]
    if invalid_urls:
        if mode == 'chat':
            return f"Invalid URLs detected: {', '.join(invalid_urls)}"
        else:
            return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)

    scraped_data = []
    screenshots = []
    changes_log = []

    # Initialize progress tracking
    total_urls = len(urls)
    progress(0)

    # Directory to store scraped data
    data_dir = 'scraped_data'
    os.makedirs(data_dir, exist_ok=True)

    # Process each URL
    for idx, url in enumerate(urls):
        # Sanitize URL for file naming
        sanitized_url = sanitize_filename(url)

        # Paths of previously stored data, used for change detection
        old_html_path = os.path.join(data_dir, f"{sanitized_url}_html.txt")
        old_screenshot_path = os.path.join(data_dir, f"{sanitized_url}_screenshot.png")

        # Fetch latest data
        latest_html = get_latest_data(url)
        latest_screenshot = take_screenshot(url)

        # Compare with previous data if available
        if os.path.exists(old_html_path):
            with open(old_html_path, 'r', encoding='utf-8') as f:
                old_html = f.read()
            if compare_html(old_html, latest_html):
                changes_log.append(alert_changes(url, "HTML content has changed"))

        if os.path.exists(old_screenshot_path):
            with open(old_screenshot_path, 'rb') as f:
                old_screenshot = f.read()
            if latest_screenshot and compare_screenshot(old_screenshot, latest_screenshot):
                changes_log.append(alert_changes(url, "Visual content has changed"))

        # Store latest data
        if latest_html:
            with open(old_html_path, 'w', encoding='utf-8') as f:
                f.write(latest_html)

        if latest_screenshot:
            with open(old_screenshot_path, 'wb') as f:
                f.write(latest_screenshot)

        # Prepare output data
        if action_radio in ['Scrape data', 'Both']:
            scraped_data.append({
                'url': url,
                'content': latest_html,  # Include full HTML content
                'timestamp': datetime.datetime.now().isoformat(),
                'changes_detected': list(changes_log)  # snapshot of the log so far
            })

        if action_radio in ['Capture image', 'Both']:
            crawled_screenshots = crawl_url(url, depth=1, max_depth=int(crawl_depth))
            screenshots.extend(crawled_screenshots)

        # Update progress
        progress((idx + 1) / total_urls)

    if mode == 'chat':
        return "\n".join(changes_log)
    else:
        # Create a temporary file to store the ZIP
        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_file:
            with zipfile.ZipFile(tmp_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
                # Add screenshots to ZIP
                for screenshot_url, screenshot_data in screenshots:
                    sanitized_screenshot_url = sanitize_filename(screenshot_url)
                    filename = f"{sanitized_screenshot_url}.png"
                    zipf.writestr(filename, screenshot_data)

                # Add scraped data and changes log to ZIP
                if scraped_data:
                    data_to_save = {
                        'scraped_data': scraped_data,
                        'changes_log': changes_log,
                        'timestamp': datetime.datetime.now().isoformat()
                    }
                    zipf.writestr('data.json', json.dumps(data_to_save, indent=2))

            # Get the path to the temporary file
            zip_file_path = tmp_file.name

        # Prepare display data
        display_data = {
            'total_scraped_urls': len(scraped_data),
            'total_screenshots_taken': len(screenshots),
            'changes_detected': changes_log,
            'scraped_data': scraped_data  # Include full scraped data
        }

        # Return the path to the temporary ZIP file and display data
        return zip_file_path, json.dumps(display_data, indent=2)
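
# Usage sketch (illustrative): in standard mode,
#   zip_path, summary = process_urls("https://example.com", False, "Both", 5, 1)
# returns a ZIP of screenshots plus a JSON summary; with mode='chat' it instead
# returns the newline-joined change log as a single string.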

def recognize_intent(instruction: str) -> str:
    instruction = instruction.lower()
    # General patterns for actions and data types; allow filler words such as
    # "all" between the verb and the object so "scrape all links" matches too.
    action_patterns = {
        r'\b(find|extract|scrape)\b.*\b(links|images|videos|texts|prices|product names|reviews)\b': 'extract_data',
        r'\b(count)\b.*\b(links|images|videos|products)\b': 'count_data',
        r'\b(what is|get|fetch)\b.*\b(channel name|subscriber count|viewers)\b': 'fetch_specific_data',
        r'\b(monitor)\b.*\bchanges\b': 'monitor_changes',
    }
    for pattern, intent in action_patterns.items():
        if re.search(pattern, instruction):
            return intent
    return "unknown"
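
# Example (illustrative): recognize_intent("Scrape all links") -> "extract_data";
# recognize_intent("count images on this page") -> "count_data".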

def extract_data_type(instruction: str) -> str:
    instruction = instruction.lower()
    # The original patterns overlapped (every keyword in the first group mapped
    # to 'links'); instead return the specific keyword that actually matched.
    data_types = ['links', 'images', 'videos', 'texts', 'prices',
                  'product names', 'reviews', 'products',
                  'channel name', 'subscriber count', 'viewers']
    for data_type in data_types:
        if re.search(rf'\b{re.escape(data_type)}\b', instruction):
            return data_type
    return "unknown"
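
# Example (illustrative): extract_data_type("extract all images") -> "images";
# extract_data_type("scrape product names") -> "product names".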

def format_output(data, output_format):
    if output_format == "JSON":
        return json.dumps(data, indent=2)
    elif output_format == "Cleaned JSON":
        # Implement data cleaning logic here
        return json.dumps(data, indent=2)
    else:
        return str(data)

def generate_command(intent: str, url_input: str, data_type: str, output_format: str) -> str:
    if intent == "extract_data":
        data = extract_data(url_input, data_type)
        return format_output(data, output_format)
    elif intent == "count_data":
        count = count_data(url_input, data_type)
        return f"The number of {data_type} is {count}."
    elif intent == "fetch_specific_data":
        specific_data = fetch_specific_data(url_input, data_type)
        return specific_data
    elif intent == "monitor_changes":
        changes_log = monitor_changes(url_input)
        return changes_log
    else:
        return "Instruction not recognized. Please try again."

def extract_data(url, data_type):
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        if data_type == "links":
            return [a['href'] for a in soup.find_all('a', href=True)]
        elif data_type == "images":
            return [img['src'] for img in soup.find_all('img', src=True)]
        # Add more data types as needed
        else:
            return []
    except Exception as e:
        return f"Error extracting {data_type}: {str(e)}"
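
# Example (illustrative): extract_data("https://example.com", "links") returns
# the href of every <a> tag on the page as a plain Python list.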

def count_data(url, data_type):
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        if data_type == "links":
            return len(soup.find_all('a', href=True))
        elif data_type == "images":
            return len(soup.find_all('img', src=True))
        # Add more data types as needed
        else:
            return 0
    except Exception as e:
        return f"Error counting {data_type}: {str(e)}"

def fetch_specific_data(url, data_type):
    try:
        # Implement specific data fetching logic here
        # For demonstration, return a placeholder
        return f"Fetched {data_type} from {url}"
    except Exception as e:
        return f"Error fetching {data_type}: {str(e)}"

def monitor_changes(url_input):
    try:
        # Implement change monitoring logic here
        # For demonstration, return a placeholder
        return f"Changes monitored for {url_input}"
    except Exception as e:
        return f"Error monitoring changes: {str(e)}"

def chat_based_scrape(instruction, url_input, output_format):
    # Recognize intent and extract data type if applicable
    intent = recognize_intent(instruction)
    data_type = extract_data_type(instruction)
    # Generate command based on the recognized intent
    command_output = generate_command(intent, url_input, data_type, output_format)
    return command_output
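
# Example (illustrative): chat_based_scrape("extract all links",
# "https://example.com", "JSON") routes the instruction through
# recognize_intent() and extract_data_type(), then formats the result as JSON.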

def create_interface():
    """Create the Gradio interface."""
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # Smart Web Scraper with Change Detection
            Monitor and capture changes in web content automatically.
            """
        )
        with gr.Tabs():
            with gr.Tab("URL Scrape/Screenshot"):
                url_input = gr.Textbox(label="Enter URL(s)", value="https://example.com", placeholder="Enter single URL or multiple URLs separated by commas")
                with gr.Row():
                    bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
                    action_radio = gr.Radio(["Scrape data", "Capture image", "Both"], label="Select Action", value="Both")
                with gr.Row():
                    max_urls = gr.Slider(minimum=1, maximum=1000, value=5, step=1, label="Max URLs to process")
                    crawl_depth = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Crawl Depth")

                process_button = gr.Button("Process URLs", variant="primary")

                with gr.Column():
                    screenshot_zip = gr.File(label="Download Results")
                    scraped_data_output = gr.JSON(label="Results Summary")

                process_button.click(fn=process_urls, inputs=[url_input, bulk_toggle, action_radio, max_urls, crawl_depth], outputs=[screenshot_zip, scraped_data_output], show_progress=True)

            with gr.Tab("Chat-Based Scrape"):
                instruction = gr.Textbox(label="Enter Instruction", placeholder="e.g., 'Scrape all links' or 'Extract all images'")
                url_input = gr.Textbox(label="Enter URL", value="https://example.com", placeholder="Enter the target URL")
                output_format = gr.Radio(["JSON", "Cleaned JSON", "Raw Data"], label="Output Format", value="JSON")
                output = gr.Textbox(label="Output")

                chat_button = gr.Button("Execute Instruction", variant="primary")
                chat_button.click(fn=chat_based_scrape, inputs=[instruction, url_input, output_format], outputs=output)

        gr.Markdown(
            """
            ### Features
            - Bulk URL processing
            - Screenshot capture
            - Content change detection
            - Recursive crawling
            - Chat-based instructions
            """
        )
    return demo

if __name__ == "__main__":
    demo = create_interface()  # Call the function to create the interface
    demo.launch()  # Launch the Gradio app
app.tsx
DELETED
@@ -1,205 +0,0 @@
import React, { useState, useEffect, useRef } from 'react';
import { LineChart, Line, XAxis, YAxis, CartesianGrid, Tooltip, Legend, ResponsiveContainer } from "recharts";

type ChatMessage = {
  role: 'user' | 'system';
  content: string;
};

const App: React.FC = () => {
  const [urlInput, setUrlInput] = useState<string>('https://www.example.com');
  const [bulkToggle, setBulkToggle] = useState<boolean>(false);
  const [actionRadio, setActionRadio] = useState<'Scrape data' | 'Capture image' | 'Both'>('Both');
  const [maxUrls, setMaxUrls] = useState<number>(5);
  const [crawlDepth, setCrawlDepth] = useState<number>(1);
  const [scrapedDataOutput, setScrapedDataOutput] = useState<string>('');
  const [screenshotOutput, setScreenshotOutput] = useState<string | null>(null);
  const [monitorUrlsInput, setMonitorUrlsInput] = useState<string>('');
  const [intervalInput, setIntervalInput] = useState<number>(300);
  const [changeOutput, setChangeOutput] = useState<string>('');
  const [chatHistory, setChatHistory] = useState<ChatMessage[]>([]);
  const [isMonitoring, setIsMonitoring] = useState<boolean>(false);
  const [monitoringData, setMonitoringData] = useState<{ time: string; changes: number }[]>([]);
  const [isProcessing, setIsProcessing] = useState<boolean>(false);
  const [error, setError] = useState<string | null>(null);
  const wsRef = useRef<WebSocket | null>(null);

  useEffect(() => {
    if (isMonitoring) {
      wsRef.current = new WebSocket('ws://localhost:8000/ws');
      wsRef.current.onmessage = (event) => {
        const message = event.data;
        setChangeOutput(prev => prev + `Change detected: ${message}\n`);
        setChatHistory(prev => [...prev, { role: 'system', content: `Change detected: ${message}` }]);
        setMonitoringData(prev => {
          const now = new Date();
          const time = now.toLocaleTimeString();
          return [...prev, { time, changes: 1 }];
        });
      };
      wsRef.current.onclose = () => {
        console.log("Disconnected from WebSocket server.");
      };
    } else {
      if (wsRef.current) {
        wsRef.current.close();
        wsRef.current = null;
      }
    }
  }, [isMonitoring]);

  const handleProcessUrls = async () => {
    setIsProcessing(true);
    setError(null);
    try {
      const response = await fetch('http://localhost:8000/process_urls', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          url_input: urlInput,
          bulk_toggle: bulkToggle,
          action_radio: actionRadio,
          max_urls: maxUrls,
          crawl_depth: crawlDepth,
        }),
      });

      if (!response.ok) {
        const errorData = await response.json();
        throw new Error(`HTTP error! Status: ${response.status}, Message: ${errorData.message || 'Unknown error'}`);
      }

      const data = await response.json();
      setScrapedDataOutput(JSON.stringify(data.scraped_data, null, 2));
      if (data.screenshot_data) {
        setScreenshotOutput(data.screenshot_data);
      } else {
        setScreenshotOutput(null);
      }
      setError(null);
    } catch (e: any) {
      console.error("Error processing URLs:", e);
      setError(e.message);
      setScrapedDataOutput('');
      setScreenshotOutput(null);
    } finally {
      setIsProcessing(false);
    }
  };

  const handleStartMonitoring = async () => {
    setIsMonitoring(true);
    const urls = monitorUrlsInput.split('\n').map(url => url.trim()).filter(url => url !== '');
    await fetch('http://localhost:8000/start_monitoring', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ urls, interval: intervalInput }),
    });
    setChatHistory(prev => [...prev, { role: 'system', content: "Monitoring started." }]);
  };

  const handleStopMonitoring = async () => {
    setIsMonitoring(false);
    await fetch('http://localhost:8000/stop_monitoring', {
      method: 'POST',
    });
    setChatHistory(prev => [...prev, { role: 'system', content: "Monitoring stopped." }]);
    setMonitoringData([]);
  };

  return (
    <div className="bg-gray-100 min-h-screen p-4">
      <h1 className="text-3xl font-bold text-center text-gray-800 mb-8">Smart Scraper with Change Detection</h1>
      {error && <div className="bg-red-200 text-red-700 rounded-md p-2 mb-4">{error}</div>}
      <div className="flex flex-col md:flex-row space-y-4 md:space-y-0 md:space-x-4">
        {/* URL Scrape/Screenshot Tab */}
        <div className="bg-white rounded-lg shadow-md p-4 flex-1">
          {/* Existing components */}
        </div>
        {/* Monitoring Tab */}
        <div className="bg-white rounded-lg shadow-md p-4 flex-1">
          <h2 className="text-xl font-semibold mb-4 text-gray-700">Monitoring</h2>
          <div className="mb-2">
            <label className="block text-gray-700 text-sm font-bold mb-2">Enter URLs to Monitor (separated by newline)</label>
            <textarea
              className="shadow appearance-none border rounded w-full py-2 px-3 text-gray-700 leading-tight focus:outline-none focus:shadow-outline"
              value={monitorUrlsInput}
              onChange={(e) => setMonitorUrlsInput(e.target.value)}
            />
          </div>
          <div className="mb-2">
            <label className="block text-gray-700 text-sm font-bold mb-2">Monitoring Interval (seconds)</label>
            <input type="range" className="form-range w-full" min={1} max={3600} value={intervalInput} onChange={(e) => setIntervalInput(parseInt(e.target.value))} />
            <span className="text-sm text-gray-600">{intervalInput}</span>
          </div>
          <div className="flex space-x-4 mb-4">
            <button
              className={`bg-green-600 hover:bg-green-700 text-white font-bold py-2 px-4 rounded focus:outline-none focus:shadow-outline ${isMonitoring ? 'opacity-50 cursor-not-allowed' : ''}`}
              onClick={handleStartMonitoring}
              disabled={isMonitoring}
            >
              Start Monitoring
            </button>
            <button
              className={`bg-red-600 hover:bg-red-700 text-white font-bold py-2 px-4 rounded focus:outline-none focus:shadow-outline ${!isMonitoring ? 'opacity-50 cursor-not-allowed' : ''}`}
              onClick={handleStopMonitoring}
              disabled={!isMonitoring}
            >
              Stop Monitoring
            </button>
          </div>
          {changeOutput && (
            <div className="mb-4">
              <label className="block text-gray-700 text-sm font-bold mb-2">Monitoring Changes</label>
              <pre className="border border-gray-300 rounded-md bg-gray-50 p-2 overflow-auto max-h-48 whitespace-pre-wrap">
                {changeOutput}
              </pre>
            </div>
          )}
          {monitoringData.length > 0 && (
            <div className="mb-4">
              <label className="block text-gray-700 text-sm font-bold mb-2">Change History Graph</label>
              <ResponsiveContainer width="100%" height={200}>
                <LineChart data={monitoringData}>
                  <CartesianGrid strokeDasharray="3 3" />
                  <XAxis dataKey="time" />
                  <YAxis />
                  <Tooltip />
                  <Legend />
                  <Line type="monotone" dataKey="changes" stroke="#8884d8" />
                </LineChart>
              </ResponsiveContainer>
            </div>
          )}
          <div className="mb-2">
            <label className="block text-gray-700 text-sm font-bold mb-2">Monitoring Chat</label>
            <div className="border rounded-md bg-gray-50 p-2 overflow-auto max-h-48 mb-2">
              <ul className="space-y-2">
                {chatHistory.map((msg, index) => (
                  <li key={index} className={msg.role === 'user' ? 'text-right' : 'text-left'}>
                    <div className={`${msg.role === 'user' ? 'bg-indigo-100' : 'bg-gray-100'} inline-block rounded-md p-2`}>
                      <span className="font-bold text-gray-700">{msg.role === 'user' ? 'You' : 'System'}:</span> <span className="text-gray-800">{msg.content}</span>
                    </div>
                  </li>
                ))}
              </ul>
            </div>
            <input
              type="text"
              placeholder="Type command"
              className="shadow appearance-none border rounded w-full py-2 px-3 text-gray-700 leading-tight focus:outline-none focus:shadow-outline"
              onKeyDown={(e) => {
                if (e.key === 'Enter') {
                  // e.target is typed as EventTarget; cast once before reading and clearing the value
                  const input = e.target as HTMLInputElement;
                  setChatHistory(prev => [...prev, { role: 'user', content: input.value }]);
                  input.value = '';
                }
              }}
            />
          </div>
        </div>
      </div>
    </div>
  );
};

export default App;