acecalisto3 committed
Commit c5e9b83 · verified · 1 Parent(s): 59a3a44

Update app.py

Files changed (1)
  1. app.py +104 -102
app.py CHANGED
@@ -1,104 +1,106 @@
- import gradio as gr
- import requests
- from bs4 import BeautifulSoup
- import asyncio
- from fastapi import FastAPI, WebSocket
- from gradio import queue
- import httpx
- from starlette.middleware.cors import CORSMiddleware

- # Initialize FastAPI app
- app = FastAPI()
- app.add_middleware(
-     CORSMiddleware,
-     allow_origins=["*"],
-     allow_credentials=True,
-     allow_methods=["*"],
-     allow_headers=["*"],
- )

- # Gradio app
- with gr.Blocks() as demo:
-     gr.Markdown("<h1 style='text-align: center; color: white;'>Smart Scraper with Change Detection</h1>")
-
-     with gr.Tabs():
-         with gr.Tab("URL Scrape/Screenshot"):
-             # Existing components for URL processing
-             pass
-         with gr.Tab("Monitoring"):
-             monitor_urls_input = gr.Textbox(label="Enter URLs to Monitor (separated by newline)")
-             interval_input = gr.Slider(label="Monitoring Interval (seconds)", minimum=1, maximum=3600, value=300)
-             start_monitoring_button = gr.Button("Start Monitoring")
-             stop_monitoring_button = gr.Button("Stop Monitoring")
-
- # Monitoring Manager
- class MonitoringManager:
-     def __init__(self):
-         self.monitored_urls = []
-         self.interval = 300  # default interval in seconds
-         self.is_monitoring = False
-         self.connections = set()
-         self.url_data = {}  # Stores latest HTML and screenshot for each URL
-
-     async def start_monitoring(self):
-         if not self.is_monitoring:
-             self.is_monitoring = True
-             while self.is_monitoring:
-                 await asyncio.sleep(self.interval)
-                 for url in self.monitored_urls:
-                     if await self.check_url_for_changes(url):
-                         message = f"Change detected at {url}"
-                         await self.notify_clients(message)
-
-     def stop_monitoring(self):
-         self.is_monitoring = False
-
-     async def check_url_for_changes(self, url):
-         # Fetch latest HTML content
-         async with httpx.AsyncClient() as client:
-             response = await client.get(url)
-             new_html = response.text
-
-         # Compare with stored HTML
-         if url in self.url_data:
-             if self.url_data[url] != new_html:
-                 self.url_data[url] = new_html
-                 return True
-         else:
-             self.url_data[url] = new_html
-         return False
-
-     async def notify_clients(self, message):
-         for websocket in self.connections:
-             await websocket.send_text(message)
-
- # WebSocket endpoint
- @app.websocket_route("/ws")
- async def websocket_endpoint(websocket: WebSocket):
-     await websocket.accept()
-     monitor_manager.connections.add(websocket)
-     try:
-         while True:
-             await websocket.receive_text()  # Keep the connection alive
-             await asyncio.sleep(0)
-     finally:
-         monitor_manager.connections.remove(websocket)
-
- # API endpoint to receive monitoring parameters
- @app.post("/start_monitoring")
- async def start_monitoring_endpoint(urls: list[str], interval: int):
-     monitor_manager.monitored_urls = urls
-     monitor_manager.interval = interval
-     asyncio.create_task(monitor_manager.start_monitoring())
-     return {"status": "started"}
-
- @app.post("/stop_monitoring")
- async def stop_monitoring_endpoint():
-     monitor_manager.stop_monitoring()
-     return {"status": "stopped"}
-
- # Gradio queue setup
- queue(app, port=8000)
-
- # Run the Gradio app
- demo.launch(server_name="0.0.0.0", server_port=8000)
 
+ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, progress=gr.Progress()):
+     """Process URLs with crawl depth and change detection."""
+     # Validate URLs first
+     urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
+     urls = [url.strip() for url in urls if url.strip()]  # Remove empty entries
+     urls = urls[:int(max_urls)]
+
+     # Validate all URLs
+     invalid_urls = [url for url in urls if not validate_url(url)]
+     if invalid_urls:
+         return None, f"Invalid URLs detected: {', '.join(invalid_urls)}"
+
+     scraped_data = []
+     screenshots = []
+
+     # Initialize progress tracking
+     total_urls = len(urls)
+
+     # Create memory file for ZIP archive
+     memory_file = io.BytesIO()
+     with zipfile.ZipFile(memory_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
+         for idx, url in enumerate(urls):
+             # Update progress
+             progress((idx + 1) / total_urls)  # Remove label argument
+
+             if not url.startswith(('http://', 'https://')):
+                 url = f'https://{url}'
+
+             if action_radio in ['Scrape data', 'Both']:
+                 try:
+                     response = requests.get(url, timeout=10)
+                     scraped_data.append({url: response.text})
+                 except Exception as e:
+                     logging.error(f"Scraping error for {url}: {str(e)}")
+                     scraped_data.append({url: f"Error: {str(e)}"})
+
+             if action_radio in ['Capture image', 'Both']:
+                 # Crawl the URL up to the specified depth
+                 screenshots = crawl_url(url, 1, int(crawl_depth))
+                 for screenshot_url, screenshot in screenshots:
+                     # Save the screenshot to a temporary file
+                     with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
+                         temp_file.write(screenshot)
+                         temp_file_path = temp_file.name
+
+                     # Add the temporary file to the ZIP archive
+                     filename = f"screenshot_{idx}_{screenshot_url.split('//')[1].replace('/', '_')}.png"
+                     zipf.write(temp_file_path, filename)
+
+                     # Clean up the temporary file
+                     os.unlink(temp_file_path)
+
+     # Prepare return values
+     memory_file.seek(0)
+     zip_bytes = memory_file.getvalue()
+     scraped_data_json = json.dumps(scraped_data, indent=2)
+
+     return zip_bytes, scraped_data_json

+ def create_interface():
+     """Create the Gradio interface."""
+     with gr.Blocks() as demo:
+         gr.Markdown("<h1 style='text-align: center; color: white;'>Smart Scraper with Change Detection</h1>")
+
+         with gr.Tabs():
+             with gr.Tab("URL Scrape/Screenshot"):
+                 url_input = gr.Textbox(
+                     label="Enter URL(s)",
+                     value="https://example.com",
+                     placeholder="Enter single URL or multiple URLs separated by commas"
+                 )
+                 bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
+                 action_radio = gr.Radio(
+                     ["Scrape data", "Capture image", "Both"],
+                     label="Select Action",
+                     value="Both"
+                 )
+                 max_urls = gr.Slider(
+                     minimum=1,
+                     maximum=20,
+                     value=5,
+                     step=1,
+                     label="Max URLs to process"
+                 )
+                 crawl_depth = gr.Slider(
+                     minimum=1,
+                     maximum=3,
+                     value=1,
+                     step=1,
+                     label="Crawl Depth"
+                 )
+                 screenshot_zip = gr.File(label="Download Screenshots", file_name='screenshots.zip')
+                 scraped_data_output = gr.Textbox(label="Scraped Data")
+
+                 process_button = gr.Button("Process URLs")
+                 process_button.click(
+                     fn=process_urls,
+                     inputs=[url_input, bulk_toggle, action_radio, max_urls, crawl_depth],
+                     outputs=[screenshot_zip, scraped_data_output]
+                 )
+
+     return demo

+ if __name__ == "__main__":
+     demo = create_interface()
+     demo.launch()