Spaces: Runtime error
Update app.py
Browse files

app.py CHANGED
@@ -1,104 +1,106 @@
-)
-    with gr.Tabs():
-        with gr.Tab("URL Scrape/Screenshot"):
-            # Existing components for URL processing
-            pass
-        with gr.Tab("Monitoring"):
-            monitor_urls_input = gr.Textbox(label="Enter URLs to Monitor (separated by newline)")
-            interval_input = gr.Slider(label="Monitoring Interval (seconds)", minimum=1, maximum=3600, value=300)
-            start_monitoring_button = gr.Button("Start Monitoring")
-            stop_monitoring_button = gr.Button("Stop Monitoring")
-
-# Monitoring Manager
-class MonitoringManager:
-    def __init__(self):
-        self.monitored_urls = []
-        self.interval = 300  # default interval in seconds
-        self.is_monitoring = False
-        self.connections = set()
-        self.url_data = {}  # Stores latest HTML and screenshot for each URL
-
-    async def start_monitoring(self):
-        if not self.is_monitoring:
-            self.is_monitoring = True
-            while self.is_monitoring:
-                await asyncio.sleep(self.interval)
-                for url in self.monitored_urls:
-                    if await self.check_url_for_changes(url):
-                        message = f"Change detected at {url}"
-                        await self.notify_clients(message)
-
-    def stop_monitoring(self):
-        self.is_monitoring = False
-
-    async def check_url_for_changes(self, url):
-        # Fetch latest HTML content
-        async with httpx.AsyncClient() as client:
-            response = await client.get(url)
-            new_html = response.text
-
-        # Compare with stored HTML
-        if url in self.url_data:
-            if self.url_data[url] != new_html:
-                self.url_data[url] = new_html
-                return True
-        else:
-            self.url_data[url] = new_html
-        return False
-
-    async def notify_clients(self, message):
-        for websocket in self.connections:
-            await websocket.send_text(message)
-
-# WebSocket endpoint
-@app.websocket_route("/ws")
-async def websocket_endpoint(websocket: WebSocket):
-    await websocket.accept()
-    monitor_manager.connections.add(websocket)
-    try:
-        while True:
-            await websocket.receive_text()  # Keep the connection alive
-            await asyncio.sleep(0)
-    finally:
-        monitor_manager.connections.remove(websocket)
-
-# API endpoint to receive monitoring parameters
-@app.post("/start_monitoring")
-async def start_monitoring_endpoint(urls: list[str], interval: int):
-    monitor_manager.monitored_urls = urls
-    monitor_manager.interval = interval
-    asyncio.create_task(monitor_manager.start_monitoring())
-    return {"status": "started"}
-
-@app.post("/stop_monitoring")
-async def stop_monitoring_endpoint():
-    monitor_manager.stop_monitoring()
-    return {"status": "stopped"}
-
-# Gradio queue setup
-queue(app, port=8000)
-
-# Run the Gradio app
-demo.launch(server_name="0.0.0.0", server_port=8000)
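Review note: the removed code relies on several names whose definitions sit in the earlier lines of the old file, which this view does not render: asyncio, httpx, WebSocket, an ASGI app object, a shared monitor_manager, and queue. Below is a rough sketch of the preamble it appears to assume; the framework choice and every name here are guesses, not part of the shown hunk. The old queue(app, port=8000) call is not a standard Gradio API; the intent was presumably Gradio's own demo.queue() together with mounting the Blocks app onto the ASGI server.

# Hypothetical preamble the removed code appears to rely on; none of these
# lines are visible in the diff, so the exact names and framework are guesses.
import asyncio

import gradio as gr
import httpx
from fastapi import FastAPI, WebSocket

app = FastAPI()                        # target of @app.websocket_route and @app.post
monitor_manager = MonitoringManager()  # shared instance used by the endpoints above
# (the old file also defined `demo`, probably via `with gr.Blocks() as demo:`,
#  before the Tabs block shown in the removed hunk)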
+def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, progress=gr.Progress()):
+    """Process URLs with crawl depth and change detection."""
+    # Validate URLs first
+    urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
+    urls = [url.strip() for url in urls if url.strip()]  # Remove empty entries
+    urls = urls[:int(max_urls)]
+
+    # Validate all URLs
+    invalid_urls = [url for url in urls if not validate_url(url)]
+    if invalid_urls:
+        return None, f"Invalid URLs detected: {', '.join(invalid_urls)}"
+
+    scraped_data = []
+    screenshots = []
+
+    # Initialize progress tracking
+    total_urls = len(urls)
+
+    # Create memory file for ZIP archive
+    memory_file = io.BytesIO()
+    with zipfile.ZipFile(memory_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
+        for idx, url in enumerate(urls):
+            # Update progress
+            progress((idx + 1) / total_urls)  # Remove label argument
+
+            if not url.startswith(('http://', 'https://')):
+                url = f'https://{url}'
+
+            if action_radio in ['Scrape data', 'Both']:
+                try:
+                    response = requests.get(url, timeout=10)
+                    scraped_data.append({url: response.text})
+                except Exception as e:
+                    logging.error(f"Scraping error for {url}: {str(e)}")
+                    scraped_data.append({url: f"Error: {str(e)}"})
+
+            if action_radio in ['Capture image', 'Both']:
+                # Crawl the URL up to the specified depth
+                screenshots = crawl_url(url, 1, int(crawl_depth))
+                for screenshot_url, screenshot in screenshots:
+                    # Save the screenshot to a temporary file
+                    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
+                        temp_file.write(screenshot)
+                        temp_file_path = temp_file.name
+
+                    # Add the temporary file to the ZIP archive
+                    filename = f"screenshot_{idx}_{screenshot_url.split('//')[1].replace('/', '_')}.png"
+                    zipf.write(temp_file_path, filename)
+
+                    # Clean up the temporary file
+                    os.unlink(temp_file_path)
+
+    # Prepare return values
+    memory_file.seek(0)
+    zip_bytes = memory_file.getvalue()
+    scraped_data_json = json.dumps(scraped_data, indent=2)
+
+    return zip_bytes, scraped_data_json

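Review note: process_urls leans on names that are not part of this hunk: validate_url, crawl_url, and the re/io/zipfile/json/tempfile/os/logging/requests/gradio imports, all of which must already exist elsewhere in app.py for the new code to run. The sketch below shows the imports it appears to assume plus an illustrative validate_url; the real helpers may differ. Also note that the screenshot branch rebinds the outer screenshots list on each URL, so after the loop it holds only the last URL's captures; the ZIP itself is unaffected because files are written as they are produced.

# Sketch of the imports and a URL validator that process_urls appears to assume;
# validate_url and crawl_url are expected to be defined elsewhere in app.py,
# so this version is illustrative only.
import io
import json
import logging
import os
import re
import tempfile
import zipfile
from urllib.parse import urlparse

import gradio as gr
import requests


def validate_url(url: str) -> bool:
    """Accept only http(s) URLs that have a network location."""
    candidate = url if url.startswith(("http://", "https://")) else f"https://{url}"
    parsed = urlparse(candidate)
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)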
+def create_interface():
+    """Create the Gradio interface."""
+    with gr.Blocks() as demo:
+        gr.Markdown("<h1 style='text-align: center; color: white;'>Smart Scraper with Change Detection</h1>")
+
+        with gr.Tabs():
+            with gr.Tab("URL Scrape/Screenshot"):
+                url_input = gr.Textbox(
+                    label="Enter URL(s)",
+                    value="https://example.com",
+                    placeholder="Enter single URL or multiple URLs separated by commas"
+                )
+                bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
+                action_radio = gr.Radio(
+                    ["Scrape data", "Capture image", "Both"],
+                    label="Select Action",
+                    value="Both"
+                )
+                max_urls = gr.Slider(
+                    minimum=1,
+                    maximum=20,
+                    value=5,
+                    step=1,
+                    label="Max URLs to process"
+                )
+                crawl_depth = gr.Slider(
+                    minimum=1,
+                    maximum=3,
+                    value=1,
+                    step=1,
+                    label="Crawl Depth"
+                )
+                screenshot_zip = gr.File(label="Download Screenshots", file_name='screenshots.zip')
+                scraped_data_output = gr.Textbox(label="Scraped Data")
+
+                process_button = gr.Button("Process URLs")
+                process_button.click(
+                    fn=process_urls,
+                    inputs=[url_input, bulk_toggle, action_radio, max_urls, crawl_depth],
+                    outputs=[screenshot_zip, scraped_data_output]
+                )
+
+    return demo

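Review note: two points on the interface wiring, both hedged since the Space's runtime error is only reported at the top of this page and not traced to a line. First, file_name does not appear to be a keyword that gr.File accepts in current Gradio releases, so it may be rejected or ignored depending on the version; gr.File(label="Download Screenshots") is the safer form. Second, a gr.File output is normally given a file path rather than raw bytes, so returning zip_bytes from process_urls may not download cleanly. One possible adjustment, assuming a path-based return is acceptable:

# Sketch: write the in-memory ZIP to a named temporary file and hand Gradio a
# path instead of raw bytes (assumes the rest of process_urls stays as above).
import tempfile

def zip_to_temp_path(zip_bytes: bytes) -> str:
    """Persist ZIP bytes to a temp file and return its path for gr.File."""
    with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as tmp:
        tmp.write(zip_bytes)
        return tmp.name

# e.g. at the end of process_urls:
#     return zip_to_temp_path(zip_bytes), scraped_data_json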
+if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch()