Update app.py

app.py CHANGED
@@ -9,15 +9,15 @@ from PIL import Image
 import io
 import zipfile
 import os
-import
-from bs4 import BeautifulSoup
-from urllib.parse import urlparse
-from datetime import datetime
+import datetime
 
 # Configure logging
 logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s - %(levelname)s - %(message)s')
 
+def sanitize_filename(filename):
+    return re.sub(r'[<>:"/\\|?*\n]+', '_', filename)
+
 def validate_url(url):
     """Validate if the URL is properly formatted."""
     try:
@@ -55,7 +55,7 @@ def compare_screenshot(old_screenshot, new_screenshot):
 
 def alert_changes(url, change_type):
     """Log detected changes."""
-    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     logging.warning(f"[{timestamp}] Changes detected at {url}: {change_type}")
     return f"[{timestamp}] {change_type}"
 
@@ -143,7 +143,7 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
     # Validate all URLs
     invalid_urls = [url for url in urls if not validate_url(url)]
     if invalid_urls:
-        return
+        return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)
 
     scraped_data = []
     screenshots = []
@@ -162,9 +162,12 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         if not url.startswith(('http://', 'https://')):
             url = f'https://{url}'
 
+        # Sanitize URL for file naming
+        sanitized_url = sanitize_filename(url)
+
         # Check for changes
-        old_html_path = os.path.join(data_dir, f"{
-        old_screenshot_path = os.path.join(data_dir, f"{
+        old_html_path = os.path.join(data_dir, f"{sanitized_url}_html.txt")
+        old_screenshot_path = os.path.join(data_dir, f"{sanitized_url}_screenshot.png")
 
         # Fetch latest data
         latest_html = get_latest_data(url)
@@ -196,7 +199,7 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         scraped_data.append({
             'url': url,
             'content': latest_html,
-            'timestamp': datetime.now().isoformat(),
+            'timestamp': datetime.datetime.now().isoformat(),
             'changes_detected': changes_log
         })
 
@@ -212,7 +215,8 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
     with zipfile.ZipFile(memory_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
         # Add screenshots to ZIP
         for screenshot_url, screenshot_data in screenshots:
-
+            sanitized_screenshot_url = sanitize_filename(screenshot_url)
+            filename = f"{sanitized_screenshot_url}.png"
             zipf.writestr(filename, screenshot_data)
 
         # Add scraped data and changes log to ZIP
@@ -220,7 +224,7 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         data_to_save = {
             'scraped_data': scraped_data,
             'changes_log': changes_log,
-            'timestamp': datetime.now().isoformat()
+            'timestamp': datetime.datetime.now().isoformat()
         }
         zipf.writestr('data.json', json.dumps(data_to_save, indent=2))
 
@@ -235,8 +239,8 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         'changes_detected': changes_log
     }
 
-    # Return ZIP
-    return
+    # Return ZIP bytes and display data
+    return zip_bytes, json.dumps(display_data, indent=2)
 
 def create_interface():
     """Create the Gradio interface."""
@@ -283,11 +287,7 @@ def create_interface():
         process_button = gr.Button("Process URLs", variant="primary")
 
         with gr.Column():
-            screenshot_zip = gr.File(
-                label="Download Results",
-                file_count="single",
-                file_types=[".zip"]
-            )
+            screenshot_zip = gr.File(label="Download Results", file_name="results.zip")
             scraped_data_output = gr.JSON(label="Results Summary")
 
         process_button.click(
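The new sanitize_filename helper collapses each run of filesystem-unsafe characters into a single underscore, so a full URL can double as a file name stem. A minimal standalone sketch of its behavior (the input URL is illustrative, not one from the app, and the sketch imports re explicitly since the app's own import of re is not visible in these hunks):

import re

def sanitize_filename(filename):
    # Replace runs of characters that are invalid in file names with "_".
    return re.sub(r'[<>:"/\\|?*\n]+', '_', filename)

print(sanitize_filename("https://example.com/path?q=1"))
# https_example.com_path_q=1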
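Switching from "from datetime import datetime" to "import datetime" means every call site must be qualified one level deeper, which is exactly what the three timestamp edits above do. For illustration:

import datetime

# With "import datetime", the datetime class is an attribute of the module.
now = datetime.datetime.now()
print(now.strftime("%Y-%m-%d %H:%M:%S"))  # as in alert_changes
print(now.isoformat())                    # as in the scraped-data records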
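The archive in the hunk at old line 212 is assembled entirely in memory, and the reworked return statement hands those bytes back to the caller. A self-contained sketch of that stdlib pattern, with placeholder entry names and contents; deriving zip_bytes via getvalue() is an assumption, since that line falls outside the hunks shown:

import io
import zipfile

memory_file = io.BytesIO()
with zipfile.ZipFile(memory_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # writestr() adds an entry from in-memory data; no temp files involved.
    zipf.writestr("example_screenshot.png", b"placeholder image bytes")
    zipf.writestr("data.json", '{"example": true}')

zip_bytes = memory_file.getvalue()  # assumed source of the returned zip_bytes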