acecalisto3 committed
Commit 6f3886d · verified · 1 parent: 8edebce

Update app.py

Files changed (1): app.py (+18 -18)
app.py CHANGED
@@ -9,15 +9,15 @@ from PIL import Image
 import io
 import zipfile
 import os
-import tempfile
-from bs4 import BeautifulSoup
-from urllib.parse import urlparse
-from datetime import datetime

 # Configure logging
 logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s - %(levelname)s - %(message)s')

+import datetime
+
+def sanitize_filename(filename):
+    return re.sub(r'[<>:"/\\|?*\n]+', '_', filename)
+
 def validate_url(url):
     """Validate if the URL is properly formatted."""
     try:
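The new sanitize_filename helper collapses characters that are unsafe in filenames into underscores. It relies on the re module, which this diff does not show being imported, so the self-contained sketch below adds `import re` as an assumption:

    import re

    def sanitize_filename(filename):
        # Collapse each run of filename-unsafe characters into a single '_'
        return re.sub(r'[<>:"/\\|?*\n]+', '_', filename)

    print(sanitize_filename('https://example.com/path?q=1'))
    # -> 'https_example.com_path_q=1'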
@@ -55,7 +55,7 @@ def compare_screenshot(old_screenshot, new_screenshot):

 def alert_changes(url, change_type):
     """Log detected changes."""
-    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     logging.warning(f"[{timestamp}] Changes detected at {url}: {change_type}")
     return f"[{timestamp}] {change_type}"

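This call-site change follows directly from the import change above: with `import datetime` the name `datetime` is bound to the module, so the class must be qualified as `datetime.datetime`. Both forms below produce the same timestamp; the commit standardizes on the second:

    # Before: the class was imported directly
    from datetime import datetime
    print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    # After: the module is imported, so the class is accessed through it
    import datetime
    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))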
@@ -143,7 +143,7 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
     # Validate all URLs
     invalid_urls = [url for url in urls if not validate_url(url)]
     if invalid_urls:
-        return gr.FileData(None, "error.zip"), f"Invalid URLs detected: {', '.join(invalid_urls)}"
+        return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)

     scraped_data = []
     screenshots = []
@@ -162,9 +162,12 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         if not url.startswith(('http://', 'https://')):
             url = f'https://{url}'

+        # Sanitize URL for file naming
+        sanitized_url = sanitize_filename(url)
+
         # Check for changes
-        old_html_path = os.path.join(data_dir, f"{url.replace('/', '_')}_html.txt")
-        old_screenshot_path = os.path.join(data_dir, f"{url.replace('/', '_')}_screenshot.png")
+        old_html_path = os.path.join(data_dir, f"{sanitized_url}_html.txt")
+        old_screenshot_path = os.path.join(data_dir, f"{sanitized_url}_screenshot.png")

         # Fetch latest data
         latest_html = get_latest_data(url)
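The switch from url.replace('/', '_') to sanitize_filename matters because '/' is not the only character that breaks file paths: a URL's ':' and '?' are invalid in Windows filenames, for instance. An illustrative comparison (values are hypothetical):

    url = 'https://example.com/page?id=1'

    # Old approach: ':' and '?' survive, which is invalid on Windows
    old_name = url.replace('/', '_')    # 'https:__example.com_page?id=1'

    # New approach: every unsafe run collapses to a single '_'
    new_name = sanitize_filename(url)   # 'https_example.com_page_id=1'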
@@ -196,7 +199,7 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         scraped_data.append({
             'url': url,
             'content': latest_html,
-            'timestamp': datetime.now().isoformat(),
+            'timestamp': datetime.datetime.now().isoformat(),
            'changes_detected': changes_log
         })

@@ -212,7 +215,8 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
     with zipfile.ZipFile(memory_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
         # Add screenshots to ZIP
         for screenshot_url, screenshot_data in screenshots:
-            filename = f"{screenshot_url.split('//')[1].replace('/', '_')}.png"
+            sanitized_screenshot_url = sanitize_filename(screenshot_url)
+            filename = f"{sanitized_screenshot_url}.png"
             zipf.writestr(filename, screenshot_data)

         # Add scraped data and changes log to ZIP
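For context, memory_file (created outside this hunk, presumably an io.BytesIO buffer) lets the archive be built entirely in memory; nothing touches disk until the caller decides what to do with the bytes. A self-contained sketch of the same pattern:

    import io
    import zipfile

    memory_file = io.BytesIO()
    with zipfile.ZipFile(memory_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.writestr('screenshot.png', b'<png bytes>')   # bytes payload
        zipf.writestr('data.json', '{"ok": true}')        # str payload also works
    zip_bytes = memory_file.getvalue()                    # the finished archive as bytes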
@@ -220,7 +224,7 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         data_to_save = {
             'scraped_data': scraped_data,
             'changes_log': changes_log,
-            'timestamp': datetime.now().isoformat()
+            'timestamp': datetime.datetime.now().isoformat()
         }
         zipf.writestr('data.json', json.dumps(data_to_save, indent=2))

@@ -235,8 +239,8 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         'changes_detected': changes_log
     }

-    # Return ZIP file data and display data
-    return gr.FileData(zip_bytes, "results.zip"), json.dumps(display_data, indent=2)
+    # Return ZIP bytes and display data
+    return zip_bytes, json.dumps(display_data, indent=2)

 def create_interface():
     """Create the Gradio interface."""
@@ -283,11 +287,7 @@ def create_interface():
         process_button = gr.Button("Process URLs", variant="primary")

         with gr.Column():
-            screenshot_zip = gr.File(
-                label="Download Results",
-                file_count="single",
-                file_types=[".zip"]
-            )
+            screenshot_zip = gr.File(label="Download Results", file_name="results.zip")
             scraped_data_output = gr.JSON(label="Results Summary")

         process_button.click(
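The diff cuts off inside process_button.click(, but the two values returned by process_urls line up with the two output components declared above. A sketch of the likely wiring, with the inputs list inferred from the process_urls signature (hypothetical, since the actual call is not shown in this hunk):

    process_button.click(
        fn=process_urls,
        inputs=[url_input, bulk_toggle, action_radio, max_urls, crawl_depth],
        outputs=[screenshot_zip, scraped_data_output],
    )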
 