acecalisto3 committed
Commit c276c05 · verified · 1 Parent(s): c5e9b83

Update app.py

Files changed (1): app.py (+284, -62)
app.py CHANGED
@@ -1,8 +1,143 @@
+import gradio as gr
+import requests
+import re
+import logging
+import json
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from PIL import Image
+import io
+import zipfile
+import os
+import tempfile
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+from datetime import datetime
+
+# Configure logging
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(levelname)s - %(message)s')
+
+def validate_url(url):
+    """Validate if the URL is properly formatted."""
+    try:
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])
+    except:
+        return False
+
+def get_latest_data(url):
+    """Get the latest HTML content of a webpage."""
+    try:
+        response = requests.get(url, timeout=10)
+        return response.text
+    except Exception as e:
+        logging.error(f"Error fetching latest data from {url}: {str(e)}")
+        return None
+
+def compare_html(old_html, new_html):
+    """Compare two HTML contents to detect changes."""
+    if not old_html or not new_html:
+        return False
+    return old_html.strip() != new_html.strip()
+
+def compare_screenshot(old_screenshot, new_screenshot):
+    """Compare two screenshots to detect changes."""
+    try:
+        if not old_screenshot or not new_screenshot:
+            return False
+        old_img = Image.open(io.BytesIO(old_screenshot))
+        new_img = Image.open(io.BytesIO(new_screenshot))
+        return not (old_img == new_img)
+    except Exception as e:
+        logging.error(f"Error comparing screenshots: {str(e)}")
+        return False
+
+def alert_changes(url, change_type):
+    """Log detected changes."""
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    logging.warning(f"[{timestamp}] Changes detected at {url}: {change_type}")
+    return f"[{timestamp}] {change_type}"
+
+def extract_links_from_page(url):
+    """Extract all links from a webpage."""
+    try:
+        response = requests.get(url, timeout=10)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        links = [a['href'] for a in soup.find_all('a', href=True)]
+        return links
+    except Exception as e:
+        logging.error(f"Error extracting links from {url}: {str(e)}")
+        return []
+
+def take_screenshot(url):
+    """Take a screenshot of a webpage."""
+    try:
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        chrome_options.add_argument("--window-size=1920,1080")
+
+        driver = webdriver.Chrome(options=chrome_options)
+        driver.get(url)
+
+        screenshot = driver.get_screenshot_as_png()
+        driver.quit()
+
+        image = Image.open(io.BytesIO(screenshot))
+        max_size = (1024, 1024)
+        image.thumbnail(max_size, Image.LANCZOS)
+
+        img_byte_arr = io.BytesIO()
+        image.save(img_byte_arr, format='PNG')
+        return img_byte_arr.getvalue()
+    except Exception as e:
+        logging.error(f"Screenshot error for {url}: {str(e)}")
+        return None
+
+def is_webpage(url):
+    """Check if the URL points to a webpage (HTML)."""
+    try:
+        response = requests.head(url, timeout=10)
+        content_type = response.headers.get('Content-Type', '').lower()
+        return 'text/html' in content_type
+    except Exception as e:
+        logging.error(f"Error checking content type for {url}: {str(e)}")
+        return False
+
+def crawl_url(url, depth, max_depth, visited=None):
+    """Recursively crawl a URL up to a specified depth."""
+    if visited is None:
+        visited = set()
+
+    if depth > max_depth or url in visited:
+        return []
+
+    visited.add(url)
+    screenshots = []
+
+    if is_webpage(url):
+        links = extract_links_from_page(url)
+        screenshot = take_screenshot(url)
+        if screenshot:
+            screenshots.append((url, screenshot))
+
+        if depth < max_depth:
+            for link in links:
+                if not link.startswith(('http://', 'https://')):
+                    link = f"https://{link}"
+                screenshots.extend(crawl_url(link, depth + 1, max_depth, visited))
+    else:
+        logging.info(f"Skipping non-webpage content: {url}")
+
+    return screenshots
+
 def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, progress=gr.Progress()):
     """Process URLs with crawl depth and change detection."""
     # Validate URLs first
     urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
-    urls = [url.strip() for url in urls if url.strip()]  # Remove empty entries
+    urls = [url.strip() for url in urls if url.strip()]
     urls = urls[:int(max_urls)]
 
     # Validate all URLs
@@ -12,55 +147,105 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
 
     scraped_data = []
     screenshots = []
+    changes_log = []
 
     # Initialize progress tracking
     total_urls = len(urls)
+    progress(0)
 
-    # Create memory file for ZIP archive
+    # Directory to store scraped data
+    data_dir = 'scraped_data'
+    os.makedirs(data_dir, exist_ok=True)
+
+    # Process each URL
+    for idx, url in enumerate(urls):
+        if not url.startswith(('http://', 'https://')):
+            url = f'https://{url}'
+
+        # Check for changes
+        old_html_path = os.path.join(data_dir, f"{url.replace('/', '_')}_html.txt")
+        old_screenshot_path = os.path.join(data_dir, f"{url.replace('/', '_')}_screenshot.png")
+
+        # Fetch latest data
+        latest_html = get_latest_data(url)
+        latest_screenshot = take_screenshot(url)
+
+        # Compare with previous data if available
+        if os.path.exists(old_html_path):
+            with open(old_html_path, 'r', encoding='utf-8') as f:
+                old_html = f.read()
+            if compare_html(old_html, latest_html):
+                changes_log.append(alert_changes(url, "HTML content has changed"))
+
+        if os.path.exists(old_screenshot_path):
+            with open(old_screenshot_path, 'rb') as f:
+                old_screenshot = f.read()
+            if latest_screenshot and compare_screenshot(old_screenshot, latest_screenshot):
+                changes_log.append(alert_changes(url, "Visual content has changed"))
+
+        # Store latest data
+        if latest_html:
+            with open(old_html_path, 'w', encoding='utf-8') as f:
+                f.write(latest_html)
+        if latest_screenshot:
+            with open(old_screenshot_path, 'wb') as f:
+                f.write(latest_screenshot)
+
+        # Prepare output data
+        if action_radio in ['Scrape data', 'Both']:
+            scraped_data.append({
+                'url': url,
+                'content': latest_html,
+                'timestamp': datetime.now().isoformat(),
+                'changes_detected': changes_log
+            })
+
+        if action_radio in ['Capture image', 'Both']:
+            crawled_screenshots = crawl_url(url, depth=1, max_depth=int(crawl_depth))
+            screenshots.extend(crawled_screenshots)
+
+        # Update progress
+        progress((idx + 1) / total_urls)
+
+    # Create ZIP file in memory
     memory_file = io.BytesIO()
     with zipfile.ZipFile(memory_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
-        for idx, url in enumerate(urls):
-            # Update progress
-            progress((idx + 1) / total_urls)  # Remove label argument
-
-            if not url.startswith(('http://', 'https://')):
-                url = f'https://{url}'
-
-            if action_radio in ['Scrape data', 'Both']:
-                try:
-                    response = requests.get(url, timeout=10)
-                    scraped_data.append({url: response.text})
-                except Exception as e:
-                    logging.error(f"Scraping error for {url}: {str(e)}")
-                    scraped_data.append({url: f"Error: {str(e)}"})
-
-            if action_radio in ['Capture image', 'Both']:
-                # Crawl the URL up to the specified depth
-                screenshots = crawl_url(url, 1, int(crawl_depth))
-                for screenshot_url, screenshot in screenshots:
-                    # Save the screenshot to a temporary file
-                    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
-                        temp_file.write(screenshot)
-                        temp_file_path = temp_file.name
-
-                    # Add the temporary file to the ZIP archive
-                    filename = f"screenshot_{idx}_{screenshot_url.split('//')[1].replace('/', '_')}.png"
-                    zipf.write(temp_file_path, filename)
-
-                    # Clean up the temporary file
-                    os.unlink(temp_file_path)
+        # Add screenshots to ZIP
+        for screenshot_url, screenshot_data in screenshots:
+            filename = f"{screenshot_url.split('//')[1].replace('/', '_')}.png"
+            zipf.writestr(filename, screenshot_data)
+
+        # Add scraped data and changes log to ZIP
+        if scraped_data:
+            data_to_save = {
+                'scraped_data': scraped_data,
+                'changes_log': changes_log,
+                'timestamp': datetime.now().isoformat()
+            }
+            zipf.writestr('data.json', json.dumps(data_to_save, indent=2))
 
     # Prepare return values
     memory_file.seek(0)
     zip_bytes = memory_file.getvalue()
-    scraped_data_json = json.dumps(scraped_data, indent=2)
 
-    return zip_bytes, scraped_data_json
+    # Prepare display data
+    display_data = {
+        'scraped_urls': len(scraped_data),
+        'screenshots_taken': len(screenshots),
+        'changes_detected': changes_log
+    }
+
+    return zip_bytes, json.dumps(display_data, indent=2)
 
 def create_interface():
     """Create the Gradio interface."""
-    with gr.Blocks() as demo:
-        gr.Markdown("<h1 style='text-align: center; color: white;'>Smart Scraper with Change Detection</h1>")
+    with gr.Blocks(theme=gr.themes.Soft()) as demo:
+        gr.Markdown(
+            """
+            # Smart Web Scraper with Change Detection
+            Monitor and capture changes in web content automatically.
+            """
+        )
 
         with gr.Tabs():
            with gr.Tab("URL Scrape/Screenshot"):
@@ -69,38 +254,75 @@ def create_interface():
                    value="https://example.com",
                    placeholder="Enter single URL or multiple URLs separated by commas"
                )
-                bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
-                action_radio = gr.Radio(
-                    ["Scrape data", "Capture image", "Both"],
-                    label="Select Action",
-                    value="Both"
-                )
-                max_urls = gr.Slider(
-                    minimum=1,
-                    maximum=20,
-                    value=5,
-                    step=1,
-                    label="Max URLs to process"
-                )
-                crawl_depth = gr.Slider(
-                    minimum=1,
-                    maximum=3,
-                    value=1,
-                    step=1,
-                    label="Crawl Depth"
-                )
-                screenshot_zip = gr.File(label="Download Screenshots", file_name='screenshots.zip')
-                scraped_data_output = gr.Textbox(label="Scraped Data")
 
-                process_button = gr.Button("Process URLs")
+                with gr.Row():
+                    bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
+                    action_radio = gr.Radio(
+                        ["Scrape data", "Capture image", "Both"],
+                        label="Select Action",
+                        value="Both"
+                    )
+
+                with gr.Row():
+                    max_urls = gr.Slider(
+                        minimum=1,
+                        maximum=20,
+                        value=5,
+                        step=1,
+                        label="Max URLs to process"
+                    )
+                    crawl_depth = gr.Slider(
+                        minimum=1,
+                        maximum=3,
+                        value=1,
+                        step=1,
+                        label="Crawl Depth"
+                    )
+
+                process_button = gr.Button("Process URLs", variant="primary")
+
+                with gr.Column():
+                    screenshot_zip = gr.File(
+                        label="Download Results",
+                        file_count="single",
+                        file_types=[".zip"]
+                    )
+                    scraped_data_output = gr.JSON(label="Results Summary")
+
                process_button.click(
                    fn=process_urls,
-                    inputs=[url_input, bulk_toggle, action_radio, max_urls, crawl_depth],
-                    outputs=[screenshot_zip, scraped_data_output]
+                    inputs=[
+                        url_input,
+                        bulk_toggle,
+                        action_radio,
+                        max_urls,
+                        crawl_depth
+                    ],
+                    outputs=[
+                        screenshot_zip,
+                        scraped_data_output
+                    ],
+                    show_progress=True
                )
+
+                gr.Markdown(
+                    """
+                    ### Features
+                    - Bulk URL processing
+                    - Screenshot capture
+                    - Content change detection
+                    - Recursive crawling
+                    - Automatic data storage
+                    """
+                )
 
     return demo
 
 if __name__ == "__main__":
     demo = create_interface()
-    demo.launch()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7861,
+        share=True,
+        debug=True
+    )
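
A minimal sketch (not part of this commit) of how the newly added change-detection helpers compose when app.py is imported as a module. It assumes the file is saved as app.py with its dependencies installed (gradio, requests, beautifulsoup4, Pillow, selenium), and uses https://example.com purely as an illustrative URL.

# Sketch: detect a change between two fetches of the same page
from app import validate_url, get_latest_data, compare_html, alert_changes

url = "https://example.com"
if validate_url(url):
    old_html = get_latest_data(url)   # first snapshot
    new_html = get_latest_data(url)   # fetch again later
    if compare_html(old_html, new_html):
        # alert_changes logs a warning and returns a timestamped message
        print(alert_changes(url, "HTML content has changed"))
    else:
        print("No change detected")

Since compare_html is a plain string comparison after strip(), any dynamic markup (timestamps, session tokens, rotating ads) will register as a change.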