acecalisto3 committed
Commit 32572e6 (verified)
1 Parent(s): d4a8ad2

Update app.py

Files changed (1):
  1. app.py +628 -336
app.py CHANGED
@@ -3,30 +3,538 @@ import requests
3
  import re
4
  import logging
5
  import json
6
  from bs4 import BeautifulSoup
7
- from selenium import webdriver
8
- from selenium.webdriver.chrome.options import Options
9
- from PIL import Image
10
  import io
11
  import zipfile
12
  import os
13
- import datetime
14
- from urllib.parse import urlparse
15
  import tempfile
16
  import nltk
17
  try:
18
- nltk.download('punkt')
19
- nltk.download('stopwords')
20
- nltk.download('wordnet')
21
- nltk.download('averaged_perceptron_tagger')
22
  except Exception as e:
23
  logging.error(f"Error downloading NLTK data: {str(e)}")
24
 
25
- # Configure logging
26
- logging.basicConfig(level=logging.INFO,
27
- format='%(asctime)s - %(levelname)s - %(message)s')
28
 
29
  def sanitize_filename(filename):
30
  return re.sub(r'[<>:"/\\|?*\n]+', '_', filename)
31
 
32
  def validate_url(url):
@@ -34,53 +542,22 @@ def validate_url(url):
34
  try:
35
  result = urlparse(url)
36
  return all([result.scheme, result.netloc])
37
- except:
38
  return False
39
 
40
  def get_latest_data(url):
41
  """Get the latest HTML content of a webpage."""
42
  try:
43
- response = requests.get(url, timeout=10)
44
  return response.text
45
- except Exception as e:
46
  logging.error(f"Error fetching latest data from {url}: {str(e)}")
47
  return None
48
 
49
- def compare_html(old_html, new_html):
50
- """Compare two HTML contents to detect changes."""
51
- if not old_html or not new_html:
52
- return False
53
- return old_html.strip() != new_html.strip()
54
-
55
- def compare_screenshot(old_screenshot, new_screenshot):
56
- """Compare two screenshots to detect changes."""
57
- try:
58
- if not old_screenshot or not new_screenshot:
59
- return False
60
- old_img = Image.open(io.BytesIO(old_screenshot))
61
- new_img = Image.open(io.BytesIO(new_screenshot))
62
- return not (old_img == new_img)
63
- except Exception as e:
64
- logging.error(f"Error comparing screenshots: {str(e)}")
65
- return False
66
-
67
- def alert_changes(url, change_type):
68
- """Log detected changes."""
69
- timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
70
- logging.warning(f"[{timestamp}] Changes detected at {url}: {change_type}")
71
- return f"[{timestamp}] {change_type}"
72
-
73
- def extract_links_from_page(url):
74
- """Extract all links from a webpage."""
75
- try:
76
- response = requests.get(url, timeout=10)
77
- soup = BeautifulSoup(response.text, 'html.parser')
78
- links = [a['href'] for a in soup.find_all('a', href=True)]
79
- return links
80
- except Exception as e:
81
- logging.error(f"Error extracting links from {url}: {str(e)}")
82
- return []
83
-
84
  def take_screenshot(url):
85
  """Take a screenshot of a webpage."""
86
  try:
@@ -89,17 +566,17 @@ def take_screenshot(url):
89
  chrome_options.add_argument("--no-sandbox")
90
  chrome_options.add_argument("--disable-dev-shm-usage")
91
  chrome_options.add_argument("--window-size=1920,1080")
92
-
93
  driver = webdriver.Chrome(options=chrome_options)
94
  driver.get(url)
95
-
96
  screenshot = driver.get_screenshot_as_png()
97
  driver.quit()
98
-
99
  image = Image.open(io.BytesIO(screenshot))
100
  max_size = (1024, 1024)
101
  image.thumbnail(max_size, Image.LANCZOS)
102
-
103
  img_byte_arr = io.BytesIO()
104
  image.save(img_byte_arr, format='PNG')
105
  return img_byte_arr.getvalue()
@@ -107,285 +584,88 @@ def take_screenshot(url):
107
  logging.error(f"Screenshot error for {url}: {str(e)}")
108
  return None
109
 
110
- def is_webpage(url):
111
- """Check if the URL points to a webpage (HTML)."""
112
  try:
113
- response = requests.head(url, timeout=10)
114
- content_type = response.headers.get('Content-Type', '').lower()
115
- return 'text/html' in content_type
116
- except Exception as e:
117
- logging.error(f"Error checking content type for {url}: {str(e)}")
118
- return False
119
 
120
- def crawl_url(url, depth, max_depth, visited=None):
121
- """Recursively crawl a URL up to a specified depth."""
122
- if visited is None:
123
- visited = set()
124
-
125
- if depth > max_depth or url in visited:
126
- return []
127
-
128
- visited.add(url)
129
- screenshots = []
130
-
131
- if is_webpage(url):
132
- links = extract_links_from_page(url)
133
- screenshot = take_screenshot(url)
134
- if screenshot:
135
- screenshots.append((url, screenshot))
136
-
137
- if depth < max_depth:
138
- for link in links:
139
- if not link.startswith(('http://', 'https://')):
140
- link = f"https://{link}"
141
- screenshots.extend(crawl_url(link, depth + 1, max_depth, visited))
142
- else:
143
- logging.info(f"Skipping non-webpage content: {url}")
144
-
145
- return screenshots
146
 
147
- def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, mode='standard', progress=gr.Progress()):
148
- """Process URLs with crawl depth and change detection."""
149
- # Validate URLs first
150
- urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
151
- urls = [url.strip() for url in urls if url.strip()]
152
- urls = urls[:int(max_urls)]
153
-
154
- # Validate all URLs
155
- invalid_urls = [url for url in urls if not validate_url(url)]
156
- if invalid_urls:
157
- if mode == 'chat':
158
- return f"Invalid URLs detected: {', '.join(invalid_urls)}"
159
- else:
160
- return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)
161
-
162
- scraped_data = []
163
- screenshots = []
164
- changes_log = []
165
-
166
- # Initialize progress tracking
167
- total_urls = len(urls)
168
- progress(0)
169
-
170
- # Directory to store scraped data
171
- data_dir = 'scraped_data'
172
- os.makedirs(data_dir, exist_ok=True)
173
-
174
- # Process each URL
175
- for idx, url in enumerate(urls):
176
- if not url.startswith(('http://', 'https://')):
177
- url = f'https://{url}'
178
-
179
- # Sanitize URL for file naming
180
- sanitized_url = sanitize_filename(url)
181
-
182
- # Check for changes
183
- old_html_path = os.path.join(data_dir, f"{sanitized_url}_html.txt")
184
- old_screenshot_path = os.path.join(data_dir, f"{sanitized_url}_screenshot.png")
185
-
186
- # Fetch latest data
187
- latest_html = get_latest_data(url)
188
- latest_screenshot = take_screenshot(url)
189
-
190
- # Compare with previous data if available
191
- if os.path.exists(old_html_path):
192
- with open(old_html_path, 'r', encoding='utf-8') as f:
193
- old_html = f.read()
194
- if compare_html(old_html, latest_html):
195
- changes_log.append(alert_changes(url, "HTML content has changed"))
196
-
197
- if os.path.exists(old_screenshot_path):
198
- with open(old_screenshot_path, 'rb') as f:
199
- old_screenshot = f.read()
200
- if latest_screenshot and compare_screenshot(old_screenshot, latest_screenshot):
201
- changes_log.append(alert_changes(url, "Visual content has changed"))
202
-
203
- # Store latest data
204
- if latest_html:
205
- with open(old_html_path, 'w', encoding='utf-8') as f:
206
- f.write(latest_html)
207
- if latest_screenshot:
208
- with open(old_screenshot_path, 'wb') as f:
209
- f.write(latest_screenshot)
210
-
211
- # Prepare output data
212
- if action_radio in ['Scrape data', 'Both']:
213
- scraped_data.append({
214
- 'url': url,
215
- 'content': latest_html, # Include full HTML content
216
- 'timestamp': datetime.datetime.now().isoformat(),
217
- 'changes_detected': changes_log
218
- })
219
-
220
- if action_radio in ['Capture image', 'Both']:
221
- crawled_screenshots = crawl_url(url, depth=1, max_depth=int(crawl_depth))
222
- screenshots.extend(crawled_screenshots)
223
-
224
- # Update progress
225
- progress((idx + 1) / total_urls)
226
-
227
- if mode == 'chat':
228
- return "\n".join(changes_log)
229
- else:
230
- # Create a temporary file to store the ZIP
231
- with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_file:
232
- with zipfile.ZipFile(tmp_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
233
- # Add screenshots to ZIP
234
- for screenshot_url, screenshot_data in screenshots:
235
- sanitized_screenshot_url = sanitize_filename(screenshot_url)
236
- filename = f"{sanitized_screenshot_url}.png"
237
- zipf.writestr(filename, screenshot_data)
238
-
239
- # Add scraped data and changes log to ZIP
240
- if scraped_data:
241
- data_to_save = {
242
- 'scraped_data': scraped_data,
243
- 'changes_log': changes_log,
244
- 'timestamp': datetime.datetime.now().isoformat()
245
- }
246
- zipf.writestr('data.json', json.dumps(data_to_save, indent=2))
247
 
248
- # Get the path to the temporary file
249
- zip_file_path = tmp_file.name
250
-
251
- # Prepare display data
252
- display_data = {
253
- 'total_scraped_urls': len(scraped_data),
254
- 'total_screenshots_taken': len(screenshots),
255
- 'changes_detected': changes_log,
256
- 'scraped_data': scraped_data # Include full scraped data
257
- }
258
-
259
- # Return the path to the temporary ZIP file and display data
260
- return zip_file_path, json.dumps(display_data, indent=2)
261
 
262
- def recognize_intent(instruction: str) -> str:
263
- instruction = instruction.lower()
264
-
265
- # General patterns for actions and data types
266
- action_patterns = {
267
- r'\b(find|extract|scrape)\s+(links|images|videos|texts|prices|product names|reviews)\b': 'extract_data',
268
- r'\b(count)\s+(links|images|videos|products)\b': 'count_data',
269
- r'\b(what is|get|fetch)\s+(channel name|subscriber count|viewers)\b': 'fetch_specific_data',
270
- r'\b(monitor)\s+changes\b': 'monitor_changes',
271
- }
272
-
273
- for pattern, intent in action_patterns.items():
274
- if re.search(pattern, instruction):
275
- return intent
276
- return "unknown"
277
-
278
- def extract_data_type(instruction: str) -> str:
279
- instruction = instruction.lower()
280
- data_types = {
281
- r'\b(links|images|videos|texts|prices|product names|reviews)\b': 'links',
282
- r'\b(links|images|videos|products)\b': 'images',
283
- r'\b(channel name|subscriber count|viewers)\b': 'channel name',
284
- }
285
- for pattern, data_type in data_types.items():
286
- if re.search(pattern, instruction):
287
- return data_type
288
- return "unknown"
289
-
290
- def format_output(data, output_format):
291
- if output_format == "JSON":
292
- return json.dumps(data, indent=2)
293
- elif output_format == "Cleaned JSON":
294
- # Implement data cleaning logic here
295
- return json.dumps(data, indent=2)
296
- else:
297
- return str(data)
298
-
299
- def generate_command(intent: str, url_input: str, data_type: str, output_format: str) -> str:
300
- if intent == "extract_data":
301
- data = extract_data(url_input, data_type)
302
- return format_output(data, output_format)
303
- elif intent == "count_data":
304
- count = count_data(url_input, data_type)
305
- return f"The number of {data_type} is {count}."
306
- elif intent == "fetch_specific_data":
307
- specific_data = fetch_specific_data(url_input, data_type)
308
- return specific_data
309
- elif intent == "monitor_changes":
310
- changes_log = monitor_changes(url_input)
311
- return changes_log
312
- else:
313
- return "Instruction not recognized. Please try again."
314
-
315
- def extract_data(url, data_type):
316
- try:
317
- response = requests.get(url)
318
- soup = BeautifulSoup(response.text, 'html.parser')
319
-
320
- if data_type == "links":
321
- return [a['href'] for a in soup.find_all('a', href=True)]
322
- elif data_type == "images":
323
- return [img['src'] for img in soup.find_all('img', src=True)]
324
- # Add more data types as needed
325
- else:
326
- return []
327
- except Exception as e:
328
- return f"Error extracting {data_type}: {str(e)}"
329
 
330
- def count_data(url, data_type):
331
- try:
332
- response = requests.get(url)
333
- soup = BeautifulSoup(response.text, 'html.parser')
334
-
335
- if data_type == "links":
336
- return len(soup.find_all('a', href=True))
337
- elif data_type == "images":
338
- return len(soup.find_all('img', src=True))
339
- # Add more data types as needed
340
- else:
341
- return 0
342
- except Exception as e:
343
- return f"Error counting {data_type}: {str(e)}"
344
 
345
- def fetch_specific_data(url, data_type):
346
- try:
347
- # Implement specific data fetching logic here
348
- # For demonstration, return a placeholder
349
- return f"Fetched {data_type} from {url}"
350
- except Exception as e:
351
- return f"Error fetching {data_type}: {str(e)}"
352
 
353
- def monitor_changes(url_input):
354
- try:
355
- # Implement change monitoring logic here
356
- # For demonstration, return a placeholder
357
- return f"Changes monitored for {url_input}"
358
  except Exception as e:
359
- return f"Error monitoring changes: {str(e)}"
 
360
 
361
- def chat_based_scrape(instruction, url_input, output_format):
362
- # Recognize intent and extract data type if applicable
363
- intent = recognize_intent(instruction)
364
- data_type = extract_data_type(instruction)
365
-
366
- # Generate command based on the recognized intent
367
- command_output = generate_command(intent, url_input, data_type, output_format)
368
-
369
- return command_output
370
 
371
  def create_interface():
372
  """Create the Gradio interface."""
373
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
374
  gr.Markdown(
375
  """
376
- # Smart Web Scraper with Change Detection
377
- Monitor and capture changes in web content automatically.
378
  """
379
  )
380
-
381
  with gr.Tabs():
382
  with gr.Tab("URL Scrape/Screenshot"):
383
  url_input = gr.Textbox(
384
  label="Enter URL(s)",
385
- value="https://example.com",
386
  placeholder="Enter single URL or multiple URLs separated by commas"
387
  )
388
-
389
  with gr.Row():
390
  bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
391
  action_radio = gr.Radio(
@@ -393,29 +673,41 @@ def create_interface():
393
  label="Select Action",
394
  value="Both"
395
  )
396
-
397
  with gr.Row():
398
  max_urls = gr.Slider(
399
  minimum=1,
400
- maximum=1000,
401
  value=5,
402
  step=1,
403
  label="Max URLs to process"
404
  )
405
  crawl_depth = gr.Slider(
406
- minimum=1,
407
- maximum=5,
408
  value=1,
409
  step=1,
410
- label="Crawl Depth"
411
  )
412
-
413
  process_button = gr.Button("Process URLs", variant="primary")
414
-
415
  with gr.Column():
416
- screenshot_zip = gr.File(label="Download Results")
417
  scraped_data_output = gr.JSON(label="Results Summary")
418
-
419
  process_button.click(
420
  fn=process_urls,
421
  inputs=[
@@ -426,37 +718,37 @@ def create_interface():
426
  crawl_depth
427
  ],
428
  outputs=[
429
- screenshot_zip,
 
430
  scraped_data_output
431
  ],
432
  show_progress=True
433
- )
434
-
435
  with gr.Tab("Chat-Based Scrape"):
436
  instruction = gr.Textbox(
437
  label="Enter Instruction",
438
  placeholder="e.g., 'Scrape all links' or 'Extract all images'"
439
  )
440
- url_input = gr.Textbox(
441
  label="Enter URL",
442
  value="https://example.com",
443
  placeholder="Enter the target URL"
444
  )
445
  output_format = gr.Radio(
446
- ["JSON", "Cleaned JSON", "Raw Data"],
447
  label="Output Format",
448
- value="JSON"
449
  )
450
- output = gr.Textbox(label="Output")
451
-
452
  chat_button = gr.Button("Execute Instruction", variant="primary")
453
-
454
- chat_button.click(
455
- fn=chat_based_scrape,
456
- inputs=[instruction, url_input, output_format],
457
- outputs=output
458
  )
459
-
460
  gr.Markdown(
461
  """
462
  ### Features
@@ -464,12 +756,12 @@ def create_interface():
464
  - Screenshot capture
465
  - Content change detection
466
  - Recursive crawling
467
- - Chat-based instructions
468
  """
469
  )
470
-
471
  return demo
472
 
473
  if __name__ == "__main__":
474
- demo = create_interface() # Call the function to create the interface
475
- demo.launch() # Launch the Gradio app

app.py (after change: the new file's added and unchanged lines follow)
3
  import re
4
  import logging
5
  import json
6
+ from typing import Tuple, List, Dict, Union, Optional
7
  from bs4 import BeautifulSoup
8
+ from urllib.parse import urlparse, urljoin
9
+ from nltk import word_tokenize
10
+ from nltk.corpus import stopwords
11
+ from nltk.stem import WordNetLemmatizer
12
+ from datetime import datetime
13
  import io
14
  import zipfile
15
  import os
16
  import tempfile
17
+ from selenium import webdriver
18
+ from selenium.webdriver.chrome.service import Service
19
+ from selenium.webdriver.chrome.options import Options
20
+ from PIL import Image
21
+ import base64
22
+ import asyncio
23
+ import yaml
24
+ from pathlib import Path
25
+ from tqdm import tqdm
26
+ import plotly.graph_objects as go
27
+
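Two names used later in this commit do not appear in the import block above (gradio itself, used as gr throughout, is presumably imported on the file's unchanged first lines, since this hunk starts at line 3). Assuming they are not imported elsewhere, something like the following would be needed; a sketch, not part of the commit:

import time  # Scraper.take_screenshot calls time.sleep(2)
from webdriver_manager.chrome import ChromeDriverManager  # WebDriverManager.get_driver calls ChromeDriverManager().install()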
28
+ # Configure detailed logging
29
+ logging.basicConfig(
30
+ level=logging.DEBUG,
31
+ format='%(asctime)s - %(levelname)s - %(message)s',
32
+ handlers=[
33
+ logging.FileHandler('webscraper.log'),
34
+ logging.StreamHandler()
35
+ ]
36
+ )
37
+
38
+ # Download necessary NLTK data
39
  import nltk
40
  try:
41
+ nltk.download('punkt', quiet=True)
42
+ nltk.download('stopwords', quiet=True)
43
+ nltk.download('wordnet', quiet=True)
44
+ nltk.download('averaged_perceptron_tagger', quiet=True)
45
  except Exception as e:
46
  logging.error(f"Error downloading NLTK data: {str(e)}")
47
 
48
+ # Configuration and logging setup
49
+ class Config:
50
+ DATA_DIR = Path('scraped_data')
51
+ LOGS_DIR = Path('logs')
52
+ MAX_RETRIES = 3
53
+ TIMEOUT = 30
54
+
55
+ @classmethod
56
+ def initialize(cls):
57
+ """Initialize necessary directories and configurations"""
58
+ cls.DATA_DIR.mkdir(exist_ok=True)
59
+ cls.LOGS_DIR.mkdir(exist_ok=True)
60
+
61
+ # Setup logging
62
+ logging.basicConfig(
63
+ level=logging.INFO,
64
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
65
+ handlers=[
66
+ logging.FileHandler(cls.LOGS_DIR / 'app.log'),
67
+ logging.StreamHandler()
68
+ ]
69
+ )
70
+
71
+ return logging.getLogger(__name__)
72
+
73
+ logger = Config.initialize()
74
+
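A small behavioral note on the block above: logging.basicConfig was already called at module import time (the "Configure detailed logging" block), and basicConfig is a no-op once the root logger has handlers, so the second call inside Config.initialize will not switch logging over to logs/app.log unless force=True (Python 3.8+) is passed.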
75
+ class WebDriverManager:
76
+ """Manage WebDriver instances"""
77
+ @staticmethod
78
+ def get_driver() -> webdriver.Chrome:
79
+ options = Options()
80
+ options.add_argument('--headless')
81
+ options.add_argument('--no-sandbox')
82
+ options.add_argument('--disable-dev-shm-usage')
83
+ options.add_argument('--window-size=1920,1080')
84
+
85
+ return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
86
+
87
+ class DataExtractor:
88
+ """Extract and process webpage content"""
89
+ def __init__(self):
90
+ self.soup = None
91
+ self.url = None
92
+ self.logger = logging.getLogger(__name__)
93
+
94
+ def set_page(self, html: str, url: str):
95
+ """Set the page content for extraction"""
96
+ self.soup = BeautifulSoup(html, 'html.parser')
97
+ self.url = url
98
+
99
+ def extract_images(self) -> List[Dict]:
100
+ """Extract image information from the page"""
101
+ images = []
102
+ try:
103
+ for img in self.soup.find_all('img'):
104
+ image_info = {
105
+ 'src': urljoin(self.url, img.get('src', '')),
106
+ 'alt': img.get('alt', ''),
107
+ 'title': img.get('title', ''),
108
+ 'dimensions': self._get_image_dimensions(img),
109
+ 'file_type': self._get_file_type(img.get('src', ''))
110
+ }
111
+ images.append(image_info)
112
+ except Exception as e:
113
+ self.logger.error(f"Error extracting images: {str(e)}")
114
+ return images
115
+
116
+ def extract_links(self) -> List[Dict]:
117
+ """Extract link information from the page"""
118
+ links = []
119
+ try:
120
+ for a in self.soup.find_all('a', href=True):
121
+ absolute_url = urljoin(self.url, a.get('href', ''))
122
+ link_info = {
123
+ 'href': absolute_url,
124
+ 'text': a.get_text(strip=True),
125
+ 'title': a.get('title', ''),
126
+ 'type': 'internal' if self.url in absolute_url else 'external',
127
+ 'has_image': bool(a.find('img'))
128
+ }
129
+ links.append(link_info)
130
+ except Exception as e:
131
+ self.logger.error(f"Error extracting links: {str(e)}")
132
+ return links
133
+
134
+ def extract_text(self) -> List[Dict]:
135
+ """Extract text content from the page"""
136
+ texts = []
137
+ try:
138
+ for text_element in self.soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
139
+ text_info = {
140
+ 'content': text_element.get_text(strip=True),
141
+ 'source': text_element.name
142
+ }
143
+ if text_info['content']: # Only add non-empty text blocks
144
+ texts.append(text_info)
145
+ except Exception as e:
146
+ self.logger.error(f"Error extracting text: {str(e)}")
147
+ return texts
148
+
149
+ def _get_image_dimensions(self, img_tag) -> str:
150
+ """Get image dimensions from tag attributes"""
151
+ width = img_tag.get('width', '')
152
+ height = img_tag.get('height', '')
153
+ if width and height:
154
+ return f"{width}x{height}"
155
+ return "unknown"
156
+
157
+ def _get_file_type(self, src: str) -> str:
158
+ """Determine image file type from URL"""
159
+ if not src:
160
+ return "unknown"
161
+ ext = src.split('.')[-1].lower()
162
+ return ext if ext in ['jpg', 'jpeg', 'png', 'gif', 'webp'] else "unknown"
163
+
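A minimal usage sketch of the DataExtractor added above, run against a made-up HTML snippet and URL (illustrative values, not output captured from the app):

extractor = DataExtractor()
html = '<p>Hello</p><a href="/about"><img src="logo.png" alt="Logo" width="32" height="32"></a>'
extractor.set_page(html, 'https://example.com')
extractor.extract_links()
# [{'href': 'https://example.com/about', 'text': '', 'title': '', 'type': 'internal', 'has_image': True}]
extractor.extract_images()
# [{'src': 'https://example.com/logo.png', 'alt': 'Logo', 'title': '', 'dimensions': '32x32', 'file_type': 'png'}]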
164
+ class QueryAnalyzer:
165
+ """Analyze natural language queries"""
166
+ def __init__(self):
167
+ self.logger = logging.getLogger(__name__)
168
+ self.stop_words = set(stopwords.words('english'))
169
+ self.lemmatizer = WordNetLemmatizer()
170
+ self.logger.info("QueryAnalyzer initialized")
171
+
172
+ def parse_query(self, query: str) -> Dict[str, Union[str, int]]:
173
+ try:
174
+ self.logger.info(f"Parsing query: {query}")
175
+ tokens = word_tokenize(query.lower())
176
+ filtered_tokens = [self.lemmatizer.lemmatize(token) for token in tokens
177
+ if token.isalnum() and token not in self.stop_words]
178
+
179
+ return {
180
+ 'target': self._identify_target(filtered_tokens),
181
+ 'limit': self._identify_limit(filtered_tokens),
182
+ 'filters': self._identify_filters(filtered_tokens),
183
+ 'output': 'JSON' if 'json' in query.lower() else 'Formatted Text'
184
+ }
185
+ except Exception as e:
186
+ self.logger.error(f"Error parsing query: {str(e)}")
187
+ return {'target': 'unknown', 'limit': 0, 'filters': {}}
188
+
189
+ def _identify_target(self, tokens: List[str]) -> str:
190
+ target_map = {
191
+ 'image': 'image',
192
+ 'images': 'image',
193
+ 'picture': 'image',
194
+ 'link': 'link',
195
+ 'links': 'link',
196
+ 'text': 'text',
197
+ 'content': 'text'
198
+ }
199
+ for token in tokens:
200
+ if token in target_map:
201
+ return target_map[token]
202
+ return 'unknown'
203
+
204
+ def _identify_limit(self, tokens: List[str]) -> int:
205
+ for token in tokens:
206
+ if token.isdigit():
207
+ return int(token)
208
+ return 0
209
+
210
+ def _identify_filters(self, tokens: List[str]) -> Dict[str, str]:
211
+ filters = {}
212
+ if 'external' in tokens:
213
+ filters['link_type'] = 'external'
214
+ elif 'internal' in tokens:
215
+ filters['link_type'] = 'internal'
216
+ if 'png' in tokens:
217
+ filters['file_type'] = 'png'
218
+ elif 'jpg' in tokens or 'jpeg' in tokens:
219
+ filters['file_type'] = 'jpg'
220
+ return filters
221
+
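For illustration, parse_query on a short instruction would resolve roughly as follows, assuming the NLTK punkt/stopwords/wordnet data downloaded above is available (a sketch, not captured output):

analyzer = QueryAnalyzer()
analyzer.parse_query("find 5 external links as json")
# {'target': 'link', 'limit': 5, 'filters': {'link_type': 'external'}, 'output': 'JSON'}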
222
+ class ResponseFormatter:
223
+ """Format scraped data based on user preferences"""
224
+ def __init__(self):
225
+ self.logger = logging.getLogger(__name__)
226
+
227
+ def format_data(self, data: List[Dict], query_info: Dict) -> str:
228
+ try:
229
+ if not data:
230
+ return "No data found for the specified query."
231
+
232
+ # Apply filters
233
+ filtered_data = self._apply_filters(data, query_info.get('filters', {}))
234
+
235
+ # Apply limit
236
+ if query_info.get('limit', 0) > 0:
237
+ filtered_data = filtered_data[:query_info['limit']]
238
+
239
+ if query_info['output'] == "JSON":
240
+ return json.dumps({
241
+ "metadata": {
242
+ "query": query_info,
243
+ "timestamp": datetime.now().isoformat(),
244
+ "results_count": len(filtered_data)
245
+ },
246
+ "results": filtered_data
247
+ }, indent=2)
248
+
249
+ return self._format_human_readable(filtered_data, query_info['target'])
250
+
251
+ except Exception as e:
252
+ self.logger.error(f"Formatting error: {str(e)}")
253
+ return f"Error formatting results: {str(e)}"
254
+
255
+ def _apply_filters(self, data: List[Dict], filters: Dict) -> List[Dict]:
256
+ filtered_data = data
257
+ if 'link_type' in filters:
258
+ filtered_data = [item for item in filtered_data
259
+ if item.get('type', '') == filters['link_type']]
260
+ if 'file_type' in filters:
261
+ filtered_data = [item for item in filtered_data
262
+ if item.get('file_type', '').lower() == filters['file_type']]
263
+ return filtered_data
264
+
265
+ def _format_human_readable(self, data: List[Dict], target: str) -> str:
266
+ formats = {
267
+ 'image': self._format_images,
268
+ 'link': self._format_links,
269
+ 'text': self._format_texts
270
+ }
271
+ return formats.get(target, lambda x: "Unknown data type")(data)
272
+
273
+ def _format_images(self, images: List[Dict]) -> str:
274
+ return "\n\n".join(
275
+ f"Image {idx+1}:\n"
276
+ f"Source: {img['src']}\n"
277
+ f"Alt Text: {img['alt']}\n"
278
+ f"Dimensions: {img['dimensions']}\n"
279
+ f"Type: {img['file_type']}"
280
+ for idx, img in enumerate(images)
281
+ )
282
+
283
+ def _format_links(self, links: List[Dict]) -> str:
284
+ return "\n\n".join(
285
+ f"Link {idx+1}:\n"
286
+ f"URL: {link['href']}\n"
287
+ f"Text: {link['text']}\n"
288
+ f"Type: {link['type']}\n"
289
+ f"Contains Image: {'Yes' if link['has_image'] else 'No'}"
290
+ for idx, link in enumerate(links)
291
+ )
292
+
293
+ def _format_texts(self, texts: List[Dict]) -> str:
294
+ return "\n\n".join(
295
+ f"Text Block {idx+1} ({text['source'].upper()}):\n"
296
+ f"{text['content']}"
297
+ for idx, text in enumerate(texts)
298
+ )
299
+
300
+ class Scraper:
301
+ """Core scraping functionality with improved error handling"""
302
+ def __init__(self):
303
+ self.logger = logging.getLogger(__name__)
304
+ self.session = requests.Session()
305
+ self.session.headers.update({
306
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
307
+ })
308
+
309
+ async def fetch_page(self, url: str) -> Optional[str]:
310
+ """Fetch page content with retry mechanism"""
311
+ for attempt in range(Config.MAX_RETRIES):
312
+ try:
313
+ response = self.session.get(url, timeout=Config.TIMEOUT)
314
+ response.raise_for_status()
315
+ return response.text
316
+ except Exception as e:
317
+ self.logger.error(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
318
+ if attempt == Config.MAX_RETRIES - 1:
319
+ return None
320
+
321
+ async def take_screenshot(self, url: str) -> Optional[bytes]:
322
+ """Take a screenshot of a webpage with improved error handling."""
323
+ driver = None
324
+ try:
325
+ options = Options()
326
+ options.add_argument("--headless")
327
+ options.add_argument("--no-sandbox")
328
+ options.add_argument("--disable-dev-shm-usage")
329
+ options.add_argument("--window-size=1920,1080")
330
+
331
+ driver = webdriver.Chrome(options=options)
332
+ driver.get(url)
333
+
334
+ # Wait for page load
335
+ time.sleep(2)
336
+
337
+ # Take screenshot
338
+ screenshot = driver.get_screenshot_as_png()
339
+
340
+ # Process image
341
+ img = Image.open(io.BytesIO(screenshot))
342
+ img = img.convert('RGB') # Convert to RGB to ensure compatibility
343
+
344
+ # Save to bytes
345
+ img_byte_arr = io.BytesIO()
346
+ img.save(img_byte_arr, format='PNG', optimize=True)
347
+ return img_byte_arr.getvalue()
348
+
349
+ except Exception as e:
350
+ logging.error(f"Screenshot error for {url}: {str(e)}")
351
+ return None
352
+ finally:
353
+ if driver:
354
+ driver.quit()
355
+
356
+ class SmartWebScraper:
357
+ """Smart web scraping with natural language processing capabilities"""
358
+ def __init__(self):
359
+ self.query_analyzer = QueryAnalyzer()
360
+ self.data_extractor = DataExtractor()
361
+ self.response_formatter = ResponseFormatter()
362
+ self.logger = logging.getLogger(__name__)
363
+ self.scraped_data = {}
364
+
365
+ def chat_based_scrape(self, instruction: str, url: str, output_format: str = "Formatted Text") -> str:
366
+ """Process natural language instructions for web scraping"""
367
+ try:
368
+ if not instruction or not url:
369
+ return "Please provide both instruction and URL."
370
+
371
+ # Process the URL and instruction
372
+ raw_data = self.process_url(url, instruction)
373
+ query_info = self.query_analyzer.parse_query(instruction)
374
+ query_info['output'] = output_format
375
+
376
+ if output_format == "JSON":
377
+ return json.dumps({
378
+ "status": "success",
379
+ "request": {
380
+ "url": url,
381
+ "instruction": instruction,
382
+ "timestamp": datetime.now().isoformat()
383
+ },
384
+ "data": raw_data,
385
+ "metadata": {
386
+ "source": url,
387
+ "elements_found": len(raw_data),
388
+ "content_type": type(raw_data).__name__
389
+ }
390
+ }, indent=2)
391
+
392
+ return self.response_formatter.format_data(raw_data, query_info)
393
+
394
+ except Exception as e:
395
+ error_msg = f"Error processing chat-based scrape: {str(e)}"
396
+ self.logger.error(error_msg)
397
+ return error_msg
398
+
399
+ def process_url(self, url: str, query: str) -> str:
400
+ """Process URL based on query"""
401
+ try:
402
+ # Validate URL
403
+ if not self._validate_url(url):
404
+ return "Please provide a valid URL (including http:// or https://)."
405
+
406
+ # Fetch page
407
+ headers = {
408
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
409
+ }
410
+ response = requests.get(url, headers=headers, timeout=10)
411
+ response.raise_for_status()
412
+
413
+ # Set page content and store in scraped_data
414
+ self.data_extractor.set_page(response.text, url)
415
+ self.scraped_data[url] = {
416
+ 'images': self.data_extractor.extract_images(),
417
+ 'links': self.data_extractor.extract_links(),
418
+ 'texts': self.data_extractor.extract_text()
419
+ }
420
+
421
+ # Analyze query and extract data
422
+ query_info = self.query_analyzer.parse_query(query)
423
+ data = self._get_data_for_target(query_info['target'], url)
424
+
425
+ # Format response
426
+ return self.response_formatter.format_data(data, query_info)
427
+
428
+ except requests.exceptions.RequestException as e:
429
+ error_msg = f"Error fetching the webpage: {str(e)}"
430
+ self.logger.error(error_msg)
431
+ return error_msg
432
+ except Exception as e:
433
+ error_msg = f"An error occurred: {str(e)}"
434
+ self.logger.error(error_msg)
435
+ return error_msg
436
+
437
+ def _validate_url(self, url: str) -> bool:
438
+ """Validate URL format"""
439
+ try:
440
+ result = urlparse(url)
441
+ return all([result.scheme, result.netloc])
442
+ except Exception as e:
443
+ self.logger.error(f"URL validation error: {str(e)}")
444
+ return False
445
+
446
+ def _get_data_for_target(self, target: str, url: str) -> List[Dict]:
447
+ """Get specific data based on target type"""
448
+ if url not in self.scraped_data:
449
+ self.logger.warning(f"No data found for URL: {url}")
450
+ return []
451
+
452
+ if target == 'image':
453
+ return self.scraped_data[url]['images']
454
+ elif target == 'link':
455
+ return self.scraped_data[url]['links']
456
+ elif target == 'text':
457
+ return self.scraped_data[url]['texts']
458
+ else:
459
+ self.logger.warning(f"Unknown target type: {target}")
460
+ return []
461
+
462
+ class QueryAnalyzer:
463
+ def __init__(self):
464
+ self.logger = logging.getLogger(__name__)
465
+ self.stop_words = set(stopwords.words('english'))
466
+ self.lemmatizer = WordNetLemmatizer()
467
+
468
+ def parse_query(self, query: str) -> Dict[str, Union[str, int]]:
469
+ try:
470
+ tokens = word_tokenize(query.lower())
471
+ filtered_tokens = [
472
+ self.lemmatizer.lemmatize(token)
473
+ for token in tokens
474
+ if token.isalnum() and token not in self.stop_words
475
+ ]
476
+
477
+ return {
478
+ 'target': self._identify_target(filtered_tokens),
479
+ 'limit': self._identify_limit(filtered_tokens),
480
+ 'filters': self._identify_filters(filtered_tokens),
481
+ 'output': 'JSON' if 'json' in query.lower() else 'Formatted Text'
482
+ }
483
+ except Exception as e:
484
+ self.logger.error(f"Error parsing query: {str(e)}")
485
+ return {'target': 'unknown', 'limit': 0, 'filters': {}}
486
+
487
+ def _identify_target(self, tokens: List[str]) -> str:
488
+ targets = {'image': 'image', 'link': 'link', 'text': 'text'}
489
+ for token in tokens:
490
+ if token in targets:
491
+ return targets[token]
492
+ return 'unknown'
493
+
494
+ def _identify_limit(self, tokens: List[str]) -> int:
495
+ for token in tokens:
496
+ if token.isdigit():
497
+ return int(token)
498
+ return 0
499
+
500
+ def _identify_filters(self, tokens: List[str]) -> Dict[str, str]:
501
+ filters = {}
502
+ if 'external' in tokens:
503
+ filters['link_type'] = 'external'
504
+ elif 'internal' in tokens:
505
+ filters['link_type'] = 'internal'
506
+ return filters
507
+
508
+ class ResponseFormatter:
509
+ def __init__(self):
510
+ self.logger = logging.getLogger(__name__)
511
+
512
+ def format_data(self, data: List[Dict], query_info: Dict) -> Union[str, dict]:
513
+ try:
514
+ if not data:
515
+ return {"status": "success", "data": [], "message": "No data found"} if query_info['output'] == "JSON" else "No data found"
516
+
517
+ response = {
518
+ "metadata": {
519
+ "target": query_info['target'],
520
+ "limit": query_info['limit'],
521
+ "filters": query_info['filters'],
522
+ "timestamp": datetime.now().isoformat()
523
+ },
524
+ "data": data[:query_info['limit']] if query_info['limit'] > 0 else data
525
+ }
526
+
527
+ return json.dumps(response, indent=2) if query_info['output'] == "JSON" else self._format_text(response)
528
+
529
+ except Exception as e:
530
+ error_msg = {"status": "error", "message": str(e)}
531
+ return json.dumps(error_msg, indent=2) if query_info['output'] == "JSON" else f"Error: {str(e)}"
532
+
533
+ def _format_text(self, response: dict) -> str:
534
+ return json.dumps(response, indent=2) # Fallback if text formatting fails
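Note that QueryAnalyzer and ResponseFormatter are redefined in this block, shadowing the richer versions earlier in the commit; because SmartWebScraper resolves those names only when it is instantiated (inside create_interface, after the whole module has loaded), these later, simpler definitions are the ones used at runtime.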
535
 
536
  def sanitize_filename(filename):
537
+ """Sanitizes a filename by removing invalid characters."""
538
  return re.sub(r'[<>:"/\\|?*\n]+', '_', filename)
539
 
540
  def validate_url(url):
 
542
  try:
543
  result = urlparse(url)
544
  return all([result.scheme, result.netloc])
545
+ except Exception:
546
  return False
547
 
548
  def get_latest_data(url):
549
  """Get the latest HTML content of a webpage."""
550
  try:
551
+ headers = {
552
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
553
+ }
554
+ response = requests.get(url, headers=headers, timeout=10)
555
+ response.raise_for_status() # Raise an exception for bad status codes
556
  return response.text
557
+ except requests.exceptions.RequestException as e:
558
  logging.error(f"Error fetching latest data from {url}: {str(e)}")
559
  return None
560
 
 
561
  def take_screenshot(url):
562
  """Take a screenshot of a webpage."""
563
  try:
 
566
  chrome_options.add_argument("--no-sandbox")
567
  chrome_options.add_argument("--disable-dev-shm-usage")
568
  chrome_options.add_argument("--window-size=1920,1080")
569
+
570
  driver = webdriver.Chrome(options=chrome_options)
571
  driver.get(url)
572
+
573
  screenshot = driver.get_screenshot_as_png()
574
  driver.quit()
575
+
576
  image = Image.open(io.BytesIO(screenshot))
577
  max_size = (1024, 1024)
578
  image.thumbnail(max_size, Image.LANCZOS)
579
+
580
  img_byte_arr = io.BytesIO()
581
  image.save(img_byte_arr, format='PNG')
582
  return img_byte_arr.getvalue()
 
584
  logging.error(f"Screenshot error for {url}: {str(e)}")
585
  return None
586
 
587
+ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth):
588
+ """Process URLs with crawl depth and change detection."""
589
  try:
590
+ urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
591
+ urls = [url.strip() for url in urls if url.strip()]
592
+ urls = urls[:int(max_urls)]
593
 
594
+ # Validate URLs
595
+ invalid_urls = [url for url in urls if not validate_url(url)]
596
+ if invalid_urls:
597
+ return None, None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)
598
 
599
+ scraped_data = []
600
+ screenshots = []
601
+ changes_log = []
602
+
603
+ # Create temporary directory for screenshots
604
+ temp_dir = Path("temp_screenshots")
605
+ temp_dir.mkdir(exist_ok=True)
606
+
607
+ # Process each URL with progress tracking
608
+ total_urls = len(urls)
609
+ for idx, url in enumerate(urls):
610
+ if not url.startswith(('http://', 'https://')):
611
+ url = f'https://{url}'
612
+
613
+ sanitized_url = sanitize_filename(url)
614
 
615
+ # Take screenshot
616
+ if action_radio in ['Capture image', 'Both']:
617
+ screenshot = take_screenshot(url)
618
+ if screenshot:
619
+ screenshot_path = temp_dir / f"{sanitized_url}.png"
620
+ with open(screenshot_path, 'wb') as f:
621
+ f.write(screenshot)
622
+ screenshots.append((url, str(screenshot_path))) # Convert Path to string
623
 
624
+ # Scrape data
625
+ if action_radio in ['Scrape data', 'Both']:
626
+ html_content = get_latest_data(url)
627
+ if html_content:
628
+ scraped_data.append({
629
+ 'url': url,
630
+ 'content_length': len(html_content),
631
+ 'timestamp': datetime.now().isoformat()
632
+ })
633
 
634
+ # Create a ZIP file for the screenshots
635
+ zip_file_path = temp_dir / "screenshots.zip"
636
+ with zipfile.ZipFile(zip_file_path, 'w') as zipf:
637
+ for screenshot in screenshots:
638
+ zipf.write(screenshot[1], arcname=Path(screenshot[1]).name) # Use string for writing
639
 
640
+ # Return the results
641
+ return str(zip_file_path), screenshots, scraped_data # Return structured data for JSON output
642
 
643
  except Exception as e:
644
+ logging.error(f"Error in process_urls: {str(e)}")
645
+ return None, None, json.dumps({"error": str(e)}, indent=2)
646
 
647
+
648
+ return demo
649
 
650
  def create_interface():
651
  """Create the Gradio interface."""
652
+ scraper = SmartWebScraper()
653
+
654
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
655
  gr.Markdown(
656
  """
657
+ # 🌐 Enhanced Web Scraper with Change Detection and Chat
658
+ Monitor and capture changes in web content automatically. Use the chat interface to interact with scraped data.
659
  """
660
  )
661
+
662
  with gr.Tabs():
663
  with gr.Tab("URL Scrape/Screenshot"):
664
  url_input = gr.Textbox(
665
  label="Enter URL(s)",
 
666
  placeholder="Enter single URL or multiple URLs separated by commas"
667
  )
668
+
669
  with gr.Row():
670
  bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
671
  action_radio = gr.Radio(
 
673
  label="Select Action",
674
  value="Both"
675
  )
676
+
677
  with gr.Row():
678
  max_urls = gr.Slider(
679
  minimum=1,
680
+ maximum=20,
681
  value=5,
682
  step=1,
683
  label="Max URLs to process"
684
  )
685
  crawl_depth = gr.Slider(
686
+ minimum=0,
687
+ maximum=3,
688
  value=1,
689
  step=1,
690
+ label="Crawl Depth (0 for no recursion)"
691
  )
692
+
693
  process_button = gr.Button("Process URLs", variant="primary")
694
+
695
  with gr.Column():
696
+ # Add gallery for screenshot preview
697
+ gallery = gr.Gallery(
698
+ label="Screenshots Preview",
699
+ show_label=True,
700
+ elem_id="gallery",
701
+ columns=[3],
702
+ rows=[2],
703
+ height="auto",
704
+ object_fit="contain" # Add proper image scaling
705
+ )
706
+
707
+ # Download button and results
708
+ download_file = gr.File(label="Download Results (ZIP)")
709
  scraped_data_output = gr.JSON(label="Results Summary")
710
+
711
  process_button.click(
712
  fn=process_urls,
713
  inputs=[
 
718
  crawl_depth
719
  ],
720
  outputs=[
721
+ download_file,
722
+ gallery,
723
  scraped_data_output
724
  ],
725
  show_progress=True
726
+ )
 
727
  with gr.Tab("Chat-Based Scrape"):
728
  instruction = gr.Textbox(
729
  label="Enter Instruction",
730
  placeholder="e.g., 'Scrape all links' or 'Extract all images'"
731
  )
732
+ chat_url_input = gr.Textbox(
733
  label="Enter URL",
734
  value="https://example.com",
735
  placeholder="Enter the target URL"
736
  )
737
  output_format = gr.Radio(
738
+ ["Formatted Text", "JSON"],
739
  label="Output Format",
740
+ value="Formatted Text"
741
  )
742
+ chat_output = gr.Textbox(label="Output")
743
+
744
  chat_button = gr.Button("Execute Instruction", variant="primary")
745
+
746
+ chat_button.click (
747
+ fn=scraper.chat_based_scrape,
748
+ inputs=[instruction, chat_url_input, output_format],
749
+ outputs=chat_output
750
  )
751
+
752
  gr.Markdown(
753
  """
754
  ### Features
 
756
  - Screenshot capture
757
  - Content change detection
758
  - Recursive crawling
759
+ - Chat-based instructions for interacting with scraped data
760
  """
761
  )
762
+
763
  return demo
764
 
765
  if __name__ == "__main__":
766
+ demo = create_interface()
767
+ demo.launch(debug=True)