acecalisto3 committed
Commit 4c6fb3f · verified · 1 Parent(s): 85ef800

Update 1app.py

Files changed (1):
  1. 1app.py +438 -101
1app.py CHANGED
@@ -10,6 +10,13 @@ from nltk import word_tokenize
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from datetime import datetime
+import io
+import zipfile
+import os
+import tempfile
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from PIL import Image
 
 # Configure detailed logging
 logging.basicConfig(
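The new imports pull in two runtime dependencies the previous version did not need: selenium (plus a Chrome/Chromedriver install for the headless screenshots added further down) and Pillow. A small optional self-check sketch; the pip package names in the comments are assumptions about how the environment is provisioned, not something this commit pins:

# Fails fast with a clear message if the screenshot dependencies are missing.
try:
    from selenium import webdriver  # pip install selenium
    from PIL import Image           # pip install Pillow
except ImportError as missing:
    raise SystemExit(f"Missing screenshot dependency: {missing.name}")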
@@ -23,10 +30,251 @@ logging.basicConfig(
 
 # Download necessary NLTK data
 import nltk
-nltk.download('punkt')
-nltk.download('punkt_tab')
-nltk.download('stopwords')
-nltk.download('wordnet')
+
+try:
+    nltk.download('punkt', quiet=True)
+    nltk.download('stopwords', quiet=True)
+    nltk.download('wordnet', quiet=True)
+    nltk.download('averaged_perceptron_tagger', quiet=True)
+except Exception as e:
+    logging.error(f"Error downloading NLTK data: {str(e)}")
+
+def sanitize_filename(filename):
+    """Sanitizes a filename by removing invalid characters."""
+    return re.sub(r'[<>:"/\\|?*\n]+', '_', filename)
+
+def validate_url(url):
+    """Validate if the URL is properly formatted."""
+    try:
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])
+    except Exception:
+        return False
+
+def get_latest_data(url):
+    """Get the latest HTML content of a webpage."""
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status() # Raise an exception for bad status codes
+        return response.text
+    except requests.exceptions.RequestException as e:
+        logging.error(f"Error fetching latest data from {url}: {str(e)}")
+        return None
+
+def compare_html(old_html, new_html):
+    """Compare two HTML contents to detect changes."""
+    if not old_html or not new_html:
+        return False
+    return old_html.strip() != new_html.strip()
+
+def compare_screenshot(old_screenshot, new_screenshot):
+    """Compare two screenshots to detect changes."""
+    try:
+        if not old_screenshot or not new_screenshot:
+            return False
+        old_img = Image.open(io.BytesIO(old_screenshot))
+        new_img = Image.open(io.BytesIO(new_screenshot))
+        return not (old_img.tobytes() == new_img.tobytes())
+    except Exception as e:
+        logging.error(f"Error comparing screenshots: {str(e)}")
+        return False
+
+def alert_changes(url, change_type):
+    """Log detected changes."""
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    logging.warning(f"[{timestamp}] Changes detected at {url}: {change_type}")
+    return f"[{timestamp}] {change_type}"
+
+def extract_links_from_page(url):
+    """Extract all links from a webpage."""
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')
+        links = [a.get('href') for a in soup.find_all('a', href=True)]
+        return links
+    except requests.exceptions.RequestException as e:
+        logging.error(f"Error extracting links from {url}: {str(e)}")
+        return []
+
+def take_screenshot(url):
+    """Take a screenshot of a webpage."""
+    try:
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        chrome_options.add_argument("--window-size=1920,1080")
+
+        driver = webdriver.Chrome(options=chrome_options)
+        driver.get(url)
+
+        screenshot = driver.get_screenshot_as_png()
+        driver.quit()
+
+        image = Image.open(io.BytesIO(screenshot))
+        max_size = (1024, 1024)
+        image.thumbnail(max_size, Image.LANCZOS)
+
+        img_byte_arr = io.BytesIO()
+        image.save(img_byte_arr, format='PNG')
+        return img_byte_arr.getvalue()
+    except Exception as e:
+        logging.error(f"Screenshot error for {url}: {str(e)}")
+        return None
+
+def is_webpage(url):
+    """Check if the URL points to a webpage (HTML)."""
+    try:
+        response = requests.head(url, timeout=10)
+        response.raise_for_status()
+        content_type = response.headers.get('Content-Type', '').lower()
+        return 'text/html' in content_type
+    except requests.exceptions.RequestException as e:
+        logging.error(f"Error checking content type for {url}: {str(e)}")
+        return False
+
+def crawl_url(url, depth, max_depth, visited=None):
+    """Recursively crawl a URL up to a specified depth."""
+    if visited is None:
+        visited = set()
+
+    if depth > max_depth or url in visited or not validate_url(url):
+        return []
+
+    visited.add(url)
+    screenshots = []
+
+    if is_webpage(url):
+        links = extract_links_from_page(url)
+        screenshot = take_screenshot(url)
+        if screenshot:
+            screenshots.append((url, screenshot))
+
+        if depth < max_depth:
+            for link in links:
+                absolute_link = urljoin(url, link)
+                screenshots.extend(crawl_url(absolute_link, depth + 1, max_depth, visited))
+    else:
+        logging.info(f"Skipping non-webpage content: {url}")
+
+    return screenshots
+
+def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, mode='standard', progress=gr.Progress()):
+    """Process URLs with crawl depth and change detection."""
+    urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
+    urls = [url.strip() for url in urls if url.strip()]
+    urls = urls[:int(max_urls)]
+
+    # Validate all URLs
+    invalid_urls = [url for url in urls if not validate_url(url)]
+    if invalid_urls:
+        if mode == 'chat':
+            return f"Invalid URLs detected: {', '.join(invalid_urls)}"
+        else:
+            return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)
+
+    scraped_data = []
+    screenshots = []
+    changes_log = []
+
+    # Initialize progress tracking
+    total_urls = len(urls)
+    progress(0, desc="Starting...")
+
+    # Directory to store scraped data
+    data_dir = 'scraped_data'
+    os.makedirs(data_dir, exist_ok=True)
+
+    # Process each URL
+    for idx, url in enumerate(urls):
+        progress((idx + 1) / total_urls, desc=f"Processing: {url}")
+
+        if not url.startswith(('http://', 'https://')):
+            url = f'https://{url}'
+
+        # Sanitize URL for file naming
+        sanitized_url = sanitize_filename(url)
+
+        # Check for changes
+        old_html_path = os.path.join(data_dir, f"{sanitized_url}_html.txt")
+        old_screenshot_path = os.path.join(data_dir, f"{sanitized_url}_screenshot.png")
+
+        # Fetch latest data
+        latest_html = get_latest_data(url)
+        latest_screenshot = take_screenshot(url)
+
+        # Compare with previous data if available
+        if os.path.exists(old_html_path):
+            with open(old_html_path, 'r', encoding='utf-8') as f:
+                old_html = f.read()
+            if compare_html(old_html, latest_html):
+                changes_log.append(alert_changes(url, "HTML content has changed"))
+
+        if os.path.exists(old_screenshot_path):
+            with open(old_screenshot_path, 'rb') as f:
+                old_screenshot = f.read()
+            if latest_screenshot and compare_screenshot(old_screenshot, latest_screenshot):
+                changes_log.append(alert_changes(url, "Visual content has changed"))
+
+        # Store latest data
+        if latest_html:
+            with open(old_html_path, 'w', encoding='utf-8') as f:
+                f.write(latest_html)
+        if latest_screenshot:
+            with open(old_screenshot_path, 'wb') as f:
+                f.write(latest_screenshot)
+
+        # Prepare output data
+        if action_radio in ['Scrape data', 'Both']:
+            scraped_data.append({
+                'url': url,
+                'content': latest_html,
+                'timestamp': datetime.now().isoformat(),
+                'changes_detected': changes_log
+            })
+
+        if action_radio in ['Capture image', 'Both']:
+            crawled_screenshots = crawl_url(url, depth=0, max_depth=int(crawl_depth))
+            screenshots.extend(crawled_screenshots)
+
+    if mode == 'chat':
+        return "\n".join(changes_log)
+    else:
+        # Create a temporary file to store the ZIP
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_file:
+            with zipfile.ZipFile(tmp_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
+                # Add screenshots to ZIP
+                for screenshot_url, screenshot_data in screenshots:
+                    sanitized_screenshot_url = sanitize_filename(screenshot_url)
+                    filename = f"{sanitized_screenshot_url}.png"
+                    zipf.writestr(filename, screenshot_data)
+
+                # Add scraped data and changes log to ZIP
+                if scraped_data:
+                    data_to_save = {
+                        'scraped_data': scraped_data,
+                        'changes_log': changes_log,
+                        'timestamp': datetime.now().isoformat()
+                    }
+                    zipf.writestr('data.json', json.dumps(data_to_save, indent=2))
+
+            # Get the path to the temporary file
+            zip_file_path = tmp_file.name
+
+        # Prepare display data
+        display_data = {
+            'total_scraped_urls': len(scraped_data),
+            'total_screenshots_taken': len(screenshots),
+            'changes_detected': changes_log,
+            'scraped_data': scraped_data
+        }
+
+        # Return the path to the temporary ZIP file and display data
+        return zip_file_path, json.dumps(display_data, indent=2)
 
 class DataExtractor:
     def __init__(self):
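Taken together, the new helpers implement a fetch, compare, store loop that process_urls runs once per URL. A minimal sketch of that loop for a single URL, assuming the helpers above are in scope in the same module; the cache path and URL are placeholders, not part of the commit:

# Illustrative only: mirrors the per-URL change-detection logic inside process_urls.
import os

def check_once(url, cache_path="scraped_data/example_html.txt"):
    os.makedirs("scraped_data", exist_ok=True)
    latest_html = get_latest_data(url)           # None on request errors
    if latest_html is None:
        return "fetch failed"
    if os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as f:
            old_html = f.read()
        if compare_html(old_html, latest_html):  # True when the stripped HTML differs
            return alert_changes(url, "HTML content has changed")
    with open(cache_path, "w", encoding="utf-8") as f:
        f.write(latest_html)                     # store the new baseline
    return "no change recorded"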
@@ -43,20 +291,17 @@ class DataExtractor:
         if not self.soup:
             self.logger.error("BeautifulSoup object not initialized")
             return []
-
+
         images = []
         all_imgs = self.soup.find_all('img')
         self.logger.info(f"Found {len(all_imgs)} raw image tags")
-
+
         for img in all_imgs:
             try:
                 src = img.get('src', '')
                 if src:
                     # Handle relative URLs
-                    if src.startswith('/'):
-                        src = urljoin(self.base_url, src)
-                    elif not src.startswith(('http://', 'https://')):
-                        src = urljoin(self.base_url, src)
+                    src = urljoin(self.base_url, src)
 
                     image_data = {
                         'src': src,
@@ -70,7 +315,7 @@ class DataExtractor:
             except Exception as e:
                 self.logger.error(f"Error processing image: {str(e)}")
                 continue
-
+
         self.logger.info(f"Successfully extracted {len(images)} valid images")
         return images
 
@@ -78,20 +323,17 @@ class DataExtractor:
         if not self.soup:
             self.logger.error("BeautifulSoup object not initialized")
             return []
-
+
         links = []
         all_links = self.soup.find_all('a')
         self.logger.info(f"Found {len(all_links)} raw link tags")
-
+
         for a in all_links:
             try:
                 href = a.get('href', '')
                 if href and not href.startswith(('#', 'javascript:', 'mailto:')):
                     # Handle relative URLs
-                    if href.startswith('/'):
-                        href = urljoin(self.base_url, href)
-                    elif not href.startswith(('http://', 'https://')):
-                        href = urljoin(self.base_url, href)
+                    href = urljoin(self.base_url, href)
 
                     links.append({
                         'href': href,
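The removed if/elif branches in the two hunks above were redundant: urljoin already handles root-relative paths, page-relative paths, and URLs that are absolute to begin with, which is why a single call can replace them. A quick standard-library sketch of that behaviour (the URLs are arbitrary examples):

from urllib.parse import urljoin

base = "https://example.com/blog/post.html"
print(urljoin(base, "/img/logo.png"))                  # https://example.com/img/logo.png
print(urljoin(base, "img/logo.png"))                   # https://example.com/blog/img/logo.png
print(urljoin(base, "https://cdn.example.org/x.png"))  # absolute URLs pass through unchanged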
@@ -104,7 +346,7 @@ class DataExtractor:
             except Exception as e:
                 self.logger.error(f"Error processing link: {str(e)}")
                 continue
-
+
         self.logger.info(f"Successfully extracted {len(links)} valid links")
         return links
 
@@ -112,11 +354,11 @@ class DataExtractor:
         if not self.soup:
             self.logger.error("BeautifulSoup object not initialized")
             return []
-
+
         texts = []
         all_paragraphs = self.soup.find_all('p') # Extracting all paragraph tags
         self.logger.info(f"Found {len(all_paragraphs)} raw paragraph tags")
-
+
         for p in all_paragraphs:
             try:
                 text_content = p.get_text(strip=True)
@@ -129,7 +371,7 @@ class DataExtractor:
             except Exception as e:
                 self.logger.error(f"Error processing text block: {str(e)}")
                 continue
-
+
         self.logger.info(f"Successfully extracted {len(texts)} valid text blocks")
         return texts
 
@@ -158,13 +400,13 @@ class QueryAnalyzer:
             self.logger.info(f"Parsing query: {query}")
             tokens = word_tokenize(query.lower())
             filtered_tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in self.stop_words]
-
+
             query_info = {
                 'target': self._identify_target(filtered_tokens),
                 'limit': self._identify_limit(filtered_tokens),
                 'filters': self._identify_filters(filtered_tokens)
             }
-
+
             self.logger.info(f"Query parsed: {query_info}")
             return query_info
         except Exception as e:
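For reference, the filtering above reduces a free-form question to lowercase, alphanumeric, non-stopword lemmas before _identify_target and the other helpers inspect it. A standalone sketch of that preprocessing step; the sample query is invented and the exact output depends on the installed NLTK corpora:

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

query = "Show me the first 3 images"
tokens = word_tokenize(query.lower())
filtered = [lemmatizer.lemmatize(t) for t in tokens if t.isalnum() and t not in stop_words]
print(filtered)  # e.g. ['show', 'first', '3', 'image']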
@@ -274,7 +516,8 @@ class SmartWebScraper:
         self.data_extractor = DataExtractor()
         self.response_formatter = ResponseFormatter()
         self.logger = logging.getLogger(__name__)
-
+        self.scraped_data = {} # Temporarily store scraped data
+
     def process_url(self, url: str, query: str) -> str:
         try:
             # Validate URL
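The new scraped_data attribute gives each SmartWebScraper instance a per-URL cache, so the chat tab added later in this diff can answer follow-up questions without re-fetching the page. Roughly what one cache entry looks like after a successful process_url call; only the 'src' and 'href' item keys are visible in this diff, so the item shapes below are simplified placeholders:

# Illustrative cache entry; real items carry whatever fields the extractors build.
example_cache = {
    "https://example.com": {                                  # one entry per processed URL
        "images": [{"src": "https://example.com/logo.png"}],  # from extract_images()
        "links": [{"href": "https://example.com/about"}],     # from extract_links()
        "texts": ["First paragraph of the page"],             # from extract_text()
    }
}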
@@ -290,21 +533,26 @@ class SmartWebScraper:
             response.raise_for_status()
             self.logger.info(f"Successfully fetched page. Status code: {response.status_code}")
 
-            # Set page content
+            # Set page content and store in scraped_data
             self.data_extractor.set_page(response.text, url)
+            self.scraped_data[url] = {
+                'images': self.data_extractor.extract_images(),
+                'links': self.data_extractor.extract_links(),
+                'texts': self.data_extractor.extract_text()
+            }
 
             # Analyze query
             query_info = self.query_analyzer.parse_query(query)
             self.logger.info(f"Query analysis: {query_info}")
 
             # Extract requested data
-            data = self._get_data_for_target(query_info['target'])
+            data = self._get_data_for_target(query_info['target'], url)
            self.logger.info(f"Extracted {len(data)} items for target: {query_info['target']}")
 
             # Format response
             formatted_response = self.response_formatter.format_data(data, query_info)
             self.logger.info("Response formatted successfully")
-
+
             return formatted_response
 
         except requests.exceptions.RequestException as e:
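With the cache populated inside process_url, the question-answering path can be driven directly from Python as well as through the UI. A minimal driver sketch, assuming the module-level imports and NLTK downloads above have succeeded; the URL and question are placeholders:

# Illustrative only: exercises the same code path the Gradio button uses.
scraper = SmartWebScraper()
answer = scraper.process_url("https://example.com", "How many links are there?")
print(answer)  # formatted by ResponseFormatter.format_data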
@@ -324,89 +572,178 @@ class SmartWebScraper:
             self.logger.error(f"URL validation error: {str(e)}")
             return False
 
-    def _get_data_for_target(self, target: str) -> List[Dict]:
-        extractors = {
-            'image': self.data_extractor.extract_images,
-            'link': self.data_extractor.extract_links,
-            'text': self.data_extractor.extract_text # extract_text method is now defined
-        }
-        extractor = extractors.get(target)
-        if not extractor:
-            self.logger.warning(f"No extractor found for target: {target}")
+    def _get_data_for_target(self, target: str, url: str) -> List[Dict]:
+        if url not in self.scraped_data:
+            self.logger.warning(f"No data found for URL: {url}")
             return []
-
-        try:
-            data = extractor()
-            self.logger.info(f"Extracted {len(data)} items using {target} extractor")
-            return data
-        except Exception as e:
-            self.logger.error(f"Error in data extraction for {target}: {str(e)}")
+
+        if target == 'image':
+            return self.scraped_data[url]['images']
+        elif target == 'link':
+            return self.scraped_data[url]['links']
+        elif target == 'text':
+            return self.scraped_data[url]['texts']
+        else:
+            self.logger.warning(f"No extractor found for target: {target}")
             return []
 
+    def recognize_intent(self, instruction: str) -> str:
+        """Recognizes the intent of an instruction."""
+        instruction = instruction.lower()
+        # General patterns for actions and data types
+        action_patterns = {
+            r'\b(find|extract|scrape)\s+(links|images|texts)\b': 'extract_data',
+            r'\b(count)\s+(links|images|texts)\b': 'count_data',
+        }
+        for pattern, intent in action_patterns.items():
+            if re.search(pattern, instruction):
+                return intent
+        return "unknown"
+
+    def extract_data_type(self, instruction: str) -> str:
+        """Extracts the data type from an instruction."""
+        instruction = instruction.lower()
+        data_types = {
+            r'\b(links)\b': 'link',
+            r'\b(images)\b': 'image',
+            r'\b(texts)\b': 'text',
+        }
+        for pattern, data_type in data_types.items():
+            if re.search(pattern, instruction):
+                return data_type
+        return "unknown"
+
+    def chat_based_scrape(self, instruction, url_input, output_format):
+        """Handles chat-based instructions for web scraping."""
+        if not validate_url(url_input):
+            return "Invalid URL. Please enter a valid URL."
+
+        if url_input not in self.scraped_data:
+            self.process_url(url_input, "") # Fetch and store data if not already present
+
+        # Recognize intent and extract data type if applicable
+        intent = self.recognize_intent(instruction)
+        data_type = self.extract_data_type(instruction)
+
+        if intent == "unknown" or data_type == "unknown":
+            return "Instruction not recognized. Please try again."
+
+        # Extract data based on intent and data type
+        if intent == "extract_data":
+            data = self._get_data_for_target(data_type, url_input)
+            if output_format == "JSON":
+                return json.dumps(data, indent=2)
+            else:
+                query_info = {'target': data_type, 'limit': 0, 'filters': {}}
+                return self.response_formatter.format_data(data, query_info)
+        elif intent == "count_data":
+            data = self._get_data_for_target(data_type, url_input)
+            return f"The number of {data_type}s is {len(data)}."
+        else:
+            return "Instruction not recognized. Please try again."
+
 def create_interface():
+    """Create the Gradio interface."""
     scraper = SmartWebScraper()
-
-    def process_request(url: str, query: str) -> str:
-        return scraper.process_url(url, query)
-
+
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
-        gr.Markdown("# 🌐 Enhanced Web Scraper")
-        gr.Markdown("Ask natural questions about any webpage, and I'll analyze it for you!")
-
-        with gr.Row():
-            url_input = gr.Textbox(
-                label="Website URL",
-                placeholder="https://example.com",
-                show_label=True
-            )
-
-            query_input = gr.Textbox(
-                label="What would you like to know?",
-                placeholder="Try: 'Show me the first 3 images' or 'How many links are there?'",
-                show_label=True
+        gr.Markdown(
+            """
+            # 🌐 Enhanced Web Scraper with Change Detection and Chat
+            Monitor and capture changes in web content automatically. Use the chat interface to interact with scraped data.
+            """
         )
-
-        submit_btn = gr.Button("🔍 Analyze", variant="primary")
-
-        output = gr.Textbox(
-            label="Results",
-            lines=10,
-            max_lines=30,
-            show_copy_button=True,
-            interactive=False
-        )
-
-        submit_btn.click(
-            fn=process_request,
-            inputs=[url_input, query_input],
-            outputs=output
+
+        with gr.Tabs():
+            with gr.Tab("URL Scrape/Screenshot"):
+                url_input = gr.Textbox(
+                    label="Enter URL(s)",
+                    value="https://example.com",
+                    placeholder="Enter single URL or multiple URLs separated by commas"
+                )
+
+                with gr.Row():
+                    bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
+                    action_radio = gr.Radio(
+                        ["Scrape data", "Capture image", "Both"],
+                        label="Select Action",
+                        value="Both"
+                    )
+
+                with gr.Row():
+                    max_urls = gr.Slider(
+                        minimum=1,
+                        maximum=20,
+                        value=5,
+                        step=1,
+                        label="Max URLs to process"
+                    )
+                    crawl_depth = gr.Slider(
+                        minimum=0,
+                        maximum=3,
+                        value=1,
+                        step=1,
+                        label="Crawl Depth (0 for no recursion)"
+                    )
+
+                process_button = gr.Button("Process URLs", variant="primary")
+
+                with gr.Column():
+                    screenshot_zip = gr.File(label="Download Results")
+                    scraped_data_output = gr.JSON(label="Results Summary")
+
+                process_button.click(
+                    fn=process_urls,
+                    inputs=[
+                        url_input,
+                        bulk_toggle,
+                        action_radio,
+                        max_urls,
+                        crawl_depth
+                    ],
+                    outputs=[
+                        screenshot_zip,
+                        scraped_data_output
+                    ],
+                    show_progress=True
+                )
+
+            with gr.Tab("Chat-Based Scrape"):
+                instruction = gr.Textbox(
+                    label="Enter Instruction",
+                    placeholder="e.g., 'Scrape all links' or 'Extract all images'"
+                )
+                chat_url_input = gr.Textbox(
+                    label="Enter URL",
+                    value="https://example.com",
+                    placeholder="Enter the target URL"
+                )
+                output_format = gr.Radio(
+                    ["Formatted Text", "JSON"],
+                    label="Output Format",
+                    value="Formatted Text"
+                )
+                chat_output = gr.Textbox(label="Output")
+
+                chat_button = gr.Button("Execute Instruction", variant="primary")
+
+                chat_button.click(
+                    fn=scraper.chat_based_scrape,
+                    inputs=[instruction, chat_url_input, output_format],
+                    outputs=chat_output
+                )
+
+        gr.Markdown(
+            """
+            ### Features
+            - Bulk URL processing
+            - Screenshot capture
+            - Content change detection
+            - Recursive crawling
+            - Chat-based instructions for interacting with scraped data
+            """
         )
 
-        # Example queries section
-        gr.Markdown("""
-        ## 📝 Example queries:
-
-        **Images:**
-        - "Show me all images with their descriptions"
-        - "How many images are on this page?"
-        - "Find the largest images"
-
-        **Links:**
-        - "List the first 5 external links"
-        - "Show me links with images"
-        - "How many internal links are there?"
-
-        **Text:**
-        - "Extract main paragraphs with their headings"
-        - "Show me the longest text blocks"
-        - "Find paragraphs containing links"
-
-        **Advanced:**
-        - "Give me an overview of the page structure"
-        - "Show me the most recent content"
-        - "Analyze the page's organization"
-        """)
-
     return demo
 
 if __name__ == "__main__":
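The body of the __main__ block sits outside the context lines shown in this diff, so the actual launch call is not visible here. A typical Gradio entry point for the interface built above would look like the following sketch; the server_name and server_port arguments are assumptions, not something this commit specifies:

# Hypothetical entry point; the real one is outside the diff context.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860)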
 