Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -3,30 +3,538 @@ import requests
import re
import logging
import json
from bs4 import BeautifulSoup
-from
-from
-from
import io
import zipfile
import os
-import datetime
-from urllib.parse import urlparse
import tempfile
import nltk
try:
-    nltk.download('punkt')
-    nltk.download('stopwords')
-    nltk.download('wordnet')
-    nltk.download('averaged_perceptron_tagger')
except Exception as e:
    logging.error(f"Error downloading NLTK data: {str(e)}")

-#
-
-

def sanitize_filename(filename):
    return re.sub(r'[<>:"/\\|?*\n]+', '_', filename)

def validate_url(url):
@@ -34,53 +542,22 @@ def validate_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
-    except:
        return False

def get_latest_data(url):
    """Get the latest HTML content of a webpage."""
    try:
-
        return response.text
-    except
        logging.error(f"Error fetching latest data from {url}: {str(e)}")
        return None

-def compare_html(old_html, new_html):
-    """Compare two HTML contents to detect changes."""
-    if not old_html or not new_html:
-        return False
-    return old_html.strip() != new_html.strip()
-
-def compare_screenshot(old_screenshot, new_screenshot):
-    """Compare two screenshots to detect changes."""
-    try:
-        if not old_screenshot or not new_screenshot:
-            return False
-        old_img = Image.open(io.BytesIO(old_screenshot))
-        new_img = Image.open(io.BytesIO(new_screenshot))
-        return not (old_img == new_img)
-    except Exception as e:
-        logging.error(f"Error comparing screenshots: {str(e)}")
-        return False
-
-def alert_changes(url, change_type):
-    """Log detected changes."""
-    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    logging.warning(f"[{timestamp}] Changes detected at {url}: {change_type}")
-    return f"[{timestamp}] {change_type}"
-
-def extract_links_from_page(url):
-    """Extract all links from a webpage."""
-    try:
-        response = requests.get(url, timeout=10)
-        soup = BeautifulSoup(response.text, 'html.parser')
-        links = [a['href'] for a in soup.find_all('a', href=True)]
-        return links
-    except Exception as e:
-        logging.error(f"Error extracting links from {url}: {str(e)}")
-        return []
-
def take_screenshot(url):
    """Take a screenshot of a webpage."""
    try:
@@ -89,17 +566,17 @@ def take_screenshot(url):
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
-
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
-
        screenshot = driver.get_screenshot_as_png()
        driver.quit()
-
        image = Image.open(io.BytesIO(screenshot))
        max_size = (1024, 1024)
        image.thumbnail(max_size, Image.LANCZOS)
-
        img_byte_arr = io.BytesIO()
        image.save(img_byte_arr, format='PNG')
        return img_byte_arr.getvalue()
@@ -107,285 +584,88 @@ def take_screenshot(url):
        logging.error(f"Screenshot error for {url}: {str(e)}")
        return None

-def
-    """
    try:
-
-
-
-    except Exception as e:
-        logging.error(f"Error checking content type for {url}: {str(e)}")
-        return False

-
-
-
-
-
-    if depth > max_depth or url in visited:
-        return []
-
-    visited.add(url)
-    screenshots = []
-
-    if is_webpage(url):
-        links = extract_links_from_page(url)
-        screenshot = take_screenshot(url)
-        if screenshot:
-            screenshots.append((url, screenshot))
-
-        if depth < max_depth:
-            for link in links:
-                if not link.startswith(('http://', 'https://')):
-                    link = f"https://{link}"
-                screenshots.extend(crawl_url(link, depth + 1, max_depth, visited))
-    else:
-        logging.info(f"Skipping non-webpage content: {url}")
-
-    return screenshots

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    scraped_data = []
-    screenshots = []
-    changes_log = []
-
-    # Initialize progress tracking
-    total_urls = len(urls)
-    progress(0)
-
-    # Directory to store scraped data
-    data_dir = 'scraped_data'
-    os.makedirs(data_dir, exist_ok=True)
-
-    # Process each URL
-    for idx, url in enumerate(urls):
-        if not url.startswith(('http://', 'https://')):
-            url = f'https://{url}'
-
-        # Sanitize URL for file naming
-        sanitized_url = sanitize_filename(url)
-
-        # Check for changes
-        old_html_path = os.path.join(data_dir, f"{sanitized_url}_html.txt")
-        old_screenshot_path = os.path.join(data_dir, f"{sanitized_url}_screenshot.png")
-
-        # Fetch latest data
-        latest_html = get_latest_data(url)
-        latest_screenshot = take_screenshot(url)
-
-        # Compare with previous data if available
-        if os.path.exists(old_html_path):
-            with open(old_html_path, 'r', encoding='utf-8') as f:
-                old_html = f.read()
-            if compare_html(old_html, latest_html):
-                changes_log.append(alert_changes(url, "HTML content has changed"))
-
-        if os.path.exists(old_screenshot_path):
-            with open(old_screenshot_path, 'rb') as f:
-                old_screenshot = f.read()
-            if latest_screenshot and compare_screenshot(old_screenshot, latest_screenshot):
-                changes_log.append(alert_changes(url, "Visual content has changed"))
-
-        # Store latest data
-        if latest_html:
-            with open(old_html_path, 'w', encoding='utf-8') as f:
-                f.write(latest_html)
-        if latest_screenshot:
-            with open(old_screenshot_path, 'wb') as f:
-                f.write(latest_screenshot)
-
-        # Prepare output data
-        if action_radio in ['Scrape data', 'Both']:
-            scraped_data.append({
-                'url': url,
-                'content': latest_html,  # Include full HTML content
-                'timestamp': datetime.datetime.now().isoformat(),
-                'changes_detected': changes_log
-            })
-
-        if action_radio in ['Capture image', 'Both']:
-            crawled_screenshots = crawl_url(url, depth=1, max_depth=int(crawl_depth))
-            screenshots.extend(crawled_screenshots)
-
-        # Update progress
-        progress((idx + 1) / total_urls)
-
-    if mode == 'chat':
-        return "\n".join(changes_log)
-    else:
-        # Create a temporary file to store the ZIP
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_file:
-            with zipfile.ZipFile(tmp_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
-                # Add screenshots to ZIP
-                for screenshot_url, screenshot_data in screenshots:
-                    sanitized_screenshot_url = sanitize_filename(screenshot_url)
-                    filename = f"{sanitized_screenshot_url}.png"
-                    zipf.writestr(filename, screenshot_data)
-
-                # Add scraped data and changes log to ZIP
-                if scraped_data:
-                    data_to_save = {
-                        'scraped_data': scraped_data,
-                        'changes_log': changes_log,
-                        'timestamp': datetime.datetime.now().isoformat()
-                    }
-                    zipf.writestr('data.json', json.dumps(data_to_save, indent=2))

-        #
-
-
-
-
-
-
-            'scraped_data': scraped_data  # Include full scraped data
-        }
-
-        # Return the path to the temporary ZIP file and display data
-        return zip_file_path, json.dumps(display_data, indent=2)

-
-
-
-
-
-
-
-
-
-    }
-
-    for pattern, intent in action_patterns.items():
-        if re.search(pattern, instruction):
-            return intent
-    return "unknown"
-
-def extract_data_type(instruction: str) -> str:
-    instruction = instruction.lower()
-    data_types = {
-        r'\b(links|images|videos|texts|prices|product names|reviews)\b': 'links',
-        r'\b(links|images|videos|products)\b': 'images',
-        r'\b(channel name|subscriber count|viewers)\b': 'channel name',
-    }
-    for pattern, data_type in data_types.items():
-        if re.search(pattern, instruction):
-            return data_type
-    return "unknown"
-
-def format_output(data, output_format):
-    if output_format == "JSON":
-        return json.dumps(data, indent=2)
-    elif output_format == "Cleaned JSON":
-        # Implement data cleaning logic here
-        return json.dumps(data, indent=2)
-    else:
-        return str(data)
-
-def generate_command(intent: str, url_input: str, data_type: str, output_format: str) -> str:
-    if intent == "extract_data":
-        data = extract_data(url_input, data_type)
-        return format_output(data, output_format)
-    elif intent == "count_data":
-        count = count_data(url_input, data_type)
-        return f"The number of {data_type} is {count}."
-    elif intent == "fetch_specific_data":
-        specific_data = fetch_specific_data(url_input, data_type)
-        return specific_data
-    elif intent == "monitor_changes":
-        changes_log = monitor_changes(url_input)
-        return changes_log
-    else:
-        return "Instruction not recognized. Please try again."
-
-def extract_data(url, data_type):
-    try:
-        response = requests.get(url)
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        if data_type == "links":
-            return [a['href'] for a in soup.find_all('a', href=True)]
-        elif data_type == "images":
-            return [img['src'] for img in soup.find_all('img', src=True)]
-        # Add more data types as needed
-        else:
-            return []
-    except Exception as e:
-        return f"Error extracting {data_type}: {str(e)}"

-
-
-
-
-        if data_type == "links":
-            return len(soup.find_all('a', href=True))
-        elif data_type == "images":
-            return len(soup.find_all('img', src=True))
-        # Add more data types as needed
-        else:
-            return 0
-    except Exception as e:
-        return f"Error counting {data_type}: {str(e)}"

-
-
-        # Implement specific data fetching logic here
-        # For demonstration, return a placeholder
-        return f"Fetched {data_type} from {url}"
-    except Exception as e:
-        return f"Error fetching {data_type}: {str(e)}"

-def monitor_changes(url_input):
-    try:
-        # Implement change monitoring logic here
-        # For demonstration, return a placeholder
-        return f"Changes monitored for {url_input}"
    except Exception as e:
-

-
-
-    intent = recognize_intent(instruction)
-    data_type = extract_data_type(instruction)
-
-    # Generate command based on the recognized intent
-    command_output = generate_command(intent, url_input, data_type, output_format)
-
-    return command_output

def create_interface():
    """Create the Gradio interface."""
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
-            #
-            Monitor and capture changes in web content automatically.
            """
        )
-
        with gr.Tabs():
            with gr.Tab("URL Scrape/Screenshot"):
                url_input = gr.Textbox(
                    label="Enter URL(s)",
-                    value="https://example.com",
                    placeholder="Enter single URL or multiple URLs separated by commas"
                )
-
                with gr.Row():
                    bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
                    action_radio = gr.Radio(
@@ -393,29 +673,41 @@ def create_interface():
                        label="Select Action",
                        value="Both"
                    )
-
                with gr.Row():
                    max_urls = gr.Slider(
                        minimum=1,
-                        maximum=
                        value=5,
                        step=1,
                        label="Max URLs to process"
                    )
                    crawl_depth = gr.Slider(
-                        minimum=
-                        maximum=
                        value=1,
                        step=1,
-                        label="Crawl Depth"
                    )
-
                process_button = gr.Button("Process URLs", variant="primary")
-
                with gr.Column():
-
                    scraped_data_output = gr.JSON(label="Results Summary")
-
                process_button.click(
                    fn=process_urls,
                    inputs=[
@@ -426,37 +718,37 @@ def create_interface():
                        crawl_depth
                    ],
                    outputs=[
-
                        scraped_data_output
                    ],
                    show_progress=True
-                )
-
            with gr.Tab("Chat-Based Scrape"):
                instruction = gr.Textbox(
                    label="Enter Instruction",
                    placeholder="e.g., 'Scrape all links' or 'Extract all images'"
                )
-
                    label="Enter URL",
                    value="https://example.com",
                    placeholder="Enter the target URL"
                )
                output_format = gr.Radio(
-                    ["
                    label="Output Format",
-                    value="
                )
-
-
                chat_button = gr.Button("Execute Instruction", variant="primary")
-
-                chat_button.click(
-                    fn=chat_based_scrape,
-                    inputs=[instruction,
-                    outputs=
                )
-
            gr.Markdown(
                """
                ### Features
@@ -464,12 +756,12 @@ def create_interface():
                - Screenshot capture
                - Content change detection
                - Recursive crawling
-                - Chat-based instructions
                """
            )
-
        return demo

if __name__ == "__main__":
-    demo = create_interface()
-    demo.launch()

import re
import logging
import json
+from typing import Tuple, List, Dict, Union, Optional
from bs4 import BeautifulSoup
+from urllib.parse import urlparse, urljoin
+from nltk import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from datetime import datetime
import io
import zipfile
import os
import tempfile
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from PIL import Image
+import base64
+import asyncio
+import yaml
+from pathlib import Path
+from tqdm import tqdm
+import plotly.graph_objects as go
+
+# Configure detailed logging
+logging.basicConfig(
+    level=logging.DEBUG,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('webscraper.log'),
+        logging.StreamHandler()
+    ]
+)
+
+# Download necessary NLTK data
import nltk
try:
+    nltk.download('punkt', quiet=True)
+    nltk.download('stopwords', quiet=True)
+    nltk.download('wordnet', quiet=True)
+    nltk.download('averaged_perceptron_tagger', quiet=True)
except Exception as e:
    logging.error(f"Error downloading NLTK data: {str(e)}")

+# Configuration and logging setup
+class Config:
+    DATA_DIR = Path('scraped_data')
+    LOGS_DIR = Path('logs')
+    MAX_RETRIES = 3
+    TIMEOUT = 30
+
+    @classmethod
+    def initialize(cls):
+        """Initialize necessary directories and configurations"""
+        cls.DATA_DIR.mkdir(exist_ok=True)
+        cls.LOGS_DIR.mkdir(exist_ok=True)
+
+        # Setup logging
+        logging.basicConfig(
+            level=logging.INFO,
+            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+            handlers=[
+                logging.FileHandler(cls.LOGS_DIR / 'app.log'),
+                logging.StreamHandler()
+            ]
+        )
+
+        return logging.getLogger(__name__)
+
+logger = Config.initialize()
+
+class WebDriverManager:
+    """Manage WebDriver instances"""
+    @staticmethod
+    def get_driver() -> webdriver.Chrome:
+        options = Options()
+        options.add_argument('--headless')
+        options.add_argument('--no-sandbox')
+        options.add_argument('--disable-dev-shm-usage')
+        options.add_argument('--window-size=1920,1080')
+
+        return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
+
+class DataExtractor:
+    """Extract and process webpage content"""
+    def __init__(self):
+        self.soup = None
+        self.url = None
+        self.logger = logging.getLogger(__name__)
+
+    def set_page(self, html: str, url: str):
+        """Set the page content for extraction"""
+        self.soup = BeautifulSoup(html, 'html.parser')
+        self.url = url
+
+    def extract_images(self) -> List[Dict]:
+        """Extract image information from the page"""
+        images = []
+        try:
+            for img in self.soup.find_all('img'):
+                image_info = {
+                    'src': urljoin(self.url, img.get('src', '')),
+                    'alt': img.get('alt', ''),
+                    'title': img.get('title', ''),
+                    'dimensions': self._get_image_dimensions(img),
+                    'file_type': self._get_file_type(img.get('src', ''))
+                }
+                images.append(image_info)
+        except Exception as e:
+            self.logger.error(f"Error extracting images: {str(e)}")
+        return images
+
+    def extract_links(self) -> List[Dict]:
+        """Extract link information from the page"""
+        links = []
+        try:
+            for a in self.soup.find_all('a', href=True):
+                absolute_url = urljoin(self.url, a.get('href', ''))
+                link_info = {
+                    'href': absolute_url,
+                    'text': a.get_text(strip=True),
+                    'title': a.get('title', ''),
+                    'type': 'internal' if self.url in absolute_url else 'external',
+                    'has_image': bool(a.find('img'))
+                }
+                links.append(link_info)
+        except Exception as e:
+            self.logger.error(f"Error extracting links: {str(e)}")
+        return links
+
+    def extract_text(self) -> List[Dict]:
+        """Extract text content from the page"""
+        texts = []
+        try:
+            for text_element in self.soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
+                text_info = {
+                    'content': text_element.get_text(strip=True),
+                    'source': text_element.name
+                }
+                if text_info['content']:  # Only add non-empty text blocks
+                    texts.append(text_info)
+        except Exception as e:
+            self.logger.error(f"Error extracting text: {str(e)}")
+        return texts
+
+    def _get_image_dimensions(self, img_tag) -> str:
+        """Get image dimensions from tag attributes"""
+        width = img_tag.get('width', '')
+        height = img_tag.get('height', '')
+        if width and height:
+            return f"{width}x{height}"
+        return "unknown"
+
+    def _get_file_type(self, src: str) -> str:
+        """Determine image file type from URL"""
+        if not src:
+            return "unknown"
+        ext = src.split('.')[-1].lower()
+        return ext if ext in ['jpg', 'jpeg', 'png', 'gif', 'webp'] else "unknown"
+
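A minimal usage sketch for the DataExtractor class added above, not part of the commit itself; the HTML snippet and URL are placeholders, and it assumes it runs in the same module as the class definitions.

# Hypothetical sketch: feed DataExtractor a page and pull structured data from it.
extractor = DataExtractor()
html = "<html><body><a href='/about'><img src='logo.png' alt='Logo'></a><p>Hello</p></body></html>"
extractor.set_page(html, "https://example.com")
print(extractor.extract_links())   # [{'href': 'https://example.com/about', 'type': 'internal', 'has_image': True, ...}]
print(extractor.extract_images())  # [{'src': 'https://example.com/logo.png', 'alt': 'Logo', 'file_type': 'png', ...}]
print(extractor.extract_text())    # [{'content': 'Hello', 'source': 'p'}]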
+class QueryAnalyzer:
+    """Analyze natural language queries"""
+    def __init__(self):
+        self.logger = logging.getLogger(__name__)
+        self.stop_words = set(stopwords.words('english'))
+        self.lemmatizer = WordNetLemmatizer()
+        self.logger.info("QueryAnalyzer initialized")
+
+    def parse_query(self, query: str) -> Dict[str, Union[str, int]]:
+        try:
+            self.logger.info(f"Parsing query: {query}")
+            tokens = word_tokenize(query.lower())
+            filtered_tokens = [self.lemmatizer.lemmatize(token) for token in tokens
+                               if token.isalnum() and token not in self.stop_words]
+
+            return {
+                'target': self._identify_target(filtered_tokens),
+                'limit': self._identify_limit(filtered_tokens),
+                'filters': self._identify_filters(filtered_tokens),
+                'output': 'JSON' if 'json' in query.lower() else 'Formatted Text'
+            }
+        except Exception as e:
+            self.logger.error(f"Error parsing query: {str(e)}")
+            return {'target': 'unknown', 'limit': 0, 'filters': {}}
+
+    def _identify_target(self, tokens: List[str]) -> str:
+        target_map = {
+            'image': 'image',
+            'images': 'image',
+            'picture': 'image',
+            'link': 'link',
+            'links': 'link',
+            'text': 'text',
+            'content': 'text'
+        }
+        for token in tokens:
+            if token in target_map:
+                return target_map[token]
+        return 'unknown'
+
+    def _identify_limit(self, tokens: List[str]) -> int:
+        for token in tokens:
+            if token.isdigit():
+                return int(token)
+        return 0
+
+    def _identify_filters(self, tokens: List[str]) -> Dict[str, str]:
+        filters = {}
+        if 'external' in tokens:
+            filters['link_type'] = 'external'
+        elif 'internal' in tokens:
+            filters['link_type'] = 'internal'
+        if 'png' in tokens:
+            filters['file_type'] = 'png'
+        elif 'jpg' in tokens or 'jpeg' in tokens:
+            filters['file_type'] = 'jpg'
+        return filters
+
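To illustrate how parse_query maps an instruction onto the fields the formatter uses later, here is a small sketch (not in the commit), assuming the NLTK punkt/stopwords/wordnet data downloaded at startup is available.

# Sketch only: tracing QueryAnalyzer.parse_query on a sample instruction.
analyzer = QueryAnalyzer()
info = analyzer.parse_query("Extract the first 5 external links as JSON")
# Following the tokenize -> lemmatize -> keyword-match logic above, this yields:
# {'target': 'link', 'limit': 5, 'filters': {'link_type': 'external'}, 'output': 'JSON'}
print(info)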
+class ResponseFormatter:
+    """Format scraped data based on user preferences"""
+    def __init__(self):
+        self.logger = logging.getLogger(__name__)
+
+    def format_data(self, data: List[Dict], query_info: Dict) -> str:
+        try:
+            if not data:
+                return "No data found for the specified query."
+
+            # Apply filters
+            filtered_data = self._apply_filters(data, query_info.get('filters', {}))
+
+            # Apply limit
+            if query_info.get('limit', 0) > 0:
+                filtered_data = filtered_data[:query_info['limit']]
+
+            if query_info['output'] == "JSON":
+                return json.dumps({
+                    "metadata": {
+                        "query": query_info,
+                        "timestamp": datetime.now().isoformat(),
+                        "results_count": len(filtered_data)
+                    },
+                    "results": filtered_data
+                }, indent=2)
+
+            return self._format_human_readable(filtered_data, query_info['target'])
+
+        except Exception as e:
+            self.logger.error(f"Formatting error: {str(e)}")
+            return f"Error formatting results: {str(e)}"
+
+    def _apply_filters(self, data: List[Dict], filters: Dict) -> List[Dict]:
+        filtered_data = data
+        if 'link_type' in filters:
+            filtered_data = [item for item in filtered_data
+                             if item.get('type', '') == filters['link_type']]
+        if 'file_type' in filters:
+            filtered_data = [item for item in filtered_data
+                             if item.get('file_type', '').lower() == filters['file_type']]
+        return filtered_data
+
+    def _format_human_readable(self, data: List[Dict], target: str) -> str:
+        formats = {
+            'image': self._format_images,
+            'link': self._format_links,
+            'text': self._format_texts
+        }
+        return formats.get(target, lambda x: "Unknown data type")(data)
+
+    def _format_images(self, images: List[Dict]) -> str:
+        return "\n\n".join(
+            f"Image {idx+1}:\n"
+            f"Source: {img['src']}\n"
+            f"Alt Text: {img['alt']}\n"
+            f"Dimensions: {img['dimensions']}\n"
+            f"Type: {img['file_type']}"
+            for idx, img in enumerate(images)
+        )
+
+    def _format_links(self, links: List[Dict]) -> str:
+        return "\n\n".join(
+            f"Link {idx+1}:\n"
+            f"URL: {link['href']}\n"
+            f"Text: {link['text']}\n"
+            f"Type: {link['type']}\n"
+            f"Contains Image: {'Yes' if link['has_image'] else 'No'}"
+            for idx, link in enumerate(links)
+        )
+
+    def _format_texts(self, texts: List[Dict]) -> str:
+        return "\n\n".join(
+            f"Text Block {idx+1} ({text['source'].upper()}):\n"
+            f"{text['content']}"
+            for idx, text in enumerate(texts)
+        )
+
+class Scraper:
+    """Core scraping functionality with improved error handling"""
+    def __init__(self):
+        self.logger = logging.getLogger(__name__)
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+        })
+
+    async def fetch_page(self, url: str) -> Optional[str]:
+        """Fetch page content with retry mechanism"""
+        for attempt in range(Config.MAX_RETRIES):
+            try:
+                response = self.session.get(url, timeout=Config.TIMEOUT)
+                response.raise_for_status()
+                return response.text
+            except Exception as e:
+                self.logger.error(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
+                if attempt == Config.MAX_RETRIES - 1:
+                    return None
+
+    async def take_screenshot(self, url: str) -> Optional[bytes]:
+        """Take a screenshot of a webpage with improved error handling."""
+        driver = None
+        try:
+            options = Options()
+            options.add_argument("--headless")
+            options.add_argument("--no-sandbox")
+            options.add_argument("--disable-dev-shm-usage")
+            options.add_argument("--window-size=1920,1080")
+
+            driver = webdriver.Chrome(options=options)
+            driver.get(url)
+
+            # Wait for page load
+            time.sleep(2)
+
+            # Take screenshot
+            screenshot = driver.get_screenshot_as_png()
+
+            # Process image
+            img = Image.open(io.BytesIO(screenshot))
+            img = img.convert('RGB')  # Convert to RGB to ensure compatibility
+
+            # Save to bytes
+            img_byte_arr = io.BytesIO()
+            img.save(img_byte_arr, format='PNG', optimize=True)
+            return img_byte_arr.getvalue()
+
+        except Exception as e:
+            logging.error(f"Screenshot error for {url}: {str(e)}")
+            return None
+        finally:
+            if driver:
+                driver.quit()
+
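Because Scraper.fetch_page is declared async (even though it wraps a blocking requests session), callers would drive it with asyncio; a minimal sketch of that call pattern, with the URL as a placeholder and assuming the definitions above are importable from this module.

# Sketch only: driving the async fetch_page coroutine defined above.
import asyncio

async def fetch_with_retries(url: str):
    scraper = Scraper()
    # Retries up to Config.MAX_RETRIES, returns the HTML text or None on repeated failure.
    return await scraper.fetch_page(url)

html = asyncio.run(fetch_with_retries("https://example.com"))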
+class SmartWebScraper:
+    """Smart web scraping with natural language processing capabilities"""
+    def __init__(self):
+        self.query_analyzer = QueryAnalyzer()
+        self.data_extractor = DataExtractor()
+        self.response_formatter = ResponseFormatter()
+        self.logger = logging.getLogger(__name__)
+        self.scraped_data = {}
+
+    def chat_based_scrape(self, instruction: str, url: str, output_format: str = "Formatted Text") -> str:
+        """Process natural language instructions for web scraping"""
+        try:
+            if not instruction or not url:
+                return "Please provide both instruction and URL."
+
+            # Process the URL and instruction
+            raw_data = self.process_url(url, instruction)
+            query_info = self.query_analyzer.parse_query(instruction)
+            query_info['output'] = output_format
+
+            if output_format == "JSON":
+                return json.dumps({
+                    "status": "success",
+                    "request": {
+                        "url": url,
+                        "instruction": instruction,
+                        "timestamp": datetime.now().isoformat()
+                    },
+                    "data": raw_data,
+                    "metadata": {
+                        "source": url,
+                        "elements_found": len(raw_data),
+                        "content_type": type(raw_data).__name__
+                    }
+                }, indent=2)
+
+            return self.response_formatter.format_data(raw_data, query_info)
+
+        except Exception as e:
+            error_msg = f"Error processing chat-based scrape: {str(e)}"
+            self.logger.error(error_msg)
+            return error_msg
+
+    def process_url(self, url: str, query: str) -> str:
+        """Process URL based on query"""
+        try:
+            # Validate URL
+            if not self._validate_url(url):
+                return "Please provide a valid URL (including http:// or https://)."
+
+            # Fetch page
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+            }
+            response = requests.get(url, headers=headers, timeout=10)
+            response.raise_for_status()
+
+            # Set page content and store in scraped_data
+            self.data_extractor.set_page(response.text, url)
+            self.scraped_data[url] = {
+                'images': self.data_extractor.extract_images(),
+                'links': self.data_extractor.extract_links(),
+                'texts': self.data_extractor.extract_text()
+            }
+
+            # Analyze query and extract data
+            query_info = self.query_analyzer.parse_query(query)
+            data = self._get_data_for_target(query_info['target'], url)
+
+            # Format response
+            return self.response_formatter.format_data(data, query_info)
+
+        except requests.exceptions.RequestException as e:
+            error_msg = f"Error fetching the webpage: {str(e)}"
+            self.logger.error(error_msg)
+            return error_msg
+        except Exception as e:
+            error_msg = f"An error occurred: {str(e)}"
+            self.logger.error(error_msg)
+            return error_msg
+
+    def _validate_url(self, url: str) -> bool:
+        """Validate URL format"""
+        try:
+            result = urlparse(url)
+            return all([result.scheme, result.netloc])
+        except Exception as e:
+            self.logger.error(f"URL validation error: {str(e)}")
+            return False
+
+    def _get_data_for_target(self, target: str, url: str) -> List[Dict]:
+        """Get specific data based on target type"""
+        if url not in self.scraped_data:
+            self.logger.warning(f"No data found for URL: {url}")
+            return []
+
+        if target == 'image':
+            return self.scraped_data[url]['images']
+        elif target == 'link':
+            return self.scraped_data[url]['links']
+        elif target == 'text':
+            return self.scraped_data[url]['texts']
+        else:
+            self.logger.warning(f"Unknown target type: {target}")
+            return []
+
+class QueryAnalyzer:
+    def __init__(self):
+        self.logger = logging.getLogger(__name__)
+        self.stop_words = set(stopwords.words('english'))
+        self.lemmatizer = WordNetLemmatizer()
+
+    def parse_query(self, query: str) -> Dict[str, Union[str, int]]:
+        try:
+            tokens = word_tokenize(query.lower())
+            filtered_tokens = [
+                self.lemmatizer.lemmatize(token)
+                for token in tokens
+                if token.isalnum() and token not in self.stop_words
+            ]
+
+            return {
+                'target': self._identify_target(filtered_tokens),
+                'limit': self._identify_limit(filtered_tokens),
+                'filters': self._identify_filters(filtered_tokens),
+                'output': 'JSON' if 'json' in query.lower() else 'Formatted Text'
+            }
+        except Exception as e:
+            self.logger.error(f"Error parsing query: {str(e)}")
+            return {'target': 'unknown', 'limit': 0, 'filters': {}}
+
+    def _identify_target(self, tokens: List[str]) -> str:
+        targets = {'image': 'image', 'link': 'link', 'text': 'text'}
+        for token in tokens:
+            if token in targets:
+                return targets[token]
+        return 'unknown'
+
+    def _identify_limit(self, tokens: List[str]) -> int:
+        for token in tokens:
+            if token.isdigit():
+                return int(token)
+        return 0
+
+    def _identify_filters(self, tokens: List[str]) -> Dict[str, str]:
+        filters = {}
+        if 'external' in tokens:
+            filters['link_type'] = 'external'
+        elif 'internal' in tokens:
+            filters['link_type'] = 'internal'
+        return filters
+
+class ResponseFormatter:
+    def __init__(self):
+        self.logger = logging.getLogger(__name__)
+
+    def format_data(self, data: List[Dict], query_info: Dict) -> Union[str, dict]:
+        try:
+            if not data:
+                return {"status": "success", "data": [], "message": "No data found"} if query_info['output'] == "JSON" else "No data found"
+
+            response = {
+                "metadata": {
+                    "target": query_info['target'],
+                    "limit": query_info['limit'],
+                    "filters": query_info['filters'],
+                    "timestamp": datetime.now().isoformat()
+                },
+                "data": data[:query_info['limit']] if query_info['limit'] > 0 else data
+            }
+
+            return json.dumps(response, indent=2) if query_info['output'] == "JSON" else self._format_text(response)
+
+        except Exception as e:
+            error_msg = {"status": "error", "message": str(e)}
+            return json.dumps(error_msg, indent=2) if query_info['output'] == "JSON" else f"Error: {str(e)}"
+
+    def _format_text(self, response: dict) -> str:
+        return json.dumps(response, indent=2)  # Fallback if text formatting fails

def sanitize_filename(filename):
+    """Sanitizes a filename by removing invalid characters."""
    return re.sub(r'[<>:"/\\|?*\n]+', '_', filename)

def validate_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
+    except Exception:
        return False

def get_latest_data(url):
    """Get the latest HTML content of a webpage."""
    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()  # Raise an exception for bad status codes
        return response.text
+    except requests.exceptions.RequestException as e:
        logging.error(f"Error fetching latest data from {url}: {str(e)}")
        return None

def take_screenshot(url):
    """Take a screenshot of a webpage."""
    try:
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
+
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
+
        screenshot = driver.get_screenshot_as_png()
        driver.quit()
+
        image = Image.open(io.BytesIO(screenshot))
        max_size = (1024, 1024)
        image.thumbnail(max_size, Image.LANCZOS)
+
        img_byte_arr = io.BytesIO()
        image.save(img_byte_arr, format='PNG')
        return img_byte_arr.getvalue()
        logging.error(f"Screenshot error for {url}: {str(e)}")
        return None

+def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth):
+    """Process URLs with crawl depth and change detection."""
    try:
+        urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
+        urls = [url.strip() for url in urls if url.strip()]
+        urls = urls[:int(max_urls)]

+        # Validate URLs
+        invalid_urls = [url for url in urls if not validate_url(url)]
+        if invalid_urls:
+            return None, None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)

+        scraped_data = []
+        screenshots = []
+        changes_log = []
+
+        # Create temporary directory for screenshots
+        temp_dir = Path("temp_screenshots")
+        temp_dir.mkdir(exist_ok=True)
+
+        # Process each URL with progress tracking
+        total_urls = len(urls)
+        for idx, url in enumerate(urls):
+            if not url.startswith(('http://', 'https://')):
+                url = f'https://{url}'
+
+            sanitized_url = sanitize_filename(url)

+            # Take screenshot
+            if action_radio in ['Capture image', 'Both']:
+                screenshot = take_screenshot(url)
+                if screenshot:
+                    screenshot_path = temp_dir / f"{sanitized_url}.png"
+                    with open(screenshot_path, 'wb') as f:
+                        f.write(screenshot)
+                    screenshots.append((url, str(screenshot_path)))  # Convert Path to string

+            # Scrape data
+            if action_radio in ['Scrape data', 'Both']:
+                html_content = get_latest_data(url)
+                if html_content:
+                    scraped_data.append({
+                        'url': url,
+                        'content_length': len(html_content),
+                        'timestamp': datetime.now().isoformat()
+                    })

+        # Create a ZIP file for the screenshots
+        zip_file_path = temp_dir / "screenshots.zip"
+        with zipfile.ZipFile(zip_file_path, 'w') as zipf:
+            for screenshot in screenshots:
+                zipf.write(screenshot[1], arcname=Path(screenshot[1]).name)  # Use string for writing

+        # Return the results
+        return str(zip_file_path), screenshots, scraped_data  # Return structured data for JSON output

    except Exception as e:
+        logging.error(f"Error in process_urls: {str(e)}")
+        return None, None, json.dumps({"error": str(e)}, indent=2)

+
+    return demo

def create_interface():
    """Create the Gradio interface."""
+    scraper = SmartWebScraper()
+
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
+            # 🌐 Enhanced Web Scraper with Change Detection and Chat
+            Monitor and capture changes in web content automatically. Use the chat interface to interact with scraped data.
            """
        )
+
        with gr.Tabs():
            with gr.Tab("URL Scrape/Screenshot"):
                url_input = gr.Textbox(
                    label="Enter URL(s)",
                    placeholder="Enter single URL or multiple URLs separated by commas"
                )
+
                with gr.Row():
                    bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
                    action_radio = gr.Radio(
                        label="Select Action",
                        value="Both"
                    )
+
                with gr.Row():
                    max_urls = gr.Slider(
                        minimum=1,
+                        maximum=20,
                        value=5,
                        step=1,
                        label="Max URLs to process"
                    )
                    crawl_depth = gr.Slider(
+                        minimum=0,
+                        maximum=3,
                        value=1,
                        step=1,
+                        label="Crawl Depth (0 for no recursion)"
                    )
+
                process_button = gr.Button("Process URLs", variant="primary")
+
                with gr.Column():
+                    # Add gallery for screenshot preview
+                    gallery = gr.Gallery(
+                        label="Screenshots Preview",
+                        show_label=True,
+                        elem_id="gallery",
+                        columns=[3],
+                        rows=[2],
+                        height="auto",
+                        object_fit="contain"  # Add proper image scaling
+                    )
+
+                    # Download button and results
+                    download_file = gr.File(label="Download Results (ZIP)")
                    scraped_data_output = gr.JSON(label="Results Summary")
+
                process_button.click(
                    fn=process_urls,
                    inputs=[
                        crawl_depth
                    ],
                    outputs=[
+                        download_file,
+                        gallery,
                        scraped_data_output
                    ],
                    show_progress=True
+                )
            with gr.Tab("Chat-Based Scrape"):
                instruction = gr.Textbox(
                    label="Enter Instruction",
                    placeholder="e.g., 'Scrape all links' or 'Extract all images'"
                )
+                chat_url_input = gr.Textbox(
                    label="Enter URL",
                    value="https://example.com",
                    placeholder="Enter the target URL"
                )
                output_format = gr.Radio(
+                    ["Formatted Text", "JSON"],
                    label="Output Format",
+                    value="Formatted Text"
                )
+                chat_output = gr.Textbox(label="Output")
+
                chat_button = gr.Button("Execute Instruction", variant="primary")
+
+                chat_button.click (
+                    fn=scraper.chat_based_scrape,
+                    inputs=[instruction, chat_url_input, output_format],
+                    outputs=chat_output
                )
+
            gr.Markdown(
                """
                ### Features
                - Screenshot capture
                - Content change detection
                - Recursive crawling
+                - Chat-based instructions for interacting with scraped data
                """
            )
+
        return demo

if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch(debug=True)
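As wired in create_interface, the chat tab hands the instruction, URL, and output format straight to SmartWebScraper.chat_based_scrape, so the same path can be exercised outside Gradio; a minimal sketch (not part of the commit) with placeholder inputs.

# Sketch only: calling the chat-based scrape path directly, without the Gradio UI.
scraper = SmartWebScraper()
result = scraper.chat_based_scrape(
    instruction="Extract all external links as JSON",
    url="https://example.com",
    output_format="JSON",
)
print(result)  # JSON string with "request", "data", and "metadata" sections as built above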