Update 1app.py
1app.py
CHANGED
@@ -10,6 +10,13 @@ from nltk import word_tokenize
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from datetime import datetime
+import io
+import zipfile
+import os
+import tempfile
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from PIL import Image

 # Configure detailed logging
 logging.basicConfig(
@@ -23,10 +30,251 @@ logging.basicConfig(

 # Download necessary NLTK data
 import nltk
-
-
-nltk.download('
-nltk.download('
+
+try:
+    nltk.download('punkt', quiet=True)
+    nltk.download('stopwords', quiet=True)
+    nltk.download('wordnet', quiet=True)
+    nltk.download('averaged_perceptron_tagger', quiet=True)
+except Exception as e:
+    logging.error(f"Error downloading NLTK data: {str(e)}")
+
+def sanitize_filename(filename):
+    """Sanitizes a filename by removing invalid characters."""
+    return re.sub(r'[<>:"/\\|?*\n]+', '_', filename)
+
+def validate_url(url):
+    """Validate if the URL is properly formatted."""
+    try:
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])
+    except Exception:
+        return False
+
+def get_latest_data(url):
+    """Get the latest HTML content of a webpage."""
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()  # Raise an exception for bad status codes
+        return response.text
+    except requests.exceptions.RequestException as e:
+        logging.error(f"Error fetching latest data from {url}: {str(e)}")
+        return None
+
+def compare_html(old_html, new_html):
+    """Compare two HTML contents to detect changes."""
+    if not old_html or not new_html:
+        return False
+    return old_html.strip() != new_html.strip()
+
+def compare_screenshot(old_screenshot, new_screenshot):
+    """Compare two screenshots to detect changes."""
+    try:
+        if not old_screenshot or not new_screenshot:
+            return False
+        old_img = Image.open(io.BytesIO(old_screenshot))
+        new_img = Image.open(io.BytesIO(new_screenshot))
+        return not (old_img.tobytes() == new_img.tobytes())
+    except Exception as e:
+        logging.error(f"Error comparing screenshots: {str(e)}")
+        return False
+
+def alert_changes(url, change_type):
+    """Log detected changes."""
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    logging.warning(f"[{timestamp}] Changes detected at {url}: {change_type}")
+    return f"[{timestamp}] {change_type}"
+
+def extract_links_from_page(url):
+    """Extract all links from a webpage."""
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')
+        links = [a.get('href') for a in soup.find_all('a', href=True)]
+        return links
+    except requests.exceptions.RequestException as e:
+        logging.error(f"Error extracting links from {url}: {str(e)}")
+        return []
+
+def take_screenshot(url):
+    """Take a screenshot of a webpage."""
+    try:
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        chrome_options.add_argument("--window-size=1920,1080")
+
+        driver = webdriver.Chrome(options=chrome_options)
+        driver.get(url)
+
+        screenshot = driver.get_screenshot_as_png()
+        driver.quit()
+
+        image = Image.open(io.BytesIO(screenshot))
+        max_size = (1024, 1024)
+        image.thumbnail(max_size, Image.LANCZOS)
+
+        img_byte_arr = io.BytesIO()
+        image.save(img_byte_arr, format='PNG')
+        return img_byte_arr.getvalue()
+    except Exception as e:
+        logging.error(f"Screenshot error for {url}: {str(e)}")
+        return None
+
+def is_webpage(url):
+    """Check if the URL points to a webpage (HTML)."""
+    try:
+        response = requests.head(url, timeout=10)
+        response.raise_for_status()
+        content_type = response.headers.get('Content-Type', '').lower()
+        return 'text/html' in content_type
+    except requests.exceptions.RequestException as e:
+        logging.error(f"Error checking content type for {url}: {str(e)}")
+        return False
+
+def crawl_url(url, depth, max_depth, visited=None):
+    """Recursively crawl a URL up to a specified depth."""
+    if visited is None:
+        visited = set()
+
+    if depth > max_depth or url in visited or not validate_url(url):
+        return []
+
+    visited.add(url)
+    screenshots = []
+
+    if is_webpage(url):
+        links = extract_links_from_page(url)
+        screenshot = take_screenshot(url)
+        if screenshot:
+            screenshots.append((url, screenshot))
+
+        if depth < max_depth:
+            for link in links:
+                absolute_link = urljoin(url, link)
+                screenshots.extend(crawl_url(absolute_link, depth + 1, max_depth, visited))
+    else:
+        logging.info(f"Skipping non-webpage content: {url}")
+
+    return screenshots
+
+def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, mode='standard', progress=gr.Progress()):
+    """Process URLs with crawl depth and change detection."""
+    urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
+    urls = [url.strip() for url in urls if url.strip()]
+    urls = urls[:int(max_urls)]
+
+    # Validate all URLs
+    invalid_urls = [url for url in urls if not validate_url(url)]
+    if invalid_urls:
+        if mode == 'chat':
+            return f"Invalid URLs detected: {', '.join(invalid_urls)}"
+        else:
+            return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)
+
+    scraped_data = []
+    screenshots = []
+    changes_log = []
+
+    # Initialize progress tracking
+    total_urls = len(urls)
+    progress(0, desc="Starting...")
+
+    # Directory to store scraped data
+    data_dir = 'scraped_data'
+    os.makedirs(data_dir, exist_ok=True)
+
+    # Process each URL
+    for idx, url in enumerate(urls):
+        progress((idx + 1) / total_urls, desc=f"Processing: {url}")
+
+        if not url.startswith(('http://', 'https://')):
+            url = f'https://{url}'
+
+        # Sanitize URL for file naming
+        sanitized_url = sanitize_filename(url)
+
+        # Check for changes
+        old_html_path = os.path.join(data_dir, f"{sanitized_url}_html.txt")
+        old_screenshot_path = os.path.join(data_dir, f"{sanitized_url}_screenshot.png")
+
+        # Fetch latest data
+        latest_html = get_latest_data(url)
+        latest_screenshot = take_screenshot(url)
+
+        # Compare with previous data if available
+        if os.path.exists(old_html_path):
+            with open(old_html_path, 'r', encoding='utf-8') as f:
+                old_html = f.read()
+            if compare_html(old_html, latest_html):
+                changes_log.append(alert_changes(url, "HTML content has changed"))
+
+        if os.path.exists(old_screenshot_path):
+            with open(old_screenshot_path, 'rb') as f:
+                old_screenshot = f.read()
+            if latest_screenshot and compare_screenshot(old_screenshot, latest_screenshot):
+                changes_log.append(alert_changes(url, "Visual content has changed"))
+
+        # Store latest data
+        if latest_html:
+            with open(old_html_path, 'w', encoding='utf-8') as f:
+                f.write(latest_html)
+        if latest_screenshot:
+            with open(old_screenshot_path, 'wb') as f:
+                f.write(latest_screenshot)
+
+        # Prepare output data
+        if action_radio in ['Scrape data', 'Both']:
+            scraped_data.append({
+                'url': url,
+                'content': latest_html,
+                'timestamp': datetime.now().isoformat(),
+                'changes_detected': changes_log
+            })
+
+        if action_radio in ['Capture image', 'Both']:
+            crawled_screenshots = crawl_url(url, depth=0, max_depth=int(crawl_depth))
+            screenshots.extend(crawled_screenshots)
+
+    if mode == 'chat':
+        return "\n".join(changes_log)
+    else:
+        # Create a temporary file to store the ZIP
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_file:
+            with zipfile.ZipFile(tmp_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
+                # Add screenshots to ZIP
+                for screenshot_url, screenshot_data in screenshots:
+                    sanitized_screenshot_url = sanitize_filename(screenshot_url)
+                    filename = f"{sanitized_screenshot_url}.png"
+                    zipf.writestr(filename, screenshot_data)

+                # Add scraped data and changes log to ZIP
+                if scraped_data:
+                    data_to_save = {
+                        'scraped_data': scraped_data,
+                        'changes_log': changes_log,
+                        'timestamp': datetime.now().isoformat()
+                    }
+                    zipf.writestr('data.json', json.dumps(data_to_save, indent=2))
+
+        # Get the path to the temporary file
+        zip_file_path = tmp_file.name
+
+        # Prepare display data
+        display_data = {
+            'total_scraped_urls': len(scraped_data),
+            'total_screenshots_taken': len(screenshots),
+            'changes_detected': changes_log,
+            'scraped_data': scraped_data
+        }
+
+        # Return the path to the temporary ZIP file and display data
+        return zip_file_path, json.dumps(display_data, indent=2)

 class DataExtractor:
     def __init__(self):
@@ -43,20 +291,17 @@ class DataExtractor:
         if not self.soup:
             self.logger.error("BeautifulSoup object not initialized")
             return []
-
+
         images = []
         all_imgs = self.soup.find_all('img')
         self.logger.info(f"Found {len(all_imgs)} raw image tags")
-
+
         for img in all_imgs:
             try:
                 src = img.get('src', '')
                 if src:
                     # Handle relative URLs
-
-                    src = urljoin(self.base_url, src)
-                elif not src.startswith(('http://', 'https://')):
-                    src = urljoin(self.base_url, src)
+                    src = urljoin(self.base_url, src)

                 image_data = {
                     'src': src,
@@ -70,7 +315,7 @@ class DataExtractor:
             except Exception as e:
                 self.logger.error(f"Error processing image: {str(e)}")
                 continue
-
+
         self.logger.info(f"Successfully extracted {len(images)} valid images")
         return images

@@ -78,20 +323,17 @@ class DataExtractor:
         if not self.soup:
             self.logger.error("BeautifulSoup object not initialized")
             return []
-
+
         links = []
         all_links = self.soup.find_all('a')
         self.logger.info(f"Found {len(all_links)} raw link tags")
-
+
         for a in all_links:
             try:
                 href = a.get('href', '')
                 if href and not href.startswith(('#', 'javascript:', 'mailto:')):
                     # Handle relative URLs
-
-                    href = urljoin(self.base_url, href)
-                elif not href.startswith(('http://', 'https://')):
-                    href = urljoin(self.base_url, href)
+                    href = urljoin(self.base_url, href)

                 links.append({
                     'href': href,
@@ -104,7 +346,7 @@ class DataExtractor:
             except Exception as e:
                 self.logger.error(f"Error processing link: {str(e)}")
                 continue
-
+
         self.logger.info(f"Successfully extracted {len(links)} valid links")
         return links

@@ -112,11 +354,11 @@ class DataExtractor:
         if not self.soup:
             self.logger.error("BeautifulSoup object not initialized")
             return []
-
+
         texts = []
         all_paragraphs = self.soup.find_all('p')  # Extracting all paragraph tags
         self.logger.info(f"Found {len(all_paragraphs)} raw paragraph tags")
-
+
         for p in all_paragraphs:
             try:
                 text_content = p.get_text(strip=True)
@@ -129,7 +371,7 @@ class DataExtractor:
             except Exception as e:
                 self.logger.error(f"Error processing text block: {str(e)}")
                 continue
-
+
         self.logger.info(f"Successfully extracted {len(texts)} valid text blocks")
         return texts

@@ -158,13 +400,13 @@ class QueryAnalyzer:
             self.logger.info(f"Parsing query: {query}")
             tokens = word_tokenize(query.lower())
             filtered_tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in self.stop_words]
-
+
             query_info = {
                 'target': self._identify_target(filtered_tokens),
                 'limit': self._identify_limit(filtered_tokens),
                 'filters': self._identify_filters(filtered_tokens)
             }
-
+
             self.logger.info(f"Query parsed: {query_info}")
             return query_info
         except Exception as e:
@@ -274,7 +516,8 @@ class SmartWebScraper:
         self.data_extractor = DataExtractor()
         self.response_formatter = ResponseFormatter()
         self.logger = logging.getLogger(__name__)
-
+        self.scraped_data = {}  # Temporarily store scraped data
+
     def process_url(self, url: str, query: str) -> str:
         try:
             # Validate URL
@@ -290,21 +533,26 @@ class SmartWebScraper:
             response.raise_for_status()
             self.logger.info(f"Successfully fetched page. Status code: {response.status_code}")

-            # Set page content
+            # Set page content and store in scraped_data
             self.data_extractor.set_page(response.text, url)
+            self.scraped_data[url] = {
+                'images': self.data_extractor.extract_images(),
+                'links': self.data_extractor.extract_links(),
+                'texts': self.data_extractor.extract_text()
+            }

             # Analyze query
             query_info = self.query_analyzer.parse_query(query)
             self.logger.info(f"Query analysis: {query_info}")

             # Extract requested data
-            data = self._get_data_for_target(query_info['target'])
+            data = self._get_data_for_target(query_info['target'], url)
             self.logger.info(f"Extracted {len(data)} items for target: {query_info['target']}")

             # Format response
             formatted_response = self.response_formatter.format_data(data, query_info)
             self.logger.info("Response formatted successfully")
-
+
             return formatted_response

         except requests.exceptions.RequestException as e:
@@ -324,89 +572,178 @@ class SmartWebScraper:
             self.logger.error(f"URL validation error: {str(e)}")
             return False

-    def _get_data_for_target(self, target: str) -> List[Dict]:
-
-
-            'link': self.data_extractor.extract_links,
-            'text': self.data_extractor.extract_text  # extract_text method is now defined
-        }
-        extractor = extractors.get(target)
-        if not extractor:
-            self.logger.warning(f"No extractor found for target: {target}")
+    def _get_data_for_target(self, target: str, url: str) -> List[Dict]:
+        if url not in self.scraped_data:
+            self.logger.warning(f"No data found for URL: {url}")
             return []
-
-
-
-
-            return
-
-            self.
+
+        if target == 'image':
+            return self.scraped_data[url]['images']
+        elif target == 'link':
+            return self.scraped_data[url]['links']
+        elif target == 'text':
+            return self.scraped_data[url]['texts']
+        else:
+            self.logger.warning(f"No extractor found for target: {target}")
             return []

+    def recognize_intent(self, instruction: str) -> str:
+        """Recognizes the intent of an instruction."""
+        instruction = instruction.lower()
+        # General patterns for actions and data types
+        action_patterns = {
+            r'\b(find|extract|scrape)\s+(links|images|texts)\b': 'extract_data',
+            r'\b(count)\s+(links|images|texts)\b': 'count_data',
+        }
+        for pattern, intent in action_patterns.items():
+            if re.search(pattern, instruction):
+                return intent
+        return "unknown"
+
+    def extract_data_type(self, instruction: str) -> str:
+        """Extracts the data type from an instruction."""
+        instruction = instruction.lower()
+        data_types = {
+            r'\b(links)\b': 'link',
+            r'\b(images)\b': 'image',
+            r'\b(texts)\b': 'text',
+        }
+        for pattern, data_type in data_types.items():
+            if re.search(pattern, instruction):
+                return data_type
+        return "unknown"
+
+    def chat_based_scrape(self, instruction, url_input, output_format):
+        """Handles chat-based instructions for web scraping."""
+        if not validate_url(url_input):
+            return "Invalid URL. Please enter a valid URL."
+
+        if url_input not in self.scraped_data:
+            self.process_url(url_input, "")  # Fetch and store data if not already present
+
+        # Recognize intent and extract data type if applicable
+        intent = self.recognize_intent(instruction)
+        data_type = self.extract_data_type(instruction)
+
+        if intent == "unknown" or data_type == "unknown":
+            return "Instruction not recognized. Please try again."
+
+        # Extract data based on intent and data type
+        if intent == "extract_data":
+            data = self._get_data_for_target(data_type, url_input)
+            if output_format == "JSON":
+                return json.dumps(data, indent=2)
+            else:
+                query_info = {'target': data_type, 'limit': 0, 'filters': {}}
+                return self.response_formatter.format_data(data, query_info)
+        elif intent == "count_data":
+            data = self._get_data_for_target(data_type, url_input)
+            return f"The number of {data_type}s is {len(data)}."
+        else:
+            return "Instruction not recognized. Please try again."
+
 def create_interface():
+    """Create the Gradio interface."""
     scraper = SmartWebScraper()
-
-    def process_request(url: str, query: str) -> str:
-        return scraper.process_url(url, query)
-
+
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
-        gr.Markdown(
-
-
-
-
-            label="Website URL",
-            placeholder="https://example.com",
-            show_label=True
-        )
-
-        query_input = gr.Textbox(
-            label="What would you like to know?",
-            placeholder="Try: 'Show me the first 3 images' or 'How many links are there?'",
-            show_label=True
+        gr.Markdown(
+            """
+            # 🌐 Enhanced Web Scraper with Change Detection and Chat
+            Monitor and capture changes in web content automatically. Use the chat interface to interact with scraped data.
+            """
         )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        with gr.Tabs():
+            with gr.Tab("URL Scrape/Screenshot"):
+                url_input = gr.Textbox(
+                    label="Enter URL(s)",
+                    value="https://example.com",
+                    placeholder="Enter single URL or multiple URLs separated by commas"
+                )
+
+                with gr.Row():
+                    bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
+                    action_radio = gr.Radio(
+                        ["Scrape data", "Capture image", "Both"],
+                        label="Select Action",
+                        value="Both"
+                    )
+
+                with gr.Row():
+                    max_urls = gr.Slider(
+                        minimum=1,
+                        maximum=20,
+                        value=5,
+                        step=1,
+                        label="Max URLs to process"
+                    )
+                    crawl_depth = gr.Slider(
+                        minimum=0,
+                        maximum=3,
+                        value=1,
+                        step=1,
+                        label="Crawl Depth (0 for no recursion)"
+                    )
+
+                process_button = gr.Button("Process URLs", variant="primary")
+
+                with gr.Column():
+                    screenshot_zip = gr.File(label="Download Results")
+                    scraped_data_output = gr.JSON(label="Results Summary")
+
+                process_button.click(
+                    fn=process_urls,
+                    inputs=[
+                        url_input,
+                        bulk_toggle,
+                        action_radio,
+                        max_urls,
+                        crawl_depth
+                    ],
+                    outputs=[
+                        screenshot_zip,
+                        scraped_data_output
+                    ],
+                    show_progress=True
+                )
+
+            with gr.Tab("Chat-Based Scrape"):
+                instruction = gr.Textbox(
+                    label="Enter Instruction",
+                    placeholder="e.g., 'Scrape all links' or 'Extract all images'"
+                )
+                chat_url_input = gr.Textbox(
+                    label="Enter URL",
+                    value="https://example.com",
+                    placeholder="Enter the target URL"
+                )
+                output_format = gr.Radio(
+                    ["Formatted Text", "JSON"],
+                    label="Output Format",
+                    value="Formatted Text"
+                )
+                chat_output = gr.Textbox(label="Output")
+
+                chat_button = gr.Button("Execute Instruction", variant="primary")
+
+                chat_button.click(
+                    fn=scraper.chat_based_scrape,
+                    inputs=[instruction, chat_url_input, output_format],
+                    outputs=chat_output
+                )
+
+        gr.Markdown(
+            """
+            ### Features
+            - Bulk URL processing
+            - Screenshot capture
+            - Content change detection
+            - Recursive crawling
+            - Chat-based instructions for interacting with scraped data
+            """
         )

-        # Example queries section
-        gr.Markdown("""
-        ## 📝 Example queries:
-
-        **Images:**
-        - "Show me all images with their descriptions"
-        - "How many images are on this page?"
-        - "Find the largest images"
-
-        **Links:**
-        - "List the first 5 external links"
-        - "Show me links with images"
-        - "How many internal links are there?"
-
-        **Text:**
-        - "Extract main paragraphs with their headings"
-        - "Show me the longest text blocks"
-        - "Find paragraphs containing links"
-
-        **Advanced:**
-        - "Give me an overview of the page structure"
-        - "Show me the most recent content"
-        - "Analyze the page's organization"
-        """)
-
     return demo

 if __name__ == "__main__":
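The last hunk ends at the if __name__ == "__main__": guard, so the launch call itself is not part of the shown diff. For a Gradio app built from a factory like create_interface(), the conventional entry point is sketched below; the actual launch arguments used by this Space are not visible here and are assumed.

if __name__ == "__main__":
    # Build the Blocks app and start the Gradio server with default settings (assumed).
    demo = create_interface()
    demo.launch()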
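The helpers added near the top of the file (get_latest_data, compare_html, alert_changes, and the scraped_data directory) implement a store-and-compare cycle: the first run writes a snapshot of each page to disk, and later runs report a change when the stored copy differs from the freshly fetched one. A minimal standalone sketch of that cycle, assuming only the requests library; the URL, directory, and function name below are illustrative, not part of the commit:

import os
import requests

DATA_DIR = "scraped_data"

def html_changed_since_last_run(url, snapshot_name="snapshot_html.txt"):
    """Fetch url, compare it with the stored snapshot, then refresh the snapshot."""
    os.makedirs(DATA_DIR, exist_ok=True)
    snapshot_path = os.path.join(DATA_DIR, snapshot_name)

    response = requests.get(url, timeout=10)
    response.raise_for_status()
    latest_html = response.text

    changed = False
    if os.path.exists(snapshot_path):
        with open(snapshot_path, "r", encoding="utf-8") as f:
            # Same whitespace-insensitive comparison as compare_html in the diff.
            changed = f.read().strip() != latest_html.strip()

    # Persist the latest copy so the next run has something to compare against.
    with open(snapshot_path, "w", encoding="utf-8") as f:
        f.write(latest_html)
    return changed

The first call for a given URL returns False (there is no snapshot yet); a later call returns True only if the page's HTML has changed in the meantime.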
|