Spaces: Runtime error
Rename app.tsx to 2app.py
2app.py
ADDED
@@ -0,0 +1,410 @@
import gradio as gr
import requests
import re
import logging
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from PIL import Image
import io
import zipfile
import os
import datetime
from urllib.parse import urlparse, urljoin
import tempfile
import nltk

# Configure logging before the first logging call so the format applies
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

try:
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('averaged_perceptron_tagger')
except Exception as e:
    logging.error(f"Error downloading NLTK data: {str(e)}")
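
# Note: take_screenshot() below drives headless Chrome through Selenium, so the
# runtime also needs a Chrome/Chromium binary; with Selenium 4.6+ a matching
# chromedriver is resolved automatically by Selenium Manager.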

def sanitize_filename(filename):
    return re.sub(r'[<>:"/\\|?*\n]+', '_', filename)
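
# Example (illustrative): sanitize_filename("https://example.com/a?b=1")
# collapses each run of forbidden characters into one underscore,
# yielding "https_example.com_a_b=1".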

def validate_url(url):
    """Validate if the URL is properly formatted."""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False
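
# Example (illustrative): validate_url("https://example.com") -> True, while
# validate_url("example.com") -> False (no scheme), which is why process_urls()
# prepends "https://" to bare domains before validating.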

def get_latest_data(url):
    """Get the latest HTML content of a webpage."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # treat HTTP error pages as fetch failures
        return response.text
    except Exception as e:
        logging.error(f"Error fetching latest data from {url}: {str(e)}")
        return None

def compare_html(old_html, new_html):
    """Compare two HTML contents to detect changes."""
    if not old_html or not new_html:
        return False
    return old_html.strip() != new_html.strip()

def compare_screenshot(old_screenshot, new_screenshot):
    """Compare two screenshots to detect changes."""
    try:
        if not old_screenshot or not new_screenshot:
            return False
        old_img = Image.open(io.BytesIO(old_screenshot))
        new_img = Image.open(io.BytesIO(new_screenshot))
        # PIL Images compare by object identity, so `old_img == new_img` would
        # always report a change; compare size and raw pixel data instead.
        if old_img.size != new_img.size:
            return True
        return old_img.tobytes() != new_img.tobytes()
    except Exception as e:
        logging.error(f"Error comparing screenshots: {str(e)}")
        return False

def alert_changes(url, change_type):
    """Log detected changes."""
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logging.warning(f"[{timestamp}] Changes detected at {url}: {change_type}")
    return f"[{timestamp}] {change_type}"

def extract_links_from_page(url):
    """Extract all links from a webpage."""
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', href=True)]
        return links
    except Exception as e:
        logging.error(f"Error extracting links from {url}: {str(e)}")
        return []

def take_screenshot(url):
    """Take a screenshot of a webpage."""
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(url)
            screenshot = driver.get_screenshot_as_png()
        finally:
            driver.quit()  # always release the browser, even if navigation fails
        image = Image.open(io.BytesIO(screenshot))
        max_size = (1024, 1024)
        image.thumbnail(max_size, Image.LANCZOS)
        img_byte_arr = io.BytesIO()
        image.save(img_byte_arr, format='PNG')
        return img_byte_arr.getvalue()
    except Exception as e:
        logging.error(f"Screenshot error for {url}: {str(e)}")
        return None
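
# Example (illustrative): png_bytes = take_screenshot("https://example.com")
# returns PNG bytes downscaled to fit within 1024x1024, or None on failure.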

def is_webpage(url):
    """Check if the URL points to a webpage (HTML)."""
    try:
        # Follow redirects; requests.head() does not by default, and a bare
        # redirect response usually carries no useful Content-Type.
        response = requests.head(url, timeout=10, allow_redirects=True)
        content_type = response.headers.get('Content-Type', '').lower()
        return 'text/html' in content_type
    except Exception as e:
        logging.error(f"Error checking content type for {url}: {str(e)}")
        return False

def crawl_url(url, depth, max_depth, visited=None):
    """Recursively crawl a URL up to a specified depth."""
    if visited is None:
        visited = set()
    if depth > max_depth or url in visited:
        return []
    visited.add(url)
    screenshots = []
    if is_webpage(url):
        links = extract_links_from_page(url)
        screenshot = take_screenshot(url)
        if screenshot:
            screenshots.append((url, screenshot))
        if depth < max_depth:
            for link in links:
                # Resolve relative hrefs (e.g. "/about") against the current
                # page instead of blindly prefixing "https://".
                link = urljoin(url, link)
                if not link.startswith(('http://', 'https://')):
                    continue  # skip mailto:, javascript:, etc.
                screenshots.extend(crawl_url(link, depth + 1, max_depth, visited))
    else:
        logging.info(f"Skipping non-webpage content: {url}")
    return screenshots
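
# Example (illustrative): crawl_url("https://example.com", depth=1, max_depth=2)
# returns a list of (url, png_bytes) pairs covering the page and, one level
# deeper, the pages it links to.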

def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, mode='standard', progress=gr.Progress()):
    """Process URLs with crawl depth and change detection."""
    # Split the input into candidate URLs
    if bulk_toggle:
        urls = [url.strip() for url in re.split(r'[,\n]+', url_input.strip()) if url.strip()]
    else:
        urls = [url_input.strip()]
    urls = urls[:int(max_urls)]

    # Normalize schemes first so bare domains like "example.com" validate
    urls = [url if url.startswith(('http://', 'https://')) else f'https://{url}' for url in urls]

    # Validate all URLs
    invalid_urls = [url for url in urls if not validate_url(url)]
    if invalid_urls:
        if mode == 'chat':
            return f"Invalid URLs detected: {', '.join(invalid_urls)}"
        else:
            return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)

    scraped_data = []
    screenshots = []
    changes_log = []

    # Initialize progress tracking
    total_urls = len(urls)
    progress(0)

    # Directory to store scraped data
    data_dir = 'scraped_data'
    os.makedirs(data_dir, exist_ok=True)

    # Process each URL
    for idx, url in enumerate(urls):
        # Sanitize URL for file naming
        sanitized_url = sanitize_filename(url)

        # Paths of previously stored data, used for change detection
        old_html_path = os.path.join(data_dir, f"{sanitized_url}_html.txt")
        old_screenshot_path = os.path.join(data_dir, f"{sanitized_url}_screenshot.png")

        # Fetch latest data
        latest_html = get_latest_data(url)
        latest_screenshot = take_screenshot(url)

        # Compare with previous data if available
        if os.path.exists(old_html_path):
            with open(old_html_path, 'r', encoding='utf-8') as f:
                old_html = f.read()
            if compare_html(old_html, latest_html):
                changes_log.append(alert_changes(url, "HTML content has changed"))

        if os.path.exists(old_screenshot_path):
            with open(old_screenshot_path, 'rb') as f:
                old_screenshot = f.read()
            if latest_screenshot and compare_screenshot(old_screenshot, latest_screenshot):
                changes_log.append(alert_changes(url, "Visual content has changed"))

        # Store latest data
        if latest_html:
            with open(old_html_path, 'w', encoding='utf-8') as f:
                f.write(latest_html)

        if latest_screenshot:
            with open(old_screenshot_path, 'wb') as f:
                f.write(latest_screenshot)

        # Prepare output data
        if action_radio in ['Scrape data', 'Both']:
            scraped_data.append({
                'url': url,
                'content': latest_html,  # Include full HTML content
                'timestamp': datetime.datetime.now().isoformat(),
                'changes_detected': list(changes_log)  # snapshot of the log so far
            })

        if action_radio in ['Capture image', 'Both']:
            crawled_screenshots = crawl_url(url, depth=1, max_depth=int(crawl_depth))
            screenshots.extend(crawled_screenshots)

        # Update progress
        progress((idx + 1) / total_urls)

    if mode == 'chat':
        return "\n".join(changes_log)
    else:
        # Create a temporary file to store the ZIP
        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_file:
            with zipfile.ZipFile(tmp_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
                # Add screenshots to ZIP
                for screenshot_url, screenshot_data in screenshots:
                    sanitized_screenshot_url = sanitize_filename(screenshot_url)
                    filename = f"{sanitized_screenshot_url}.png"
                    zipf.writestr(filename, screenshot_data)

                # Add scraped data and changes log to ZIP
                if scraped_data:
                    data_to_save = {
                        'scraped_data': scraped_data,
                        'changes_log': changes_log,
                        'timestamp': datetime.datetime.now().isoformat()
                    }
                    zipf.writestr('data.json', json.dumps(data_to_save, indent=2))

            # Get the path to the temporary file
            zip_file_path = tmp_file.name

        # Prepare display data
        display_data = {
            'total_scraped_urls': len(scraped_data),
            'total_screenshots_taken': len(screenshots),
            'changes_detected': changes_log,
            'scraped_data': scraped_data  # Include full scraped data
        }

        # Return the path to the temporary ZIP file and display data
        return zip_file_path, json.dumps(display_data, indent=2)
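
# Usage sketch (illustrative): in standard mode,
#   zip_path, summary = process_urls("https://example.com", False, "Both", 5, 1)
# returns a ZIP of screenshots plus a JSON summary; with mode='chat' it instead
# returns the newline-joined change log as a single string.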

def recognize_intent(instruction: str) -> str:
    instruction = instruction.lower()
    # General patterns for actions and data types; allow filler words such as
    # "all" between the verb and the object so "scrape all links" matches too.
    action_patterns = {
        r'\b(find|extract|scrape)\b.*\b(links|images|videos|texts|prices|product names|reviews)\b': 'extract_data',
        r'\b(count)\b.*\b(links|images|videos|products)\b': 'count_data',
        r'\b(what is|get|fetch)\b.*\b(channel name|subscriber count|viewers)\b': 'fetch_specific_data',
        r'\b(monitor)\b.*\bchanges\b': 'monitor_changes',
    }
    for pattern, intent in action_patterns.items():
        if re.search(pattern, instruction):
            return intent
    return "unknown"
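
# Example (illustrative): recognize_intent("Scrape all links") -> "extract_data";
# recognize_intent("count images on this page") -> "count_data".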

def extract_data_type(instruction: str) -> str:
    instruction = instruction.lower()
    # The original patterns overlapped (every keyword in the first group mapped
    # to 'links'); instead return the specific keyword that actually matched.
    data_types = ['links', 'images', 'videos', 'texts', 'prices',
                  'product names', 'reviews', 'products',
                  'channel name', 'subscriber count', 'viewers']
    for data_type in data_types:
        if re.search(rf'\b{re.escape(data_type)}\b', instruction):
            return data_type
    return "unknown"
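
# Example (illustrative): extract_data_type("extract all images") -> "images";
# extract_data_type("scrape product names") -> "product names".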

def format_output(data, output_format):
    if output_format == "JSON":
        return json.dumps(data, indent=2)
    elif output_format == "Cleaned JSON":
        # Implement data cleaning logic here
        return json.dumps(data, indent=2)
    else:
        return str(data)

def generate_command(intent: str, url_input: str, data_type: str, output_format: str) -> str:
    if intent == "extract_data":
        data = extract_data(url_input, data_type)
        return format_output(data, output_format)
    elif intent == "count_data":
        count = count_data(url_input, data_type)
        return f"The number of {data_type} is {count}."
    elif intent == "fetch_specific_data":
        specific_data = fetch_specific_data(url_input, data_type)
        return specific_data
    elif intent == "monitor_changes":
        changes_log = monitor_changes(url_input)
        return changes_log
    else:
        return "Instruction not recognized. Please try again."

def extract_data(url, data_type):
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        if data_type == "links":
            return [a['href'] for a in soup.find_all('a', href=True)]
        elif data_type == "images":
            return [img['src'] for img in soup.find_all('img', src=True)]
        # Add more data types as needed
        else:
            return []
    except Exception as e:
        return f"Error extracting {data_type}: {str(e)}"
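
# Example (illustrative): extract_data("https://example.com", "links") returns
# the href of every <a> tag on the page as a plain Python list.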

def count_data(url, data_type):
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        if data_type == "links":
            return len(soup.find_all('a', href=True))
        elif data_type == "images":
            return len(soup.find_all('img', src=True))
        # Add more data types as needed
        else:
            return 0
    except Exception as e:
        return f"Error counting {data_type}: {str(e)}"

def fetch_specific_data(url, data_type):
    try:
        # Implement specific data fetching logic here
        # For demonstration, return a placeholder
        return f"Fetched {data_type} from {url}"
    except Exception as e:
        return f"Error fetching {data_type}: {str(e)}"

def monitor_changes(url_input):
    try:
        # Implement change monitoring logic here
        # For demonstration, return a placeholder
        return f"Changes monitored for {url_input}"
    except Exception as e:
        return f"Error monitoring changes: {str(e)}"

def chat_based_scrape(instruction, url_input, output_format):
    # Recognize intent and extract data type if applicable
    intent = recognize_intent(instruction)
    data_type = extract_data_type(instruction)
    # Generate command based on the recognized intent
    command_output = generate_command(intent, url_input, data_type, output_format)
    return command_output
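
# Example (illustrative): chat_based_scrape("extract all links",
# "https://example.com", "JSON") routes the instruction through
# recognize_intent() and extract_data_type(), then formats the result as JSON.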

def create_interface():
    """Create the Gradio interface."""
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # Smart Web Scraper with Change Detection
            Monitor and capture changes in web content automatically.
            """
        )
        with gr.Tabs():
            with gr.Tab("URL Scrape/Screenshot"):
                url_input = gr.Textbox(label="Enter URL(s)", value="https://example.com", placeholder="Enter single URL or multiple URLs separated by commas")
                with gr.Row():
                    bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
                    action_radio = gr.Radio(["Scrape data", "Capture image", "Both"], label="Select Action", value="Both")
                with gr.Row():
                    max_urls = gr.Slider(minimum=1, maximum=1000, value=5, step=1, label="Max URLs to process")
                    crawl_depth = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Crawl Depth")

                process_button = gr.Button("Process URLs", variant="primary")

                with gr.Column():
                    screenshot_zip = gr.File(label="Download Results")
                    scraped_data_output = gr.JSON(label="Results Summary")

                process_button.click(fn=process_urls, inputs=[url_input, bulk_toggle, action_radio, max_urls, crawl_depth], outputs=[screenshot_zip, scraped_data_output], show_progress=True)

            with gr.Tab("Chat-Based Scrape"):
                instruction = gr.Textbox(label="Enter Instruction", placeholder="e.g., 'Scrape all links' or 'Extract all images'")
                url_input = gr.Textbox(label="Enter URL", value="https://example.com", placeholder="Enter the target URL")
                output_format = gr.Radio(["JSON", "Cleaned JSON", "Raw Data"], label="Output Format", value="JSON")
                output = gr.Textbox(label="Output")

                chat_button = gr.Button("Execute Instruction", variant="primary")
                chat_button.click(fn=chat_based_scrape, inputs=[instruction, url_input, output_format], outputs=output)

        gr.Markdown(
            """
            ### Features
            - Bulk URL processing
            - Screenshot capture
            - Content change detection
            - Recursive crawling
            - Chat-based instructions
            """
        )
    return demo

if __name__ == "__main__":
    demo = create_interface()  # Call the function to create the interface
    demo.launch()  # Launch the Gradio app
app.tsx
DELETED
@@ -1,205 +0,0 @@
import React, { useState, useEffect, useRef } from 'react';
import { LineChart, Line, XAxis, YAxis, CartesianGrid, Tooltip, Legend, ResponsiveContainer } from "recharts";

type ChatMessage = {
  role: 'user' | 'system';
  content: string;
};

const App: React.FC = () => {
  const [urlInput, setUrlInput] = useState<string>('https://www.example.com');
  const [bulkToggle, setBulkToggle] = useState<boolean>(false);
  const [actionRadio, setActionRadio] = useState<'Scrape data' | 'Capture image' | 'Both'>('Both');
  const [maxUrls, setMaxUrls] = useState<number>(5);
  const [crawlDepth, setCrawlDepth] = useState<number>(1);
  const [scrapedDataOutput, setScrapedDataOutput] = useState<string>('');
  const [screenshotOutput, setScreenshotOutput] = useState<string | null>(null);
  const [monitorUrlsInput, setMonitorUrlsInput] = useState<string>('');
  const [intervalInput, setIntervalInput] = useState<number>(300);
  const [changeOutput, setChangeOutput] = useState<string>('');
  const [chatHistory, setChatHistory] = useState<ChatMessage[]>([]);
  const [isMonitoring, setIsMonitoring] = useState<boolean>(false);
  const [monitoringData, setMonitoringData] = useState<{ time: string; changes: number }[]>([]);
  const [isProcessing, setIsProcessing] = useState<boolean>(false);
  const [error, setError] = useState<string | null>(null);
  const wsRef = useRef<WebSocket | null>(null);

  useEffect(() => {
    if (isMonitoring) {
      wsRef.current = new WebSocket('ws://localhost:8000/ws');
      wsRef.current.onmessage = (event) => {
        const message = event.data;
        setChangeOutput(prev => prev + `Change detected: ${message}\n`);
        setChatHistory(prev => [...prev, { role: 'system', content: `Change detected: ${message}` }]);
        setMonitoringData(prev => {
          const now = new Date();
          const time = now.toLocaleTimeString();
          return [...prev, { time, changes: 1 }];
        });
      };
      wsRef.current.onclose = () => {
        console.log("Disconnected from WebSocket server.");
      };
    } else {
      if (wsRef.current) {
        wsRef.current.close();
        wsRef.current = null;
      }
    }
  }, [isMonitoring]);

  const handleProcessUrls = async () => {
    setIsProcessing(true);
    setError(null);
    try {
      const response = await fetch('http://localhost:8000/process_urls', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          url_input: urlInput,
          bulk_toggle: bulkToggle,
          action_radio: actionRadio,
          max_urls: maxUrls,
          crawl_depth: crawlDepth,
        }),
      });

      if (!response.ok) {
        const errorData = await response.json();
        throw new Error(`HTTP error! Status: ${response.status}, Message: ${errorData.message || 'Unknown error'}`);
      }

      const data = await response.json();
      setScrapedDataOutput(JSON.stringify(data.scraped_data, null, 2));
      if (data.screenshot_data) {
        setScreenshotOutput(data.screenshot_data);
      } else {
        setScreenshotOutput(null);
      }
      setError(null);
    } catch (e: any) {
      console.error("Error processing URLs:", e);
      setError(e.message);
      setScrapedDataOutput('');
      setScreenshotOutput(null);
    } finally {
      setIsProcessing(false);
    }
  };

  const handleStartMonitoring = async () => {
    setIsMonitoring(true);
    const urls = monitorUrlsInput.split('\n').map(url => url.trim()).filter(url => url !== '');
    await fetch('http://localhost:8000/start_monitoring', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ urls, interval: intervalInput }),
    });
    setChatHistory(prev => [...prev, { role: 'system', content: "Monitoring started." }]);
  };

  const handleStopMonitoring = async () => {
    setIsMonitoring(false);
    await fetch('http://localhost:8000/stop_monitoring', {
      method: 'POST',
    });
    setChatHistory(prev => [...prev, { role: 'system', content: "Monitoring stopped." }]);
    setMonitoringData([]);
  };

  return (
    <div className="bg-gray-100 min-h-screen p-4">
      <h1 className="text-3xl font-bold text-center text-gray-800 mb-8">Smart Scraper with Change Detection</h1>
      {error && <div className="bg-red-200 text-red-700 rounded-md p-2 mb-4">{error}</div>}
      <div className="flex flex-col md:flex-row space-y-4 md:space-y-0 md:space-x-4">
        {/* URL Scrape/Screenshot Tab */}
        <div className="bg-white rounded-lg shadow-md p-4 flex-1">
          {/* Existing components */}
        </div>
        {/* Monitoring Tab */}
        <div className="bg-white rounded-lg shadow-md p-4 flex-1">
          <h2 className="text-xl font-semibold mb-4 text-gray-700">Monitoring</h2>
          <div className="mb-2">
            <label className="block text-gray-700 text-sm font-bold mb-2">Enter URLs to Monitor (separated by newline)</label>
            <textarea
              className="shadow appearance-none border rounded w-full py-2 px-3 text-gray-700 leading-tight focus:outline-none focus:shadow-outline"
              value={monitorUrlsInput}
              onChange={(e) => setMonitorUrlsInput(e.target.value)}
            />
          </div>
          <div className="mb-2">
            <label className="block text-gray-700 text-sm font-bold mb-2">Monitoring Interval (seconds)</label>
            <input type="range" className="form-range w-full" min={1} max={3600} value={intervalInput} onChange={(e) => setIntervalInput(parseInt(e.target.value))} />
            <span className="text-sm text-gray-600">{intervalInput}</span>
          </div>
          <div className="flex space-x-4 mb-4">
            <button
              className={`bg-green-600 hover:bg-green-700 text-white font-bold py-2 px-4 rounded focus:outline-none focus:shadow-outline ${isMonitoring ? 'opacity-50 cursor-not-allowed' : ''}`}
              onClick={handleStartMonitoring}
              disabled={isMonitoring}
            >
              Start Monitoring
            </button>
            <button
              className={`bg-red-600 hover:bg-red-700 text-white font-bold py-2 px-4 rounded focus:outline-none focus:shadow-outline ${!isMonitoring ? 'opacity-50 cursor-not-allowed' : ''}`}
              onClick={handleStopMonitoring}
              disabled={!isMonitoring}
            >
              Stop Monitoring
            </button>
          </div>
          {changeOutput && (
            <div className="mb-4">
              <label className="block text-gray-700 text-sm font-bold mb-2">Monitoring Changes</label>
              <pre className="border border-gray-300 rounded-md bg-gray-50 p-2 overflow-auto max-h-48 whitespace-pre-wrap">
                {changeOutput}
              </pre>
            </div>
          )}
          {monitoringData.length > 0 && (
            <div className="mb-4">
              <label className="block text-gray-700 text-sm font-bold mb-2">Change History Graph</label>
              <ResponsiveContainer width="100%" height={200}>
                <LineChart data={monitoringData}>
                  <CartesianGrid strokeDasharray="3 3" />
                  <XAxis dataKey="time" />
                  <YAxis />
                  <Tooltip />
                  <Legend />
                  <Line type="monotone" dataKey="changes" stroke="#8884d8" />
                </LineChart>
              </ResponsiveContainer>
            </div>
          )}
          <div className="mb-2">
            <label className="block text-gray-700 text-sm font-bold mb-2">Monitoring Chat</label>
            <div className="border rounded-md bg-gray-50 p-2 overflow-auto max-h-48 mb-2">
              <ul className="space-y-2">
                {chatHistory.map((msg, index) => (
                  <li key={index} className={msg.role === 'user' ? 'text-right' : 'text-left'}>
                    <div className={`${msg.role === 'user' ? 'bg-indigo-100' : 'bg-gray-100'} inline-block rounded-md p-2`}>
                      <span className="font-bold text-gray-700">{msg.role === 'user' ? 'You' : 'System'}:</span> <span className="text-gray-800">{msg.content}</span>
                    </div>
                  </li>
                ))}
              </ul>
            </div>
            <input
              type="text"
              placeholder="Type command"
              className="shadow appearance-none border rounded w-full py-2 px-3 text-gray-700 leading-tight focus:outline-none focus:shadow-outline"
              onKeyDown={(e) => {
                if (e.key === 'Enter') {
                  // e.target is typed as EventTarget; cast once before reading and clearing the value
                  const input = e.target as HTMLInputElement;
                  setChatHistory(prev => [...prev, { role: 'user', content: input.value }]);
                  input.value = '';
                }
              }}
            />
          </div>
        </div>
      </div>
    </div>
  );
};

export default App;