import gradio as gr
import requests
import re
import logging
import json
from typing import Tuple, List, Dict, Union
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datetime import datetime
import io
import zipfile
import os
import tempfile
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from PIL import Image

# Configure detailed logging
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('webscraper.log'),
        logging.StreamHandler()
    ]
)

# Download necessary NLTK data
import nltk

try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
except Exception as e:
    logging.error(f"Error downloading NLTK data: {str(e)}")

def sanitize_filename(filename):
    """Sanitizes a filename by replacing invalid characters."""
    return re.sub(r'[<>:"/\\|?*\n]+', '_', filename)

def validate_url(url):
    """Validate if the URL is properly formatted."""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except Exception:
        return False
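# For example, sanitize_filename("https://example.com/a b") returns "https_example.com_a b"
# (runs of ':' and '/' collapse into a single underscore), while validate_url returns False
# for bare hostnames such as "example.com" because the scheme is missing.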

def get_latest_data(url):
    """Get the latest HTML content of a webpage."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.text
    except requests.exceptions.RequestException as e:
        logging.error(f"Error fetching latest data from {url}: {str(e)}")
        return None

def compare_html(old_html, new_html):
    """Compare two HTML documents to detect changes."""
    if not old_html or not new_html:
        return False
    return old_html.strip() != new_html.strip()
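
# Comparing raw HTML flags a change whenever any markup differs (rotating tokens, inline
# timestamps, ad slots), even if the visible content is identical. A minimal sketch of a
# stricter check that diffs only the rendered text; `compare_visible_text` is an
# illustrative helper, not part of the original app, and is not wired into process_urls:
def compare_visible_text(old_html, new_html):
    """Return True only if the human-visible text of the two documents differs."""
    if not old_html or not new_html:
        return False
    old_text = BeautifulSoup(old_html, 'html.parser').get_text(separator=' ', strip=True)
    new_text = BeautifulSoup(new_html, 'html.parser').get_text(separator=' ', strip=True)
    return old_text != new_text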

def compare_screenshot(old_screenshot, new_screenshot):
    """Compare two screenshots to detect changes."""
    try:
        if not old_screenshot or not new_screenshot:
            return False
        old_img = Image.open(io.BytesIO(old_screenshot))
        new_img = Image.open(io.BytesIO(new_screenshot))
        return old_img.tobytes() != new_img.tobytes()
    except Exception as e:
        logging.error(f"Error comparing screenshots: {str(e)}")
        return False
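
# The byte-for-byte comparison above treats any single changed pixel as a change. A hedged
# sketch of a tolerance-based alternative (not used by the app; the 10 / 0.02 thresholds
# are arbitrary illustrations, not tuned values):
from PIL import ImageChops

def screenshots_differ_significantly(old_screenshot, new_screenshot, threshold=0.02):
    """Return True if more than `threshold` of the pixels differ noticeably between two PNGs."""
    old_img = Image.open(io.BytesIO(old_screenshot)).convert('RGB')
    new_img = Image.open(io.BytesIO(new_screenshot)).convert('RGB')
    if old_img.size != new_img.size:
        return True
    diff = ImageChops.difference(old_img, new_img).convert('L')
    changed = sum(1 for px in diff.getdata() if px > 10)
    return changed / (diff.width * diff.height) > threshold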

def alert_changes(url, change_type):
    """Log detected changes."""
    # `datetime` is the class imported above, so call .now() on it directly
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logging.warning(f"[{timestamp}] Changes detected at {url}: {change_type}")
    return f"[{timestamp}] {change_type}"

def extract_links_from_page(url):
    """Extract all links from a webpage."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [a.get('href') for a in soup.find_all('a', href=True)]
        return links
    except requests.exceptions.RequestException as e:
        logging.error(f"Error extracting links from {url}: {str(e)}")
        return []

def take_screenshot(url):
    """Take a screenshot of a webpage with headless Chrome."""
    driver = None
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")

        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
        screenshot = driver.get_screenshot_as_png()

        # Downscale large captures so the result ZIP stays manageable
        image = Image.open(io.BytesIO(screenshot))
        max_size = (1024, 1024)
        image.thumbnail(max_size, Image.LANCZOS)

        img_byte_arr = io.BytesIO()
        image.save(img_byte_arr, format='PNG')
        return img_byte_arr.getvalue()
    except Exception as e:
        logging.error(f"Screenshot error for {url}: {str(e)}")
        return None
    finally:
        # Always release the browser, even if the capture fails part-way
        if driver is not None:
            driver.quit()
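
# Note: the screenshot path assumes a Chrome/Chromium browser is installed on the host
# (recent Selenium releases can fetch a matching chromedriver via Selenium Manager, but the
# browser binary itself must be present, e.g. via packages.txt on Hugging Face Spaces).
# When it is missing, every capture logs "Screenshot error" and returns None.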

def is_webpage(url):
    """Check if the URL points to a webpage (HTML)."""
    try:
        response = requests.head(url, timeout=10)
        response.raise_for_status()
        content_type = response.headers.get('Content-Type', '').lower()
        return 'text/html' in content_type
    except requests.exceptions.RequestException as e:
        logging.error(f"Error checking content type for {url}: {str(e)}")
        return False

def crawl_url(url, depth, max_depth, visited=None):
    """Recursively crawl a URL up to a specified depth."""
    if visited is None:
        visited = set()
    if depth > max_depth or url in visited or not validate_url(url):
        return []

    visited.add(url)
    screenshots = []

    if is_webpage(url):
        links = extract_links_from_page(url)
        screenshot = take_screenshot(url)
        if screenshot:
            screenshots.append((url, screenshot))
        if depth < max_depth:
            for link in links:
                absolute_link = urljoin(url, link)
                screenshots.extend(crawl_url(absolute_link, depth + 1, max_depth, visited))
    else:
        logging.info(f"Skipping non-webpage content: {url}")

    return screenshots
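
# Example (kept as a comment so the module stays import-safe; it performs live HTTP requests):
#   shots = crawl_url("https://example.com", depth=0, max_depth=1)
#   # -> list of (url, png_bytes) tuples for the page and the pages it links to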

def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, mode='standard', progress=gr.Progress()):
    """Process URLs with crawl depth and change detection."""
    urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
    urls = [url.strip() for url in urls if url.strip()]
    urls = urls[:int(max_urls)]
    # Add a default scheme before validation so bare domains such as "example.com"
    # are not rejected outright
    urls = [url if url.startswith(('http://', 'https://')) else f'https://{url}' for url in urls]

    # Validate all URLs
    invalid_urls = [url for url in urls if not validate_url(url)]
    if invalid_urls:
        if mode == 'chat':
            return f"Invalid URLs detected: {', '.join(invalid_urls)}"
        else:
            return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)

    scraped_data = []
    screenshots = []
    changes_log = []

    # Initialize progress tracking
    total_urls = len(urls)
    progress(0, desc="Starting...")

    # Directory to store scraped data
    data_dir = 'scraped_data'
    os.makedirs(data_dir, exist_ok=True)

    # Process each URL
    for idx, url in enumerate(urls):
        progress((idx + 1) / total_urls, desc=f"Processing: {url}")

        # Sanitize URL for file naming
        sanitized_url = sanitize_filename(url)

        # Paths of the previously stored snapshot, used for change detection
        old_html_path = os.path.join(data_dir, f"{sanitized_url}_html.txt")
        old_screenshot_path = os.path.join(data_dir, f"{sanitized_url}_screenshot.png")

        # Fetch latest data
        latest_html = get_latest_data(url)
        latest_screenshot = take_screenshot(url)

        # Compare with previous data if available
        if os.path.exists(old_html_path):
            with open(old_html_path, 'r', encoding='utf-8') as f:
                old_html = f.read()
            if compare_html(old_html, latest_html):
                changes_log.append(alert_changes(url, "HTML content has changed"))

        if os.path.exists(old_screenshot_path):
            with open(old_screenshot_path, 'rb') as f:
                old_screenshot = f.read()
            if latest_screenshot and compare_screenshot(old_screenshot, latest_screenshot):
                changes_log.append(alert_changes(url, "Visual content has changed"))

        # Store latest data
        if latest_html:
            with open(old_html_path, 'w', encoding='utf-8') as f:
                f.write(latest_html)
        if latest_screenshot:
            with open(old_screenshot_path, 'wb') as f:
                f.write(latest_screenshot)

        # Prepare output data
        if action_radio in ['Scrape data', 'Both']:
            scraped_data.append({
                'url': url,
                'content': latest_html,
                'timestamp': datetime.now().isoformat(),
                'changes_detected': changes_log
            })
        if action_radio in ['Capture image', 'Both']:
            crawled_screenshots = crawl_url(url, depth=0, max_depth=int(crawl_depth))
            screenshots.extend(crawled_screenshots)

    if mode == 'chat':
        return "\n".join(changes_log)
    else:
        # Create a temporary file to store the ZIP
        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_file:
            with zipfile.ZipFile(tmp_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
                # Add screenshots to ZIP
                for screenshot_url, screenshot_data in screenshots:
                    sanitized_screenshot_url = sanitize_filename(screenshot_url)
                    filename = f"{sanitized_screenshot_url}.png"
                    zipf.writestr(filename, screenshot_data)
                # Add scraped data and changes log to ZIP
                if scraped_data:
                    data_to_save = {
                        'scraped_data': scraped_data,
                        'changes_log': changes_log,
                        'timestamp': datetime.now().isoformat()
                    }
                    zipf.writestr('data.json', json.dumps(data_to_save, indent=2))
            # Get the path to the temporary file
            zip_file_path = tmp_file.name

        # Prepare display data
        display_data = {
            'total_scraped_urls': len(scraped_data),
            'total_screenshots_taken': len(screenshots),
            'changes_detected': changes_log,
            'scraped_data': scraped_data
        }

        # Return the path to the temporary ZIP file and display data
        return zip_file_path, json.dumps(display_data, indent=2)
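
# Sketch of calling process_urls outside the UI (commented out because it downloads pages
# and writes to scraped_data/; it also assumes the default gr.Progress() tolerates being
# invoked outside a Gradio event):
#   zip_path, summary_json = process_urls(
#       "https://example.com", bulk_toggle=False, action_radio="Both",
#       max_urls=1, crawl_depth=0,
#   )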

class DataExtractor:
    def __init__(self):
        self.soup = None
        self.base_url = None
        self.logger = logging.getLogger(__name__)

    def set_page(self, html_content: str, url: str):
        self.soup = BeautifulSoup(html_content, 'html.parser')
        self.base_url = url
        self.logger.info(f"Page parsed. Base URL set to: {self.base_url}")

    def extract_images(self) -> List[Dict]:
        if not self.soup:
            self.logger.error("BeautifulSoup object not initialized")
            return []

        images = []
        all_imgs = self.soup.find_all('img')
        self.logger.info(f"Found {len(all_imgs)} raw image tags")

        for img in all_imgs:
            try:
                src = img.get('src', '')
                if src:
                    # Handle relative URLs
                    src = urljoin(self.base_url, src)
                    image_data = {
                        'src': src,
                        'alt': img.get('alt', 'No description'),
                        'title': img.get('title', 'No title'),
                        'dimensions': f"{img.get('width', 'unknown')}x{img.get('height', 'unknown')}",
                        'file_type': self._get_file_type(src)
                    }
                    images.append(image_data)
                    self.logger.debug(f"Processed image: {src[:100]}...")
            except Exception as e:
                self.logger.error(f"Error processing image: {str(e)}")
                continue

        self.logger.info(f"Successfully extracted {len(images)} valid images")
        return images

    def extract_links(self) -> List[Dict]:
        if not self.soup:
            self.logger.error("BeautifulSoup object not initialized")
            return []

        links = []
        all_links = self.soup.find_all('a')
        self.logger.info(f"Found {len(all_links)} raw link tags")

        for a in all_links:
            try:
                href = a.get('href', '')
                if href and not href.startswith(('#', 'javascript:', 'mailto:')):
                    # Handle relative URLs
                    href = urljoin(self.base_url, href)
                    links.append({
                        'href': href,
                        'text': a.get_text(strip=True) or '[No Text]',
                        'title': a.get('title', 'No title'),
                        'type': 'internal' if self._is_internal_link(href) else 'external',
                        'has_image': bool(a.find('img'))
                    })
                    self.logger.debug(f"Processed link: {href[:100]}...")
            except Exception as e:
                self.logger.error(f"Error processing link: {str(e)}")
                continue

        self.logger.info(f"Successfully extracted {len(links)} valid links")
        return links

    def extract_text(self) -> List[Dict]:
        if not self.soup:
            self.logger.error("BeautifulSoup object not initialized")
            return []

        texts = []
        all_paragraphs = self.soup.find_all('p')  # Extract all paragraph tags
        self.logger.info(f"Found {len(all_paragraphs)} raw paragraph tags")

        for p in all_paragraphs:
            try:
                text_content = p.get_text(strip=True)
                if text_content:  # Only add non-empty paragraphs
                    texts.append({
                        'content': text_content,
                        'source': self.base_url
                    })
                    self.logger.debug(f"Processed text block: {text_content[:100]}...")
            except Exception as e:
                self.logger.error(f"Error processing text block: {str(e)}")
                continue

        self.logger.info(f"Successfully extracted {len(texts)} valid text blocks")
        return texts

    def _get_file_type(self, url: str) -> str:
        try:
            ext = url.split('.')[-1].lower()
            return ext if ext in ['jpg', 'jpeg', 'png', 'gif', 'webp', 'svg'] else 'unknown'
        except Exception:
            return 'unknown'

    def _is_internal_link(self, href: str) -> bool:
        try:
            return bool(self.base_url and (href.startswith('/') or self.base_url in href))
        except Exception:
            return False
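
# Minimal usage sketch for DataExtractor (commented out; fetching is left to the caller):
#   extractor = DataExtractor()
#   html = requests.get("https://example.com", timeout=10).text
#   extractor.set_page(html, "https://example.com")
#   images, links, texts = (extractor.extract_images(),
#                           extractor.extract_links(),
#                           extractor.extract_text())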

class QueryAnalyzer:
    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.logger.info("QueryAnalyzer initialized")

    def parse_query(self, query: str) -> Dict[str, Union[str, int]]:
        try:
            self.logger.info(f"Parsing query: {query}")
            tokens = word_tokenize(query.lower())
            filtered_tokens = [
                self.lemmatizer.lemmatize(token)
                for token in tokens
                if token.isalnum() and token not in self.stop_words
            ]
            query_info = {
                'target': self._identify_target(filtered_tokens),
                'limit': self._identify_limit(filtered_tokens),
                'filters': self._identify_filters(filtered_tokens)
            }
            self.logger.info(f"Query parsed: {query_info}")
            return query_info
        except Exception as e:
            self.logger.error(f"Error parsing query: {str(e)}")
            return {'target': 'unknown', 'limit': 0, 'filters': {}}

    def _identify_target(self, tokens: List[str]) -> str:
        if 'image' in tokens:
            return 'image'
        elif 'link' in tokens:
            return 'link'
        elif 'text' in tokens:
            return 'text'
        else:
            return 'unknown'

    def _identify_limit(self, tokens: List[str]) -> int:
        for token in tokens:
            if token.isdigit():
                return int(token)
        return 0

    def _identify_filters(self, tokens: List[str]) -> Dict[str, str]:
        filters = {}
        if 'external' in tokens:
            filters['link_type'] = 'external'
        elif 'internal' in tokens:
            filters['link_type'] = 'internal'
        return filters
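
# Example: QueryAnalyzer().parse_query("find 5 external links") should yield roughly
#   {'target': 'link', 'limit': 5, 'filters': {'link_type': 'external'}}
# since 'links' lemmatizes to 'link', '5' sets the limit and 'external' sets the filter.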

class ResponseFormatter:
    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def format_data(self, data: List[Dict], query_info: Dict) -> str:
        try:
            if not data:
                return "No data found for the specified query."

            target = query_info['target']
            limit = query_info['limit']
            if limit > 0:
                data = data[:limit]

            if target == 'image':
                return self._format_images(data)
            elif target == 'link':
                return self._format_links(data)
            elif target == 'text':
                return self._format_texts(data)
            else:
                return "Unknown target for formatting."
        except Exception as e:
            self.logger.error(f"Error formatting response: {str(e)}")
            return f"An error occurred while formatting the response: {str(e)}"

    def _format_images(self, images: List[Dict]) -> str:
        if not images:
            return "No images found."
        formatted_images = []
        for idx, img in enumerate(images, start=1):
            formatted = f"Image {idx}:\n"
            formatted += f"  Source: {img['src']}\n"
            formatted += f"  Alt Text: {img['alt']}\n"
            formatted += f"  Title: {img['title']}\n"
            formatted += f"  Dimensions: {img['dimensions']}\n"
            formatted += f"  File Type: {img['file_type']}\n\n"
            formatted_images.append(formatted)
        return ''.join(formatted_images)

    def _format_links(self, links: List[Dict]) -> str:
        if not links:
            return "No links found."
        formatted_links = []
        for idx, link in enumerate(links, start=1):
            formatted = f"Link {idx}:\n"
            formatted += f"  URL: {link['href']}\n"
            formatted += f"  Text: {link['text']}\n"
            formatted += f"  Title: {link['title']}\n"
            formatted += f"  Type: {link['type']}\n"
            formatted += f"  Has Image: {'Yes' if link['has_image'] else 'No'}\n\n"
            formatted_links.append(formatted)
        return ''.join(formatted_links)

    def _format_texts(self, texts: List[Dict]) -> str:
        if not texts:
            return "No text blocks found."
        formatted_texts = []
        for idx, text in enumerate(texts, start=1):
            formatted = f"Text Block {idx}:\n"
            formatted += f"  Content: {text['content']}\n"
            formatted += f"  Source: {text['source']}\n\n"
            formatted_texts.append(formatted)
        return ''.join(formatted_texts)

class SmartWebScraper:
    def __init__(self):
        self.query_analyzer = QueryAnalyzer()
        self.data_extractor = DataExtractor()
        self.response_formatter = ResponseFormatter()
        self.logger = logging.getLogger(__name__)
        self.scraped_data = {}  # Temporarily store scraped data per URL

    def process_url(self, url: str, query: str) -> str:
        try:
            # Validate URL
            if not self._validate_url(url):
                return "Please provide a valid URL (including http:// or https://)."

            # Fetch page
            self.logger.info(f"Fetching URL: {url}")
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            self.logger.info(f"Successfully fetched page. Status code: {response.status_code}")

            # Set page content and store in scraped_data
            self.data_extractor.set_page(response.text, url)
            self.scraped_data[url] = {
                'images': self.data_extractor.extract_images(),
                'links': self.data_extractor.extract_links(),
                'texts': self.data_extractor.extract_text()
            }

            # Analyze query
            query_info = self.query_analyzer.parse_query(query)
            self.logger.info(f"Query analysis: {query_info}")

            # Extract requested data
            data = self._get_data_for_target(query_info['target'], url)
            self.logger.info(f"Extracted {len(data)} items for target: {query_info['target']}")

            # Format response
            formatted_response = self.response_formatter.format_data(data, query_info)
            self.logger.info("Response formatted successfully")
            return formatted_response
        except requests.exceptions.RequestException as e:
            error_msg = f"Error fetching the webpage: {str(e)}"
            self.logger.error(error_msg)
            return error_msg
        except Exception as e:
            error_msg = f"An error occurred: {str(e)}"
            self.logger.error(error_msg)
            return error_msg

    def _validate_url(self, url: str) -> bool:
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except Exception as e:
            self.logger.error(f"URL validation error: {str(e)}")
            return False

    def _get_data_for_target(self, target: str, url: str) -> List[Dict]:
        if url not in self.scraped_data:
            self.logger.warning(f"No data found for URL: {url}")
            return []
        if target == 'image':
            return self.scraped_data[url]['images']
        elif target == 'link':
            return self.scraped_data[url]['links']
        elif target == 'text':
            return self.scraped_data[url]['texts']
        else:
            self.logger.warning(f"No extractor found for target: {target}")
            return []
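
    # Usage sketch (as a comment; it performs a live request):
    #   scraper = SmartWebScraper()
    #   print(scraper.process_url("https://example.com", "extract all links"))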

    def recognize_intent(self, instruction: str) -> str:
        """Recognizes the intent of an instruction."""
        instruction = instruction.lower()
        # Allow filler words between the verb and the data type so that phrases like
        # "Scrape all links" (the example shown in the UI) are recognized
        action_patterns = {
            r'\b(find|extract|scrape)\b.*?\b(links?|images?|texts?)\b': 'extract_data',
            r'\b(count)\b.*?\b(links?|images?|texts?)\b': 'count_data',
        }
        for pattern, intent in action_patterns.items():
            if re.search(pattern, instruction):
                return intent
        return "unknown"

    def extract_data_type(self, instruction: str) -> str:
        """Extracts the data type from an instruction."""
        instruction = instruction.lower()
        # Accept both singular and plural forms ("link"/"links", etc.)
        data_types = {
            r'\b(links?)\b': 'link',
            r'\b(images?)\b': 'image',
            r'\b(texts?)\b': 'text',
        }
        for pattern, data_type in data_types.items():
            if re.search(pattern, instruction):
                return data_type
        return "unknown"

    def chat_based_scrape(self, instruction, url_input, output_format):
        """Handles chat-based instructions for web scraping."""
        if not validate_url(url_input):
            return "Invalid URL. Please enter a valid URL."

        if url_input not in self.scraped_data:
            self.process_url(url_input, "")  # Fetch and store data if not already present

        # Recognize intent and extract data type if applicable
        intent = self.recognize_intent(instruction)
        data_type = self.extract_data_type(instruction)
        if intent == "unknown" or data_type == "unknown":
            return "Instruction not recognized. Please try again."

        # Extract data based on intent and data type
        if intent == "extract_data":
            data = self._get_data_for_target(data_type, url_input)
            if output_format == "JSON":
                return json.dumps(data, indent=2)
            else:
                query_info = {'target': data_type, 'limit': 0, 'filters': {}}
                return self.response_formatter.format_data(data, query_info)
        elif intent == "count_data":
            data = self._get_data_for_target(data_type, url_input)
            return f"The number of {data_type}s is {len(data)}."
        else:
            return "Instruction not recognized. Please try again."

def create_interface():
    """Create the Gradio interface."""
    scraper = SmartWebScraper()

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # 🌐 Enhanced Web Scraper with Change Detection and Chat
            Monitor and capture changes in web content automatically. Use the chat interface to interact with scraped data.
            """
        )
        with gr.Tabs():
            with gr.Tab("URL Scrape/Screenshot"):
                url_input = gr.Textbox(
                    label="Enter URL(s)",
                    value="https://example.com",
                    placeholder="Enter single URL or multiple URLs separated by commas"
                )
                with gr.Row():
                    bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
                    action_radio = gr.Radio(
                        ["Scrape data", "Capture image", "Both"],
                        label="Select Action",
                        value="Both"
                    )
                with gr.Row():
                    max_urls = gr.Slider(
                        minimum=1,
                        maximum=20,
                        value=5,
                        step=1,
                        label="Max URLs to process"
                    )
                    crawl_depth = gr.Slider(
                        minimum=0,
                        maximum=3,
                        value=1,
                        step=1,
                        label="Crawl Depth (0 for no recursion)"
                    )
                process_button = gr.Button("Process URLs", variant="primary")
                with gr.Column():
                    screenshot_zip = gr.File(label="Download Results")
                    scraped_data_output = gr.JSON(label="Results Summary")

                process_button.click(
                    fn=process_urls,
                    inputs=[
                        url_input,
                        bulk_toggle,
                        action_radio,
                        max_urls,
                        crawl_depth
                    ],
                    outputs=[
                        screenshot_zip,
                        scraped_data_output
                    ],
                    show_progress=True
                )
with gr.Tab("Chat-Based Scrape"): | |
instruction = gr.Textbox( | |
label="Enter Instruction", | |
placeholder="e.g., 'Scrape all links' or 'Extract all images'" | |
) | |
chat_url_input = gr.Textbox( | |
label="Enter URL", | |
value="https://example.com", | |
placeholder="Enter the target URL" | |
) | |
output_format = gr.Radio( | |
["Formatted Text", "JSON"], | |
label="Output Format", | |
value="Formatted Text" | |
) | |
chat_output = gr.Textbox(label="Output") | |
chat_button = gr.Button("Execute Instruction", variant="primary") | |
chat_button.click( | |
fn=scraper.chat_based_scrape, | |
inputs=[instruction, chat_url_input, output_format], | |
outputs=chat_output | |
) | |
gr.Markdown( | |
""" | |
### Features | |
- Bulk URL processing | |
- Screenshot capture | |
- Content change detection | |
- Recursive crawling | |
- Chat-based instructions for interacting with scraped data | |
""" | |
) | |
return demo | |

if __name__ == "__main__":
    demo = create_interface()
    demo.launch(debug=True)
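    # When running inside a container or a hosted Space it may be necessary to bind to all
    # interfaces explicitly; a hedged variant using standard gr.Blocks.launch parameters:
    #   demo.launch(debug=True, server_name="0.0.0.0", server_port=7860)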