Update app.py
app.py CHANGED
@@ -135,7 +135,7 @@ def crawl_url(url, depth, max_depth, visited=None):
         logging.info(f"Skipping non-webpage content: {url}")

     return screenshots
-
+
 def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, progress=gr.Progress()):
     """Process URLs with crawl depth and change detection."""
     # Validate URLs first
@@ -146,7 +146,7 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
     # Validate all URLs
     invalid_urls = [url for url in urls if not validate_url(url)]
     if invalid_urls:
-        return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)
+        return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)

     scraped_data = []
     screenshots = []
@@ -199,16 +199,11 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr

         # Prepare output data
         if action_radio in ['Scrape data', 'Both']:
-            cleaned_content = BeautifulSoup(latest_html, 'html.parser').get_text(separator="\n").strip()
             scraped_data.append({
                 'url': url,
-                'content':
+                'content': latest_html, # Include full HTML content
                 'timestamp': datetime.datetime.now().isoformat(),
-                'changes_detected': changes_log
-                'metadata': {
-                    'html_length': len(cleaned_content),
-                    'screenshot_available': latest_screenshot is not None
-                }
+                'changes_detected': changes_log
             })

         if action_radio in ['Capture image', 'Both']:
@@ -231,7 +226,7 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         if scraped_data:
             data_to_save = {
                 'scraped_data': scraped_data,
-                'changes_log': changes_log
+                'changes_log': changes_log,
                 'timestamp': datetime.datetime.now().isoformat()
             }
             zipf.writestr('data.json', json.dumps(data_to_save, indent=2))
@@ -244,15 +239,46 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         'total_scraped_urls': len(scraped_data),
         'total_screenshots_taken': len(screenshots),
         'changes_detected': changes_log,
-        '
+        'scraped_data': scraped_data # Include full scraped data
     }

-    #
-
+    # Return the path to the temporary ZIP file and display data
+    return zip_file_path, json.dumps(display_data, indent=2)
+
+def chat_based_scrape(instruction, url_input, bulk_toggle, max_urls, crawl_depth):
+    """Handle chat-based instructions for scraping."""
+    # Example: Parse instructions like "Scrape all links" or "Extract all images"
+    if "scrape all links" in instruction.lower():
+        # Extract links from the provided URL(s)
+        urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
+        urls = [url.strip() for url in urls if url.strip()]
+        urls = urls[:int(max_urls)]
+
+        all_links = []
+        for url in urls:
+            links = extract_links_from_page(url)
+            all_links.extend(links)
+
+        return f"Extracted links: {', '.join(all_links)}"

-
-
+    elif "extract all images" in instruction.lower():
+        # Extract image URLs from the provided URL(s)
+        urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
+        urls = [url.strip() for url in urls if url.strip()]
+        urls = urls[:int(max_urls)]
+
+        all_images = []
+        for url in urls:
+            response = requests.get(url, timeout=10)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            images = [img['src'] for img in soup.find_all('img', src=True)]
+            all_images.extend(images)
+
+        return f"Extracted images: {', '.join(all_images)}"

+    else:
+        return "Instruction not recognized. Please try again."
+
 def create_interface():
     """Create the Gradio interface."""
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
@@ -300,7 +326,6 @@ def create_interface():
                 with gr.Column():
                     screenshot_zip = gr.File(label="Download Results")
                     scraped_data_output = gr.JSON(label="Results Summary")
-                    screenshot_gallery = gr.Gallery(label="Screenshots", show_label=True, scale=2)

             process_button.click(
                 fn=process_urls,
@@ -313,11 +338,51 @@ def create_interface():
                 ],
                 outputs=[
                     screenshot_zip,
-                    scraped_data_output
-                    screenshot_gallery
+                    scraped_data_output
                 ],
                 show_progress=True
             )
+
+        with gr.Tab("Chat-Based Scrape"):
+            chat_instruction = gr.Textbox(
+                label="Enter Instruction",
+                placeholder="e.g., 'Scrape all links' or 'Extract all images'"
+            )
+            chat_url_input = gr.Textbox(
+                label="Enter URL(s)",
+                value="https://example.com",
+                placeholder="Enter single URL or multiple URLs separated by commas"
+            )
+            chat_bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
+            chat_max_urls = gr.Slider(
+                minimum=1,
+                maximum=20,
+                value=5,
+                step=1,
+                label="Max URLs to process"
+            )
+            chat_crawl_depth = gr.Slider(
+                minimum=1,
+                maximum=3,
+                value=1,
+                step=1,
+                label="Crawl Depth"
+            )
+            chat_output = gr.Textbox(label="Chat Output")
+
+            chat_button = gr.Button("Submit Instruction", variant="primary")
+
+            chat_button.click(
+                fn=chat_based_scrape,
+                inputs=[
+                    chat_instruction,
+                    chat_url_input,
+                    chat_bulk_toggle,
+                    chat_max_urls,
+                    chat_crawl_depth
+                ],
+                outputs=chat_output
+            )

         gr.Markdown(
             """
@@ -326,7 +391,7 @@ def create_interface():
             - Screenshot capture
             - Content change detection
             - Recursive crawling
-            -
+            - Chat-based instructions
             """
         )

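Two notes on the added code. First, chat_based_scrape calls extract_links_from_page, which does not appear anywhere in this diff, so the commit relies on it being defined elsewhere in app.py. A minimal sketch of what such a helper could look like, assuming a plain requests + BeautifulSoup approach (an illustration only, not the Space's actual implementation):

import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def extract_links_from_page(url, timeout=10):
    """Hypothetical helper: return absolute URLs of all anchors on a page.

    The real helper referenced by chat_based_scrape is not shown in this diff;
    this sketch assumes it fetches the page and resolves relative hrefs.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]

Second, the new handler can be exercised outside the Gradio UI for a quick check; the arguments below mirror the chat_button.click wiring, and the URL is just the default value shown in the diff:

# Direct call of the new handler, bypassing the Gradio interface.
result = chat_based_scrape(
    instruction="Scrape all links",
    url_input="https://example.com",
    bulk_toggle=False,
    max_urls=5,
    crawl_depth=1,  # accepted but not used by the current instruction branches
)
print(result)  # "Extracted links: ..." or the fallback message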