acecalisto3 committed (verified)
Commit 7730e40 · 1 Parent(s): ed3955d

Update app.py

Files changed (1)
  1. app.py +84 -19
app.py CHANGED
@@ -135,7 +135,7 @@ def crawl_url(url, depth, max_depth, visited=None):
         logging.info(f"Skipping non-webpage content: {url}")
 
     return screenshots
-
+
 def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, progress=gr.Progress()):
     """Process URLs with crawl depth and change detection."""
     # Validate URLs first
@@ -146,7 +146,7 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
     # Validate all URLs
     invalid_urls = [url for url in urls if not validate_url(url)]
     if invalid_urls:
-        return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2), []
+        return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)
 
     scraped_data = []
     screenshots = []
@@ -199,16 +199,11 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
 
         # Prepare output data
         if action_radio in ['Scrape data', 'Both']:
-            cleaned_content = BeautifulSoup(latest_html, 'html.parser').get_text(separator="\n").strip()
             scraped_data.append({
                 'url': url,
-                'content': cleaned_content,
+                'content': latest_html,  # Include full HTML content
                 'timestamp': datetime.datetime.now().isoformat(),
-                'changes_detected': changes_log.copy(),  # Ensure changes_log is a copy, not a reference
-                'metadata': {
-                    'html_length': len(cleaned_content),
-                    'screenshot_available': latest_screenshot is not None
-                }
+                'changes_detected': changes_log
             })
 
         if action_radio in ['Capture image', 'Both']:
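
This hunk stores the raw HTML in 'content' rather than the BeautifulSoup-cleaned text the removed cleaned_content line produced. Plain text can still be recovered from a saved record downstream with the same call the deleted line used; a minimal sketch, where the record literal is only illustrative and not output produced by app.py:

    from bs4 import BeautifulSoup

    # Illustrative record shaped like the entries process_urls now appends
    record = {
        "url": "https://example.com",
        "content": "<html><body><h1>Example</h1><p>Hello world</p></body></html>",
    }

    # The same cleaning the removed line did inline: strip tags, keep visible text
    text = BeautifulSoup(record["content"], "html.parser").get_text(separator="\n").strip()
    print(text)  # "Example" and "Hello world" on separate lines
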
@@ -231,7 +226,7 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
     if scraped_data:
         data_to_save = {
             'scraped_data': scraped_data,
-            'changes_log': changes_log.copy(),  # Ensure changes_log is a copy, not a reference
+            'changes_log': changes_log,
             'timestamp': datetime.datetime.now().isoformat()
         }
         zipf.writestr('data.json', json.dumps(data_to_save, indent=2))
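
The data.json written into the archive here can be read back with the standard library alone. A small sketch, assuming the downloaded archive is saved as results.zip (a placeholder name; the real path is whatever the app returns):

    import json
    import zipfile

    # "results.zip" stands in for the ZIP file the app hands back
    with zipfile.ZipFile("results.zip") as zf:
        with zf.open("data.json") as fh:
            data = json.load(fh)

    print(data["timestamp"])
    for entry in data["scraped_data"]:
        print(entry["url"], len(entry["content"]))
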
@@ -244,15 +239,46 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         'total_scraped_urls': len(scraped_data),
         'total_screenshots_taken': len(screenshots),
         'changes_detected': changes_log,
-        'screenshots': [screenshot_data for _, screenshot_data in screenshots]
+        'scraped_data': scraped_data  # Include full scraped data
     }
 
-    # Convert screenshots to a format suitable for Gradio
-    screenshot_display = [io.BytesIO(screenshot_data) for _, screenshot_data in screenshots]
+    # Return the path to the temporary ZIP file and display data
+    return zip_file_path, json.dumps(display_data, indent=2)
+
+def chat_based_scrape(instruction, url_input, bulk_toggle, max_urls, crawl_depth):
+    """Handle chat-based instructions for scraping."""
+    # Example: Parse instructions like "Scrape all links" or "Extract all images"
+    if "scrape all links" in instruction.lower():
+        # Extract links from the provided URL(s)
+        urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
+        urls = [url.strip() for url in urls if url.strip()]
+        urls = urls[:int(max_urls)]
+
+        all_links = []
+        for url in urls:
+            links = extract_links_from_page(url)
+            all_links.extend(links)
+
+        return f"Extracted links: {', '.join(all_links)}"
 
-    # Return the path to the temporary ZIP file, display data, and screenshots
-    return zip_file_path, json.dumps(display_data, indent=2), screenshot_display
+    elif "extract all images" in instruction.lower():
+        # Extract image URLs from the provided URL(s)
+        urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
+        urls = [url.strip() for url in urls if url.strip()]
+        urls = urls[:int(max_urls)]
+
+        all_images = []
+        for url in urls:
+            response = requests.get(url, timeout=10)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            images = [img['src'] for img in soup.find_all('img', src=True)]
+            all_images.extend(images)
+
+        return f"Extracted images: {', '.join(all_images)}"
 
+    else:
+        return "Instruction not recognized. Please try again."
+
 def create_interface():
     """Create the Gradio interface."""
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
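
The new chat_based_scrape above calls extract_links_from_page, which this diff does not show. The sketch below is only a guess at such a helper, mirroring the requests + BeautifulSoup pattern of the image branch; it is not the definition in app.py:

    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin

    def extract_links_from_page(url):
        """Hypothetical helper: return absolute hrefs found on a page."""
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Resolve relative hrefs against the page URL
        return [urljoin(url, a["href"]) for a in soup.find_all("a", href=True)]
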
@@ -300,7 +326,6 @@ def create_interface():
         with gr.Column():
             screenshot_zip = gr.File(label="Download Results")
             scraped_data_output = gr.JSON(label="Results Summary")
-            screenshot_gallery = gr.Gallery(label="Screenshots", show_label=True, scale=2)
 
         process_button.click(
             fn=process_urls,
@@ -313,11 +338,51 @@ def create_interface():
             ],
             outputs=[
                 screenshot_zip,
-                scraped_data_output,
-                screenshot_gallery
+                scraped_data_output
             ],
             show_progress=True
         )
+
+        with gr.Tab("Chat-Based Scrape"):
+            chat_instruction = gr.Textbox(
+                label="Enter Instruction",
+                placeholder="e.g., 'Scrape all links' or 'Extract all images'"
+            )
+            chat_url_input = gr.Textbox(
+                label="Enter URL(s)",
+                value="https://example.com",
+                placeholder="Enter single URL or multiple URLs separated by commas"
+            )
+            chat_bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
+            chat_max_urls = gr.Slider(
+                minimum=1,
+                maximum=20,
+                value=5,
+                step=1,
+                label="Max URLs to process"
+            )
+            chat_crawl_depth = gr.Slider(
+                minimum=1,
+                maximum=3,
+                value=1,
+                step=1,
+                label="Crawl Depth"
+            )
+            chat_output = gr.Textbox(label="Chat Output")
+
+            chat_button = gr.Button("Submit Instruction", variant="primary")
+
+            chat_button.click(
+                fn=chat_based_scrape,
+                inputs=[
+                    chat_instruction,
+                    chat_url_input,
+                    chat_bulk_toggle,
+                    chat_max_urls,
+                    chat_crawl_depth
+                ],
+                outputs=chat_output
+            )
 
         gr.Markdown(
             """
@@ -326,7 +391,7 @@ def create_interface():
             - Screenshot capture
             - Content change detection
             - Recursive crawling
-            - Automatic data storage
+            - Chat-based instructions
             """
         )
 