acecalisto3 committed
Commit 59bd798 · verified · 1 Parent(s): 4e194df

Update app.py

Files changed (1)
  1. app.py +57 -60
app.py CHANGED
@@ -136,7 +136,7 @@ def crawl_url(url, depth, max_depth, visited=None):
 
     return screenshots
 
-def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, progress=gr.Progress()):
+def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, mode='standard', progress=gr.Progress()):
     """Process URLs with crawl depth and change detection."""
     # Validate URLs first
     urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
@@ -146,7 +146,10 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
     # Validate all URLs
     invalid_urls = [url for url in urls if not validate_url(url)]
     if invalid_urls:
-        return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)
+        if mode == 'chat':
+            return f"Invalid URLs detected: {', '.join(invalid_urls)}"
+        else:
+            return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)
 
     scraped_data = []
     screenshots = []
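
The validate_url helper referenced in this hunk is defined earlier in app.py and does not appear in the diff. For reference, a typical urlparse-based check — an illustrative assumption, not the repository's actual helper — might look like:

from urllib.parse import urlparse

def validate_url(url: str) -> bool:
    # Hypothetical stand-in: accept only absolute http(s) URLs with a host.
    try:
        parsed = urlparse(url.strip())
        return parsed.scheme in ("http", "https") and bool(parsed.netloc)
    except ValueError:
        return False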
@@ -213,72 +216,53 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         # Update progress
         progress((idx + 1) / total_urls)
 
-    # Create a temporary file to store the ZIP
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_file:
-        with zipfile.ZipFile(tmp_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
-            # Add screenshots to ZIP
-            for screenshot_url, screenshot_data in screenshots:
-                sanitized_screenshot_url = sanitize_filename(screenshot_url)
-                filename = f"{sanitized_screenshot_url}.png"
-                zipf.writestr(filename, screenshot_data)
-
-            # Add scraped data and changes log to ZIP
-            if scraped_data:
-                data_to_save = {
-                    'scraped_data': scraped_data,
-                    'changes_log': changes_log,
-                    'timestamp': datetime.datetime.now().isoformat()
-                }
-                zipf.writestr('data.json', json.dumps(data_to_save, indent=2))
-
-    # Get the path to the temporary file
-    zip_file_path = tmp_file.name
-
-    # Prepare display data
-    display_data = {
-        'total_scraped_urls': len(scraped_data),
-        'total_screenshots_taken': len(screenshots),
-        'changes_detected': changes_log,
-        'scraped_data': scraped_data  # Include full scraped data
-    }
-
-    # Return the path to the temporary ZIP file and display data
-    return zip_file_path, json.dumps(display_data, indent=2)
-
-from smolagents import tool
+    if mode == 'chat':
+        return "\n".join(changes_log)
+    else:
+        # Create a temporary file to store the ZIP
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_file:
+            with zipfile.ZipFile(tmp_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
+                # Add screenshots to ZIP
+                for screenshot_url, screenshot_data in screenshots:
+                    sanitized_screenshot_url = sanitize_filename(screenshot_url)
+                    filename = f"{sanitized_screenshot_url}.png"
+                    zipf.writestr(filename, screenshot_data)
+
+                # Add scraped data and changes log to ZIP
+                if scraped_data:
+                    data_to_save = {
+                        'scraped_data': scraped_data,
+                        'changes_log': changes_log,
+                        'timestamp': datetime.datetime.now().isoformat()
+                    }
+                    zipf.writestr('data.json', json.dumps(data_to_save, indent=2))
+
+        # Get the path to the temporary file
+        zip_file_path = tmp_file.name
+
+        # Prepare display data
+        display_data = {
+            'total_scraped_urls': len(scraped_data),
+            'total_screenshots_taken': len(screenshots),
+            'changes_detected': changes_log,
+            'scraped_data': scraped_data  # Include full scraped data
+        }
+
+        # Return the path to the temporary ZIP file and display data
+        return zip_file_path, json.dumps(display_data, indent=2)
 
-@tool
 def recognize_intent(instruction: str) -> str:
-    """
-    Recognizes the intent from the user's instruction.
-    Args:
-        instruction: The input instruction from the user.
-
-    Returns:
-        The recognized intent as a string.
-    """
     instruction = instruction.lower()
     if "scrape all links" in instruction:
         return "scrape_links"
     elif "extract all images" in instruction:
         return "extract_images"
+    elif "monitor changes" in instruction:
+        return "monitor_changes"
     else:
         return "unknown"
 
-@tool
-def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: int) -> str:
-    """
-    Generates a command based on the recognized intent.
-
-    Args:
-        intent: The recognized intent from the user input.
-        url_input: The input URL(s) from the user.
-        bulk_toggle: Indicates if multiple URLs are being processed.
-        max_urls: The maximum number of URLs to process.
-
-    Returns:
-        The result of the command execution.
-    """
+def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: int, crawl_depth: int, session_id: str) -> str:
    urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
    urls = [url.strip() for url in urls if url.strip()]
    urls = urls[:max_urls]
@@ -299,15 +283,18 @@ def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: i
             all_images.extend(images)
         return f"Extracted images: {', '.join(all_images)}"
 
+    elif intent == "monitor_changes":
+        changes_log = process_urls(url_input, bulk_toggle, "Scrape data", max_urls, crawl_depth, mode='chat')
+        return changes_log
+
     return "Instruction not recognized. Please try again."
 
-def chat_based_scrape(instruction, url_input, bulk_toggle, max_urls, crawl_depth):
-    """Handle chat-based instructions for scraping."""
+def chat_based_scrape(instruction, url_input, bulk_toggle, max_urls, crawl_depth, session_id):
     # Recognize intent
     intent = recognize_intent(instruction)
 
     # Generate command based on the recognized intent
-    command_output = generate_command(intent, url_input, bulk_toggle, max_urls)
+    command_output = generate_command(intent, url_input, bulk_toggle, max_urls, crawl_depth, session_id)
 
     return command_output
 
@@ -393,6 +380,13 @@ def create_interface():
                 step=1,
                 label="Max URLs to process"
             )
+            chat_crawl_depth = gr.Slider(
+                minimum=1,
+                maximum=3,
+                value=1,
+                step=1,
+                label="Crawl Depth"
+            )
             chat_output = gr.Textbox(label="Chat Output")
 
             chat_button = gr.Button("Submit Instruction", variant="primary")
@@ -403,7 +397,9 @@ def create_interface():
                 chat_instruction,
                 chat_url_input,
                 chat_bulk_toggle,
-                chat_max_urls
+                chat_max_urls,
+                chat_crawl_depth,
+                gr.Session
             ],
             outputs=chat_output
         )
@@ -420,6 +416,7 @@ def create_interface():
     )
 
     return demo
+
 if __name__ == "__main__":
     demo = create_interface()  # Call the function to create the interface
     demo.launch()  # Launch the Gradio app
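
One caveat on the wiring above: gr.Session, passed in the click handler's inputs, is not part of Gradio's documented API (per-session values are normally carried with gr.State), so the session id may never reach chat_based_scrape. A minimal sketch of the gr.State alternative, with illustrative component names and slider ranges:

import uuid
import gradio as gr

def chat_based_scrape(instruction, url_input, bulk_toggle, max_urls, crawl_depth, session_id):
    # Placeholder handler with the same signature the commit expects.
    return f"[session {session_id}] would handle {instruction!r} on {url_input!r}"

with gr.Blocks() as demo:
    session_id = gr.State()  # holds one value per browser session
    chat_instruction = gr.Textbox(label="Instruction")
    chat_url_input = gr.Textbox(label="URL(s)")
    chat_bulk_toggle = gr.Checkbox(label="Bulk URLs")
    chat_max_urls = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max URLs to process")
    chat_crawl_depth = gr.Slider(minimum=1, maximum=3, value=1, step=1, label="Crawl Depth")
    chat_output = gr.Textbox(label="Chat Output")
    chat_button = gr.Button("Submit Instruction", variant="primary")

    # Assign a fresh id when each browser session loads the page.
    demo.load(fn=lambda: str(uuid.uuid4()), inputs=None, outputs=session_id)

    chat_button.click(
        fn=chat_based_scrape,
        inputs=[chat_instruction, chat_url_input, chat_bulk_toggle,
                chat_max_urls, chat_crawl_depth, session_id],
        outputs=chat_output,
    )

if __name__ == "__main__":
    demo.launch()

Seeding the id in demo.load rather than in the gr.State constructor keeps each browser session's value distinct, since a constructor default is evaluated once at build time.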
 