acecalisto3 committed on
Commit
52c508e
·
verified ·
1 Parent(s): cc5a0ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -56
app.py CHANGED
@@ -3,6 +3,7 @@ import requests
3
  import re
4
  import logging
5
  import json
 
6
  from selenium import webdriver
7
  from selenium.webdriver.chrome.options import Options
8
  from PIL import Image
@@ -11,8 +12,6 @@ import zipfile
11
  import os
12
  import datetime
13
  from urllib.parse import urlparse
14
- from bs4 import BeautifulSoup
15
- import tempfile
16
 
17
  # Configure logging
18
  logging.basicConfig(level=logging.INFO,
@@ -253,16 +252,23 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, mo
253
 
254
  def recognize_intent(instruction: str) -> str:
255
  instruction = instruction.lower()
256
- if re.search(r'\bscrape\s+all\s+links\b', instruction):
 
 
 
 
 
 
257
  return "scrape_links"
258
- elif re.search(r'\bextract\s+all\s+images\b', instruction):
259
- return "extract_images"
260
- elif re.search(r'\bmonitor\s+changes\b', instruction):
261
  return "monitor_changes"
 
262
  else:
263
  return "unknown"
264
 
265
- def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: int, crawl_depth: int, session_id: str) -> str:
266
  urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
267
  urls = [url.strip() for url in urls if url.strip()]
268
  urls = urls[:max_urls]
@@ -274,14 +280,14 @@ def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: i
274
  all_links.extend(links)
275
  return f"Extracted links: {', '.join(all_links)}"
276
 
277
- elif intent == "extract_images":
278
- all_images = []
279
  for url in urls:
280
  response = requests.get(url, timeout=10)
281
  soup = BeautifulSoup(response.text, 'html.parser')
282
- images = [img['src'] for img in soup.find_all('img', src=True)]
283
- all_images.extend(images)
284
- return f"Extracted images: {', '.join(all_images)}"
285
 
286
  elif intent == "monitor_changes":
287
  changes_log = process_urls(url_input, bulk_toggle, "Scrape data", max_urls, crawl_depth, mode='chat')
@@ -289,14 +295,17 @@ def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: i
289
 
290
  return "Instruction not recognized. Please try again."
291
 
292
- def chat_based_scrape(instruction, url_input, bulk_toggle, max_urls, crawl_depth, session_id):
 
293
  # Recognize intent
294
  intent = recognize_intent(instruction)
 
295
 
296
  # Generate command based on the recognized intent
297
- command_output = generate_command(intent, url_input, bulk_toggle, max_urls, crawl_depth, session_id)
298
 
299
  return command_output
 
300
  def create_interface():
301
  """Create the Gradio interface."""
302
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
@@ -388,9 +397,6 @@ def create_interface():
388
  )
389
  chat_output = gr.Textbox(label="Chat Output")
390
 
391
- # Initialize session state
392
- session_state = gr.State({})
393
-
394
  chat_button = gr.Button("Submit Instruction", variant="primary")
395
 
396
  chat_button.click(
@@ -418,44 +424,6 @@ def create_interface():
418
 
419
  return demo
420
 
421
- def chat_based_scrape(instruction, url_input, bulk_toggle, max_urls, crawl_depth):
422
- print(f"Received instruction: {instruction}")
423
- # Recognize intent
424
- intent = recognize_intent(instruction)
425
- print(f"Recognized intent: {intent}")
426
-
427
- # Generate command based on the recognized intent
428
- command_output = generate_command(intent, url_input, bulk_toggle, max_urls, crawl_depth)
429
-
430
- return command_output
431
-
432
- def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: int, crawl_depth: int) -> str:
433
- urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
434
- urls = [url.strip() for url in urls if url.strip()]
435
- urls = urls[:max_urls]
436
-
437
- if intent == "scrape_links":
438
- all_links = []
439
- for url in urls:
440
- links = extract_links_from_page(url)
441
- all_links.extend(links)
442
- return f"Extracted links: {', '.join(all_links)}"
443
-
444
- elif intent == "extract_images":
445
- all_images = []
446
- for url in urls:
447
- response = requests.get(url, timeout=10)
448
- soup = BeautifulSoup(response.text, 'html.parser')
449
- images = [img['src'] for img in soup.find_all('img', src=True)]
450
- all_images.extend(images)
451
- return f"Extracted images: {', '.join(all_images)}"
452
-
453
- elif intent == "monitor_changes":
454
- changes_log = process_urls(url_input, bulk_toggle, "Scrape data", max_urls, crawl_depth, mode='chat')
455
- return changes_log
456
-
457
- return "Instruction not recognized. Please try again."
458
-
459
  if __name__ == "__main__":
460
  demo = create_interface() # Call the function to create the interface
461
- demo.launch() # Launch the Gradio app app
 
3
  import re
4
  import logging
5
  import json
6
+ from bs4 import BeautifulSoup
7
  from selenium import webdriver
8
  from selenium.webdriver.chrome.options import Options
9
  from PIL import Image
 
12
  import os
13
  import datetime
14
  from urllib.parse import urlparse
 
 
15
 
16
  # Configure logging
17
  logging.basicConfig(level=logging.INFO,
 
252
 
253
  def recognize_intent(instruction: str) -> str:
254
  instruction = instruction.lower()
255
+
256
+ # Patterns for counting images
257
+ if re.search(r'\b(count the images|how many images|total images|image count)', instruction):
258
+ return "count_images"
259
+
260
+ # Patterns for listing links
261
+ elif re.search(r'\b(list all links|find hyperlinks|show me urls|extract links)', instruction):
262
  return "scrape_links"
263
+
264
+ # Patterns for monitoring changes
265
+ elif re.search(r'\b(monitor changes|watch for updates|detect changes|track updates)', instruction):
266
  return "monitor_changes"
267
+
268
  else:
269
  return "unknown"
270
 
271
+ def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: int, crawl_depth: int) -> str:
272
  urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
273
  urls = [url.strip() for url in urls if url.strip()]
274
  urls = urls[:max_urls]
 
280
  all_links.extend(links)
281
  return f"Extracted links: {', '.join(all_links)}"
282
 
283
+ elif intent == "count_images":
284
+ total_images = 0
285
  for url in urls:
286
  response = requests.get(url, timeout=10)
287
  soup = BeautifulSoup(response.text, 'html.parser')
288
+ images = soup.find_all('img')
289
+ total_images += len(images)
290
+ return f"There are {total_images} images across the specified URLs."
291
 
292
  elif intent == "monitor_changes":
293
  changes_log = process_urls(url_input, bulk_toggle, "Scrape data", max_urls, crawl_depth, mode='chat')
 
295
 
296
  return "Instruction not recognized. Please try again."
297
 
298
+ def chat_based_scrape(instruction, url_input, bulk_toggle, max_urls, crawl_depth):
299
+ print(f"Received instruction: {instruction}")
300
  # Recognize intent
301
  intent = recognize_intent(instruction)
302
+ print(f"Recognized intent: {intent}")
303
 
304
  # Generate command based on the recognized intent
305
+ command_output = generate_command(intent, url_input, bulk_toggle, max_urls, crawl_depth)
306
 
307
  return command_output
308
+
309
  def create_interface():
310
  """Create the Gradio interface."""
311
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
397
  )
398
  chat_output = gr.Textbox(label="Chat Output")
399
 
 
 
 
400
  chat_button = gr.Button("Submit Instruction", variant="primary")
401
 
402
  chat_button.click(
 
424
 
425
  return demo
426
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
  if __name__ == "__main__":
428
  demo = create_interface() # Call the function to create the interface
429
+ demo.launch() # Launch the Gradio app