Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -3,6 +3,7 @@ import requests
|
|
3 |
import re
|
4 |
import logging
|
5 |
import json
|
|
|
6 |
from selenium import webdriver
|
7 |
from selenium.webdriver.chrome.options import Options
|
8 |
from PIL import Image
|
@@ -11,8 +12,6 @@ import zipfile
|
|
11 |
import os
|
12 |
import datetime
|
13 |
from urllib.parse import urlparse
|
14 |
-
from bs4 import BeautifulSoup
|
15 |
-
import tempfile
|
16 |
|
17 |
# Configure logging
|
18 |
logging.basicConfig(level=logging.INFO,
|
@@ -253,16 +252,23 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, mo
|
|
253 |
|
254 |
def recognize_intent(instruction: str) -> str:
|
255 |
instruction = instruction.lower()
|
256 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
257 |
return "scrape_links"
|
258 |
-
|
259 |
-
|
260 |
-
elif re.search(r'\
|
261 |
return "monitor_changes"
|
|
|
262 |
else:
|
263 |
return "unknown"
|
264 |
|
265 |
-
def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: int, crawl_depth: int
|
266 |
urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
|
267 |
urls = [url.strip() for url in urls if url.strip()]
|
268 |
urls = urls[:max_urls]
|
@@ -274,14 +280,14 @@ def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: i
|
|
274 |
all_links.extend(links)
|
275 |
return f"Extracted links: {', '.join(all_links)}"
|
276 |
|
277 |
-
elif intent == "
|
278 |
-
|
279 |
for url in urls:
|
280 |
response = requests.get(url, timeout=10)
|
281 |
soup = BeautifulSoup(response.text, 'html.parser')
|
282 |
-
images =
|
283 |
-
|
284 |
-
return f"
|
285 |
|
286 |
elif intent == "monitor_changes":
|
287 |
changes_log = process_urls(url_input, bulk_toggle, "Scrape data", max_urls, crawl_depth, mode='chat')
|
@@ -289,14 +295,17 @@ def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: i
|
|
289 |
|
290 |
return "Instruction not recognized. Please try again."
|
291 |
|
292 |
-
def chat_based_scrape(instruction, url_input, bulk_toggle, max_urls, crawl_depth
|
|
|
293 |
# Recognize intent
|
294 |
intent = recognize_intent(instruction)
|
|
|
295 |
|
296 |
# Generate command based on the recognized intent
|
297 |
-
command_output = generate_command(intent, url_input, bulk_toggle, max_urls, crawl_depth
|
298 |
|
299 |
return command_output
|
|
|
300 |
def create_interface():
|
301 |
"""Create the Gradio interface."""
|
302 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
@@ -388,9 +397,6 @@ def create_interface():
|
|
388 |
)
|
389 |
chat_output = gr.Textbox(label="Chat Output")
|
390 |
|
391 |
-
# Initialize session state
|
392 |
-
session_state = gr.State({})
|
393 |
-
|
394 |
chat_button = gr.Button("Submit Instruction", variant="primary")
|
395 |
|
396 |
chat_button.click(
|
@@ -418,44 +424,6 @@ def create_interface():
|
|
418 |
|
419 |
return demo
|
420 |
|
421 |
-
def chat_based_scrape(instruction, url_input, bulk_toggle, max_urls, crawl_depth):
|
422 |
-
print(f"Received instruction: {instruction}")
|
423 |
-
# Recognize intent
|
424 |
-
intent = recognize_intent(instruction)
|
425 |
-
print(f"Recognized intent: {intent}")
|
426 |
-
|
427 |
-
# Generate command based on the recognized intent
|
428 |
-
command_output = generate_command(intent, url_input, bulk_toggle, max_urls, crawl_depth)
|
429 |
-
|
430 |
-
return command_output
|
431 |
-
|
432 |
-
def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: int, crawl_depth: int) -> str:
|
433 |
-
urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
|
434 |
-
urls = [url.strip() for url in urls if url.strip()]
|
435 |
-
urls = urls[:max_urls]
|
436 |
-
|
437 |
-
if intent == "scrape_links":
|
438 |
-
all_links = []
|
439 |
-
for url in urls:
|
440 |
-
links = extract_links_from_page(url)
|
441 |
-
all_links.extend(links)
|
442 |
-
return f"Extracted links: {', '.join(all_links)}"
|
443 |
-
|
444 |
-
elif intent == "extract_images":
|
445 |
-
all_images = []
|
446 |
-
for url in urls:
|
447 |
-
response = requests.get(url, timeout=10)
|
448 |
-
soup = BeautifulSoup(response.text, 'html.parser')
|
449 |
-
images = [img['src'] for img in soup.find_all('img', src=True)]
|
450 |
-
all_images.extend(images)
|
451 |
-
return f"Extracted images: {', '.join(all_images)}"
|
452 |
-
|
453 |
-
elif intent == "monitor_changes":
|
454 |
-
changes_log = process_urls(url_input, bulk_toggle, "Scrape data", max_urls, crawl_depth, mode='chat')
|
455 |
-
return changes_log
|
456 |
-
|
457 |
-
return "Instruction not recognized. Please try again."
|
458 |
-
|
459 |
if __name__ == "__main__":
|
460 |
demo = create_interface() # Call the function to create the interface
|
461 |
-
demo.launch() # Launch the Gradio app
|
|
|
3 |
import re
|
4 |
import logging
|
5 |
import json
|
6 |
+
from bs4 import BeautifulSoup
|
7 |
from selenium import webdriver
|
8 |
from selenium.webdriver.chrome.options import Options
|
9 |
from PIL import Image
|
|
|
12 |
import os
|
13 |
import datetime
|
14 |
from urllib.parse import urlparse
|
|
|
|
|
15 |
|
16 |
# Configure logging
|
17 |
logging.basicConfig(level=logging.INFO,
|
|
|
252 |
|
253 |
def recognize_intent(instruction: str) -> str:
    """Classify a free-text user instruction into a known scraping intent.

    Matching is case-insensitive and first-match-wins, in the order:
    image counting, link listing, change monitoring.

    Args:
        instruction: Raw instruction text from the chat box.

    Returns:
        One of "count_images", "scrape_links", "monitor_changes",
        or "unknown" when no pattern matches.
    """
    text = instruction.lower()

    # Ordered (pattern, intent) pairs; iteration order mirrors the
    # original if/elif chain so precedence is preserved.
    intent_patterns = (
        (r'\b(count the images|how many images|total images|image count)', "count_images"),
        (r'\b(list all links|find hyperlinks|show me urls|extract links)', "scrape_links"),
        (r'\b(monitor changes|watch for updates|detect changes|track updates)', "monitor_changes"),
    )

    for pattern, intent in intent_patterns:
        if re.search(pattern, text):
            return intent
    return "unknown"
|
270 |
|
271 |
+
def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: int, crawl_depth: int) -> str:
    """Execute the scraping action matching *intent* over the given URLs.

    Args:
        intent: Intent label from recognize_intent ("scrape_links",
            "count_images", "monitor_changes", or anything else for the
            fallback message).
        url_input: Raw URL text; comma/newline separated when bulk_toggle.
        bulk_toggle: Treat url_input as a list of URLs when True.
        max_urls: Maximum number of URLs to process.
        crawl_depth: Forwarded to process_urls for the monitoring path.

    Returns:
        A human-readable result string for the chat output.
    """
    urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
    urls = [url.strip() for url in urls if url.strip()]
    urls = urls[:max_urls]

    if intent == "scrape_links":
        all_links = []
        for url in urls:
            links = extract_links_from_page(url)
            all_links.extend(links)
        return f"Extracted links: {', '.join(all_links)}"

    elif intent == "count_images":
        total_images = 0
        for url in urls:
            # Fix: previously a single unreachable URL raised and aborted the
            # whole command; now failures are logged and the count continues
            # over the remaining URLs.
            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()
            except requests.RequestException as exc:
                logging.warning("Skipping %s while counting images: %s", url, exc)
                continue
            soup = BeautifulSoup(response.text, 'html.parser')
            images = soup.find_all('img')
            total_images += len(images)
        return f"There are {total_images} images across the specified URLs."

    elif intent == "monitor_changes":
        changes_log = process_urls(url_input, bulk_toggle, "Scrape data", max_urls, crawl_depth, mode='chat')
        return changes_log

    return "Instruction not recognized. Please try again."
|
297 |
|
298 |
+
def chat_based_scrape(instruction, url_input, bulk_toggle, max_urls, crawl_depth):
    """Handle one chat instruction: classify it, then run the matching command.

    Args:
        instruction: Free-text user instruction (e.g. "list all links").
        url_input: Raw URL text; comma/newline separated when bulk_toggle.
        bulk_toggle: Treat url_input as a list of URLs when True.
        max_urls: Maximum number of URLs to process.
        crawl_depth: Forwarded to the command generator.

    Returns:
        The textual result produced by generate_command.
    """
    # Fix: debug print() calls bypassed the module's configured logging
    # (logging.basicConfig at file top); route them through logging with
    # lazy %-style arguments instead.
    logging.info("Received instruction: %s", instruction)

    intent = recognize_intent(instruction)
    logging.info("Recognized intent: %s", intent)

    # Dispatch the recognized intent to the actual scraping command.
    return generate_command(intent, url_input, bulk_toggle, max_urls, crawl_depth)
|
308 |
+
|
309 |
def create_interface():
|
310 |
"""Create the Gradio interface."""
|
311 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
|
397 |
)
|
398 |
chat_output = gr.Textbox(label="Chat Output")
|
399 |
|
|
|
|
|
|
|
400 |
chat_button = gr.Button("Submit Instruction", variant="primary")
|
401 |
|
402 |
chat_button.click(
|
|
|
424 |
|
425 |
return demo
|
426 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
427 |
if __name__ == "__main__":
|
428 |
demo = create_interface() # Call the function to create the interface
|
429 |
+
demo.launch() # Launch the Gradio app
|