SoMeScreenShotter

Runtime error

App Files Files Community

acecalisto3 committed on Jan 13

Commit

68b30df

verified ·

1 Parent(s): 52c508e

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -69

app.py CHANGED Viewed

@@ -12,6 +12,7 @@ import zipfile
 import os
 import datetime
 from urllib.parse import urlparse
 # Configure logging
 logging.basicConfig(level=logging.INFO,
@@ -253,56 +254,109 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, mo
 def recognize_intent(instruction: str) -> str:
     instruction = instruction.lower()
-    # Patterns for counting images
-    if re.search(r'\b(count the images|how many images|total images|image count)', instruction):
-        return "count_images"
-    # Patterns for listing links
-    elif re.search(r'\b(list all links|find hyperlinks|show me urls|extract links)', instruction):
-        return "scrape_links"
-    # Patterns for monitoring changes
-    elif re.search(r'\b(monitor changes|watch for updates|detect changes|track updates)', instruction):
-        return "monitor_changes"
-    else:
-        return "unknown"
-def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: int, crawl_depth: int) -> str:
-    urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
-    urls = [url.strip() for url in urls if url.strip()]
-    urls = urls[:max_urls]
-    if intent == "scrape_links":
-        all_links = []
-        for url in urls:
-            links = extract_links_from_page(url)
-            all_links.extend(links)
-        return f"Extracted links: {', '.join(all_links)}"
-    elif intent == "count_images":
-        total_images = 0
-        for url in urls:
-            response = requests.get(url, timeout=10)
-            soup = BeautifulSoup(response.text, 'html.parser')
-            images = soup.find_all('img')
-            total_images += len(images)
-        return f"There are {total_images} images across the specified URLs."
     elif intent == "monitor_changes":
-        changes_log = process_urls(url_input, bulk_toggle, "Scrape data", max_urls, crawl_depth, mode='chat')
         return changes_log
-    return "Instruction not recognized. Please try again."
-def chat_based_scrape(instruction, url_input, bulk_toggle, max_urls, crawl_depth):
-    print(f"Received instruction: {instruction}")
-    # Recognize intent
     intent = recognize_intent(instruction)
-    print(f"Recognized intent: {intent}")
     # Generate command based on the recognized intent
-    command_output = generate_command(intent, url_input, bulk_toggle, max_urls, crawl_depth)
     return command_output
@@ -371,44 +425,28 @@ def create_interface():
                 )
             with gr.Tab("Chat-Based Scrape"):
-                chat_instruction = gr.Textbox(
                     label="Enter Instruction",
                     placeholder="e.g., 'Scrape all links' or 'Extract all images'"
                 )
-                chat_url_input = gr.Textbox(
-                    label="Enter URL(s)",
                     value="https://example.com",
-                    placeholder="Enter single URL or multiple URLs separated by commas"
-                )
-                chat_bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
-                chat_max_urls = gr.Slider(
-                    minimum=1,
-                    maximum=20,
-                    value=5,
-                    step=1,
-                    label="Max URLs to process"
                 )
-                chat_crawl_depth = gr.Slider(
-                    minimum=1,
-                    maximum=3,
-                    value=1,
-                    step=1,
-                    label="Crawl Depth"
                 )
-                chat_output = gr.Textbox(label="Chat Output")
-                chat_button = gr.Button("Submit Instruction", variant="primary")
                 chat_button.click(
                     fn=chat_based_scrape,
-                    inputs=[
-                        chat_instruction,
-                        chat_url_input,
-                        chat_bulk_toggle,
-                        chat_max_urls,
-                        chat_crawl_depth
-                    ],
-                    outputs=chat_output
                 )
         gr.Markdown(

 import os
 import datetime
 from urllib.parse import urlparse
+import tempfile
 # Configure logging
 logging.basicConfig(level=logging.INFO,
 def recognize_intent(instruction: str) -> str:
     instruction = instruction.lower()
+    # General patterns for actions and data types
+    action_patterns = {
+        r'\b(find|extract|scrape)\s+(links|images|videos|texts|prices|product names|reviews)\b': 'extract_data',
+        r'\b(count)\s+(links|images|videos|products)\b': 'count_data',
+        r'\b(what is|get|fetch)\s+(channel name|subscriber count|viewers)\b': 'fetch_specific_data',
+        r'\b(monitor)\s+changes\b': 'monitor_changes',
+    }
+    for pattern, intent in action_patterns.items():
+        if re.search(pattern, instruction):
+            return intent
+    return "unknown"
def extract_data_type(instruction: str) -> str:
    """Return the data type named in an instruction.

    The instruction is lower-cased and searched for the known data-type
    terms; the term that actually appears (the left-most one, if several
    do) is returned. Previously every term in a group collapsed to one
    fixed label, so "extract images" was reported as "links".

    Args:
        instruction: The instruction text. ``None`` or an empty string is
            treated as "no instruction".

    Returns:
        The matched term (e.g. ``"links"``, ``"images"``,
        ``"subscriber count"``), or ``"unknown"`` when none is present.
    """
    known_types = (
        'links', 'images', 'videos', 'texts', 'prices', 'product names',
        'products', 'reviews', 'channel name', 'subscriber count', 'viewers',
    )
    pattern = r'\b(' + '|'.join(known_types) + r')\b'
    match = re.search(pattern, (instruction or "").lower())
    return match.group(1) if match else "unknown"
def format_output(data, output_format):
    """Render scraped data in the requested output format.

    Args:
        data: The value to render (typically a list of strings).
        output_format: ``"JSON"`` for indented JSON, ``"Cleaned JSON"`` for
            indented JSON after basic list cleaning, anything else for
            ``str(data)``.

    Returns:
        The formatted text.
    """
    if output_format == "JSON":
        return json.dumps(data, indent=2)
    if output_format == "Cleaned JSON":
        return json.dumps(_clean_items(data), indent=2)
    return str(data)


def _clean_items(data):
    """Strip strings, drop blank ones and remove duplicates, keeping order.

    Only lists are cleaned; any other value is returned unchanged.
    Non-string list items are kept as they are.
    """
    if not isinstance(data, list):
        return data
    seen = set()
    cleaned = []
    for item in data:
        if isinstance(item, str):
            item = item.strip()
            if not item or item in seen:
                continue
            seen.add(item)
        cleaned.append(item)
    return cleaned
def generate_command(intent: str, url_input: str, data_type: str, output_format: str) -> str:
    """Run the action for a recognized intent and return its text result.

    Args:
        intent: Intent label, e.g. ``"extract_data"`` or ``"count_data"``.
        url_input: Target URL handed to the underlying helper.
        data_type: Kind of data the instruction refers to.
        output_format: Output format, used only for ``"extract_data"``.

    Returns:
        The helper's result as text, or a "not recognized" message for any
        other intent.
    """
    if intent == "extract_data":
        data = extract_data(url_input, data_type)
        return format_output(data, output_format)
    if intent == "count_data":
        count = count_data(url_input, data_type)
        # count_data reports failures as a message string; return it as-is
        # instead of embedding it in "The number of ... is ...".
        if isinstance(count, str):
            return count
        return f"The number of {data_type} is {count}."
    if intent == "fetch_specific_data":
        return fetch_specific_data(url_input, data_type)
    if intent == "monitor_changes":
        return monitor_changes(url_input)
    return "Instruction not recognized. Please try again."
def extract_data(url, data_type):
    """Fetch a page and collect the requested kind of attribute values.

    Args:
        url: Address of the page to download.
        data_type: ``"links"`` for the ``href`` of every ``<a>`` that has
            one, ``"images"`` for the ``src`` of every ``<img>`` that has
            one. Any other value yields an empty list.

    Returns:
        A list of attribute values, or an error message string if anything
        fails while fetching or parsing.
    """
    try:
        # Bound the request so an unresponsive host cannot block the caller
        # indefinitely.
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        if data_type == "links":
            return [a['href'] for a in soup.find_all('a', href=True)]
        if data_type == "images":
            return [img['src'] for img in soup.find_all('img', src=True)]
        # Add more data types as needed
        return []
    except Exception as e:
        return f"Error extracting {data_type}: {str(e)}"
def count_data(url, data_type):
    """Fetch a page and count elements of the requested kind.

    Args:
        url: Address of the page to download.
        data_type: ``"links"`` counts ``<a>`` tags that have an ``href``,
            ``"images"`` counts ``<img>`` tags that have a ``src``. Any
            other value yields ``0``.

    Returns:
        The count as an int, or an error message string if anything fails
        while fetching or parsing.
    """
    try:
        # Bound the request so an unresponsive host cannot block the caller
        # indefinitely.
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        if data_type == "links":
            return len(soup.find_all('a', href=True))
        if data_type == "images":
            return len(soup.find_all('img', src=True))
        # Add more data types as needed
        return 0
    except Exception as e:
        return f"Error counting {data_type}: {str(e)}"
def fetch_specific_data(url, data_type):
    """Placeholder for fetching one specific piece of data from a page.

    No request is made yet; the function only returns a fixed-format
    message naming the data type and URL.

    Args:
        url: Target URL (only echoed in the message).
        data_type: Name of the requested datum (only echoed in the message).

    Returns:
        The placeholder message string.
    """
    # TODO: implement the real fetching logic. Reintroduce error handling
    # together with it; the former try/except wrapped only this string
    # formatting, so its handler could never run.
    return f"Fetched {data_type} from {url}"
def monitor_changes(url_input):
    """Placeholder for change monitoring.

    Nothing is monitored yet; the function only returns a fixed-format
    message naming the URL.

    Args:
        url_input: Target URL (only echoed in the message).

    Returns:
        The placeholder message string.
    """
    # TODO: implement the real monitoring logic. Reintroduce error handling
    # together with it; the former try/except wrapped only this string
    # formatting, so its handler could never run.
    return f"Changes monitored for {url_input}"
def chat_based_scrape(instruction, url_input, output_format):
    """Interpret a chat instruction and run it against a URL.

    Args:
        instruction: Free-text instruction typed by the user.
        url_input: Target URL; surrounding whitespace is removed before use.
        output_format: Output format passed on for extraction results.

    Returns:
        The text produced for the recognized intent, or the
        "not recognized" message when the instruction is missing, blank or
        not understood.
    """
    # A missing or blank instruction can never match an intent. Answering
    # here also avoids calling .lower() on None further down.
    if not instruction or not instruction.strip():
        return "Instruction not recognized. Please try again."

    # Pasted URLs often carry stray spaces or newlines.
    url_input = (url_input or "").strip()

    # Recognize intent and extract data type if applicable
    intent = recognize_intent(instruction)
    data_type = extract_data_type(instruction)
    # Generate command based on the recognized intent
    return generate_command(intent, url_input, data_type, output_format)
                 )
             with gr.Tab("Chat-Based Scrape"):
+                instruction = gr.Textbox(
                     label="Enter Instruction",
                     placeholder="e.g., 'Scrape all links' or 'Extract all images'"
                 )
+                url_input = gr.Textbox(
+                    label="Enter URL",
                     value="https://example.com",
+                    placeholder="Enter the target URL"
                 )
+                output_format = gr.Radio(
+                    ["JSON", "Cleaned JSON", "Raw Data"],
+                    label="Output Format",
+                    value="JSON"
                 )
+                output = gr.Textbox(label="Output")
+                chat_button = gr.Button("Execute Instruction", variant="primary")
                 chat_button.click(
                     fn=chat_based_scrape,
+                    inputs=[instruction, url_input, output_format],
+                    outputs=output
                 )
         gr.Markdown(