Spaces: Runtime error
Update app.py
app.py
CHANGED
@@ -136,7 +136,7 @@ def crawl_url(url, depth, max_depth, visited=None):
 
     return screenshots
 
-def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, progress=gr.Progress()):
+def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, mode='standard', progress=gr.Progress()):
     """Process URLs with crawl depth and change detection."""
     # Validate URLs first
     urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
@@ -146,7 +146,10 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
     # Validate all URLs
     invalid_urls = [url for url in urls if not validate_url(url)]
     if invalid_urls:
-        return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)
+        if mode == 'chat':
+            return f"Invalid URLs detected: {', '.join(invalid_urls)}"
+        else:
+            return None, json.dumps({"error": f"Invalid URLs detected: {', '.join(invalid_urls)}"}, indent=2)
 
     scraped_data = []
     screenshots = []
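
Both branches of the validation check above rely on a validate_url helper that is defined earlier in app.py and does not appear in this diff. For reference, a minimal sketch of such a check using only the standard library (the Space's actual implementation may differ):

from urllib.parse import urlparse

def validate_url(url: str) -> bool:
    """Accept only absolute http(s) URLs that have a host component."""
    try:
        parsed = urlparse(url.strip())
    except ValueError:
        return False
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)

print(validate_url("https://example.com"))  # True  -> URL gets processed
print(validate_url("not a url"))            # False -> reported via the error branch
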
@@ -213,72 +216,53 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, pr
         # Update progress
         progress((idx + 1) / total_urls)
 
-    # Create a temporary file to store the ZIP
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_file:
-        with zipfile.ZipFile(tmp_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
-            # Add screenshots to ZIP
-            for screenshot_url, screenshot_data in screenshots:
-                sanitized_screenshot_url = sanitize_filename(screenshot_url)
-                filename = f"{sanitized_screenshot_url}.png"
-                zipf.writestr(filename, screenshot_data)
-
-            # Add scraped data and changes log to ZIP
-            if scraped_data:
-                data_to_save = {
-                    'scraped_data': scraped_data,
-                    'changes_log': changes_log,
-                    'timestamp': datetime.datetime.now().isoformat()
-                }
-                zipf.writestr('data.json', json.dumps(data_to_save, indent=2))
-
-        # Get the path to the temporary file
-        zip_file_path = tmp_file.name
-
-    # Prepare display data
-    display_data = {
-        'total_scraped_urls': len(scraped_data),
-        'total_screenshots_taken': len(screenshots),
-        'changes_detected': changes_log,
-        'scraped_data': scraped_data  # Include full scraped data
-    }
-
-    # Return the path to the temporary ZIP file and display data
-    return zip_file_path, json.dumps(display_data, indent=2)
-
-from smolagents import tool
+    if mode == 'chat':
+        return "\n".join(changes_log)
+    else:
+        # Create a temporary file to store the ZIP
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_file:
+            with zipfile.ZipFile(tmp_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
+                # Add screenshots to ZIP
+                for screenshot_url, screenshot_data in screenshots:
+                    sanitized_screenshot_url = sanitize_filename(screenshot_url)
+                    filename = f"{sanitized_screenshot_url}.png"
+                    zipf.writestr(filename, screenshot_data)
+
+                # Add scraped data and changes log to ZIP
+                if scraped_data:
+                    data_to_save = {
+                        'scraped_data': scraped_data,
+                        'changes_log': changes_log,
+                        'timestamp': datetime.datetime.now().isoformat()
+                    }
+                    zipf.writestr('data.json', json.dumps(data_to_save, indent=2))
+
+            # Get the path to the temporary file
+            zip_file_path = tmp_file.name
+
+        # Prepare display data
+        display_data = {
+            'total_scraped_urls': len(scraped_data),
+            'total_screenshots_taken': len(screenshots),
+            'changes_detected': changes_log,
+            'scraped_data': scraped_data  # Include full scraped data
+        }
+
+        # Return the path to the temporary ZIP file and display data
+        return zip_file_path, json.dumps(display_data, indent=2)
 
-@tool
 def recognize_intent(instruction: str) -> str:
-    """
-    Recognizes the intent from the user's instruction.
-    Args:
-        instruction: The input instruction from the user.
-
-    Returns:
-        The recognized intent as a string.
-    """
     instruction = instruction.lower()
     if "scrape all links" in instruction:
         return "scrape_links"
     elif "extract all images" in instruction:
         return "extract_images"
+    elif "monitor changes" in instruction:
+        return "monitor_changes"
     else:
         return "unknown"
 
-
-def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: int) -> str:
-    """
-    Generates a command based on the recognized intent.
-
-    Args:
-        intent: The recognized intent from the user input.
-        url_input: The input URL(s) from the user.
-        bulk_toggle: Indicates if multiple URLs are being processed.
-        max_urls: The maximum number of URLs to process.
-
-    Returns:
-        The result of the command execution.
-    """
+def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: int, crawl_depth: int, session_id: str) -> str:
     urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
     urls = [url.strip() for url in urls if url.strip()]
     urls = urls[:max_urls]
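
The new standard-mode branch assembles its download by writing screenshots and a JSON summary into a ZIP held in a tempfile.NamedTemporaryFile. A self-contained sketch of the same pattern, with toy data standing in for the Space's scraped results (bundle_results and its sample inputs are illustrative, not part of the commit):

import datetime
import json
import tempfile
import zipfile

def bundle_results(screenshots, scraped_data, changes_log):
    """Write PNG bytes and a JSON summary into a temporary ZIP; return its path."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_file:
        with zipfile.ZipFile(tmp_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for name, png_bytes in screenshots:
                zipf.writestr(f"{name}.png", png_bytes)
            zipf.writestr('data.json', json.dumps({
                'scraped_data': scraped_data,
                'changes_log': changes_log,
                'timestamp': datetime.datetime.now().isoformat()
            }, indent=2))
        return tmp_file.name

zip_path = bundle_results([("example_com", b"fake-png-bytes")],
                          [{"url": "https://example.com"}], [])
print(zip_path)  # e.g. /tmp/tmpab12cd34.zip

Note that delete=False is load-bearing here: the ZIP must outlive the handler so Gradio can stream it to the browser, which also means something else (or the OS) has to clean the file up later.
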
@@ -299,15 +283,18 @@ def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: i
             all_images.extend(images)
         return f"Extracted images: {', '.join(all_images)}"
 
+    elif intent == "monitor_changes":
+        changes_log = process_urls(url_input, bulk_toggle, "Scrape data", max_urls, crawl_depth, mode='chat')
+        return changes_log
+
     return "Instruction not recognized. Please try again."
 
-def chat_based_scrape(instruction, url_input, bulk_toggle, max_urls, crawl_depth):
-    """Handle chat-based instructions for scraping."""
+def chat_based_scrape(instruction, url_input, bulk_toggle, max_urls, crawl_depth, session_id):
     # Recognize intent
     intent = recognize_intent(instruction)
 
     # Generate command based on the recognized intent
-    command_output = generate_command(intent, url_input, bulk_toggle, max_urls)
+    command_output = generate_command(intent, url_input, bulk_toggle, max_urls, crawl_depth, session_id)
 
     return command_output
 
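
recognize_intent matches hard-coded phrases with an if/elif chain, so each new intent (like monitor_changes here) means another branch. A table-driven lookup keeps phrase and intent side by side as the list grows; an illustrative refactor, not what this commit does:

INTENT_KEYWORDS = {
    "scrape all links": "scrape_links",
    "extract all images": "extract_images",
    "monitor changes": "monitor_changes",
}

def recognize_intent(instruction: str) -> str:
    """Return the first intent whose trigger phrase appears in the instruction."""
    instruction = instruction.lower()
    for phrase, intent in INTENT_KEYWORDS.items():
        if phrase in instruction:
            return intent
    return "unknown"

assert recognize_intent("Please monitor changes on my blog") == "monitor_changes"
assert recognize_intent("hello") == "unknown"
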
@@ -393,6 +380,13 @@ def create_interface():
             step=1,
             label="Max URLs to process"
         )
+        chat_crawl_depth = gr.Slider(
+            minimum=1,
+            maximum=3,
+            value=1,
+            step=1,
+            label="Crawl Depth"
+        )
         chat_output = gr.Textbox(label="Chat Output")
 
         chat_button = gr.Button("Submit Instruction", variant="primary")
@@ -403,7 +397,9 @@ def create_interface():
             chat_instruction,
             chat_url_input,
             chat_bulk_toggle,
-            chat_max_urls
+            chat_max_urls,
+            chat_crawl_depth,
+            gr.Session
         ],
         outputs=chat_output
     )
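
One caveat with this hunk: gr.Session is not part of Gradio's public API, so evaluating this inputs list will raise an AttributeError when the app starts, which may be what the Space's runtime-error status reflects. The conventional way to thread a per-session value such as session_id through a handler is a hidden gr.State initialized on page load. A minimal sketch under that assumption:

import uuid
import gradio as gr

def new_session_id():
    return str(uuid.uuid4())

with gr.Blocks() as demo:
    session_id = gr.State("")                      # hidden per-session value
    demo.load(new_session_id, outputs=session_id)  # fresh id on each page load

    chat_instruction = gr.Textbox(label="Instruction")
    chat_output = gr.Textbox(label="Chat Output")
    chat_button = gr.Button("Submit Instruction", variant="primary")

    def handle(instruction, sid):
        # A real handler would forward sid to chat_based_scrape
        return f"[session {sid[:8]}] received: {instruction}"

    chat_button.click(handle, inputs=[chat_instruction, session_id], outputs=chat_output)

if __name__ == "__main__":
    demo.launch()
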
@@ -420,6 +416,7 @@ def create_interface():
     )
 
     return demo
+
 if __name__ == "__main__":
     demo = create_interface()  # Call the function to create the interface
     demo.launch()  # Launch the Gradio app
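
Taken together, the mode parameter gives process_urls two calling conventions: chat mode returns a plain string for the chat textbox, while standard mode returns a (zip_path, display_json) pair for the file and JSON outputs. Illustrative invocations (argument values made up):

# Chat mode: a newline-joined changes log, suitable for a single Textbox
log_text = process_urls("https://example.com", False, "Scrape data",
                        max_urls=1, crawl_depth=1, mode='chat')

# Standard mode: a ZIP path plus a JSON string for the download/display pair
zip_path, display_json = process_urls("https://example.com", False, "Scrape data",
                                      max_urls=1, crawl_depth=1)

Callers therefore have to agree on the mode up front; wiring a chat-mode call to the two standard-mode Gradio outputs (or vice versa) would fail when the return value is unpacked.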