Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -12,6 +12,7 @@ import zipfile
|
|
12 |
import os
|
13 |
import datetime
|
14 |
from urllib.parse import urlparse
|
|
|
15 |
|
16 |
# Configure logging
|
17 |
logging.basicConfig(level=logging.INFO,
|
@@ -253,56 +254,109 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, mo
|
|
253 |
def recognize_intent(instruction: str) -> str:
|
254 |
instruction = instruction.lower()
|
255 |
|
256 |
-
#
|
257 |
-
|
258 |
-
|
|
|
|
|
|
|
|
|
259 |
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
# Patterns for monitoring changes
|
265 |
-
elif re.search(r'\b(monitor changes|watch for updates|detect changes|track updates)', instruction):
|
266 |
-
return "monitor_changes"
|
267 |
-
|
268 |
-
else:
|
269 |
-
return "unknown"
|
270 |
|
271 |
-
def
|
272 |
-
|
273 |
-
|
274 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
275 |
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
return
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
|
|
|
|
|
|
292 |
elif intent == "monitor_changes":
|
293 |
-
changes_log =
|
294 |
return changes_log
|
295 |
-
|
296 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
297 |
|
298 |
-
def
|
299 |
-
|
300 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
301 |
intent = recognize_intent(instruction)
|
302 |
-
|
303 |
|
304 |
# Generate command based on the recognized intent
|
305 |
-
command_output = generate_command(intent, url_input,
|
306 |
|
307 |
return command_output
|
308 |
|
@@ -371,44 +425,28 @@ def create_interface():
|
|
371 |
)
|
372 |
|
373 |
with gr.Tab("Chat-Based Scrape"):
|
374 |
-
|
375 |
label="Enter Instruction",
|
376 |
placeholder="e.g., 'Scrape all links' or 'Extract all images'"
|
377 |
)
|
378 |
-
|
379 |
-
label="Enter URL
|
380 |
value="https://example.com",
|
381 |
-
placeholder="Enter
|
382 |
-
)
|
383 |
-
chat_bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
|
384 |
-
chat_max_urls = gr.Slider(
|
385 |
-
minimum=1,
|
386 |
-
maximum=20,
|
387 |
-
value=5,
|
388 |
-
step=1,
|
389 |
-
label="Max URLs to process"
|
390 |
)
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
value=
|
395 |
-
step=1,
|
396 |
-
label="Crawl Depth"
|
397 |
)
|
398 |
-
|
399 |
|
400 |
-
chat_button = gr.Button("
|
401 |
|
402 |
chat_button.click(
|
403 |
fn=chat_based_scrape,
|
404 |
-
inputs=[
|
405 |
-
|
406 |
-
chat_url_input,
|
407 |
-
chat_bulk_toggle,
|
408 |
-
chat_max_urls,
|
409 |
-
chat_crawl_depth
|
410 |
-
],
|
411 |
-
outputs=chat_output
|
412 |
)
|
413 |
|
414 |
gr.Markdown(
|
|
|
12 |
import os
|
13 |
import datetime
|
14 |
from urllib.parse import urlparse
|
15 |
+
import tempfile
|
16 |
|
17 |
# Configure logging
|
18 |
logging.basicConfig(level=logging.INFO,
|
|
|
254 |
def recognize_intent(instruction: str) -> str:
    """Classify a free-text scraping instruction into an intent label.

    The instruction is lower-cased and tested against a fixed set of
    verb + target patterns, in declaration order; the first pattern that
    matches decides the result.

    Args:
        instruction: The user's instruction, e.g. "Scrape all links".

    Returns:
        One of "extract_data", "count_data", "fetch_specific_data",
        "monitor_changes", or "unknown" when nothing matches (including
        when the instruction is empty or None).
    """
    if not instruction:
        return "unknown"
    instruction = instruction.lower()

    # Optional filler words between the verb and its target, so natural
    # phrasings such as "scrape all links" or "count the images" match
    # just like the terse "scrape links" / "count images".
    filler = r'(?:\s+(?:all|the|every|any|of))*\s+'

    # General patterns for actions and data types
    action_patterns = {
        r'\b(find|extract|scrape)' + filler
        + r'(links|images|videos|texts|prices|product names|reviews)\b': 'extract_data',
        r'\b(count)' + filler
        + r'(links|images|videos|products)\b': 'count_data',
        r'\b(what is|get|fetch)' + filler
        + r'(channel name|subscriber count|viewers)\b': 'fetch_specific_data',
        r'\b(monitor)' + filler + r'changes\b': 'monitor_changes',
    }

    for pattern, intent in action_patterns.items():
        if re.search(pattern, instruction):
            return intent
    return "unknown"
|
|
|
|
|
|
|
|
|
|
|
|
|
269 |
|
270 |
+
def extract_data_type(instruction: str) -> str:
    """Return the data type named in a free-text instruction.

    The instruction is lower-cased and searched for the first known data
    type term; the term that was actually matched is returned, so
    "extract all images" yields "images" rather than a fixed label.

    Args:
        instruction: The user's instruction, e.g. "Extract all images".

    Returns:
        The matched term ("links", "images", "videos", "texts", "prices",
        "product names", "reviews", "products", "channel name",
        "subscriber count" or "viewers"), or "unknown" when no term is
        present or the instruction is empty/None.
    """
    if not instruction:
        return "unknown"

    # "product names" is listed before "products" so the longer phrase
    # gets the first chance at a shared starting position.
    match = re.search(
        r'\b(links|images|videos|texts|prices|product names|reviews|products|'
        r'channel name|subscriber count|viewers)\b',
        instruction.lower(),
    )
    return match.group(1) if match else "unknown"
|
281 |
|
282 |
+
def format_output(data, output_format):
    """Render *data* as text in the requested output format.

    Args:
        data: The value to render; it must be JSON-serialisable for the
            two JSON formats.
        output_format: "JSON", "Cleaned JSON", or any other value for a
            plain ``str()`` rendering.

    Returns:
        A 2-space-indented JSON string for "JSON" and "Cleaned JSON",
        otherwise ``str(data)``.
    """
    # TODO: "Cleaned JSON" has no dedicated cleaning step yet, so it is
    # serialised exactly like plain "JSON".
    if output_format in ("JSON", "Cleaned JSON"):
        return json.dumps(data, indent=2)
    return str(data)
|
290 |
+
|
291 |
+
def generate_command(intent: str, url_input: str, data_type: str, output_format: str) -> str:
    """Run the handler that corresponds to a recognised intent.

    Args:
        intent: Intent label, as produced by ``recognize_intent``.
        url_input: Target URL passed through to the handler.
        data_type: Data type label passed to the extract/count/fetch handlers.
        output_format: Output format, used only for "extract_data" results.

    Returns:
        The handler's result (formatted via ``format_output`` for
        "extract_data", wrapped in a sentence for "count_data"), or a
        fixed "not recognized" message for any other intent.
    """
    if intent == "extract_data":
        return format_output(extract_data(url_input, data_type), output_format)

    if intent == "count_data":
        total = count_data(url_input, data_type)
        return f"The number of {data_type} is {total}."

    if intent == "fetch_specific_data":
        return fetch_specific_data(url_input, data_type)

    if intent == "monitor_changes":
        return monitor_changes(url_input)

    # Anything else, including "unknown", falls through to here.
    return "Instruction not recognized. Please try again."
|
306 |
+
|
307 |
+
def extract_data(url, data_type, timeout=10):
    """Download *url* and extract the requested kind of data from its HTML.

    Args:
        url: Address of the page to fetch.
        data_type: "links" (``href`` of every ``<a>`` that has one) or
            "images" (``src`` of every ``<img>`` that has one).
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely on a
            stalled connection.

    Returns:
        A list of attribute values for a supported ``data_type``, an empty
        list for any other ``data_type``, or an error-message string if
        fetching or parsing raises.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        if data_type == "links":
            return [a['href'] for a in soup.find_all('a', href=True)]
        elif data_type == "images":
            return [img['src'] for img in soup.find_all('img', src=True)]
        # Add more data types as needed
        else:
            return []
    except Exception as e:
        # Errors (including timeouts) are reported as text, not raised.
        return f"Error extracting {data_type}: {str(e)}"
|
321 |
|
322 |
+
def count_data(url, data_type, timeout=10):
    """Download *url* and count elements of the requested kind in its HTML.

    Args:
        url: Address of the page to fetch.
        data_type: "links" (``<a>`` tags with an ``href``) or "images"
            (``<img>`` tags with a ``src``).
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely on a
            stalled connection.

    Returns:
        The number of matching elements for a supported ``data_type``,
        0 for any other ``data_type``, or an error-message string if
        fetching or parsing raises.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        if data_type == "links":
            return len(soup.find_all('a', href=True))
        elif data_type == "images":
            return len(soup.find_all('img', src=True))
        # Add more data types as needed
        else:
            return 0
    except Exception as e:
        # Errors (including timeouts) are reported as text, not raised.
        return f"Error counting {data_type}: {str(e)}"
|
336 |
+
|
337 |
+
def fetch_specific_data(url, data_type):
    """Placeholder for fetching one specific piece of data from a page.

    No request is made yet; the function only builds a confirmation
    message from its arguments.

    Args:
        url: Target URL, echoed in the message.
        data_type: Label of the requested data, echoed in the message.

    Returns:
        A "Fetched ... from ..." message, or an error-message string if
        building it raises.
    """
    try:
        # TODO: implement the real fetching logic; demonstration output only.
        result = f"Fetched {data_type} from {url}"
    except Exception as e:
        result = f"Error fetching {data_type}: {str(e)}"
    return result
|
344 |
+
|
345 |
+
def monitor_changes(url_input):
    """Placeholder for monitoring a page for changes.

    No monitoring is performed yet; the function only builds a
    confirmation message from its argument.

    Args:
        url_input: Target URL, echoed in the message.

    Returns:
        A "Changes monitored for ..." message, or an error-message string
        if building it raises.
    """
    try:
        # TODO: implement the real monitoring logic; demonstration output only.
        status = f"Changes monitored for {url_input}"
    except Exception as e:
        status = f"Error monitoring changes: {str(e)}"
    return status
|
352 |
+
|
353 |
+
def chat_based_scrape(instruction, url_input, output_format):
    """Handle one chat-style scraping request from start to finish.

    Args:
        instruction: Free-text instruction entered by the user.
        url_input: Target URL.
        output_format: Requested output format, passed to ``generate_command``.

    Returns:
        Whatever ``generate_command`` produces for the recognised intent.
    """
    # Both the intent and the data type are derived from the same instruction.
    detected_intent = recognize_intent(instruction)
    requested_type = extract_data_type(instruction)

    return generate_command(detected_intent, url_input, requested_type, output_format)
|
362 |
|
|
|
425 |
)
|
426 |
|
427 |
with gr.Tab("Chat-Based Scrape"):
|
428 |
+
instruction = gr.Textbox(
|
429 |
label="Enter Instruction",
|
430 |
placeholder="e.g., 'Scrape all links' or 'Extract all images'"
|
431 |
)
|
432 |
+
url_input = gr.Textbox(
|
433 |
+
label="Enter URL",
|
434 |
value="https://example.com",
|
435 |
+
placeholder="Enter the target URL"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
436 |
)
|
437 |
+
output_format = gr.Radio(
|
438 |
+
["JSON", "Cleaned JSON", "Raw Data"],
|
439 |
+
label="Output Format",
|
440 |
+
value="JSON"
|
|
|
|
|
441 |
)
|
442 |
+
output = gr.Textbox(label="Output")
|
443 |
|
444 |
+
chat_button = gr.Button("Execute Instruction", variant="primary")
|
445 |
|
446 |
chat_button.click(
|
447 |
fn=chat_based_scrape,
|
448 |
+
inputs=[instruction, url_input, output_format],
|
449 |
+
outputs=output
|
|
|
|
|
|
|
|
|
|
|
|
|
450 |
)
|
451 |
|
452 |
gr.Markdown(
|